Commit 12946ad6 authored by Jacques Fize


DEBUG + ADD DataGenerator for BigData+ Add Script for generating ngram embedding once + desamb eval script
parent 3487bb5d
@@ -148,4 +148,5 @@ notes.md
 other/*
 test*
 nohup.out
 log*
+temp*
\ No newline at end of file
@@ -161,7 +161,7 @@ if args.admin_code_1 != "None":
 # GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
 filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
-bounds = get_bounds(filtered) # Required to get adjacency relationships
 #############################################################################################
@@ -177,6 +177,7 @@ if args.adjacency:
     logging.info("Retrieve adjacency relationships ! ")
     if not os.path.exists(ADJACENCY_REL_FILENAME):
+        bounds = get_bounds(filtered) # Required to get adjacency relationships
         rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
         json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
     else:
...
import os
from gzip import GzipFile
import keras
import numpy as np
import pandas as pd
from helpers import parse_title_wiki,read_geonames
from gensim.models.keyedvectors import KeyedVectors
def wc_l(filename, gzip=True):
    """Count the number of lines in a (possibly gzipped) file."""
    f = GzipFile(filename) if gzip else open(filename)
    lc = 0
    while f.readline():
        lc += 1
    f.close()
    return lc
class SamplingProbabilities:
    """Return a weight of 1/n for an item seen n times, so frequent items are sampled less often."""
    def __init__(self):
        self.count = {}

    def get_probs(self, item):
        if item not in self.count:
            self.count[item] = 0
        self.count[item] += 1
        return 1 / self.count[item]

    def __call__(self, a):
        return self.get_probs(a)
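# Illustration (not part of the committed file): the weight shrinks each time the
# same label reappears; the weights are normalised into a probability distribution
# later, at sampling time (see Adjacency and CoOccurrences below).
#   probs = SamplingProbabilities()
#   probs("Paris"), probs("Paris"), probs("Lyon")  # -> 1.0, 0.5, 1.0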
class DataSource(object):
    """Base class for iterable training data sources."""
    def __init__(self, name, input_filename):
        self.name = name
        assert os.path.exists(input_filename)
        self.input_filename = input_filename
        self.len = 0

    def __next__(self):
        raise NotImplementedError()

    def __iter__(self):
        return self

    def __len__(self):
        return self.len

    def __reset__(self):
        raise NotImplementedError()

    def isOver(self):
        raise NotImplementedError()
class Adjacency(DataSource):
    """Iterate over (toponym, adjacent toponym, latitude, longitude) tuples from an adjacency file."""
    def __init__(self, filename, geonames_filename, sampling=3, len_=None, gzip=True):
        super().__init__("Adjacency SRC", filename)
        assert os.path.exists(geonames_filename)
        self.geonames_data_dict = {row.geonameid: row.name for row in read_geonames(geonames_filename).itertuples()}
        self.gzip = gzip
        if not self.gzip:
            self.data_src = open(self.input_filename, 'rb')
        else:
            self.data_src = GzipFile(self.input_filename, 'rb')

        # Number of samples per epoch: `sampling` neighbours drawn per input line
        nb_lines = len_ if len_ else wc_l(filename, gzip=gzip) - 1  # minus the header line
        self.len = nb_lines * sampling if sampling else nb_lines

        self.data_src.readline()  # skip header line
        self.sampling = sampling
        if self.sampling:
            self.probs_storage = SamplingProbabilities()

        self.topo = None
        self.context_topo_context = []
        self.curr_probs = None
        self.lat, self.lon = None, None

        self.i = 0
        self.is_over = False
    def __next__(self):
        if self.i >= len(self.context_topo_context):
            line = self.data_src.readline()
            if not line:
                self.is_over = True
                raise StopIteration
            line = line.decode("utf-8").rstrip("\n")
            geonameid, adjacent_geoname_id, latitude, longitude = tuple(line.split(","))
            self.topo = int(geonameid)
            self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")]
            if self.sampling:
                # Normalise the 1/count weights so they form a probability distribution
                self.curr_probs = np.array([self.probs_storage(x) for x in self.context_topo_context], dtype=float)
                self.curr_probs /= self.curr_probs.sum()
                self.context_topo_context = np.random.choice(self.context_topo_context, self.sampling, p=self.curr_probs)
            self.lat, self.lon = float(latitude), float(longitude)
            self.i = 0

        self.i += 1
        return (self.geonames_data_dict[self.topo],
                self.geonames_data_dict[self.context_topo_context[self.i-1]],
                self.lat, self.lon)

    def __reset__(self):
        if not self.gzip:
            self.data_src = open(self.input_filename, 'rb')
        else:
            self.data_src = GzipFile(self.input_filename, 'rb')
        self.data_src.readline()  # skip header line
        self.is_over = False

    def isOver(self):
        return self.is_over
class Inclusion(DataSource):
    """Iterate over (child toponym, parent toponym, child latitude, child longitude) tuples from the Geonames hierarchy."""
    def __init__(self, geonames_filename, hierarchy_filename, mask_ids=None):
        super().__init__("Inclusion SRC", hierarchy_filename)
        assert os.path.exists(geonames_filename)
        self.geonames_data_dict = {row.geonameid: (row.name, row.latitude, row.longitude) for row in read_geonames(geonames_filename).itertuples()}
        self.data_src = pd.read_csv(self.input_filename,
                                    sep="\t",
                                    header=None,
                                    names="parentId,childId,type".split(",")
                                    ).fillna("")
        if mask_ids:
            self.data_src = self.data_src[self.data_src.childId.isin(mask_ids)]
        # Keep only pairs whose both ends are known in the Geonames data
        self.data_src = self.data_src[self.data_src.childId.isin(self.geonames_data_dict)]
        self.data_src = self.data_src[self.data_src.parentId.isin(self.geonames_data_dict)]
        self.data_src = self.data_src["childId parentId".split()].values.tolist()

        self.len = len(self.data_src)
        self.i = 0
        self.is_over = False

    def __next__(self):
        if self.i >= self.len:
            self.is_over = True
            raise StopIteration
        self.i += 1
        tup_ = tuple(self.data_src[self.i-1])
        return (self.geonames_data_dict[tup_[0]][0],
                self.geonames_data_dict[tup_[1]][0],
                self.geonames_data_dict[tup_[0]][1],
                self.geonames_data_dict[tup_[0]][2])

    def __reset__(self):
        self.i = 0
        self.is_over = False

    def isOver(self):
        return self.is_over
class CoOccurrences(DataSource):
    """Iterate over (toponym, co-occurring toponym, latitude, longitude) tuples extracted from Wikipedia."""
    def __init__(self, filename, sampling=3):
        super().__init__("Co-Occurrence data", filename)
        try:
            self.data_src = pd.read_csv(filename)
        except Exception:  # fall back to tab-separated input
            self.data_src = pd.read_csv(filename, sep="\t")
        self.data_src["title"] = self.data_src.title.apply(parse_title_wiki)
        self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki)

        self.i = 0
        self.j = 0
        self.is_over = False

        self.sampling = sampling
        self.len = len(self.data_src) * self.sampling
        if self.sampling:
            self.probs_storage = SamplingProbabilities()

        self.topo = None
        self.context_topo_context = []
        self.curr_probs = None
        self.lat, self.lon = None, None

    def __next__(self):
        if self.isOver() or self.i * self.sampling == self.len:
            self.is_over = True
            raise StopIteration
        if self.j >= len(self.context_topo_context):
            line = self.data_src.iloc[self.i]
            self.topo = line.title
            self.context_topo_context = [x for x in line.interlinks.split("|")]
            if self.sampling:
                # Normalise the 1/count weights so they form a probability distribution
                self.curr_probs = np.array([self.probs_storage(x) for x in self.context_topo_context], dtype=float)
                self.curr_probs /= self.curr_probs.sum()
                self.context_topo_context = np.random.choice(self.context_topo_context, self.sampling, p=self.curr_probs)
            self.lat, self.lon = line.latitude, line.longitude
            self.i += 1
            self.j = 0

        self.j += 1
        return (self.topo,
                self.context_topo_context[self.j-1],
                self.lat, self.lon)

    def __reset__(self):
        self.i = 0
        self.is_over = False

    def isOver(self):
        return self.is_over
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras from a list of DataSource instances'
    def __init__(self, data_sources, ngram_index, **kwargs):
        'Initialization'
        self.data_src = data_sources
        self.ngram_index = ngram_index
        self.batch_size = kwargs.get("batch_size", 1000)

        self.len = sum([len(d) for d in self.data_src])
        self.datasrc_index = 0
        #self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(self.len / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Zero-filled so a partially filled last batch does not contain uninitialised values
        X = np.zeros((self.batch_size, 2, self.ngram_index.max_len))
        y = np.zeros((self.batch_size, 2), dtype=float)

        if self.data_src[self.datasrc_index].isOver():
            self.datasrc_index += 1  # move on to the next data source
            if self.datasrc_index >= len(self.data_src):
                return X, y

        for i in range(self.batch_size):
            if self.data_src[self.datasrc_index].isOver():
                return X, y
            try:
                topo, topo_context, latitude, longitude = next(self.data_src[self.datasrc_index])
            except StopIteration:
                return X, y
            X[i] = [self.ngram_index.encode(topo), self.ngram_index.encode(topo_context)]
            y[i] = [longitude, latitude]
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        [d.__reset__() for d in self.data_src]
        self.datasrc_index = 0
def load_embedding(model_fn, dim_vector=100):
    """Build an (N, dim_vector) weight matrix from a saved gensim model whose vocabulary keys are the stringified n-gram ids 0..N-1."""
    model = KeyedVectors.load(model_fn)
    N = len(model.wv.vocab)
    M = np.zeros((N, dim_vector))
    for i in range(N):
        M[i] = model.wv[str(i)]
    return M
if __name__ == "__main__":
    # Total number of lines in the full adjacency file: 7955000 - 1 (header)
    from lib.ngram_index import NgramIndex
    from tqdm import tqdm

    ng = NgramIndex.load("../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json")
    c = CoOccurrences("../data/wikipedia/cooccurrence_FR.txt_test.csv", sampling=3)
    a = Adjacency("/home/jacques/sample_adjacency.txt", geonames_filename="../data/geonamesData/allCountries.txt", gzip=False, sampling=10)
    i = Inclusion(geonames_filename="../data/geonamesData/allCountries.txt", hierarchy_filename="../data/geonamesData/hierarchy.txt")
    d = DataGenerator([c, a, i], ng)
    for x in tqdm(range(len(d))):
        d[x]  # iterate over every batch once
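For context, here is a minimal sketch of how the generator could feed a Keras model. The single-input coordinate-regression network below is an assumption made for illustration; only DataGenerator, load_embedding and the ng/d objects built above come from this commit, and the embedding path is hypothetical.

from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dense

W = load_embedding("../data/embeddings/word2vec4gram/embedding4gramWiki+geonames.bin")  # hypothetical path
inp = Input(shape=(2, ng.max_len))                       # matches X built in DataGenerator.__getitem__
x = Embedding(W.shape[0], W.shape[1], weights=[W])(inp)  # n-gram id -> vector
x = Flatten()(x)
out = Dense(2)(x)                                        # (longitude, latitude), matching y
model = Model(inp, out)
model.compile(optimizer="adam", loss="mse")
model.fit_generator(d, epochs=1)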
 from glob import glob
-import os
+import json
-os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # No need for GPU
 import argparse
 import logging
 import pandas as pd
-from predict_toponym_coordinates import Geocoder
-from lib.geo import haversine_pd
-logging.getLogger("tensorflow").setLevel(logging.CRITICAL)
-logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL)
 parser = argparse.ArgumentParser()
 parser.add_argument("eval_dataset")
 parser.add_argument("models_directory")
-args = parser.parse_args()
+parser.add_argument("-g","--gpu",action="store_true")
+args = parser.parse_args()#("-g ../data/geocoding_evaluation/fr_cooc_test.csv outputs/FR_RESULT".split())
+if not args.gpu:
+    import os
+    os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # No need for GPU
+from predict_toponym_coordinates import Geocoder
+from lib.geo import haversine_pd
+logging.getLogger("tensorflow").setLevel(logging.CRITICAL)
+logging.getLogger("tensorflow_hub").setLevel(logging.CRITICAL)
 EVAL_DATASET_FN= args.eval_dataset#"./test_dataset_ambiguity.csv"
@@ -39,7 +46,17 @@ def eval_model(eval_dataset_fn,model_fn,model_index_fn):
     print("100km",(df.dist<100).sum()/len(df))
     print("50km",(df.dist<50).sum()/len(df))
     print("20km",(df.dist<20).sum()/len(df))
+    return df
 prefixes = [x.rstrip(".h5") for x in glob(args.models_directory+"/*.h5")]
+final_output = []
 for prefix in prefixes:
-    eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
\ No newline at end of file
+    df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
+    data = json.load(open(prefix+".json"))
+    data["accuracy@100km"] = (df.dist<100).sum()/len(df)
+    data["accuracy@50km"] = (df.dist<50).sum()/len(df)
+    data["accuracy@25km"] = (df.dist<25).sum()/len(df)
+    final_output.append(data)
+pd.DataFrame(final_output).to_csv("{0}_RESULT.csv".format(EVAL_DATASET_FN.rstrip(".csv")))
\ No newline at end of file
python3 desamb_eval.py -g ../data/geocoding_evaluation/fr_dataset_ambiguity_sample50percent.csv outputs/FR_RESULT
#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/US\ FR\ results
#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/US\ FR\ results
import pandas as pd, numpy as np
from numba import njit
from helpers import read_geonames
from tqdm import tqdm
from joblib import Parallel,delayed
import geopandas as gpd
from lib.geo import Grid,haversine_pd
import matplotlib.pyplot as plt
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("geoname_fn")
parser.add_argument("kilometer_threshold",type=int,default=20)
parser.add_argument("output_fn_prefix")
args = parser.parse_args()#("../data/geonamesData/allCountries.txt 20 /home/jacques/ALL_ADJ_224+_".split())
GEONAME_FN = args.geoname_fn
PREFIX_OUTPUT_FN = args.output_fn_prefix
KM_THRESHOLD = args.kilometer_threshold
df = read_geonames(GEONAME_FN)
def to_str(list_):
    """
    Return str representation for each value in list_

    Parameters
    ----------
    list_ : array
        array

    Returns
    -------
    array
        str list
    """
    return list(map(str, list_))
def get_adjacent(geonameid, ids, lon1, lat1, lon2, lat2, threshold):
    """
    Write the geonames entries adjacent to a selected entry (closer than `threshold` km)
    to the global output file `out_`.
    """
    dist_ = haversine_pd(lon1, lat1, lon2, lat2)
    adj_ids = ids[dist_ < threshold]
    out_.write("\n{0},{1},{2},{3}".format(geonameid, "|".join(to_str(adj_ids)), lat2, lon2))
    out_.flush()
# Build a grid over the world map.
# It avoids unnecessary distance computations and thus speeds up the whole process.
world = gpd.read_file("/media/jacques/DATA/GEODATA/WORLD/world.geo.50m.dissolved")
g = Grid(*world.bounds.values[0], [40, 20])  # grid of 40° by 20° cells
g.fit_data(world)

# Prepare first output
first_output_fn = "{1}{0}_cells.csv".format(KM_THRESHOLD, PREFIX_OUTPUT_FN)
out_ = open(first_output_fn, 'w')
out_.write("geonameid,adjacent_geonameid,latitude,longitude")  # HEADER
out_.flush()  # flush early to avoid buffering issues
def get_rels(cells_list):
    for c in tqdm(cells_list):
        # Keep only the geonames entries that fall within the cell bounds
        mask1 = (df.latitude <= c.bottomright_y) & (df.latitude >= c.upperleft_y)
        new_df = df[mask1].copy()
        mask2 = (new_df.longitude >= c.upperleft_x) & (new_df.longitude <= c.bottomright_x)
        new_df = new_df[mask2]
        for ix, row in new_df.iterrows():
            get_adjacent(row.geonameid, new_df.geonameid.values, new_df.longitude, new_df.latitude, row.longitude, row.latitude, KM_THRESHOLD)
        #Parallel(n_jobs=-1,backend="multiprocessing",temp_folder="/home/jacques/temp/")(delayed(get_adjacent)(row.geonameid,new_df.geonameid.values,new_df.longitude,new_df.latitude,row.longitude,row.latitude,KM_THRESHOLD) for ix,row in new_df.iterrows())
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
ax = world.plot(color="white", edgecolor="black")
for c in g.cells[224:]:
    ax.plot(*c.box_.exterior.xy)
plt.show()

get_rels(g.cells[224:])  # ~3h
# Prepare second output
# second_output_fn = "{1}{0}_inter_cells.csv".format(KM_THRESHOLD,PREFIX_OUTPUT_FN)
# out_ = open(second_output_fn,'w')
# out_.write("geonameid,adjacent_geonameid,latitude,longitude") # HEADER
# out_.flush()# Avoid writing bugs
# get_rels(g.inter_cells)
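For reference, a short sketch of reading the produced file back with pandas (assuming the header written above); the Adjacency data source in the new data_generator module expects exactly these four columns.

import pandas as pd
adj = pd.read_csv(first_output_fn)                     # geonameid,adjacent_geonameid,latitude,longitude
neighbour_ids = adj.adjacent_geonameid.str.split("|")  # "|"-separated lists of adjacent geonameids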
@@ -17,6 +17,25 @@ from joblib import Parallel,delayed
 def haversine_pd(lon1, lat1, lon2, lat2):
+    """
+    Return the great-circle (haversine) distance between (lon1,lat1) and (lon2,lat2) coordinates
+
+    Parameters
+    ----------
+    lon1 : numeric or array-like (pandas Dataframe works also)
+        longitude of first coordinates
+    lat1 : numeric or array-like (pandas Dataframe works also)
+        latitude of first coordinates
+    lon2 : numeric or array-like (pandas Dataframe works also)
+        longitude of second coordinates
+    lat2 : numeric or array-like (pandas Dataframe works also)
+        latitude of second coordinates
+
+    Returns
+    -------
+    float or array-like
+        distance(s) value(s) in kilometers
+    """
     lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
     dlon = lon2 - lon1
     dlat = lat2 - lat1
...
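A quick usage sketch of the documented function (the sample coordinates are illustrative): it is vectorised, so scalars, numpy arrays and pandas Series all work.

import pandas as pd
pts = pd.DataFrame({"longitude": [2.35, 4.84], "latitude": [48.86, 45.76]})  # roughly Paris, Lyon
dists_km = haversine_pd(pts.longitude, pts.latitude, 2.35, 48.86)            # ~[0.0, 392.0]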
#!/usr/bin/env python
# coding: utf-8
from lib.ngram_index import NgramIndex
from lib.geo import read_geonames
import pandas as pd
import numpy as np
from tqdm import tqdm
from gensim.models import Word2Vec
import logging
logging.basicConfig(level="INFO")
df_cooc = pd.read_csv("../data/wikipedia/cooccurrence_ALL.txt",sep="\t")
df_geo = read_geonames("../data/geonamesData/allCountries.txt")
# Collect all labels: Geonames names plus Wikipedia titles and interlinked titles
geonames_label = df_geo.name.values.tolist()
wiki_labels = df_cooc.title.values.tolist()
for interlinks in df_cooc["interlinks"].values:
    wiki_labels.extend(interlinks.split("|"))
del df_geo
del df_cooc

# Build the 4-gram index over every label and save it once
ng = NgramIndex(4)
for label in tqdm(geonames_label):
    ng.split_and_add(label)
for label in tqdm(wiki_labels):
    ng.split_and_add(label)
ng.save("4gramWiki+Geonames_index.json")
geonames_label.extend(wiki_labels)
class MySentences(object):
    """Stream each label as a sentence of n-gram ids (as strings) for Word2Vec."""
    def __init__(self, texts):
        self.texts = texts

    def __iter__(self):
        for w in self.texts:
            yield [str(x) for x in ng.encode(w)]
model = Word2Vec(MySentences(geonames_label), size=100, window=5, min_count=1, workers=4)
model.save("embedding4gramWiki+Geonames.bin")
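Downstream, the saved vectors can be turned back into a weight matrix, for instance with the load_embedding helper added to the data generator module in this commit (a sketch; the import path and the Embedding-layer wiring are assumptions, not part of the committed code):

from keras.layers import Embedding
from data_generator import load_embedding   # assumed module path

M = load_embedding("embedding4gramWiki+Geonames.bin", dim_vector=100)
emb_layer = Embedding(M.shape[0], M.shape[1], weights=[M], trainable=False)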