Commit 7496fba4 authored by Jacques Fize

Cleaning and organizing the code + add a function to quickly compute every available adjacency relationship

parent 46f8b391
@@ -14,7 +14,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
...
@@ -12,6 +12,8 @@ import geopandas as gpd
 from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
 from keras.models import Model
 from keras import backend as K
+from keras.callbacks import ModelCheckpoint
 import tensorflow as tf

 # Geometry
@@ -19,31 +21,15 @@ from shapely.geometry import Point
 # Custom module
 from helpers import read_geonames
-from utils import Grid
-from utils import zero_one_encoding, NgramIndex,ConfigurationReader
-from metrics import lat_accuracy,lon_accuracy
+from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.ngram_index import NgramIndex
+from lib.utils import ConfigurationReader
+from lib.metrics import lat_accuracy,lon_accuracy

 # Logging
 from tqdm import tqdm
 import logging
-from helpers import Chronometer
+from helpers import parse_title_wiki

-def parse_title_wiki(title_wiki):
-    """
-    Parse Wikipedia title
-
-    Parameters
-    ----------
-    title_wiki : str
-        wikipedia title
-
-    Returns
-    -------
-    str
-        parsed wikipedia title
-    """
-    return re.sub("\(.*\)","",title_wiki).strip().lower()

 def get_new_ids(cooc_data,id_first_value):
     """
@@ -74,96 +60,122 @@ def get_new_ids(cooc_data,id_first_value):
         topo_id[id_]=interlink
     return topo_id
 # LOGGING CONF
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ',
     datefmt='%m/%d/%Y %I:%M:%S %p',
     level=logging.INFO
     )
-chrono = Chronometer()

 args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
-    .parse_args()#("-n 4 -t 0.002 -e 20 -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("-i -e 5 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())

-# Initialisee CONSTANTS
-GEONAME_FN = args.geoname_input
-GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+#
+#################################################
+############# MODEL TRAINING PARAMETER ##########
+#################################################
 NGRAM_SIZE = args.ngram_size
 ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
 ITER_ADJACENCY = args.adjacency_iteration
-COOC_SAMPLING_NUMBER = 3
-WORDVEC_ITER = 50
+COOC_SAMPLING_NUMBER = args.cooc_sample_size
+WORDVEC_ITER = args.ngram_word2vec_dim
+
+#################################################
+########## FILENAME VARIABLE ####################
+#################################################
+GEONAME_FN = args.geoname_input
+GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
+ADJACENCY_REL_FILENAME = "../data/geonamesData/{0}_{1}{2}adjacency.json".format(
+    GEONAME_FN.split("/")[-1],
+    ITER_ADJACENCY,
+    REGION_SUFFIX_FN)
+
+COOC_FN = "../data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1]
+
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    REGION_SUFFIX_FN)

-# check for output dir
-if not os.path.exists("outputs/"):
-    os.makedirs("outputs/")
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+
+#############################################################################################
+################################# LOAD DATA #################################################
+#############################################################################################
 # LOAD Geonames DATA
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
-hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
-train_indices,test_indices = pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values, pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values
-train_indices,test_indices = set(train_indices),set(test_indices)
+train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
+test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
 logging.info("Geonames data loaded!")

 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
-#CLEAR RAM
-del geoname_data

-# IF REGION (ONLY FR for now !)
-admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split()
-region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1
-if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth:
+# IF REGION
+if args.admin_code_1 != "None":
     filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()

-# REDUCE DATA STORED
+# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
 filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+bounds = get_bounds(filtered) # Required to get adjacency relationships

-# Geometry operation
-filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
-filtered = gpd.GeoDataFrame(filtered)
-filtered["i"]=1
-bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships
+#############################################################################################
+################################# RETRIEVE RELATIONSHIPS ####################################
+#############################################################################################

+# INITIALIZE RELATION STORE
 rel_store = []

+# Retrieve adjacency relationships
 if args.adjacency:
-    # RETRIEVE ADJACENCY REL
     logging.info("Retrieve adjacency relationships ! ")
-    fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn)
-    if not os.path.exists(fn):
-        g = Grid(*bounds,[360,180])
-        g.fit_data(filtered)
-        [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))]
-        rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)])
-        json.dump(rel_store,open(fn,'w'))
+    if not os.path.exists(ADJACENCY_REL_FILENAME):
+        rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
+        json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
     else:
         logging.info("Open and load data from previous computation!")
-        rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))]
+        rel_store=json.load(open(ADJACENCY_REL_FILENAME))
     logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))

+# Retrieve inclusion relationships
 if args.inclusion:
-    # RETRIEVE INCLUSION RELATIONSHIPS
     logging.info("Retrieve inclusion relationships ! ")
-    geonamesIDS = set(filtered.geonameid.values)
-    filter_mask = (hierarchy_data.childId.isin(geonamesIDS) & hierarchy_data.parentId.isin(geonamesIDS))
-    rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist()))
-    logging.info("{0} inclusion relationships retrieved ! ".format(len(hierarchy_data[filter_mask])))
+    cpt_rel = len(rel_store)
+    rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
+    logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))

-del filtered["geometry"]

 if args.wikipedia_cooc:
     logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
-    COOC_FN = "./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1]
     cooc_data = pd.read_csv(COOC_FN,sep="\t")
     cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
     cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
-    id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max())
+    id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
     wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
     title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()}
     cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
@@ -191,37 +203,30 @@ geoname2name = dict(filtered["geonameid name".split()].values)
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
-filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available
-if args.wikipedia_cooc:
-    [index.split_and_add(k) for k in wikipediatitle_id]
-filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
-max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length
-if args.wikipedia_cooc:
-    extension = {v:index.encode(k) for k,v in wikipediatitle_id.items()}
-index.max_len = int(max_len) # For Index state dump
-filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len
-if args.wikipedia_cooc:
-    extension = {k:index.complete(v,max_len) for k,v in extension.items()}
-geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association
+
+# Identify all ngram available
+filtered.name.apply(lambda x : index.split_and_add(x))
+if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
+
+geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association

 if args.wikipedia_cooc:
-    geoname2encodedname.update(extension)
+    geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})

+# SAVE THE INDEX TO REUSE THE MODEL
+index.save(INDEX_FN)

 logging.info("Done !")

-#CLEAR RAM
-del hierarchy_data
-del geoname_data
+#############################################################################################
+################################# ENCODE COORDINATES ########################################
+#############################################################################################

 # Encode each geonames entry coordinates
-filtered["cell_vec"]=filtered.apply(
-    lambda x : zero_one_encoding(x.longitude,x.latitude),
-    axis=1
-    )
-geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
+geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}

 # CLEAR RAM
 del filtered
@@ -231,14 +236,17 @@ num_words = len(index.index_ngram) # necessary for the embedding matrix
 logging.info("Preparing Input and Output data...")

+#############################################################################################
+################################# BUILD TRAIN/TEST DATASETS #################################
+#############################################################################################

 X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
 X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
+cpt=0

 for couple in rel_store:
     geonameId_1,geonameId_2 = couple[0],couple[1]
     if not geonameId_1 in geoname2encodedname:
+        cpt+=1
         continue
     top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
     if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
@@ -270,29 +278,28 @@ y_lon_test = np.array(y_lon_test)
 logging.info("Data prepared !")

-# OUTPUT FN BASE
-name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn)
-if args.adjacency:
-    name += "_A"
-if args.inclusion:
-    name += "_I"
-if args.wikipedia_cooc:
-    name += "_C"
-
-index.save("outputs/"+name+"_index")
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")

-# NGRAM EMBDEDDING
+#############################################################################################
+################################# NGRAM EMBEDDINGS ##########################################
+#############################################################################################

 logging.info("Generating N-GRAM Embedding...")
 embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=WORDVEC_ITER)
 logging.info("Embedding generated !")

-# DEEP MODEL
-name = "LSTM_"+ name
-input_1 = Input(shape=(max_len,))
-input_2 = Input(shape=(max_len,))
-embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
+#############################################################################################
+################################# MODEL DEFINITION ##########################################
+#############################################################################################
+
+input_1 = Input(shape=(index.max_len,))
+input_2 = Input(shape=(index.max_len,))
+
+embedding_layer = Embedding(num_words, embedding_dim,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)

 x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
 x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))

@@ -315,15 +322,29 @@ output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3

 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})

+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
+
+#############################################################################################
+################################# TRAINING LAUNCH ###########################################
+#############################################################################################

 history = model.fit(x=[X_1_train,X_2_train],
     y=[y_lon_train,y_lat_train],
     verbose=True, batch_size=100,
     epochs=EPOCHS,
-    validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
+    validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]),
+    callbacks=[checkpoint])

 hist_df = pd.DataFrame(history.history)
-hist_df.to_csv("outputs/{0}.csv".format(name))
+hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN))

-model.save("outputs/"+name+".h5")
+model.save(MODEL_OUTPUT_FN)
+
+# Erase Model Checkpoint file
+if os.path.exists(output_fn + ".part"):
+    os.remove(output_fn + ".part")
\ No newline at end of file
documentation/imgs/first_approach.png (291 KiB)
documentation/imgs/second_approach.png (447 KiB)
documentation/imgs/third_approach.png (30.4 KiB)
import os
import time
import re
import numpy as np
import pandas as pd
def read_geonames(file):
"""
Return a dataframe that contains Geonames data.
Parameters
----------
file : str
path of the Geonames Csv file
Returns
-------
pd.DataFrame
geonames data
"""
dtypes_dict = {
0: int, # geonameid
1: str, # name
2: str, # asciiname
3: str, # alternatenames
4: float, # latitude
5: float, # longitude
6: str, # feature class
7: str, # feature code
8: str, # country code
9: str, # cc2
10: str, # admin1 code
11: str, # admin2 code
12: str, # admin3 code
13: str, # admin4 code
14: int, # population
15: str, # elevation
16: int, # dem (digital elevation model)
17: str, # timezone
18: str # modification date yyyy-MM-dd
}
rename_cols = {
0:"geonameid", # geonameid
1:"name", # name
2:"asciiname", # asciiname
3:"alternatenames", # alternatenames
4:"latitude", # latitude
5:"longitude", # longitude
6:"feature_class", # feature class
7:"feature_code", # feature code
8:"country_code", # country code
9:"cc2", # cc2
10:"admin1_code", # admin1 code
11:"admin2_code", # admin2 code
12:"admin3_code", # admin3 code
13:"admin4_code", # admin4 code
14:"population", # population
15:"elevation", # elevation
16:"dem", # dem (digital elevation model)
17:"timezone", # timezone
18:"modification_date" # modification date yyyy-MM-dd
}
data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
data.rename(columns=rename_cols,inplace=True)
return data
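
A minimal usage sketch for read_geonames (the path and column subset are illustrative, mirroring what the training script does):

    geoname_data = read_geonames("../data/geonamesData/FR.txt").fillna("")
    places = geoname_data[geoname_data.feature_class.isin("A P".split())]
    places = places["geonameid name longitude latitude".split()]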
def parse_title_wiki(title_wiki):
"""
Parse Wikipedia title
Parameters
----------
title_wiki : str
wikipedia title
Returns
-------
str
parsed wikipedia title
"""
return re.sub("\(.*\)","",title_wiki).strip().lower()
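
For example, parse_title_wiki("Paris (France)") drops the parenthesised part and lowercases the rest, returning "paris".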
def _split(lst,n,complete_chunk_value):
"""
Split a list into chunk of n-size.
Parameters
----------
lst : list
input list
n : int
chunk size
complete_chunk_value : object
if last chunk size not equal to n, this value is used to complete it
Returns
-------
list
chunked list
"""
chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
if not chunks:return chunks
if len(chunks[-1]) != n:
chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
return np.array(chunks)
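
For example, _split([1, 2, 3, 4, 5], 2, 0) returns array([[1, 2], [3, 4], [5, 0]]): the last chunk is padded with complete_chunk_value so that every chunk has length n.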
class Chronometer():
def __init__(self):
self.__task_begin_timestamp = {}
def start(self,task_name):
"""
Start a new task chronometer
Parameters
----------
task_name : str
task id
Raises
------
ValueError
if a running task already exists with that name
"""
if task_name in self.__task_begin_timestamp:
raise ValueError("A running task exists with the name {0}!".format(task_name))
self.__task_begin_timestamp[task_name] = time.time()
def stop(self,task_name):
"""
Stop and return the duration of the task
Parameters
----------
task_name : str
task id
Returns
-------
float
duration of the task in seconds
Raises
------
ValueError
if no task exist with the id `task_name`
"""
if not task_name in self.__task_begin_timestamp:
raise ValueError("The {0} task does not exist!".format(task_name))
duration = time.time() - self.__task_begin_timestamp[task_name]
del self.__task_begin_timestamp[task_name]
return duration
if __name__ == "__main__":
chrono = Chronometer()
chrono.start("test")
chrono.start("test2")
time.sleep(3)
print(chrono.stop("test"))
time.sleep(3)
print(chrono.stop("test2"))
\ No newline at end of file
-# Basic import
-import math
-import argparse
-import os
-import json
-
-# Data Structure
-import numpy as np
-from shapely.geometry import Point,box
-import pandas as pd
-
-# NLP
-from nltk.tokenize import word_tokenize
-from ngram import NGram
-
-# Machine learning
-from gensim.models import Word2Vec
-
-# Visualisation and parallelisation
-
-class TokenizerCustom():
-    def __init__(self,vocab):
-        self.word_index = {vocab[i]:i for i in range(len(vocab))}
-        self.index_word = {i:vocab[i] for i in range(len(vocab))}
-        self.N = len(self.index_word)
-    def texts_to_sequences(self,listText):
-        seqs = []
-        for text in listText:
-            seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index])
-        return seqs

 import geopandas as gpd
+import numpy as np
+
+from shapely.geometry import Point,box
+
 from tqdm import tqdm
+import pandas as pd, numpy as np
+from numba import njit
+from helpers import read_geonames
+from tqdm import tqdm
+from joblib import Parallel,delayed
class CoordinatesEncoder:
"""
Will be replaced by Grid in grid2.py
"""
def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
self.min_lon = -180
self.max_lon = -(self.min_lon) # Symetric
self.min_lat = -90
self.max_lat = -(self.min_lat) # Symetric
self.ecart_lat = self.max_lat - self.min_lat
self.ecart_lon = self.max_lon - self.min_lon
self.cell_size_lat = cell_size_lat
self.cell_size_lon = cell_size_lon
self.unit_size_lat = self.ecart_lat / self.cell_size_lat
self.unit_size_lon = self.ecart_lon / self.cell_size_lon
def encode(self, lat, lon):
return (
math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat),
math.floor(((lon + self.max_lon) / self.ecart_lon) * (self.unit_size_lon))
)
def number_lat_cell(self):
return int(self.unit_size_lat)
def number_lon_cell(self):
return int(self.unit_size_lon)
def oneDimensionOutputSize(self):
return self.number_lat_cell() * self.number_lon_cell()
def vector(self, lat, lon):
lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell())
new_coords = self.encode(lat, lon)
lat_v[int(new_coords[0])] = 1
lon_v[int(new_coords[1])] = 1
return lat_v, lon_v
def vector_flatten(self, lat, lon):
vec = np.zeros(self.oneDimensionOutputSize()) # 2D Dense softmax isn't possible
new_coords = self.encode(lat, lon)
pos = self.number_lat_cell() * (new_coords[0]) + new_coords[1]
vec[pos] = 1 # lon * lon size
return vec
class NgramIndex():
"""
Class used for encoding words in ngram representation
"""
def __init__(self,n):
"""
Constructor
Parameters
----------
n : int
ngram size
"""
self.ngram_gen = NGram(N=n)
self.size = n
self.ngram_index = {"":0}
self.index_ngram = {0:""}
self.cpt = 0
self.max_len = 0
def split_and_add(self,word):
"""
Split word in multiple ngram and add each one of them to the index
Parameters
----------
word : str
a word
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ngram) for ngram in ngrams]
+def haversine_pd(lon1, lat1, lon2, lat2):
+    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
+    return 6367 * 2 * np.arcsin(np.sqrt(a))

-    def add(self,ngram):
-        """
-        Add a ngram to the index
-
-        Parameters
-        ----------
-        ngram : str
-            ngram
-        """
-        if not ngram in self.ngram_index:
-            self.cpt+=1
-            self.ngram_index[ngram]=self.cpt
-            self.index_ngram[self.cpt]=ngram
-
-    def encode(self,word):
-        """
-        Return a ngram representation of a word
-
-        Parameters
-        ----------
-        word : str
-            a word
-
-        Returns
-        -------
-        list of int
-            list of ngram index
-        """
-        ngrams = word.lower().replace(" ","$")
-        ngrams = list(self.ngram_gen.split(ngrams))
-        [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
-        return [self.ngram_index[ng] for ng in ngrams]
-
-    def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
-        """
-        Complete a ngram encoded version of word with void ngram. It's necessary for neural network.
-
-        Parameters
-        ----------
-        ngram_encoding : list of int
-            first encoding of a word
-        MAX_LEN : int
-            desired length of the encoding
-        filling_item : int, optional
-            ngram index you wish to use, by default 0
-
-        Returns
-        -------
-        list of int
-            list of ngram index
-        """
-        assert len(ngram_encoding) <= MAX_LEN
-        diff = MAX_LEN - len(ngram_encoding)
-        ngram_encoding.extend([filling_item]*diff)
-        return ngram_encoding
-
-    def get_embedding_layer(self,texts,dim=100,**kwargs):
"""
Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model.
Parameters
----------
texts : list of [list of int]
list of encoded word
dim : int, optional
embedding dimension, by default 100
Returns
-------
np.array
embedding matrix
"""
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
N = len(self.ngram_index)
embedding_matrix = np.zeros((N,dim))
for i in range(N):
embedding_matrix[i] = model.wv[str(i)]
return embedding_matrix
def save(self,fn):
"""
Save the NgramIndex
Parameters
----------
fn : str
output filename
"""
data = {
"ngram_size": self.size,
"ngram_index": self.ngram_index,
"cpt_state": self.cpt,
"max_len_state": self.max_len
}
json.dump(data,open(fn,'w'))
@staticmethod
def load(fn):
"""
Load a NgramIndex state from a file.
Parameters
----------
fn : str
input filename
Returns
-------
NgramIndex
ngram index
Raises
------
KeyError
raised if a required field does not appear in the input file
"""
try:
data = json.load(open(fn))
except json.JSONDecodeError:
print("Data file must be a JSON")
for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]:
if not key in data:
raise KeyError("{0} field cannot be found in given file".format(key))
new_obj = NgramIndex(data["ngram_size"])
new_obj.ngram_index = data["ngram_index"]
new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
new_obj.cpt = data["cpt_state"]
new_obj.max_len = data["max_len_state"]
return new_obj
-def zero_one_encoding(long,lat):
-    """
-    Encode coordinates (WGS84) between 0 and 1
-
-    Parameters
-    ----------
-    long : float
-        longitude value
-    lat : float
-        latitude value
-
-    Returns
-    -------
-    float,float
-        longitude, latitude
-    """
-    return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0)

+def get_adjacent(ids,lon1, lat1, lon2, lat2,threshold):
+    dist_ = haversine_pd(lon1, lat1, lon2, lat2)
+    return ids[dist_<threshold]
+
+def get_geonames_adjacency(geoname_data,threshold):
+    return Parallel(n_jobs=-1,backend="multiprocessing")(delayed(get_adjacent)(geoname_data.geonameid.values,
+        geoname_data.longitude,
+        geoname_data.latitude,
+        row.longitude,
+        row.latitude,
+        threshold) for ix,row in tqdm(geoname_data.iterrows(),total=len(geoname_data)))
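
A small illustration of how haversine_pd and get_adjacent work together (ids and coordinates are made up for the example; Paris and Cherbourg are about 300 km apart):

    ids = np.array([1, 2])                  # illustrative identifiers
    lons = np.array([2.35, -1.62])          # approx. Paris, Cherbourg longitudes
    lats = np.array([48.85, 49.64])         # approx. Paris, Cherbourg latitudes
    close_ids = get_adjacent(ids, lons, lats, 2.35, 48.85, 400)   # ids within 400 km of Paris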
def _split(lst,n,complete_chunk_value):
"""
Split a list into chunk of n-size.
Parameters
----------
lst : list
input list
n : int
chunk size
complete_chunk_value : object
if last chunk size not equal to n, this value is used to complete it
Returns
-------
list
chunked list
"""
chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
if not chunks:return chunks
if len(chunks[-1]) != n:
chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
return np.array(chunks)
 def generate_couple(object_list):
     """
@@ -338,7 +85,24 @@ def _hash_couple(o1,o2):

-### GEO ADJAC BEGIN
+def zero_one_encoding(long,lat):
+    """
+    Encode coordinates (WGS84) between 0 and 1
+
+    Parameters
+    ----------
+    long : float
+        longitude value
+    lat : float
+        latitude value
+
+    Returns
+    -------
+    float,float
+        longitude, latitude
+    """
+    return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0)
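
For example, with approximate Paris coordinates, zero_one_encoding(2.35, 48.85) returns roughly (0.5065, 0.7714); this [0,1] range is what the sigmoid output layers of the model are trained to predict.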
 class Cell(object):
     """
     A cell is box placed in geeographical space.
@@ -349,8 +113,6 @@ class Cell(object):
     Parameters
     ----------
-    object : [type]
-        [description]
     upperleft_x : float
         upperleft longitude
     upperleft_y : float
@@ -410,6 +172,7 @@ class Cell(object):
     def __repr__(self):
         return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y)

 class Grid(object):
     """
@@ -541,74 +304,32 @@ class Grid(object):
         """
         relationships = set([])
         for c1 in tqdm(range(len(self.cells))):
-            for i in range(random_iteration):
+            for _ in range(random_iteration):
                 for t in generate_couple(list(self.cells[c1].list_object.keys())):
                     relationships.add(_hash_couple(t[0],t[1]))

         for c1 in tqdm(range(len(self.inter_cells))):
-            for i in range(random_iteration):
+            for _ in range(random_iteration):
                 for t in generate_couple(list(self.inter_cells[c1].list_object.keys())):
                     relationships.add(_hash_couple(t[0],t[1]))

         return relationships
### GEO ADJAC END
class ConfigurationReader(object):
def __init__(self,configuration_file):
if not os.path.exists(configuration_file):
raise FileNotFoundError("'{0} file could not be found ! '".format(configuration_file))
self.configuration = json.load(open(configuration_file))
self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"])
self.parser = argparse.ArgumentParser(description=self.__argparser_desc)
self.parse_conf()
def parse_conf(self):
if not "args" in self.configuration:
raise argparse.ArgumentError("","No args given in the configuration file")
for dict_args in self.configuration["args"]:
if not isinstance(dict_args,dict):
raise ValueError("Args must be dictionnary")
short_command = dict_args.get("short",None)
long_command = dict_args.get("long",None)
if not short_command and not long_command:
raise ValueError("No command name was given !")
add_func_dict_= {}
if "help" in dict_args:
add_func_dict_["help"]= dict_args["help"]
if "default" in dict_args:
add_func_dict_["default"]= dict_args["default"]
if "action" in dict_args:
add_func_dict_["action"]= dict_args["action"]
if "type" in dict_args:
add_func_dict_["type"]= eval(dict_args["type"])
if "choices" in dict_args:
add_func_dict_["choices"]= dict_args["choices"]
if not (short_command and long_command):
command = (short_command if not long_command else long_command)
self.parser.add_argument(command,**add_func_dict_)
elif long_command and short_command:
self.parser.add_argument(short_command,long_command,**add_func_dict_)
def parse_args(self,input_=None):
if not input_:
return self.parser.parse_args()
return self.parser.parse_args(input_)
+def get_adjacency_rels(geodataframe,bounds,subdiv_tuple,random_iter_adjacency):
+    g = Grid(*bounds,subdiv_tuple)
+    g.fit_data()
+    [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(geodataframe["geonameid longitude latitude".split()].iterrows(),total=len(geodataframe))]
+    return [[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(random_iter_adjacency)]
+
+def get_geonames_inclusion_rel(geonames_data,geonames_hierarchy_data_fn):
+    geonames_hierarchy_data = pd.read_csv(geonames_hierarchy_data_fn,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
+    geonamesIDS = set(geonames_data.geonameid.values)
+    filter_mask = (geonames_hierarchy_data.childId.isin(geonamesIDS) & geonames_hierarchy_data.parentId.isin(geonamesIDS))
+    return (geonames_hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())
+
+def get_bounds(geodataframe):
+    geodataframe["geometry"] = geodataframe["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
+    geodataframe = gpd.GeoDataFrame(geodataframe)
+    geodataframe["i"]=1
+    return geodataframe.dissolve("i").bounds.values[0] # Required to get adjacency relationships

-if __name__ == "__main__":
-    index = NgramIndex(3)
-    index.split_and_add("J'aime le paté")
-    encoding = index.encode("xxxyyyy")
-    index.complete(encoding,10)
\ No newline at end of file
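
A rough sketch of how these helpers chain together outside the training script (the file paths are illustrative):

    geoname_data = read_geonames("../data/geonamesData/FR.txt").fillna("")
    filtered = geoname_data["geonameid name longitude latitude".split()]
    bounds = get_bounds(filtered)                               # (min_lon, min_lat, max_lon, max_lat)
    rels = get_adjacency_rels(filtered, bounds, [360, 180], 1)  # adjacency pairs of geoname ids
    rels += get_geonames_inclusion_rel(filtered, "../data/geonamesData/hierarchy.txt")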
File moved
import json
import numpy as np
from ngram import NGram
# Machine learning
from gensim.models import Word2Vec
class NgramIndex():
"""
Class used for encoding words in ngram representation
"""
def __init__(self,n):
"""
Constructor
Parameters
----------
n : int
ngram size
"""
self.ngram_gen = NGram(N=n)
self.size = n
self.ngram_index = {"":0}
self.index_ngram = {0:""}
self.cpt = 0
self.max_len = 0
def split_and_add(self,word):
"""
Split word in multiple ngram and add each one of them to the index
Parameters
----------
word : str
a word
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ngram) for ngram in ngrams]
self.max_len = max(self.max_len,len(ngrams))
def add(self,ngram):
"""
Add a ngram to the index
Parameters
----------
ngram : str
ngram
"""
if not ngram in self.ngram_index:
self.cpt+=1
self.ngram_index[ngram]=self.cpt
self.index_ngram[self.cpt]=ngram
def encode(self,word):
"""
Return a ngram representation of a word
Parameters
----------
word : str
a word
Returns
-------
list of int
            list of ngram index
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ng) for ng in ngrams if not ng in self.ngram_index]
return self.complete([self.ngram_index[ng] for ng in ngrams],self.max_len)
def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
"""
Complete a ngram encoded version of word with void ngram. It's necessary for neural network.
Parameters
----------
ngram_encoding : list of int
first encoding of a word
MAX_LEN : int
desired length of the encoding
filling_item : int, optional
ngram index you wish to use, by default 0
Returns
-------
list of int
list of ngram index
"""
assert len(ngram_encoding) <= MAX_LEN
diff = MAX_LEN - len(ngram_encoding)
ngram_encoding.extend([filling_item]*diff)
return ngram_encoding
def get_embedding_layer(self,texts,dim=100,**kwargs):
"""
Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model.
Parameters
----------
texts : list of [list of int]
list of encoded word
dim : int, optional
embedding dimension, by default 100
Returns
-------
np.array
embedding matrix
"""
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
N = len(self.ngram_index)
embedding_matrix = np.zeros((N,dim))
for i in range(N):
embedding_matrix[i] = model.wv[str(i)]
return embedding_matrix
def save(self,fn):
"""
Save the NgramIndex
Parameters
----------
fn : str
output filename
"""
data = {
"ngram_size": self.size,
"ngram_index": self.ngram_index,
"cpt_state": self.cpt,
"max_len_state": self.max_len
}
json.dump(data,open(fn,'w'))
@staticmethod
def load(fn):
"""
Load a NgramIndex state from a file.
Parameters
----------
fn : str
input filename
Returns
-------
NgramIndex
ngram index
Raises
------
KeyError
raised if a required field does not appear in the input file
"""
try:
data = json.load(open(fn))
except json.JSONDecodeError:
print("Data file must be a JSON")
for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]:
if not key in data:
raise KeyError("{0} field cannot be found in given file".format(key))
new_obj = NgramIndex(data["ngram_size"])
new_obj.ngram_index = data["ngram_index"]
new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
new_obj.cpt = data["cpt_state"]
new_obj.max_len = data["max_len_state"]
return new_obj
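
A short usage sketch of NgramIndex (toponyms and the output path are only examples):

    index = NgramIndex(4)
    index.split_and_add("Paris")          # register the 4-grams and update max_len
    index.split_and_add("Cherbourg")
    encodings = [index.encode("Paris"), index.encode("Cherbourg")]    # padded to index.max_len
    weights = index.get_embedding_layer(encodings, dim=100, iter=50)  # Word2Vec-based matrix
    index.save("outputs/example_index")
    index = NgramIndex.load("outputs/example_index")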
# Basic import
import math
import argparse
import os
import json
# Data Structure
import numpy as np
import geopandas as gpd
from shapely.geometry import Point,box
# NLP
from nltk.tokenize import word_tokenize
from ngram import NGram
# Visualisation and parallelisation
from tqdm import tqdm
class TokenizerCustom():
def __init__(self,vocab):
self.word_index = {vocab[i]:i for i in range(len(vocab))}
self.index_word = {i:vocab[i] for i in range(len(vocab))}
self.N = len(self.index_word)
def texts_to_sequences(self,listText):
seqs = []
for text in listText:
seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index])
return seqs
class ConfigurationReader(object):
def __init__(self,configuration_file):
if not os.path.exists(configuration_file):
raise FileNotFoundError("'{0} file could not be found ! '".format(configuration_file))
self.configuration = json.load(open(configuration_file))
self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"])
self.parser = argparse.ArgumentParser(description=self.__argparser_desc)
self.parse_conf()
def parse_conf(self):
if not "args" in self.configuration:
raise argparse.ArgumentError("","No args given in the configuration file")
for dict_args in self.configuration["args"]:
if not isinstance(dict_args,dict):
raise ValueError("Args must be dictionnary")
short_command = dict_args.get("short",None)
long_command = dict_args.get("long",None)
if not short_command and not long_command:
raise ValueError("No command name was given !")
add_func_dict_= {}
if "help" in dict_args:
add_func_dict_["help"]= dict_args["help"]
if "default" in dict_args:
add_func_dict_["default"]= dict_args["default"]
if "action" in dict_args:
add_func_dict_["action"]= dict_args["action"]
if "type" in dict_args:
add_func_dict_["type"]= eval(dict_args["type"])
if "choices" in dict_args:
add_func_dict_["choices"]= dict_args["choices"]
if not (short_command and long_command):
command = (short_command if not long_command else long_command)
self.parser.add_argument(command,**add_func_dict_)
elif long_command and short_command:
self.parser.add_argument(short_command,long_command,**add_func_dict_)
def parse_args(self,input_=None):
if not input_:
return self.parser.parse_args()
return self.parser.parse_args(input_)
{
"description": "Toponym Combination",
"args": [
{ "short": "input", "help": "Corpus used to learn the embeddings" },
{ "short": "-g", "long": "--glove__dir", "default": "data/glove" },
{"long": "--max_sequence_length", "type":"int","default":15},
{"long": "--max_num_words", "type":"int","default":400000},
{"long": "--embedding_dimension", "type":"int","default":100},
{"long": "--batch_size", "type":"int","default":100},
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 }
]
}
\ No newline at end of file
@@ -7,8 +7,10 @@
     { "short": "-i", "long": "--inclusion", "action": "store_true" },
     { "short": "-a", "long": "--adjacency", "action": "store_true" },
     { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
+    { "long": "--cooc-sample-size", "type": "int", "default": 3 },
     { "long": "--adjacency-iteration", "type":"int","default":1},
     { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "long": "--ngram-word2vec-dim", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
     { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
...
@@ -2,6 +2,7 @@ from keras.models import load_model
 import tensorflow as tf
 import keras.backend as K
 from utils import NgramIndex
+import numpy as np

 from tensorflow.python.keras.backend import set_session
 from tensorflow.python.keras.models import load_model
@@ -9,7 +10,41 @@ from tensorflow.python.keras.models import load_model
 sess = None
 graph = None

-from metrics import lat_accuracy,lon_accuracy
+def lat_accuracy(LAT_TOL =1/180.):
def accuracy_at_k_lat(y_true, y_pred):
"""
Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible
task for neural network to obtain the exact coordinate.
Parameters
----------
y_true : tf.Tensor
truth data
y_pred : tf.Tensor
predicted output
"""
diff = tf.abs(y_true - y_pred)
fit = tf.dtypes.cast(tf.less(diff,LAT_TOL),tf.int64)
return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
return accuracy_at_k_lat
def lon_accuracy(LON_TOL=1/360.):
def accuracy_at_k_lon(y_true, y_pred):
"""
Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible
task for neural network to obtain the exact coordinate.
Parameters
----------
y_true : tf.Tensor
truth data
y_pred : tf.Tensor
predicted output
"""
diff = tf.abs(y_true - y_pred)
fit = tf.dtypes.cast(tf.less(diff,LON_TOL),tf.int64)
return tf.reduce_sum(fit)/tf.size(y_pred,out_type=tf.dtypes.int64)
return accuracy_at_k_lon
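
With the default tolerances (1/180 for latitude and 1/360 for longitude, both expressed in the [0,1]-encoded space), a prediction counts as correct when it falls within about one degree of the true coordinate.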
 class Geocoder(object):
     """
@@ -21,12 +56,12 @@ class Geocoder(object):
     if you want an interactive map using leafletJS, set to True the `interactive_map` parameter of `Geocoder.plot_coord()`
     """
     def __init__(self,keras_model_fn,ngram_index_file):
-        global sess
-        global graph
-        sess = tf.compat.v1.Session()
-        graph = tf.compat.v1.get_default_graph()
-        set_session(sess)
-        self.keras_model = load_model(keras_model_fn,custom_objects={"lat_accuracy":lat_accuracy,"lon_accuracy":lon_accuracy})
+        # global sess
+        # global graph
+        # sess = tf.compat.v1.Session()
+        # graph = tf.compat.v1.get_default_graph()
+        # set_session(sess)
+        self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()})
         self.ngram_encoder = NgramIndex.load(ngram_index_file)

     def get_coord(self,toponym,context_toponym):
@@ -34,9 +69,11 @@ class Geocoder(object):
         global graph
         p = self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len)
         c = self.ngram_encoder.complete(self.ngram_encoder.encode(context_toponym),self.ngram_encoder.max_len)
-        with sess.as_default():
-            with graph.as_default():
-                lon,lat = self.keras_model.predict([[p],[c]])
+        p = np.array(p)
+        c = np.array(c)
+        # with sess.as_default():
+        #     with graph.as_default():
+        lon,lat = self.keras_model.predict([[p],[c]])
         return lon[0][0],lat[0][0]

     def wgs_coord(self,lon,lat):
@@ -61,13 +98,19 @@ class Geocoder(object):
         ax.plot(lon,lat,marker='o', color='red', markersize=5)
         plt.show()

+geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5","./outputs/FR.txt_100_4_0.002_None_A_I_C_index")
+top,topc = "Paris","Cherbourg"
+lon,lat = geocoder.get_coord(top,topc)
+lon,lat = geocoder.wgs_coord(lon,lat)
+geocoder.plot_coord("{0},{1}".format(top,topc),lat,lon)

 if __name__ == "__main__":
     from flask import Flask, escape, request, render_template
     app = Flask(__name__)
-    geocoder = Geocoder("outputs/LSTM_FR.txt_20_4_0.002_None_A_I_C.h5","outputs/index_4gram_FR_backup.txt")
+    geocoder = Geocoder("outputs/LSTM_FR.txt_100_4_0.002_None_A_I_C.h5","./outputs/FR.txt_100_4_0.002_None_A_I_C_index")

     @app.route('/',methods=["GET"])
     def display():
...
@@ -13,7 +13,7 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point

-from utils import Grid
+from lib.geo import Grid

 from tqdm import tqdm
...
@@ -14,7 +14,7 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point

-from utils import Grid
+from lib.geo import Grid
 from helpers import read_geonames

 from tqdm import tqdm
...