From 637949fa23ff4e10d71f37f34a43bae08e944cc9 Mon Sep 17 00:00:00 2001
From: jfize <jacques.fize@insa-lyon.fr>
Date: Mon, 3 Feb 2020 15:04:02 +0100
Subject: [PATCH] Add train/test split on geographic criteria + NgramIndex state can be saved + add docs to utils classes and functions + add multilingual support in the Wikidata extraction + update README and requirements

---
 .gitignore | 1 +
 1_extractDataFromWikidata.py | 25 +-
 README.md | 37 +-
 combination_embeddings.py | 88 ++++-
 .../toponym_combination_embedding.json | 6 +-
 requirements.txt | 5 +-
 train_test_split_geonames.py | 83 +++++
 utils.py | 347 ++++++++++++++++--
 8 files changed, 518 insertions(+), 74 deletions(-)
 create mode 100644 train_test_split_geonames.py

diff --git a/.gitignore b/.gitignore
index 9f23506..d004fc1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,3 +143,4 @@ WikipediaExtract/*
 test_comb.sh
 .vscode/*
+notes.md
\ No newline at end of file
diff --git a/1_extractDataFromWikidata.py b/1_extractDataFromWikidata.py
index 64c777c..3048c2c 100644
--- a/1_extractDataFromWikidata.py
+++ b/1_extractDataFromWikidata.py
@@ -34,17 +34,20 @@ def job(line):
     try:
         data = json.loads(line.strip(",\n"))
         if "sitelinks" in data and "claims" in data:
-            if "enwiki" in data["sitelinks"]:
-                id_ = data["id"]
-                coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
-                title = data["sitelinks"]["enwiki"]["title"]
-                url = "https://en.wikipedia.org/wiki/{0}".format(title.replace(" ","_"))
-                lat = coords_data["latitude"]
-                lon = coords_data["longitude"]
-                classes_ = ""
-                for claimP31 in data["claims"]["P31"]:
-                    classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
-                output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_")))
+            if "enwiki" in data["sitelinks"] or "frwiki" in data["sitelinks"]:
+                page_available = [i for i in ["en","fr"] if i+"wiki" in data["sitelinks"]]
+                for site in page_available:
+                    id_ = data["id"]
+                    coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
+                    title = data["sitelinks"]["{0}wiki".format(site)]["title"]
+                    url = "https://{1}.wikipedia.org/wiki/{0}".format(title.replace(" ","_"),site)
+                    lat = coords_data["latitude"]
+                    lon = coords_data["longitude"]
+                    classes_ = ""
+                    for claimP31 in data["claims"]["P31"]:
+                        classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
+                    output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_")))
     except Exception: # First Line is "['" and last line is "]'"
         pass
diff --git a/README.md b/README.md
index de19b3a..84d5178 100644
--- a/README.md
+++ b/README.md
@@ -88,19 +88,32 @@ Gensim word2vec format is saved in the execution directory.
 
 ## Embedding : train using concatenation of close places
 
+### Prepare required data
+
+ * download the Geonames data used to train the network [here](http://download.geonames.org/export/dump/)
+ * download the hierarchy data [here](http://download.geonames.org/export/dump/hierarchy.zip)
+ * unzip both files in the directory of your choice
+ * run the script `train_test_split_geonames.py <geoname_filename>`
 
-Toponym Combination
+### Train the network
 
-positional arguments:
-  geoname_input         Filepath of the Geonames file you want to use.
-  geoname_hierachy_input
-                        Filepath of the Geonames file you want to use.
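Illustrative sketch (not part of the commit): the split step listed under "Prepare required data" writes `<geoname_file>_train.csv` and `<geoname_file>_test.csv` next to the input file (see `train_test_split_geonames.py` further down in this patch). A minimal way to consume that output might look as follows, assuming the example path `data/geonamesData/FR.txt` used elsewhere in this commit:

```python
# Sketch only: load the train/test split produced by train_test_split_geonames.py.
# The script appends "_train.csv" and "_test.csv" to the Geonames file path it was given.
import pandas as pd

geoname_fn = "data/geonamesData/FR.txt"   # example path, adjust to your layout
train = pd.read_csv(geoname_fn + "_train.csv", index_col=0)
test = pd.read_csv(geoname_fn + "_test.csv", index_col=0)
print(len(train), "training places /", len(test), "test places")
```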
+The script `combination_embeddings.py` is responsible for training the neural network.
 
-optional arguments:
-  -h, --help            show this help message and exit
-  -v, --verbose
-  -n NGRAM_SIZE, --ngram-size NGRAM_SIZE
-  -t TOLERANCE_VALUE, --tolerance-value TOLERANCE_VALUE
-  -e EPOCHS, --epochs EPOCHS
-  -m {CNN,LSTM}, --model {CNN,LSTM}
\ No newline at end of file
+To train the network with default parameters, use the following command:
+
+    python3 combination_embeddings.py -a -i <geoname data filename> <hierarchy geonames data filename>
+
+### Available parameters
+
+| Parameter            | Description                                                          |
+|----------------------|----------------------------------------------------------------------|
+| -i,--inclusion       | Use inclusion relationships to train the network                     |
+| -a,--adjacency       | Use adjacency relationships to train the network                     |
+| -w,--wikipedia-cooc  | Use Wikipedia place co-occurrences to train the network              |
+| -n,--ngram-size      | ngram size                                                           |
+| -t,--tolerance-value | K-value in the computation of the accuracy@k                         |
+| -e,--epochs          | number of epochs                                                     |
+| -d,--dimension       | size of the ngram embeddings                                         |
+| -m,--model           | Neural Network architecture used                                     |
+| --admin_code_1       | (Optional) If you wish to train the network on a specific region     |
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 72a8f3b..7913784 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -1,4 +1,5 @@
 # Base module
+import re
 import os
 import sys
 from argparse import ArgumentParser
@@ -40,6 +41,25 @@ def tqdm(*args, **kwargs):
     for instance in list(tqdm_base._instances):
         tqdm_base._decr_instances(instance)
     return tqdm_base(*args, **kwargs)
+
+
+def parse_title_wiki(title_wiki):
+    return re.sub(r"\(.*\)","",title_wiki).strip().lower()
+
+def get_new_ids(cooc_data,id_first_value):
+    topo_id = {}
+    id_ = id_first_value
+    for title in cooc_data.title.values:
+        if not title in topo_id.values():
+            id_+=1
+            topo_id[id_]=title
+    for interlinks in cooc_data.interlinks.values:
+        for interlink in interlinks.split("|"):
+            if not interlink in topo_id.values():
+                id_+=1
+                topo_id[id_]=interlink
+    return topo_id
+
 # Logging
 import logging
 from chrono import Chronometer
@@ -50,7 +70,7 @@ logging.basicConfig(
     )
 chrono = Chronometer()
 
-args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()#("-i -a -n 2 -t 0.002 -e 5 -m CNN data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()#("--admin_code_1 94 -n 2 -t 0.002 -e 100 -m LSTM -a -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
 
 GEONAME_FN = args.geoname_input
 GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
@@ -65,6 +85,11 @@ if args.model == "CNN":
 else:
     LSTM_train = True
 
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+
 # LOAD DATA
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
@@ -78,45 +103,74 @@ logging.info("Geonames data loaded!")
 
 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
 
+admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split()
+region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1
+if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth:
+    filtered = filtered[filtered.admin1_code ==
args.admin_code_1].copy() + # Geometry operation filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) filtered = gpd.GeoDataFrame(filtered) filtered["i"]=1 bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships +geoname2name = dict(filtered["geonameid name".split()].values) + + -rel_dict ={} +rel_store = [] if args.adjacency: - logging.info("Retrieve inclusion relationships ! ") - fn = "data/geonamesData/{0}_{1}_adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY) + # RETRIEVE ADJACENCY REL + logging.info("Retrieve adjacency relationships ! ") + fn = "data/geonamesData/{0}_{1}{2}adjacency.json".format(GEONAME_FN.split("/")[-1],ITER_ADJACENCY,region_fn) if not os.path.exists(fn): g = Grid(*bounds,[360,180]) g.fit_data(filtered) [g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered["geonameid longitude latitude".split()].iterrows(),total=len(filtered))] - rel_dict.update(dict([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)])) - json.dump(rel_dict,open(fn,'w')) + rel_store.extend([[int(i) for i in r.split("|")] for r in g.get_adjacent_relationships(ITER_ADJACENCY)]) + json.dump(rel_store,open(fn,'w')) else: logging.info("Open and load data from previous computation!") - rel_dict.update({int(k):int(v) for k,v in json.load(open(fn)).items()}) - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_dict))) + rel_store=[[int(couple[0]),int(couple[1])] for couple in json.load(open(fn))] + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) if args.inclusion: # RETRIEVE INCLUSION RELATIONSHIPS logging.info("Retrieve inclusion relationships ! ") - geoname2name = dict(filtered["geonameid name".split()].values) filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name)) - rel_dict.update(dict(hierarchy_data[filter_mask]["childId parentId".split()].values)) + rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())) logging.info("{0} inclusion relationships retrieved ! 
".format(len(hierarchy_data[filter_mask]))) +if args.wikipedia_cooc: + cooc_data = pd.read_csv("./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1],sep="\t") + cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) + cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) + id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max()) + wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} + title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()} + # ENCODING NAME USING N-GRAM SPLITTING logging.info("Encoding toponyms to ngram...") index = NgramIndex(NGRAM_SIZE) filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length + +if args.wikipedia_cooc: + [index.split_and_add(x) for x in id_wikipediatitle.values()] + idwiki_encoded = {id_: index.encode(toponym) for id_,toponym in id_wikipediatitle.items()} + max_len = max(max_len,max([len(enc) for _,enc in idwiki_encoded.items()])) + +index.max_len = int(max_len) # For Index state dump + filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association + +if args.wikipedia_cooc: + idwiki_encoded = {id_: index.complete(enc,max_len) for id_,enc in idwiki_encoded.items()} + +index.save("outputs/index_{0}gram_{1}".format(NGRAM_SIZE,GEONAME_FN.split("/")[-1])) logging.info("Done !") #CLEAR RAM @@ -129,6 +183,8 @@ filtered["cell_vec"]=filtered.apply( axis=1 ) geoname_vec = dict(filtered["geonameid cell_vec".split()].values) +if args.wikipedia_cooc: + wikipediaid_vec = {wikipediatitle_id[title]: zero_one_encoding(*title_coord[title]) for title in cooc_data.title.values} # CLEAR RAM del filtered @@ -142,9 +198,8 @@ logging.info("Preparing Input and Output data...") X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] -for geonameId_1,geonameId_2 in rel_dict.items(): - if not geonameId_2 in rel_dict: - continue +for couple in rel_store: + geonameId_1,geonameId_2 = couple[0],couple[1] top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] if geonameId_1 in train_indices: #and geonameId_2 in train_indices: @@ -190,7 +245,7 @@ def accuracy_at_k(y_true, y_pred): fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE)) return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred) -name = "{0}_{1}_{2}_{3}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE) +name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) if args.adjacency: name+="_A" if args.inclusion: @@ -263,4 +318,7 @@ if CONV : history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) hist_df = pd.DataFrame(history.history) -hist_df.to_csv("outputs/{0}.csv".format(name)) \ No newline at end of file +hist_df.to_csv("outputs/{0}.csv".format(name)) + +model.save("outputs/"+name+".h5") + diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json index 22249bb..9f3fe94 100644 --- 
a/parser_config/toponym_combination_embedding.json +++ b/parser_config/toponym_combination_embedding.json @@ -6,11 +6,13 @@ { "short": "-v", "long": "--verbose", "action": "store_true" }, { "short": "-i", "long": "--inclusion", "action": "store_true" }, { "short": "-a", "long": "--adjacency", "action": "store_true" }, - {"long": "--adjacency-iteration", "type":"int","default":10}, + { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, + {"long": "--adjacency-iteration", "type":"int","default":50}, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, - { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" } + { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }, + { "long": "--admin_code_1", "default": "None" } ] } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8919e09..c5b83fd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ -#pyroutelib3 +pyroutelib3 node2vec -#osrm +osrm geopandas pandas numpy @@ -15,3 +15,4 @@ keras ngram shapely sqlitedict +nltk \ No newline at end of file diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py new file mode 100644 index 0000000..d8f8962 --- /dev/null +++ b/train_test_split_geonames.py @@ -0,0 +1,83 @@ +import argparse + +import numpy as np +import pandas as pd +import geopandas as gpd + +import logging +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +from sklearn.model_selection import train_test_split +from shapely.geometry import Point + +from utils import Grid +from helpers import read_geonames + +from tqdm import tqdm + +parser = argparse.ArgumentParser() +parser.add_argument("geoname_file") +parser.add_argument("--feature_classes",help="List of class",default="A P") + +args = parser.parse_args()#("data/geonamesData/FR.txt".split()) + +# LOAD DATAgeopandas +GEONAME_FN = args.geoname_file +FEATURE_CLASSES = args.feature_classes + + +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin(FEATURE_CLASSES.split())].copy() # Only take area and populated places + +# World Shape bounds +world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) +world["nn"] = 1 +dissolved = world.dissolve(by="nn").iloc[0].geometry + +#Creating Grid +logging.info("Initializing Grid (360,180)...") +g = Grid(*dissolved.bounds,[360,180]) +logging.info("Fit Data to the Grid...") +g.fit_data(filtered) +logging.info("Placing place into the grid...") +[g+(int(row.geonameid),row.latitude,row.longitude) for ix,row in tqdm(filtered.iterrows(),total=len(filtered))] + +#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME +logging.info("Associate a cell number to each place in the Geoname Dataframe") +def foo(g,id_): + for ix,cell in enumerate(g.cells): + if id_ in cell.list_object: + return ix + +filtered["cat"] = filtered.geonameid.apply(lambda x:foo(g,x)) + +# TRAIN AND TEST SPLIT +logging.info("Split Between Train and Test") + +# Cell can be empty +i=0 +while 1: + if 
len(filtered[filtered.cat == i])> 1: + X_train,X_test = train_test_split(filtered[filtered.cat == i]) + break + i+=1 + +for i in range(i+1,len(g.cells)): + try: + x_train,x_test = train_test_split(filtered[filtered.cat == i]) + X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) + except: + pass #print("Error",len(filtered[filtered.cat == i])) + +# SAVING THE DATA +logging.info("Saving Output !") +X_train.to_csv(GEONAME_FN+"_train.csv") +X_test.to_csv(GEONAME_FN+"_test.csv") \ No newline at end of file diff --git a/utils.py b/utils.py index a468525..37eb4a7 100644 --- a/utils.py +++ b/utils.py @@ -1,16 +1,24 @@ import math import numpy as np -from stop_words import get_stop_words from nltk.tokenize import word_tokenize import textwrap from ngram import NGram + import argparse import os import json + from tqdm import tqdm +import geopandas as gpd + +from keras.layers import Embedding +from gensim.models import Word2Vec + +from joblib import Parallel,delayed +from shapely.geometry import Point,box class TokenizerCustom(): @@ -25,40 +33,113 @@ class TokenizerCustom(): return seqs -from keras.layers import Embedding -from gensim.models import Word2Vec + class NgramIndex(): + """ + Class used for encoding words in ngram representation + """ def __init__(self,n): - self.ngram_gen = NGram(N=n) + """ + Constructor + Parameters + ---------- + n : int + ngram size + """ + self.ngram_gen = NGram(N=n) + + self.size = n self.ngram_index = {} self.index_ngram = {} self.cpt = 0 self.max_len = 0 + def split_and_add(self,word): + """ + Split word in multiple ngram and add each one of them to the index + + Parameters + ---------- + word : str + a word + """ ngrams = word.lower().replace(" ","$") ngrams = list(self.ngram_gen.split(ngrams)) [self.add(ngram) for ngram in ngrams] def add(self,ngram): + """ + Add a ngram to the index + + Parameters + ---------- + ngram : str + ngram + """ if not ngram in self.ngram_index: self.cpt+=1 self.ngram_index[ngram]=self.cpt self.index_ngram[self.cpt]=ngram def encode(self,word): + """ + Return a ngram representation of a word + + Parameters + ---------- + word : str + a word + + Returns + ------- + list of int + list of ngram index + """ ngrams = word.lower().replace(" ","$") ngrams = list(self.ngram_gen.split(ngrams)) [self.add(ng) for ng in ngrams if not ng in self.ngram_index] return [self.ngram_index[ng] for ng in ngrams] def complete(self,ngram_encoding,MAX_LEN,filling_item=0): + """ + Complete a ngram encoded version of word with void ngram. It's necessary for neural network. + + Parameters + ---------- + ngram_encoding : list of int + first encoding of a word + MAX_LEN : int + desired length of the encoding + filling_item : int, optional + ngram index you wish to use, by default 0 + + Returns + ------- + list of int + list of ngram index + """ assert len(ngram_encoding) <= MAX_LEN diff = MAX_LEN - len(ngram_encoding) ngram_encoding.extend([filling_item]*diff) return ngram_encoding def get_embedding_layer(self,texts,dim=100,**kwargs): + """ + Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model. 
+ + Parameters + ---------- + texts : list of [list of int] + list of encoded word + dim : int, optional + embedding dimension, by default 100 + + Returns + ------- + np.array + embedding matrix + """ model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs) N = len(self.ngram_index) embedding_matrix = np.zeros((N,dim)) @@ -66,19 +147,116 @@ class NgramIndex(): embedding_matrix[i] = model.wv[str(i)] return embedding_matrix + def save(self,fn): + """ + + Save the NgramIndex + + Parameters + ---------- + fn : str + output filename + """ + data = { + "ngram_size": self.size, + "ngram_index": self.ngram_index, + "cpt_state": self.cpt, + "max_len_state": self.max_len + } + json.dump(data,open(fn,'w')) + + @staticmethod + def load(fn): + """ + + Load a NgramIndex state from a file. + + Parameters + ---------- + fn : str + input filename + + Returns + ------- + NgramIndex + ngram index + + Raises + ------ + KeyError + raised if a required field does not appear in the input file + """ + try: + data = json.load(open(fn)) + except json.JSONDecodeError: + print("Data file must be a JSON") + for key in ["ngram_size","ngram_index","cpt_state","max_len_state"]: + if not key in data: + raise KeyError("{0} field cannot be found in given file".format(key)) + new_obj = NgramIndex(data["ngram_size"]) + new_obj.ngram_index = data["ngram_index"] + new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()} + new_obj.cpt = data["cpt_state"] + new_obj.max_len = data["max_len_state"] + return new_obj + + def zero_one_encoding(long,lat): + """ + Encode coordinates (WGS84) between 0 and 1 + + Parameters + ---------- + long : float + longitude value + lat : float + latitude value + + Returns + ------- + float,float + longitude, latitude + """ return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0) def _split(lst,n,complete_chunk_value): + """ + Split a list into chunk of n-size. + + Parameters + ---------- + lst : list + input list + n : int + chunk size + complete_chunk_value : object + if last chunk size not equal to n, this value is used to complete it + + Returns + ------- + list + chunked list + """ chunks = [lst[i:i + n] for i in range(0, len(lst), n)] if not chunks:return chunks if len(chunks[-1]) != n: chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) return np.array(chunks) - - def generate_couple(object_list): + """ + Return a randomly selected couple from an object list. + + Parameters + ---------- + object_list : list + object list + + Returns + ------- + list + list of coupled object + """ couples = [] lst = np.arange(len(object_list)) for _ in range(len(object_list)): @@ -93,22 +271,73 @@ def generate_couple(object_list): return couples def _hash_couple(o1,o2): + """ + Return an hash for two object ids. + + Parameters + ---------- + o1 : str or int + id of the first objeeect + o2 : str of int + id of the second object + + Returns + ------- + str + hash + """ return "|".join(map(str,sorted([int(o1),int(o2)]))) ### GEO ADJAC BEGIN -from joblib import Parallel,delayed -from shapely.geometry import Point,box - class Cell(object): - def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y): + """ + A cell is box placed in geeographical space. 
+ """ + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,x,y): + """ + Constructor + Parameters + ---------- + object : [type] + [description] + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + x : int + cell x coordinates in the grid + y : int + cell y coordinates in the grid + """ self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y self.box_ = box(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) self.list_object={} # {id:Point(coord)} - + + self.x,self.y = x, y + def contains(self,lat,lon): + """ + Return true if the cell contains a point at given coordinates + + Parameters + ---------- + lat : float + latitude + lon : float + longitude + + Returns + ------- + bool + true if contains + """ x,y = lon,lat if x < self.upperleft_x or x > self.bottomright_x: return False @@ -117,13 +346,45 @@ class Cell(object): return True def add_object(self,id_,lat,lon): + """ + Connect an object to the cell + + Parameters + ---------- + id_ : int + id + lat : float + latitude + lon : float + longitude + """ self.list_object[id_] = Point(lon,lat) def __repr__(self): return "upperleft:{0}_{1}_;bottom_right:{2}_{3}".format(self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y) class Grid(object): + """ + Define a grid + + """ def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,cell_sub_div_index=[100,50]): + """ + Constructor + + Parameters + ---------- + upperleft_x : float + upperleft longitude + upperleft_y : float + upperleft latitude + bottomright_x : float + bottom right longitude + bottomright_y : float + bottom right latitude + cell_sub_div_index : list, optional + number of division in both latitude and longitude axis (longitude first), by default [100,50] + """ self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y self.x_r = abs(self.bottomright_x - self.upperleft_x)/cell_sub_div_index[0] @@ -142,7 +403,7 @@ class Grid(object): self.upperleft_y+i*self.y_r, self.upperleft_x+((j+1)*self.x_r), self.upperleft_y+((i+1)*self.y_r), - ) + j,i) ) dec_y = 0 for i in range(cell_sub_div_index[1]): @@ -153,34 +414,45 @@ class Grid(object): self.upperleft_x+(j*self.x_r)-self.c_x_r, # TOP self.upperleft_y+(i*self.y_r)-dec_y, self.upperleft_x+((j+1)*self.x_r)-self.c_x_r,#(self.u_pos*self.c_x_r), - self.upperleft_y+((i+1)*self.y_r)+self.c_y_r#(self.u_neg*self.c_y_r), - ) + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) ) self.inter_cells[-1].append(Cell( self.upperleft_x+(j*self.x_r)-self.c_x_r, # CENTER self.upperleft_y+(i*self.y_r)-self.c_y_r, self.upperleft_x+((j+1)*self.x_r)+self.c_x_r, self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, - ) + j,i) ) self.inter_cells[-1].append(Cell( self.upperleft_x+(j*self.x_r)+dec_x, # CENTER self.upperleft_y+(i*self.y_r)-self.c_y_r, self.upperleft_x+((j+1)*self.x_r)-self.c_x_r, #LEFT - self.upperleft_y+((i+1)*self.y_r)+self.c_y_r - ) + self.upperleft_y+((i+1)*self.y_r)+self.c_y_r, + j,i) ) dec_x = self.c_x_r dec_y = self.c_y_r - def fit_data(self,data): - data["nn"] = 1 - dissolved = data.dissolve(by="nn") + def fit_data(self,data = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))): + """ + + To avoid unnecessary check when connecting an entity to one or multiple cells, we + filter cells that does not 
appears in our geographic context (here countries surface). + + Parameters + ---------- + data : GeoDataFrame + geographic context + """ + world = data + world["nn"] = 1 + dissolved = world.dissolve(by="nn").iloc[0].geometry new_cells= [] new_inter_cells=[] for i in tqdm(range(len(self.cells))): for j in range(len(self.cells[i])): - if dissolved.intersects(self.cells[i][j].box_).all(): + if dissolved.intersects(self.cells[i][j].box_): new_cells.append(self.cells[i][j]) new_inter_cells.extend(self.inter_cells[i][j*3:(j+1)*3]) @@ -188,7 +460,15 @@ class Grid(object): self.inter_cells = new_inter_cells - def __add__(self,a): + def __add__(self,a): + """ + Add an object to the grid + + Parameters + ---------- + a : tuple + (id, latitude, longitude) + """ for c1 in range(len(self.cells)): if self.cells[c1].contains(a[1],a[2]): self.cells[c1].add_object(*a) @@ -199,6 +479,19 @@ class Grid(object): break def get_adjacent_relationships(self,random_iteration=10): + """ + Return a list of adjacent relationships founds in each cell. + + Parameters + ---------- + random_iteration : int, optional + number of iteration for random selection of adjacency relationships, by default 10 + + Returns + ------- + list + adjacency relationships + """ relationships = set([]) for c1 in tqdm(range(len(self.cells))): for i in range(random_iteration): @@ -214,12 +507,6 @@ class Grid(object): ### GEO ADJAC END - - -import argparse -import os -import json - class ConfigurationReader(object): def __init__(self,configuration_file): if not os.path.exists(configuration_file): @@ -273,10 +560,6 @@ class ConfigurationReader(object): if __name__ == "__main__": - q = Quadtree(-180,90,180,-90) - hash_ = q.encode((1.2,1.3)) - q.decode(hash_) - index = NgramIndex(3) index.split_and_add("J'aime le paté") -- GitLab
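Illustrative sketch (not part of the commit): the `utils.py` hunk above adds `save`/`load` methods so an `NgramIndex` state can be persisted between runs, as `combination_embeddings.py` now does when it writes `outputs/index_<n>gram_<file>`. A minimal round-trip, assuming `utils.py` from this commit is importable and using a hypothetical filename, could look like this:

```python
# Sketch only: persist and restore an NgramIndex (utils.py from this commit).
from utils import NgramIndex

index = NgramIndex(2)                                 # bigram index, the default --ngram-size
index.split_and_add("Paris")                          # register the toponym's ngrams
encoded = index.complete(index.encode("Paris"), 10)   # pad the encoding to a fixed length
index.max_len = 10                                    # stored in the dump, as in combination_embeddings.py

index.save("index_2gram_example.json")                # hypothetical output filename
restored = NgramIndex.load("index_2gram_example.json")

assert restored.encode("Paris") == index.encode("Paris")
print(encoded)
```

The dump is plain JSON holding `ngram_size`, `ngram_index`, `cpt_state` and `max_len_state`, so it can also be inspected by hand.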