diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..24a8e87939aa53cdd833f6be7610cb4972e063ad
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index d004fc118703ea29ec3f728c3f4d1220ccf8d795..589ecbd98650129ffab8c9cb9447c47966228249 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,4 +143,6 @@ WikipediaExtract/*
 test_comb.sh
 
 .vscode/*
-notes.md
\ No newline at end of file
+notes.md
+
+.idea/*
\ No newline at end of file
diff --git a/3_samplingLearningDataset.py b/3_samplingLearningDataset.py
deleted file mode 100644
index 2d512ce71cf324213f2e6e67ce74f495b28d35e0..0000000000000000000000000000000000000000
--- a/3_samplingLearningDataset.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import subprocess,os,json
-import numpy as np
-import time
-
-import logging
-logging.basicConfig(
-    format='[%(asctime)s][%(levelname)s] %(message)s ', 
-    datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
-    )
-
-from tqdm import tqdm
-
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("corpus_filename")
-parser.add_argument("sampling_size",type=int)
-parser.add_argument("output_filename")
-
-args= parser.parse_args()
-
-CORPUS_FILENAME = args.corpus_filename
-SAMPLE_SIZE = args.sampling_size
-
-# Compute the size of input corpus 
-logging.info("Computing the corpus size...")
-wc_l = subprocess.check_output(["wc","-l", CORPUS_FILENAME ])
-NUMBER_OF_INPUT=int(wc_l.strip().split()[-2])
-logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT))
-
-# Sampling
-logging.info("Sampling...")
-arr_ = np.arange(NUMBER_OF_INPUT)
-sample = np.random.choice(arr_,SAMPLE_SIZE)
-
-# Prepare Output file
-output = open(args.output_filename,'w')
-
-# Writing in the output file
-logging.info("Writing the output...")
-for ix,line in tqdm(enumerate(open(CORPUS_FILENAME)),total=NUMBER_OF_INPUT):
-    if ix in sample:
-        output.write(line)
-
-logging.info("Done !")
-
-
diff --git a/README.md b/README.md
index 84d51789db8d8b0f64c18d74dd52ecb20a6e27fb..c9fc710b441da4c643d446a6f900945164e0e60e 100644
--- a/README.md
+++ b/README.md
@@ -3,18 +3,22 @@
  - Python3.6+
  - Os free (all dependencies work on Windows !)
 
+It is strongly advised to use Anaconda in a Windows environment!
+
 ## Install dependencies
 
     pip3 install -r requirements.txt
 
+For Anaconda users
+
+    while read requirement; do conda install --yes $requirement; done < requirements.txt
+
 # Different approaches execution
 
 ## Embedding using places Wikipedia pages
 
-Three scripts need to be used : 
- * 1_extractDataFromWikidata.py
- * 2_extractLearningDataset.py
- * 4_embeddings_lat_lon_type.py
+![first_approach](documentation/imgs/first_approach.png)
+
 
 ### Step 1: Parse Wikipedia data !
 
@@ -62,6 +66,8 @@ The different outputs (on for each neural network architecture) are put in the `
 
 ## Geonames place embedding
 
+![second_approach](documentation/imgs/second_approach.png)
+
 First, download the Geonames dump here : https://download.geonames.org/export/dump/ 
 
 *N.B.* We advise you to take only the data from one country ! (Adjacency graph need a lot of RAM).
@@ -87,6 +93,7 @@ Gensim word2vec format is saved in the execution directory.
 
 ## Embedding : train using concatenation of close places
 
+![third_approach](documentation/imgs/third_approach.png)
 
 ### Prepare required data
 
diff --git a/chrono.py b/chrono.py
deleted file mode 100644
index 8e0adfd61f63449bce232df919402448735db4b6..0000000000000000000000000000000000000000
--- a/chrono.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import time
-
-class Chronometer():
-    def __init__(self):
-        self.__task_begin_timestamp = {}
-
-    def start(self,task_name):
-        """
-        Start a new task chronometer
-        
-        Parameters
-        ----------
-        task_name : str
-            task id
-        
-        Raises
-        ------
-        ValueError
-            if a running task already exists with that name
-        """
-        if task_name in self.__task_begin_timestamp:
-            raise ValueError("A running task exists with the name {0}!".format(task_name))
-        self.__task_begin_timestamp[task_name] = time.time()
-
-    def stop(self,task_name):
-        """
-        Stop and return the duration of the task
-        
-        Parameters
-        ----------
-        task_name : str
-            task id
-        
-        Returns
-        -------
-        float
-            duration of the task in seconds
-        
-        Raises
-        ------
-        ValueError
-            if no task exist with the id `task_name`
-        """
-        if not task_name in self.__task_begin_timestamp:
-             raise ValueError("The {0} task does not exist!".format(task_name))
-        duration = time.time() - self.__task_begin_timestamp[task_name]
-        del self.__task_begin_timestamp[task_name]
-        return duration
-
-if __name__ == "__main__":
-    chrono = Chronometer()
-    chrono.start("test")
-    chrono.start("test2")
-    time.sleep(3)
-    print(chrono.stop("test"))
-    time.sleep(3)
-    print(chrono.stop("test2"))
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 7913784f82da7a7fa04aef23e23ecf864c619464..94ee452266826d6f2815bf74ddb257bc5e8ae4c4 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -62,7 +62,7 @@ def get_new_ids(cooc_data,id_first_value):
 
 # Logging
 import logging
-from chrono import Chronometer
+from helpers import Chronometer
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ', 
     datefmt='%m/%d/%Y %I:%M:%S %p',
diff --git a/documentation/imgs/first_approach.png b/documentation/imgs/first_approach.png
new file mode 100644
index 0000000000000000000000000000000000000000..297c1a5025d993acfae6e501d88acac24dfc7e59
--- /dev/null
+++ b/documentation/imgs/first_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a243605f4d58dee8bad4a18845ab78ca2319049e633b35e6a89540add684be8
+size 298011
diff --git a/documentation/imgs/second_approach.png b/documentation/imgs/second_approach.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5e693fbaf11113de2673b366d4bf603047239c2
--- /dev/null
+++ b/documentation/imgs/second_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bab13df1e420e97a08977aa38382076491a8294d85b7daa0a10d69a36a52fc0
+size 457738
diff --git a/documentation/imgs/third_approach.png b/documentation/imgs/third_approach.png
new file mode 100644
index 0000000000000000000000000000000000000000..d96596ad9ee35b8ada81b0a68535e593ed8e1a0e
--- /dev/null
+++ b/documentation/imgs/third_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad7cbed2e748b814c38eb070d29a23f7417469169a56aeb0e660a743e00430fd
+size 31104
diff --git a/4_embeddings_lat_lon_type.py b/embeddings_lat_lon_type.py
similarity index 99%
rename from 4_embeddings_lat_lon_type.py
rename to embeddings_lat_lon_type.py
index e674e508052df416b5378c3b79a6d6cd0de3451b..d5778f6aefcfebe7e1d970e4520feb95cee8a8a8 100644
--- a/4_embeddings_lat_lon_type.py
+++ b/embeddings_lat_lon_type.py
@@ -39,7 +39,7 @@ from utils import CoordinatesEncoder,TokenizerCustom,_split
 
 # Logging
 import logging
-from chrono import Chronometer
+from helpers import Chronometer
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ', 
     datefmt='%m/%d/%Y %I:%M:%S %p',
diff --git a/evalgeonamesembeddings.py b/evalgeonamesembeddings.py
deleted file mode 100644
index c7d346dd4a58940a1c0beb1e5f3a5782489b52bd..0000000000000000000000000000000000000000
--- a/evalgeonamesembeddings.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Evaluation process
-import gensim
-import glob
-import re
-import gensim
-import random
-from helpers import *
-from scipy.spatial.distance import cosine
-from shapely.geometry import Point
-from scipy.stats.stats import pearsonr
-
-import pandas as pd
-import geopandas as gpd
-
-from tqdm import tqdm
-
-NPAIR = 100000
-fns = glob.glob("data/embeddings/*.bin")
-
-def get_data(fn):
-    data = [int(x) for x in re.findall("\d+",fn)]
-    if not len(data) == 4:
-        return {"embedding_size":data[0],
-        "walk_length":data[1],
-        "number_of_walks":data[2],
-        "word2vec_window_size":data[3],
-        "filepath":fn,
-        "noise":data[4]
-        }
-        #raise Exception("filename should have 4 integers")
-    return {
-        "embedding_size":data[0],
-        "walk_length":data[1],
-        "number_of_walks":data[2],
-        "word2vec_window_size":data[3],
-        "filepath":fn
-    }
-    
-df = read_geonames("./data/geonamesData/FR.txt")
-df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1)
-
-# Create GeoDataFrame for faster spatial comparison operations
-gdf = gpd.GeoDataFrame(df)
-
-# Select a sample that concerns the departement "La Manche"
-manche_gdf = gdf[gdf.admin2_code == "50"].copy()
-
-df =pd.DataFrame([get_data(fn) for fn in fns])
-
-def get_pearsons(model):
-    manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid
-    coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values)
-    places = list(coords.keys())
-    geodesic_d = []
-    embeddings_d = []
-    for i in tqdm(range(NPAIR),disable=True):
-        placeA=random.choice(places)
-        placeB=random.choice(places)
-        geodesic_d.append(coords[placeA].distance(coords[placeB]))
-        embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)]))
-    return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value
-
-df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0])
-df.fillna(0,inplace=True)
-df.plot.scatter(x="walk_length", y="pearson",c="noise",cmap='inferno')
-plt.show()
-df.plot.scatter(x="number_of_walks", y="pearson",c="noise",cmap='inferno')
-plt.show()
-df.plot.scatter(x="word2vec_window_size", y="pearson",c="noise",cmap='inferno')
-plt.show()
\ No newline at end of file
diff --git a/1_extractDataFromWikidata.py b/extractDataFromWikidata.py
similarity index 100%
rename from 1_extractDataFromWikidata.py
rename to extractDataFromWikidata.py
diff --git a/2_extractLearningDataset.py b/extractLearningDataset.py
similarity index 100%
rename from 2_extractLearningDataset.py
rename to extractLearningDataset.py
diff --git a/helpers.py b/helpers.py
index c1e1f34178bcd39c5f49f90e4d1fd2d9f3cbf803..554e62033471e11f2587d1f6dec879dfd75cc2e1 100644
--- a/helpers.py
+++ b/helpers.py
@@ -88,3 +88,60 @@ def save_embedding(model,tokenizer,layer_idx,fn):
             f.write('\n')
 
 
+import time
+
+class Chronometer():
+    def __init__(self):
+        self.__task_begin_timestamp = {}
+
+    def start(self,task_name):
+        """
+        Start a new task chronometer
+        
+        Parameters
+        ----------
+        task_name : str
+            task id
+        
+        Raises
+        ------
+        ValueError
+            if a running task already exists with that name
+        """
+        if task_name in self.__task_begin_timestamp:
+            raise ValueError("A running task exists with the name {0}!".format(task_name))
+        self.__task_begin_timestamp[task_name] = time.time()
+
+    def stop(self,task_name):
+        """
+        Stop and return the duration of the task
+        
+        Parameters
+        ----------
+        task_name : str
+            task id
+        
+        Returns
+        -------
+        float
+            duration of the task in seconds
+        
+        Raises
+        ------
+        ValueError
+            if no task exist with the id `task_name`
+        """
+        if not task_name in self.__task_begin_timestamp:
+             raise ValueError("The {0} task does not exist!".format(task_name))
+        duration = time.time() - self.__task_begin_timestamp[task_name]
+        del self.__task_begin_timestamp[task_name]
+        return duration
+
+if __name__ == "__main__":
+    chrono = Chronometer()
+    chrono.start("test")
+    chrono.start("test2")
+    time.sleep(3)
+    print(chrono.stop("test"))
+    time.sleep(3)
+    print(chrono.stop("test2"))
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 37eb4a7372b9823f1c91d42c4f837de56bebc026..6e052dbf5cc7b3194d69043c4e40a88596c459be 100644
--- a/utils.py
+++ b/utils.py
@@ -1,24 +1,27 @@
+# Basic import 
 import math
-import numpy as np
-
-from nltk.tokenize import word_tokenize
-
-import textwrap
-from ngram import NGram
-
-
 import argparse
 import os
 import json
 
-from tqdm import tqdm
+# Data Structure
+import numpy as np
 import geopandas as gpd
+from shapely.geometry import Point,box
+
+# NLP 
+from nltk.tokenize import word_tokenize
+import textwrap
+from ngram import NGram
 
+# Machine learning 
 from keras.layers import Embedding
 from gensim.models import Word2Vec
 
+# Visualisation and parallelisation
+from tqdm import tqdm
 from joblib import Parallel,delayed
-from shapely.geometry import Point,box
+
 
 
 class TokenizerCustom():
@@ -94,7 +97,7 @@ class NgramIndex():
         Returns
         -------
         list of int
-            list of ngram index
+            list of ngram index
         """
         ngrams = word.lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))