From 8e0560b4653883bbcbdadcdd3c330b4b6a869d39 Mon Sep 17 00:00:00 2001
From: jfize <jacques.fize@insa-lyon.fr>
Date: Wed, 5 Feb 2020 12:43:58 +0100
Subject: [PATCH] DEBUG + CODE CLEANING + ADD COOCCURRENCE INTEGRATION

---
 README.md                                     |  84 +++++--
 combination_embeddings.py                     | 234 +++++++++---------
 embeddings_lat_lon_type.py                    |  40 +--
 extractDataFromWikidata.py                    |  11 +-
 extractLearningDataset.py                     |   2 -
 geonames_embedding.py                         |   7 -
 helpers.py                                    |   8 +-
 models.py                                     |   7 +-
 parser_config/embeddings_lat_lon.json         |  12 +
 .../toponym_combination_embedding.json        |   2 +-
 train_test_split_cooccurrence_data.py         |  85 +++++++
 train_test_split_geonames.py                  |   9 +
 utils.py                                      |  56 ++++-
 13 files changed, 354 insertions(+), 203 deletions(-)
 create mode 100644 parser_config/embeddings_lat_lon.json
 create mode 100644 train_test_split_cooccurrence_data.py

diff --git a/README.md b/README.md
index c9fc710..5ef0cb2 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,21 @@
-# Requirements
+# Work on Place-embedding 
 
- - Python3.6+
- - Os free (all dependencies work on Windows !)
+This repo contains various approaches to geographic place embedding, and more precisely to its use for geocoding. So far, we have designed three approaches :
 
-It is strongly advised to used Anaconda in a windows environnement ! 
+ * Use the Wikipedia pages of geographic places to learn an embedding for toponyms
+ * Use the Geonames place topology to produce an embedding using graph-embedding techniques
+ * Use toponym combinations based on spatial relationships (inclusion, adjacency) for geocoding
 
-## Install dependencies
+<hr>
+
+## Setup environment
+
+- Python3.6+
+- OS independent (all dependencies work on Windows!)
+
+It is strongly advised to use Anaconda in a Windows environment!
+
+### Install dependencies
 
     pip3 install -r requirements.txt
 
@@ -13,16 +23,30 @@ For Anaconda users
 
     while read requirement; do conda install --yes $requirement; done < requirements.txt
 
-# Different approaches execution
+<hr>
+
+## First approach : Embedding using places' Wikipedia pages
+
+<div style="text-align:center">
+<img src="documentation/imgs/first_approach.png"/>
+<p><strong>Figure 1</strong> : First approach general workflow</p>
+</div>
+
+In this first approach, the goal is to produce an embedding for place names. To do this, we designed a neural network (an illustrative sketch is given after the lists below) that takes :
 
-## Embedding using places Wikipedia pages
+* **Input:** Text sequence (phrase)
+* **Output:** Latitude, longitude, and the place type
 
-![first_approach](documentation/imgs/first_approach.png)
+Input texts are selected using Wikidata to filter Wikipedia pages about geographic places. Then, the filtered pages are retrieved from the Wikipedia corpus file. For each page, we extract :
 
+* Title
+* Introduction text
+* Coordinates of the place (latitude, longitude)
+* Place type (using a mapping between Wikidata and DBpedia Place subclasses)
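+
+As an illustration of the input/output contract described above, here is a minimal Keras sketch. It is **not** the repository's model (the actual architectures live in `models.py`, e.g. `BI_GRU_model`, `BI_LSTM_model`); the vocabulary size, sequence length, embedding dimension and number of place types below are placeholder values.
+
+    from keras.layers import Input, Embedding, Bidirectional, GRU, Dense
+    from keras.models import Model
+
+    MAX_SEQUENCE_LENGTH, NUM_WORDS, EMBEDDING_DIM, N_TYPES = 15, 400000, 100, 50  # placeholders
+
+    seq_in = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype="int32")        # tokenised introduction text
+    x = Embedding(NUM_WORDS, EMBEDDING_DIM)(seq_in)
+    x = Bidirectional(GRU(64))(x)                                      # sequence representation
+    out_lat  = Dense(1, activation="sigmoid", name="latitude")(x)      # normalised latitude
+    out_lon  = Dense(1, activation="sigmoid", name="longitude")(x)     # normalised longitude
+    out_type = Dense(N_TYPES, activation="softmax", name="place_type")(x)
+
+    model = Model(seq_in, [out_lat, out_lon, out_type])
+    model.compile(optimizer="adam",
+                  loss=["mean_squared_error", "mean_squared_error", "categorical_crossentropy"])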
 
 ### Step 1: Parse Wikipedia data !
 
-First, download the Wikipedia corpus in the wanted language, *e.g. enwiki-latest-pages-articles.xml.bz2* 
+First, download the Wikipedia corpus in the wanted language, *e.g. enwiki-latest-pages-articles.xml.bz2*
 
 Then, use the `gensim` parser (doc [here](https://radimrehurek.com/gensim/scripts/segment_wiki.html)). Use the following command :
 
@@ -42,7 +66,7 @@ Using previous output, we extract text data from selected Wikipedia pages with t
 
 ### Step 4 : Run Embedding extraction
 
-To learn extract the place embedding, use the `4_embeddings_lat_lon_type.py`
+To learn the place embeddings, use the `embeddings_lat_lon_type.py` script
 
 #### Available Parameters
 
@@ -63,37 +87,57 @@ The different outputs (on for each neural network architecture) are put in the `
 * outputs/Bi-GRU_100dim_20epoch_1000batch.csv : **training history**
 * outputs/Bi-GRU_100dim_20epoch_1000batch.txt : **embeddings**
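+
+A quick way to inspect the training history CSV (the filename below is the example above; the file is assumed to hold the Keras `history.history` values, one row per epoch):
+
+    import pandas as pd
+    import matplotlib.pyplot as plt
+
+    hist = pd.read_csv("outputs/Bi-GRU_100dim_20epoch_1000batch.csv", index_col=0)
+    hist.filter(like="loss").plot()   # training / validation losses per epoch
+    plt.xlabel("epoch")
+    plt.show()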
 
+<hr>
 
-## Geonames place embedding
+## Second approach: Geonames place embedding
 
-![second_approach](documentation/imgs/second_approach.png)
+From this point on, we shift our focus toward models that rely mainly on spatial/geographical data, here a gazetteer. In this second approach, we propose to generate an embedding for places (not place toponyms) based on their topology.
 
-First, download the Geonames dump here : https://download.geonames.org/export/dump/ 
+To do that, we use Geonames data to build a topology graph. This graph is generated from the intersections found between place buffers.
 
-*N.B.* We advise you to take only the data from one country ! (Adjacency graph need a lot of RAM).
+(image here)
+
+Then, using this topology network, we apply node-embedding techniques to generate an embedding for each vertex (place). A minimal sketch is given below.
+
+<div style="text-align:center">
+<img src="documentation/imgs/second_approach.png"/>
+<p><strong>Figure 2</strong> : Second approach general workflow</p>
+</div>
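+
+A minimal sketch of this node-embedding step, using the `node2vec` package the script relies on. The toy graph below is purely illustrative; `geonames_embedding.py` builds the real graph from buffer intersections (and optionally distances):
+
+    import networkx as nx
+    from node2vec import Node2Vec
+
+    # Toy topology graph: vertices are places, edges are adjacency relationships
+    g = nx.Graph()
+    g.add_edges_from([("Lyon", "Rhone"), ("Villeurbanne", "Rhone"), ("Lyon", "Villeurbanne")])
+
+    node2vec = Node2Vec(g, dimensions=64, walk_length=30, num_walks=10, workers=4)
+    model = node2vec.fit(window=5, min_count=1)   # returns a gensim Word2Vec model
+    print(model.wv["Lyon"])                       # embedding of the "Lyon" vertex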
+
+### Generate the embedding
+
+First, download the Geonames dump from [here](https://download.geonames.org/export/dump/).
+
+*N.B.* We advise you to use the data from only one country! The topology network can become really dense and large!
 
     python3 geonames_embedding.py <geonames dump(*.txt)>
 
 ### Available Parameters
 
-| Parameter              | Value (default)                                                   |
+| Parameter              | Description (default)                                             |
 |------------------------|-------------------------------------------------------------------|
-| --nbcpu                | Cpu used for the embedding learning phase                         |
-| --vector-size          | embedding size                                                    |
-| --walk-length          | Generated Walk length                                             |
+| --nbcpu                | Number of CPUs used during the learning phase                     |
+| --vector-size          | Embedding size                                                    |
+| --walk-length          | Generated walk length                                             |
 | --num-walks            | Number of walks for each vertex (place)                           |
 | --word2vec-window-size | Window-size used in Word2vec                                      |
 | --buffer-size          | Buffer size used to detect adjacency relationships between places |
 | -d                     | Integrate distances between places in the topology graph          |
 | --dist                 | Distance used if '-d'                                             |
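+
+For example (the values below are illustrative, not the defaults):
+
+    python3 geonames_embedding.py data/geonamesData/FR.txt --vector-size 64 --walk-length 30 --num-walks 10 --word2vec-window-size 5 --nbcpu 4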
 
-### Output 
+### Output files 
 
 Gensim word2vec format is saved in the execution directory.
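+
+The embeddings can then be reloaded with gensim. The snippet below is only a sketch: the exact filename and save format depend on your run (use `Word2Vec.load` instead if the model was saved with gensim's native `save`):
+
+    from gensim.models import KeyedVectors
+
+    wv = KeyedVectors.load_word2vec_format("geonames_embedding.txt", binary=False)  # filename is an assumption
+    place_id = "2988507"                 # a geonameid present in your input file (example value)
+    print(wv.most_similar(place_id))     # places with the closest embeddings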
 
+<hr>
+
 ## Embedding : train using concatenation of close places
 
-![second_approach](documentation/imgs/third_approach.png)
+<div style="text-align:center">
+<img src="documentation/imgs/third_approach.png"/>
+<p><strong>Figure 3</strong> : Third approach general workflow</p>
+</div>
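+
+Once the data described below is prepared, a typical (illustrative) invocation of the training script looks like this; the flags come from `parser_config/toponym_combination_embedding.json` and the file paths are examples:
+
+    python3 combination_embeddings.py -n 4 -t 0.002 -e 100 -a -i -w data/geonamesData/FR.txt data/geonamesData/hierarchy.txt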
+
 
 ### Prepare required data
 
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 94ee452..d22359e 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -1,8 +1,6 @@
 # Base module 
 import re
 import os
-import sys
-from argparse import ArgumentParser
 import json
 
 # Structure
@@ -11,15 +9,8 @@ import numpy as np
 import geopandas as gpd
 
 # DEEPL module
-from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
-from keras.utils import to_categorical
-from keras.layers import Dense, Input, GlobalMaxPooling1D
-from keras.layers import Conv1D, MaxPooling1D, Embedding
-from keras.layers import Add,concatenate,Dropout
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
 from keras.models import Model
-from keras.initializers import Constant
-from keras.layers import GlobalAveragePooling1D,Bidirectional,LSTM,Average, Flatten, Conv1D, Conv2D
 from keras import backend as K
 import tensorflow as tf
 
@@ -31,22 +22,44 @@ from helpers import read_geonames
 from utils import Grid
 from utils import  zero_one_encoding, NgramIndex,ConfigurationReader
 
-
-# Visualisation module
-import matplotlib.pyplot as plt
-from tqdm import tqdm as tqdm_base
-
-def tqdm(*args, **kwargs):
-    if hasattr(tqdm_base, '_instances'):
-        for instance in list(tqdm_base._instances):
-            tqdm_base._decr_instances(instance)
-    return tqdm_base(*args, **kwargs)
+# Logging
+from tqdm import tqdm
+import logging
+from helpers import Chronometer
 
 
 def parse_title_wiki(title_wiki):
+    """
+    Parse Wikipedia title
+    
+    Parameters
+    ----------
+    title_wiki : str
+        wikipedia title
+    
+    Returns
+    -------
+    str
+        parsed wikipedia title
+    """
     return re.sub("\(.*\)","",title_wiki).strip().lower()
 
 def get_new_ids(cooc_data,id_first_value):
+    """
+    Return new ids from cooccurrence data
+    
+    Parameters
+    ----------
+    cooc_data : pd.DataFrame
+        cooccurrence data
+    id_first_value : int
+        id beginning value
+    
+    Returns
+    -------
+    dict
+        new id for each toponym
+    """
     topo_id = {}
     id_ = id_first_value
     for title in cooc_data.title.values:
@@ -60,9 +73,23 @@ def get_new_ids(cooc_data,id_first_value):
                 topo_id[id_]=interlink
     return topo_id
 
-# Logging
-import logging
-from helpers import Chronometer
+def accuracy_at_k(y_true, y_pred):
+    """
+    Metric used to measure the accuracy of the coordinate prediction. Compared to the standard accuracy metric, we add a tolerance threshold because it is (almost)
+    impossible for the neural network to predict the exact coordinates.
+
+    Parameters
+    ----------
+    y_true : tf.Tensor
+        truth data
+    y_pred : tf.Tensor
+        predicted output
+    """
+    diff = tf.abs(y_true - y_pred)
+    fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
+    return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
+
+# LOGGING CONF
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ', 
     datefmt='%m/%d/%Y %I:%M:%S %p',
@@ -70,8 +97,9 @@ logging.basicConfig(
     )
 chrono = Chronometer()
 
-args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()#("--admin_code_1 94 -n 2 -t 0.002 -e 100 -m LSTM -a -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()# Debug values: ("-n 4 -t 0.002 -e 100 -m LSTM -a data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split())
 
+# Initialise CONSTANTS
 GEONAME_FN = args.geoname_input
 GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
 NGRAM_SIZE = args.ngram_size
@@ -79,18 +107,11 @@ ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
 ITER_ADJACENCY = args.adjacency_iteration
 
-CONV, LSTM_train = False,False
-if args.model == "CNN":
-    CONV = True
-else:
-    LSTM_train = True
-
 # check for output dir
 if not os.path.exists("outputs/"):
     os.makedirs("outputs/")
 
-
-# LOAD DATA
+# LOAD Geonames DATA
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
 hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
@@ -103,18 +124,20 @@ logging.info("Geonames data loaded!")
 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
 filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
 
+# IF REGION (ONLY FR for now !)
 admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split()
 region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1
 if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth:
     filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
 
+# REDUCE DATA STORED
+filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+
 # Geometry operation 
 filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
 filtered = gpd.GeoDataFrame(filtered)
 filtered["i"]=1
 bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships
-geoname2name = dict(filtered["geonameid name".split()].values)
-
 
 
 rel_store = []
@@ -137,38 +160,64 @@ if args.adjacency:
 if args.inclusion:
     # RETRIEVE INCLUSION RELATIONSHIPS
     logging.info("Retrieve inclusion relationships ! ")
-    filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
+    geonamesIDS = set(filtered.geonameid.values)
+    filter_mask = (hierarchy_data.childId.isin(geonamesIDS) & hierarchy_data.parentId.isin(geonamesIDS))
     rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist()))
     logging.info("{0} inclusion relationships retrieved ! ".format(len(hierarchy_data[filter_mask])))
 
+del filtered["geometry"]
 
 if args.wikipedia_cooc:
-    cooc_data = pd.read_csv("./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1],sep="\t")
+    logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
+    COOC_FN = "./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1]
+    cooc_data = pd.read_csv(COOC_FN,sep="\t")
     cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
     cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
     id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max())
     wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
     title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()}
+    cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
+    filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
+
+    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
+    # union() returns a new set, so the result must be assigned back
+    train_indices = train_indices.union(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]))
+    test_indices = test_indices.union(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]))
+
+    logging.info("Merged with Geonames data !")
+
+    # EXTRACT rel
+    logging.info("Extracting cooccurrence relationships")
+    cpt=0
+    for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
+        for inter in row.interlinks.split("|"):
+            cpt+=1
+            rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
+    logging.info("Extract {0} cooccurrence relationships !".format(cpt))
+
+
+# STORE ID to name
+geoname2name = dict(filtered["geonameid name".split()].values)
 
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
 filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available
+if args.wikipedia_cooc:
+    [index.split_and_add(k) for k in wikipediatitle_id]
 filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
 max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings max length
-
 if args.wikipedia_cooc:
-    [index.split_and_add(x) for x in id_wikipediatitle.values()]
-    idwiki_encoded = {id_: index.encode(toponym) for id_,toponym in id_wikipediatitle.items()}
-    max_len = max(max_len,max([len(enc) for _,enc in idwiki_encoded.items()]))
+    extension = {v:index.encode(k) for k,v in wikipediatitle_id.items()}
 
 index.max_len = int(max_len) # For Index state dump
 
 filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len
+if args.wikipedia_cooc:
+    extension = {k:index.complete(v,max_len) for k,v in extension.items()}
 geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association
 
 if args.wikipedia_cooc:
-    idwiki_encoded = {id_: index.complete(enc,max_len) for id_,enc in idwiki_encoded.items()}
+    geoname2encodedname.update(extension)
 
 index.save("outputs/index_{0}gram_{1}".format(NGRAM_SIZE,GEONAME_FN.split("/")[-1]))
 logging.info("Done !")
@@ -183,9 +232,6 @@ filtered["cell_vec"]=filtered.apply(
     axis=1
     )
 geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
-if args.wikipedia_cooc:
-    wikipediaid_vec = {wikipediatitle_id[title]: zero_one_encoding(*title_coord[title]) for title in cooc_data.title.values}
-
 # CLEAR RAM
 del filtered
 
@@ -198,8 +244,12 @@ logging.info("Preparing Input and Output data...")
 X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
 X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
 
+cpt=0
 for couple in rel_store:
     geonameId_1,geonameId_2 = couple[0],couple[1]
+    if not geonameId_1 in geoname2encodedname:
+        cpt+=1
+        continue
     top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
     if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
         
@@ -229,93 +279,53 @@ y_lon_test = np.array(y_lon_test)
 
 logging.info("Data prepared !")
 
-def accuracy_at_k(y_true, y_pred):
-    """
-    Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible 
-    task for neural network to obtain the exact coordinate.
-
-    Parameters
-    ----------
-    y_true : tf.Tensor
-        truth data
-    y_pred : tf.Tensor
-        predicted output
-    """
-    diff = tf.abs(y_true - y_pred)
-    fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
-    return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred)
 
+# OUTPUT FN BASE
 name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn)
 if args.adjacency:
     name+="_A"
 if args.inclusion:
     name+="_I"
 
+# NGRAM EMBEDDING
 logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50)
+embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=5)
 logging.info("Embedding generated !")
 
-if LSTM_train:
-    name = "LSTM_"+ name
-    input_1 = Input(shape=(max_len,))
-    input_2 = Input(shape=(max_len,))
-
-    embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
-
-    x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
-    x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))
-
-    x = concatenate([x1,x2])#,x3])
-
-    x1 = Dense(500,activation="relu")(x)
-    #x1 = Dropout(0.3)(x1)
-    x1 = Dense(500,activation="relu")(x1)
-    #x1 = Dropout(0.3)(x1)
-
-    x2 = Dense(500,activation="relu")(x)
-    #x2 = Dropout(0.3)(x2)
-    x2 = Dense(500,activation="relu")(x2)
-    #x2 = Dropout(0.3)(x2)
-
-    output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
-    output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
-
-    model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
+# DEEP MODEL
+name = "LSTM_"+ name
+input_1 = Input(shape=(max_len,))
+input_2 = Input(shape=(max_len,))
 
-    model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
-    history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
+embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
 
-if CONV :
-    name = "CONV_"+ name
-    input_1 = Input(shape=(max_len,))
-    input_2 = Input(shape=(max_len,))
+x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
+x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))
 
-    embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)# weights=[embedding_weights],trainable=False)
+x = concatenate([x1,x2])#,x3])
 
-    x1 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_1))
-    x1 = Dropout(0.5)(x1)
-    x1 = MaxPooling1D(pool_size=2)(x1)
-    x1 = Flatten()(x1)
+x1 = Dense(500,activation="relu")(x)
+#x1 = Dropout(0.3)(x1)
+x1 = Dense(500,activation="relu")(x1)
+#x1 = Dropout(0.3)(x1)
 
-    x2 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_2))
-    x2 = Dropout(0.5)(x2)
-    x2 = MaxPooling1D(pool_size=2)(x2)
-    x2 = Flatten()(x2)
+x2 = Dense(500,activation="relu")(x)
+#x2 = Dropout(0.3)(x2)
+x2 = Dense(500,activation="relu")(x2)
+#x2 = Dropout(0.3)(x2)
 
-    x = concatenate([x1,x2])
+output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
+output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 
-    x = Dense(500,activation="relu")(x)
-    x = Dropout(0.3)(x)
-    x = Dense(500,activation="relu")(x)
-    x = Dropout(0.3)(x)
+model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
 
-    output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x)
-    output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x)
+model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
+history = model.fit(x=[X_1_train,X_2_train],
+    y=[y_lon_train,y_lat_train],
+    verbose=True, batch_size=100,
+    epochs=EPOCHS,
+    validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
 
-    model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
-    model.summary()
-    model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
-    history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]))
 
 hist_df = pd.DataFrame(history.history)
 hist_df.to_csv("outputs/{0}.csv".format(name))
diff --git a/embeddings_lat_lon_type.py b/embeddings_lat_lon_type.py
index d5778f6..39461b7 100644
--- a/embeddings_lat_lon_type.py
+++ b/embeddings_lat_lon_type.py
@@ -1,41 +1,21 @@
 # Basic module
-import time
-import random
 import json
-import os
-import sys
 import argparse
 
 # Data module
 import numpy as np
 import pandas as pd
 
-from joblib import Parallel,delayed
-
-# Keras basic
-import keras
-from keras import backend as K
-from keras.initializers import Constant
-
 # preprocessing
 from sklearn import preprocessing
-from keras.preprocessing import sequence
 from keras.preprocessing.text import Tokenizer
-from keras.preprocessing.sequence import pad_sequences
-from keras.utils import to_categorical
-from keras.preprocessing.text import text_to_word_sequence
-
-# Neural Network Model and layers class
-from keras.layers import Dense, Input, GlobalAveragePooling1D, Embedding, LSTM, Bidirectional, Conv1D, GRU
-from keras.models import Model
 
 # Neural network model and visualisation function
 from models import getModel,BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model
 from helpers import plot_accuracy_from_history, save_embedding
 
 # Utils
-from utils import CoordinatesEncoder,TokenizerCustom,_split
-
+from utils import CoordinatesEncoder,_split, ConfigurationReader
 
 # Logging
 import logging
@@ -51,21 +31,8 @@ chrono = Chronometer()
 import matplotlib.pyplot as plt
 from tqdm import tqdm
 
-parser = argparse.ArgumentParser()
-
-parser.add_argument("input")
-parser.add_argument("--glove_dir",default="data/glove")
-
-parser.add_argument("--max_sequence_length",type=int, default=15)
-parser.add_argument("--max_num_words",type=int, default=400000)
-
-parser.add_argument("--embedding_dimension",type=int, default=100)
-
-parser.add_argument("--batch_size",type=int, default=100)
-parser.add_argument("--epochs",type=int, default=100)
-
-parser.add_argument("-v",action="store_true",help="Display Keras training verbose")
-
+parser = ConfigurationReader(configuration_file="parser_config/embeddings_lat_lon.json")
+args = parser.parse_args()
 
 def clean(x):
     return x.lower().replace("\n","").replace("\'\'\'","").replace("\'\'","")
@@ -141,7 +108,6 @@ logging.info("The vocabulary contains {0} words".format(len(list(vocab_))))
 
 logging.info("Initialize Tokenizer/ClassEncoder/CoordinateEncoder...")
 # Tokenizer
-#tokenizer = TokenizerCustom(list(vocab_))
 max_key_tokenizer = np.max(list(tokenizer.index_word.keys()))
 num_words = min(MAX_NUM_WORDS, len(tokenizer.word_index)) + 1
 # Coordinate Encoder
diff --git a/extractDataFromWikidata.py b/extractDataFromWikidata.py
index 3048c2c..0f9e237 100644
--- a/extractDataFromWikidata.py
+++ b/extractDataFromWikidata.py
@@ -1,19 +1,12 @@
 import json
 import gzip
 import argparse
-import re
-
-import pandas as pd
 
 from joblib import Parallel, delayed
 
 # To avoid progressbar issue
-from tqdm import tqdm as tqdm_base
-def tqdm(*args, **kwargs):
-    if hasattr(tqdm_base, '_instances'):
-        for instance in list(tqdm_base._instances):
-            tqdm_base._decr_instances(instance)
-    return tqdm_base(*args, **kwargs)
+from tqdm import tqdm
+
 
 
 parser = argparse.ArgumentParser()
diff --git a/extractLearningDataset.py b/extractLearningDataset.py
index 725d82a..e2bc717 100644
--- a/extractLearningDataset.py
+++ b/extractLearningDataset.py
@@ -1,7 +1,5 @@
 import gzip
 import json
-import re
-
 import argparse
 
 import pandas as pd
diff --git a/geonames_embedding.py b/geonames_embedding.py
index 8906e95..2185349 100644
--- a/geonames_embedding.py
+++ b/geonames_embedding.py
@@ -6,7 +6,6 @@ from multiprocessing import cpu_count
 from argparse import RawTextHelpFormatter
 
 # COMMON DATA STRUCTURE MODULE
-import pandas as pd
 import numpy as np
 import networkx as nx
 
@@ -16,16 +15,10 @@ import osrm
 osrm.RequestConfig.host = "jacquesfize.com:5000"
 from shapely.geometry import Point
 
-# DISTANCE MODULE
-from scipy.spatial.distance import cosine
-from scipy.stats.stats import pearsonr
-
 # Machine Learning MODULE
 from node2vec import Node2Vec
-import gensim
 
 # VISUALISATION MODULE
-import matplotlib.pyplot as plt
 from tqdm import tqdm
 tqdm.pandas()
 
diff --git a/helpers.py b/helpers.py
index 554e620..825dd4a 100644
--- a/helpers.py
+++ b/helpers.py
@@ -1,6 +1,9 @@
-import pandas as pd
-import matplotlib.pyplot as plt 
 import os
+import time
+
+import pandas as pd
+
+import matplotlib.pyplot as plt
 
 def read_geonames(file):
     """
@@ -88,7 +91,6 @@ def save_embedding(model,tokenizer,layer_idx,fn):
             f.write('\n')
 
 
-import time
 
 class Chronometer():
     def __init__(self):
diff --git a/models.py b/models.py
index fa259fb..7c3370a 100644
--- a/models.py
+++ b/models.py
@@ -1,10 +1,5 @@
-
 from keras import Model
-from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding, GRU, GlobalAveragePooling1D, Dropout
-
-# name,model_2=MPC_model(MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder)
-# model_2.fit(x=new_X,y=[Y_type,Y_lat,Y_lon],validation_split=0.33,epochs=EPOCHS,batch_size=BATCH_SIZE,verbose=1)
-
+from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding, GRU, GlobalAveragePooling1D
 
 def getModel(model_func,max_sequence_length,embedding_dim,num_words,class_encoder,coordinate_encoder):
     sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
diff --git a/parser_config/embeddings_lat_lon.json b/parser_config/embeddings_lat_lon.json
new file mode 100644
index 0000000..1a0c774
--- /dev/null
+++ b/parser_config/embeddings_lat_lon.json
@@ -0,0 +1,12 @@
+{
+    "description": "Toponym Combination",
+    "args": [
+        { "short": "input", "help": "Corpus used to learn the embeddings" },
+        { "short": "-g", "long": "--glove__dir", "default": "data/glove" },
+        {"long": "--max_sequence_length", "type":"int","default":15},
+        {"long": "--max_num_words", "type":"int","default":400000},
+        {"long": "--embedding_dimension", "type":"int","default":100},
+        {"long": "--batch_size", "type":"int","default":100},
+        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }
+    ]
+}
\ No newline at end of file
diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json
index 9f3fe94..93662e1 100644
--- a/parser_config/toponym_combination_embedding.json
+++ b/parser_config/toponym_combination_embedding.json
@@ -7,7 +7,7 @@
         { "short": "-i", "long": "--inclusion", "action": "store_true" },
         { "short": "-a", "long": "--adjacency", "action": "store_true" },
         { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
-        {"long": "--adjacency-iteration", "type":"int","default":50},
+        {"long": "--adjacency-iteration", "type":"int","default":5},
         { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
         { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
         { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py
new file mode 100644
index 0000000..4748f3e
--- /dev/null
+++ b/train_test_split_cooccurrence_data.py
@@ -0,0 +1,85 @@
+import argparse
+
+import pandas as pd
+import geopandas as gpd
+
+import logging
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s ', 
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO
+    )
+
+from sklearn.model_selection import train_test_split
+from shapely.geometry import Point
+
+from utils import Grid
+
+from tqdm import tqdm 
+
+parser = argparse.ArgumentParser()
+parser.add_argument("cooccurrence_file")
+
+args = parser.parse_args()# Debug values: ("data/wikipedia/cooccurrence_FR.txt".split())
+
+# LOAD DATA
+COOC_FN = args.cooccurrence_file
+
+
+
+logging.info("Load Cooc DATA data...")
+cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("")
+cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
+cooc_data = gpd.GeoDataFrame(cooc_data)
+logging.info("Cooc data loaded!")
+
+# World Shape bounds
+world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
+world["nn"] = 1
+dissolved = world.dissolve(by="nn").iloc[0].geometry
+
+#Creating Grid
+logging.info("Initializing Grid (360,180)...")
+g = Grid(*dissolved.bounds,[360,180])
+logging.info("Fit Data to the Grid...")
+g.fit_data(cooc_data)
+logging.info("Placing place into the grid...")
+[g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))]
+
+#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
+logging.info("Associate a cell number to each place in the Geoname Dataframe")
+def foo(g,id_):
+    # return the index of the grid cell that contains the place identified by id_
+    for ix,cell in enumerate(g.cells):
+        if id_ in cell.list_object:
+            return ix
+
+cooc_data["cat"] = cooc_data.title.apply(lambda x:foo(g,x))
+
+# TRAIN AND TEST SPLIT
+logging.info("Split Between Train and Test")
+
+# Cells can be empty: find the first cell with more than one place to initialise the split
+i=0
+while 1:
+    if len(cooc_data[cooc_data.cat == i])> 1:
+        X_train,X_test = train_test_split(cooc_data[cooc_data.cat == i])
+        break
+    i+=1
+
+for i in range(i+1,len(g.cells)):
+    try:
+        x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i])
+        X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test))
+    except Exception as e:
+        print(e) #print("Error",len(filtered[filtered.cat == i]))
+
+del X_train["geometry"]
+del X_train["nn"]
+del X_train["cat"]
+del X_test["cat"]
+del X_test["geometry"]
+del X_test["nn"]
+# SAVING THE DATA
+logging.info("Saving Output !")
+X_train.to_csv(COOC_FN+"_train.csv")
+X_test.to_csv(COOC_FN+"_test.csv")
\ No newline at end of file
diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py
index d8f8962..ff87967 100644
--- a/train_test_split_geonames.py
+++ b/train_test_split_geonames.py
@@ -32,6 +32,8 @@ FEATURE_CLASSES = args.feature_classes
 
 logging.info("Load Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
+geoname_data["geometry"] = geoname_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
+geoname_data = gpd.GeoDataFrame(geoname_data)
 logging.info("Geonames data loaded!")
 
 # SELECT ENTRY with class == to A and P (Areas and Populated Places)
@@ -77,6 +79,13 @@ for i in range(i+1,len(g.cells)):
     except:
         pass #print("Error",len(filtered[filtered.cat == i]))
 
+
+del X_train["geometry"]
+del X_train["nn"]
+del X_train["cat"]
+del X_test["cat"]
+del X_test["geometry"]
+del X_test["nn"]
 # SAVING THE DATA
 logging.info("Saving Output !")
 X_train.to_csv(GEONAME_FN+"_train.csv")
diff --git a/utils.py b/utils.py
index 6e052db..bd767ed 100644
--- a/utils.py
+++ b/utils.py
@@ -11,17 +11,13 @@ from shapely.geometry import Point,box
 
 # NLP 
 from nltk.tokenize import word_tokenize
-import textwrap
 from ngram import NGram
 
 # Machine learning 
-from keras.layers import Embedding
 from gensim.models import Word2Vec
 
 # Visualisation and parallelisation
 from tqdm import tqdm
-from joblib import Parallel,delayed
-
 
 
 class TokenizerCustom():
@@ -36,6 +32,54 @@ class TokenizerCustom():
         return seqs
 
 
+class CoordinatesEncoder:
+    """
+    Will be replaced by Grid in grid2.py
+    """
+    def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
+        self.min_lon = -180
+        self.max_lon = -(self.min_lon)  # Symmetric
+        self.min_lat = -90
+        self.max_lat = -(self.min_lat)  # Symmetric
+
+        self.ecart_lat = self.max_lat - self.min_lat
+        self.ecart_lon = self.max_lon - self.min_lon
+
+        self.cell_size_lat = cell_size_lat
+        self.cell_size_lon = cell_size_lon
+
+        self.unit_size_lat = self.ecart_lat / self.cell_size_lat
+        self.unit_size_lon = self.ecart_lon / self.cell_size_lon
+
+    def encode(self, lat, lon):
+        return (
+            math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat),
+            math.floor(((lon + self.max_lon) / self.ecart_lon) * (self.unit_size_lon))
+        )
+
+    def number_lat_cell(self):
+        return int(self.unit_size_lat)
+
+    def number_lon_cell(self):
+        return int(self.unit_size_lon)
+
+    def oneDimensionOutputSize(self):
+        return self.number_lat_cell() * self.number_lon_cell()
+
+    def vector(self, lat, lon):
+        lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell())
+        new_coords = self.encode(lat, lon)
+        lat_v[int(new_coords[0])] = 1
+        lon_v[int(new_coords[1])] = 1
+        return lat_v, lon_v
+
+    def vector_flatten(self, lat, lon):
+        vec = np.zeros(self.oneDimensionOutputSize())  # 2D Dense softmax isn't possible
+        new_coords = self.encode(lat, lon)
+        pos = self.number_lat_cell() * (new_coords[0]) + new_coords[1]
+        vec[pos] = 1  # lon * lon size
+        return vec
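+    # Illustrative usage (not part of the original code):
+    #   enc = CoordinatesEncoder(10, 10)        # 18 x 36 grid of 10-degree cells
+    #   enc.encode(43.6, 1.44)                  # -> (13, 18)
+    #   enc.vector_flatten(43.6, 1.44).shape    # -> (648,), one-hot over the whole grid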
+
 
 class NgramIndex():
     """
@@ -53,8 +97,8 @@ class NgramIndex():
         self.ngram_gen = NGram(N=n)
 
         self.size = n
-        self.ngram_index = {}
-        self.index_ngram = {}
+        self.ngram_index = {"":0}  # reserve index 0 for the empty n-gram
+        self.index_ngram = {0:""}
         self.cpt = 0
         self.max_len = 0
 
-- 
GitLab