From cb8a6d14fab0e1b8a4ba0cc24fe3be2488220b03 Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Wed, 14 Oct 2020 18:09:33 +0200
Subject: [PATCH] debug + add datagenerator train geocoder v2

---
 README.md                              | 36 +++++++++----
 documentation/imgs/LSTM_arch.png       |  3 --
 documentation/imgs/LSTM_archv2.png     |  3 ++
 documentation/imgs/first_approach.png  |  3 --
 documentation/imgs/second_approach.png |  3 --
 documentation/imgs/third_approach.png  |  3 --
 lib/datageneratorv4.py                 | 55 ++++++++++++++++---
 lib/geocoder/bert_geocoder.py          |  4 +-
 lib/torch_generator.py                 |  2 +-
 train_geocoder_v2.py                   | 74 +++++++++++++++-----------
 10 files changed, 121 insertions(+), 65 deletions(-)
 delete mode 100644 documentation/imgs/LSTM_arch.png
 create mode 100644 documentation/imgs/LSTM_archv2.png
 delete mode 100644 documentation/imgs/first_approach.png
 delete mode 100644 documentation/imgs/second_approach.png
 delete mode 100644 documentation/imgs/third_approach.png

diff --git a/README.md b/README.md
index 9bd2998..e41fe94 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,7 @@
 # Toponym Geocoding
 
-Use of ngram representation and colocation of toponyms in geography and text for geocoding
+This repository contains the code for *"Using a deep neural network for toponym geocoding based on co-occurrences and spatial relations"*. In a nutshell, we propose to geocode place names using the least information available (two place names: one to geocode and a second used as context) and rely on a deep neural network architecture.
 
-<div style="text-align:center">
-<img src="documentation/imgs/LSTM_arch.png"/>
-<p><strong>Figure 1</strong> : General workflow</p>
-</div>
 
 
 <hr>
@@ -13,9 +9,9 @@ Use of ngram representation and colocation of toponyms in geography and text for
 ## Setup environment
 
 - Python3.6+
-- Os free (all dependencies should work on Windows !)
+- OS free**
 
-It is strongly advised to used Anaconda in a windows environnement! 
+** *It is strongly advised to use Anaconda in a Windows environment!*
 
 ### Install dependencies
 
@@ -50,11 +46,29 @@ French Geonames, French Wikipedia cooccurence data, and their train/test splits
 
 <hr>
 
-## Train the network
+## Train your own model
+
+
+
+### First model
+Like every proposed model, this model is a neural network. The first model is illustrated in Figure 1: in a nutshell, it takes two toponyms (one to geocode, the second used as spatial context), encodes each as an n-gram sequence, and predicts the coordinates of the first one.
+<div style="text-align:center">
+<img src="documentation/imgs/LSTM_archv2.png"/>
+<p><strong>Figure 1</strong> : General workflow</p>
+</div>
+To train this model with default parameters, use the following command:
+
+    python3 train_geocoder.py
+
+### Second model
+
+This model is the same as the first one, except that latitude and longitude are predicted as a single concatenated output rather than as two separate outputs (see the sketch after the command below).
+
+    python3 train_geocoder_v2.py
+
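+For intuition, here is a minimal Keras sketch of this two-branch architecture. It is a hypothetical reconstruction, not the exact network: the layer sizes, embedding dimension, `MAX_LEN`, and the sigmoid/MSE choices are placeholders (the actual training script uses a haversine-based loss, `lib.utils_geo.haversine_tf_1circle`).
+
+```python
+from keras.layers import Input, Embedding, LSTM, Dense, concatenate
+from keras.models import Model
+
+MAX_LEN, VOCAB_SIZE, EMB_DIM = 20, 10000, 100  # placeholder values
+
+# Two inputs: the toponym to geocode and its context toponym,
+# both encoded as fixed-length sequences of n-gram indices.
+toponym = Input(shape=(MAX_LEN,))
+context = Input(shape=(MAX_LEN,))
+
+embed = Embedding(VOCAB_SIZE, EMB_DIM)  # shared n-gram embedding
+toponym_vec = LSTM(128)(embed(toponym))
+context_vec = LSTM(128)(embed(context))
+
+# Single 2-unit head: (longitude, latitude), both scaled to [0, 1]
+coords = Dense(2, activation="sigmoid")(concatenate([toponym_vec, context_vec]))
+
+model = Model(inputs=[toponym, context], outputs=coords)
+model.compile(optimizer="adam", loss="mse")  # placeholder loss
+```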
 
-To train the network with default parameter use the following command : 
+### BERT model
+Recently, a popular model called BERT has shown great promise on various NLP tasks. This last model uses a pretrained BERT model. More precisely, we use BERT in a classifier where classes correspond to HEALPix cells.
 
-    python3 combination_embeddings.py -i <geoname data filename> <hierarchy geonames data filename>
+    python3 train_geocoder.py
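+
+As a hedged illustration of the cell labelling (the function and dictionary names below are ours, not the repository's API; `nside=128` matches the default `healpix_nside` in `lib/geocoder/bert_geocoder.py`):
+
+```python
+import healpy as hp
+
+NSIDE = 128  # default healpix_nside used by BertGeocoder
+
+def coords_to_cell(lon, lat, nside=NSIDE):
+    """Map a (lon, lat) pair, in degrees, to its HEALPix cell id."""
+    return hp.ang2pix(nside, lon, lat, lonlat=True)
+
+# Cell ids are re-indexed to contiguous class labels for the classifier;
+# BertGeocoder loads (and inverts) such a cell<->label mapping from the
+# label_healpix pickle to turn predicted labels back into cells.
+print(coords_to_cell(4.85, 45.76))  # e.g. a point in Lyon
+```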
 
 ### Train the network with different parameters
 
@@ -66,7 +80,7 @@ from lib.run import GridSearchModel
 from collections import OrderedDict
 
 grid = GridSearchModel(\
-    "python3 combination_embeddings.py",
+    "python3 train_geocoder_v2.py",
     **OrderedDict({ # We use an OrderedDict since the order of parameters is important
     "rel":["-i","-a","-c"],
     "-n":[4],
diff --git a/documentation/imgs/LSTM_arch.png b/documentation/imgs/LSTM_arch.png
deleted file mode 100644
index 76768fe..0000000
--- a/documentation/imgs/LSTM_arch.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bfef79723d44b6ef9ac7784eba28b7b022d18967c5e7d3b40b421b16401f5907
-size 19907
diff --git a/documentation/imgs/LSTM_archv2.png b/documentation/imgs/LSTM_archv2.png
new file mode 100644
index 0000000..4c8170b
--- /dev/null
+++ b/documentation/imgs/LSTM_archv2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45db879a1449953316babe33ba0402a96c791f14b2c40cac5df54aca2ee89334
+size 187798
diff --git a/documentation/imgs/first_approach.png b/documentation/imgs/first_approach.png
deleted file mode 100644
index 297c1a5..0000000
--- a/documentation/imgs/first_approach.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5a243605f4d58dee8bad4a18845ab78ca2319049e633b35e6a89540add684be8
-size 298011
diff --git a/documentation/imgs/second_approach.png b/documentation/imgs/second_approach.png
deleted file mode 100644
index e5e693f..0000000
--- a/documentation/imgs/second_approach.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8bab13df1e420e97a08977aa38382076491a8294d85b7daa0a10d69a36a52fc0
-size 457738
diff --git a/documentation/imgs/third_approach.png b/documentation/imgs/third_approach.png
deleted file mode 100644
index d96596a..0000000
--- a/documentation/imgs/third_approach.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ad7cbed2e748b814c38eb070d29a23f7417469169a56aeb0e660a743e00430fd
-size 31104
diff --git a/lib/datageneratorv4.py b/lib/datageneratorv4.py
index c1093b2..e451035 100644
--- a/lib/datageneratorv4.py
+++ b/lib/datageneratorv4.py
@@ -6,19 +6,60 @@ from keras.utils import to_categorical
 import numpy as np
 import pandas as pd
 
-from .utils_geo import zero_one_encoding
+from lib.utils_geo import zero_one_encoding
 
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
 
 from sklearn.preprocessing import LabelEncoder
 
+import keras
 
 class DataGenerator(keras.utils.Sequence):
-    def __init__(self,*dataset):
-        pass
-    def __next__(self):
-        pass
-    def isOver(self):
-        pass
+    'Generates data for Keras'
+    def __init__(self, pairs_of_toponyms,encoder, batch_size=32, shuffle=True):
+        'Initialization'
+        self.data= pairs_of_toponyms
+        self.encoder = encoder
+        self.dim = self.encoder.max_len
+        self.shuffle = shuffle
+    
+        self.batch_size = batch_size
+        self.on_epoch_end()
+
+    def __len__(self):
+        'Denotes the number of batches per epoch'
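+        # Note: floor division drops the final partial batch (len(data) % batch_size samples)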
+        return int(np.floor(len(self.data) / self.batch_size))
+
+    def __getitem__(self, index):
+        'Generate one batch of data'
+        # Generate indexes of the batch
+        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
+
+        # Generate data
+        return self.__data_generation(indexes)
+
+
+    def on_epoch_end(self):
+        'Updates indexes after each epoch'
+        self.indexes = np.arange(len(self.data))
+        if self.shuffle == True:
+            np.random.shuffle(self.indexes)
+
+    def __data_generation(self, list_ids):
+        'Generates data containing batch_size samples' # X1, X2 : (batch_size, max_len)
+        # Initialization
+        X1 = np.empty((self.batch_size, self.dim))
+        X2 = np.empty((self.batch_size, self.dim))
+        y = np.zeros((self.batch_size,2), dtype=float)
+
+        # Generate data
+        for ix,i in enumerate(list_ids):
+            # Store sample
+            X1[ix,] = self.encoder.encode(self.data.toponym.iloc[i])
+            X2[ix,] = self.encoder.encode(self.data.toponym_context.iloc[i])
+            # Store target coordinates (longitude, latitude scaled to [0,1])
+            y[ix,] = list(zero_one_encoding(self.data.longitude.iloc[i],self.data.latitude.iloc[i]))
 
+        return [X1,X2],y
\ No newline at end of file
diff --git a/lib/geocoder/bert_geocoder.py b/lib/geocoder/bert_geocoder.py
index e74b7b9..037ddad 100644
--- a/lib/geocoder/bert_geocoder.py
+++ b/lib/geocoder/bert_geocoder.py
@@ -42,7 +42,7 @@ class BertGeocoder():
     def __init__(self,bert_model_dir,label_healpix_file,healpix_nside=128,batch_size=1):
         self.bert_model = BertForSequenceClassification.from_pretrained(bert_model_dir)
         self.bert_model.to(device)
-        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir)
+        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir,truncation=True)
         self.label_healpix = {v:k for k, v in pickle.load(open(label_healpix_file,'rb')).items()}
 
         self.nside = healpix_nside
@@ -50,7 +50,7 @@ class BertGeocoder():
         self.batch_size = batch_size
 
     def geocode(self,toponyms, context_toponyms):
-        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i],0] for i in range(len(toponyms))],columns=["sentence","label"]),self.tokenizer,batch_size=self.batch_size,shuffle=False)
+        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i],0] for i in range(len(toponyms))],columns=["sentence","label"]),self.tokenizer,batch_size=len(toponyms),shuffle=False)
         dataloader = DataLoader(data,  batch_size=self.batch_size)
         results = []
         for step, batch in enumerate(dataloader):
diff --git a/lib/torch_generator.py b/lib/torch_generator.py
index 7c6de28..3613086 100644
--- a/lib/torch_generator.py
+++ b/lib/torch_generator.py
@@ -27,7 +27,7 @@ class SentenceDataset(torch.utils.data.Dataset):
         self.current_batch_tokenized = self.tokenize(self.current_batch_id)
 
     def tokenize(self,batch_index):
-        X = [ self.tokenizer.encode(self.sentences[x],add_special_tokens = True,max_length=512) for x in self.batch_tokenization[batch_index]]# Tokenizer
+        X = [ self.tokenizer.encode(self.sentences[x],add_special_tokens = True,max_length=512,truncation=True) for x in self.batch_tokenization[batch_index]]# Tokenize, truncating to 512 tokens
         X = pad_sequences(X, maxlen=self.max_len, dtype="long", value=0, truncating="post", padding="post").tolist()
         return X
 
diff --git a/train_geocoder_v2.py b/train_geocoder_v2.py
index f1b1ddf..e5138f8 100644
--- a/train_geocoder_v2.py
+++ b/train_geocoder_v2.py
@@ -17,6 +17,7 @@ from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader
 from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
+from lib.datageneratorv4 import DataGenerator
 
 # Logging
 import logging
@@ -32,7 +33,7 @@ physical_devices = tf.config.list_physical_devices('GPU')
 tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
-    .parse_args()#("IGN GB_inclusion_perm.csv ../data/IGN/IGN_adjacent.csv GB_cooc_perm.csv  -a".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w  -a -n 4 --ngram-word2vec-iter 1".split())
 
 #
 #################################################
@@ -120,31 +121,33 @@ logging.info("Embedding generated !")
 #############################################################################################
 logging.info("Preparing Input and Output data...")
 
-X_1_train,X_2_train=[],[]
-X_1_test,X_2_test=[],[]
-y_train,y_test = [],[]
-
-for couple in pairs_of_toponym["toponym toponym_context split longitude latitude".split()].itertuples():
-    top,top_c,split_ = couple[1], couple[2], couple[3]
-    coord = zero_one_encoding(couple[-2],couple[-1]) # 0 and 1 encoding
-    enc_top, enc_top_c = index.encode(top),index.encode(top_c)
-    if split_ == "train":
-        X_1_train.append(enc_top)
-        X_2_train.append(enc_top_c)
-        y_train.append(list(coord))
-    else:
-        X_1_test.append(enc_top)
-        X_2_test.append(enc_top_c)
-        y_test.append(list(coord))
-
-# "NUMPYZE" inputs and output lists
-X_1_train = np.array(X_1_train)
-X_2_train = np.array(X_2_train)
-y_train = np.array(y_train)
-
-X_1_test = np.array(X_1_test)
-X_2_test = np.array(X_2_test)
-y_test = np.array(y_test)
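+# Build batches on the fly with a Keras Sequence (lib/datageneratorv4) instead of materializing full train/test arrays in memory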
+training_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "train"],index)
+validation_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "test"],index)
+# X_1_train,X_2_train=[],[]
+# X_1_test,X_2_test=[],[]
+# y_train,y_test = [],[]
+
+# for couple in pairs_of_toponym["toponym toponym_context split longitude latitude".split()].itertuples():
+#     top,top_c,split_ = couple[1], couple[2], couple[3]
+#     coord = zero_one_encoding(couple[-2],couple[-1]) # 0 and 1 encoding
+#     enc_top, enc_top_c = index.encode(top),index.encode(top_c)
+#     if split_ == "train":
+#         X_1_train.append(enc_top)
+#         X_2_train.append(enc_top_c)
+#         y_train.append(list(coord))
+#     else:
+#         X_1_test.append(enc_top)
+#         X_2_test.append(enc_top_c)
+#         y_test.append(list(coord))
+
+# # "NUMPYZE" inputs and output lists
+# X_1_train = np.array(X_1_train)
+# X_2_train = np.array(X_2_train)
+# y_train = np.array(y_train)
+
+# X_1_test = np.array(X_1_test)
+# X_2_test = np.array(X_2_test)
+# y_test = np.array(y_test)
 
 logging.info("Data prepared !")
 
@@ -195,12 +198,19 @@ checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=
 epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
 
 
-history = model.fit(x=[X_1_train,X_2_train],
-    y=y_train,
-    verbose=True, batch_size=100,
-    epochs=EPOCHS,
-    validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
-    callbacks=[checkpoint,epoch_timer])
+
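+# Stream batches from the Sequence generators; workers + use_multiprocessing parallelize batch preparation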
+history = model.fit_generator(generator=training_generator,
+                    validation_data=validation_generator,
+                    use_multiprocessing=True,
+                    workers=6,
+                    callbacks=[checkpoint,epoch_timer],epochs=EPOCHS)
+
+# history = model.fit(x=[X_1_train,X_2_train],
+#     y=y_train,
+#     verbose=True, batch_size=100,
+#     epochs=EPOCHS,
+#     validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
+#     callbacks=[checkpoint,epoch_timer])
 
 
 hist_df = pd.DataFrame(history.history)
-- 
GitLab