Commit cb8a6d14 authored by Jacques Fize

debug + add datagenerator train geocoder v2

parent c3dfbda0
# Toponym Geocoding
Use of n-gram representations and co-occurrence of toponyms in geography and text for geocoding.

This repository contains the code for *"Using a deep neural network for toponym geocoding based on co-occurrences and spatial relations"*. In a nutshell, we propose to geocode place names using the least information available (two place names: one to geocode and a second used as context) and rely on a deep neural network architecture. For example, "Paris" with the context toponym "Texas" should resolve to a different location than "Paris" with the context toponym "France".
<div style="text-align:center">
<img src="documentation/imgs/LSTM_arch.png"/>
<p><strong>Figure 1</strong> : General workflow</p>
</div>
<hr>
@@ -13,9 +9,9 @@ Use of ngram representation and colocation of toponyms in geography and text for geocoding
## Setup environment

- Python 3.6+
- OS free (all dependencies should work on Windows; see note below)

**Note:** It is strongly advised to use Anaconda in a Windows environment!
### Install dependencies
@@ -50,11 +46,29 @@ French Geonames, French Wikipedia cooccurrence data, and their train/test splits
<hr>
## Train your own model

### First model

Like every model proposed here, this model is a neural network. Its architecture is illustrated in Figure 2. In a nutshell, it encodes the toponym to geocode and its context toponym, then predicts the coordinates through separate latitude and longitude outputs.

<div style="text-align:center">
<img src="documentation/imgs/LSTM_archv2.png"/>
<p><strong>Figure 2</strong> : General workflow of the first model</p>
</div>

To train this model with default parameters, use the following command:

    python3 train_geocoder.py
### Second model

This model is the same as the first one, except that the output is a concatenation of the latitude and longitude outputs. To train it:

    python3 train_geocoder_v2.py
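As a rough illustration of that output change, here is a minimal Keras sketch. This is not the repository's actual architecture: the input, layer sizes, and sigmoid heads are assumptions (sigmoid because targets are scaled to [0, 1] by `zero_one_encoding`).

```python
from keras.layers import Input, Dense, Concatenate
from keras.models import Model

# Placeholder for the fused representation of the two toponym inputs
inp = Input(shape=(64,))
x = Dense(128, activation="relu")(inp)
lon = Dense(1, activation="sigmoid", name="longitude")(x)
lat = Dense(1, activation="sigmoid", name="latitude")(x)

# First model: two separate outputs  -> Model(inp, [lon, lat])
# Second model: one concatenated 2-dim output
coords = Concatenate(name="coordinates")([lon, lat])
model = Model(inp, coords)
```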
### BERT model

Recently, a popular model called BERT has shown great promise on various NLP tasks. This last model uses a pretrained BERT model. More precisely, we use BERT in a classifier where the classes correspond to Healpix cells.
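For intuition, here is a minimal sketch of how a coordinate pair can be mapped to a Healpix cell with the `healpy` package (the exact label construction used by the repository may differ):

```python
import healpy as hp

# Map a (longitude, latitude) pair to a Healpix cell index.
# nside=128 matches the default healpix_nside used by BertGeocoder below.
nside = 128
cell = hp.ang2pix(nside, 2.35, 48.85, lonlat=True)  # Paris, France
print(cell)  # the classifier's class label is derived from this cell index
```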
### Train the network with different parameters
@@ -66,7 +80,7 @@

from lib.run import GridSearchModel
from collections import OrderedDict
grid = GridSearchModel(\
    "python3 train_geocoder_v2.py",
    **OrderedDict({ # We use an OrderedDict since the order of parameters is important
        "rel": ["-i", "-a", "-c"],
        "-n": [4],
......
Changed images: documentation/imgs/LSTM_arch.png, documentation/imgs/LSTM_archv2.png, documentation/imgs/first_approach.png, documentation/imgs/second_approach.png, documentation/imgs/third_approach.png
@@ -6,19 +6,60 @@

from keras.utils import to_categorical
import numpy as np
import pandas as pd
import keras

from lib.utils_geo import zero_one_encoding
from helpers import parse_title_wiki, read_geonames
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import LabelEncoder
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, pairs_of_toponyms, encoder, batch_size=32, shuffle=True):
        'Initialization'
        self.data = pairs_of_toponyms
        self.encoder = encoder
        self.dim = self.encoder.max_len
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Select the indexes belonging to this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_ids):
        'Generates one batch: two encoded toponym inputs and coordinate targets'
        X1 = np.empty((self.batch_size, self.dim))
        X2 = np.empty((self.batch_size, self.dim))
        y = np.zeros((self.batch_size, 2), dtype=float)
        for ix, i in enumerate(list_ids):
            # Store the n-gram-encoded toponym and its context toponym
            X1[ix,] = self.encoder.encode(self.data.toponym.iloc[i])
            X2[ix,] = self.encoder.encode(self.data.toponym_context.iloc[i])
            # Store the target coordinates, scaled to [0, 1]
            y[ix,] = list(zero_one_encoding(self.data.longitude.iloc[i], self.data.latitude.iloc[i]))
        return [X1, X2], y
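A minimal usage sketch of this generator (the `DummyEncoder` below is hypothetical; the real encoder from the repository must expose `max_len` and `encode()`):

```python
import numpy as np
import pandas as pd

# Hypothetical stand-in for the repository's n-gram encoder
class DummyEncoder:
    max_len = 10
    def encode(self, toponym):
        return np.zeros(self.max_len)

df = pd.DataFrame({"toponym": ["Paris"] * 64,
                   "toponym_context": ["France"] * 64,
                   "longitude": [2.35] * 64,
                   "latitude": [48.85] * 64})
gen = DataGenerator(df, DummyEncoder(), batch_size=32)
(x1, x2), y = gen[0]                 # first batch
print(x1.shape, x2.shape, y.shape)   # (32, 10) (32, 10) (32, 2)
```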
@@ -42,7 +42,7 @@

class BertGeocoder():
    def __init__(self, bert_model_dir, label_healpix_file, healpix_nside=128, batch_size=1):
        self.bert_model = BertForSequenceClassification.from_pretrained(bert_model_dir)
        self.bert_model.to(device)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir, truncation=True)
        self.label_healpix = {v: k for k, v in pickle.load(open(label_healpix_file, 'rb')).items()}
        self.nside = healpix_nside

@@ -50,7 +50,7 @@

        self.batch_size = batch_size

    def geocode(self, toponyms, context_toponyms):
        # Build one "toponym context_toponym" sentence per pair (labels are dummies)
        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i], 0] for i in range(len(toponyms))], columns=["sentence", "label"]), self.tokenizer, batch_size=len(toponyms), shuffle=False)
        dataloader = DataLoader(data, batch_size=self.batch_size)
        results = []
        for step, batch in enumerate(dataloader):
......
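A hedged usage sketch of this class (the paths below are placeholders, and this excerpt does not show the exact structure of the returned results):

```python
# Hypothetical paths; point these at your trained model and label mapping.
geocoder = BertGeocoder("outputs/bert_model", "outputs/label_healpix.pkl")
results = geocoder.geocode(["Paris"], ["France"])
# Each result corresponds to a predicted Healpix cell for the input pair.
```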
@@ -27,7 +27,7 @@ class SentenceDataset(torch.utils.data.Dataset):

        self.current_batch_tokenized = self.tokenize(self.current_batch_id)

    def tokenize(self, batch_index):
        # Tokenize each sentence, truncating anything beyond BERT's 512-token limit
        X = [self.tokenizer.encode(self.sentences[x], add_special_tokens=True, max_length=512, truncation=True) for x in self.batch_tokenization[batch_index]]
        X = pad_sequences(X, maxlen=self.max_len, dtype="long", value=0, truncating="post", padding="post").tolist()
        return X
......
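The added `truncation=True` makes the clipping explicit instead of leaving long inputs to trigger a warning or error; a quick illustration, assuming a loaded HuggingFace `BertTokenizer` named `tokenizer`:

```python
# Assumes: tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("a very long sentence " * 300,
                       add_special_tokens=True,
                       max_length=512, truncation=True)
assert len(ids) <= 512  # inputs over the model limit are clipped, not rejected
```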
@@ -17,6 +17,7 @@

from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader
from lib.utils_geo import accuracy_k,haversine_tf_1circle
from helpers import EpochTimer
from lib.datageneratorv4 import DataGenerator
# Logging
import logging
@@ -32,7 +33,7 @@

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
# COMMAND ARGS
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
    .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
#
#################################################
@@ -120,31 +121,33 @@

logging.info("Embedding generated !")
#############################################################################################
logging.info("Preparing Input and Output data...")
# Batches are now built lazily by the DataGenerator instead of materializing
# the full train/test arrays in memory up front.
training_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "train"], index)
validation_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "test"], index)
logging.info("Data prepared !")
@@ -195,12 +198,19 @@ checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=training_generator,
                              validation_data=validation_generator,
                              use_multiprocessing=True,
                              workers=6,
                              callbacks=[checkpoint, epoch_timer],
                              epochs=EPOCHS)
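With a `keras.utils.Sequence` generator, Keras derives the number of steps per epoch from `len(generator)`, i.e. `floor(n_samples / batch_size)` as defined in `DataGenerator.__len__`. A quick sanity check, assuming the objects built above:

```python
print(len(training_generator))       # batches per epoch
(x1, x2), y = training_generator[0]  # peek at the first batch
print(x1.shape, x2.shape, y.shape)   # (32, max_len), (32, max_len), (32, 2)
```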
hist_df = pd.DataFrame(history.history)
......