From cb8a6d14fab0e1b8a4ba0cc24fe3be2488220b03 Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Wed, 14 Oct 2020 18:09:33 +0200
Subject: [PATCH] debug + add datagenerator train geocoder v2

---
 README.md                              | 36 +++++++++----
 documentation/imgs/LSTM_arch.png       |  3 --
 documentation/imgs/LSTM_archv2.png     |  3 ++
 documentation/imgs/first_approach.png  |  3 --
 documentation/imgs/second_approach.png |  3 --
 documentation/imgs/third_approach.png  |  3 --
 lib/datageneratorv4.py                 | 55 ++++++++++++++++---
 lib/geocoder/bert_geocoder.py          |  4 +-
 lib/torch_generator.py                 |  2 +-
 train_geocoder_v2.py                   | 74 +++++++++++++++-----------
 10 files changed, 121 insertions(+), 65 deletions(-)
 delete mode 100644 documentation/imgs/LSTM_arch.png
 create mode 100644 documentation/imgs/LSTM_archv2.png
 delete mode 100644 documentation/imgs/first_approach.png
 delete mode 100644 documentation/imgs/second_approach.png
 delete mode 100644 documentation/imgs/third_approach.png

diff --git a/README.md b/README.md
index 9bd2998..e41fe94 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,7 @@
 # Toponym Geocoding
-Use of ngram representation and colocation of toponyms in geography and text for geocoding
+This repository contains the code for *"Using a deep neural network for toponym geocoding based on co-occurrences and spatial relations"*. In a nutshell, we propose to geocode place names using the least information available (two place names: one to geocode and a second used as context) and rely on a deep neural network architecture.
 
-<div style="text-align:center">
-<img src="documentation/imgs/LSTM_arch.png"/>
-<p><strong>Figure 1</strong> : General workflow</p>
-</div>
 
 <hr>
 
@@ -13,9 +9,9 @@
 ## Setup environnement
 
 - Python3.6+
-- Os free (all dependencies should work on Windows !)
+- OS free**
 
-It is strongly advised to used Anaconda in a windows environnement!
+***It is strongly advised to use Anaconda in a Windows environment!***
 
 ### Install dependencies
 
@@ -50,11 +46,29 @@ French Geonames, French Wikipedia cooccurence data, and their train/test splits
 
 <hr>
 
-## Train the network
+## Train your own model
+
+### First model
+
+Like every proposed model, this one is a neural network. The first model is illustrated in Figure 1: in a nutshell, it takes two toponyms as input (the one to geocode and a context toponym) and outputs the predicted coordinates.
+<div style="text-align:center">
+<img src="documentation/imgs/LSTM_archv2.png"/>
+<p><strong>Figure 1</strong> : General workflow</p>
+</div>
+
+    python3 train_geocoder.py
+
+### Second model
+
+This model is the same as the first one, except that its output is a concatenation of the latitude and longitude outputs.
+
+    python3 train_geocoder_v2.py
+
-To train the network with default parameter use the following command :
+### BERT model
+
+Recently, a popular model called BERT has shown great promise on various NLP tasks. This last model builds on a pretrained BERT model. More precisely, we use BERT in a classifier whose classes correspond to HEALPix cells.
+
-	python3 combination_embeddings.py -i <geoname data filename> <hierarchy geonames data filename>
+    python3 train_geocoder.py
 
 ### Train the network with different parameters
 
@@ -66,7 +80,7 @@ from lib.run import GridSearchModel
 from collections import OrderedDict
 
 grid = GridSearchModel(\
-    "python3 combination_embeddings.py",
+    "python3 train_geocoder_v2.py",
     **OrderedDict({ # We use an OrderedDict since the order of parameters is important
     "rel":["-i","-a","-c"],
     "-n":[4],
diff --git a/documentation/imgs/LSTM_arch.png b/documentation/imgs/LSTM_arch.png
deleted file mode 100644
index 76768fe..0000000
--- a/documentation/imgs/LSTM_arch.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bfef79723d44b6ef9ac7784eba28b7b022d18967c5e7d3b40b421b16401f5907
-size 19907
diff --git a/documentation/imgs/LSTM_archv2.png b/documentation/imgs/LSTM_archv2.png
new file mode 100644
index 0000000..4c8170b
--- /dev/null
+++ b/documentation/imgs/LSTM_archv2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:45db879a1449953316babe33ba0402a96c791f14b2c40cac5df54aca2ee89334
+size 187798
diff --git a/documentation/imgs/first_approach.png b/documentation/imgs/first_approach.png
deleted file mode 100644
index 297c1a5..0000000
--- a/documentation/imgs/first_approach.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5a243605f4d58dee8bad4a18845ab78ca2319049e633b35e6a89540add684be8
-size 298011
diff --git a/documentation/imgs/second_approach.png b/documentation/imgs/second_approach.png
deleted file mode 100644
index e5e693f..0000000
--- a/documentation/imgs/second_approach.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8bab13df1e420e97a08977aa38382076491a8294d85b7daa0a10d69a36a52fc0
-size 457738
diff --git a/documentation/imgs/third_approach.png b/documentation/imgs/third_approach.png
deleted file mode 100644
index d96596a..0000000
--- a/documentation/imgs/third_approach.png
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ad7cbed2e748b814c38eb070d29a23f7417469169a56aeb0e660a743e00430fd
-size 31104
diff --git a/lib/datageneratorv4.py b/lib/datageneratorv4.py
index c1093b2..e451035 100644
--- a/lib/datageneratorv4.py
+++ b/lib/datageneratorv4.py
@@ -6,19 +6,60 @@ from keras.utils import to_categorical
 import numpy as np
 import pandas as pd
 
-from .utils_geo import zero_one_encoding
+from lib.utils_geo import zero_one_encoding
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
 from sklearn.preprocessing import LabelEncoder
 
+import numpy as np
+import keras
 
 class DataGenerator(keras.utils.Sequence):
-    def __init__(self,*dataset):
-        pass
-    def __next__(self):
-        pass
-    def isOver(self):
-        pass
+    'Generates data for Keras'
+    def __init__(self, pairs_of_toponyms, encoder, batch_size=32, shuffle=True):
+        'Initialization'
+        self.data = pairs_of_toponyms
+        self.encoder = encoder
+        self.dim = self.encoder.max_len
+        self.shuffle = shuffle
+
+        self.batch_size = batch_size
+        self.on_epoch_end()
+
+    def __len__(self):
+        'Denotes the number of batches per epoch'
+        return int(np.floor(len(self.data) / self.batch_size))
+
+    def __getitem__(self, index):
+        'Generate one batch of data'
+        # Generate indexes of the batch
+        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
+
+        # Generate data
+        return self.__data_generation(indexes)
+
+    def on_epoch_end(self):
+        'Updates indexes after each epoch'
+        self.indexes = np.arange(len(self.data))
+        if self.shuffle:
+            np.random.shuffle(self.indexes)
+
+    def __data_generation(self, list_ids):
+        'Generates data containing batch_size samples'  # X1, X2 : (batch_size, max_len)
+        # Initialization
+        X1 = np.empty((self.batch_size, self.dim))
+        X2 = np.empty((self.batch_size, self.dim))
+        y = np.zeros((self.batch_size, 2), dtype=float)
+
+        # Generate data
+        for ix, i in enumerate(list_ids):
+            # Store sample
+            X1[ix,] = self.encoder.encode(self.data.toponym.iloc[i])
+            X2[ix,] = self.encoder.encode(self.data.toponym_context.iloc[i])
+            # Store class
+            y[ix,] = list(zero_one_encoding(self.data.longitude.iloc[i], self.data.latitude.iloc[i]))
+        return [X1, X2], y
\ No newline at end of file
diff --git a/lib/geocoder/bert_geocoder.py b/lib/geocoder/bert_geocoder.py
index e74b7b9..037ddad 100644
--- a/lib/geocoder/bert_geocoder.py
+++ b/lib/geocoder/bert_geocoder.py
@@ -42,7 +42,7 @@ class BertGeocoder():
     def __init__(self,bert_model_dir,label_healpix_file,healpix_nside=128,batch_size=1):
         self.bert_model = BertForSequenceClassification.from_pretrained(bert_model_dir)
         self.bert_model.to(device)
-        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir)
+        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir,truncation=True)
         self.label_healpix = {v:k for k, v in pickle.load(open(label_healpix_file,'rb')).items()}
         self.nside = healpix_nside
 
@@ -50,7 +50,7 @@ class BertGeocoder():
         self.batch_size = batch_size
 
     def geocode(self,toponyms, context_toponyms):
-        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i],0] for i in range(len(toponyms))],columns=["sentence","label"]),self.tokenizer,batch_size=self.batch_size,shuffle=False)
+        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i],0] for i in range(len(toponyms))],columns=["sentence","label"]),self.tokenizer,batch_size=len(toponyms),shuffle=False)
         dataloader = DataLoader(data, batch_size=self.batch_size)
         results = []
         for step, batch in enumerate(dataloader):
diff --git a/lib/torch_generator.py b/lib/torch_generator.py
index 7c6de28..3613086 100644
--- a/lib/torch_generator.py
+++ b/lib/torch_generator.py
@@ -27,7 +27,7 @@ class SentenceDataset(torch.utils.data.Dataset):
         self.current_batch_tokenized = self.tokenize(self.current_batch_id)
 
     def tokenize(self,batch_index):
-        X = [ self.tokenizer.encode(self.sentences[x],add_special_tokens = True,max_length=512) for x in self.batch_tokenization[batch_index]]# Tokenizer
+        X = [ self.tokenizer.encode(self.sentences[x],add_special_tokens = True,max_length=512,truncation=True) for x in self.batch_tokenization[batch_index]]# Tokenizer
         X = pad_sequences(X, maxlen=self.max_len, dtype="long", value=0, truncating="post", padding="post").tolist()
         return X
 
diff --git a/train_geocoder_v2.py b/train_geocoder_v2.py
index f1b1ddf..e5138f8 100644
--- a/train_geocoder_v2.py
+++ b/train_geocoder_v2.py
@@ -17,6 +17,7 @@ from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader
 from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
+from lib.datageneratorv4 import DataGenerator
 
 # Logging
 import logging
@@ -32,7 +33,7 @@ physical_devices = tf.config.list_physical_devices('GPU')
 tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
 
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
-    .parse_args()#("IGN GB_inclusion_perm.csv ../data/IGN/IGN_adjacent.csv GB_cooc_perm.csv -a".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
-a".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split()) + .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split()) # ################################################# @@ -120,31 +121,33 @@ logging.info("Embedding generated !") ############################################################################################# logging.info("Preparing Input and Output data...") -X_1_train,X_2_train=[],[] -X_1_test,X_2_test=[],[] -y_train,y_test = [],[] - -for couple in pairs_of_toponym["toponym toponym_context split longitude latitude".split()].itertuples(): - top,top_c,split_ = couple[1], couple[2], couple[3] - coord = zero_one_encoding(couple[-2],couple[-1]) # 0 and 1 encoding - enc_top, enc_top_c = index.encode(top),index.encode(top_c) - if split_ == "train": - X_1_train.append(enc_top) - X_2_train.append(enc_top_c) - y_train.append(list(coord)) - else: - X_1_test.append(enc_top) - X_2_test.append(enc_top_c) - y_test.append(list(coord)) - -# "NUMPYZE" inputs and output lists -X_1_train = np.array(X_1_train) -X_2_train = np.array(X_2_train) -y_train = np.array(y_train) - -X_1_test = np.array(X_1_test) -X_2_test = np.array(X_2_test) -y_test = np.array(y_test) +training_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "train"],index) +validation_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "test"],index) +# X_1_train,X_2_train=[],[] +# X_1_test,X_2_test=[],[] +# y_train,y_test = [],[] + +# for couple in pairs_of_toponym["toponym toponym_context split longitude latitude".split()].itertuples(): +# top,top_c,split_ = couple[1], couple[2], couple[3] +# coord = zero_one_encoding(couple[-2],couple[-1]) # 0 and 1 encoding +# enc_top, enc_top_c = index.encode(top),index.encode(top_c) +# if split_ == "train": +# X_1_train.append(enc_top) +# X_2_train.append(enc_top_c) +# y_train.append(list(coord)) +# else: +# X_1_test.append(enc_top) +# X_2_test.append(enc_top_c) +# y_test.append(list(coord)) + +# # "NUMPYZE" inputs and output lists +# X_1_train = np.array(X_1_train) +# X_2_train = np.array(X_2_train) +# y_train = np.array(y_train) + +# X_1_test = np.array(X_1_test) +# X_2_test = np.array(X_2_test) +# y_test = np.array(y_test) logging.info("Data prepared !") @@ -195,12 +198,19 @@ checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose= epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") -history = model.fit(x=[X_1_train,X_2_train], - y=y_train, - verbose=True, batch_size=100, - epochs=EPOCHS, - validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]), - callbacks=[checkpoint,epoch_timer]) + +history = model.fit_generator(generator=training_generator, + validation_data=validation_generator, + use_multiprocessing=True, + workers=6, + callbacks=[checkpoint,epoch_timer],epochs=EPOCHS) + +# history = model.fit(x=[X_1_train,X_2_train], +# y=y_train, +# verbose=True, batch_size=100, +# epochs=EPOCHS, +# validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]), +# callbacks=[checkpoint,epoch_timer]) hist_df = pd.DataFrame(history.history) -- GitLab