Commit cb8a6d14 authored by Jacques Fize

debug + add datagenerator train geocoder v2

parent c3dfbda0
# Toponym Geocoding
Use of n-gram representations and co-occurrence of toponyms in geography and text for geocoding.

This repository contains the code for *"Using a deep neural network for toponym geocoding based on co-occurrences and spatial relations"*. In a nutshell, we propose to geocode place names using the least information available (two place names: one to geocode and a second used as context) and rely on a deep neural network architecture. For example, "Paris" with the context toponym "Texas" should resolve to a different location than "Paris" with the context toponym "France".
<div style="text-align:center">
<img src="documentation/imgs/LSTM_arch.png"/>
<p><strong>Figure 1</strong> : General workflow</p>
</div>
<hr>
@@ -13,9 +9,9 @@ Use of ngram representation and colocation of toponyms in geography and text for geocoding
## Setup environment

- Python 3.6+
- OS free (all dependencies should work on Windows; see note below)

**Note:** It is strongly advised to use Anaconda in a Windows environment!
### Install dependencies
@@ -50,11 +46,29 @@ French Geonames, French Wikipedia cooccurrence data, and their train/test splits
<hr>
## Train your own model

### First model

Like every model proposed here, this model is a neural network. Its architecture is illustrated in Figure 2. In a nutshell, it encodes the toponym to geocode and its context toponym, then predicts the coordinates through separate latitude and longitude outputs.

<div style="text-align:center">
<img src="documentation/imgs/LSTM_archv2.png"/>
<p><strong>Figure 2</strong> : General workflow of the first model</p>
</div>

To train this model with default parameters, use the following command:

    python3 train_geocoder.py
### Second model

This model is the same as the first one, except that the output is a concatenation of the latitude and longitude outputs. To train it:

    python3 train_geocoder_v2.py
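As a rough illustration of that output change, here is a minimal Keras sketch. This is not the repository's actual architecture: the input, layer sizes, and sigmoid heads are assumptions (sigmoid because targets are scaled to [0, 1] by `zero_one_encoding`).

```python
from keras.layers import Input, Dense, Concatenate
from keras.models import Model

# Placeholder for the fused representation of the two toponym inputs
inp = Input(shape=(64,))
x = Dense(128, activation="relu")(inp)
lon = Dense(1, activation="sigmoid", name="longitude")(x)
lat = Dense(1, activation="sigmoid", name="latitude")(x)

# First model: two separate outputs  -> Model(inp, [lon, lat])
# Second model: one concatenated 2-dim output
coords = Concatenate(name="coordinates")([lon, lat])
model = Model(inp, coords)
```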
### BERT model

Recently, a popular model called BERT has shown great promise on various NLP tasks. This last model uses a pretrained BERT model. More precisely, we use BERT in a classifier where the classes correspond to Healpix cells.
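For intuition, here is a minimal sketch of how a coordinate pair can be mapped to a Healpix cell with the `healpy` package (the exact label construction used by the repository may differ):

```python
import healpy as hp

# Map a (longitude, latitude) pair to a Healpix cell index.
# nside=128 matches the default healpix_nside used by BertGeocoder below.
nside = 128
cell = hp.ang2pix(nside, 2.35, 48.85, lonlat=True)  # Paris, France
print(cell)  # the classifier's class label is derived from this cell index
```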
### Train the network with different parameters
@@ -66,7 +80,7 @@

from lib.run import GridSearchModel
from collections import OrderedDict
grid = GridSearchModel(\
    "python3 train_geocoder_v2.py",
    **OrderedDict({ # We use an OrderedDict since the order of parameters is important
        "rel": ["-i", "-a", "-c"],
        "-n": [4],
......
Changed images: documentation/imgs/LSTM_arch.png, documentation/imgs/LSTM_archv2.png, documentation/imgs/first_approach.png, documentation/imgs/second_approach.png, documentation/imgs/third_approach.png
@@ -6,19 +6,60 @@

from keras.utils import to_categorical
import numpy as np
import pandas as pd
import keras

from lib.utils_geo import zero_one_encoding
from helpers import parse_title_wiki, read_geonames
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import LabelEncoder
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, pairs_of_toponyms, encoder, batch_size=32, shuffle=True):
        'Initialization'
        self.data = pairs_of_toponyms
        self.encoder = encoder
        self.dim = self.encoder.max_len
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Select the indexes belonging to this batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_ids):
        'Generates one batch: two encoded toponym inputs and coordinate targets'
        X1 = np.empty((self.batch_size, self.dim))
        X2 = np.empty((self.batch_size, self.dim))
        y = np.zeros((self.batch_size, 2), dtype=float)
        for ix, i in enumerate(list_ids):
            # Store the n-gram-encoded toponym and its context toponym
            X1[ix,] = self.encoder.encode(self.data.toponym.iloc[i])
            X2[ix,] = self.encoder.encode(self.data.toponym_context.iloc[i])
            # Store the target coordinates, scaled to [0, 1]
            y[ix,] = list(zero_one_encoding(self.data.longitude.iloc[i], self.data.latitude.iloc[i]))
        return [X1, X2], y
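A minimal usage sketch of this generator (the `DummyEncoder` below is hypothetical; the real encoder from the repository must expose `max_len` and `encode()`):

```python
import numpy as np
import pandas as pd

# Hypothetical stand-in for the repository's n-gram encoder
class DummyEncoder:
    max_len = 10
    def encode(self, toponym):
        return np.zeros(self.max_len)

df = pd.DataFrame({"toponym": ["Paris"] * 64,
                   "toponym_context": ["France"] * 64,
                   "longitude": [2.35] * 64,
                   "latitude": [48.85] * 64})
gen = DataGenerator(df, DummyEncoder(), batch_size=32)
(x1, x2), y = gen[0]                 # first batch
print(x1.shape, x2.shape, y.shape)   # (32, 10) (32, 10) (32, 2)
```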
@@ -42,7 +42,7 @@

class BertGeocoder():
    def __init__(self, bert_model_dir, label_healpix_file, healpix_nside=128, batch_size=1):
        self.bert_model = BertForSequenceClassification.from_pretrained(bert_model_dir)
        self.bert_model.to(device)
        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir, truncation=True)
        self.label_healpix = {v: k for k, v in pickle.load(open(label_healpix_file, 'rb')).items()}
        self.nside = healpix_nside

@@ -50,7 +50,7 @@

        self.batch_size = batch_size

    def geocode(self, toponyms, context_toponyms):
        # Build one "toponym context_toponym" sentence per pair (labels are dummies)
        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i], 0] for i in range(len(toponyms))], columns=["sentence", "label"]), self.tokenizer, batch_size=len(toponyms), shuffle=False)
        dataloader = DataLoader(data, batch_size=self.batch_size)
        results = []
        for step, batch in enumerate(dataloader):
......
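A hedged usage sketch of this class (the paths below are placeholders, and this excerpt does not show the exact structure of the returned results):

```python
# Hypothetical paths; point these at your trained model and label mapping.
geocoder = BertGeocoder("outputs/bert_model", "outputs/label_healpix.pkl")
results = geocoder.geocode(["Paris"], ["France"])
# Each result corresponds to a predicted Healpix cell for the input pair.
```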
@@ -27,7 +27,7 @@ class SentenceDataset(torch.utils.data.Dataset):

        self.current_batch_tokenized = self.tokenize(self.current_batch_id)

    def tokenize(self, batch_index):
        # Tokenize each sentence, truncating anything beyond BERT's 512-token limit
        X = [self.tokenizer.encode(self.sentences[x], add_special_tokens=True, max_length=512, truncation=True) for x in self.batch_tokenization[batch_index]]
        X = pad_sequences(X, maxlen=self.max_len, dtype="long", value=0, truncating="post", padding="post").tolist()
        return X
......
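The added `truncation=True` makes the clipping explicit instead of leaving long inputs to trigger a warning or error; a quick illustration, assuming a loaded HuggingFace `BertTokenizer` named `tokenizer`:

```python
# Assumes: tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
ids = tokenizer.encode("a very long sentence " * 300,
                       add_special_tokens=True,
                       max_length=512, truncation=True)
assert len(ids) <= 512  # inputs over the model limit are clipped, not rejected
```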
@@ -17,6 +17,7 @@

from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader
from lib.utils_geo import accuracy_k,haversine_tf_1circle
from helpers import EpochTimer
from lib.datageneratorv4 import DataGenerator
# Logging
import logging
@@ -32,7 +33,7 @@

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
# COMMAND ARGS
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
    .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
#
#################################################
@@ -120,31 +121,33 @@

logging.info("Embedding generated !")
#############################################################################################
logging.info("Preparing Input and Output data...")
# Batches are now built lazily by the DataGenerator instead of materializing
# the full train/test arrays in memory up front.
training_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "train"], index)
validation_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "test"], index)
logging.info("Data prepared !")
@@ -195,12 +198,19 @@ checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=training_generator,
                              validation_data=validation_generator,
                              use_multiprocessing=True,
                              workers=6,
                              callbacks=[checkpoint, epoch_timer],
                              epochs=EPOCHS)
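With a `keras.utils.Sequence` generator, Keras derives the number of steps per epoch from `len(generator)`, i.e. `floor(n_samples / batch_size)` as defined in `DataGenerator.__len__`. A quick sanity check, assuming the objects built above:

```python
print(len(training_generator))       # batches per epoch
(x1, x2), y = training_generator[0]  # peek at the first batch
print(x1.shape, x2.shape, y.shape)   # (32, max_len), (32, max_len), (32, 2)
```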
hist_df = pd.DataFrame(history.history)
......