From c9c3c602de060879111a61fc3ddb5363b11c3eaa Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Thu, 15 Oct 2020 13:15:28 +0200
Subject: [PATCH] UPDATE README + add geocoding heuristics file + minor debug

---
 README.md                    | 164 +++++++++++++++++++++++++----------
 lib/geocoder/heuristics.py   |   0
 lib/geocoder/our_geocoder.py |   4 +-
 3 files changed, 120 insertions(+), 48 deletions(-)
 create mode 100644 lib/geocoder/heuristics.py

diff --git a/README.md b/README.md
index e41fe94..f015b7b 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# Toponym Geocoding
+
This repository contains the code for *"Using a deep neural network for toponym geocoding based on co-occurrences and spatial relations"*. In a nutshell, we propose to geocode place names using the least information available (two place names: one to geocode and a second used as context) and a deep neural network architecture.

<hr>

@@ -6,14 +6,28 @@
-## Setup environnement
+# Model architecture
+
+The model is a neural network. The first model is illustrated in Figure 1. In a nutshell, it aims to predict coordinates (output) from two place names: the first is the one we want to geocode, and the second is used as context.
+
+In an experiment (presented [here](https://jacobe2169.github.io/mapthetoponymsim/)), we found that specific toponym affixes (suffixes or prefixes, for example) are bound to certain geographic areas. Based on this assumption, we decided to use an n-gram sequence representation of input toponyms. For example, Paris is transformed into Par, ari, ris, as sketched below.
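+
+For illustration, here is a minimal sketch of that decomposition (an assumption about the behaviour, not the repository's actual n-gram encoder):
+
+```python
+def ngrams(toponym, n=3):
+    """Decompose a toponym into its sequence of overlapping character n-grams."""
+    return [toponym[i:i + n] for i in range(len(toponym) - n + 1)]
+
+print(ngrams("Paris"))  # ['Par', 'ari', 'ris']
+```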
+
+<div style="text-align:center">
+<img src="documentation/imgs/LSTM_archv2.png"/>
+<p><strong>Figure 1</strong>: General workflow</p>
+</div>
+
+<hr>
+
+# Setup environment

- Python 3.6+
- OS independent**

***It is strongly advised to use Anaconda in a Windows environment!*

-### Install dependencies
+## Install dependencies

    pip3 install -r requirements.txt

For Anaconda users

@@ -22,15 +36,60 @@
    while read requirement; do conda install --yes $requirement; done < requirements.txt
+
+<hr>
+
+# Get Started
+
+## Get a pre-trained model
+
+Pre-trained models are available:
+
+| Geographic Area | Description                                                      | URL                                                                                   |
+|-----------------|------------------------------------------------------------------|---------------------------------------------------------------------------------------|
+| FR              | Model trained on populated places and areas in France            | [Download](https://projet.liris.cnrs.fr/hextgeo/files/trained_models/FR_MODEL_2.zip)  |
+| GB              | Model trained on populated places and areas in England           | [Download](https://projet.liris.cnrs.fr/hextgeo/files/trained_models/GB_MODEL_2.zip)  |
+| US              | Model trained on populated places and areas in the United States | [Download](https://projet.liris.cnrs.fr/hextgeo/files/trained_models/US_MODEL_2.zip)  |
+
+## Load and use the model
+
+First, import the dedicated module and load the pre-trained model files. Here, we use the France model.
+
+```python
+from lib.geocoder.our_geocoder import Geocoder
+
+g = Geocoder("FR_MODEL_2/FR.txt_100_4_100__A_C.h5", "FR_MODEL_2/FR.txt_100_4_100__A_C_index")
+```
+
+To geocode a pair of toponyms, use the `get_coord` method:
+
+```python
+print(g.get_coord("Paris", "France"))
+# (2.7003836631774902, 41.24913454055786)  # lon, lat
+```
+
+To reduce computation time, use the `get_coords` method to geocode multiple pairs of toponyms at once:
+
+```python
+print(g.get_coords(["Paris", "Paris"], ["Cherbourg", "Montpellier"]))
+# (array([2.6039734, 3.480011 ], dtype=float32),
+#  array([48.27507 , 48.075943], dtype=float32))
+```
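+
+Both methods return coordinates in WGS84 degrees: the network itself outputs a longitude and a latitude normalized to [0, 1], which the `wgs_coord` method rescales (see the `our_geocoder.py` diff at the end of this patch). A standalone sketch of that rescaling, assuming nothing beyond what the patch shows:
+
+```python
+def wgs_coord(lon, lat):
+    """Rescale normalized model outputs in [0, 1] to WGS84 degrees."""
+    return ((lon * 360) - 180), ((lat * 180) - 90)
+
+print(wgs_coord(0.5, 0.5))  # (0.0, 0.0): center of the normalized space
+print(wgs_coord(1.0, 1.0))  # (180.0, 90.0): upper bounds of the WGS84 range
+```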

<hr>

-## Prepare required data
+# Train your own model
+
+We propose an implementation of the model illustrated in Figure 1, and a second one based on the same input but using a pre-trained BERT model.
+
+## Prepare data
+
+The data preparation is divided into three steps. First, we retrieve the required data from Geonames. Second, we retrieve place name co-occurrences from Wikipedia. Finally, we generate the datasets used to train the model.

### Geonames data

1. Download the Geonames data used to train the network [here](http://download.geonames.org/export/dump/)
2. Download the hierarchy data [here](http://download.geonames.org/export/dump/hierarchy.zip)
3. Unzip both files in the directory of your choice
-4. run the script `train_test_split_geonames.py <geoname_filename>`

### Co-occurrence data

@@ -38,39 +97,73 @@
6. Parse the corpus with the Gensim script using the following command: `python3 -m gensim.scripts.segment_wiki -i -f <wikicorpus> -o <1stoutputname>.json.gz`
7. Build a page-of-interest file that contains a list of Wikipedia pages. The file must be a CSV with the following columns: title, latitude, longitude.<br> You can find [here](https://projet.liris.cnrs.fr/hextgeo/files/place_en_fr_page_clean.csv) a page-of-interest file that contains places appearing in both the French and English Wikipedia.
8. Then, using an index that contains the pages of interest, run the command: `python3 script/get_cooccurrence.py <page_of_interest_file> <2ndoutputname> -c <1stoutputname>.json.gz`
-9. Finally, split the resulting dataset with the script `train_test_split_cooccurrence_data.py <2ndoutputname>`

+### Generate dataset
+
+Use the following command to generate the datasets for training your model:
+
+    python3 generate_dataset.py <geonames_dataset> <wikipedia_dataset> <geonames_hierarchy_data>
+
+| Parameter       | Description                                                                                                                                        |
+|-----------------|----------------------------------------------------------------------------------------------------------------------------------------------------|
+| --cooc-sampling | Number of co-occurrences sampled for each place in the co-occurrence dataset                                                                         |
+| --adj-sampling  | Number of adjacency relations extracted for each place in a Healpix cell                                                                             |
+| --adj-nside     | Healpix resolution within which places are considered adjacent                                                                                       |
+| --split-nside   | Size of the zones in which the train/test split is done                                                                                              |
+| --split-method  | [per_pair\|per_entity] Split each dataset based on places (a place cannot appear in both train and test) or on pairs (a place can appear in both)    |

### If you're in a hurry

French Geonames data, French Wikipedia co-occurrence data, and their train/test splits can be found here: [https://projet.liris.cnrs.fr/hextgeo/files/](https://projet.liris.cnrs.fr/hextgeo/files/)

-<hr>
-
-## Train your own model
+## Our model
+
+To train the first model, use the following command:
+
+    python3 train_geocoder_v2.py <dataset_name> <inclusion_dataset> <adjacent_dataset> <cooccurrence_dataset> [-i | -a | -w ]+ [optional args]

-### First model
-Like every proposed model, this model is neural network. The first model is illustrated in the Figure 1. In a nutshell
-<div style="text-align:center">
-<img src="documentation/imgs/LSTM_archv2.png"/>
-<p><strong>Figure 1</strong> : General workflow</p>
-</div>
-
-    python3 train_geocoder.py

+| Parameter            | Description                                                        |
+|----------------------|--------------------------------------------------------------------|
+| -i,--inclusion       | Use inclusion relationships to train the network                   |
+| -a,--adjacency       | Use adjacency relationships to train the network                   |
+| -w,--wikipedia-coo   | Use Wikipedia place co-occurrences to train the network            |
+| -n,--ngram-size      | n-gram size                                                        |
+| -t,--tolerance-value | K value used in the computation of accuracy@K (K is in kilometers) |
+| -e,--epochs          | Number of epochs                                                   |
+| -d,--dimension       | Size of the n-gram embeddings                                      |
+| --admin_code_1       | (Optional) Train the network on a specific region only             |
+
+<hr>
+
+# [In Progress] BERT model
+
+In recent years, the BERT architecture proposed by Google researchers has outperformed state-of-the-art methods on various NLP tasks (POS tagging, NER, classification). To verify whether BERT embeddings can increase the performance of our approach, we wrote a script that uses BERT with our data. Our previous model returned two values, each in [0, 1]. With BERT, the task shifts to classification (softmax), where each class corresponds to a cell on the globe. We use the Healpix hierarchical projection model; other projection models, such as S2 Geometry (https://s2geometry.io/about/overview), could be considered.
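+
+For illustration, a minimal sketch (not part of the repository; the `nside` value is an assumption for the example, not necessarily the resolution our scripts use) of how a WGS84 coordinate maps to the Healpix cell index that serves as a class label:
+
+```python
+import healpy as hp  # pip3 install healpy
+
+nside = 32                  # Healpix resolution: the globe is divided into 12 * nside**2 cells
+lon, lat = 2.3522, 48.8566  # a WGS84 coordinate (here, Paris)
+
+# ang2pix returns the index of the cell containing the point, i.e. the class label
+print(hp.ang2pix(nside, lon, lat, lonlat=True))
+```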
-### Second model
-This model is the same as the first one, except that the output is a concatenation of the latitude and longitude outputs.
-
-    python3 train_geocoder_v2.py
-
-### BERT model
-Recently, a popular model called BERT has show great promises on various NLP tasks.
-This last model uses BERT pretrained model. More precisely, we use BERT in a classifier where classes corresponds to Healpix cells.
-
-    python3 train_geocoder.py
+To train this model, run the `train_bert_geocoder.py` script:
+
+    python3 train_bert_geocoder.py \
+        <train_dataset> \
+        <test_dataset> \
+        <output_dir> \
+        [--batch_size BATCH_SIZE | --epochs EPOCHS]
+
+The train and test datasets are tabular data composed of two columns: sentence and label.
+
+### Pre-trained model
+
+A pre-trained model can be found [here](https://projet.liris.cnrs.fr/hextgeo/files/trained_models/BERT_MODELS/).
+
+### Use BERT model
+
+```python
+from lib.geocoder.bert_geocoder import BertGeocoder
+geocoder = BertGeocoder(<bert_model_dir>, <label_healpix_file>)
+geocoder.geocode(<toponyms>, <context_toponyms>)
+```
+
+<hr>
+
-### Train the network with different parameters
+# Train multiple models with different parameters

We built a tiny module that runs the network training with different combinations of parameters. To do that, use the `GridSearchModel` class in `lib.run`. You can find an example in the following code:

```python
from lib.run import GridSearchModel
from collections import OrderedDict

grid = GridSearchModel(\
    "python3 train_geocoder_v2.py",\
    **OrderedDict({
    ...
    }.items()))
grid.run()
```

-### Available parameters
-
-| Parameter             | Description                                                                      |
-|-----------------------|----------------------------------------------------------------------------------|
-| -i,--inclusion        | Use inclusion relationships to train the network                                 |
-| -a,--adjacency        | Use adjacency relationships to train the network                                 |
-| -w,--wikipedia-coo    | Use Wikipedia place co-occurrences to train the network                          |
-| --wikipedia-cooc-fn   | File that contains the coooccurrence data                                        |
-| --cooc-sample-size-   | Number of cooccurence relation selected for each location in cooccurrences data  |
-| --adjacency-iteration | Number of iteration in the adjacency extraction process                          |
-| -n,--ngram-size       | ngram size x                                                                     |
-| -t,--tolerance-value  | K-value in the computation of the accuracy@k                                     |
-| -e,--epochs           | number of epochs                                                                 |
-| -d,--dimension        | size of the ngram embeddings                                                     |
-| --admin_code_1        | (Optional) If you wish to train the network on a specific region                 |
-
-# New model based on BERT embeddings
-
-In the recent years, BERT architecture proposed by Google researches enables to outperform state-of-art methods for differents tasks in NLP (POS, NER, Classification). To verify if BERT embeddings would permit to increase the performance of our approach, we code a script to use bert with our data. In our previous model, the model returned two values each on between [0,1]. Using Bert, the task has shifted to classification (softmax) where each class correspond to a cell on the glob. We use the hierarchical projection model : Healpix. Other projections model like S2geometry can be considered : https://s2geometry.io/about/overview.
-In order, to run this model training, run the `bert.py` script :
-
-    python3 bert.py <train_dataset> <test_dataset>
-
-The train and test dataset are table data composed of two columns: sentence and label.
+# Authors and Acknowledgment
+
+Proposed by **Jacques Fize**, **Ludovic Moncla** and **Bruno Martins**.
+
+This research is supported by an IDEXLYON project of the University of Lyon within the framework of the Investments for the Future Program (ANR-16-IDEX-0005). Bruno Martins was supported by the Fundação para a Ciência e a Tecnologia (FCT), through the project grants PTDC/CCI-CIF/32607/2017 (CMIMU) and UIDB/50021/2020 (INESC-ID multi-annual funding).
\ No newline at end of file
diff --git a/lib/geocoder/heuristics.py b/lib/geocoder/heuristics.py
new file mode 100644
index 0000000..e69de29
diff --git a/lib/geocoder/our_geocoder.py b/lib/geocoder/our_geocoder.py
index 0cd8520..cba4d0d 100644
--- a/lib/geocoder/our_geocoder.py
+++ b/lib/geocoder/our_geocoder.py
@@ -46,7 +46,7 @@ class Geocoder(object):
         p = np.array(p)
         c = np.array(c)
         coord = self.keras_model.predict([[p],[c]])
-        return coord[0][0],coord[0][1]
+        return self.wgs_coord(coord[0][0],coord[0][1])

     def get_coords(self,list_toponym,list_toponym_context):
         p = [self.ngram_encoder.complete(self.ngram_encoder.encode(toponym),self.ngram_encoder.max_len) for toponym in list_toponym]
@@ -56,7 +56,7 @@
         c = np.array(c)

         coords = self.keras_model.predict([p,c])
-        return coords[:,0],coords[:,1] #lon lat
+        return self.wgs_coord(coords[:,0],coords[:,1]) #lon lat

     def wgs_coord(self,lon,lat):
         return ((lon*360)-180),((lat*180)-90)
--
GitLab