From 8e0560b4653883bbcbdadcdd3c330b4b6a869d39 Mon Sep 17 00:00:00 2001
From: jfize <jacques.fize@insa-lyon.fr>
Date: Wed, 5 Feb 2020 12:43:58 +0100
Subject: [PATCH] DEBUG + CODE CLEANING + ADD COOCCURRENCE INTEGRATION

---
 README.md                                     |  84 +++++--
 combination_embeddings.py                     | 234 +++++++++---------
 embeddings_lat_lon_type.py                    |  40 +--
 extractDataFromWikidata.py                    |  11 +-
 extractLearningDataset.py                     |   2 -
 geonames_embedding.py                         |   7 -
 helpers.py                                    |   8 +-
 models.py                                     |   7 +-
 parser_config/embeddings_lat_lon.json         |  12 +
 .../toponym_combination_embedding.json        |   2 +-
 train_test_split_cooccurrence_data.py         |  85 +++++++
 train_test_split_geonames.py                  |   9 +
 utils.py                                      |  56 ++++-
 13 files changed, 354 insertions(+), 203 deletions(-)
 create mode 100644 parser_config/embeddings_lat_lon.json
 create mode 100644 train_test_split_cooccurrence_data.py

diff --git a/README.md b/README.md
index c9fc710..5ef0cb2 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,21 @@
-# Requirements
+# Work on Place-embedding
 
- - Python3.6+
- - Os free (all dependencies work on Windows !)
+This repo contains various approaches to geographic place embedding, and more precisely to its use for geocoding. At the moment, we have designed three approaches:
 
-It is strongly advised to used Anaconda in a windows environnement !
+ * Use the Wikipedia pages of geographic places to learn an embedding for toponyms
+ * Use the Geonames place topology to produce an embedding using graph-embedding techniques
+ * Use toponym co-location (combination?) based on spatial relationships (inclusion, adjacency) for geocoding
 
-## Install dependencies
+<hr>
+
+## Setup environment
+
+- Python 3.6+
+- OS-independent (all dependencies work on Windows!)
+
+It is strongly advised to use Anaconda in a Windows environment!
+
+### Install dependencies
 
     pip3 install -r requirements.txt
 
@@ -13,16 +23,30 @@ For Anaconda users
 
     while read requirement; do conda install --yes $requirement; done < requirements.txt
 
-# Different approaches execution
+<hr>
+
+## First approach : Embedding using places Wikipedia pages
+
+<div style="text-align:center">
+<img src="documentation/imgs/first_approach.png"/>
+<p>Figure 1 : First approach general workflow</p>
+</div>
+
+In this first approach, the goal is to produce embeddings for place names. To do this, we designed a neural network that takes:
 
-## Embedding using places Wikipedia pages
+* **Input:** a text sequence (phrase)
+* **Output:** latitude, longitude, and the place type
 
- 
+Input texts are selected using Wikidata to filter Wikipedia pages about geographic places. The filtered pages are then retrieved from the Wikipedia corpus file. For each page, we keep:
 
+* Title
+* Introduction text
+* Coordinates of the place (latitude-longitude)
+* Place type (using a mapping between Wikidata and DBpedia Place subclasses)
 
 ### Step 1: Parse Wikipedia data !
 
-First, download the Wikipedia corpus in the wanted language, *e.g. enwiki-latest-pages-articles.xml.bz2*
+First, download the Wikipedia corpus in the wanted language, *e.g. enwiki-latest-pages-articles.xml.bz2*. Then, use the `gensim` parser (doc [here](https://radimrehurek.com/gensim/scripts/segment_wiki.html)).
 Use the following command :
 
@@ -42,7 +66,7 @@ Using previous output, we extract text data from selected Wikipedia pages with t
 
 ### Step 4 : Run Embedding extraction
 
-To learn extract the place embedding, use the `4_embeddings_lat_lon_type.py`
+To learn the place embedding, use the `embeddings_lat_lon_type.py` script
 
 #### Available Parameters
 
@@ -63,37 +87,57 @@ The different outputs (on for each neural network architecture) are put in the `
 * outputs/Bi-GRU_100dim_20epoch_1000batch.csv : **training history**
 * outputs/Bi-GRU_100dim_20epoch_1000batch.txt : **embeddings**
 
+<hr>
 
-## Geonames place embedding
+## 2nd Approach: Geonames place embedding
 
- 
+From this point, we change our vantage point and focus on heavily spatial/geographical data, in this case a gazetteer. In this second approach, we propose to generate an embedding for places (not place toponyms) based on their topology.
 
-First, download the Geonames dump here : https://download.geonames.org/export/dump/
+In order to do that, we use Geonames data to build a topology graph. This graph is generated from the intersections found between place buffers.
 
-*N.B.* We advise you to take only the data from one country ! (Adjacency graph need a lot of RAM).
+(image here)
+
+Then, using this topology network, we apply node-embedding techniques to generate an embedding for each vertex (place).
+
+<div style="text-align:center">
+<img src="documentation/imgs/second_approach.png"/>
+<p><strong>Figure 2</strong> : Second approach general workflow</p>
+</div>
+
+### Generate the embedding
+
+First, download the Geonames dump: [here](https://download.geonames.org/export/dump/)
+
+*N.B.* We advise you to take only the data from one country! The topology network can get really dense and large!
 
     python3 geonames_embedding.py <geonames dump(*.txt)>
 
 ### Available Parameters
 
-| Parameter              | Value (default)                                                    |
+| Parameter              | Description (default)                                              |
 |------------------------|--------------------------------------------------------------------|
-| --nbcpu                | Cpu used for the embedding learning phase                          |
-| --vector-size          | embedding size                                                     |
-| --walk-length          | Generated Walk length                                              |
+| --nbcpu                | Number of CPUs used during the learning phase                      |
+| --vector-size          | Embedding size                                                     |
+| --walk-length          | Generated walk length                                              |
 | --num-walks            | Number of walks for each vertex (place)                            |
 | --word2vec-window-size | Window-size used in Word2vec                                       |
 | --buffer-size          | Buffer size used to detect adjacency relationships between places  |
 | -d                     | Integrate distances between places in the topology graph           |
 | --dist                 | Distance used if '-d'                                              |
 
-### Output
+### Output files
 
 Gensim word2vec format is saved in the execution directory. 
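+
+As a quick sanity check, the resulting embeddings can be loaded back with `gensim`. This is a minimal sketch: the exact output filename depends on the run parameters, so `<embedding file>` below is a placeholder.
+
+    # Hypothetical file name: point this at the word2vec-format file written by geonames_embedding.py
+    from gensim.models import KeyedVectors
+
+    wv = KeyedVectors.load_word2vec_format("<embedding file>")  # pass binary=True if the file is binary
+    print(wv["<geonameid>"])  # vector of one place; keys are assumed to be Geonames ids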
+<hr> + ## Embedding : train using concatenation of close places - +<div style="text-align:center"> +<img src="documentation/imgs/third_approach.png"/> +<p><strong>Figure 3</strong> : Third approach general workflow</p> +</div> + ### Prepare required data diff --git a/combination_embeddings.py b/combination_embeddings.py index 94ee452..d22359e 100644 --- a/combination_embeddings.py +++ b/combination_embeddings.py @@ -1,8 +1,6 @@ # Base module import re import os -import sys -from argparse import ArgumentParser import json #Â Structure @@ -11,15 +9,8 @@ import numpy as np import geopandas as gpd #Â DEEPL module -from keras.preprocessing.text import Tokenizer -from keras.preprocessing.sequence import pad_sequences -from keras.utils import to_categorical -from keras.layers import Dense, Input, GlobalMaxPooling1D -from keras.layers import Conv1D, MaxPooling1D, Embedding -from keras.layers import Add,concatenate,Dropout +from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM from keras.models import Model -from keras.initializers import Constant -from keras.layers import GlobalAveragePooling1D,Bidirectional,LSTM,Average, Flatten, Conv1D, Conv2D from keras import backend as K import tensorflow as tf @@ -31,22 +22,44 @@ from helpers import read_geonames from utils import Grid from utils import zero_one_encoding, NgramIndex,ConfigurationReader - -#Â Visualisation module -import matplotlib.pyplot as plt -from tqdm import tqdm as tqdm_base - -def tqdm(*args, **kwargs): - if hasattr(tqdm_base, '_instances'): - for instance in list(tqdm_base._instances): - tqdm_base._decr_instances(instance) - return tqdm_base(*args, **kwargs) +# Logging +from tqdm import tqdm +import logging +from helpers import Chronometer def parse_title_wiki(title_wiki): + """ + Parse Wikipedia title + + Parameters + ---------- + title_wiki : str + wikipedia title + + Returns + ------- + str + parsed wikipedia title + """ return re.sub("\(.*\)","",title_wiki).strip().lower() def get_new_ids(cooc_data,id_first_value): + """ + Return new ids from cooccurrence data + + Parameters + ---------- + cooc_data : pd.DataFrame + cooccurrence da + id_first_value : int + id beginning value + + Returns + ------- + dict + new ids for each toponyms + """ topo_id = {} id_ = id_first_value for title in cooc_data.title.values: @@ -60,9 +73,23 @@ def get_new_ids(cooc_data,id_first_value): topo_id[id_]=interlink return topo_id -# Logging -import logging -from helpers import Chronometer +def accuracy_at_k(y_true, y_pred): + """ + Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible + task for neural network to obtain the exact coordinate. 
+ + Parameters + ---------- + y_true : tf.Tensor + truth data + y_pred : tf.Tensor + predicted output + """ + diff = tf.abs(y_true - y_pred) + fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE)) + return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred) + +# LOGGING CONF logging.basicConfig( format='[%(asctime)s][%(levelname)s] %(message)s ', datefmt='%m/%d/%Y %I:%M:%S %p', @@ -70,8 +97,9 @@ logging.basicConfig( ) chrono = Chronometer() -args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()#("--admin_code_1 94 -n 2 -t 0.002 -e 100 -m LSTM -a -i data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) +args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args("-n 4 -t 0.002 -e 100 -m LSTM -a data/geonamesData/FR.txt data/geonamesData/hierarchy.txt".split()) +# Initialisee CONSTANTS GEONAME_FN = args.geoname_input GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input NGRAM_SIZE = args.ngram_size @@ -79,18 +107,11 @@ ACCURACY_TOLERANCE = args.tolerance_value EPOCHS = args.epochs ITER_ADJACENCY = args.adjacency_iteration -CONV, LSTM_train = False,False -if args.model == "CNN": - CONV = True -else: - LSTM_train = True - # check for output dir if not os.path.exists("outputs/"): os.makedirs("outputs/") - -# LOAD DATA +# LOAD Geonames DATA logging.info("Load Geonames data...") geoname_data = read_geonames(GEONAME_FN).fillna("") hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("") @@ -103,18 +124,20 @@ logging.info("Geonames data loaded!") # SELECT ENTRY with class == to A and P (Areas and Populated Places) filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places +# IF REGION (ONLY FR for now !) admin_id_authorised_auth = "1 2 3 4 5 6 11 24 27 28 32 44 52 53 75 76 84 93 94".split() region_fn = "" if args.admin_code_1 == None else "_"+args.admin_code_1 if args.admin_code_1 != None and args.admin_code_1 in admin_id_authorised_auth: filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() +# REDUCE DATA STORED +filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD + # Geometry operation filtered["geometry"] = filtered["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) filtered = gpd.GeoDataFrame(filtered) filtered["i"]=1 bounds = filtered.dissolve("i").bounds.values[0] # Required to get adjacency relationships -geoname2name = dict(filtered["geonameid name".split()].values) - rel_store = [] @@ -137,38 +160,64 @@ if args.adjacency: if args.inclusion: # RETRIEVE INCLUSION RELATIONSHIPS logging.info("Retrieve inclusion relationships ! ") - filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name)) + geonamesIDS = set(filtered.geonameid.values) + filter_mask = (hierarchy_data.childId.isin(geonamesIDS) & hierarchy_data.parentId.isin(geonamesIDS)) rel_store.extend((hierarchy_data[filter_mask]["childId parentId".split()].values.tolist())) logging.info("{0} inclusion relationships retrieved ! 
".format(len(hierarchy_data[filter_mask]))) +del filtered["geometry"] if args.wikipedia_cooc: - cooc_data = pd.read_csv("./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1],sep="\t") + logging.info("Load Wikipedia Cooccurrence data and merge with geonames") + COOC_FN = "./data/wikipedia/cooccurrence_"+GEONAME_FN.split("/")[-1] + cooc_data = pd.read_csv(COOC_FN,sep="\t") cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) id_wikipediatitle = get_new_ids(cooc_data,geoname_data.geonameid.max()) wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} title_coord = {row.title: (row.longitude,row.latitude) for _,row in cooc_data.iterrows()} + cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) + filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) + + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") + train_indices.union(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)])) + test_indices.union(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)])) + + logging.info("Merged with Geonames data !") + + # EXTRACT rel + logging.info("Extracting cooccurrence relationships") + cpt=0 + for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): + for inter in row.interlinks.split("|"): + cpt+=1 + rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) + logging.info("Extract {0} cooccurrence relationships !".format(cpt)) + + +# STORE ID to name +geoname2name = dict(filtered["geonameid name".split()].values) # ENCODING NAME USING N-GRAM SPLITTING logging.info("Encoding toponyms to ngram...") index = NgramIndex(NGRAM_SIZE) filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all ngram available +if args.wikipedia_cooc: + [index.split_and_add(k) for k in wikipediatitle_id] filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding max_len = filtered.encode_name.apply(len).max() #Â Retrieve the encodings max length - if args.wikipedia_cooc: - [index.split_and_add(x) for x in id_wikipediatitle.values()] - idwiki_encoded = {id_: index.encode(toponym) for id_,toponym in id_wikipediatitle.items()} - max_len = max(max_len,max([len(enc) for _,enc in idwiki_encoded.items()])) + extension = {v:index.encode(k) for k,v in wikipediatitle_id.items()} index.max_len = int(max_len) #Â For Index state dump filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Expend encodings with size < max_len +if args.wikipedia_cooc: + extension = {k:index.complete(v,max_len) for k,v in extension.items()} geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) #init a dict with the 'geonameid' --> 'encoded toponym' association if args.wikipedia_cooc: - idwiki_encoded = {id_: index.complete(enc,max_len) for id_,enc in idwiki_encoded.items()} + geoname2encodedname.update(extension) index.save("outputs/index_{0}gram_{1}".format(NGRAM_SIZE,GEONAME_FN.split("/")[-1])) logging.info("Done !") @@ -183,9 +232,6 @@ filtered["cell_vec"]=filtered.apply( axis=1 ) geoname_vec = dict(filtered["geonameid cell_vec".split()].values) -if args.wikipedia_cooc: - wikipediaid_vec = {wikipediatitle_id[title]: zero_one_encoding(*title_coord[title]) for title in cooc_data.title.values} - # CLEAR RAM del 
filtered @@ -198,8 +244,12 @@ logging.info("Preparing Input and Output data...") X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] +cpt=0 for couple in rel_store: geonameId_1,geonameId_2 = couple[0],couple[1] + if not geonameId_1 in geoname2encodedname: + cpt+=1 + continue top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] if geonameId_1 in train_indices: #and geonameId_2 in train_indices: @@ -229,93 +279,53 @@ y_lon_test = np.array(y_lon_test) logging.info("Data prepared !") -def accuracy_at_k(y_true, y_pred): - """ - Metrics use to measure the accuracy of the coordinate prediction. But in comparison to the normal accuracy metrics, we add a tolerance threshold due to the (quasi) impossible - task for neural network to obtain the exact coordinate. - - Parameters - ---------- - y_true : tf.Tensor - truth data - y_pred : tf.Tensor - predicted output - """ - diff = tf.abs(y_true - y_pred) - fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE)) - return K.size(fit[:,0])/K.size(y_pred),K.size(fit[:,1])/K.size(y_pred) +# OUTPUT FN BASE name = "{0}_{1}_{2}_{3}{4}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE,region_fn) if args.adjacency: name+="_A" if args.inclusion: name+="_I" +# NGRAM EMBDEDDING logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=50) +embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim,iter=5) logging.info("Embedding generated !") -if LSTM_train: - name = "LSTM_"+ name - input_1 = Input(shape=(max_len,)) - input_2 = Input(shape=(max_len,)) - - embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True) - - x1 = Bidirectional(LSTM(98))(embedding_layer(input_1)) - x2 = Bidirectional(LSTM(98))(embedding_layer(input_2)) - - x = concatenate([x1,x2])#,x3]) - - x1 = Dense(500,activation="relu")(x) - #x1 = Dropout(0.3)(x1) - x1 = Dense(500,activation="relu")(x1) - #x1 = Dropout(0.3)(x1) - - x2 = Dense(500,activation="relu")(x) - #x2 = Dropout(0.3)(x2) - x2 = Dense(500,activation="relu")(x2) - #x2 = Dropout(0.3)(x2) - - output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) - output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) - - model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 +# DEEP MODEL +name = "LSTM_"+ name +input_1 = Input(shape=(max_len,)) +input_2 = Input(shape=(max_len,)) - model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k]) - history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) +embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)#, trainable=True) -if CONV : - name = "CONV_"+ name - input_1 = Input(shape=(max_len,)) - input_2 = Input(shape=(max_len,)) +x1 = Bidirectional(LSTM(98))(embedding_layer(input_1)) +x2 = Bidirectional(LSTM(98))(embedding_layer(input_2)) - embedding_layer = Embedding(num_words, embedding_dim,input_length=max_len,weights=[embedding_weights],trainable=False)# weights=[embedding_weights],trainable=False) +x = concatenate([x1,x2])#,x3]) - x1 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_1)) - x1 = 
Dropout(0.5)(x1) - x1 = MaxPooling1D(pool_size=2)(x1) - x1 = Flatten()(x1) +x1 = Dense(500,activation="relu")(x) +#x1 = Dropout(0.3)(x1) +x1 = Dense(500,activation="relu")(x1) +#x1 = Dropout(0.3)(x1) - x2 = Conv1D(filters=32, kernel_size=10, activation='relu')(embedding_layer(input_2)) - x2 = Dropout(0.5)(x2) - x2 = MaxPooling1D(pool_size=2)(x2) - x2 = Flatten()(x2) +x2 = Dense(500,activation="relu")(x) +#x2 = Dropout(0.3)(x2) +x2 = Dense(500,activation="relu")(x2) +#x2 = Dropout(0.3)(x2) - x = concatenate([x1,x2]) +output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) +output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) - x = Dense(500,activation="relu")(x) - x = Dropout(0.3)(x) - x = Dense(500,activation="relu")(x) - x = Dropout(0.3)(x) +model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 - output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x) - output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x) +model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k]) +history = model.fit(x=[X_1_train,X_2_train], + y=[y_lon_train,y_lat_train], + verbose=True, batch_size=100, + epochs=EPOCHS, + validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) - model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 - model.summary() - model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k]) - history = model.fit(x=[X_1_train,X_2_train], y=[y_lon_train,y_lat_train], verbose=True, batch_size=100, epochs=EPOCHS,validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test])) hist_df = pd.DataFrame(history.history) hist_df.to_csv("outputs/{0}.csv".format(name)) diff --git a/embeddings_lat_lon_type.py b/embeddings_lat_lon_type.py index d5778f6..39461b7 100644 --- a/embeddings_lat_lon_type.py +++ b/embeddings_lat_lon_type.py @@ -1,41 +1,21 @@ #Â Basic module -import time -import random import json -import os -import sys import argparse # Data module import numpy as np import pandas as pd -from joblib import Parallel,delayed - -# Keras basic -import keras -from keras import backend as K -from keras.initializers import Constant - # preprocessing from sklearn import preprocessing -from keras.preprocessing import sequence from keras.preprocessing.text import Tokenizer -from keras.preprocessing.sequence import pad_sequences -from keras.utils import to_categorical -from keras.preprocessing.text import text_to_word_sequence - -# Neural Network Model and layers class -from keras.layers import Dense, Input, GlobalAveragePooling1D, Embedding, LSTM, Bidirectional, Conv1D, GRU -from keras.models import Model #Â Neural network model and visualisation function from models import getModel,BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model from helpers import plot_accuracy_from_history, save_embedding #Â Utils -from utils import CoordinatesEncoder,TokenizerCustom,_split - +from utils import CoordinatesEncoder,_split, ConfigurationReader # Logging import logging @@ -51,21 +31,8 @@ chrono = Chronometer() import matplotlib.pyplot as plt from tqdm import tqdm -parser = argparse.ArgumentParser() - -parser.add_argument("input") -parser.add_argument("--glove_dir",default="data/glove") - -parser.add_argument("--max_sequence_length",type=int, default=15) -parser.add_argument("--max_num_words",type=int, default=400000) - -parser.add_argument("--embedding_dimension",type=int, default=100) - 
-parser.add_argument("--batch_size",type=int, default=100) -parser.add_argument("--epochs",type=int, default=100) - -parser.add_argument("-v",action="store_true",help="Display Keras training verbose") - +parser = ConfigurationReader(configuration_file="parser_config/embeddings_lat_lon.json") +args = parser.parse_args() def clean(x): return x.lower().replace("\n","").replace("\'\'\'","").replace("\'\'","") @@ -141,7 +108,6 @@ logging.info("The vocabulary contains {0} words".format(len(list(vocab_)))) logging.info("Initialize Tokenizer/ClassEncoder/CoordinateEncoder...") # Tokenizer -#tokenizer = TokenizerCustom(list(vocab_)) max_key_tokenizer = np.max(list(tokenizer.index_word.keys())) num_words = min(MAX_NUM_WORDS, len(tokenizer.word_index)) + 1 # Coordinate Encoder diff --git a/extractDataFromWikidata.py b/extractDataFromWikidata.py index 3048c2c..0f9e237 100644 --- a/extractDataFromWikidata.py +++ b/extractDataFromWikidata.py @@ -1,19 +1,12 @@ import json import gzip import argparse -import re - -import pandas as pd from joblib import Parallel, delayed # To avoid progressbar issue -from tqdm import tqdm as tqdm_base -def tqdm(*args, **kwargs): - if hasattr(tqdm_base, '_instances'): - for instance in list(tqdm_base._instances): - tqdm_base._decr_instances(instance) - return tqdm_base(*args, **kwargs) +from tqdm import tqdm + parser = argparse.ArgumentParser() diff --git a/extractLearningDataset.py b/extractLearningDataset.py index 725d82a..e2bc717 100644 --- a/extractLearningDataset.py +++ b/extractLearningDataset.py @@ -1,7 +1,5 @@ import gzip import json -import re - import argparse import pandas as pd diff --git a/geonames_embedding.py b/geonames_embedding.py index 8906e95..2185349 100644 --- a/geonames_embedding.py +++ b/geonames_embedding.py @@ -6,7 +6,6 @@ from multiprocessing import cpu_count from argparse import RawTextHelpFormatter # COMMON DATA STRUCTURE MODULE -import pandas as pd import numpy as np import networkx as nx @@ -16,16 +15,10 @@ import osrm osrm.RequestConfig.host = "jacquesfize.com:5000" from shapely.geometry import Point -# DISTANCE MODULE -from scipy.spatial.distance import cosine -from scipy.stats.stats import pearsonr - # Machine Learning MODULE from node2vec import Node2Vec -import gensim # VISUALISATION MODULE -import matplotlib.pyplot as plt from tqdm import tqdm tqdm.pandas() diff --git a/helpers.py b/helpers.py index 554e620..825dd4a 100644 --- a/helpers.py +++ b/helpers.py @@ -1,6 +1,9 @@ -import pandas as pd -import matplotlib.pyplot as plt import os +import time + +import pandas as pd + +import matplotlib.pyplot as plt def read_geonames(file): """ @@ -88,7 +91,6 @@ def save_embedding(model,tokenizer,layer_idx,fn): f.write('\n') -import time class Chronometer(): def __init__(self): diff --git a/models.py b/models.py index fa259fb..7c3370a 100644 --- a/models.py +++ b/models.py @@ -1,10 +1,5 @@ - from keras import Model -from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding, GRU, GlobalAveragePooling1D, Dropout - -# name,model_2=MPC_model(MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder) -# model_2.fit(x=new_X,y=[Y_type,Y_lat,Y_lon],validation_split=0.33,epochs=EPOCHS,batch_size=BATCH_SIZE,verbose=1) - +from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding, GRU, GlobalAveragePooling1D def getModel(model_func,max_sequence_length,embedding_dim,num_words,class_encoder,coordinate_encoder): sequence_input = Input(shape=(max_sequence_length,), dtype='int32') diff --git a/parser_config/embeddings_lat_lon.json 
b/parser_config/embeddings_lat_lon.json new file mode 100644 index 0000000..1a0c774 --- /dev/null +++ b/parser_config/embeddings_lat_lon.json @@ -0,0 +1,12 @@ +{ + "description": "Toponym Combination", + "args": [ + { "short": "input", "help": "Corpus used to learn the embeddings" }, + { "short": "-g", "long": "--glove__dir", "default": "data/glove" }, + {"long": "--max_sequence_length", "type":"int","default":15}, + {"long": "--max_num_words", "type":"int","default":400000}, + {"long": "--embedding_dimension", "type":"int","default":100}, + {"long": "--batch_size", "type":"int","default":100}, + { "short": "-e", "long": "--epochs", "type": "int", "default": 100 } + ] +} \ No newline at end of file diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json index 9f3fe94..93662e1 100644 --- a/parser_config/toponym_combination_embedding.json +++ b/parser_config/toponym_combination_embedding.json @@ -7,7 +7,7 @@ { "short": "-i", "long": "--inclusion", "action": "store_true" }, { "short": "-a", "long": "--adjacency", "action": "store_true" }, { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, - {"long": "--adjacency-iteration", "type":"int","default":50}, + {"long": "--adjacency-iteration", "type":"int","default":5}, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py new file mode 100644 index 0000000..4748f3e --- /dev/null +++ b/train_test_split_cooccurrence_data.py @@ -0,0 +1,85 @@ +import argparse + +import pandas as pd +import geopandas as gpd + +import logging +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +from sklearn.model_selection import train_test_split +from shapely.geometry import Point + +from utils import Grid + +from tqdm import tqdm + +parser = argparse.ArgumentParser() +parser.add_argument("cooccurrence_file") + +args = parser.parse_args("data/wikipedia/cooccurrence_FR.txt".split())#("data/geonamesData/FR.txt".split()) + +# LOAD DATAgeopandas +COOC_FN = args.cooccurrence_file + + + +logging.info("Load Cooc DATA data...") +cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("") +cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) +cooc_data = gpd.GeoDataFrame(cooc_data) +logging.info("Cooc data loaded!") + +#Â World Shape bounds +world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) +world["nn"] = 1 +dissolved = world.dissolve(by="nn").iloc[0].geometry + +#Creating Grid +logging.info("Initializing Grid (360,180)...") +g = Grid(*dissolved.bounds,[360,180]) +logging.info("Fit Data to the Grid...") +g.fit_data(cooc_data) +logging.info("Placing place into the grid...") +[g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))] + +#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME +logging.info("Associate a cell number to each place in the Geoname Dataframe") +def foo(g,id_): + for ix,cell in enumerate(g.cells): + if id_ in cell.list_object: + return ix + +cooc_data["cat"] = cooc_data.title.apply(lambda x:foo(g,x)) + +# TRAIN AND TEST SPLIT +logging.info("Split Between Train and Test") + +# Cell can be empty +i=0 +while 1: + if 
len(cooc_data[cooc_data.cat == i])> 1: + X_train,X_test = train_test_split(cooc_data[cooc_data.cat == i]) + break + i+=1 + +for i in range(i+1,len(g.cells)): + try: + x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i]) + X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) + except Exception as e: + print(e) #print("Error",len(filtered[filtered.cat == i])) + +del X_train["geometry"] +del X_train["nn"] +del X_train["cat"] +del X_test["cat"] +del X_test["geometry"] +del X_test["nn"] +# SAVING THE DATA +logging.info("Saving Output !") +X_train.to_csv(COOC_FN+"_train.csv") +X_test.to_csv(COOC_FN+"_test.csv") \ No newline at end of file diff --git a/train_test_split_geonames.py b/train_test_split_geonames.py index d8f8962..ff87967 100644 --- a/train_test_split_geonames.py +++ b/train_test_split_geonames.py @@ -32,6 +32,8 @@ FEATURE_CLASSES = args.feature_classes logging.info("Load Geonames data...") geoname_data = read_geonames(GEONAME_FN).fillna("") +geoname_data["geometry"] = geoname_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) +geoname_data = gpd.GeoDataFrame(geoname_data) logging.info("Geonames data loaded!") # SELECT ENTRY with class == to A and P (Areas and Populated Places) @@ -77,6 +79,13 @@ for i in range(i+1,len(g.cells)): except: pass #print("Error",len(filtered[filtered.cat == i])) + +del X_train["geometry"] +del X_train["nn"] +del X_train["cat"] +del X_test["cat"] +del X_test["geometry"] +del X_test["nn"] # SAVING THE DATA logging.info("Saving Output !") X_train.to_csv(GEONAME_FN+"_train.csv") diff --git a/utils.py b/utils.py index 6e052db..bd767ed 100644 --- a/utils.py +++ b/utils.py @@ -11,17 +11,13 @@ from shapely.geometry import Point,box # NLP from nltk.tokenize import word_tokenize -import textwrap from ngram import NGram # Machine learning -from keras.layers import Embedding from gensim.models import Word2Vec #Â Visualisation and parallelisation from tqdm import tqdm -from joblib import Parallel,delayed - class TokenizerCustom(): @@ -36,6 +32,54 @@ class TokenizerCustom(): return seqs +class CoordinatesEncoder: + """ + Will be replaced by Grid in grid2.py + """ + def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5): + self.min_lon = -180 + self.max_lon = -(self.min_lon) # Â Symetric + self.min_lat = -90 + self.max_lat = -(self.min_lat) # Symetric + + self.ecart_lat = self.max_lat - self.min_lat + self.ecart_lon = self.max_lon - self.min_lon + + self.cell_size_lat = cell_size_lat + self.cell_size_lon = cell_size_lon + + self.unit_size_lat = self.ecart_lat / self.cell_size_lat + self.unit_size_lon = self.ecart_lon / self.cell_size_lon + + def encode(self, lat, lon): + return ( + math.floor(((lat + self.max_lat) / self.ecart_lat) * self.unit_size_lat), + math.floor(((lon + self.max_lon) / self.ecart_lon) * (self.unit_size_lon)) + ) + + def number_lat_cell(self): + return int(self.unit_size_lat) + + def number_lon_cell(self): + return int(self.unit_size_lon) + + def oneDimensionOutputSize(self): + return self.number_lat_cell() * self.number_lon_cell() + + def vector(self, lat, lon): + lat_v, lon_v = np.zeros(self.number_lat_cell()), np.zeros(self.number_lon_cell()) + new_coords = self.encode(lat, lon) + lat_v[int(new_coords[0])] = 1 + lon_v[int(new_coords[1])] = 1 + return lat_v, lon_v + + def vector_flatten(self, lat, lon): + vec = np.zeros(self.oneDimensionOutputSize()) # 2D Dense softmax isn't possible + new_coords = self.encode(lat, lon) + pos = self.number_lat_cell() * (new_coords[0]) + 
new_coords[1] + vec[pos] = 1 # lon * lon size + return vec + class NgramIndex(): """ @@ -53,8 +97,8 @@ class NgramIndex(): self.ngram_gen = NGram(N=n) self.size = n - self.ngram_index = {} - self.index_ngram = {} + self.ngram_index = {"":0} + self.index_ngram = {0:""} self.cpt = 0 self.max_len = 0 -- GitLab