diff --git a/combination_embeddings.py b/combination_embeddings.py
new file mode 100644
index 0000000000000000000000000000000000000000..78009e04c7cb1db7b9dd2249904040eb715050b5
--- /dev/null
+++ b/combination_embeddings.py
@@ -0,0 +1,197 @@
+# Base modules
+import os
+import sys
+
+# Data-structure modules
+import pandas as pd
+import numpy as np
+
+# Deep-learning modules
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from keras.utils import to_categorical
+from keras.layers import Dense, Input, GlobalMaxPooling1D
+from keras.layers import Conv1D, MaxPooling1D, Embedding
+from keras.layers import Add, concatenate, Dropout
+from keras.models import Model
+from keras.initializers import Constant
+from keras.layers import GlobalAveragePooling1D, Bidirectional, LSTM, Average, Flatten
+from keras import backend as K
+import tensorflow as tf
+
+# Geometry
+from shapely.geometry import Point
+
+# Custom modules
+from helpers import read_geonames
+from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex
+
+# Visualisation and progress modules
+import matplotlib.pyplot as plt
+from tqdm import tqdm as tqdm_base
+
+def tqdm(*args, **kwargs):
+    # Close any stale tqdm instances before opening a new progress bar
+    if hasattr(tqdm_base, '_instances'):
+        for instance in list(tqdm_base._instances):
+            tqdm_base._decr_instances(instance)
+    return tqdm_base(*args, **kwargs)
+
+# Logging
+import logging
+from chrono import Chronometer
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s ',
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO
+    )
+chrono = Chronometer()
+
+GEONAME_FN = "data/geonamesData/FR.txt"
+GEONAMES_HIERARCHY_FN = "data/geonamesData/hierarchy.txt"
+NGRAM_SIZE = 2
+ACCURACY_TOLERANCE = 0.002
+
+CONV = False
+LSTM_train = False
+
+# LOAD DATA
+geoname_data = read_geonames(GEONAME_FN).fillna("")
+hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN, sep="\t", header=None, names="parentId,childId,type".split(",")).fillna("")
+
+# SELECT ENTRIES whose feature class is A or P (areas and populated places)
+filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy()
+
+# RETRIEVE INCLUSION RELATIONSHIPS
+geoname2name = dict(filtered["geonameid name".split()].values)
+filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
+inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
+
+# ENCODE NAMES USING N-GRAM SPLITTING
+index = NgramIndex(NGRAM_SIZE)
+filtered.name.apply(lambda x: index.split_and_add(x))  # Collect every n-gram that occurs in the toponyms
+filtered["encode_name"] = filtered.name.apply(lambda x: index.encode(x))  # First encoding
+max_len = filtered.encode_name.apply(len).max()  # Length of the longest encoding
+filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x, max_len))  # Pad encodings shorter than max_len
+geoname2encodedname = dict(filtered["geonameid encode_name".split()].values)  # Map 'geonameid' --> 'encoded toponym'
+
+# CLEAR RAM
+del hierarchy_data
+del geoname_data
+
+# Encode the coordinates of each geonames entry into [0, 1]
+filtered["cell_vec"] = filtered.apply(
+    lambda x: zero_one_encoding(x.longitude, x.latitude),
+    axis=1
+    )
+geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
+
+# CLEAR RAM
+del filtered
+
+embedding_dim = 256
+num_words = len(index.index_ngram) + 1  # embedding matrix size (+1: n-gram indices start at 1 and 0 is the padding value)
+
+X_1, X_2, y_lat, y_lon = [], [], [], []
+X_3 = []
+for geonameId_1, geonameId_2 in inclusion_dict.items():
+    if geonameId_2 not in inclusion_dict:
+        continue
+    geonameId_3 = inclusion_dict[geonameId_2]
+    top3 = geoname2encodedname[geonameId_3]
+    X_3.append(top3)
+
+    top1, top2 = geoname2encodedname[geonameId_1], geoname2encodedname[geonameId_2]
+    X_1.append(top1)
+    X_2.append(top2)
+
+    y_lon.append(geoname_vec[geonameId_1][0])
+    y_lat.append(geoname_vec[geonameId_1][1])
+
+# Convert the input and output lists to numpy arrays
+X_1 = np.array(X_1)
+X_2 = np.array(X_2)
+X_3 = np.array(X_3)
+y_lat = np.array(y_lat)
+y_lon = np.array(y_lon)
+
+def accuracy_at_k(y_true, y_pred):
+    """
+    Metric used to measure the accuracy of the coordinate prediction. Unlike the standard
+    accuracy metric, a tolerance threshold is applied, because it is (almost) impossible
+    for the network to predict the exact coordinates.
+
+    Parameters
+    ----------
+    y_true : tf.Tensor
+        ground-truth coordinates
+    y_pred : tf.Tensor
+        predicted coordinates
+    """
+    diff = tf.abs(y_true - y_pred)
+    fit = tf.where(tf.less(diff, ACCURACY_TOLERANCE))
+    return K.size(fit[:, 0]) / K.size(y_pred)
+
+if LSTM_train:
+    input_1 = Input(shape=(max_len,))
+    input_2 = Input(shape=(max_len,))
+    # input_3 = Input(shape=(1,))
+
+    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
+
+    x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
+    x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
+
+    x = concatenate([x1, x2])  # ,x3])
+
+    x = Dense(500, activation="relu")(x)
+    x = Dropout(0.3)(x)
+    x = Dense(500, activation="relu")(x)
+    x = Dropout(0.3)(x)
+
+    output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x)
+    output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x)
+
+    model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # input_3
+
+    model.compile(loss=['mean_squared_error', 'mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
+    history = model.fit(x=[X_1, X_2], y=[y_lon, y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
+
+if CONV:
+    input_1 = Input(shape=(max_len,))
+    input_2 = Input(shape=(max_len,))
+    # input_3 = Input(shape=(1,))
+
+    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
+
+    x1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_1))
+    x1 = Dropout(0.5)(x1)
+    x1 = MaxPooling1D(pool_size=2)(x1)
+    x1 = Flatten()(x1)
+
+    x2 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_2))
+    x2 = Dropout(0.5)(x2)
+    x2 = MaxPooling1D(pool_size=2)(x2)
+    x2 = Flatten()(x2)
+    # x1 = Bidirectional(LSTM(max_len))(embedding_layer(input_1))
+    # x2 = Bidirectional(LSTM(max_len))(embedding_layer(input_2))
+
+    x = concatenate([x1, x2])  # ,x3])
+
+    x = Dense(500, activation="relu")(x)
+    x = Dropout(0.3)(x)
+    x = Dense(500, activation="relu")(x)
+    x = Dropout(0.3)(x)
+
+    output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x)
+    output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x)
+
+    model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # input_3
+
+    model.compile(loss=['mean_squared_error', 'mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
+    history = model.fit(x=[X_1, X_2], y=[y_lon, y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
diff --git a/helpers.py b/helpers.py
index 19ac5551fff18870e8e76f2cb18dcf4394358d3c..c1e1f34178bcd39c5f49f90e4d1fd2d9f3cbf803 100644
--- a/helpers.py
+++ b/helpers.py
@@ -45,7 +45,7 @@ def read_geonames(file):
         4:"latitude", # latitude
         5:"longitude", # longitude
longitude 6:"feature_class", # feature class - 7:"feature_class", # feature code + 7:"feature_code", # feature code 8:"country_code", # country code 9:"cc2", # cc2 10:"admin1_code", # admin1 code diff --git a/requirements.txt b/requirements.txt index 6c3f0d19a8b7b000c536884c28b1b5c96d2d909d..7755253ba05d8a6d1d5d5cd931d364bc62747174 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,7 @@ tqdm networkx matplotlib joblib -gensim \ No newline at end of file +gensim +sklearn +tensorflow +keras diff --git a/utils.py b/utils.py index d41f4e0ddb988363aad8411da5c854e23cb37d1e..c92bb2b7ac8990317bd662bf94550fcca2f09eaf 100644 --- a/utils.py +++ b/utils.py @@ -4,6 +4,9 @@ import numpy as np from stop_words import get_stop_words from nltk.tokenize import word_tokenize +import textwrap +from ngram import NGram + class TokenizerCustom(): @@ -61,9 +64,129 @@ class CoordinatesEncoder: vec[pos] = 1 #lon * lon size return vec + +class Quadtree(object): + def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,precision=10,curr_prec=0): + self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y + + x_r = abs(self.bottomright_x - self.upperleft_x)/2 + y_r = abs(self.upperleft_y - self.bottomright_y )/2 + + # if abs(self.bottomright_x - self.upperleft_x) <= cell_size[0] or abs(self.upperleft_y - self.bottomright_y) <=cell_size[1]: + if curr_prec == precision: + self.value = "" + else: + #print(ix,x_r,y_r)#print(x_r,y_r) + self.value = [ + Quadtree(upperleft_x, + upperleft_y, + bottomright_x-x_r, + bottomright_y+y_r, + precision=self.precision, + curr_prec=curr_prec+1 + ), + Quadtree(upperleft_x+x_r, + upperleft_y, + bottomright_x, + bottomright_y+y_r, + precision=self.precision, + curr_prec=curr_prec+1 + ), + Quadtree(upperleft_x, + upperleft_y-y_r, + bottomright_x-x_r, + bottomright_y, + precision=self.precision, + curr_prec=curr_prec+1 + ), + Quadtree(upperleft_x+x_r, + upperleft_y-y_r, + bottomright_x, + bottomright_y, + precision=self.precision, + curr_prec=curr_prec+1 + ) + ] + def contains_obj(self,pos): + x,y = pos[0],pos[1] + if x < self.upperleft_x or x > self.bottomright_x: + return False + if y >self.upperleft_y or y < self.bottomright_y: + return False + return True + + def binary(self,integer): + ch = "{0:b}".format(integer) + return "0"*(2-len(ch))+ch + + def encode(self,pos): + if not isinstance(self.value,list): + return "" + for ix,q in enumerate(self.value): + if q.contains_obj(pos): + return self.binary(ix)+q.encode(pos) + + def int_encode(self,pos): + return list(map(int,textwrap.wrap(self.encode(pos),1))) + + def decode(self,hash_): + if not len(hash_)%2 ==0: + raise ValueError("Wrong Hash ! 
") + q_pos = eval("0b"+hash_[:2]) + q = self.value[q_pos] + if len(hash_) == 2: + return q.upperleft_x,q.upperleft_y,q.bottomright_x,q.bottomright_y + return q.decode(hash_[2:]) + + +class NgramIndex(): + def __init__(self,n): + self.ngram_gen = NGram(N=n) + + self.ngram_index = {} + self.index_ngram = {} + self.cpt = 0 + self.max_len = 0 + def split_and_add(self,word): + ngrams = word.lower().replace(" ","$") + ngrams = list(self.ngram_gen.split(ngrams)) + [self.add(ngram) for ngram in ngrams] + + def add(self,ngram): + if not ngram in self.ngram_index: + self.cpt+=1 + self.ngram_index[ngram]=self.cpt + self.index_ngram[self.cpt]=ngram + + def encode(self,word): + ngrams = word.lower().replace(" ","$") + ngrams = list(self.ngram_gen.split(ngrams)) + [self.add(ng) for ng in ngrams if not ng in self.ngram_index] + return [self.ngram_index[ng] for ng in ngrams] + + def complete(self,ngram_encoding,MAX_LEN,filling_item=0): + assert len(ngram_encoding) <= MAX_LEN + diff = MAX_LEN - len(ngram_encoding) + ngram_encoding.extend([filling_item]*diff) + return ngram_encoding + +def zero_one_encoding(long,lat): + return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0) + def _split(lst,n,complete_chunk_value): chunks = [lst[i:i + n] for i in range(0, len(lst), n)] if not chunks:return chunks if len(chunks[-1]) != n: chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) - return np.array(chunks) \ No newline at end of file + return np.array(chunks) + +if __name__ == "__main__": + q = Quadtree(-180,90,180,-90) + hash_ = q.encode((1.2,1.3)) + q.decode(hash_) + + + index = NgramIndex(3) + index.split_and_add("J'aime le paté") + encoding = index.encode("xxxyyyy") + index.complete(encoding,10) \ No newline at end of file