Commit 23beb70c authored by Fize Jacques

ADD new embedding process using toponym signature combination

parent 35f9959b
# Base module
import os
import sys
# Structure
import pandas as pd
import numpy as np
# DEEPL module
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Add, concatenate, Dropout
from keras.models import Model
from keras.initializers import Constant
from keras.layers import GlobalAveragePooling1D, Bidirectional, LSTM, Average, Flatten
from keras import backend as K
import tensorflow as tf
# Geometry
from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex
# Visualisation module
import matplotlib.pyplot as plt
from tqdm import tqdm as tqdm_base
def tqdm(*args, **kwargs):
    # Close any stale tqdm instances before opening a new progress bar.
    if hasattr(tqdm_base, '_instances'):
        for instance in list(tqdm_base._instances):
            tqdm_base._decr_instances(instance)
    return tqdm_base(*args, **kwargs)
# Logging
import logging
from chrono import Chronometer
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s ',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)
chrono = Chronometer()
GEONAME_FN = "data/geonamesData/FR.txt"
GEONAMES_HIERARCHY_FN = "data/geonamesData/hierarchy.txt"
NGRAM_SIZE = 2
ACCURACY_TOLERANCE = 0.002
CONV = False
LSTM_train = False
# LOAD DATA
geoname_data = read_geonames(GEONAME_FN).fillna("")
hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
# SELECT ENTRIES with feature class A or P (administrative areas and populated places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy()
# RETRIEVE INCLUSION RELATIONSHIPS
geoname2name = dict(filtered["geonameid name".split()].values)
filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
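# Example (hypothetical ids): a hierarchy row with parentId=1111111 and
# childId=2222222 yields inclusion_dict[2222222] = 1111111, i.e. it maps a
# child geonameid to the geonameid of the area that contains it.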
# ENCODING NAME USING N-GRAM SPLITTING
index = NgramIndex(NGRAM_SIZE)
filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all available n-grams
filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings' max length
filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Pad encodings shorter than max_len
geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) # init a dict with the 'geonameid' --> 'encoded toponym' association
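# Worked example (hypothetical ids, assuming NGRAM_SIZE = 2): "Paris" is
# lowercased and split into overlapping bigrams such as "pa", "ar", "ri", "is"
# (the ngram library may also add "$" padding); each bigram gets an integer id,
# e.g. [12, 7, 31, 2], and the sequence is then right-padded with 0 up to max_len.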
#CLEAR RAM
del hierarchy_data
del geoname_data
# Encode each geonames entry coordinates
filtered["cell_vec"]=filtered.apply(
lambda x : zero_one_encoding(x.longitude,x.latitude),
axis=1
)
geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
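# Example: Paris (lon 2.35, lat 48.85) maps to ((2.35+180)/360, (48.85+90)/180)
# ≈ (0.5065, 0.7714); both components lie in [0, 1], which matches the sigmoid
# output range of the models below.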
# CLEAR RAM
del filtered
embedding_dim = 256
num_words = len(index.index_ngram) + 1 # n-gram ids start at 1 and 0 is the padding value, so the embedding matrix needs len+1 rows
X_1,X_2,y_lat,y_lon=[],[],[],[]
X_3 = []
# Build (child, parent, grandparent) toponym triples; the target coordinates
# are those of the child entry.
for geonameId_1, geonameId_2 in inclusion_dict.items():
    if not geonameId_2 in inclusion_dict:
        continue
    geonameId_3 = inclusion_dict[geonameId_2]
    top3 = geoname2encodedname[geonameId_3]
    X_3.append(top3)
    top1, top2 = geoname2encodedname[geonameId_1], geoname2encodedname[geonameId_2]
    X_1.append(top1)
    X_2.append(top2)
    y_lon.append(geoname_vec[geonameId_1][0])
    y_lat.append(geoname_vec[geonameId_1][1])
# Convert the input and output lists to numpy arrays
X_1 = np.array(X_1)
X_2 = np.array(X_2)
X_3 = np.array(X_3)
y_lat = np.array(y_lat)
y_lon = np.array(y_lon)
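# At this point X_1, X_2 and X_3 have shape (n_triples, max_len), and y_lon /
# y_lat are vectors of scalars in [0, 1] (one triple per child that also has a
# grandparent in the hierarchy).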
def accuracy_at_k(y_true, y_pred):
    """
    Metric used to measure the accuracy of the coordinate prediction. Unlike the
    standard accuracy metric, we add a tolerance threshold because it is (quasi)
    impossible for a neural network to predict the exact coordinates.

    Parameters
    ----------
    y_true : tf.Tensor
        ground-truth data
    y_pred : tf.Tensor
        predicted output
    """
    diff = tf.abs(y_true - y_pred)  # absolute error, so negative gaps also count
    fit = tf.where(tf.less(diff, ACCURACY_TOLERANCE))
    return K.size(fit[:, 0]) / K.size(y_pred)
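# Sketch of the metric on toy numbers: with ACCURACY_TOLERANCE = 0.002,
# y_true = [0.500, 0.300] and y_pred = [0.501, 0.310] give one fit out of two
# predictions (|0.001| < 0.002 but |0.010| >= 0.002), i.e. an accuracy of 0.5.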
if LSTM_train:
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
    x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
    x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
    x = concatenate([x1, x2])  # ,x3
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x)
    output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x)
    model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # ,input_3
    model.compile(loss=['mean_squared_error', 'mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
    history = model.fit(x=[X_1, X_2], y=[y_lon, y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
if CONV:
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
    x1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_1))
    x1 = Dropout(0.5)(x1)
    x1 = MaxPooling1D(pool_size=2)(x1)
    x1 = Flatten()(x1)
    x2 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_2))
    x2 = Dropout(0.5)(x2)
    x2 = MaxPooling1D(pool_size=2)(x2)
    x2 = Flatten()(x2)
    # x1 = Bidirectional(LSTM(max_len))(embedding_layer(input_1))
    # x2 = Bidirectional(LSTM(max_len))(embedding_layer(input_2))
    x = concatenate([x1, x2])  # ,x3
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x)
    output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x)
    model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # ,input_3
    model.compile(loss=['mean_squared_error', 'mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
    history = model.fit(x=[X_1, X_2], y=[y_lon, y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
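# A minimal inference sketch (not part of the original script; "model" and the
# arrays above are assumed): predictions can be mapped back to degrees by
# inverting zero_one_encoding.
#
#   pred_lon, pred_lat = model.predict([X_1[:1], X_2[:1]])
#   lon = pred_lon[0, 0] * 360.0 - 180.0
#   lat = pred_lat[0, 0] * 180.0 - 90.0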
helpers.py
@@ -45,7 +45,7 @@ def read_geonames(file):
     4:"latitude",       # latitude
     5:"longitude",      # longitude
     6:"feature_class",  # feature class
-    7:"feature_class",  # feature code
+    7:"feature_code",   # feature code
     8:"country_code",   # country code
     9:"cc2",            # cc2
     10:"admin1_code",   # admin1 code
requirements.txt
@@ -8,4 +8,7 @@ tqdm
 networkx
 matplotlib
 joblib
 gensim
+sklearn
+tensorflow
+keras
utils.py
@@ -4,6 +4,9 @@ import numpy as np
 from stop_words import get_stop_words
 from nltk.tokenize import word_tokenize
+import textwrap
+from ngram import NGram

 class TokenizerCustom():
@@ -61,9 +64,129 @@ class CoordinatesEncoder:
         vec[pos] = 1  # lon * lon size
         return vec
class Quadtree(object):
    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, precision=10, curr_prec=0):
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        self.precision = precision  # was read but never assigned in the original, which raised an AttributeError
        x_r = abs(self.bottomright_x - self.upperleft_x) / 2
        y_r = abs(self.upperleft_y - self.bottomright_y) / 2
        if curr_prec == precision:
            self.value = ""  # leaf cell
        else:
            # Split the cell into its NW, NE, SW and SE quadrants.
            self.value = [
                Quadtree(upperleft_x,
                         upperleft_y,
                         bottomright_x - x_r,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y,
                         bottomright_x,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x,
                         upperleft_y - y_r,
                         bottomright_x - x_r,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y - y_r,
                         bottomright_x,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         )
            ]

    def contains_obj(self, pos):
        x, y = pos[0], pos[1]
        if x < self.upperleft_x or x > self.bottomright_x:
            return False
        if y > self.upperleft_y or y < self.bottomright_y:
            return False
        return True

    def binary(self, integer):
        ch = "{0:b}".format(integer)
        return "0" * (2 - len(ch)) + ch  # zero-pad to two bits

    def encode(self, pos):
        if not isinstance(self.value, list):
            return ""
        for ix, q in enumerate(self.value):
            if q.contains_obj(pos):
                return self.binary(ix) + q.encode(pos)

    def int_encode(self, pos):
        return list(map(int, textwrap.wrap(self.encode(pos), 1)))

    def decode(self, hash_):
        if len(hash_) % 2 != 0:
            raise ValueError("Wrong Hash !")
        q_pos = int(hash_[:2], 2)  # safer than the original eval("0b" + ...)
        q = self.value[q_pos]
        if len(hash_) == 2:
            return q.upperleft_x, q.upperleft_y, q.bottomright_x, q.bottomright_y
        return q.decode(hash_[2:])
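# Worked example: on a world-spanning tree, Quadtree(-180, 90, 180, -90), the
# point (1.2, 1.3) falls in the NE quadrant at the first level (child index 1,
# bits "01"); each deeper level appends two more bits, so with the default
# precision=10 the hash is a 20-character bit string.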
class NgramIndex():
    def __init__(self, n):
        self.ngram_gen = NGram(N=n)
        self.ngram_index = {}
        self.index_ngram = {}
        self.cpt = 0
        self.max_len = 0

    def split_and_add(self, word):
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        for ngram in ngrams:
            self.add(ngram)

    def add(self, ngram):
        if not ngram in self.ngram_index:
            self.cpt += 1
            self.ngram_index[ngram] = self.cpt
            self.index_ngram[self.cpt] = ngram

    def encode(self, word):
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        for ng in ngrams:
            self.add(ng)  # add() already skips known n-grams
        return [self.ngram_index[ng] for ng in ngrams]

    def complete(self, ngram_encoding, MAX_LEN, filling_item=0):
        assert len(ngram_encoding) <= MAX_LEN
        diff = MAX_LEN - len(ngram_encoding)
        ngram_encoding.extend([filling_item] * diff)
        return ngram_encoding
def zero_one_encoding(long, lat):
    return ((long + 180.0) / 360.0), ((lat + 90.0) / 180.0)
def _split(lst, n, complete_chunk_value):
    chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
    if not chunks:
        return chunks
    if len(chunks[-1]) != n:
        chunks[-1].extend([complete_chunk_value] * (n - len(chunks[-1])))
    return np.array(chunks)
if __name__ == "__main__":
q = Quadtree(-180,90,180,-90)
hash_ = q.encode((1.2,1.3))
q.decode(hash_)
index = NgramIndex(3)
index.split_and_add("J'aime le paté")
encoding = index.encode("xxxyyyy")
index.complete(encoding,10)
\ No newline at end of file