Commit 23beb70c authored by Fize Jacques

ADD new embedding process using toponym signature combination

parent 35f9959b
# Base module
import os
import sys
# Structure
import pandas as pd
import numpy as np
# DEEPL module
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Add, concatenate, Dropout
from keras.models import Model
from keras.initializers import Constant
from keras.layers import GlobalAveragePooling1D, Bidirectional, LSTM, Average, Flatten
from keras import backend as K
import tensorflow as tf
# Geometry
from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex
# Visualisation module
import matplotlib.pyplot as plt
from tqdm import tqdm as tqdm_base
def tqdm(*args, **kwargs):
    # Close any stale tqdm instances before opening a new progress bar.
    if hasattr(tqdm_base, '_instances'):
        for instance in list(tqdm_base._instances):
            tqdm_base._decr_instances(instance)
    return tqdm_base(*args, **kwargs)
# Logging
import logging
from chrono import Chronometer
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s ',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)
chrono = Chronometer()
GEONAME_FN = "data/geonamesData/FR.txt"
GEONAMES_HIERARCHY_FN = "data/geonamesData/hierarchy.txt"
NGRAM_SIZE = 2
ACCURACY_TOLERANCE = 0.002
CONV = False
LSTM_train = False
# LOAD DATA
geoname_data = read_geonames(GEONAME_FN).fillna("")
hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
# SELECT ENTRIES with feature class A or P (administrative areas and populated places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy()
# RETRIEVE INCLUSION RELATIONSHIPS
geoname2name = dict(filtered["geonameid name".split()].values)
filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
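# Example (hypothetical ids): a hierarchy row with parentId=1111111 and
# childId=2222222 yields inclusion_dict[2222222] = 1111111, i.e. it maps a
# child geonameid to the geonameid of the area that contains it.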
# ENCODING NAME USING N-GRAM SPLITTING
index = NgramIndex(NGRAM_SIZE)
filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all available n-grams
filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings' max length
filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Pad encodings shorter than max_len
geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) # init a dict with the 'geonameid' --> 'encoded toponym' association
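# Worked example (hypothetical ids, assuming NGRAM_SIZE = 2): "Paris" is
# lowercased and split into overlapping bigrams such as "pa", "ar", "ri", "is"
# (the ngram library may also add "$" padding); each bigram gets an integer id,
# e.g. [12, 7, 31, 2], and the sequence is then right-padded with 0 up to max_len.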
#CLEAR RAM
del hierarchy_data
del geoname_data
# Encode each geonames entry coordinates
filtered["cell_vec"]=filtered.apply(
lambda x : zero_one_encoding(x.longitude,x.latitude),
axis=1
)
geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
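# Example: Paris (lon 2.35, lat 48.85) maps to ((2.35+180)/360, (48.85+90)/180)
# ≈ (0.5065, 0.7714); both components lie in [0, 1], which matches the sigmoid
# output range of the models below.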
# CLEAR RAM
del filtered
embedding_dim = 256
num_words = len(index.index_ngram) + 1 # n-gram ids start at 1 and 0 is the padding value, so the embedding matrix needs len+1 rows
X_1,X_2,y_lat,y_lon=[],[],[],[]
X_3 = []
# Build (child, parent, grandparent) toponym triples; the target coordinates
# are those of the child entry.
for geonameId_1, geonameId_2 in inclusion_dict.items():
    if not geonameId_2 in inclusion_dict:
        continue
    geonameId_3 = inclusion_dict[geonameId_2]
    top3 = geoname2encodedname[geonameId_3]
    X_3.append(top3)
    top1, top2 = geoname2encodedname[geonameId_1], geoname2encodedname[geonameId_2]
    X_1.append(top1)
    X_2.append(top2)
    y_lon.append(geoname_vec[geonameId_1][0])
    y_lat.append(geoname_vec[geonameId_1][1])
# Convert the input and output lists to numpy arrays
X_1 = np.array(X_1)
X_2 = np.array(X_2)
X_3 = np.array(X_3)
y_lat = np.array(y_lat)
y_lon = np.array(y_lon)
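# At this point X_1, X_2 and X_3 have shape (n_triples, max_len), and y_lon /
# y_lat are vectors of scalars in [0, 1] (one triple per child that also has a
# grandparent in the hierarchy).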
def accuracy_at_k(y_true, y_pred):
    """
    Metric used to measure the accuracy of the coordinate prediction. Unlike the
    standard accuracy metric, we add a tolerance threshold because it is (quasi)
    impossible for a neural network to predict the exact coordinates.

    Parameters
    ----------
    y_true : tf.Tensor
        ground-truth data
    y_pred : tf.Tensor
        predicted output
    """
    diff = tf.abs(y_true - y_pred)  # absolute error, so negative gaps also count
    fit = tf.where(tf.less(diff, ACCURACY_TOLERANCE))
    return K.size(fit[:, 0]) / K.size(y_pred)
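# Sketch of the metric on toy numbers: with ACCURACY_TOLERANCE = 0.002,
# y_true = [0.500, 0.300] and y_pred = [0.501, 0.310] give one fit out of two
# predictions (|0.001| < 0.002 but |0.010| >= 0.002), i.e. an accuracy of 0.5.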
if LSTM_train:
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
    x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
    x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
    x = concatenate([x1, x2])  # ,x3
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x)
    output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x)
    model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # ,input_3
    model.compile(loss=['mean_squared_error', 'mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
    history = model.fit(x=[X_1, X_2], y=[y_lon, y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
if CONV:
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
    x1 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_1))
    x1 = Dropout(0.5)(x1)
    x1 = MaxPooling1D(pool_size=2)(x1)
    x1 = Flatten()(x1)
    x2 = Conv1D(filters=32, kernel_size=3, activation='relu')(embedding_layer(input_2))
    x2 = Dropout(0.5)(x2)
    x2 = MaxPooling1D(pool_size=2)(x2)
    x2 = Flatten()(x2)
    # x1 = Bidirectional(LSTM(max_len))(embedding_layer(input_1))
    # x2 = Bidirectional(LSTM(max_len))(embedding_layer(input_2))
    x = concatenate([x1, x2])  # ,x3
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(500, activation="relu")(x)
    x = Dropout(0.3)(x)
    output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x)
    output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x)
    model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # ,input_3
    model.compile(loss=['mean_squared_error', 'mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
    history = model.fit(x=[X_1, X_2], y=[y_lon, y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
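# A minimal inference sketch (not part of the original script; "model" and the
# arrays above are assumed): predictions can be mapped back to degrees by
# inverting zero_one_encoding.
#
#   pred_lon, pred_lat = model.predict([X_1[:1], X_2[:1]])
#   lon = pred_lon[0, 0] * 360.0 - 180.0
#   lat = pred_lat[0, 0] * 180.0 - 90.0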
helpers.py
@@ -45,7 +45,7 @@ def read_geonames(file):
     4:"latitude",       # latitude
     5:"longitude",      # longitude
     6:"feature_class",  # feature class
-    7:"feature_class",  # feature code
+    7:"feature_code",   # feature code
     8:"country_code",   # country code
     9:"cc2",            # cc2
     10:"admin1_code",   # admin1 code
requirements.txt
@@ -8,4 +8,7 @@ tqdm
 networkx
 matplotlib
 joblib
 gensim
+sklearn
+tensorflow
+keras
utils.py
@@ -4,6 +4,9 @@ import numpy as np
 from stop_words import get_stop_words
 from nltk.tokenize import word_tokenize
+import textwrap
+from ngram import NGram

 class TokenizerCustom():
@@ -61,9 +64,129 @@ class CoordinatesEncoder:
         vec[pos] = 1  # lon * lon size
         return vec
class Quadtree(object):
    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, precision=10, curr_prec=0):
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        self.precision = precision  # was read but never assigned in the original, which raised an AttributeError
        x_r = abs(self.bottomright_x - self.upperleft_x) / 2
        y_r = abs(self.upperleft_y - self.bottomright_y) / 2
        if curr_prec == precision:
            self.value = ""  # leaf cell
        else:
            # Split the cell into its NW, NE, SW and SE quadrants.
            self.value = [
                Quadtree(upperleft_x,
                         upperleft_y,
                         bottomright_x - x_r,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y,
                         bottomright_x,
                         bottomright_y + y_r,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x,
                         upperleft_y - y_r,
                         bottomright_x - x_r,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         ),
                Quadtree(upperleft_x + x_r,
                         upperleft_y - y_r,
                         bottomright_x,
                         bottomright_y,
                         precision=self.precision,
                         curr_prec=curr_prec + 1
                         )
            ]

    def contains_obj(self, pos):
        x, y = pos[0], pos[1]
        if x < self.upperleft_x or x > self.bottomright_x:
            return False
        if y > self.upperleft_y or y < self.bottomright_y:
            return False
        return True

    def binary(self, integer):
        ch = "{0:b}".format(integer)
        return "0" * (2 - len(ch)) + ch  # zero-pad to two bits

    def encode(self, pos):
        if not isinstance(self.value, list):
            return ""
        for ix, q in enumerate(self.value):
            if q.contains_obj(pos):
                return self.binary(ix) + q.encode(pos)

    def int_encode(self, pos):
        return list(map(int, textwrap.wrap(self.encode(pos), 1)))

    def decode(self, hash_):
        if len(hash_) % 2 != 0:
            raise ValueError("Wrong Hash !")
        q_pos = int(hash_[:2], 2)  # safer than the original eval("0b" + ...)
        q = self.value[q_pos]
        if len(hash_) == 2:
            return q.upperleft_x, q.upperleft_y, q.bottomright_x, q.bottomright_y
        return q.decode(hash_[2:])
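# Worked example: on a world-spanning tree, Quadtree(-180, 90, 180, -90), the
# point (1.2, 1.3) falls in the NE quadrant at the first level (child index 1,
# bits "01"); each deeper level appends two more bits, so with the default
# precision=10 the hash is a 20-character bit string.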
class NgramIndex():
    def __init__(self, n):
        self.ngram_gen = NGram(N=n)
        self.ngram_index = {}
        self.index_ngram = {}
        self.cpt = 0
        self.max_len = 0

    def split_and_add(self, word):
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        for ngram in ngrams:
            self.add(ngram)

    def add(self, ngram):
        if not ngram in self.ngram_index:
            self.cpt += 1
            self.ngram_index[ngram] = self.cpt
            self.index_ngram[self.cpt] = ngram

    def encode(self, word):
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
        for ng in ngrams:
            self.add(ng)  # add() already skips known n-grams
        return [self.ngram_index[ng] for ng in ngrams]

    def complete(self, ngram_encoding, MAX_LEN, filling_item=0):
        assert len(ngram_encoding) <= MAX_LEN
        diff = MAX_LEN - len(ngram_encoding)
        ngram_encoding.extend([filling_item] * diff)
        return ngram_encoding
def zero_one_encoding(long, lat):
    return ((long + 180.0) / 360.0), ((lat + 90.0) / 180.0)
def _split(lst, n, complete_chunk_value):
    chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
    if not chunks:
        return chunks
    if len(chunks[-1]) != n:
        chunks[-1].extend([complete_chunk_value] * (n - len(chunks[-1])))
    return np.array(chunks)
if __name__ == "__main__":
q = Quadtree(-180,90,180,-90)
hash_ = q.encode((1.2,1.3))
q.decode(hash_)
index = NgramIndex(3)
index.split_and_add("J'aime le paté")
encoding = index.encode("xxxyyyy")
index.complete(encoding,10)
\ No newline at end of file