Commit 935b46a8 authored by Fize Jacques

Embeddings used are pretrained

parent 23beb70c
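.gitignore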
@@ -139,4 +139,5 @@ outputs/*
temp/*
WikipediaExtract/*
-*.DS_Store
\ No newline at end of file
+*.DS_Store
+test_comb.sh
# Base module
import os
import sys
from argparse import ArgumentParser
# Structure
import pandas as pd
@@ -24,7 +25,7 @@ from shapely.geometry import Point
# Custom module
from helpers import read_geonames
-from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex
+from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex, ConfigurationReader
# Visualisation module
@@ -46,34 +47,49 @@ logging.basicConfig(
)
chrono = Chronometer()
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
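# Example invocation (the script name is assumed for illustration; the data paths
# are the old hard-coded defaults removed below):
#   python combination_embedding.py data/geonamesData/FR.txt data/geonamesData/hierarchy.txt -m LSTM -e 100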
GEONAME_FN = "data/geonamesData/FR.txt"
GEONAMES_HIERARCHY_FN = "data/geonamesData/hierarchy.txt"
NGRAM_SIZE = 2
ACCURACY_TOLERANCE = 0.002
CONV = False
LSTM_train = False
GEONAME_FN = args.geoname_input
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
CONV, LSTM_train = False,False
if args.model == "CNN":
CONV = True
else:
LSTM_train = True
EPOCHS = args.epochs
# LOAD DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
# RETRIEVE INCLUSION RELATIONSHIPS
logging.info("Retrieve inclusion relationships ! ")
geoname2name = dict(filtered["geonameid name".split()].values)
filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
logging.info("{0} inclusion relationships retrieved ! ".format(len(inclusion_dict)))
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
index = NgramIndex(NGRAM_SIZE)
filtered.name.apply(lambda x : index.split_and_add(x)) # Register every n-gram that appears
filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
max_len = filtered.encode_name.apply(len).max() # Retrieve the longest encoding
filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Pad encodings shorter than max_len
geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) # Map each geonameid to its encoded toponym
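# Illustrative sketch of the assumed encoding, with n=2: "Paris" is split into
# bigrams ["Pa","ar","ri","is"], each bigram is mapped to its integer index, and
# the sequence is padded up to max_len with a filler index.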
logging.info("Done !")
#CLEAR RAM
del hierarchy_data
@@ -96,6 +112,8 @@ del filtered
embedding_dim = 256
num_words = len(index.index_ngram) # necessary for the embedding matrix
logging.info("Preparing Input and Output data...")
X_1, X_2, y_lat, y_lon = [], [], [], []
X_3 = []
for geonameId_1,geonameId_2 in inclusion_dict.items():
@@ -119,6 +137,8 @@ X_3 = np.array(X_3)
y_lat = np.array(y_lat)
y_lon = np.array(y_lon)
logging.info("Data prepared !")
def accuracy_at_k(y_true, y_pred):
    """
    Metric used to measure the accuracy of coordinate prediction. Unlike plain accuracy, we add a tolerance threshold due to the (quasi) impossible
@@ -134,15 +154,19 @@ def accuracy_at_k(y_true, y_pred):
    diff = y_true - y_pred
    fit = tf.where(tf.less(tf.abs(diff), ACCURACY_TOLERANCE))  # compare |error| so overshoots are not counted as hits
-    return K.size(fit[:,0])/K.size(y_pred), K.size(fit[:,1])/K.size(y_pred)
+    return K.size(fit)/K.size(y_pred)
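# Note (assumption): the targets are produced by zero_one_encoding, so the 0.002
# tolerance lives in the normalised [0,1] space (~0.72 deg of longitude), not in degrees.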
name = "{0}_{1}_{2}_{3}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE)
logging.info("Generating N-GRAM Embedding...")
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim)
logging.info("Embedding generated !")
if LSTM_train:
+    name = "LSTM_" + name
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
-    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
+    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, weights=[embedding_weights], trainable=False)
    x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
    x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
@@ -160,9 +184,10 @@ if LSTM_train:
    model = Model(inputs=[input_1,input_2], outputs=[output_lon,output_lat]) #input_3
    model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
+    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS, validation_split=0.3)
if CONV:
+    name = "CONV_" + name
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
@@ -194,4 +219,7 @@ if CONV:
    model = Model(inputs=[input_1,input_2], outputs=[output_lon,output_lat]) #input_3
    model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
+    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS, validation_split=0.3)
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv("outputs/{0}.csv".format(name))
\ No newline at end of file
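parser_config/toponym_combination_embedding.json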
{
"description": "Toponym Combination",
"args": [
{ "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
{ "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }
]
}
\ No newline at end of file
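requirements.txt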
@@ -12,3 +12,5 @@ gensim
sklearn
tensorflow
keras
+ngram
+shapely
\ No newline at end of file
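utils.py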
@@ -7,6 +7,10 @@ from nltk.tokenize import word_tokenize
import textwrap
from ngram import NGram
import argparse
import os
import json
class TokenizerCustom():
@@ -21,6 +25,10 @@ class TokenizerCustom():
        return seqs
class CoordinatesEncoder:
    """
    Deprecated!
    """
    def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
        self.min_lon = -180
        self.max_lon = -(self.min_lon)  # symmetric
@@ -69,6 +77,7 @@ class Quadtree(object):
    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, precision=10, curr_prec=0):
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        self.precision = precision
        x_r = abs(self.bottomright_x - self.upperleft_x)/2
        y_r = abs(self.upperleft_y - self.bottomright_y)/2
@@ -138,7 +147,8 @@ class Quadtree(object):
            return q.upperleft_x, q.upperleft_y, q.bottomright_x, q.bottomright_y
        return q.decode(hash_[2:])
from keras.layers import Embedding
from gensim.models import Word2Vec
class NgramIndex():
    def __init__(self, n):
        self.ngram_gen = NGram(N=n)
@@ -169,6 +179,14 @@ class NgramIndex():
        diff = MAX_LEN - len(ngram_encoding)
        ngram_encoding.extend([filling_item]*diff)
        return ngram_encoding
+    def get_embedding_layer(self, texts, dim=100):
+        model = Word2Vec([[str(w) for w in t] for t in texts], size=dim, window=5, min_count=1, workers=4)
+        N = len(self.ngram_index)
+        embedding_matrix = np.zeros((N, dim))
+        for i in range(N):
+            embedding_matrix[i] = model.wv[str(i)]
+        return embedding_matrix
def zero_one_encoding(long, lat):
    return ((long + 180.0) / 360.0), ((lat + 90.0) / 180.0)
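# Worked example: for Paris (lon 2.35, lat 48.85),
# zero_one_encoding(2.35, 48.85) -> (~0.50653, ~0.77139); both coordinates are
# mapped linearly onto [0, 1].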
@@ -180,6 +198,63 @@ def _split(lst,n,complete_chunk_value):
    chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
    return np.array(chunks)
class ConfigurationReader(object):
    def __init__(self, configuration_file):
        if not os.path.exists(configuration_file):
            raise FileNotFoundError("{0} file could not be found!".format(configuration_file))

        self.configuration = json.load(open(configuration_file))

        self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"])
        self.parser = argparse.ArgumentParser(description=self.__argparser_desc)

        self.parse_conf()

    def parse_conf(self):
        if not "args" in self.configuration:
            raise ValueError("No args given in the configuration file")

        for dict_args in self.configuration["args"]:
            if not isinstance(dict_args, dict):
                raise ValueError("Args must be dictionaries")

            short_command = dict_args.get("short", None)
            long_command = dict_args.get("long", None)

            if not short_command and not long_command:
                raise ValueError("No command name was given!")

            add_func_dict_ = {}
            if "help" in dict_args:
                add_func_dict_["help"] = dict_args["help"]
            if "default" in dict_args:
                add_func_dict_["default"] = dict_args["default"]
            if "action" in dict_args:
                add_func_dict_["action"] = dict_args["action"]
            if "type" in dict_args:
                add_func_dict_["type"] = eval(dict_args["type"])
            if "choices" in dict_args:
                add_func_dict_["choices"] = dict_args["choices"]

            if not (short_command and long_command):
                command = (short_command if not long_command else long_command)
                self.parser.add_argument(command, **add_func_dict_)
            else:
                self.parser.add_argument(short_command, long_command, **add_func_dict_)

    def parse_args(self, input_=None):
        if not input_:
            return self.parser.parse_args()
        return self.parser.parse_args(input_)
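# Minimal usage sketch (paths/values are illustrative, taken from the config above):
#   args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
#   args.ngram_size  # -> 2 unless overridden with -n/--ngram-size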
if __name__ == "__main__":
    q = Quadtree(-180, 90, 180, -90)
    hash_ = q.encode((1.2, 1.3))