Commit 935b46a8 authored by Fize Jacques

Embeddings used are pretrained

parent 23beb70c
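.gitignore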
@@ -139,4 +139,5 @@ outputs/*
temp/*
WikipediaExtract/*
-*.DS_Store
\ No newline at end of file
+*.DS_Store
+test_comb.sh
# Base module
import os
import sys
from argparse import ArgumentParser
# Structure
import pandas as pd
@@ -24,7 +25,7 @@ from shapely.geometry import Point
# Custom module
from helpers import read_geonames
-from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex
+from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex, ConfigurationReader
# Visualisation module
@@ -46,34 +47,49 @@ logging.basicConfig(
)
chrono = Chronometer()
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
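# Example invocation (the script name is assumed for illustration; the data paths
# are the old hard-coded defaults removed below):
#   python combination_embedding.py data/geonamesData/FR.txt data/geonamesData/hierarchy.txt -m LSTM -e 100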
GEONAME_FN = "data/geonamesData/FR.txt"
GEONAMES_HIERARCHY_FN = "data/geonamesData/hierarchy.txt"
NGRAM_SIZE = 2
ACCURACY_TOLERANCE = 0.002
CONV = False
LSTM_train = False
GEONAME_FN = args.geoname_input
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
CONV, LSTM_train = False,False
if args.model == "CNN":
CONV = True
else:
LSTM_train = True
EPOCHS = args.epochs
# LOAD DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
logging.info("Geonames data loaded!")
# SELECT ENTRY with class == to A and P (Areas and Populated Places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
# RETRIEVE INCLUSION RELATIONSHIPS
logging.info("Retrieve inclusion relationships ! ")
geoname2name = dict(filtered["geonameid name".split()].values)
filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
logging.info("{0} inclusion relationships retrieved ! ".format(len(inclusion_dict)))
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
index = NgramIndex(NGRAM_SIZE)
filtered.name.apply(lambda x : index.split_and_add(x)) # Register every n-gram that appears
filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
max_len = filtered.encode_name.apply(len).max() # Retrieve the longest encoding
filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Pad encodings shorter than max_len
geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) # Map each geonameid to its encoded toponym
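# Illustrative sketch of the assumed encoding, with n=2: "Paris" is split into
# bigrams ["Pa","ar","ri","is"], each bigram is mapped to its integer index, and
# the sequence is padded up to max_len with a filler index.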
logging.info("Done !")
#CLEAR RAM
del hierarchy_data
@@ -96,6 +112,8 @@ del filtered
embedding_dim = 256
num_words = len(index.index_ngram) # necessary for the embedding matrix
logging.info("Preparing Input and Output data...")
X_1, X_2, y_lat, y_lon = [], [], [], []
X_3 = []
for geonameId_1,geonameId_2 in inclusion_dict.items():
@@ -119,6 +137,8 @@ X_3 = np.array(X_3)
y_lat = np.array(y_lat)
y_lon = np.array(y_lon)
logging.info("Data prepared !")
def accuracy_at_k(y_true, y_pred):
    """
    Metric used to measure the accuracy of coordinate prediction. Unlike plain accuracy, we add a tolerance threshold due to the (quasi) impossible
@@ -134,15 +154,19 @@ def accuracy_at_k(y_true, y_pred):
    diff = y_true - y_pred
    fit = tf.where(tf.less(tf.abs(diff), ACCURACY_TOLERANCE))  # compare |error| so overshoots are not counted as hits
-    return K.size(fit[:,0])/K.size(y_pred), K.size(fit[:,1])/K.size(y_pred)
+    return K.size(fit)/K.size(y_pred)
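# Note (assumption): the targets are produced by zero_one_encoding, so the 0.002
# tolerance lives in the normalised [0,1] space (~0.72 deg of longitude), not in degrees.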
name = "{0}_{1}_{2}_{3}".format(GEONAME_FN.split("/")[-1],EPOCHS,NGRAM_SIZE,ACCURACY_TOLERANCE)
logging.info("Generating N-GRAM Embedding...")
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= embedding_dim)
logging.info("Embedding generated !")
if LSTM_train:
+    name = "LSTM_" + name
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
-    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
+    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, weights=[embedding_weights], trainable=False)
    x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
    x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
@@ -160,9 +184,10 @@ if LSTM_train:
    model = Model(inputs=[input_1,input_2], outputs=[output_lon,output_lat]) #input_3
    model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
+    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS, validation_split=0.3)
if CONV:
+    name = "CONV_" + name
    input_1 = Input(shape=(max_len,))
    input_2 = Input(shape=(max_len,))
    #input_3 = Input(shape=(1,))
@@ -194,4 +219,7 @@ if CONV:
    model = Model(inputs=[input_1,input_2], outputs=[output_lon,output_lat]) #input_3
    model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam', metrics=[accuracy_at_k])
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=50, validation_split=0.3)
+    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS, validation_split=0.3)
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv("outputs/{0}.csv".format(name))
\ No newline at end of file
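parser_config/toponym_combination_embedding.json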
{
"description": "Toponym Combination",
"args": [
{ "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." },
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
{ "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }
]
}
\ No newline at end of file
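requirements.txt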
@@ -12,3 +12,5 @@ gensim
sklearn
tensorflow
keras
+ngram
+shapely
\ No newline at end of file
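utils.py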
@@ -7,6 +7,10 @@ from nltk.tokenize import word_tokenize
import textwrap
from ngram import NGram
import argparse
import os
import json
class TokenizerCustom():
@@ -21,6 +25,10 @@ class TokenizerCustom():
        return seqs
class CoordinatesEncoder:
    """
    Deprecated!
    """
    def __init__(self, cell_size_lat=0.5, cell_size_lon=0.5):
        self.min_lon = -180
        self.max_lon = -(self.min_lon)  # symmetric
@@ -69,6 +77,7 @@ class Quadtree(object):
    def __init__(self, upperleft_x, upperleft_y, bottomright_x, bottomright_y, precision=10, curr_prec=0):
        self.upperleft_x, self.upperleft_y, self.bottomright_x, self.bottomright_y = upperleft_x, upperleft_y, bottomright_x, bottomright_y
        self.precision = precision
        x_r = abs(self.bottomright_x - self.upperleft_x)/2
        y_r = abs(self.upperleft_y - self.bottomright_y)/2
@@ -138,7 +147,8 @@ class Quadtree(object):
            return q.upperleft_x, q.upperleft_y, q.bottomright_x, q.bottomright_y
        return q.decode(hash_[2:])
from keras.layers import Embedding
from gensim.models import Word2Vec
class NgramIndex():
    def __init__(self, n):
        self.ngram_gen = NGram(N=n)
@@ -169,6 +179,14 @@ class NgramIndex():
        diff = MAX_LEN - len(ngram_encoding)
        ngram_encoding.extend([filling_item]*diff)
        return ngram_encoding
+    def get_embedding_layer(self, texts, dim=100):
+        model = Word2Vec([[str(w) for w in t] for t in texts], size=dim, window=5, min_count=1, workers=4)
+        N = len(self.ngram_index)
+        embedding_matrix = np.zeros((N, dim))
+        for i in range(N):
+            embedding_matrix[i] = model.wv[str(i)]
+        return embedding_matrix
def zero_one_encoding(long, lat):
    return ((long + 180.0) / 360.0), ((lat + 90.0) / 180.0)
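# Worked example: for Paris (lon 2.35, lat 48.85),
# zero_one_encoding(2.35, 48.85) -> (~0.50653, ~0.77139); both coordinates are
# mapped linearly onto [0, 1].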
@@ -180,6 +198,63 @@ def _split(lst,n,complete_chunk_value):
    chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
    return np.array(chunks)
class ConfigurationReader(object):
    def __init__(self, configuration_file):
        if not os.path.exists(configuration_file):
            raise FileNotFoundError("{0} file could not be found!".format(configuration_file))

        self.configuration = json.load(open(configuration_file))

        self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"])
        self.parser = argparse.ArgumentParser(description=self.__argparser_desc)

        self.parse_conf()

    def parse_conf(self):
        if not "args" in self.configuration:
            raise ValueError("No args given in the configuration file")

        for dict_args in self.configuration["args"]:
            if not isinstance(dict_args, dict):
                raise ValueError("Args must be dictionaries")

            short_command = dict_args.get("short", None)
            long_command = dict_args.get("long", None)

            if not short_command and not long_command:
                raise ValueError("No command name was given!")

            add_func_dict_ = {}
            if "help" in dict_args:
                add_func_dict_["help"] = dict_args["help"]
            if "default" in dict_args:
                add_func_dict_["default"] = dict_args["default"]
            if "action" in dict_args:
                add_func_dict_["action"] = dict_args["action"]
            if "type" in dict_args:
                add_func_dict_["type"] = eval(dict_args["type"])
            if "choices" in dict_args:
                add_func_dict_["choices"] = dict_args["choices"]

            if not (short_command and long_command):
                command = (short_command if not long_command else long_command)
                self.parser.add_argument(command, **add_func_dict_)
            else:
                self.parser.add_argument(short_command, long_command, **add_func_dict_)

    def parse_args(self, input_=None):
        if not input_:
            return self.parser.parse_args()
        return self.parser.parse_args(input_)
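# Minimal usage sketch (paths/values are illustrative, taken from the config above):
#   args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
#   args.ngram_size  # -> 2 unless overridden with -n/--ngram-size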
if __name__ == "__main__":
    q = Quadtree(-180, 90, 180, -90)
    hash_ = q.encode((1.2, 1.3))