diff --git a/.gitignore b/.gitignore
index 769c55f833fc02c195e1134d4ebf9d2d061a47e6..58819acb099b9ebc9f084efef5398476b303936b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -139,4 +139,5 @@
 outputs/*
 temp/*
 WikipediaExtract/*
-*.DS_Store
\ No newline at end of file
+*.DS_Store
+test_comb.sh
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 78009e04c7cb1db7b9dd2249904040eb715050b5..1fddfe6967d40e025bcd36c2319ce94df530d8eb 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -1,6 +1,7 @@
 # Base module
 import os
 import sys
+from argparse import ArgumentParser
 
 # Structure
 import pandas as pd
@@ -24,7 +25,7 @@
 from shapely.geometry import Point
 
 # Custom module
 from helpers import read_geonames
-from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex
+from utils import CoordinatesEncoder, zero_one_encoding, NgramIndex, ConfigurationReader
 
 # Visualisation module
@@ -46,34 +47,49 @@ logging.basicConfig(
 )
 chrono = Chronometer()
 
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json").parse_args()
 
-GEONAME_FN = "data/geonamesData/FR.txt"
-GEONAMES_HIERARCHY_FN = "data/geonamesData/hierarchy.txt"
-NGRAM_SIZE = 2
-ACCURACY_TOLERANCE = 0.002
-CONV = False
-LSTM_train = False
+GEONAME_FN = args.geoname_input
+GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+NGRAM_SIZE = args.ngram_size
+ACCURACY_TOLERANCE = args.tolerance_value
+
+CONV, LSTM_train = False, False
+if args.model == "CNN":
+    CONV = True
+else:
+    LSTM_train = True
+
+EPOCHS = args.epochs
 
 # LOAD DATA
+logging.info("Loading Geonames data...")
 geoname_data = read_geonames(GEONAME_FN).fillna("")
 hierarchy_data = pd.read_csv(GEONAMES_HIERARCHY_FN,sep="\t",header=None,names="parentId,childId,type".split(",")).fillna("")
+logging.info("Geonames data loaded!")
 
 # SELECT ENTRIES with feature class A or P (areas and populated places)
 filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only keep areas and populated places
 
 # RETRIEVE INCLUSION RELATIONSHIPS
+logging.info("Retrieving inclusion relationships...")
 geoname2name = dict(filtered["geonameid name".split()].values)
 filter_mask = (hierarchy_data.childId.isin(geoname2name) & hierarchy_data.parentId.isin(geoname2name))
 inclusion_dict = dict(hierarchy_data[filter_mask]["childId parentId".split()].values)
+logging.info("{0} inclusion relationships retrieved!".format(len(inclusion_dict)))
 
 # ENCODE NAMES USING N-GRAM SPLITTING
+logging.info("Encoding toponyms to n-grams...")
 index = NgramIndex(NGRAM_SIZE)
 filtered.name.apply(lambda x : index.split_and_add(x)) # Identify all available n-grams
 filtered["encode_name"] = filtered.name.apply(lambda x : index.encode(x)) # First encoding
 max_len = filtered.encode_name.apply(len).max() # Retrieve the encodings' maximum length
 filtered["encode_name"] = filtered.encode_name.apply(lambda x: index.complete(x,max_len)) # Pad encodings shorter than max_len
 geoname2encodedname = dict(filtered["geonameid encode_name".split()].values) # init a dict with the 'geonameid' --> 'encoded toponym' association
+logging.info("Done!")
 
 # CLEAR RAM
 del hierarchy_data
@@ -96,6 +112,8 @@ del filtered
 
 embedding_dim = 256
 num_words = len(index.index_ngram) # necessary for the embedding matrix
 
+logging.info("Preparing input and output data...")
+
 X_1,X_2,y_lat,y_lon=[],[],[],[]
 X_3 = []
 for geonameId_1,geonameId_2 in inclusion_dict.items():
@@ -119,6 +137,8 @@
 X_3 = np.array(X_3)
 y_lat = np.array(y_lat)
 y_lon = np.array(y_lon)
 
+logging.info("Data prepared!")
+
 def accuracy_at_k(y_true, y_pred):
     """
     Metric used to measure the accuracy of the coordinate prediction. Unlike the standard accuracy metric, we add a tolerance threshold due to the (quasi) impossible
@@ -134,15 +154,19 @@ def accuracy_at_k(y_true, y_pred):
     diff = y_true - y_pred
     fit = tf.where(tf.less(diff,ACCURACY_TOLERANCE))
     return K.size(fit[:,0])/K.size(y_pred), K.size(fit[:,1])/K.size(y_pred)
-    return K.size(fit)/K.size(y_pred)
 
+name = "{0}_{1}_{2}_{3}".format(GEONAME_FN.split("/")[-1], EPOCHS, NGRAM_SIZE, ACCURACY_TOLERANCE)
 
+logging.info("Generating n-gram embedding...")
+embedding_weights = index.get_embedding_layer(geoname2encodedname.values(), dim=embedding_dim)
+logging.info("Embedding generated!")
 
 if LSTM_train:
+    name = "LSTM_" + name
     input_1 = Input(shape=(max_len,))
     input_2 = Input(shape=(max_len,))
     #input_3 = Input(shape=(1,))
 
-    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, trainable=True)
+    embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len, weights=[embedding_weights], trainable=False) # pre-trained n-gram vectors, frozen (was: trainable=True)
 
     x1 = Bidirectional(LSTM(10))(embedding_layer(input_1))
     x2 = Bidirectional(LSTM(10))(embedding_layer(input_2))
@@ -160,9 +184,10 @@ if LSTM_train:
     model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat]) #input_3
     model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
 
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=50,validation_split=0.3)
+    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3)
 
 if CONV :
+    name = "CONV_" + name
     input_1 = Input(shape=(max_len,))
     input_2 = Input(shape=(max_len,))
     #input_3 = Input(shape=(1,))
@@ -194,4 +219,7 @@ if CONV :
     model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat]) #input_3
     model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics=[accuracy_at_k])
 
-    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=50,validation_split=0.3)
+    history = model.fit(x=[X_1,X_2], y=[y_lon,y_lat], verbose=True, batch_size=100, epochs=EPOCHS,validation_split=0.3)
+
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv("outputs/{0}.csv".format(name))
\ No newline at end of file
diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json
new file mode 100644
index 0000000000000000000000000000000000000000..bd8fb0f591165b13fc32989b16281a05c2365bda
--- /dev/null
+++ b/parser_config/toponym_combination_embedding.json
@@ -0,0 +1,12 @@
+{
+    "description": "Toponym Combination",
+    "args": [
+        { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
+        { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames hierarchy file you want to use." },
+        { "short": "-v", "long": "--verbose", "action": "store_true" },
+        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
+        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
+        { "short": "-m", "long": "--model", "choices": ["CNN", "LSTM"], "default": "CNN" }
+    ]
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 7755253ba05d8a6d1d5d5cd931d364bc62747174..b1092025f56271ebf59773a34320337867cef96e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,3 +12,5 @@ gensim
 sklearn
 tensorflow
 keras
+ngram
+shapely
\ No newline at end of file
diff --git a/utils.py b/utils.py
index c92bb2b7ac8990317bd662bf94550fcca2f09eaf..bf7727349e00be0453adf21eca38b5f431690a70 100644
--- a/utils.py
+++ b/utils.py
@@ -7,6 +7,10 @@ from nltk.tokenize import word_tokenize
 import textwrap
 from ngram import NGram
 
+import argparse
+import os
+import json
+
 
 class TokenizerCustom():
@@ -21,6 +25,10 @@ class TokenizerCustom():
         return seqs
 
 class CoordinatesEncoder:
+    """
+    Deprecated!
+
+    """
     def __init__(self,cell_size_lat=0.5,cell_size_lon=0.5):
         self.min_lon = -180
         self.max_lon = -(self.min_lon) # Symmetric
@@ -69,6 +77,7 @@ class Quadtree(object):
 
     def __init__(self,upperleft_x,upperleft_y,bottomright_x,bottomright_y,precision=10,curr_prec=0):
         self.upperleft_x,self.upperleft_y,self.bottomright_x,self.bottomright_y = upperleft_x,upperleft_y,bottomright_x,bottomright_y
+        self.precision = precision
 
         x_r = abs(self.bottomright_x - self.upperleft_x)/2
         y_r = abs(self.upperleft_y - self.bottomright_y)/2
@@ -138,7 +147,8 @@ class Quadtree(object):
             return q.upperleft_x,q.upperleft_y,q.bottomright_x,q.bottomright_y
         return q.decode(hash_[2:])
 
-
+from keras.layers import Embedding
+from gensim.models import Word2Vec
 class NgramIndex():
     def __init__(self,n):
         self.ngram_gen = NGram(N=n)
@@ -169,6 +179,14 @@ class NgramIndex():
         diff = MAX_LEN - len(ngram_encoding)
         ngram_encoding.extend([filling_item]*diff)
         return ngram_encoding
+
+    def get_embedding_layer(self,texts,dim=100):
+        model = Word2Vec([[str(w) for w in t] for t in texts], size=dim, window=5, min_count=1, workers=4) # one "sentence" of n-gram ids per toponym; gensim < 4 uses `size`
+        N = len(self.ngram_index)
+        embedding_matrix = np.zeros((N,dim))
+        for i in range(N):
+            embedding_matrix[i] = model.wv[str(i)] # learned vector for n-gram id i
+        return embedding_matrix
 
 def zero_one_encoding(long,lat):
     return ((long + 180.0 ) / 360.0), ((lat + 90.0 ) / 180.0)
@@ -180,6 +198,57 @@ def _split(lst,n,complete_chunk_value):
     chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
     return np.array(chunks)
 
+
+class ConfigurationReader(object):
+    def __init__(self,configuration_file):
+        if not os.path.exists(configuration_file):
+            raise FileNotFoundError("'{0}' file could not be found!".format(configuration_file))
+
+        self.configuration = json.load(open(configuration_file))
+
+        self.__argparser_desc = ("" if not "description" in self.configuration else self.configuration["description"])
+        self.parser = argparse.ArgumentParser(description=self.__argparser_desc)
+
+        self.parse_conf()
+
+    def parse_conf(self):
+        if not "args" in self.configuration:
+            raise ValueError("No args given in the configuration file")
+
+        for dict_args in self.configuration["args"]:
+            if not isinstance(dict_args,dict):
+                raise ValueError("Args must be dictionaries")
+
+            short_command = dict_args.get("short",None)
+            long_command = dict_args.get("long",None)
+
+            if not short_command and not long_command:
+                raise ValueError("No command name was given!")
+
+            add_func_dict_ = {}
+            if "help" in dict_args:
+                add_func_dict_["help"] = dict_args["help"]
+            if "default" in dict_args:
+                add_func_dict_["default"] = dict_args["default"]
+            if "action" in dict_args:
+                add_func_dict_["action"] = dict_args["action"]
+            if "type" in dict_args:
+                add_func_dict_["type"] = eval(dict_args["type"]) # e.g. "int" -> int
+            if "choices" in dict_args:
+                add_func_dict_["choices"] = dict_args["choices"]
+
+            if short_command and long_command:
+                self.parser.add_argument(short_command, long_command, **add_func_dict_)
+            else:
+                command = (short_command if not long_command else long_command)
+                self.parser.add_argument(command, **add_func_dict_)
+
+    def parse_args(self,input_=None):
+        if not input_:
+            return self.parser.parse_args()
+        return self.parser.parse_args(input_)
+
+
 if __name__ == "__main__":
     q = Quadtree(-180,90,180,-90)
     hash_ = q.encode((1.2,1.3))