diff --git a/.gitignore b/.gitignore
index 96d316e916ade4b9b4e4081aea3e4a2ed935b257..f3c43d12fee7bb9a5da3fcd97157bedfd7f1dcf7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -134,17 +134,17 @@ data/*
 deprecated/*
 *.ipynb_checkpoints
 notebooks/*
-outputs/*
+outputs*
 temp/*
 WikipediaExtract/*
 *.DS_Store
 test_comb.sh
-.vscode/*
+.vscode*
 notes.md
-.idea/*
-.vscode/*
+.idea*
 other/*
-test*
\ No newline at end of file
+test*
+nohup.out
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 025c586caa830963928ba0c47e0a816127966d88..bdfe83139d3c243add7c676c292ddcc88f4c71bd 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -9,7 +9,7 @@ import numpy as np
 import geopandas as gpd
 
 # DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
 from keras.models import Model
 from keras import backend as K
 from keras.callbacks import ModelCheckpoint
@@ -79,7 +79,7 @@ ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
 ITER_ADJACENCY = args.adjacency_iteration
 COOC_SAMPLING_NUMBER = args.cooc_sample_size
-WORDVEC_ITER = args.ngram_word2vec_dim
+WORDVEC_ITER = args.ngram_word2vec_iter
 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
@@ -220,7 +220,7 @@ logging.info("Done !")
 
 
 #############################################################################################
-################################# ENCODE COORDINATES #################################################
+################################# ENCODE COORDINATES ########################################
 #############################################################################################
 
 
@@ -301,20 +301,24 @@ input_2 = Input(shape=(index.max_len,))
 
 embedding_layer = Embedding(num_words, embedding_dim,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
 
-x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
-x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))
+x1 = embedding_layer(input_1)
+x2 = embedding_layer(input_2)
+
+# Each LSTM learns from one permutation of the input toponyms
+x1 = Bidirectional(LSTM(98))(x1)
+x2 = Bidirectional(LSTM(98))(x2)
 
 x = concatenate([x1,x2])#,x3])
 
 x1 = Dense(500,activation="relu")(x)
-#x1 = Dropout(0.3)(x1)
+x1 = Dropout(0.3)(x1)
 x1 = Dense(500,activation="relu")(x1)
-#x1 = Dropout(0.3)(x1)
+x1 = Dropout(0.3)(x1)
 
 x2 = Dense(500,activation="relu")(x)
-#x2 = Dropout(0.3)(x2)
+x2 = Dropout(0.3)(x2)
 x2 = Dense(500,activation="relu")(x2)
-#x2 = Dropout(0.3)(x2)
+x2 = Dropout(0.3)(x2)
 
 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
@@ -324,14 +328,13 @@ model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#inp
 
 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
 
-checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
-    save_best_only=True, mode='auto', period=1)
-
-
 #############################################################################################
 ################################# TRAINING LAUNCH ###########################################
 #############################################################################################
 
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
+
 history = model.fit(x=[X_1_train,X_2_train],
     y=[y_lon_train,y_lat_train],
     verbose=True, batch_size=100,
@@ -346,5 +349,5 @@ hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN))
 model.save(MODEL_OUTPUT_FN)
 
 # Erase Model Checkpoint file
-if os.path.exists(output_fn + ".part"):
-    os.remove(output_fn + ".part")
\ No newline at end of file
+if os.path.exists(MODEL_OUTPUT_FN + ".part"):
+    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
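Note on the model hunks above: the embedding lookup is now separated from the recurrent layers, and the four Dropout layers in the dense towers are re-enabled instead of commented out. Below is a minimal standalone sketch of the resulting topology; the real num_words, embedding_dim, max_len and embedding_weights are computed earlier in combination_embeddings.py and are not part of this diff, so the values here are hypothetical stand-ins.

```python
# Sketch of the revised two-branch architecture (assumed sizes; the tower()
# helper is illustrative only and does not exist in the repo).
import numpy as np
from keras.layers import (Dense, Input, Embedding, concatenate,
                          Bidirectional, LSTM, Dropout)
from keras.models import Model

num_words, embedding_dim, max_len = 10000, 100, 20  # hypothetical stand-ins
embedding_weights = np.random.rand(num_words, embedding_dim)

input_1 = Input(shape=(max_len,))
input_2 = Input(shape=(max_len,))

# Shared frozen n-gram embedding, applied to each toponym input
embedding_layer = Embedding(num_words, embedding_dim, input_length=max_len,
                            weights=[embedding_weights], trainable=False)

# One bidirectional LSTM per toponym permutation, then concatenation
x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))
x = concatenate([x1, x2])

def tower(t):
    # Dense tower with the re-enabled Dropout regularisation
    t = Dense(500, activation="relu")(t)
    t = Dropout(0.3)(t)
    t = Dense(500, activation="relu")(t)
    return Dropout(0.3)(t)

# Separate towers feed the longitude and latitude heads
output_lon = Dense(1, activation="sigmoid", name="Output_LON")(tower(x))
output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(tower(x))

model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])
model.compile(loss=["mean_squared_error", "mean_squared_error"], optimizer="adam")
model.summary()
```

Relocating the ModelCheckpoint construction under the TRAINING LAUNCH banner is purely cosmetic; the substantive fix in that region is the cleanup block, which previously referenced the undefined output_fn and now removes MODEL_OUTPUT_FN + ".part".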
diff --git a/helpers.py b/helpers.py
index 0a47034ec466467011042648e9b43a1ccc4a7187..ce17f9e4c6e496da525569088af7c93d9ce54e90 100644
--- a/helpers.py
+++ b/helpers.py
@@ -21,49 +21,58 @@ def read_geonames(file):
     """
     geonames data
     """
     dtypes_dict = {
-        0: int, # geonameid
-        1: str, # name
-        2: str, # asciiname
-        3: str, # alternatenames
-        4: float, # latitude
-        5: float, # longitude
-        6: str, # feature class
-        7: str, # feature code
-        8: str, # country code
-        9: str, # cc2
-        10: str, # admin1 code
-        11: str, # admin2 code
-        12: str, # admin3 code
-        13: str, # admin4 code
-        14: int, # population
-        15: str, # elevation
-        16: int, # dem (digital elevation model)
-        17: str, # timezone
-        18: str # modification date yyyy-MM-dd
+        0: int,  # geonameid
+        1: str,  # name
+        2: str,  # asciiname
+        3: str,  # alternatenames
+        4: float,  # latitude
+        5: float,  # longitude
+        6: str,  # feature class
+        7: str,  # feature code
+        8: str,  # country code
+        9: str,  # cc2
+        10: str,  # admin1 code
+        11: str,  # admin2 code
+        12: str,  # admin3 code
+        13: str,  # admin4 code
+        14: int,  # population
+        15: str,  # elevation
+        16: int,  # dem (digital elevation model)
+        17: str,  # timezone
+        18: str,  # modification date yyyy-MM-dd
     }
 
     rename_cols = {
-        0:"geonameid", # geonameid
-        1:"name", # name
-        2:"asciiname", # asciiname
-        3:"alternatenames", # alternatenames
-        4:"latitude", # latitude
-        5:"longitude", # longitude
-        6:"feature_class", # feature class
-        7:"feature_code", # feature code
-        8:"country_code", # country code
-        9:"cc2", # cc2
-        10:"admin1_code", # admin1 code
-        11:"admin2_code", # admin2 code
-        12:"admin3_code", # admin3 code
-        13:"admin4_code", # admin4 code
-        14:"population", # population
-        15:"elevation", # elevation
-        16:"dem", # dem (digital elevation model)
-        17:"timezone", # timezone
-        18:"modification_date" # modification date yyyy-MM-dd
+        0: "geonameid",  # geonameid
+        1: "name",  # name
+        2: "asciiname",  # asciiname
+        3: "alternatenames",  # alternatenames
+        4: "latitude",  # latitude
+        5: "longitude",  # longitude
+        6: "feature_class",  # feature class
+        7: "feature_code",  # feature code
+        8: "country_code",  # country code
+        9: "cc2",  # cc2
+        10: "admin1_code",  # admin1 code
+        11: "admin2_code",  # admin2 code
+        12: "admin3_code",  # admin3 code
+        13: "admin4_code",  # admin4 code
+        14: "population",  # population
+        15: "elevation",  # elevation
+        16: "dem",  # dem (digital elevation model)
+        17: "timezone",  # timezone
+        18: "modification_date",  # modification date yyyy-MM-dd
     }
-    data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
-    data.rename(columns=rename_cols,inplace=True)
+    data = pd.read_csv(
+        file,
+        sep="\t",
+        header=None,
+        quoting=3,
+        dtype=dtypes_dict,
+        na_values="",
+        keep_default_na=False,
+        error_bad_lines=False,
+    )
+    data.rename(columns=rename_cols, inplace=True)
     return data
@@ -81,10 +90,10 @@ def parse_title_wiki(title_wiki):
     str
         parsed wikipedia title
     """
-    return re.sub("\(.*\)","",title_wiki).strip().lower()
+    return re.sub("\(.*\)", "", title_wiki).strip().lower()
 
 
-def _split(lst,n,complete_chunk_value):
+def _split(lst, n, complete_chunk_value):
     """
     Split a list into chunk of n-size.
 
@@ -102,17 +111,19 @@ def _split(lst, n, complete_chunk_value):
     list
         chunked list
     """
-    chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
-    if not chunks:return chunks
+    chunks = [lst[i : i + n] for i in range(0, len(lst), n)]
+    if not chunks:
+        return chunks
     if len(chunks[-1]) != n:
-        chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
+        chunks[-1].extend([complete_chunk_value] * (n - len(chunks[-1])))
     return np.array(chunks)
 
-class Chronometer():
+
+class Chronometer:
     def __init__(self):
         self.__task_begin_timestamp = {}
 
-    def start(self,task_name):
+    def start(self, task_name):
         """
         Start a new task chronometer
 
@@ -127,10 +138,12 @@ class Chronometer():
         if a running task already exists with that name
         """
         if task_name in self.__task_begin_timestamp:
-            raise ValueError("A running task exists with the name {0}!".format(task_name))
+            raise ValueError(
+                "A running task exists with the name {0}!".format(task_name)
+            )
         self.__task_begin_timestamp[task_name] = time.time()
 
-    def stop(self,task_name):
+    def stop(self, task_name):
         """
         Stop and return the duration of the task
 
@@ -150,11 +163,14 @@ class Chronometer():
         if no task exist with the id `task_name`
         """
         if not task_name in self.__task_begin_timestamp:
-            raise ValueError("The {0} task does not exist!".format(task_name))
+            raise ValueError("The {0} task does not exist!".format(task_name))
+
         duration = time.time() - self.__task_begin_timestamp[task_name]
         del self.__task_begin_timestamp[task_name]
+
         return duration
 
+
 if __name__ == "__main__":
     chrono = Chronometer()
     chrono.start("test")
@@ -162,4 +178,4 @@ if __name__ == "__main__":
     time.sleep(3)
     print(chrono.stop("test"))
     time.sleep(3)
-    print(chrono.stop("test2"))
\ No newline at end of file
+    print(chrono.stop("test2"))
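The helpers.py hunks are a behaviour-preserving reformat (Black-style layout, and the one-liner `if not chunks:return chunks` split over two lines). For quick reference, here is a self-contained copy of the chunking helper with a check of its padding semantics; the sample list and fill value are arbitrary.

```python
# Copy of the reformatted _split from helpers.py, with a usage example.
import numpy as np

def _split(lst, n, complete_chunk_value):
    # Chunk lst into n-sized pieces, padding the last chunk if it is short
    chunks = [lst[i : i + n] for i in range(0, len(lst), n)]
    if not chunks:
        return chunks
    if len(chunks[-1]) != n:
        chunks[-1].extend([complete_chunk_value] * (n - len(chunks[-1])))
    return np.array(chunks)

print(_split([1, 2, 3, 4, 5], 2, 0))
# [[1 2]
#  [3 4]
#  [5 0]]
```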
title """ - return re.sub("\(.*\)","",title_wiki).strip().lower() + return re.sub("\(.*\)", "", title_wiki).strip().lower() -def _split(lst,n,complete_chunk_value): +def _split(lst, n, complete_chunk_value): """ Split a list into chunk of n-size. @@ -102,17 +111,19 @@ def _split(lst,n,complete_chunk_value): list chunked list """ - chunks = [lst[i:i + n] for i in range(0, len(lst), n)] - if not chunks:return chunks + chunks = [lst[i : i + n] for i in range(0, len(lst), n)] + if not chunks: + return chunks if len(chunks[-1]) != n: - chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) + chunks[-1].extend([complete_chunk_value] * (n - len(chunks[-1]))) return np.array(chunks) -class Chronometer(): + +class Chronometer: def __init__(self): self.__task_begin_timestamp = {} - def start(self,task_name): + def start(self, task_name): """ Start a new task chronometer @@ -127,10 +138,12 @@ class Chronometer(): if a running task already exists with that name """ if task_name in self.__task_begin_timestamp: - raise ValueError("A running task exists with the name {0}!".format(task_name)) + raise ValueError( + "A running task exists with the name {0}!".format(task_name) + ) self.__task_begin_timestamp[task_name] = time.time() - def stop(self,task_name): + def stop(self, task_name): """ Stop and return the duration of the task @@ -150,11 +163,14 @@ class Chronometer(): if no task exist with the id `task_name` """ if not task_name in self.__task_begin_timestamp: - raise ValueError("The {0} task does not exist!".format(task_name)) + raise ValueError("The {0} task does not exist!".format(task_name)) + duration = time.time() - self.__task_begin_timestamp[task_name] del self.__task_begin_timestamp[task_name] + return duration + if __name__ == "__main__": chrono = Chronometer() chrono.start("test") @@ -162,4 +178,4 @@ if __name__ == "__main__": time.sleep(3) print(chrono.stop("test")) time.sleep(3) - print(chrono.stop("test2")) \ No newline at end of file + print(chrono.stop("test2")) diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json index 7f57c885d5149a24db6e7830d9d8fef249f05227..9828f53c4a9e53cc497276200f8eec4240a91616 100644 --- a/parser_config/toponym_combination_embedding.json +++ b/parser_config/toponym_combination_embedding.json @@ -10,7 +10,7 @@ { "long": "--cooc-sample-size", "type": "int", "default": 3 }, {"long": "--adjacency-iteration", "type":"int","default":1}, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, - { "long": "--ngram-word2vec-dim", "type": "int", "default": 50 }, + { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },