DEBUG

c1530d9e · Jacques Fize · b648cf9e · c1530d9e · c1530d9e · c1530d9e
Commit c1530d9e authored 5 years ago by Jacques Fize
--- a/.gitignore
+++ b/.gitignore
@@ -134,17 +134,17 @@ data/*
 deprecated/*
 *.ipynb_checkpoints
 notebooks/*
-outputs/*
+outputs*
 temp/*
 WikipediaExtract/*
 *.DS_Store
 test_comb.sh
-.vscode/*
+.vscode*
 notes.md
-.idea/*
+.idea*
-.vscode/*
 other/*
 test*
\ No newline at end of file
+nohup.out
\ No newline at end of file
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -9,7 +9,7 @@ import numpy as np
 import geopandas as gpd
 # DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
 from keras.models import Model
 from keras import backend as K
 from keras.callbacks import ModelCheckpoint
@@ -79,7 +79,7 @@ ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
 ITER_ADJACENCY = args.adjacency_iteration
 COOC_SAMPLING_NUMBER = args.cooc_sample_size
-WORDVEC_ITER = args.ngram_word2vec_dim
+WORDVEC_ITER = args.ngram_word2vec_iter
 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
@@ -220,7 +220,7 @@ logging.info("Done !")
 #############################################################################################
-################################# ENCODE COORDINATES #################################################
+################################# ENCODE COORDINATES ########################################
 #############################################################################################
@@ -301,20 +301,24 @@ input_2 = Input(shape=(index.max_len,))
 embedding_layer = Embedding(num_words, embedding_dim,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
-x1 = Bidirectional(LSTM(98))(embedding_layer(input_1))
+x1 = embedding_layer(input_1)
-x2 = Bidirectional(LSTM(98))(embedding_layer(input_2))
+x2 = embedding_layer(input_2)
+# Each LSTM learns on a permutation of the input toponyms
+x1 = Bidirectional(LSTM(98))(x1)
+x2 = Bidirectional(LSTM(98))(x2)
 x = concatenate([x1,x2])#,x3])
 x1 = Dense(500,activation="relu")(x)
-#x1 = Dropout(0.3)(x1)
+x1 = Dropout(0.3)(x1)
 x1 = Dense(500,activation="relu")(x1)
-#x1 = Dropout(0.3)(x1)
+x1 = Dropout(0.3)(x1)
 x2 = Dense(500,activation="relu")(x)
-#x2 = Dropout(0.3)(x2)
+x2 = Dropout(0.3)(x2)
 x2 = Dense(500,activation="relu")(x2)
-#x2 = Dropout(0.3)(x2)
+x2 = Dropout(0.3)(x2)
 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
@@ -324,14 +328,13 @@ model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#inp
 model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
-checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
-    save_best_only=True, mode='auto', period=1)
 #############################################################################################
 ################################# TRAINING LAUNCH ###########################################
 #############################################################################################
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
 history = model.fit(x=[X_1_train,X_2_train],
    y=[y_lon_train,y_lat_train],
    verbose=True, batch_size=100,
@@ -346,5 +349,5 @@ hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN))
 model.save(MODEL_OUTPUT_FN)
 # Erase Model Checkpoint file
-if os.path.exists(output_fn + ".part"):
+if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    os.remove(output_fn + ".part")
+    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
--- a/helpers.py
+++ b/helpers.py
@@ -21,49 +21,58 @@ def read_geonames(file):
        geonames data
    """
    dtypes_dict = {
-    0: int, # geonameid
+        0: int,  # geonameid
-    1: str,  # name
+        1: str,  # name
-    2: str,  # asciiname
+        2: str,  # asciiname
-    3: str,  # alternatenames
+        3: str,  # alternatenames
-    4: float, # latitude
+        4: float,  # latitude
-    5: float, # longitude
+        5: float,  # longitude
-    6: str, # feature class
+        6: str,  # feature class
-    7: str, # feature code
+        7: str,  # feature code
-    8: str, # country code
+        8: str,  # country code
-    9: str, # cc2
+        9: str,  # cc2
-    10: str, # admin1 code
+        10: str,  # admin1 code
-    11: str, # admin2 code
+        11: str,  # admin2 code
-    12: str, # admin3 code
+        12: str,  # admin3 code
-    13: str, # admin4 code
+        13: str,  # admin4 code
-    14: int, # population
+        14: int,  # population
-    15: str, # elevation
+        15: str,  # elevation
-    16: int, # dem (digital elevation model)
+        16: int,  # dem (digital elevation model)
-    17: str, # timezone
+        17: str,  # timezone
-    18: str # modification date yyyy-MM-dd
+        18: str,  # modification date yyyy-MM-dd
    }
    rename_cols = {
-    0:"geonameid", # geonameid
+        0: "geonameid",  # geonameid
-    1:"name",  # name
+        1: "name",  # name
-    2:"asciiname",  # asciiname
+        2: "asciiname",  # asciiname
-    3:"alternatenames",  # alternatenames
+        3: "alternatenames",  # alternatenames
-    4:"latitude", # latitude
+        4: "latitude",  # latitude
-    5:"longitude", # longitude
+        5: "longitude",  # longitude
-    6:"feature_class", # feature class
+        6: "feature_class",  # feature class
-    7:"feature_code", # feature code
+        7: "feature_code",  # feature code
-    8:"country_code", # country code
+        8: "country_code",  # country code
-    9:"cc2", # cc2
+        9: "cc2",  # cc2
-    10:"admin1_code", # admin1 code
+        10: "admin1_code",  # admin1 code
-    11:"admin2_code", # admin2 code
+        11: "admin2_code",  # admin2 code
-    12:"admin3_code", # admin3 code
+        12: "admin3_code",  # admin3 code
-    13:"admin4_code", # admin4 code
+        13: "admin4_code",  # admin4 code
-    14:"population", # population
+        14: "population",  # population
-    15:"elevation", # elevation
+        15: "elevation",  # elevation
-    16:"dem", # dem (digital elevation model)
+        16: "dem",  # dem (digital elevation model)
-    17:"timezone", # timezone
+        17: "timezone",  # timezone
-    18:"modification_date" # modification date yyyy-MM-dd
+        18: "modification_date",  # modification date yyyy-MM-dd
    }
-    data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
+    data = pd.read_csv(
-    data.rename(columns=rename_cols,inplace=True)
+        file,
+        sep="\t",
+        header=None,
+        quoting=3,
+        dtype=dtypes_dict,
+        na_values="",
+        keep_default_na=False,
+        error_bad_lines=False,
+    )
+    data.rename(columns=rename_cols, inplace=True)
    return data
@@ -81,10 +90,10 @@ def parse_title_wiki(title_wiki):
    str
        parsed wikipedia title
    """
-    return re.sub("\(.*\)","",title_wiki).strip().lower()
+    return re.sub("\(.*\)", "", title_wiki).strip().lower()
-def _split(lst,n,complete_chunk_value):
+def _split(lst, n, complete_chunk_value):
    """
    Split a list into chunks of size n.
@@ -102,17 +111,19 @@ def _split(lst,n,complete_chunk_value):
    list
        chunked list
    """
-    chunks = [lst[i:i + n] for i in range(0, len(lst), n)]
+    chunks = [lst[i : i + n] for i in range(0, len(lst), n)]
-    if not chunks:return chunks
+    if not chunks:
+        return chunks
    if len(chunks[-1]) != n:
-        chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1])))
+        chunks[-1].extend([complete_chunk_value] * (n - len(chunks[-1])))
    return np.array(chunks)
-class Chronometer():
+class Chronometer:
    def __init__(self):
        self.__task_begin_timestamp = {}
-    def start(self,task_name):
+    def start(self, task_name):
        """
        Start a new task chronometer
@@ -127,10 +138,12 @@ class Chronometer():
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
-            raise ValueError("A running task exists with the name {0}!".format(task_name))
+            raise ValueError(
+                "A running task exists with the name {0}!".format(task_name)
+            )
        self.__task_begin_timestamp[task_name] = time.time()
-    def stop(self,task_name):
+    def stop(self, task_name):
        """
        Stop and return the duration of the task
@@ -150,11 +163,14 @@ class Chronometer():
            if no task exists with the id `task_name`
        """
        if not task_name in self.__task_begin_timestamp:
-             raise ValueError("The {0} task does not exist!".format(task_name))
+            raise ValueError("The {0} task does not exist!".format(task_name))
        duration = time.time() - self.__task_begin_timestamp[task_name]
        del self.__task_begin_timestamp[task_name]
        return duration
 if __name__ == "__main__":
    chrono = Chronometer()
    chrono.start("test")
@@ -162,4 +178,4 @@ if __name__ == "__main__":
    time.sleep(3)
    print(chrono.stop("test"))
    time.sleep(3)
    print(chrono.stop("test2"))
\ No newline at end of file
--- a/parser_config/toponym_combination_embedding.json
+++ b/parser_config/toponym_combination_embedding.json
@@ -10,7 +10,7 @@
        { "long": "--cooc-sample-size", "type": "int", "default": 3 },
        {"long": "--adjacency-iteration", "type":"int","default":1},
        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
-        { "long": "--ngram-word2vec-dim", "type": "int", "default": 50 },
+        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },