Skip to content
Snippets Groups Projects
Commit c1530d9e authored by Jacques Fize
Browse files

DEBUG

parent b648cf9e
No related branches found
No related tags found
No related merge requests found
...@@ -134,17 +134,17 @@ data/* ...@@ -134,17 +134,17 @@ data/*
deprecated/* deprecated/*
*.ipynb_checkpoints *.ipynb_checkpoints
notebooks/* notebooks/*
outputs/* outputs*
temp/* temp/*
WikipediaExtract/* WikipediaExtract/*
*.DS_Store *.DS_Store
test_comb.sh test_comb.sh
.vscode/* .vscode*
notes.md notes.md
.idea/* .idea*
.vscode/*
other/* other/*
test* test*
\ No newline at end of file nohup.out
\ No newline at end of file
...@@ -9,7 +9,7 @@ import numpy as np ...@@ -9,7 +9,7 @@ import numpy as np
import geopandas as gpd import geopandas as gpd
# DEEPL module # DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
from keras.models import Model from keras.models import Model
from keras import backend as K from keras import backend as K
from keras.callbacks import ModelCheckpoint from keras.callbacks import ModelCheckpoint
...@@ -79,7 +79,7 @@ ACCURACY_TOLERANCE = args.tolerance_value ...@@ -79,7 +79,7 @@ ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = args.ngram_word2vec_dim WORDVEC_ITER = args.ngram_word2vec_iter
################################################# #################################################
########## FILENAME VARIABLE #################### ########## FILENAME VARIABLE ####################
################################################# #################################################
...@@ -220,7 +220,7 @@ logging.info("Done !") ...@@ -220,7 +220,7 @@ logging.info("Done !")
############################################################################################# #############################################################################################
################################# ENCODE COORDINATES ################################################# ################################# ENCODE COORDINATES ########################################
############################################################################################# #############################################################################################
...@@ -301,20 +301,24 @@ input_2 = Input(shape=(index.max_len,)) ...@@ -301,20 +301,24 @@ input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, embedding_dim,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) embedding_layer = Embedding(num_words, embedding_dim,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
x1 = Bidirectional(LSTM(98))(embedding_layer(input_1)) x1 = embedding_layer(input_1)
x2 = Bidirectional(LSTM(98))(embedding_layer(input_2)) x2 = embedding_layer(input_2)
# Each LSTM learn on a permutation of the input toponyms
x1 = Bidirectional(LSTM(98))(x1)
x2 = Bidirectional(LSTM(98))(x2)
x = concatenate([x1,x2])#,x3]) x = concatenate([x1,x2])#,x3])
x1 = Dense(500,activation="relu")(x) x1 = Dense(500,activation="relu")(x)
#x1 = Dropout(0.3)(x1) x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1) x1 = Dense(500,activation="relu")(x1)
#x1 = Dropout(0.3)(x1) x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x) x2 = Dense(500,activation="relu")(x)
#x2 = Dropout(0.3)(x2) x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2) x2 = Dense(500,activation="relu")(x2)
#x2 = Dropout(0.3)(x2) x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
...@@ -324,14 +328,13 @@ model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#inp ...@@ -324,14 +328,13 @@ model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#inp
model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
############################################################################################# #############################################################################################
################################# TRAINING LAUNCH ########################################### ################################# TRAINING LAUNCH ###########################################
############################################################################################# #############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
history = model.fit(x=[X_1_train,X_2_train], history = model.fit(x=[X_1_train,X_2_train],
y=[y_lon_train,y_lat_train], y=[y_lon_train,y_lat_train],
verbose=True, batch_size=100, verbose=True, batch_size=100,
...@@ -346,5 +349,5 @@ hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN)) ...@@ -346,5 +349,5 @@ hist_df.to_csv("outputs/{0}.csv".format(PREFIX_OUTPUT_FN))
model.save(MODEL_OUTPUT_FN) model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file # Erase Model Checkpoint file
if os.path.exists(output_fn + ".part"): if os.path.exists(MODEL_OUTPUT_FN + ".part"):
os.remove(output_fn + ".part") os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file \ No newline at end of file
...@@ -21,49 +21,58 @@ def read_geonames(file): ...@@ -21,49 +21,58 @@ def read_geonames(file):
geonames data geonames data
""" """
dtypes_dict = { dtypes_dict = {
0: int, # geonameid 0: int, # geonameid
1: str, # name 1: str, # name
2: str, # asciiname 2: str, # asciiname
3: str, # alternatenames 3: str, # alternatenames
4: float, # latitude 4: float, # latitude
5: float, # longitude 5: float, # longitude
6: str, # feature class 6: str, # feature class
7: str, # feature code 7: str, # feature code
8: str, # country code 8: str, # country code
9: str, # cc2 9: str, # cc2
10: str, # admin1 code 10: str, # admin1 code
11: str, # admin2 code 11: str, # admin2 code
12: str, # admin3 code 12: str, # admin3 code
13: str, # admin4 code 13: str, # admin4 code
14: int, # population 14: int, # population
15: str, # elevation 15: str, # elevation
16: int, # dem (digital elevation model) 16: int, # dem (digital elevation model)
17: str, # timezone 17: str, # timezone
18: str # modification date yyyy-MM-dd 18: str, # modification date yyyy-MM-dd
} }
rename_cols = { rename_cols = {
0:"geonameid", # geonameid 0: "geonameid", # geonameid
1:"name", # name 1: "name", # name
2:"asciiname", # asciiname 2: "asciiname", # asciiname
3:"alternatenames", # alternatenames 3: "alternatenames", # alternatenames
4:"latitude", # latitude 4: "latitude", # latitude
5:"longitude", # longitude 5: "longitude", # longitude
6:"feature_class", # feature class 6: "feature_class", # feature class
7:"feature_code", # feature code 7: "feature_code", # feature code
8:"country_code", # country code 8: "country_code", # country code
9:"cc2", # cc2 9: "cc2", # cc2
10:"admin1_code", # admin1 code 10: "admin1_code", # admin1 code
11:"admin2_code", # admin2 code 11: "admin2_code", # admin2 code
12:"admin3_code", # admin3 code 12: "admin3_code", # admin3 code
13:"admin4_code", # admin4 code 13: "admin4_code", # admin4 code
14:"population", # population 14: "population", # population
15:"elevation", # elevation 15: "elevation", # elevation
16:"dem", # dem (digital elevation model) 16: "dem", # dem (digital elevation model)
17:"timezone", # timezone 17: "timezone", # timezone
18:"modification_date" # modification date yyyy-MM-dd 18: "modification_date", # modification date yyyy-MM-dd
} }
data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False) data = pd.read_csv(
data.rename(columns=rename_cols,inplace=True) file,
sep="\t",
header=None,
quoting=3,
dtype=dtypes_dict,
na_values="",
keep_default_na=False,
error_bad_lines=False,
)
data.rename(columns=rename_cols, inplace=True)
return data return data
...@@ -81,10 +90,10 @@ def parse_title_wiki(title_wiki): ...@@ -81,10 +90,10 @@ def parse_title_wiki(title_wiki):
str str
parsed wikipedia title parsed wikipedia title
""" """
return re.sub("\(.*\)","",title_wiki).strip().lower() return re.sub("\(.*\)", "", title_wiki).strip().lower()
def _split(lst,n,complete_chunk_value): def _split(lst, n, complete_chunk_value):
""" """
Split a list into chunk of n-size. Split a list into chunk of n-size.
...@@ -102,17 +111,19 @@ def _split(lst,n,complete_chunk_value): ...@@ -102,17 +111,19 @@ def _split(lst,n,complete_chunk_value):
list list
chunked list chunked list
""" """
chunks = [lst[i:i + n] for i in range(0, len(lst), n)] chunks = [lst[i : i + n] for i in range(0, len(lst), n)]
if not chunks:return chunks if not chunks:
return chunks
if len(chunks[-1]) != n: if len(chunks[-1]) != n:
chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) chunks[-1].extend([complete_chunk_value] * (n - len(chunks[-1])))
return np.array(chunks) return np.array(chunks)
class Chronometer():
class Chronometer:
def __init__(self): def __init__(self):
self.__task_begin_timestamp = {} self.__task_begin_timestamp = {}
def start(self,task_name): def start(self, task_name):
""" """
Start a new task chronometer Start a new task chronometer
...@@ -127,10 +138,12 @@ class Chronometer(): ...@@ -127,10 +138,12 @@ class Chronometer():
if a running task already exists with that name if a running task already exists with that name
""" """
if task_name in self.__task_begin_timestamp: if task_name in self.__task_begin_timestamp:
raise ValueError("A running task exists with the name {0}!".format(task_name)) raise ValueError(
"A running task exists with the name {0}!".format(task_name)
)
self.__task_begin_timestamp[task_name] = time.time() self.__task_begin_timestamp[task_name] = time.time()
def stop(self,task_name): def stop(self, task_name):
""" """
Stop and return the duration of the task Stop and return the duration of the task
...@@ -150,11 +163,14 @@ class Chronometer(): ...@@ -150,11 +163,14 @@ class Chronometer():
if no task exist with the id `task_name` if no task exist with the id `task_name`
""" """
if not task_name in self.__task_begin_timestamp: if not task_name in self.__task_begin_timestamp:
raise ValueError("The {0} task does not exist!".format(task_name)) raise ValueError("The {0} task does not exist!".format(task_name))
duration = time.time() - self.__task_begin_timestamp[task_name] duration = time.time() - self.__task_begin_timestamp[task_name]
del self.__task_begin_timestamp[task_name] del self.__task_begin_timestamp[task_name]
return duration return duration
if __name__ == "__main__": if __name__ == "__main__":
chrono = Chronometer() chrono = Chronometer()
chrono.start("test") chrono.start("test")
...@@ -162,4 +178,4 @@ if __name__ == "__main__": ...@@ -162,4 +178,4 @@ if __name__ == "__main__":
time.sleep(3) time.sleep(3)
print(chrono.stop("test")) print(chrono.stop("test"))
time.sleep(3) time.sleep(3)
print(chrono.stop("test2")) print(chrono.stop("test2"))
\ No newline at end of file
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
{ "long": "--cooc-sample-size", "type": "int", "default": 3 }, { "long": "--cooc-sample-size", "type": "int", "default": 3 },
{"long": "--adjacency-iteration", "type":"int","default":1}, {"long": "--adjacency-iteration", "type":"int","default":1},
{ "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
{ "long": "--ngram-word2vec-dim", "type": "int", "default": 50 }, { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
{ "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
{ "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
{ "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment