diff --git a/helpers.py b/helpers.py
index b093f62f25a9dce7913cca5373141f13f1753554..4bfee0e45f20a618bfb19046260ca2946ac0b025 100644
--- a/helpers.py
+++ b/helpers.py
@@ -172,13 +172,19 @@ class Chronometer:
 
 from keras.callbacks import Callback
+import pandas as pd
 import time
+import os
 
 class EpochTimer(Callback):
     def __init__(self,log_filename):
         self.epoch = 0
         self.timer = time.time()
-        self.output = open(log_filename,'w')
+        if os.path.exists(log_filename):
+            self.output = open(log_filename,'a')
+            self.epoch = pd.read_csv(log_filename).Epoch.max()
+        else:
+            self.output = open(log_filename,'w')
         self.output.write("{0},{1}\n".format("Epoch","Execution Time"))
         self.output.flush()
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
index 507e9c5123fcf2df11ff8c7e5348f3a50ce32dd4..12bfcefaf50e9be36289e76624bc47d36884df20 100644
--- a/parser_config/toponym_combination_embedding_v3.json
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -15,6 +15,8 @@
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
     { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
     { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
-    { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level", "bert"] }
+    { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level", "bert"] },
+    { "long": "--previous-state", "type": "str", "help": "If the model was trained before, give the path here" }
+
     ]
 }
\ No newline at end of file
diff --git a/train_geocoder_v2.py b/train_geocoder_v2.py
index b44fada124eba4215d899f4350a68d1719d52956..4b7bfc3c94a20e19b44cac248b98538c44774ce2 100644
--- a/train_geocoder_v2.py
+++ b/train_geocoder_v2.py
@@ -61,6 +61,7 @@
 DATASET_NAME = args.dataset_name
 PREFIX_OUTPUT_FN = DATASET_NAME
 PREFIX_OUTPUT_FN+="_{0}".format(NGRAM_SIZE)
+EMBEDDING_FN = "outputs/{0}_embedding.npy".format(PREFIX_OUTPUT_FN)
 PREFIX_OUTPUT_FN+="_{0}".format(EPOCHS)
 
 if args.adjacency:
@@ -74,6 +75,7 @@
 MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
 INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
 HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
+
 #############################################################################################
 ################################# LOAD DATA #################################################
 #############################################################################################
@@ -121,9 +123,16 @@
 logging.info("Done !")
 #############################################################################################
 ################################# NGRAM EMBEDDINGS ##########################################
 #############################################################################################
-logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer([index.encode(p) for p in np.concatenate((pairs_of_toponym.toponym.unique(),pairs_of_toponym.toponym_context.unique()))],dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
-logging.info("Embedding generated !")
+
+if os.path.exists(EMBEDDING_FN):
+    logging.info("Load previous N-GRAM Embedding...")
+    embedding_weights = np.load(EMBEDDING_FN)
+    logging.info("Embedding loaded !")
") +else: + logging.info("Generating N-GRAM Embedding...") + embedding_weights = index.get_embedding_layer([index.encode(p) for p in np.concatenate((pairs_of_toponym.toponym.unique(),pairs_of_toponym.toponym_context.unique()))],dim= EMBEDDING_DIM,iter=WORDVEC_ITER) + np.save(EMBEDDING_FN,embedding_weights) + logging.info("Embedding generated !") ############################################################################################# ################################# BUILD TRAIN/TEST DATASETS ################################# @@ -132,31 +141,6 @@ logging.info("Preparing Input and Output data...") training_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "train"],index) validation_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "test"],index) -# X_1_train,X_2_train=[],[] -# X_1_test,X_2_test=[],[] -# y_train,y_test = [],[] - -# for couple in pairs_of_toponym["toponym toponym_context split longitude latitude".split()].itertuples(): -# top,top_c,split_ = couple[1], couple[2], couple[3] -# coord = zero_one_encoding(couple[-2],couple[-1]) # 0 and 1 encoding -# enc_top, enc_top_c = index.encode(top),index.encode(top_c) -# if split_ == "train": -# X_1_train.append(enc_top) -# X_2_train.append(enc_top_c) -# y_train.append(list(coord)) -# else: -# X_1_test.append(enc_top) -# X_2_test.append(enc_top_c) -# y_test.append(list(coord)) - -# # "NUMPYZE" inputs and output lists -# X_1_train = np.array(X_1_train) -# X_2_train = np.array(X_2_train) -# y_train = np.array(y_train) - -# X_1_test = np.array(X_1_test) -# X_2_test = np.array(X_2_test) -# y_test = np.array(y_test) logging.info("Data prepared !") @@ -178,31 +162,38 @@ embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len, x1 = embedding_layer(input_1) x2 = embedding_layer(input_2) +if not args.previous_state: #Â Each LSTM learn on a permutation of the input toponyms -if args.lstm_layer == 2: - x1 = Bidirectional(LSTM(100))(x1) - x2 = Bidirectional(LSTM(100))(x2) - x = concatenate([x1,x2]) -else: - lstm_unique_layer = Bidirectional(LSTM(100)) - x1 = lstm_unique_layer(x1) - x2 = lstm_unique_layer(x2) - x = concatenate([x1,x2]) + if args.lstm_layer == 2: + x1 = Bidirectional(LSTM(100))(x1) + x2 = Bidirectional(LSTM(100))(x2) + x = concatenate([x1,x2]) + else: + lstm_unique_layer = Bidirectional(LSTM(100)) + x1 = lstm_unique_layer(x1) + x2 = lstm_unique_layer(x2) + x = concatenate([x1,x2]) -x1 = Dense(500,activation="relu")(x) -x1 = Dense(500,activation="relu")(x1) + x1 = Dense(500,activation="relu")(x) + x1 = Dense(500,activation="relu")(x1) -x2 = Dense(500,activation="relu")(x) -x2 = Dense(500,activation="relu")(x2) + x2 = Dense(500,activation="relu")(x) + x2 = Dense(500,activation="relu")(x2) -output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) -output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) + output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) + output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) -output_coord = concatenate([output_lon,output_lat],name="output_coord") + output_coord = concatenate([output_lon,output_lat],name="output_coord") -model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 -model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)}) + model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 + model.compile(loss={"output_coord":haversine_tf_1circle}, 
+else:
+    if not os.path.exists(args.previous_state):
+        print("Model previous state was not found ! ")
+        sys.exit(1)
+    print("Load Previous state of the model...")
+    model = tf.keras.models.load_model(args.previous_state,custom_objects={"haversine_tf_1circle":haversine_tf_1circle,"compute_metric":accuracy_k(100)})
 
 print("Neural Network Architecture : ")
 print(model.summary())
 #############################################################################################
@@ -220,12 +211,6 @@ history = model.fit(training_generator,verbose=True,
                     validation_data=validation_generator,
                     callbacks=[checkpoint,epoch_timer],epochs=EPOCHS)
 
-# history = model.fit(x=[X_1_train,X_2_train],
-#                     y=y_train,
-#                     verbose=True, batch_size=100,
-#                     epochs=EPOCHS,
-#                     validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
-#                     callbacks=[checkpoint,epoch_timer])
 
 hist_df = pd.DataFrame(history.history)
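
Note on the resume logic: below is a minimal, self-contained sketch of the two mechanisms this patch relies on, the append-mode epoch log (helpers.py) and the on-disk embedding cache (train_geocoder_v2.py). The class and function names, the on_epoch_end body, and the file paths are illustrative assumptions, not code taken from the patch.

import os
import time

import numpy as np
import pandas as pd
from keras.callbacks import Callback


class ResumableEpochTimer(Callback):
    """Epoch timer that appends to an existing CSV log instead of overwriting it."""

    def __init__(self, log_filename):
        super().__init__()
        self.timer = time.time()
        if os.path.exists(log_filename):
            # Resume: keep the existing log and continue the epoch count from it.
            self.epoch = int(pd.read_csv(log_filename).Epoch.max())
            self.output = open(log_filename, "a")
        else:
            self.epoch = 0
            self.output = open(log_filename, "w")
            self.output.write("Epoch,Execution Time\n")  # header only for a new file
        self.output.flush()

    def on_epoch_end(self, epoch, logs=None):
        # Illustrative body; the original on_epoch_end is not shown in this diff.
        self.epoch += 1
        self.output.write("{0},{1}\n".format(self.epoch, time.time() - self.timer))
        self.output.flush()
        self.timer = time.time()


def load_or_build_embedding(path, build_fn):
    """Cache an embedding matrix on disk, mirroring the EMBEDDING_FN logic above."""
    if os.path.exists(path):
        return np.load(path)
    weights = build_fn()
    np.save(path, weights)
    return weights

When a run is resumed this way, passing initial_epoch to model.fit keeps Keras' epoch numbering consistent with the appended log.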