diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/argument_train_geocoder.json similarity index 92% rename from parser_config/toponym_combination_embedding_v3.json rename to parser_config/argument_train_geocoder.json index 12bfcefaf50e9be36289e76624bc47d36884df20..3fcd22cc9f3dd749baf89df3b9ecd35f04201479 100644 --- a/parser_config/toponym_combination_embedding_v3.json +++ b/parser_config/argument_train_geocoder.json @@ -16,7 +16,7 @@ { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] }, { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level", "bert"] }, - { "long": "--previous-state", "type": "str", "help": "If the model was trained before, give the path here" } - + { "long": "--previous-state", "type": "str", "help": "If the model was trained before, give the path here" }, + { "long": "--save-best-model", "action": "store_true" } ] } \ No newline at end of file diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json deleted file mode 100644 index 260d6ec129527b76e4f0ff2002a0dd83d4e80e01..0000000000000000000000000000000000000000 --- a/parser_config/toponym_combination_embedding.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "description": "Toponym Combination", - "args": [ - { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." }, - { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." }, - { "short": "-v", "long": "--verbose", "action": "store_true" }, - { "short": "-i", "long": "--inclusion", "action": "store_true" }, - { "short": "-a", "long": "--adjacency", "action": "store_true" }, - { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, - { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"}, - { "long": "--cooc-sample-size", "type": "int", "default": 1 }, - {"long": "--adjacency-iteration", "type":"int","default":1}, - { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 }, - { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 }, - { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, - { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, - { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, - { "long": "--admin_code_1", "default": "None" } - ] -} \ No newline at end of file diff --git a/parser_config/toponym_combination_embedding_v2.json b/parser_config/toponym_combination_embedding_v2.json deleted file mode 100644 index 345c1d7d49f767b0076b11574538c46952bad788..0000000000000000000000000000000000000000 --- a/parser_config/toponym_combination_embedding_v2.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "description": "Toponym Combination", - "args": [ - { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." }, - { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." }, - { "short": "-v", "long": "--verbose", "action": "store_true" }, - { "short": "-i", "long": "--inclusion", "action": "store_true" }, - { "short": "-a", "long": "--adjacency", "action": "store_true" }, - { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, - { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"}, - { "long": "--cooc-sample-size", "type": "int", "default": 1 }, - {"long": "--adjacency-iteration", "type":"int","default":1}, - { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 }, - { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 }, - { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 }, - { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, - { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, - { "long": "--admin_code_1", "default": "None" } - ] -} \ No newline at end of file diff --git a/train_geocoder.py b/train_geocoder.py index b83c3d6dbc4770eed7c86fecd43d622235a94bfc..eaa5193b252bfaf2c3e5b598e24d9ab66378ea72 100644 --- a/train_geocoder.py +++ b/train_geocoder.py @@ -37,7 +37,7 @@ except: print("NO GPU FOUND...") #Â COMMAND ARGS -args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\ +args = ConfigurationReader("parser_config/argument_train_geocoder.json")\ .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split()) # @@ -50,6 +50,7 @@ ACCURACY_TOLERANCE = args.tolerance_value EPOCHS = args.epochs WORDVEC_ITER = args.ngram_word2vec_iter EMBEDDING_DIM = args.dimension +save_best_only = args.save_best_model ################################################# ########## FILENAME VARIABLE #################### ################################################# @@ -201,8 +202,8 @@ print(model.summary()) ################################# TRAINING LAUNCH ########################################### ############################################################################################# -checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, - save_best_only=True, mode='auto', period=1) +checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN , monitor='loss', verbose=1, + save_best_only=save_best_only, mode='auto', period=1) epoch_timer = EpochTimer(HISTORY_FN) @@ -213,8 +214,8 @@ history = model.fit(training_generator,verbose=True, callbacks=[checkpoint,epoch_timer],epochs=EPOCHS) - -model.save(MODEL_OUTPUT_FN) +if not save_best_only: + model.save(MODEL_OUTPUT_FN) #Â Erase Model Checkpoint file if os.path.exists(MODEL_OUTPUT_FN + ".part"): diff --git a/wikipediageocoding.ipynb b/wikipediageocoding.ipynb index 7dceba34a5e6dee0243154dd899fa114a5dfe3de..91dc00b7003a06f915635ab958208601c9022a9c 100644 --- a/wikipediageocoding.ipynb +++ b/wikipediageocoding.ipynb @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [