From c0d73c7576667d673853d90bd0b40e6dc84e5fda Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Tue, 23 Mar 2021 16:14:52 +0100
Subject: [PATCH] add --save-best-model option to keep only the best model across epochs

---
 ...g_v3.json => argument_train_geocoder.json} |  4 ++--
 .../toponym_combination_embedding.json        | 20 -------------------
 .../toponym_combination_embedding_v2.json     | 20 -------------------
 train_geocoder.py                             | 11 +++++-----
 wikipediageocoding.ipynb                      |  4 ++--
 5 files changed, 10 insertions(+), 49 deletions(-)
 rename parser_config/{toponym_combination_embedding_v3.json => argument_train_geocoder.json} (92%)
 delete mode 100644 parser_config/toponym_combination_embedding.json
 delete mode 100644 parser_config/toponym_combination_embedding_v2.json

diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/argument_train_geocoder.json
similarity index 92%
rename from parser_config/toponym_combination_embedding_v3.json
rename to parser_config/argument_train_geocoder.json
index 12bfcef..3fcd22c 100644
--- a/parser_config/toponym_combination_embedding_v3.json
+++ b/parser_config/argument_train_geocoder.json
@@ -16,7 +16,7 @@
         { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
         { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
         { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level", "bert"] },
-        { "long": "--previous-state", "type": "str", "help": "If the model was trained before, give the path here" }
-
+        { "long": "--previous-state", "type": "str", "help": "If the model was trained before, give the path here" },
+        {  "long": "--save-best-model", "action": "store_true" }
     ]
 }
\ No newline at end of file
diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json
deleted file mode 100644
index 260d6ec..0000000
--- a/parser_config/toponym_combination_embedding.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-    "description": "Toponym Combination",
-    "args": [
-        { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
-        { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." },
-        { "short": "-v", "long": "--verbose", "action": "store_true" },
-        { "short": "-i", "long": "--inclusion", "action": "store_true" },
-        { "short": "-a", "long": "--adjacency", "action": "store_true" },
-        { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
-        { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
-        { "long": "--cooc-sample-size", "type": "int", "default": 1 },
-        {"long": "--adjacency-iteration", "type":"int","default":1},
-        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
-        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
-        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
-        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
-        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-        {  "long": "--admin_code_1", "default": "None" }
-    ]
-}
\ No newline at end of file
diff --git a/parser_config/toponym_combination_embedding_v2.json b/parser_config/toponym_combination_embedding_v2.json
deleted file mode 100644
index 345c1d7..0000000
--- a/parser_config/toponym_combination_embedding_v2.json
+++ /dev/null
@@ -1,20 +0,0 @@
-{
-    "description": "Toponym Combination",
-    "args": [
-        { "short": "geoname_input", "help": "Filepath of the Geonames file you want to use." },
-        { "short": "geoname_hierachy_input", "help": "Filepath of the Geonames file you want to use." },
-        { "short": "-v", "long": "--verbose", "action": "store_true" },
-        { "short": "-i", "long": "--inclusion", "action": "store_true" },
-        { "short": "-a", "long": "--adjacency", "action": "store_true" },
-        { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
-        { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
-        { "long": "--cooc-sample-size", "type": "int", "default": 1 },
-        {"long": "--adjacency-iteration", "type":"int","default":1},
-        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
-        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
-        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
-        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
-        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-        {  "long": "--admin_code_1", "default": "None" }
-    ]
-}
\ No newline at end of file
diff --git a/train_geocoder.py b/train_geocoder.py
index b83c3d6..eaa5193 100644
--- a/train_geocoder.py
+++ b/train_geocoder.py
@@ -37,7 +37,7 @@ except:
     print("NO GPU FOUND...")
 
 #Â COMMAND ARGS
-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
+args = ConfigurationReader("parser_config/argument_train_geocoder.json")\
     .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w  -a -n 4 --ngram-word2vec-iter 1".split())
 
 #
@@ -50,6 +50,7 @@ ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
 WORDVEC_ITER = args.ngram_word2vec_iter
 EMBEDDING_DIM = args.dimension
+save_best_only = args.save_best_model
 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
@@ -201,8 +202,8 @@ print(model.summary())
 ################################# TRAINING LAUNCH ###########################################
 #############################################################################################
 
-checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
-    save_best_only=True, mode='auto', period=1)
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN , monitor='loss', verbose=1,
+    save_best_only=save_best_only, mode='auto', period=1)
 
 epoch_timer = EpochTimer(HISTORY_FN)
 
@@ -213,8 +214,8 @@ history = model.fit(training_generator,verbose=True,
                     callbacks=[checkpoint,epoch_timer],epochs=EPOCHS)
 
 
-
-model.save(MODEL_OUTPUT_FN)
+if not save_best_only:
+    model.save(MODEL_OUTPUT_FN)
 
 #Â Erase Model Checkpoint file
 if os.path.exists(MODEL_OUTPUT_FN + ".part"):
diff --git a/wikipediageocoding.ipynb b/wikipediageocoding.ipynb
index 7dceba3..91dc00b 100644
--- a/wikipediageocoding.ipynb
+++ b/wikipediageocoding.ipynb
@@ -27,7 +27,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -147,7 +147,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
-- 
GitLab