From 57a7bee41a678ac18503b0e81eaea69f427b1124 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 9 Nov 2020 17:08:37 +0100
Subject: [PATCH] Rename WordIndex mappings to ngram_index, add --tokenization-method option, guard GPU setup

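Rename WordIndex's internal mappings from word_index/index_word to
ngram_index/index_ngram so the class mirrors NgramIndex and the two can
be used interchangeably. Expose the choice between them through a new
--tokenization-method option (char-level n-grams by default, word-level
tokens via WordIndex), and guard the GPU memory-growth setup so training
still starts on CPU-only machines.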
---
 lib/word_index.py                             | 22 +++++++++----------
 .../toponym_combination_embedding_v3.json     |  5 +++--
 train_geocoder_v2.py                          | 10 +++++++--
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/lib/word_index.py b/lib/word_index.py
index 974a8ab..98e66f8 100644
--- a/lib/word_index.py
+++ b/lib/word_index.py
@@ -20,8 +20,8 @@ class WordIndex():
         loaded : bool
             if loaded from external file
         """
-        self.word_index = {"":0}
-        self.index_word = {0:""}
+        self.ngram_index = {"":0}
+        self.index_ngram = {0:""}
         self.cpt = 0
         self.max_len = 0
 
@@ -49,10 +49,10 @@ class WordIndex():
         ngram : str
             ngram
         """
-        if not subword in self.word_index:
+        if not subword in self.ngram_index:
             self.cpt+=1
-            self.word_index[subword]=self.cpt
-            self.index_word[self.cpt]=subword
+            self.ngram_index[subword]=self.cpt
+            self.index_ngram[self.cpt]=subword
         
 
     def encode(self,word):
@@ -72,10 +72,10 @@ class WordIndex():
         """
         subwords = [w.lower() for w in word.split(" ")]
         if not self.loaded:
-            [self.add(ng) for ng in subwords if not ng in self.word_index]
+            [self.add(ng) for ng in subwords if not ng in self.ngram_index]
         if self.max_len < len(subwords):
             self.max_len = max(self.max_len,len(subwords))
-        return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len)
+        return self.complete([self.ngram_index[ng] for ng in subwords if ng in self.ngram_index],self.max_len)
 
     def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
         """
@@ -119,7 +119,7 @@ class WordIndex():
             embedding matrix
         """
         model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
-        N = len(self.word_index)
+        N = len(self.ngram_index)
         embedding_matrix = np.zeros((N,dim))
         for i in range(N):
             if str(i) in model.wv:
@@ -137,7 +137,7 @@ class WordIndex():
             output filename
         """
         data = {
-            "word_index": self.word_index,
+            "word_index": self.ngram_index,
             "cpt_state": self.cpt,
             "max_len_state": self.max_len
         }
@@ -172,8 +172,8 @@ class WordIndex():
             if not key in data:
                 raise KeyError("{0} field cannot be found in given file".format(key))
         new_obj = WordIndex(loaded=True)
-        new_obj.word_index = data["word_index"]
-        new_obj.index_word = {v:k for k,v in new_obj.word_index.items()}
+        new_obj.ngram_index = data["ngram_index"]
+        new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
         new_obj.cpt = data["cpt_state"]
         new_obj.max_len = data["max_len_state"]
         return new_obj
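
A minimal sketch of the word-level encoding WordIndex provides after this
change (the toponyms and resulting indices below are purely illustrative):

    index = WordIndex()        # index 0 is reserved for the padding token ""
    index.encode("New York")   # lowercased, split on spaces -> [1, 2]
    index.encode("Paris")      # padded with 0 up to max_len -> [3, 0]
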
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
index fac6e68..d4362ef 100644
--- a/parser_config/toponym_combination_embedding_v3.json
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -4,7 +4,7 @@
         { "short": "dataset_name", "help": "Filepath of the Geonames file you want to use." },
         { "short": "geoname_inclusion", "help": "Filepath of the Geonames file you want to use." },
         { "short": "geonames_adjacent", "help": "Filepath of the Geonames file you want to use." },
-        { "long": "wikipedia_cooc","help":"Cooccurrence data filename"},
+        { "long": "wikipedia_cooc", "help": "Cooccurrence data filename" },
         { "short": "-v", "long": "--verbose", "action": "store_true" },
         { "short": "-i", "long": "--inclusion", "action": "store_true" },
         { "short": "-a", "long": "--adjacency", "action": "store_true" },
@@ -14,6 +14,7 @@
         { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
         { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
         { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-        { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2,"choices":[1,2] }
+        { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
+        { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level"] }
     ]
 }
\ No newline at end of file
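
With the new option, a training run might look like the following; the
dataset name and file paths are placeholders, and the positional arguments
follow the existing parser config:

    python3 train_geocoder_v2.py IGN inclusion.csv adjacent.csv cooc.csv \
        -i -a --tokenization-method word-level

Leaving --tokenization-method at its "char-level" default keeps the current
NgramIndex behaviour, so existing invocations are unaffected.
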
diff --git a/train_geocoder_v2.py b/train_geocoder_v2.py
index 4a51665..73a6818 100644
--- a/train_geocoder_v2.py
+++ b/train_geocoder_v2.py
@@ -14,6 +14,7 @@ from keras.callbacks import ModelCheckpoint
 # Custom module
 from lib.utils_geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
 from lib.utils import ConfigurationReader
 from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
@@ -29,8 +30,11 @@ logging.basicConfig( # LOGGING CONF
     )
 
 import tensorflow as tf
-physical_devices = tf.config.list_physical_devices('GPU')
-tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+try:
+    physical_devices = tf.config.list_physical_devices('GPU')
+    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+except (IndexError, RuntimeError):
+    logging.warning("No GPU found, skipping memory growth configuration")
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
     .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w  -a -n 4 --ngram-word2vec-iter 1".split())
@@ -97,6 +101,8 @@ pairs_of_toponym = pd.concat(data_used)
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
+if args.tokenization_method == "word-level":
+    index = WordIndex()
 
  # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
-- 
GitLab