Commit 57a7bee4 authored by Fize Jacques's avatar Fize Jacques

little UPD: rename WordIndex's internal word_index/index_word mappings to ngram_index/index_ngram, add a --tokenization-method option (char-level or word-level) to the training script, and guard the TensorFlow GPU setup when no GPU is present

parent 00420f29
lib/word_index.py
@@ -20,8 +20,8 @@ class WordIndex():
         loaded : bool
             if loaded from external file
         """
-        self.word_index = {"":0}
-        self.index_word = {0:""}
+        self.ngram_index = {"":0}
+        self.index_ngram = {0:""}
         self.cpt = 0
         self.max_len = 0
@@ -49,10 +49,10 @@ class WordIndex():
         ngram : str
             ngram
         """
-        if not subword in self.word_index:
+        if not subword in self.ngram_index:
             self.cpt+=1
-            self.word_index[subword]=self.cpt
-            self.index_word[self.cpt]=subword
+            self.ngram_index[subword]=self.cpt
+            self.index_ngram[self.cpt]=subword

     def encode(self,word):
@@ -72,10 +72,10 @@ class WordIndex():
         """
         subwords = [w.lower() for w in word.split(" ")]
         if not self.loaded:
-            [self.add(ng) for ng in subwords if not ng in self.word_index]
+            [self.add(ng) for ng in subwords if not ng in self.ngram_index]
         if self.max_len < len(subwords):
             self.max_len = max(self.max_len,len(subwords))
-        return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len)
+        return self.complete([self.ngram_index[ng] for ng in subwords if ng in self.ngram_index],self.max_len)

     def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
         """
@@ -119,7 +119,7 @@ class WordIndex():
             embedding matrix
         """
         model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
-        N = len(self.word_index)
+        N = len(self.ngram_index)
         embedding_matrix = np.zeros((N,dim))
         for i in range(N):
             if str(i) in model.wv:
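
A note on where this matrix goes: the N x dim array built here has the shape Keras expects for pre-trained embedding weights. A minimal sketch of that wiring, reusing the names from the hunk above (the `trainable=False` choice is illustrative, not part of this commit):

```python
# Sketch: handing the Word2Vec-derived matrix to a Keras Embedding layer.
# `N`, `dim` and `embedding_matrix` are the names from the method above.
from keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=N,                  # vocabulary size, len(self.ngram_index)
    output_dim=dim,               # embedding dimension
    weights=[embedding_matrix],   # pre-trained vectors from the Word2Vec model
    trainable=False               # illustrative choice: freeze the vectors
)
```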
@@ -137,7 +137,7 @@ class WordIndex():
             output filename
         """
         data = {
-            "word_index": self.word_index,
+            "ngram_index": self.ngram_index,
             "cpt_state": self.cpt,
             "max_len_state": self.max_len
         }
@@ -172,8 +172,8 @@ class WordIndex():
         if not key in data:
             raise KeyError("{0} field cannot be found in given file".format(key))
         new_obj = WordIndex(loaded=True)
-        new_obj.word_index = data["word_index"]
-        new_obj.index_word = {v:k for k,v in new_obj.word_index.items()}
+        new_obj.ngram_index = data["ngram_index"]
+        new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
         new_obj.cpt = data["cpt_state"]
         new_obj.max_len = data["max_len_state"]
         return new_obj
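
Taken together, the renamed class keeps the same behaviour: `add` registers a token, `encode` maps a whitespace-split toponym to padded ids via `complete`. A minimal usage sketch (not from the commit; the ids and padding shown are illustrative):

```python
# Usage sketch for WordIndex after the rename (illustrative values).
from lib.word_index import WordIndex

index = WordIndex()
index.split_and_add("New York")     # lowercases and registers "new", "york"
index.split_and_add("Paris")        # registers "paris"
print(index.encode("New York"))     # e.g. [1, 2]
print(index.encode("Paris"))        # e.g. [3, 0], padded to max_len with 0
```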
parser_config/toponym_combination_embedding_v3.json
@@ -4,7 +4,7 @@
{ "short": "dataset_name", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geoname_inclusion", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geonames_adjacent", "help": "Filepath of the Geonames file you want to use." },
{ "long": "wikipedia_cooc","help":"Cooccurrence data filename"},
{ "long": "wikipedia_cooc", "help": "Cooccurrence data filename" },
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-i", "long": "--inclusion", "action": "store_true" },
{ "short": "-a", "long": "--adjacency", "action": "store_true" },
@@ -14,6 +14,7 @@
 { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
 { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
 { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-{ "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2,"choices":[1,2] }
+{ "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
+{ "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level"] }
 ]
}
\ No newline at end of file
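
With the two entries above, argparse exposes the new flag as `args.tokenization_method`, which the training script checks below. A hypothetical invocation sketch (the positional file paths are placeholders, not real data files):

```python
# Hypothetical invocation sketch; the positional paths are placeholders.
from lib.utils import ConfigurationReader

args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
    .parse_args("GB inclusion.csv adjacent.csv cooc.csv -i -a --tokenization-method word-level".split())
print(args.tokenization_method)  # -> "word-level"
```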
@@ -14,6 +14,7 @@ from keras.callbacks import ModelCheckpoint
 # Custom module
 from lib.utils_geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
 from lib.utils import ConfigurationReader
 from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
@@ -29,8 +30,11 @@ logging.basicConfig( # LOGGING CONF
 )
 import tensorflow as tf

-physical_devices = tf.config.list_physical_devices('GPU')
-tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+try:
+    physical_devices = tf.config.list_physical_devices('GPU')
+    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+except IndexError:
+    print("NO GPU FOUND")

 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
     .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
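
An alternative to the try/except is an explicit emptiness check, which also handles machines with several GPUs; a sketch of the equivalent guard (same TensorFlow calls as above):

```python
# Sketch of an explicit guard equivalent to the try/except above:
# enable memory growth on every visible GPU, otherwise run on CPU.
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("NO GPU FOUND")
```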
@@ -97,6 +101,8 @@ pairs_of_toponym = pd.concat(data_used)
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
+if args.tokenization_method == "word-level":
+    index = WordIndex()

 # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
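
The switch above is the whole feature: both index classes expose the same `split_and_add`/`encode` interface (the shared `index.split_and_add` call relies on this) and differ only in splitting granularity. A contrast sketch with illustrative outputs:

```python
# Contrast sketch (assumes NgramIndex mirrors WordIndex's interface, as the
# shared split_and_add call above implies; printed ids are illustrative).
from lib.ngram_index import NgramIndex
from lib.word_index import WordIndex

toponym = "new york"
char_index = NgramIndex(4)   # char-level: overlapping 4-grams ("new ", "ew y", ...)
word_index = WordIndex()     # word-level: whitespace tokens ("new", "york")
char_index.split_and_add(toponym)
word_index.split_and_add(toponym)
print(word_index.encode(toponym))  # two word ids, e.g. [1, 2]
```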