From 57a7bee41a678ac18503b0e81eaea69f427b1124 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 9 Nov 2020 17:08:37 +0100
Subject: [PATCH] little UPD

---
 lib/word_index.py                                  | 22 +++++++++----------
 .../toponym_combination_embedding_v3.json          |  5 +++--
 train_geocoder_v2.py                               | 10 +++++++--
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/lib/word_index.py b/lib/word_index.py
index 974a8ab..98e66f8 100644
--- a/lib/word_index.py
+++ b/lib/word_index.py
@@ -20,8 +20,8 @@ class WordIndex():
         loaded : bool
             if loaded from external file
         """
-        self.word_index = {"":0}
-        self.index_word = {0:""}
+        self.ngram_index = {"":0}
+        self.index_ngram = {0:""}
         self.cpt = 0
         self.max_len = 0
 
@@ -49,10 +49,10 @@ class WordIndex():
         ngram : str
             ngram
         """
-        if not subword in self.word_index:
+        if not subword in self.ngram_index:
             self.cpt+=1
-            self.word_index[subword]=self.cpt
-            self.index_word[self.cpt]=subword
+            self.ngram_index[subword]=self.cpt
+            self.index_ngram[self.cpt]=subword
 
 
     def encode(self,word):
@@ -72,10 +72,10 @@ class WordIndex():
         """
         subwords = [w.lower() for w in word.split(" ")]
         if not self.loaded:
-            [self.add(ng) for ng in subwords if not ng in self.word_index]
+            [self.add(ng) for ng in subwords if not ng in self.ngram_index]
             if self.max_len < len(subwords):
                 self.max_len = max(self.max_len,len(subwords))
-        return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len)
+        return self.complete([self.ngram_index[ng] for ng in subwords if ng in self.ngram_index],self.max_len)
 
     def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
         """
@@ -119,7 +119,7 @@ class WordIndex():
            embedding matrix
         """
         model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
-        N = len(self.word_index)
+        N = len(self.ngram_index)
         embedding_matrix = np.zeros((N,dim))
         for i in range(N):
             if str(i) in model.wv:
@@ -137,7 +137,7 @@ class WordIndex():
            output filename
         """
         data = {
-            "word_index": self.word_index,
+            "word_index": self.ngram_index,
             "cpt_state": self.cpt,
             "max_len_state": self.max_len
         }
@@ -172,8 +172,8 @@ class WordIndex():
             if not key in data:
                 raise KeyError("{0} field cannot be found in given file".format(key))
         new_obj = WordIndex(loaded=True)
-        new_obj.word_index = data["word_index"]
-        new_obj.index_word = {v:k for k,v in new_obj.word_index.items()}
+        new_obj.ngram_index = data["word_index"]
+        new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
         new_obj.cpt = data["cpt_state"]
         new_obj.max_len = data["max_len_state"]
         return new_obj
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
index fac6e68..d4362ef 100644
--- a/parser_config/toponym_combination_embedding_v3.json
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -4,7 +4,7 @@
     { "short": "dataset_name", "help": "Filepath of the Geonames file you want to use." },
     { "short": "geoname_inclusion", "help": "Filepath of the Geonames file you want to use." },
     { "short": "geonames_adjacent", "help": "Filepath of the Geonames file you want to use." },
-    { "long": "wikipedia_cooc","help":"Cooccurrence data filename"},
+    { "long": "wikipedia_cooc", "help": "Cooccurrence data filename" },
     { "short": "-v", "long": "--verbose", "action": "store_true" },
     { "short": "-i", "long": "--inclusion", "action": "store_true" },
     { "short": "-a", "long": "--adjacency", "action": "store_true" },
@@ -14,6 +14,7 @@
    { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
    { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
    { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-    { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2,"choices":[1,2] }
+    { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
+    { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level"] }
     ]
 }
\ No newline at end of file
diff --git a/train_geocoder_v2.py b/train_geocoder_v2.py
index 4a51665..73a6818 100644
--- a/train_geocoder_v2.py
+++ b/train_geocoder_v2.py
@@ -14,6 +14,7 @@ from keras.callbacks import ModelCheckpoint
 # Custom module
 from lib.utils_geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
 from lib.utils import ConfigurationReader
 from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
@@ -29,8 +30,11 @@ logging.basicConfig( # LOGGING CONF
     )
 
 import tensorflow as tf
-physical_devices = tf.config.list_physical_devices('GPU')
-tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+try:
+    physical_devices = tf.config.list_physical_devices('GPU')
+    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+except:
+    print("NO GPU FOUND")
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
     .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
@@ -97,6 +101,8 @@ pairs_of_toponym = pd.concat(data_used)
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
+if args.tokenization_method == "word-level":
+    index = WordIndex()
 
 # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
-- 
GitLab
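A minimal sketch of the word-level path this patch wires in, assuming the patched lib/word_index.py is importable as in train_geocoder_v2.py; the commented values are worked out by hand from the encode() logic visible in the diff, not taken from an actual run:

    from lib.word_index import WordIndex

    index = WordIndex()

    # encode() lowercases, splits on spaces, assigns ids from 1 upward
    # (id 0 is reserved for the "" padding entry), and pads each result
    # to the longest toponym seen so far via complete().
    print(index.encode("Paris"))          # [1]        -> max_len is now 1
    print(index.encode("New York City"))  # [2, 3, 4]  -> max_len is now 3
    print(index.encode("Paris"))          # [1, 0, 0]  -> padded to max_len

At training time the same behaviour is selected through the new parser option, i.e. passing --tokenization-method word-level; the default char-level keeps the existing NgramIndex path.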