Commit 57a7bee4 authored by Fize Jacques's avatar Fize Jacques

little UPD: rename WordIndex's internal word_index/index_word mappings to ngram_index/index_ngram, add a --tokenization-method option (char-level or word-level) to the training script, and guard the TensorFlow GPU setup when no GPU is present

parent 00420f29
lib/word_index.py
@@ -20,8 +20,8 @@ class WordIndex():
         loaded : bool
             if loaded from external file
         """
-        self.word_index = {"":0}
-        self.index_word = {0:""}
+        self.ngram_index = {"":0}
+        self.index_ngram = {0:""}
         self.cpt = 0
         self.max_len = 0
@@ -49,10 +49,10 @@ class WordIndex():
         ngram : str
             ngram
         """
-        if not subword in self.word_index:
+        if not subword in self.ngram_index:
             self.cpt+=1
-            self.word_index[subword]=self.cpt
-            self.index_word[self.cpt]=subword
+            self.ngram_index[subword]=self.cpt
+            self.index_ngram[self.cpt]=subword

     def encode(self,word):
@@ -72,10 +72,10 @@ class WordIndex():
         """
         subwords = [w.lower() for w in word.split(" ")]
         if not self.loaded:
-            [self.add(ng) for ng in subwords if not ng in self.word_index]
+            [self.add(ng) for ng in subwords if not ng in self.ngram_index]
         if self.max_len < len(subwords):
             self.max_len = max(self.max_len,len(subwords))
-        return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len)
+        return self.complete([self.ngram_index[ng] for ng in subwords if ng in self.ngram_index],self.max_len)

     def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
         """
@@ -119,7 +119,7 @@ class WordIndex():
             embedding matrix
         """
         model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
-        N = len(self.word_index)
+        N = len(self.ngram_index)
         embedding_matrix = np.zeros((N,dim))
         for i in range(N):
             if str(i) in model.wv:
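
A note on where this matrix goes: the N x dim array built here has the shape Keras expects for pre-trained embedding weights. A minimal sketch of that wiring, reusing the names from the hunk above (the `trainable=False` choice is illustrative, not part of this commit):

```python
# Sketch: handing the Word2Vec-derived matrix to a Keras Embedding layer.
# `N`, `dim` and `embedding_matrix` are the names from the method above.
from keras.layers import Embedding

embedding_layer = Embedding(
    input_dim=N,                  # vocabulary size, len(self.ngram_index)
    output_dim=dim,               # embedding dimension
    weights=[embedding_matrix],   # pre-trained vectors from the Word2Vec model
    trainable=False               # illustrative choice: freeze the vectors
)
```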
@@ -137,7 +137,7 @@ class WordIndex():
             output filename
         """
         data = {
-            "word_index": self.word_index,
+            "ngram_index": self.ngram_index,
             "cpt_state": self.cpt,
             "max_len_state": self.max_len
         }
@@ -172,8 +172,8 @@ class WordIndex():
         if not key in data:
             raise KeyError("{0} field cannot be found in given file".format(key))
         new_obj = WordIndex(loaded=True)
-        new_obj.word_index = data["word_index"]
-        new_obj.index_word = {v:k for k,v in new_obj.word_index.items()}
+        new_obj.ngram_index = data["ngram_index"]
+        new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
         new_obj.cpt = data["cpt_state"]
         new_obj.max_len = data["max_len_state"]
         return new_obj
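
Taken together, the renamed class keeps the same behaviour: `add` registers a token, `encode` maps a whitespace-split toponym to padded ids via `complete`. A minimal usage sketch (not from the commit; the ids and padding shown are illustrative):

```python
# Usage sketch for WordIndex after the rename (illustrative values).
from lib.word_index import WordIndex

index = WordIndex()
index.split_and_add("New York")     # lowercases and registers "new", "york"
index.split_and_add("Paris")        # registers "paris"
print(index.encode("New York"))     # e.g. [1, 2]
print(index.encode("Paris"))        # e.g. [3, 0], padded to max_len with 0
```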
parser_config/toponym_combination_embedding_v3.json
@@ -4,7 +4,7 @@
{ "short": "dataset_name", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geoname_inclusion", "help": "Filepath of the Geonames file you want to use." },
{ "short": "geonames_adjacent", "help": "Filepath of the Geonames file you want to use." },
{ "long": "wikipedia_cooc","help":"Cooccurrence data filename"},
{ "long": "wikipedia_cooc", "help": "Cooccurrence data filename" },
{ "short": "-v", "long": "--verbose", "action": "store_true" },
{ "short": "-i", "long": "--inclusion", "action": "store_true" },
{ "short": "-a", "long": "--adjacency", "action": "store_true" },
@@ -14,6 +14,7 @@
 { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
 { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
 { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-{ "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2,"choices":[1,2] }
+{ "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
+{ "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level"] }
 ]
}
\ No newline at end of file
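
With the two entries above, argparse exposes the new flag as `args.tokenization_method`, which the training script checks below. A hypothetical invocation sketch (the positional file paths are placeholders, not real data files):

```python
# Hypothetical invocation sketch; the positional paths are placeholders.
from lib.utils import ConfigurationReader

args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
    .parse_args("GB inclusion.csv adjacent.csv cooc.csv -i -a --tokenization-method word-level".split())
print(args.tokenization_method)  # -> "word-level"
```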
@@ -14,6 +14,7 @@ from keras.callbacks import ModelCheckpoint
 # Custom module
 from lib.utils_geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
 from lib.utils import ConfigurationReader
 from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
@@ -29,8 +30,11 @@ logging.basicConfig( # LOGGING CONF
 )
 import tensorflow as tf

-physical_devices = tf.config.list_physical_devices('GPU')
-tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+try:
+    physical_devices = tf.config.list_physical_devices('GPU')
+    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+except IndexError:
+    print("NO GPU FOUND")

 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
     .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
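
An alternative to the try/except is an explicit emptiness check, which also handles machines with several GPUs; a sketch of the equivalent guard (same TensorFlow calls as above):

```python
# Sketch of an explicit guard equivalent to the try/except above:
# enable memory growth on every visible GPU, otherwise run on CPU.
import tensorflow as tf

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("NO GPU FOUND")
```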
@@ -97,6 +101,8 @@ pairs_of_toponym = pd.concat(data_used)
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
+if args.tokenization_method == "word-level":
+    index = WordIndex()

 # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
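
The switch above is the whole feature: both index classes expose the same `split_and_add`/`encode` interface (the shared `index.split_and_add` call relies on this) and differ only in splitting granularity. A contrast sketch with illustrative outputs:

```python
# Contrast sketch (assumes NgramIndex mirrors WordIndex's interface, as the
# shared split_and_add call above implies; printed ids are illustrative).
from lib.ngram_index import NgramIndex
from lib.word_index import WordIndex

toponym = "new york"
char_index = NgramIndex(4)   # char-level: overlapping 4-grams ("new ", "ew y", ...)
word_index = WordIndex()     # word-level: whitespace tokens ("new", "york")
char_index.split_and_add(toponym)
word_index.split_and_add(toponym)
print(word_index.encode(toponym))  # two word ids, e.g. [1, 2]
```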