Commit 57a7bee4 authored by Fize Jacques

little UPD

parent 00420f29
@@ -20,8 +20,8 @@ class WordIndex():
         loaded : bool
             if loaded from external file
         """
-        self.word_index = {"":0}
-        self.index_word = {0:""}
+        self.ngram_index = {"":0}
+        self.index_ngram = {0:""}
         self.cpt = 0
         self.max_len = 0
@@ -49,10 +49,10 @@ class WordIndex():
         ngram : str
             ngram
         """
-        if not subword in self.word_index:
+        if not subword in self.ngram_index:
             self.cpt+=1
-            self.word_index[subword]=self.cpt
-            self.index_word[self.cpt]=subword
+            self.ngram_index[subword]=self.cpt
+            self.index_ngram[self.cpt]=subword
 
     def encode(self,word):
@@ -72,10 +72,10 @@ class WordIndex():
         """
         subwords = [w.lower() for w in word.split(" ")]
         if not self.loaded:
-            [self.add(ng) for ng in subwords if not ng in self.word_index]
+            [self.add(ng) for ng in subwords if not ng in self.ngram_index]
         if self.max_len < len(subwords):
             self.max_len = max(self.max_len,len(subwords))
-        return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len)
+        return self.complete([self.ngram_index[ng] for ng in subwords if ng in self.ngram_index],self.max_len)
 
     def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
         """
@@ -119,7 +119,7 @@ class WordIndex():
             embedding matrix
         """
         model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
-        N = len(self.word_index)
+        N = len(self.ngram_index)
         embedding_matrix = np.zeros((N,dim))
         for i in range(N):
             if str(i) in model.wv:
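Note that the size= keyword in the Word2Vec call above belongs to gensim < 4.0; gensim 4 renamed it to vector_size. A self-contained sketch of the same matrix-building pattern on a toy corpus (gensim 4 naming):

import numpy as np
from gensim.models import Word2Vec

texts = [[1, 2, 0], [2, 1, 0]]  # toy id sequences standing in for encoded toponyms
dim = 8
model = Word2Vec([[str(w) for w in t] for t in texts],
                 vector_size=dim, window=5, min_count=1, workers=4)

N = 3  # len(self.ngram_index) in the class
embedding_matrix = np.zeros((N, dim))
for i in range(N):
    if str(i) in model.wv:  # ids never seen in the corpus keep a zero row
        embedding_matrix[i] = model.wv[str(i)]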
@@ -137,7 +137,7 @@ class WordIndex():
             output filename
         """
         data = {
-            "word_index": self.word_index,
+            "ngram_index": self.ngram_index,
             "cpt_state": self.cpt,
             "max_len_state": self.max_len
         }
@@ -172,8 +172,8 @@ class WordIndex():
             if not key in data:
                 raise KeyError("{0} field cannot be found in given file".format(key))
         new_obj = WordIndex(loaded=True)
-        new_obj.word_index = data["word_index"]
-        new_obj.index_word = {v:k for k,v in new_obj.word_index.items()}
+        new_obj.ngram_index = data["ngram_index"]
+        new_obj.index_ngram = {v:k for k,v in new_obj.ngram_index.items()}
         new_obj.cpt = data["cpt_state"]
         new_obj.max_len = data["max_len_state"]
         return new_obj
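The key written by save() must match the one load_from_file() reads back ("ngram_index" after the rename, which is why the save payload above uses it too). A round-trip sketch, assuming the payload is serialized as JSON, which its dict shape suggests:

import json

state = {"ngram_index": {"": 0, "paris": 1}, "cpt_state": 1, "max_len_state": 1}
with open("index.json", "w") as f:
    json.dump(state, f)

with open("index.json") as f:
    data = json.load(f)
index_ngram = {v: k for k, v in data["ngram_index"].items()}  # rebuilt inverse mapping
print(index_ngram)  # {0: '', 1: 'paris'}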
@@ -4,7 +4,7 @@
     { "short": "dataset_name", "help": "Filepath of the Geonames file you want to use." },
     { "short": "geoname_inclusion", "help": "Filepath of the Geonames inclusion file you want to use." },
     { "short": "geonames_adjacent", "help": "Filepath of the Geonames adjacency file you want to use." },
-    { "long": "wikipedia_cooc","help":"Cooccurrence data filename"},
+    { "long": "wikipedia_cooc", "help": "Cooccurrence data filename" },
     { "short": "-v", "long": "--verbose", "action": "store_true" },
     { "short": "-i", "long": "--inclusion", "action": "store_true" },
     { "short": "-a", "long": "--adjacency", "action": "store_true" },
@@ -14,6 +14,7 @@
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
     { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
-    { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2,"choices":[1,2] }
+    { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
+    { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level"] }
   ]
 }
\ No newline at end of file
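Each entry of this file becomes one argparse argument. An illustrative translation of the new entry (ConfigurationReader's actual logic may differ):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--tokenization-method", type=str, default="char-level",
                    choices=["char-level", "word-level"])
args = parser.parse_args(["--tokenization-method", "word-level"])
print(args.tokenization_method)  # dashes become underscores on the namespace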
@@ -14,6 +14,7 @@ from keras.callbacks import ModelCheckpoint
 # Custom module
 from lib.utils_geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
 from lib.utils import ConfigurationReader
 from lib.utils_geo import accuracy_k,haversine_tf_1circle
 from helpers import EpochTimer
@@ -29,8 +30,11 @@ logging.basicConfig( # LOGGING CONF
 )
 
 import tensorflow as tf
-physical_devices = tf.config.list_physical_devices('GPU')
-tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+try:
+    physical_devices = tf.config.list_physical_devices('GPU')
+    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+except (IndexError, RuntimeError):
+    print("NO GPU FOUND")
 
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
     .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
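set_memory_growth() must be called before the first GPU tensor is allocated, and indexing physical_devices[0] configures only the first device. An equivalent guard that handles zero or several GPUs explicitly (illustrative variant, same intent):

import tensorflow as tf

gpus = tf.config.list_physical_devices("GPU")
if not gpus:
    print("NO GPU FOUND")
for gpu in gpus:
    # Allocate GPU memory on demand instead of reserving it all upfront.
    tf.config.experimental.set_memory_growth(gpu, True)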
@@ -97,6 +101,8 @@ pairs_of_toponym = pd.concat(data_used)
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
+if args.tokenization_method == "word-level":
+    index = WordIndex()
 
 # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
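The one-line swap works because WordIndex mirrors the NgramIndex interface the rest of the script relies on (split_and_add, encode). A usage sketch inside this repo (the NGRAM_SIZE value here is chosen for illustration):

from lib.ngram_index import NgramIndex
from lib.word_index import WordIndex

tokenization_method = "word-level"  # normally args.tokenization_method
index = NgramIndex(4) if tokenization_method == "char-level" else WordIndex()
for toponym in ["New York", "Paris"]:
    index.split_and_add(toponym)
print(index.encode("New York"))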