Commit 87143d8e authored by Fize Jacques's avatar Fize Jacques

Add bert tokenization

parent 57a7bee4
@@ -3,15 +3,24 @@ import json
 import numpy as np
 from ngram import NGram
+from transformers import BertTokenizer
 # Machine learning
 from gensim.models import Word2Vec
+
+class bertTokenizer:
+    def __init__(self):
+        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',do_lower_case=False)
+
+    def split(self,string):
+        return self.tokenizer.tokenize(string)
+
 class NgramIndex():
     """
     Class used for encoding words in ngram representation
     """
-    def __init__(self,n,loaded = False):
+    def __init__(self,n,bert_tokenization=False,loaded = False):
        """
        Constructor
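The wrapper above gives BertTokenizer the same split interface that NGram exposes, so NgramIndex can swap between the two generators transparently. A minimal usage sketch (the subword output is illustrative and depends on the vocabulary):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
    # WordPiece splits rare words into subword units; continuation pieces are prefixed with "##"
    print(tokenizer.tokenize("Toulouse"))   # e.g. ['To', '##ulo', '##use']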
@@ -21,6 +30,10 @@ class NgramIndex():
        ngram size
        """
        self.ngram_gen = NGram(N=n)
+       self.empty_char = "$"
+       if bert_tokenization:
+           self.ngram_gen = bertTokenizer()
+           self.empty_char = "#"
        self.size = n
        self.ngram_index = {"":0}
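A short sketch of how the new constructor flag switches the generator and the padding character together (per the lines above):

    index = NgramIndex(4)                               # NGram(N=4) generator, empty_char "$"
    bert_index = NgramIndex(4, bert_tokenization=True)  # bertTokenizer generator, empty_char "#"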
@@ -29,6 +42,8 @@ class NgramIndex():
        self.max_len = 0
        self.loaded = loaded
+
+
    def split_and_add(self,word):
        """
        Split word in multiple ngram and add each one of them to the index
@@ -38,7 +53,7 @@ class NgramIndex():
        word : str
            a word
        """
-       ngrams = str(word).lower().replace(" ","$")
+       ngrams = str(word).lower().replace(" ",self.empty_char)
        ngrams = list(self.ngram_gen.split(ngrams))
        [self.add(ngram) for ngram in ngrams]
        self.max_len = max(self.max_len,len(ngrams))
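With the padding character now held in self.empty_char, split_and_add is tokenizer-agnostic; a sketch of the char-level case (toponym illustrative):

    index = NgramIndex(4)
    index.split_and_add("new york")   # spaces become "$", then each 4-gram of "new$york" enters the index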
@@ -73,9 +88,9 @@ class NgramIndex():
            list of ngram index
        """
-       ngrams = str(word).lower().replace(" ","$")
+       ngrams = str(word).lower().replace(" ",self.empty_char)
        ngrams = list(self.ngram_gen.split(ngrams))
-       ngrams = [ng for ng in ngrams if ng.count("$")<2]
+       ngrams = [ng for ng in ngrams if ng.count(self.empty_char)<2]
        if not self.loaded:
            [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
        return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
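For reference, the char-level encode path boils down to this standalone sketch (toponym illustrative). Under bert tokenization the same filter counts "#" instead of "$", so WordPiece continuation tokens such as "##use" contain two "#" and are dropped:

    from ngram import NGram

    ngram_gen = NGram(N=4)
    padded = str("new york").lower().replace(" ", "$")   # "new$york"
    ngrams = list(ngram_gen.split(padded))               # character 4-grams
    ngrams = [ng for ng in ngrams if ng.count("$") < 2]  # keep ngrams crossing at most one word boundary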
......
@@ -15,6 +15,6 @@
    { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
    { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
    { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
-   { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level"] }
+   { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level", "bert"] }
  ]
}
\ No newline at end of file
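The new value can then be requested at training time; an illustrative invocation (the script name is a placeholder, positional arguments follow the commented example in the hunk below):

    python <train_script>.py IGN IGN_inclusion.csv IGN_adjacent_corrected.csv IGN_cooc.csv -n 4 --tokenization-method bert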
@@ -34,7 +34,8 @@ try:
    physical_devices = tf.config.list_physical_devices('GPU')
    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
 except:
-   print("NO GPU FOUND")
+   print("NO GPU FOUND...")
+
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
    .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
@@ -103,6 +104,8 @@ logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
 if args.tokenization_method == "word-level":
    index = WordIndex()
+if args.tokenization_method == "bert":
+   index = NgramIndex(NGRAM_SIZE,bert_tokenization=True)

 # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
......
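End to end, the bert path selected above then behaves like this minimal sketch (toponym and ids illustrative):

    index = NgramIndex(4, bert_tokenization=True)
    index.split_and_add("Nouvelle-Aquitaine")   # WordPiece tokens are added to the index
    index.encode("Nouvelle-Aquitaine")          # padded list of token ids, e.g. [5, 12, 3, 0]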