Commit 87143d8e authored by Fize Jacques

Add bert tokenization

parent 57a7bee4
@@ -3,15 +3,24 @@ import json
 import numpy as np
 from ngram import NGram
+from transformers import BertTokenizer
 
 # Machine learning
 from gensim.models import Word2Vec
 
+class bertTokenizer:
+    def __init__(self):
+        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',do_lower_case=False)
+
+    def split(self,string):
+        return self.tokenizer.tokenize(string)
+
 class NgramIndex():
     """
     Class used for encoding words in ngram representation
     """
-    def __init__(self,n,loaded = False):
+    def __init__(self,n,bert_tokenization=False,loaded = False):
         """
         Constructor
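For context: the new bertTokenizer class simply wraps HuggingFace's WordPiece tokenizer behind the same split() interface that ngram.NGram exposes, so NgramIndex can use either one interchangeably. A minimal sketch of what the wrapper returns (requires downloading the pretrained model; the exact pieces depend on its vocabulary):

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',
                                              do_lower_case=False)
    print(tokenizer.tokenize("montpellier"))
    # e.g. ['mont', '##pel', '##lier'] -- continuation pieces carry a '##' prefix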
@@ -21,6 +30,10 @@ class NgramIndex():
         ngram size
         """
         self.ngram_gen = NGram(N=n)
+        self.empty_char = "$"
+        if bert_tokenization:
+            self.ngram_gen = bertTokenizer()
+            self.empty_char = "#"
 
         self.size = n
         self.ngram_index = {"":0}
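A possible reading of the empty_char switch (an interpretation, not stated in the commit): ngram.NGram pads words with '$' by default, so '$' doubles as the space substitute in char-level mode, while WordPiece does no padding and '#' echoes its '##' continuation prefix. Roughly:

    from ngram import NGram

    # '$' shows up both as NGram's own padding and as the space stand-in.
    print(list(NGram(N=4).split("new$york")))
    # Under bert_tokenization, "new york" instead becomes "new#york" before tokenize().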
@@ -29,6 +42,8 @@ class NgramIndex():
         self.max_len = 0
 
         self.loaded = loaded
+
+
     def split_and_add(self,word):
         """
         Split word in multiple ngram and add each one of them to the index
@@ -38,7 +53,7 @@ class NgramIndex():
         word : str
             a word
         """
-        ngrams = str(word).lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ",self.empty_char)
         ngrams = list(self.ngram_gen.split(ngrams))
         [self.add(ngram) for ngram in ngrams]
         self.max_len = max(self.max_len,len(ngrams))
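A sketch of what split_and_add now indexes under each mode (assuming the module is importable; the import path here is hypothetical):

    from lib.ngram_index import NgramIndex  # hypothetical path

    char_index = NgramIndex(4)                          # ngram.NGram splitter
    char_index.split_and_add("New York")                # adds the 4-grams of "new$york"

    bert_index = NgramIndex(4, bert_tokenization=True)  # WordPiece splitter
    bert_index.split_and_add("New York")                # adds tokenize("new#york")
    # Note: the word is lower-cased first, even though the BERT tokenizer
    # was loaded with do_lower_case=False.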
@@ -73,9 +88,9 @@ class NgramIndex():
         list
             of ngram index
         """
-        ngrams = str(word).lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ",self.empty_char)
         ngrams = list(self.ngram_gen.split(ngrams))
-        ngrams = [ng for ng in ngrams if ng.count("$")<2]
+        ngrams = [ng for ng in ngrams if ng.count(self.empty_char)<2]
         if not self.loaded:
             [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
         return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
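The count(self.empty_char) < 2 filter drops grams that are mostly padding; with the char-level defaults it behaves like this:

    ngrams = ['$$n', '$ne', 'new', 'ew$', 'w$$']        # 3-grams of padded "new"
    print([ng for ng in ngrams if ng.count('$') < 2])   # -> ['$ne', 'new', 'ew$']

One caveat worth flagging: in bert mode the same filter counts '#', so any '##'-prefixed continuation piece (two or more '#') would be discarded, which may or may not be intended.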
@@ -15,6 +15,6 @@
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
     { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
     { "short": "-l", "long": "--lstm-layer", "type": "int", "default": 2, "choices": [1, 2] },
-    { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level"] }
+    { "long": "--tokenization-method", "type": "str", "default": "char-level", "choices": ["char-level", "word-level", "bert"] }
     ]
 }
\ No newline at end of file
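With the new choice in place, BERT tokenization can be selected from the command line via --tokenization-method bert; the training script below maps that value onto NgramIndex(NGRAM_SIZE, bert_tokenization=True).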
@@ -34,7 +34,8 @@ try:
     physical_devices = tf.config.list_physical_devices('GPU')
     tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
 except:
-    print("NO GPU FOUND")
+    print("NO GPU FOUND...")
+
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
     .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
@@ -103,6 +104,8 @@ logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)
 if args.tokenization_method == "word-level":
     index = WordIndex()
+if args.tokenization_method == "bert":
+    index = NgramIndex(NGRAM_SIZE,bert_tokenization=True)
 
 # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
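Putting the pieces together, a sketch of the resulting flow (the encoding method's name is not visible in this diff, so encode is assumed, and all index values are illustrative):

    index = NgramIndex(4, bert_tokenization=True)
    index.split_and_add("montpellier")
    index.split_and_add("new york")
    # The encoding method pads every sequence to index.max_len,
    # e.g. index.encode("new york") -> [12, 7, 0, 0] (illustrative ids; 0 = "").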