Skip to content
Snippets Groups Projects
Commit 67aaa571 authored by Fize Jacques
Browse files

Debug filter ngram encoder

parent 89505f37
No related branches found
No related tags found
No related merge requests found
...@@ -58,7 +58,7 @@ class NgramIndex(): ...@@ -58,7 +58,7 @@ class NgramIndex():
""" """
ngrams = str(word).lower().replace(" ",self.empty_char) ngrams = str(word).lower().replace(" ",self.empty_char)
ngrams = list(self.ngram_gen.split(ngrams)) ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ngram) for ngram in ngrams] [self.add(ngram) for ngram in ngrams if not ngram in self.ngram_index]
self.max_len = max(self.max_len,len(ngrams)) self.max_len = max(self.max_len,len(ngrams))
def add(self,ngram): def add(self,ngram):
...@@ -80,21 +80,21 @@ class NgramIndex(): ...@@ -80,21 +80,21 @@ class NgramIndex():
def filter_ngram_by_freq(self,threshold=20): def filter_ngram_by_freq(self,threshold=20):
freq_data = pd.DataFrame(self.freq_ngram.items(),columns="ngram freq".split()) freq_data = pd.DataFrame(self.freq_ngram.items(),columns="ngram freq".split())
selected_ngram = freq_data[freq_data.freq<threshold].ngram.values selected_ngram = freq_data[freq_data.freq>threshold]
for ng in selected_ngram: selected_ngram["index__"] = np.arange(len(selected_ngram))
index = self.ngram_index[ng] self.ngram_index = dict(selected_ngram["ngram index__".split()].values)
del self.ngram_index[ng] self.index_ngram = dict(selected_ngram["index__ ngram".split()].values)
del self.index_ngram[index]
def filter_top_ngram(self,threshold=20000): def filter_top_ngram(self,threshold=20000):
freq_data = pd.DataFrame(self.freq_ngram.items(),columns="ngram freq".split()).sort_values(by="freq",ascending=False) freq_data = pd.DataFrame(self.freq_ngram.items(),columns="ngram freq".split()).sort_values(by="freq",ascending=False)
if len(self.ngram_index)-threshold <0: if len(self.ngram_index)-threshold <=0:
return 0 return 0
selected_ngram = freq_data.tail(len(self.ngram_index)-threshold).ngram.values selected_ngram = freq_data.head(threshold)
for ng in selected_ngram: selected_ngram["index__"] = np.arange(len(selected_ngram))
index = self.ngram_index[ng] self.ngram_index = dict(selected_ngram["ngram index__".split()].values)
del self.ngram_index[ng] self.index_ngram = dict(selected_ngram["index__ ngram".split()].values)
del self.index_ngram[index]
def encode(self,word,complete=True): def encode(self,word,complete=True):
......
...@@ -112,7 +112,8 @@ if args.tokenization_method == "bert": ...@@ -112,7 +112,8 @@ if args.tokenization_method == "bert":
# Identify all ngram available # Identify all ngram available
pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x)) pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x)) pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x))
index.filter_top_ngram(40000) print(len(index.ngram_index))
index.filter_top_ngram(10000)
num_words = len(index.index_ngram) # necessary for the embedding matrix num_words = len(index.index_ngram) # necessary for the embedding matrix
......
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment