Skip to content
Snippets Groups Projects
Commit 67aaa571 authored by Fize Jacques
Browse files

Debug filter ngram encoder

parent 89505f37
No related branches found
No related tags found
No related merge requests found
......@@ -58,7 +58,7 @@ class NgramIndex():
"""
ngrams = str(word).lower().replace(" ",self.empty_char)
ngrams = list(self.ngram_gen.split(ngrams))
[self.add(ngram) for ngram in ngrams]
[self.add(ngram) for ngram in ngrams if not ngram in self.ngram_index]
self.max_len = max(self.max_len,len(ngrams))
def add(self,ngram):
......@@ -80,21 +80,21 @@ class NgramIndex():
def filter_ngram_by_freq(self, threshold=20):
    """Keep only frequent n-grams and rebuild both lookup tables.

    Reads ``self.freq_ngram`` (n-gram -> frequency) and replaces
    ``self.ngram_index`` (n-gram -> index) and ``self.index_ngram``
    (index -> n-gram) with fresh dictionaries. Surviving n-grams are
    renumbered contiguously from 0, in ``self.freq_ngram`` iteration order.

    Parameters
    ----------
    threshold : int, optional
        An n-gram is kept only when its frequency is strictly greater than
        this value.
        NOTE(review): an n-gram whose frequency equals ``threshold`` is
        dropped; confirm that ``>`` rather than ``>=`` is intended.
    """
    # Rebuild from the frequency table instead of deleting entries one by
    # one: per-entry deletion raised KeyError whenever a counted n-gram was
    # absent from ngram_index, and left gaps in the index numbering.
    kept = [
        ngram
        for ngram, freq in self.freq_ngram.items()
        if freq > threshold
    ]
    self.ngram_index = {ngram: position for position, ngram in enumerate(kept)}
    self.index_ngram = dict(enumerate(kept))
def filter_top_ngram(self, threshold=20000):
    """Keep the ``threshold`` most frequent n-grams and rebuild the tables.

    Reads ``self.freq_ngram`` (n-gram -> frequency) and replaces
    ``self.ngram_index`` (n-gram -> index) and ``self.index_ngram``
    (index -> n-gram). Surviving n-grams are renumbered contiguously from
    0 in descending frequency order.

    Parameters
    ----------
    threshold : int, optional
        Maximum number of n-grams to keep.

    Returns
    -------
    int or None
        ``0`` when ``self.ngram_index`` already holds at most ``threshold``
        entries (nothing is modified); ``None`` after a rebuild.
    """
    if len(self.ngram_index) - threshold <= 0:
        return 0

    # sorted() is stable, so n-grams with equal frequency keep their
    # freq_ngram order and the cut-off is reproducible between runs.
    ranked = sorted(
        self.freq_ngram.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    kept = [ngram for ngram, _ in ranked[:threshold]]
    self.ngram_index = {ngram: position for position, ngram in enumerate(kept)}
    self.index_ngram = dict(enumerate(kept))
def encode(self,word,complete=True):
......
......@@ -112,7 +112,8 @@ if args.tokenization_method == "bert":
# Identify all ngram available
pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x))
index.filter_top_ngram(40000)
print(len(index.ngram_index))
index.filter_top_ngram(10000)
num_words = len(index.index_ngram) # necessary for the embedding matrix
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment