Skip to content
Snippets Groups Projects
Commit 4dd99f2a authored by Fize Jacques
Browse files

Optimisation on ngram embedding

parent 1f6d2ac8
No related branches found
No related tags found
No related merge requests found
......@@ -78,13 +78,23 @@ class NgramIndex():
else:
self.freq_ngram[ngram] += 1
def filter_ngram(self,threshold=20):
def filter_ngram_by_freq(self, threshold=20):
    """Remove every n-gram whose recorded frequency is below ``threshold``.

    Frequencies are read from ``self.freq_ngram``. Each selected n-gram is
    removed from ``self.ngram_index`` and its index is removed from
    ``self.index_ngram``. ``self.freq_ngram`` itself is left untouched.

    Parameters
    ----------
    threshold : int
        N-grams with ``freq < threshold`` are removed; an n-gram whose
        frequency equals ``threshold`` is kept.

    Returns
    -------
    None
    """
    # NOTE(review): indices of the surviving n-grams are not renumbered, so
    # they may no longer be contiguous after filtering; callers that size
    # arrays from len(self.index_ngram) should confirm this is acceptable.
    rare_ngrams = [
        ngram for ngram, freq in self.freq_ngram.items() if freq < threshold
    ]
    for ngram in rare_ngrams:
        # freq_ngram is never pruned, so it can still list n-grams that an
        # earlier filtering pass already removed; skip those instead of
        # raising KeyError.
        position = self.ngram_index.pop(ngram, None)
        if position is None:
            continue
        del self.index_ngram[position]
def filter_top_ngram(self, threshold=20000):
    """Keep only the ``threshold`` most frequent n-grams of the index.

    The n-grams currently present in ``self.ngram_index`` are ranked by
    their count in ``self.freq_ngram`` (highest first). Everything past the
    first ``threshold`` entries is removed from ``self.ngram_index`` and
    ``self.index_ngram``. ``self.freq_ngram`` is left untouched.

    Parameters
    ----------
    threshold : int
        Maximum number of n-grams to keep.

    Returns
    -------
    int or None
        ``0`` when the index holds fewer than ``threshold`` n-grams and
        nothing is done; ``None`` otherwise.
    """
    if len(self.ngram_index) - threshold < 0:
        return 0

    frequencies = self.freq_ngram
    # Rank only n-grams still present in the index: freq_ngram is never
    # pruned and may list n-grams removed by an earlier filtering pass.
    # sorted() is stable, so n-grams with equal frequency keep the order
    # they have in ngram_index and the outcome is deterministic.
    ranked = sorted(
        self.ngram_index,
        key=lambda ngram: frequencies.get(ngram, 0),
        reverse=True,
    )
    # A negative threshold means "keep nothing"; clamp it so the slice
    # below does not wrap around to the end of the list.
    keep = max(threshold, 0)
    # NOTE(review): indices of the surviving n-grams are not renumbered, so
    # they may no longer be contiguous after filtering; callers that size
    # arrays from len(self.index_ngram) should confirm this is acceptable.
    for ngram in ranked[keep:]:
        position = self.ngram_index.pop(ngram)
        del self.index_ngram[position]
def encode(self,word,complete=True):
......
......@@ -112,7 +112,7 @@ if args.tokenization_method == "bert":
# Identify all the n-grams available in the data.
# `apply` is used here only to call index.split_and_add on every value of the
# `toponym` and `toponym_context` columns; the Series it returns is discarded.
pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x))
# NOTE(review): the two filter calls below look like the removed and added
# sides of the same diff line — TODO confirm that only one of them is meant
# to remain.
index.filter_ngram()
index.filter_top_ngram(40000)
# Number of entries left in index.index_ngram after the filtering above.
# NOTE(review): presumably the filter removes entries without renumbering the
# remaining indices, so this count may be smaller than the largest index —
# confirm that sizing by length is what the embedding matrix needs.
num_words = len(index.index_ngram) # necessary for the embedding matrix
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment