Skip to content
Snippets Groups Projects
Commit 1f6d2ac8 authored by Fize Jacques's avatar Fize Jacques
Browse files

Optimisation on ngram embedding

parent 22d47332
No related branches found
No related tags found
No related merge requests found
import json import json
import numpy as np import numpy as np
import pandas as pd
from ngram import NGram from ngram import NGram
from transformers import BertTokenizer from transformers import BertTokenizer
...@@ -38,6 +39,8 @@ class NgramIndex(): ...@@ -38,6 +39,8 @@ class NgramIndex():
self.size = n self.size = n
self.ngram_index = {"":0} self.ngram_index = {"":0}
self.index_ngram = {0:""} self.index_ngram = {0:""}
self.freq_ngram = {}
self.cpt = 0 self.cpt = 0
self.max_len = 0 self.max_len = 0
...@@ -71,6 +74,17 @@ class NgramIndex(): ...@@ -71,6 +74,17 @@ class NgramIndex():
self.cpt+=1 self.cpt+=1
self.ngram_index[ngram]=self.cpt self.ngram_index[ngram]=self.cpt
self.index_ngram[self.cpt]=ngram self.index_ngram[self.cpt]=ngram
self.freq_ngram[ngram] = 1
else:
self.freq_ngram[ngram] += 1
def filter_ngram(self,threshold=20):
    """Drop every n-gram seen fewer than ``threshold`` times from the index.

    Each n-gram whose count in ``self.freq_ngram`` is strictly below
    ``threshold`` is removed from both ``self.ngram_index`` (n-gram -> id)
    and ``self.index_ngram`` (id -> n-gram). ``self.freq_ngram`` itself is
    left untouched.

    The method is safe to call repeatedly: n-grams that were already
    removed by an earlier call are skipped instead of raising ``KeyError``.

    Parameters
    ----------
    threshold : int, optional
        Minimum number of occurrences an n-gram needs to be kept
        (default 20).

    Notes
    -----
    NOTE(review): the ids of the surviving n-grams are not renumbered, so
    after filtering the largest id can be >= ``len(self.index_ngram)``.
    Callers that size a lookup table from that length should confirm this
    is what they expect.
    """
    # Collect the rare n-grams first so no dict is mutated while iterated.
    rare_ngrams = [
        ngram for ngram, freq in self.freq_ngram.items() if freq < threshold
    ]
    for ngram in rare_ngrams:
        # pop() with a default tolerates n-grams filtered by a previous call.
        index = self.ngram_index.pop(ngram, None)
        if index is None:
            continue
        self.index_ngram.pop(index, None)
def encode(self,word,complete=True): def encode(self,word,complete=True):
......
...@@ -112,6 +112,7 @@ if args.tokenization_method == "bert": ...@@ -112,6 +112,7 @@ if args.tokenization_method == "bert":
# Identify all ngram available # Identify all ngram available
pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x)) pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x)) pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x))
index.filter_ngram()
num_words = len(index.index_ngram) # necessary for the embedding matrix num_words = len(index.index_ngram) # necessary for the embedding matrix
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment