From 1f6d2ac83cdf55abf1e0b426d69d593b873b777b Mon Sep 17 00:00:00 2001 From: Fize Jacques <jacques.fize@cirad.fr> Date: Wed, 6 Jan 2021 15:54:44 +0100 Subject: [PATCH] Optimisation on ngram embedding --- lib/ngram_index.py | 14 ++++++++++++++ train_geocoder.py | 1 + 2 files changed, 15 insertions(+) diff --git a/lib/ngram_index.py b/lib/ngram_index.py index 0b7a42b..b461718 100644 --- a/lib/ngram_index.py +++ b/lib/ngram_index.py @@ -1,6 +1,7 @@ import json import numpy as np +import pandas as pd from ngram import NGram from transformers import BertTokenizer @@ -38,6 +39,8 @@ class NgramIndex(): self.size = n self.ngram_index = {"":0} self.index_ngram = {0:""} + + self.freq_ngram = {} self.cpt = 0 self.max_len = 0 @@ -71,6 +74,17 @@ class NgramIndex(): self.cpt+=1 self.ngram_index[ngram]=self.cpt self.index_ngram[self.cpt]=ngram + self.freq_ngram[ngram] = 1 + else: + self.freq_ngram[ngram] += 1 + + def filter_ngram(self,threshold=20): + freq_data = pd.DataFrame(self.freq_ngram.items(),columns="ngram freq".split()) + selected_ngram = freq_data[freq_data.freq<threshold].ngram.values + for ng in selected_ngram: + index = self.ngram_index[ng] + del self.ngram_index[ng] + del self.index_ngram[index] def encode(self,word,complete=True): diff --git a/train_geocoder.py b/train_geocoder.py index da47f1c..d143b58 100644 --- a/train_geocoder.py +++ b/train_geocoder.py @@ -112,6 +112,7 @@ if args.tokenization_method == "bert": # Identify all ngram available pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x)) pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x)) +index.filter_ngram() num_words = len(index.index_ngram) # necessary for the embedding matrix -- GitLab