From 1f6d2ac83cdf55abf1e0b426d69d593b873b777b Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Wed, 6 Jan 2021 15:54:44 +0100
Subject: [PATCH] Optimisation on ngram embedding

---
 lib/ngram_index.py | 14 ++++++++++++++
 train_geocoder.py  |  1 +
 2 files changed, 15 insertions(+)

diff --git a/lib/ngram_index.py b/lib/ngram_index.py
index 0b7a42b..b461718 100644
--- a/lib/ngram_index.py
+++ b/lib/ngram_index.py
@@ -1,6 +1,7 @@
 import json
 
 import numpy as np
+import pandas as pd
 
 from ngram import NGram
 from transformers import BertTokenizer
@@ -38,6 +39,8 @@ class NgramIndex():
         self.size = n
         self.ngram_index = {"":0}
         self.index_ngram = {0:""}
+
+        self.freq_ngram = {}
         self.cpt = 0
         self.max_len = 0
 
@@ -71,6 +74,17 @@ class NgramIndex():
             self.cpt+=1
             self.ngram_index[ngram]=self.cpt
             self.index_ngram[self.cpt]=ngram
+            self.freq_ngram[ngram] = 1
+        else:
+            self.freq_ngram[ngram] += 1
+        
+    def filter_ngram(self,threshold=20):
+        """Drop n-grams seen fewer than `threshold` times; renumber survivors 1..N (id 0 stays the empty n-gram)."""
+        # Ids must stay contiguous (len(index_ngram) is used as the vocabulary size); unseen-count entries are kept.
+        kept = [ng for ng in self.ngram_index if ng and self.freq_ngram.get(ng, threshold) >= threshold]
+        self.index_ngram = dict(enumerate(["", *kept]))
+        self.ngram_index = {ng: i for i, ng in self.index_ngram.items()}
+        self.cpt = len(kept)
         
 
     def encode(self,word,complete=True):
diff --git a/train_geocoder.py b/train_geocoder.py
index da47f1c..d143b58 100644
--- a/train_geocoder.py
+++ b/train_geocoder.py
@@ -112,6 +112,7 @@ if args.tokenization_method == "bert":
  # Identify all ngram available
 pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
 pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x))
+index.filter_ngram()
 
 num_words = len(index.index_ngram) # necessary for the embedding matrix
 
-- 
GitLab