diff --git a/train_geocoder.py b/train_geocoder.py index 2ee31cd9271798f7c3e0d144fa4fa39d28a5a980..b83c3d6dbc4770eed7c86fecd43d622235a94bfc 100644 --- a/train_geocoder.py +++ b/train_geocoder.py @@ -112,8 +112,7 @@ if args.tokenization_method == "bert": # Identify all ngram available pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x)) pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x)) -print(len(index.ngram_index)) -index.filter_top_ngram(10000) + num_words = len(index.index_ngram) # necessary for the embedding matrix