From 5c36fa47f4550c37eec57a226baadf693a78dd5f Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Fri, 9 Oct 2020 15:21:45 +0200
Subject: [PATCH] ADD module to use a BERT model trained for geocoding + ADD
 new training script version + other minor bug fixes

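The new lib/bert_geocoder.py wraps a fine-tuned BertForSequenceClassification
checkpoint: toponym/context pairs are classified into Healpix cells, which the
new healpix2latlon() helper in lib/geo.py converts back to coordinates.
A minimal usage sketch follows; the checkpoint directory, label pickle path
and nside below are placeholders, not files shipped with this patch:

    from lib.bert_geocoder import BertGeocoder

    # Hypothetical paths: a directory produced by save_pretrained() for the
    # model and tokenizer, plus the pickled {healpix cell: class label}
    # mapping presumably produced during training.
    geocoder = BertGeocoder("./outputs/bert_fr_model",
                            "./outputs/bert_fr_model/label_healpix.pkl",
                            healpix_nside=128, batch_size=1)

    # geocode() concatenates each toponym with its context toponym, predicts
    # a Healpix cell per pair and returns an (N, 2) array of (lat, lon).
    coords = geocoder.geocode(["Paris"], ["France"])
    print(coords.shape)  # (1, 2)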
---
 combination_embeddingsv4.py                   |  7 +-
 lib/bert_geocoder.py                          | 67 +++++++++++++++++++
 lib/geo.py                                    |  6 ++
 lib/geocoder.py                               |  3 +-
 lib/ngram_index.py                            |  4 +-
 lib/torch_generator.py                        |  5 +-
 .../toponym_combination_embedding.json        |  2 +-
 .../toponym_combination_embedding_v2.json     |  2 +-
 .../toponym_combination_embedding_v3.json     |  2 +-
 requirements.txt                              |  5 +-
 scripts/randoludo.py                          |  2 +-
 server.py                                     |  3 +-
 12 files changed, 95 insertions(+), 13 deletions(-)
 create mode 100644 lib/bert_geocoder.py

diff --git a/combination_embeddingsv4.py b/combination_embeddingsv4.py
index cfbd49d..c967f6f 100644
--- a/combination_embeddingsv4.py
+++ b/combination_embeddingsv4.py
@@ -27,9 +27,12 @@ logging.basicConfig( # LOGGING CONF
     level=logging.INFO  
     )
 
+import tensorflow as tf
+physical_devices = tf.config.list_physical_devices('GPU')
+if physical_devices: tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)  # skip on CPU-only machines to avoid an IndexError
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
-    .parse_args("FR FR_inclusion.csv FR_adjacent.csv FR_cooc.csv -i -a -w".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("IGN GB_inclusion_perm.csv ../data/IGN/IGN_adjacent.csv GB_cooc_perm.csv  -a".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
 
 #
 #################################################
@@ -109,7 +112,7 @@ logging.info("Done !")
 #############################################################################################
 
 logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer(np.concatenate((pairs_of_toponym.toponym.values,pairs_of_toponym.toponym_context.values)),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
+embedding_weights = index.get_embedding_layer([index.encode(p) for p in np.concatenate((pairs_of_toponym.toponym.unique(),pairs_of_toponym.toponym_context.unique()))],dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
 logging.info("Embedding generated !")
 
 #############################################################################################
diff --git a/lib/bert_geocoder.py b/lib/bert_geocoder.py
new file mode 100644
index 0000000..d2305a5
--- /dev/null
+++ b/lib/bert_geocoder.py
@@ -0,0 +1,67 @@
+import os
+import sys
+import time
+import random
+import argparse
+import datetime
+
+import pandas as pd
+import numpy as np
+
+import tensorflow as tf
+import torch
+
+from tqdm import tqdm
+tqdm.pandas()
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from keras.preprocessing.sequence import pad_sequences
+from transformers import BertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig
+from transformers import get_linear_schedule_with_warmup
+
+from lib.torch_generator import SentenceDataset
+from lib.geo import latlon2healpix,healpix2latlon
+
+import pickle
+
+# If there's a GPU available...
+if torch.cuda.is_available():    
+
+    # Tell PyTorch to use the GPU.    
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+
+class BertGeocoder():
+    def __init__(self,bert_model_dir,label_healpix_file,healpix_nside=128,batch_size=1):
+        self.bert_model = BertForSequenceClassification.from_pretrained(bert_model_dir)
+        self.bert_model.to(device)
+        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir)
+        self.label_healpix = {v:k for k, v in pickle.load(open(label_healpix_file,'rb')).items()}
+
+        self.nside = healpix_nside
+
+        self.batch_size = batch_size
+
+    def geocode(self,toponyms, context_toponyms):
+        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i],0] for i in range(len(toponyms))],columns=["sentence","label"]),self.tokenizer,batch_size=self.batch_size,shuffle=False)
+        dataloader = DataLoader(data,  batch_size=self.batch_size)
+        results = []
+        for step, batch in enumerate(dataloader):
+            b_input_ids = batch[0].to(device)
+            b_input_mask = batch[1].to(device)
+            with torch.no_grad():
+                outputs = self.bert_model(b_input_ids, 
+                                token_type_ids=None, 
+                                attention_mask=b_input_mask)
+            results.append(outputs[0].detach().cpu().numpy())
+        label = np.argmax(np.concatenate(results),axis=1)
+        healpix_label = [self.label_healpix[l] for l in label]
+        lat,lon = healpix2latlon(healpix_label,self.nside)
+        return np.concatenate((lat.reshape(-1,1),lon.reshape(-1,1)),axis=1)
\ No newline at end of file
diff --git a/lib/geo.py b/lib/geo.py
index 1840705..00b8a1a 100644
--- a/lib/geo.py
+++ b/lib/geo.py
@@ -31,6 +31,12 @@ def latlon2healpix( lat , lon , res ):
     zs = ( np.sin(lat) ) # 
     return healpy.vec2pix( int(res) , xs , ys , zs )
 
+def healpix2latlon( code , nside ):
+    xs, ys, zs = healpy.pix2vec( nside , code )
+    lat =  np.arctan2(zs, np.sqrt(xs * xs + ys * ys)) * 180.0 / np.pi 
+    lon =  np.arctan2(ys, xs) * 180.0 / np.pi 
+    return lat, lon
+
 def haversine_tf(y_true,y_pred):
     """
     Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates
diff --git a/lib/geocoder.py b/lib/geocoder.py
index 2f43e7c..99f6a0a 100644
--- a/lib/geocoder.py
+++ b/lib/geocoder.py
@@ -19,7 +19,8 @@ from lib.geo import haversine_tf_1circle
 
 import stanza
 import spacy
-
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
 class Geocoder(object):
     """
diff --git a/lib/ngram_index.py b/lib/ngram_index.py
index 5f38625..9e422e9 100644
--- a/lib/ngram_index.py
+++ b/lib/ngram_index.py
@@ -38,7 +38,7 @@ class NgramIndex():
         word : str
             a word
         """
-        ngrams = word.lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
         [self.add(ngram) for ngram in ngrams]
         self.max_len = max(self.max_len,len(ngrams))
@@ -73,6 +73,6 @@ class NgramIndex():
            list of ngram index
         """
-        ngrams = word.lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
         ngrams = [ng for ng in ngrams if ng.count("$")<2]
         if not self.loaded:
diff --git a/lib/torch_generator.py b/lib/torch_generator.py
index 718d169..7c6de28 100644
--- a/lib/torch_generator.py
+++ b/lib/torch_generator.py
@@ -9,7 +9,7 @@ def chunks(lst, n):
 
 class SentenceDataset(torch.utils.data.Dataset):
     'Characterizes a dataset for PyTorch'
-    def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32):
+    def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32,shuffle=True):
         'Initialization'
         self.sentences = dataframe["sentence"].values
         self.labels = dataframe["label"].values
@@ -18,7 +18,8 @@ class SentenceDataset(torch.utils.data.Dataset):
 
         self.batch_size = batch_size
         a = np.arange(len(dataframe))
-        np.random.shuffle(a)
+        if shuffle:
+            np.random.shuffle(a)
         self.batch_tokenization = list(chunks(a,batch_size))
         assert(len(self.batch_tokenization[0])==batch_size)
         self.current_batch_id = 0
diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json
index a7dc96c..260d6ec 100644
--- a/parser_config/toponym_combination_embedding.json
+++ b/parser_config/toponym_combination_embedding.json
@@ -10,7 +10,7 @@
         { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
         { "long": "--cooc-sample-size", "type": "int", "default": 1 },
         {"long": "--adjacency-iteration", "type":"int","default":1},
-        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
         { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
         { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
         { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/parser_config/toponym_combination_embedding_v2.json b/parser_config/toponym_combination_embedding_v2.json
index 9163e70..345c1d7 100644
--- a/parser_config/toponym_combination_embedding_v2.json
+++ b/parser_config/toponym_combination_embedding_v2.json
@@ -10,7 +10,7 @@
         { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
         { "long": "--cooc-sample-size", "type": "int", "default": 1 },
         {"long": "--adjacency-iteration", "type":"int","default":1},
-        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
         { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
         { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
         { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
index 37f9ae5..5053a89 100644
--- a/parser_config/toponym_combination_embedding_v3.json
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -9,7 +9,7 @@
         { "short": "-i", "long": "--inclusion", "action": "store_true" },
         { "short": "-a", "long": "--adjacency", "action": "store_true" },
         { "short": "-w", "long": "--wikipedia", "action": "store_true" },
-        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
         { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
         { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
         { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/requirements.txt b/requirements.txt
index bb296e6..a2016e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,4 +21,7 @@ flask
 numba
 healpy
 stanza
-spacy
\ No newline at end of file
+spacy
+torch
+torchvision
+transformers
\ No newline at end of file
diff --git a/scripts/randoludo.py b/scripts/randoludo.py
index f1826b5..b933c9c 100644
--- a/scripts/randoludo.py
+++ b/scripts/randoludo.py
@@ -3,7 +3,7 @@ import numpy as np
 
 from lib.geocoder import Geocoder
 
-geocoder = Geocoder("./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C.h5","./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C_index")
+geocoder = Geocoder("./outputs/IGN_4_100_A_C.h5","./outputs/IGN_4_100_A_C_index")
 
 df = pd.read_csv("data/rando_toponymes.tsv",sep="\t")
 df["name"]=df.name.apply(lambda x:x.split("¦")[0])
diff --git a/server.py b/server.py
index f15fa93..26ab308 100644
--- a/server.py
+++ b/server.py
@@ -14,9 +14,10 @@ dict_model = {
     "GB_C":("./outputs/GB_MODEL_2/GB.txt_100_4_100__C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__C_index"),
     "GB_AC":("./outputs/GB_MODEL_2/GB.txt_100_4_100__A_C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__A_C_index"),
     "GB_IC":("./outputs/GB_MODEL_2/GB.txt_100_4_100__I_C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__I_C_index")
+    ,"FR_IGN":("./outputs/IGN_4_100_A_C.h5","./outputs/IGN_4_100_A_C_index")
 }
 
-MODEL = "FR_AIC"
+MODEL = "FR_IGN"
 LANG = "fr"
 NER = "spacy"
 
-- 
GitLab