From 5c36fa47f4550c37eec57a226baadf693a78dd5f Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Fri, 9 Oct 2020 15:21:45 +0200
Subject: [PATCH] ADD module to use Bert model trained for geocoding + ADD new
 training script version + other minor bug fixes

---
 combination_embeddingsv4.py                   |  7 +-
 lib/bert_geocoder.py                          | 67 +++++++++++++++++++
 lib/geo.py                                    |  6 ++
 lib/geocoder.py                               |  3 +-
 lib/ngram_index.py                            |  4 +-
 lib/torch_generator.py                        |  5 +-
 .../toponym_combination_embedding.json        |  2 +-
 .../toponym_combination_embedding_v2.json     |  2 +-
 .../toponym_combination_embedding_v3.json     |  2 +-
 requirements.txt                              |  5 +-
 scripts/randoludo.py                          |  2 +-
 server.py                                     |  3 +-
 12 files changed, 95 insertions(+), 13 deletions(-)
 create mode 100644 lib/bert_geocoder.py

diff --git a/combination_embeddingsv4.py b/combination_embeddingsv4.py
index cfbd49d..c967f6f 100644
--- a/combination_embeddingsv4.py
+++ b/combination_embeddingsv4.py
@@ -27,9 +27,12 @@ logging.basicConfig( # LOGGING CONF
     level=logging.INFO
     )
 
+import tensorflow as tf
+physical_devices = tf.config.list_physical_devices('GPU')
+tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
-    .parse_args("FR FR_inclusion.csv FR_adjacent.csv FR_cooc.csv -i -a -w".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("IGN GB_inclusion_perm.csv ../data/IGN/IGN_adjacent.csv GB_cooc_perm.csv -a".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
 
 
 # #################################################
@@ -109,7 +112,7 @@ logging.info("Done !")
 #############################################################################################
 
 logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer(np.concatenate((pairs_of_toponym.toponym.values,pairs_of_toponym.toponym_context.values)),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
+embedding_weights = index.get_embedding_layer([index.encode(p) for p in np.concatenate((pairs_of_toponym.toponym.unique(),pairs_of_toponym.toponym_context.unique()))],dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
 logging.info("Embedding generated !")
 
 #############################################################################################
diff --git a/lib/bert_geocoder.py b/lib/bert_geocoder.py
new file mode 100644
index 0000000..d2305a5
--- /dev/null
+++ b/lib/bert_geocoder.py
@@ -0,0 +1,67 @@
+import os
+import sys
+import time
+import random
+import argparse
+import datetime
+
+import pandas as pd
+import numpy as np
+
+import tensorflow as tf
+import torch
+
+from tqdm import tqdm
+tqdm.pandas()
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from keras.preprocessing.sequence import pad_sequences
+from transformers import BertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig
+from transformers import get_linear_schedule_with_warmup
+
+from lib.torch_generator import SentenceDataset
+from lib.geo import latlon2healpix,healpix2latlon
+
+import pickle
+
+# If there's a GPU available...
+if torch.cuda.is_available():
+
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+
+class BertGeocoder():
+    def __init__(self,bert_model_dir,label_healpix_file,healpix_nside=128,batch_size=1):
+        self.bert_model = BertForSequenceClassification.from_pretrained(bert_model_dir)
+        self.bert_model.to(device)
+        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir)
+        self.label_healpix = {v:k for k, v in pickle.load(open(label_healpix_file,'rb')).items()}
+
+        self.nside = healpix_nside
+
+        self.batch_size = batch_size
+
+    def geocode(self,toponyms, context_toponyms):
+        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i],0] for i in range(len(toponyms))],columns=["sentence","label"]),self.tokenizer,batch_size=self.batch_size,shuffle=False)
+        dataloader = DataLoader(data, batch_size=self.batch_size)
+        results = []
+        for step, batch in enumerate(dataloader):
+            b_input_ids = batch[0].to(device)
+            b_input_mask = batch[1].to(device)
+            with torch.no_grad():
+                outputs = self.bert_model(b_input_ids,
+                                          token_type_ids=None,
+                                          attention_mask=b_input_mask)
+            results.append(outputs[0].detach().cpu().numpy())
+        label = np.argmax(np.concatenate(results),axis=1)
+        healpix_label = [self.label_healpix[l] for l in label]
+        lat,lon = healpix2latlon(healpix_label,self.nside)
+        return np.concatenate((lat.reshape(-1,1),lon.reshape(-1,1)),axis=1)
\ No newline at end of file
diff --git a/lib/geo.py b/lib/geo.py
index 1840705..00b8a1a 100644
--- a/lib/geo.py
+++ b/lib/geo.py
@@ -31,6 +31,12 @@ def latlon2healpix( lat , lon , res ):
     zs = ( np.sin(lat) )
     #
     return healpy.vec2pix( int(res) , xs , ys , zs )
+def healpix2latlon( code , nside ):
+    xs, ys, zs = healpy.pix2vec( nside , code )
+    lat = np.arctan2(zs, np.sqrt(xs * xs + ys * ys)) * 180.0 / np.pi
+    lon = np.arctan2(ys, xs) * 180.0 / np.pi
+    return lat, lon
+
 def haversine_tf(y_true,y_pred):
     """
     Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates
diff --git a/lib/geocoder.py b/lib/geocoder.py
index 2f43e7c..99f6a0a 100644
--- a/lib/geocoder.py
+++ b/lib/geocoder.py
@@ -19,7 +19,8 @@ from lib.geo import haversine_tf_1circle
 import stanza
 import spacy
 
-
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
 class Geocoder(object):
     """
diff --git a/lib/ngram_index.py b/lib/ngram_index.py
index 5f38625..9e422e9 100644
--- a/lib/ngram_index.py
+++ b/lib/ngram_index.py
@@ -38,7 +38,7 @@ class NgramIndex():
         word : str
             a word
         """
-        ngrams = word.lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
         [self.add(ngram) for ngram in ngrams]
         self.max_len = max(self.max_len,len(ngrams))
@@ -73,7 +73,7 @@ class NgramIndex():
            list of ngram index
         """
-        ngrams = word.lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
         ngrams = [ng for ng in ngrams if ng.count("$")<2]
         if not self.loaded:
diff --git a/lib/torch_generator.py b/lib/torch_generator.py
index 718d169..7c6de28 100644
--- a/lib/torch_generator.py
+++ b/lib/torch_generator.py
@@ -9,7 +9,7 @@ def chunks(lst, n):
 
 class SentenceDataset(torch.utils.data.Dataset):
     'Characterizes a dataset for PyTorch'
-    def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32):
+    def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32,shuffle=True):
         'Initialization'
         self.sentences = dataframe["sentence"].values
         self.labels = dataframe["label"].values
@@ -18,7 +18,8 @@ class SentenceDataset(torch.utils.data.Dataset):
         self.batch_size = batch_size
 
         a = np.arange(len(dataframe))
-        np.random.shuffle(a)
+        if shuffle:
+            np.random.shuffle(a)
         self.batch_tokenization = list(chunks(a,batch_size))
         assert(len(self.batch_tokenization[0])==batch_size)
         self.current_batch_id = 0
diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json
index a7dc96c..260d6ec 100644
--- a/parser_config/toponym_combination_embedding.json
+++ b/parser_config/toponym_combination_embedding.json
@@ -10,7 +10,7 @@
     { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
     { "long": "--cooc-sample-size", "type": "int", "default": 1 },
     {"long": "--adjacency-iteration", "type":"int","default":1},
-    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
     { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/parser_config/toponym_combination_embedding_v2.json b/parser_config/toponym_combination_embedding_v2.json
index 9163e70..345c1d7 100644
--- a/parser_config/toponym_combination_embedding_v2.json
+++ b/parser_config/toponym_combination_embedding_v2.json
@@ -10,7 +10,7 @@
     { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
     { "long": "--cooc-sample-size", "type": "int", "default": 1 },
     {"long": "--adjacency-iteration", "type":"int","default":1},
-    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
     { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
index 37f9ae5..5053a89 100644
--- a/parser_config/toponym_combination_embedding_v3.json
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -9,7 +9,7 @@
     { "short": "-i", "long": "--inclusion", "action": "store_true" },
     { "short": "-a", "long": "--adjacency", "action": "store_true" },
     { "short": "-w", "long": "--wikipedia", "action": "store_true" },
-    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
     { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/requirements.txt b/requirements.txt
index bb296e6..a2016e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,4 +21,7 @@ flask
 numba
 healpy
 stanza
-spacy
\ No newline at end of file
+spacy
+torch
+torchvision
+transformers
\ No newline at end of file
diff --git a/scripts/randoludo.py b/scripts/randoludo.py
index f1826b5..b933c9c 100644
--- a/scripts/randoludo.py
+++ b/scripts/randoludo.py
@@ -3,7 +3,7 @@ import numpy as np
 
 from lib.geocoder import Geocoder
 
-geocoder = Geocoder("./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C.h5","./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C_index")
+geocoder = Geocoder("./outputs/IGN_4_100_A_C.h5","./outputs/IGN_4_100_A_C_index")
 
 df = pd.read_csv("data/rando_toponymes.tsv",sep="\t")
 df["name"]=df.name.apply(lambda x:x.split("¦")[0])
diff --git a/server.py b/server.py
index f15fa93..26ab308 100644
--- a/server.py
+++ b/server.py
@@ -14,9 +14,10 @@ dict_model = {
     "GB_C":("./outputs/GB_MODEL_2/GB.txt_100_4_100__C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__C_index"),
     "GB_AC":("./outputs/GB_MODEL_2/GB.txt_100_4_100__A_C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__A_C_index"),
     "GB_IC":("./outputs/GB_MODEL_2/GB.txt_100_4_100__I_C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__I_C_index")
+    ,"FR_IGN":("./outputs/IGN_4_100_A_C.h5","./outputs/IGN_4_100_A_C_index")
 }
 
-MODEL = "FR_AIC"
+MODEL = "FR_IGN"
 LANG = "fr"
 NER = "spacy"
-- 
GitLab
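
A minimal usage sketch of the BertGeocoder class added in lib/bert_geocoder.py,
assuming a model already fine-tuned with the accompanying training script. The
checkpoint directory, pickle path, and toponyms below are hypothetical
placeholders, not files shipped with this patch:

    from lib.bert_geocoder import BertGeocoder

    # Hypothetical artifacts: a directory written by
    # BertForSequenceClassification.save_pretrained() (one class per healpix
    # cell, tokenizer included) and the pickled {healpix_cell: class_label}
    # dict built at training time, which BertGeocoder inverts back to cells.
    geocoder = BertGeocoder("./outputs/FR_BERT",
                            "./outputs/FR_BERT_label_healpix.pkl",
                            healpix_nside=128,
                            batch_size=1)

    # geocode() concatenates each toponym with its context toponym,
    # classifies the pair into a healpix cell, and converts the predicted
    # cells back to an (N, 2) numpy array of [lat, lon] cell centers.
    coords = geocoder.geocode(["Lyon"], ["France"])
    print(coords.shape)  # (1, 2)

Because geocode() builds its SentenceDataset with shuffle=False (the flag this
patch adds to lib/torch_generator.py), predictions come back in input order,
so batch_size only affects throughput, not which coordinates map to which
toponym.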