diff --git a/combination_embeddingsv4.py b/combination_embeddingsv4.py
index cfbd49d768cd9db7358c9e8e26026ad94fde5d04..c967f6fdad9ce2534d8255edcf65949affdcdfdc 100644
--- a/combination_embeddingsv4.py
+++ b/combination_embeddingsv4.py
@@ -27,9 +27,12 @@ logging.basicConfig( # LOGGING CONF
     level=logging.INFO
     )
 
+import tensorflow as tf
+physical_devices = tf.config.list_physical_devices('GPU')
+tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
 # COMMAND ARGS
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
-    .parse_args("FR FR_inclusion.csv FR_adjacent.csv FR_cooc.csv -i -a -w".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+    .parse_args()#("IGN GB_inclusion_perm.csv ../data/IGN/IGN_adjacent.csv GB_cooc_perm.csv -a".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
 
 #################################################
@@ -109,7 +112,7 @@ logging.info("Done !")
 #############################################################################################
 
 logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer(np.concatenate((pairs_of_toponym.toponym.values,pairs_of_toponym.toponym_context.values)),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
+embedding_weights = index.get_embedding_layer([index.encode(p) for p in np.concatenate((pairs_of_toponym.toponym.unique(),pairs_of_toponym.toponym_context.unique()))],dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
 logging.info("Embedding generated !")
 
 #############################################################################################
diff --git a/lib/bert_geocoder.py b/lib/bert_geocoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2305a52e8c74ca57d86ed2db37a418165f8176e
--- /dev/null
+++ b/lib/bert_geocoder.py
@@ -0,0 +1,67 @@
+import os
+import sys
+import time
+import random
+import argparse
+import datetime
+
+import pandas as pd
+import numpy as np
+
+import tensorflow as tf
+import torch
+
+from tqdm import tqdm
+tqdm.pandas()
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from keras.preprocessing.sequence import pad_sequences
+from transformers import BertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig
+from transformers import get_linear_schedule_with_warmup
+
+from lib.torch_generator import SentenceDataset
+from lib.geo import latlon2healpix,healpix2latlon
+
+import pickle
+
+# If there's a GPU available...
+if torch.cuda.is_available():
+
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+
+class BertGeocoder():
+    def __init__(self,bert_model_dir,label_healpix_file,healpix_nside=128,batch_size=1):
+        self.bert_model = BertForSequenceClassification.from_pretrained(bert_model_dir)
+        self.bert_model.to(device)
+        self.tokenizer = BertTokenizer.from_pretrained(bert_model_dir)
+        self.label_healpix = {v:k for k, v in pickle.load(open(label_healpix_file,'rb')).items()}
+
+        self.nside = healpix_nside
+
+        self.batch_size = batch_size
+
+    def geocode(self,toponyms, context_toponyms):
+        data = SentenceDataset(pd.DataFrame([[toponyms[i] + " " + context_toponyms[i],0] for i in range(len(toponyms))],columns=["sentence","label"]),self.tokenizer,batch_size=self.batch_size,shuffle=False)
+        dataloader = DataLoader(data, batch_size=self.batch_size)
+        results = []
+        for step, batch in enumerate(dataloader):
+            b_input_ids = batch[0].to(device)
+            b_input_mask = batch[1].to(device)
+            with torch.no_grad():
+                outputs = self.bert_model(b_input_ids,
+                                          token_type_ids=None,
+                                          attention_mask=b_input_mask)
+            results.append(outputs[0].detach().cpu().numpy())
+        label = np.argmax(np.concatenate(results),axis=1)
+        healpix_label = [self.label_healpix[l] for l in label]
+        lat,lon = healpix2latlon(healpix_label,self.nside)
+        return np.concatenate((lat.reshape(-1,1),lon.reshape(-1,1)),axis=1)
\ No newline at end of file
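Two remarks on the hunks above. First, the memory-growth setup added to combination_embeddingsv4.py indexes physical_devices[0] unconditionally, which raises an IndexError on CPU-only hosts where list_physical_devices('GPU') returns an empty list. A guarded variant (a sketch, not part of this patch) would be:

    import tensorflow as tf

    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:  # empty list on CPU-only machines
        tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

Second, the new BertGeocoder pairs each toponym with a context toponym, classifies the concatenated string into a HEALPix cell, and converts the predicted cell back to coordinates. A minimal usage sketch, with illustrative paths (no such files ship with this patch):

    from lib.bert_geocoder import BertGeocoder

    # bert_model_dir holds the fine-tuned BertForSequenceClassification
    # weights plus tokenizer files; label_healpix_file is the pickled
    # {healpix cell id: class label} mapping that __init__ inverts.
    geocoder = BertGeocoder("./outputs/bert_model", "./outputs/label_healpix.pkl")
    coords = geocoder.geocode(["Lyon"], ["France"])  # array of [lat, lon] rows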
diff --git a/lib/geo.py b/lib/geo.py
index 18407051dc3616b160ee4458b00b95e10d071c12..00b8a1a2fba238cd5b01ad3710a0dd8b4f2aec8d 100644
--- a/lib/geo.py
+++ b/lib/geo.py
@@ -31,6 +31,12 @@ def latlon2healpix( lat , lon , res ):
     zs = ( np.sin(lat) ) #
     return healpy.vec2pix( int(res) , xs , ys , zs )
 
+def healpix2latlon( code , nside ):
+    xs, ys, zs = healpy.pix2vec( nside , code )
+    lat = np.arctan2(zs, np.sqrt(xs * xs + ys * ys)) * 180.0 / np.pi
+    lon = np.arctan2(ys, xs) * 180.0 / np.pi
+    return lat, lon
+
 def haversine_tf(y_true,y_pred):
     """
     Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates
diff --git a/lib/geocoder.py b/lib/geocoder.py
index 2f43e7c34747480e68626270a0921dfc1c0da5e3..99f6a0ab7da1b25f8ff88a187555e5e72100b5f3 100644
--- a/lib/geocoder.py
+++ b/lib/geocoder.py
@@ -19,7 +19,8 @@ from lib.geo import haversine_tf_1circle
 import stanza
 import spacy
 
-
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
 class Geocoder(object):
     """
diff --git a/lib/ngram_index.py b/lib/ngram_index.py
index 5f386250ba40023831dcc0bf32e41729db2a7d9c..9e422e9cf9a148deb8bea50a9fc3a4b83a68336c 100644
--- a/lib/ngram_index.py
+++ b/lib/ngram_index.py
@@ -38,7 +38,7 @@ class NgramIndex():
         word : str
             a word
         """
-        ngrams = word.lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
         [self.add(ngram) for ngram in ngrams]
         self.max_len = max(self.max_len,len(ngrams))
@@ -73,7 +73,7 @@ class NgramIndex():
            list of ngram index
        """
-        ngrams = word.lower().replace(" ","$")
+        ngrams = str(word).lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
         ngrams = [ng for ng in ngrams if ng.count("$")<2]
         if not self.loaded:
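The new healpix2latlon inverts latlon2healpix up to cell resolution: healpy.pix2vec yields the unit vector of each cell centre, which the two arctan2 calls convert back to degrees. A round-trip sketch (assumes healpy is installed; the coordinates are illustrative):

    from lib.geo import latlon2healpix, healpix2latlon

    nside = 128                                # same resolution both ways
    cell = latlon2healpix(45.76, 4.84, nside)  # roughly Lyon
    lat, lon = healpix2latlon(cell, nside)     # centre of that cell
    # lat/lon agree with the input only up to the cell's angular size,
    # roughly half a degree at nside=128.

The CUDA_VISIBLE_DEVICES = '-1' export in lib/geocoder.py pins the Keras geocoder to the CPU; it only takes effect because it is set before TensorFlow initialises its GPU context. The str(word) coercions in lib/ngram_index.py guard against non-string toponyms (e.g. NaN values read from CSV).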
diff --git a/lib/torch_generator.py b/lib/torch_generator.py
index 718d169d92c6003ba223258eb67052f34a1bb13d..7c6de282cf8a6f33c93a407d2d2e8e47b6916e57 100644
--- a/lib/torch_generator.py
+++ b/lib/torch_generator.py
@@ -9,7 +9,7 @@ def chunks(lst, n):
 
 class SentenceDataset(torch.utils.data.Dataset):
     'Characterizes a dataset for PyTorch'
-    def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32):
+    def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32,shuffle=True):
         'Initialization'
         self.sentences = dataframe["sentence"].values
         self.labels = dataframe["label"].values
@@ -18,7 +18,8 @@ class SentenceDataset(torch.utils.data.Dataset):
         self.batch_size = batch_size
 
         a = np.arange(len(dataframe))
-        np.random.shuffle(a)
+        if shuffle:
+            np.random.shuffle(a)
         self.batch_tokenization = list(chunks(a,batch_size))
         assert(len(self.batch_tokenization[0])==batch_size)
         self.current_batch_id = 0
diff --git a/parser_config/toponym_combination_embedding.json b/parser_config/toponym_combination_embedding.json
index a7dc96c78f74d703cad17f6c64f4e4a90c97dfa9..260d6ec129527b76e4f0ff2002a0dd83d4e80e01 100644
--- a/parser_config/toponym_combination_embedding.json
+++ b/parser_config/toponym_combination_embedding.json
@@ -10,7 +10,7 @@
     { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
     { "long": "--cooc-sample-size", "type": "int", "default": 1 },
     {"long": "--adjacency-iteration", "type":"int","default":1},
-    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
     { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/parser_config/toponym_combination_embedding_v2.json b/parser_config/toponym_combination_embedding_v2.json
index 9163e70aa02c7381bf7cbc81298586c90c67fb85..345c1d7d49f767b0076b11574538c46952bad788 100644
--- a/parser_config/toponym_combination_embedding_v2.json
+++ b/parser_config/toponym_combination_embedding_v2.json
@@ -10,7 +10,7 @@
     { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"},
     { "long": "--cooc-sample-size", "type": "int", "default": 1 },
     {"long": "--adjacency-iteration", "type":"int","default":1},
-    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
     { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
index 37f9ae530dcae123328d409ae2f1a0f9937b4422..5053a891b52541982cc68f7b156ac3b19e495d37 100644
--- a/parser_config/toponym_combination_embedding_v3.json
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -9,7 +9,7 @@
     { "short": "-i", "long": "--inclusion", "action": "store_true" },
     { "short": "-a", "long": "--adjacency", "action": "store_true" },
     { "short": "-w", "long": "--wikipedia", "action": "store_true" },
-    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+    { "short": "-n", "long": "--ngram-size", "type": "int", "default": 4 },
     { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
     { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
     { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
diff --git a/requirements.txt b/requirements.txt
index bb296e6b7aa1891a55d0997c88766e4310ff53c6..a2016e76db0a4bfbfa04c06ff282a42a904bd83b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,4 +21,7 @@ flask
 numba
 healpy
 stanza
-spacy
\ No newline at end of file
+spacy
+torch
+torchvision
+transformers
\ No newline at end of file
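The new shuffle flag matters at inference time: BertGeocoder.geocode constructs its SentenceDataset with shuffle=False so the i-th prediction stays aligned with the i-th input pair. A minimal construction sketch (the model name is illustrative):

    import pandas as pd
    from transformers import BertTokenizer
    from lib.torch_generator import SentenceDataset

    tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
    df = pd.DataFrame({"sentence": ["Paris France", "Lyon France"],
                       "label": [0, 0]})
    # shuffle=False keeps batches in dataframe row order
    dataset = SentenceDataset(df, tokenizer, batch_size=1, shuffle=False)

The three parser_config changes raise the default --ngram-size from 2 to 4, which appears to match the 4 segment in the trained model names used elsewhere in this patch (e.g. IGN_4_100_A_C.h5).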
diff --git a/scripts/randoludo.py b/scripts/randoludo.py
index f1826b534c6d246042dbf0b04442e369746c9b77..b933c9c5136bf6eed2c6497df5d175fcc6f6c767 100644
--- a/scripts/randoludo.py
+++ b/scripts/randoludo.py
@@ -3,7 +3,7 @@ import numpy as np
 
 from lib.geocoder import Geocoder
 
-geocoder = Geocoder("./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C.h5","./outputs/FR_MODEL_2/FR.txt_100_4_100__A_I_C_index")
+geocoder = Geocoder("./outputs/IGN_4_100_A_C.h5","./outputs/IGN_4_100_A_C_index")
 
 df = pd.read_csv("data/rando_toponymes.tsv",sep="\t")
 df["name"]=df.name.apply(lambda x:x.split("¦")[0])
diff --git a/server.py b/server.py
index f15fa93503c6b5d203fd0df32a1719ae931182b9..26ab30860a154c8e237280f68c02fc47af0037e7 100644
--- a/server.py
+++ b/server.py
@@ -14,9 +14,10 @@ dict_model = {
     "GB_C":("./outputs/GB_MODEL_2/GB.txt_100_4_100__C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__C_index"),
     "GB_AC":("./outputs/GB_MODEL_2/GB.txt_100_4_100__A_C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__A_C_index"),
     "GB_IC":("./outputs/GB_MODEL_2/GB.txt_100_4_100__I_C.h5","./outputs/GB_MODEL_2/GB.txt_100_4_100__I_C_index")
+    ,"FR_IGN":("./outputs/IGN_4_100_A_C.h5","./outputs/IGN_4_100_A_C_index")
 }
 
-MODEL = "FR_AIC"
+MODEL = "FR_IGN"
 LANG = "fr"
 NER = "spacy"
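With the new dict_model entry, setting MODEL = "FR_IGN" points the server at the IGN-trained artefacts, the same pair now hard-coded in scripts/randoludo.py. Presumably the rest of server.py (not shown in this diff) resolves the selection along these lines:

    from lib.geocoder import Geocoder

    MODEL = "FR_IGN"
    model_path, index_path = dict_model[MODEL]   # look up the active model
    geocoder = Geocoder(model_path, index_path)  # load weights + ngram index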