diff --git a/.gitignore b/.gitignore
index 0fe655a24d0489ecf7c7463dc8ddd6696b531968..10ad02ef32c99c292a43aaec5bd41314cb7508ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -152,3 +152,6 @@ log*
 temp*
 subset*
 time*
+
+
+/data*
\ No newline at end of file
diff --git a/bert.py b/bert.py
new file mode 100644
index 0000000000000000000000000000000000000000..df3e1349a918ba82583202d6429fb27815b9b301
--- /dev/null
+++ b/bert.py
@@ -0,0 +1,388 @@
+# REQUIREMENTS : pandas keras torch numpy transformers
+
+"""
+Based on the article: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
+by Chris McCormick
+
+"""
+import os
+import sys
+import time
+import random
+import argparse
+import datetime
+
+import pandas as pd
+import numpy as np
+
+import tensorflow as tf
+import torch
+
+from tqdm import tqdm
+tqdm.pandas()
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from keras.preprocessing.sequence import pad_sequences
+from transformers import BertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig
+from transformers import get_linear_schedule_with_warmup
+
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("train", help="TSV with two columns: 'sentence' and 'label'")
+parser.add_argument("test", help="TSV with two columns: 'sentence' and 'label'")
+parser.add_argument("outputdir", help="Output directory where the fine-tuned model, tokenizer and training history are written")
+parser.add_argument("-e","--epochs",type=int,default=5)
+parser.add_argument("-b","--batch_size",default=32,type=int)
+
+args = parser.parse_args()#("-b 32 -e 10 cooc_adj_bert_train.csv cooc_adj_bert_test.csv output_bert_allcooc_adjsampling3radius20km_batch32_epoch10".split())
+
+if not os.path.exists(args.train) or not os.path.exists(args.test):
+    raise FileNotFoundError("Train or Test filepath is incorrect !")
+
+# Number of training epochs (authors recommend between 2 and 4)
+epochs = args.epochs
+
+# The DataLoader needs to know the batch size for training, so I specify it here.
+# For fine-tuning BERT on a specific task, the authors recommend a batch size of
+# 16 or 32.
+
+batch_size = args.batch_size
+
+# OUTPUT DIR
+output_dir = args.outputdir
+
+if not os.path.exists(args.outputdir):
+    raise FileNotFoundError("{0} directory does not exist !".format(args.outputdir))
+if not os.path.isdir(args.outputdir):
+    raise NotADirectoryError("{0} is not a directory".format(args.outputdir))
+
+df_train = pd.read_csv(args.train, sep="\t")
+df_test = pd.read_csv(args.test, sep="\t")
+
+# Get the GPU device name.
+device_name = tf.test.gpu_device_name()
+
+# The device name should look like the following:
+if device_name == '/device:GPU:0':
+    print('Found GPU at: {}'.format(device_name))
+else:
+    raise SystemError('GPU device not found')
+
+# If there's a GPU available...
+if torch.cuda.is_available():
+
+    # Tell PyTorch to use the GPU.
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+# If not...
+else: + print('No GPU available, using the CPU instead.') + device = torch.device("cpu") + + +# Load the BERT tokenizer. +print('Loading {0} tokenizer...'.format("bert-base-multilingual-cased")) +tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',do_lower_case=False) + +""" +print("Tokenize Input Data") +df_train["input_ids"] = df_train.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True)) +df_test["input_ids"] = df_test.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True)) + + +# Set the maximum sequence length. +# took the size of the largest sentence +MAX_LEN = df_train.input_ids.apply(len).max()+2 + +print('\nPadding/truncating all sentences to %d values...' % MAX_LEN) +print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id)) + + +df_train["input_ids"] = pad_sequences(df_train.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist() +df_test["input_ids"] = pad_sequences(df_test.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist() + +df_train["attention_mask"] = df_train.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x] ) +df_test["attention_mask"] = df_test.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x]) + +train_inputs = torch.tensor(np.array(df_train.input_ids.values.tolist())) +del df_train["input_ids"] +validation_inputs = torch.tensor(np.array(df_test.input_ids.values.tolist())) +del df_test["input_ids"] + +train_labels = torch.tensor(np.array(df_train.label.values.tolist())) +del df_train["label"] +validation_labels = torch.tensor(np.array(df_test.label.values.tolist())) +del df_test["label"] + +train_masks = torch.tensor(np.array(df_train.attention_mask.values.tolist())) +del df_train["attention_mask"] +validation_masks = torch.tensor(np.array(df_test.attention_mask.values.tolist())) +del df_test["attention_mask"] +""" + +from lib.torch_generator import SentenceDataset +# Create the DataLoader for training set. +train_data = SentenceDataset(df_train,tokenizer,batch_size=batch_size) +#train_sampler = RandomSampler(train_data) +train_dataloader = DataLoader(train_data, batch_size=batch_size)#,sampler=train_sampler,) +""" +del train_inputs +del train_masks +del train_labels +""" +# Create the DataLoader for validation set. +validation_data = SentenceDataset(df_test,tokenizer,batch_size=batch_size) +#validation_sampler = SequentialSampler(validation_data) +validation_dataloader = DataLoader(validation_data, batch_size=batch_size)#, sampler=validation_sampler) +""" +del validation_inputs +del validation_masks +del validation_labels +""" +# Load BertForSequenceClassification, the pretrained BERT model with a single +# linear classification layer on top. +model = BertForSequenceClassification.from_pretrained( + "bert-base-multilingual-cased", # Use the 12-layer BERT model, with an uncased vocab. + num_labels = max(df_test.label.max(),df_train.label.max())+1, # The number of output labels--2 for binary classification. + # You can increase this for multi-class tasks. + output_attentions = False, # Whether the model returns attentions weights. + output_hidden_states = False, # Whether the model returns all hidden-states. +) + +# Tell pytorch to run this model on the GPU. +model.cuda() + +optimizer = AdamW(model.parameters(), + lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 + eps = 1e-8 # args.adam_epsilon - default is 1e-8. 
+ ) + + + +# Total number of training steps is number of batches * number of epochs. +total_steps = len(train_data) * epochs + +# Create the learning rate scheduler. +scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps = 0, # Default value in run_glue.py + num_training_steps = total_steps) + + + +# Set the seed value all over the place to make this reproducible. +seed_val = 42 + +random.seed(seed_val) +np.random.seed(seed_val) +torch.manual_seed(seed_val) +torch.cuda.manual_seed_all(seed_val) + +# Store the average loss after each epoch so I can plot them. +loss_values = [] + +history = [] +# For each epoch... +for epoch_i in range(0, epochs): + epoch_data={} + + # ======================================== + # Training + # ======================================== + + # Perform one full pass over the training set. + + print("") + print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) + print('Training...') + + # Measure how long the training epoch takes. + t0 = time.time() + + # Reset the total loss for this epoch. + total_loss = 0 + + # Put the model into training mode. + model.train() + + # For each batch of training data... + for step, batch in enumerate(train_dataloader): + + # Progress update every 40 batches. + if step % 100 == 0 and not step == 0: + # Calculate elapsed time in minutes. + elapsed = format_time(time.time() - t0) + + # Report progress.#Changed to sys.stdout to avoid uneccessary \n + sys.stdout.write('\r Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) + + # Unpack this training batch from the dataloader. + # + # As I unpack the batch, I'll also copy each tensor to the GPU using the + # `to` method. + # + # `batch` contains three pytorch tensors: + # [0]: input ids + # [1]: attention masks + # [2]: labels + b_input_ids = batch[0].to(device) + b_input_mask = batch[1].to(device) + b_labels = batch[2].to(device) + + # Always clear any previously calculated gradients before performing a + # backward pass. PyTorch doesn't do this automatically because + # accumulating the gradients is "convenient while training RNNs". + # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) + model.zero_grad() + + # Perform a forward pass (evaluate the model on this training batch). + # This will return the loss (rather than the model output) because I + # have provided the `labels`. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask, + labels=b_labels) + + # The call to `model` always returns a tuple, so I need to pull the + # loss value out of the tuple. + loss = outputs[0] + + # Accumulate the training loss over all of the batches so that I can + # calculate the average loss at the end. `loss` is a Tensor containing a + # single value; the `.item()` function just returns the Python value + # from the tensor. + + total_loss += loss.item() + + # Perform a backward pass to calculate the gradients. + loss.backward() + + # Clip the norm of the gradients to 1.0. + # This is to help prevent the "exploding gradients" problem. + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + + # Update parameters and take a step using the computed gradient. + # The optimizer dictates the "update rule"--how the parameters are + # modified based on their gradients, the learning rate, etc. 
+ optimizer.step() + + # Update the learning rate. + scheduler.step() + + # Calculate the average loss over the training data. + avg_train_loss = total_loss / len(train_dataloader) + + # Store the loss value for plotting the learning curve. + loss_values.append(avg_train_loss) + + print("") + print(" Average training loss: {0:.2f}".format(avg_train_loss)) + print(" Training epoch took: {:}".format(format_time(time.time() - t0))) + epoch_data["loss"]=avg_train_loss + epoch_data["epoch_duration"] = time.time() - t0 + + # ======================================== + # Validation + # ======================================== + # After the completion of each training epoch, measure the performance on + # the validation set. + + print("") + print("Running Validation...") + + t0 = time.time() + + # Put the model in evaluation mode--the dropout layers behave differently + # during evaluation. + model.eval() + + # Tracking variables + eval_loss, eval_accuracy = 0, 0 + nb_eval_steps, nb_eval_examples = 0, 0 + + # Evaluate data for one epoch + for batch in validation_dataloader: + + # Add batch to GPU + batch = tuple(t.to(device) for t in batch) + + # Unpack the inputs from dataloader + b_input_ids, b_input_mask, b_labels = batch + + # Telling the model not to compute or store gradients, saving memory and + # speeding up validation + with torch.no_grad(): + + # Forward pass, calculate logit predictions. + # This will return the logits rather than the loss because we have + # not provided labels. + # token_type_ids is the same as the "segment ids", which + # differentiates sentence 1 and 2 in 2-sentence tasks. + # The documentation for this `model` function is here: + # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification + outputs = model(b_input_ids, + token_type_ids=None, + attention_mask=b_input_mask) + + # Get the "logits" output by the model. The "logits" are the output + # values prior to applying an activation function like the softmax. + logits = outputs[0] + + # Move logits and labels to CPU + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + # Calculate the accuracy for this batch of test sentences. + tmp_eval_accuracy = flat_accuracy(logits, label_ids) + + # Accumulate the total accuracy. + eval_accuracy += tmp_eval_accuracy + + # Track the number of batches + nb_eval_steps += 1 + + + # Report the final accuracy for this validation run. + print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps)) + print(" Validation took: {:}".format(format_time(time.time() - t0))) + epoch_data["accuracy"] = eval_accuracy/nb_eval_steps + epoch_data["validation_duration"] = time.time() - t0 + history.append(epoch_data) +print("") +print("Training complete!") + +print("Save History") +pd.DataFrame(history).to_csv(output_dir+"/history_bert.csv",sep="\t") + + + +# Create output directory if needed +if not os.path.exists(output_dir): + os.makedirs(output_dir) + +print("Saving model to %s" % output_dir) + +# Save a trained model, configuration and tokenizer using `save_pretrained()`. 
+# They can then be reloaded using `from_pretrained()` +model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training +model_to_save.save_pretrained(output_dir) +tokenizer.save_pretrained(output_dir) \ No newline at end of file diff --git a/combination_embeddings.py b/combination_embeddings.py index 599ad1a8993547b7b17b88e4d4348aafbb728c2d..094578e8f491da71627ae823879f8f052987a916 100644 --- a/combination_embeddings.py +++ b/combination_embeddings.py @@ -47,7 +47,7 @@ def get_new_ids(cooc_data,id_first_value): Returns ------- dict - new ids for each toponyms + new ids for each toponyms """ topo_id = {} id_ = id_first_value @@ -76,12 +76,13 @@ args = ConfigurationReader("./parser_config/toponym_combination_embedding.json") ################################################# ############# MODEL TRAINING PARAMETER ########## ################################################# +MODEL_NAME = "Bi-LSTM_NGRAM" NGRAM_SIZE = args.ngram_size ACCURACY_TOLERANCE = args.tolerance_value EPOCHS = args.epochs ITER_ADJACENCY = args.adjacency_iteration COOC_SAMPLING_NUMBER = args.cooc_sample_size -WORDVEC_ITER = 1 #args.ngram_word2vec_iter +WORDVEC_ITER = args.ngram_word2vec_iter EMBEDDING_DIM = 256 ################################################# ########## FILENAME VARIABLE #################### @@ -121,6 +122,7 @@ HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) from lib.utils import MetaDataSerializer meta_data = MetaDataSerializer( + MODEL_NAME, DATASET_NAME, REL_CODE, COOC_SAMPLING_NUMBER, @@ -209,6 +211,8 @@ if args.wikipedia_cooc: cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") + if not "title" in train_cooc_indices: + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) diff --git a/combination_embeddings_baselines.py b/combination_embeddings_baselines.py new file mode 100644 index 0000000000000000000000000000000000000000..b4c495b280153a6b3b2c8e93c6e2509cd24504b1 --- /dev/null +++ b/combination_embeddings_baselines.py @@ -0,0 +1,360 @@ +# Base module +import re +import os +import json + +#Â Structure +import pandas as pd +import numpy as np +import geopandas as gpd + +import tensorflow as tf + +#Â Geometry +from shapely.geometry import Point + +#Â Custom module +from helpers import read_geonames +from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds +from lib.ngram_index import NgramIndex +from lib.utils import ConfigurationReader +from lib.metrics import lat_accuracy,lon_accuracy + +# Logging +from tqdm import tqdm +import logging +from helpers import parse_title_wiki,EpochTimer + +logging.getLogger('gensim').setLevel(logging.WARNING) + +def get_new_ids(cooc_data,id_first_value): + """ + Return new ids from cooccurrence data + + Parameters + ---------- + cooc_data : pd.DataFrame + cooccurrence da + id_first_value : int + id beginning value + + Returns + ------- + dict + new ids for each toponyms + """ + topo_id = 
{} + id_ = id_first_value + for title in cooc_data.title.values: + if not title in topo_id: + id_+=1 + topo_id[id_]=title + for interlinks in cooc_data.interlinks.values: + for interlink in interlinks.split("|"): + if not interlink in topo_id: + id_+=1 + topo_id[id_]=interlink + return topo_id + +# LOGGING CONF +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ + .parse_args()#("-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt -n 4 --ngram-word2vec-iter 10 -e 100 ../data/geonamesData/US_FR.txt ../data/geonamesData/hierarchy.txt".split()) + +# +################################################# +############# MODEL TRAINING PARAMETER ########## +################################################# +MODEL_NAME = "BASELINE" +NGRAM_SIZE = args.ngram_size +ACCURACY_TOLERANCE = args.tolerance_value +EPOCHS = args.epochs +ITER_ADJACENCY = args.adjacency_iteration +COOC_SAMPLING_NUMBER = args.cooc_sample_size +WORDVEC_ITER = args.ngram_word2vec_iter +EMBEDDING_DIM = 256 +################################################# +########## FILENAME VARIABLE #################### +################################################# +GEONAME_FN = args.geoname_input +DATASET_NAME = args.geoname_input.split("/")[-1] +GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input +REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 +ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( + GEONAME_FN, + ITER_ADJACENCY, + REGION_SUFFIX_FN) + +COOC_FN = args.wikipedia_cooc_fn +PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( + GEONAME_FN.split("/")[-1], + EPOCHS, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + REGION_SUFFIX_FN) + +REL_CODE="" +if args.adjacency: + PREFIX_OUTPUT_FN += "_A" + REL_CODE+= "A" +if args.inclusion: + PREFIX_OUTPUT_FN += "_I" + REL_CODE+= "I" +if args.wikipedia_cooc: + PREFIX_OUTPUT_FN += "_C" + REL_CODE+= "C" + +MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) +INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) +HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) + +from lib.utils import MetaDataSerializer + +meta_data = MetaDataSerializer( + MODEL_NAME, + DATASET_NAME, + REL_CODE, + COOC_SAMPLING_NUMBER, + ITER_ADJACENCY, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + EPOCHS, + EMBEDDING_DIM, + WORDVEC_ITER, + INDEX_FN, + MODEL_OUTPUT_FN, + HISTORY_FN +) +meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) +print(REL_CODE) + +############################################################################################# +################################# LOAD DATA ################################################# +############################################################################################# + +# LOAD Geonames DATA +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") + +train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) +test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) + +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places +#CLEAR RAM +del geoname_data + + +# IF REGION +if args.admin_code_1 != "None": + filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() + +# GET BOUNDS AND REDUCE DATA 
AVAILABLE FIELDS +filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD + + + +############################################################################################# +################################# RETRIEVE RELATIONSHIPS #################################### +############################################################################################# + + +# INITIALIZE RELATION STORE +rel_store = [] + +# Retrieve adjacency relationships +if args.adjacency: + logging.info("Retrieve adjacency relationships ! ") + + if not os.path.exists(ADJACENCY_REL_FILENAME): + bounds = get_bounds(filtered) # Required to get adjacency relationships + rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) + json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) + else: + logging.info("Open and load data from previous computation!") + rel_store=json.load(open(ADJACENCY_REL_FILENAME)) + + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) + +# Retrieve inclusion relationships +if args.inclusion: + logging.info("Retrieve inclusion relationships ! ") + + cpt_rel = len(rel_store) + rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) + + logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel)) + + + +if args.wikipedia_cooc: + logging.info("Load Wikipedia Cooccurrence data and merge with geonames") + + cooc_data = pd.read_csv(COOC_FN,sep="\t") + cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) + cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) + id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) + wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} + title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} + cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) + filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") + if not "title" in train_cooc_indices: + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") + train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + + logging.info("Merged with Geonames data !") + + # EXTRACT rel + logging.info("Extracting cooccurrence relationships") + cpt=0 + for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): + for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): + cpt+=1 + rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) + logging.info("Extract {0} cooccurrence relationships !".format(cpt)) + + +# STORE ID to name +geoname2name = dict(filtered["geonameid name".split()].values) + +# ENCODING NAME USING N-GRAM SPLITTING +logging.info("Encoding toponyms to ngram...") +index = NgramIndex(NGRAM_SIZE) + + # Identify all ngram available +filtered.name.apply(lambda x : index.split_and_add(x)) +if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] + +geoname2encodedname = {row.geonameid : index.encode(row.name) for row in 
filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association + +if args.wikipedia_cooc: + geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) + +# SAVE THE INDEX TO REUSE THE MODEL +index.save(INDEX_FN) + +logging.info("Done !") + + +############################################################################################# +################################# ENCODE COORDINATES ######################################## +############################################################################################# + +from lib.geo import latlon2healpix + +# Encode each geonames entry coordinates +geoname_vec = {row.geonameid : latlon2healpix(row.latitude,row.longitude,128) for row in filtered.itertuples()} +# CLEAR RAM +del filtered + + +EMBEDDING_DIM = 256 +num_words = len(index.index_ngram) # necessary for the embedding matrix + +logging.info("Preparing Input and Output data...") + + +############################################################################################# +################################# BUILD TRAIN/TEST DATASETS ################################# +############################################################################################# + +X_train,y_train = [],[] +X_test,y_test = [],[] + +from joblib import Parallel,delayed +from tensorflow.keras.utils import to_categorical + +def parse_bow(x): + return np.sum(to_categorical(x,num_classes=index.cpt+1),axis=0) + +for couple in rel_store: + geonameId_1,geonameId_2 = couple[0],couple[1] + if not geonameId_1 in geoname2encodedname: + continue + top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] + if geonameId_1 in train_indices: #and geonameId_2 in train_indices: + + X_train.append(top1 + top2) + y_train.append(geoname_vec[geonameId_1]) + + else: + X_test.append(top1 + top2) + y_test.append(geoname_vec[geonameId_1]) + +# NUMPYZE inputs and output lists +X_train = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_train)) +X_train = np.array(X_train) +y_train = np.array(y_train) + +X_test = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_test)) +X_test = np.array(X_test) +y_test = np.array(y_test) + +logging.info("Data prepared !") + + +# check for output dir +if not os.path.exists("outputs/"): + os.makedirs("outputs/") + +from scipy.sparse import csr_matrix +from sklearn import svm +from sklearn.naive_bayes import GaussianNB,MultinomialNB +from sklearn.metrics import classification_report +from sklearn import tree +from sklearn.ensemble import RandomForestClassifier + +X_train = csr_matrix(X_train) +X_test = csr_matrix(X_test) + +print(REL_CODE) +oupt = open("log_baseline_US_FR_{0}.txt".format(REL_CODE),'a') +oupt.write("------") + +from joblib import dump +import sys +f=True + +for kernel in ["rbf","linear","poly"]: + clf = svm.SVC(kernel=kernel) + clf.fit(X_train,y_train) + if kernel =="linear" and f: + dump(clf,"SVMLINEAR_US_FR_{0}.bin".format(REL_CODE)) + sys.exit() + y_pred = clf.predict(X_test) + oupt.write("Results for : "+"SVM with the kernel "+kernel) + oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])) + oupt.flush() + +for alg in (GaussianNB,MultinomialNB): + clf = alg() + clf.fit(X_train.toarray(),y_train) + y_pred = clf.predict(X_test.toarray()) + oupt.write("Results for : "+"NaiveBayes with the alg "+alg.__name__) + oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])+"\n") + oupt.flush() 
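For readers skimming the baseline script above: each training example is the concatenation of two related toponyms, each already encoded as a sequence of n-gram ids, and parse_bow turns that id sequence into a fixed-size bag-of-n-grams vector (a sum of one-hot rows), with the HEALPix cell of the first toponym as the class label fed to the scikit-learn classifiers. The following is a minimal, self-contained sketch of that idea; the vocabulary, n-gram ids and cell labels are toy values invented for illustration, not the repo's NgramIndex or data.

# Toy illustration of the bag-of-n-grams baseline (hypothetical values).
import numpy as np
from sklearn.svm import SVC

NUM_NGRAMS = 6  # size of the toy n-gram vocabulary

def bag_of_ngrams(id_sequence, num_ngrams=NUM_NGRAMS):
    """Sum of one-hot vectors: count of each n-gram id in the sequence."""
    vec = np.zeros(num_ngrams)
    for i in id_sequence:
        vec[i] += 1
    return vec

# Each sample: n-gram ids of toponym 1 + n-gram ids of toponym 2 (concatenated),
# labelled with the HEALPix cell id of the first toponym.
pairs = [([0, 1, 2], [3, 4]), ([1, 2, 5], [0, 3]), ([4, 5, 0], [2, 1])]
labels = [7, 7, 12]  # toy HEALPix cell ids

X = np.array([bag_of_ngrams(a + b) for a, b in pairs])
y = np.array(labels)

clf = SVC(kernel="linear")
clf.fit(X, y)
print(clf.predict(X[:1]))

Because the bag-of-n-grams vector discards n-gram order, these classifiers act as order-insensitive baselines against the Bi-LSTM models trained elsewhere in this changeset.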
+ +clf = tree.DecisionTreeClassifier() +clf.fit(X_train,y_train) +y_pred = clf.predict(X_test) +oupt.write("Results for : "+"Decision Tree classifier") +oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])) +oupt.flush() + +clf = RandomForestClassifier(max_depth=8, random_state=0) +clf.fit(X_train,y_train) +y_pred = clf.predict(X_test) +oupt.write("Results for : "+"Random Forest classifier") +oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])) +oupt.flush() + +oupt.close() \ No newline at end of file diff --git a/combination_embeddings_word.py b/combination_embeddings_word.py new file mode 100644 index 0000000000000000000000000000000000000000..762780d19ef97e260ead434196fad1e302787a70 --- /dev/null +++ b/combination_embeddings_word.py @@ -0,0 +1,390 @@ +# Base module +import re +import os +import json + +#Â Structure +import pandas as pd +import numpy as np +import geopandas as gpd + +#Â DEEPL module +from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout +from keras.models import Model +from keras import backend as K +from keras.callbacks import ModelCheckpoint + +import tensorflow as tf + +#Â Geometry +from shapely.geometry import Point + +#Â Custom module +from helpers import read_geonames +from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds +from lib.ngram_index import NgramIndex +from lib.word_index import WordIndex +from lib.utils import ConfigurationReader +from lib.metrics import lat_accuracy,lon_accuracy + +# Logging +from tqdm import tqdm +import logging +from helpers import parse_title_wiki,EpochTimer + +logging.getLogger('gensim').setLevel(logging.WARNING) + +def get_new_ids(cooc_data,id_first_value): + """ + Return new ids from cooccurrence data + + Parameters + ---------- + cooc_data : pd.DataFrame + cooccurrence da + id_first_value : int + id beginning value + + Returns + ------- + dict + new ids for each toponyms + """ + topo_id = {} + id_ = id_first_value + for title in cooc_data.title.values: + if not title in topo_id: + id_+=1 + topo_id[id_]=title + for interlinks in cooc_data.interlinks.values: + for interlink in interlinks.split("|"): + if not interlink in topo_id: + id_+=1 + topo_id[id_]=interlink + return topo_id + +# LOGGING CONF +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\ + .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split()) + +# +################################################# +############# MODEL TRAINING PARAMETER ########## +################################################# +MODEL_NAME = "Bi-LSTM_WORD" +NGRAM_SIZE = args.ngram_size +ACCURACY_TOLERANCE = args.tolerance_value +EPOCHS = args.epochs +ITER_ADJACENCY = args.adjacency_iteration +COOC_SAMPLING_NUMBER = args.cooc_sample_size +WORDVEC_ITER = args.ngram_word2vec_iter +EMBEDDING_DIM = 256 + + +################################################# +########## FILENAME VARIABLE #################### +################################################# + +GEONAME_FN = args.geoname_input +DATASET_NAME = args.geoname_input.split("/")[-1] +GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input +REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 +ADJACENCY_REL_FILENAME = 
"{0}_{1}{2}adjacency.json".format( + GEONAME_FN, + ITER_ADJACENCY, + REGION_SUFFIX_FN) + +COOC_FN = args.wikipedia_cooc_fn +PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}_{5}".format(MODEL_NAME, + GEONAME_FN.split("/")[-1], + EPOCHS, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + REGION_SUFFIX_FN) + +REL_CODE="" +if args.adjacency: + PREFIX_OUTPUT_FN += "_A" + REL_CODE+= "A" +if args.inclusion: + PREFIX_OUTPUT_FN += "_I" + REL_CODE+= "I" +if args.wikipedia_cooc: + PREFIX_OUTPUT_FN += "_C" + REL_CODE+= "C" + +MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) +INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) +HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) + +from lib.utils import MetaDataSerializer + +meta_data = MetaDataSerializer( + MODEL_NAME, + DATASET_NAME, + REL_CODE, + COOC_SAMPLING_NUMBER, + ITER_ADJACENCY, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + EPOCHS, + EMBEDDING_DIM, + WORDVEC_ITER, + INDEX_FN, + MODEL_OUTPUT_FN, + HISTORY_FN +) +meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) + +############################################################################################# +################################# LOAD DATA ################################################# +############################################################################################# + +# LOAD Geonames DATA +logging.info("Load Geonames data...") +geoname_data = read_geonames(GEONAME_FN).fillna("") + +train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) +test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) + +logging.info("Geonames data loaded!") + +# SELECT ENTRY with class == to A and P (Areas and Populated Places) +filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places +#CLEAR RAM +del geoname_data + + +# IF REGION +if args.admin_code_1 != "None": + filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() + +# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS +filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD + + + +############################################################################################# +################################# RETRIEVE RELATIONSHIPS #################################### +############################################################################################# + + +# INITIALIZE RELATION STORE +rel_store = [] + +# Retrieve adjacency relationships +if args.adjacency: + logging.info("Retrieve adjacency relationships ! ") + + if not os.path.exists(ADJACENCY_REL_FILENAME): + bounds = get_bounds(filtered) # Required to get adjacency relationships + rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) + json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) + else: + logging.info("Open and load data from previous computation!") + rel_store=json.load(open(ADJACENCY_REL_FILENAME)) + + logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) + +# Retrieve inclusion relationships +if args.inclusion: + logging.info("Retrieve inclusion relationships ! ") + + cpt_rel = len(rel_store) + rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) + + logging.info("{0} inclusion relationships retrieved ! 
".format(len(rel_store)-cpt_rel)) + + + +if args.wikipedia_cooc: + logging.info("Load Wikipedia Cooccurrence data and merge with geonames") + + cooc_data = pd.read_csv(COOC_FN,sep="\t") + cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) + cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) + id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) + wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} + title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} + cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) + filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") + if not "title" in train_cooc_indices: + train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") + train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) + + logging.info("Merged with Geonames data !") + + # EXTRACT rel + logging.info("Extracting cooccurrence relationships") + cpt=0 + for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): + for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): + cpt+=1 + rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) + logging.info("Extract {0} cooccurrence relationships !".format(cpt)) + + +# STORE ID to name +geoname2name = dict(filtered["geonameid name".split()].values) + +# ENCODING NAME USING N-GRAM SPLITTING +logging.info("Encoding toponyms to ngram...") +index = WordIndex() + + # Identify all ngram available +filtered.name.apply(lambda x : index.split_and_add(x)) +if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] + +geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association + +if args.wikipedia_cooc: + geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) + +# SAVE THE INDEX TO REUSE THE MODEL +index.save(INDEX_FN) + +logging.info("Done !") + + +############################################################################################# +################################# ENCODE COORDINATES ######################################## +############################################################################################# + + + +# Encode each geonames entry coordinates +geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} +# CLEAR RAM +del filtered + + +EMBEDDING_DIM = 256 +num_words = len(index.index_word) # necessary for the embedding matrix + +logging.info("Preparing Input and Output data...") + + +############################################################################################# +################################# BUILD TRAIN/TEST DATASETS ################################# +############################################################################################# + +X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] +X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] + +for 
couple in rel_store: + geonameId_1,geonameId_2 = couple[0],couple[1] + if not geonameId_1 in geoname2encodedname: + continue + top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] + if geonameId_1 in train_indices: #and geonameId_2 in train_indices: + + X_1_train.append(top1) + X_2_train.append(top2) + + y_lon_train.append(geoname_vec[geonameId_1][0]) + y_lat_train.append(geoname_vec[geonameId_1][1]) + + else: + X_1_test.append(top1) + X_2_test.append(top2) + + y_lon_test.append(geoname_vec[geonameId_1][0]) + y_lat_test.append(geoname_vec[geonameId_1][1]) + +# NUMPYZE inputs and output lists +X_1_train = np.array(X_1_train) +X_2_train = np.array(X_2_train) +y_lat_train = np.array(y_lat_train) +y_lon_train = np.array(y_lon_train) + +X_1_test = np.array(X_1_test) +X_2_test = np.array(X_2_test) +y_lat_test = np.array(y_lat_test) +y_lon_test = np.array(y_lon_test) + +logging.info("Data prepared !") + + +# check for output dir +if not os.path.exists("outputs/"): + os.makedirs("outputs/") + +############################################################################################# +################################# NGRAM EMBEDDINGS ########################################## +############################################################################################# + + +logging.info("Generating N-GRAM Embedding...") +embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER) +logging.info("Embedding generated !") + +############################################################################################# +################################# MODEL DEFINITION ########################################## +############################################################################################# + + +input_1 = Input(shape=(index.max_len,)) +input_2 = Input(shape=(index.max_len,)) + +embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True) + +x1 = embedding_layer(input_1) +x2 = embedding_layer(input_2) + +#Â Each LSTM learn on a permutation of the input toponyms +x1 = Bidirectional(LSTM(98))(x1) +x2 = Bidirectional(LSTM(98))(x2) + +x = concatenate([x1,x2])#,x3]) + +x1 = Dense(500,activation="relu")(x) +# x1 = Dropout(0.3)(x1) +x1 = Dense(500,activation="relu")(x1) +# x1 = Dropout(0.3)(x1) + +x2 = Dense(500,activation="relu")(x) +# x2 = Dropout(0.3)(x2) +x2 = Dense(500,activation="relu")(x2) +# x2 = Dropout(0.3)(x2) + +output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) +output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) + +model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 + +model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) + + +############################################################################################# +################################# TRAINING LAUNCH ########################################### +############################################################################################# + +checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, + save_best_only=True, mode='auto', period=1) + +epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") + +history = model.fit(x=[X_1_train,X_2_train], + y=[y_lon_train,y_lat_train], + verbose=True, batch_size=100, + epochs=EPOCHS, + 
validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]), + callbacks=[checkpoint,epoch_timer]) + + +hist_df = pd.DataFrame(history.history) +hist_df.to_csv(HISTORY_FN) + +model.save(MODEL_OUTPUT_FN) + +#Â Erase Model Checkpoint file +if os.path.exists(MODEL_OUTPUT_FN + ".part"): + os.remove(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/combination_embeddingsv2.py b/combination_embeddingsv2.py index ef1de0580b3a5cf227675726ec5ee2e8cb93d29b..96fefc70e3eab286d6bf833bd9fd0c950c82ff3e 100644 --- a/combination_embeddingsv2.py +++ b/combination_embeddingsv2.py @@ -3,9 +3,10 @@ import os #Â Structure import pandas as pd +import numpy as np #Â DEEPL module -from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout +from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU from keras.models import Model from keras.callbacks import ModelCheckpoint from tensorflow.keras.layers import Lambda @@ -140,7 +141,7 @@ num_words = len(index.index_ngram) ############################################################################################# embedding_weights = load_embedding(args.embedding_fn) - +EMBEDDING_DIM = len(embedding_weights[0]) ############################################################################################# ################################# MODEL DEFINITION ########################################## @@ -153,49 +154,33 @@ input_2 = Input(shape=(index.max_len,)) embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True) -x1 = Dropout(0.1)(embedding_layer(input_1)) -x2 = Dropout(0.1)(embedding_layer(input_2)) +x1 = embedding_layer(input_1) +x2 = embedding_layer(input_2) #Â Each LSTM learn on a permutation of the input toponyms -biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh")) +biLSTM = Bidirectional(LSTM(64,activation="pentanh", recurrent_activation="pentanh")) x1 = biLSTM(x1) x2 = biLSTM(x2) x = concatenate([x2,x1])#,x3]) -aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x) - -x1 = Dense(5000, - activation="relu", - kernel_regularizer=regularizers.l2(0.01) - )(x) -x1 = Dropout(0.3)(x1) -x1 = Dense(5000, - activation="relu", - kernel_regularizer=regularizers.l2(0.01) - )(x1) -x1 = Dropout(0.3)(x1) - -x2 = Dense(5000, - activation="relu", - kernel_regularizer=regularizers.l2(0.01) - )(x) -x2 = Dropout(0.3)(x2) -x2 = Dense(5000, - activation="relu", - kernel_regularizer=regularizers.l2(0.01) - )(x2) -x2 = Dropout(0.3)(x2) +x1 = Dense(1000,activation="pentanh")(x) +# x1 = Dropout(0.3)(x1) +x1 = Dense(1000,activation="pentanh")(x1) +# x1 = Dropout(0.3)(x1) +x2 = Dense(1000,activation="pentanh")(x) +# x2 = Dropout(0.3)(x2) +x2 = Dense(1000,activation="pentanh")(x2) +# x2 = Dropout(0.3)(x2) output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) output = concatenate([output_lon,output_lat],name="output_layer") -model = Model(inputs = [input_1,input_2], outputs = [output,aux_layer])#input_3 - -model.compile(loss={"output_layer":haversine_tf_1circle,"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy","output_layer":accuracy_k(ACCURACY_TOLERANCE)}) +model = Model(inputs = [input_1,input_2], outputs = output)#input_3 +model.compile(loss={"output_layer":haversine_tf_1circle}, optimizer='adam',metrics={"output_layer":accuracy_k(ACCURACY_TOLERANCE)}) 
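For context on the compile call above: haversine_tf_1circle is defined in lib/geo.py and is not shown in this diff; presumably it measures the great-circle distance between the predicted and true (lon, lat) pair, both squashed into [0,1] by zero_one_encoding. Below is a rough, hypothetical sketch of such a loss; the linear [0,1]-to-degrees decoding and the function body are assumptions, not the repo's implementation.

# Hypothetical sketch of a haversine-style Keras loss for (lon, lat) targets
# encoded in [0,1]; lib/geo.py's haversine_tf_1circle may differ.
import numpy as np
import tensorflow as tf
from keras import backend as K

EARTH_RADIUS_KM = 6371.0

def haversine_01(y_true, y_pred):
    # Assumed decoding of the [0,1] encoding back to degrees, then radians.
    def to_rad(t):
        lon = (t[:, 0] * 360.0 - 180.0) * np.pi / 180.0
        lat = (t[:, 1] * 180.0 - 90.0) * np.pi / 180.0
        return lon, lat
    lon1, lat1 = to_rad(y_true)
    lon2, lat2 = to_rad(y_pred)
    a = K.sin((lat2 - lat1) / 2.0) ** 2 + K.cos(lat1) * K.cos(lat2) * K.sin((lon2 - lon1) / 2.0) ** 2
    # Clip guards against tiny float overshoots above 1.0 before asin.
    return K.mean(2.0 * EARTH_RADIUS_KM * tf.asin(K.sqrt(K.clip(a, 0.0, 1.0))))

A distance-based loss like this penalizes errors in kilometres on the sphere rather than in squared error on the encoded coordinates, which is presumably the motivation for moving away from the plain mean_squared_error used in combination_embeddings_word.py.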
############################################################################################# ################################# TRAINING LAUNCH ########################################### diff --git a/combination_embeddingsv3.py b/combination_embeddingsv3.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb10318413925cac5f23238ee30f2b42d44e94d --- /dev/null +++ b/combination_embeddingsv3.py @@ -0,0 +1,216 @@ +# Base module +import os + +#Â Structure +import pandas as pd +import numpy as np + +#Â DEEPL module +from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU +from keras.models import Model +from keras.callbacks import ModelCheckpoint +from tensorflow.keras.layers import Lambda +import keras.backend as K +import tensorflow as tf +from lib.custom_layer import * + +#Â Custom module +from lib.ngram_index import NgramIndex +from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder +from lib.metrics import lat_accuracy,lon_accuracy +from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency +from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle + +# Logging +import logging + +logging.getLogger('gensim').setLevel(logging.WARNING) + +from helpers import EpochTimer + +# LOGGING CONF +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\ + .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split()) + +#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split()) + +# +################################################# +############# MODEL TRAINING PARAMETER ########## +################################################# +NGRAM_SIZE = args.ngram_size +ACCURACY_TOLERANCE = args.k_value +EPOCHS = args.epochs +ADJACENCY_SAMPLING = args.adjacency_sample +COOC_SAMPLING = args.cooc_sample +WORDVEC_ITER = 50 +EMBEDDING_DIM = args.dimension +BATCH_SIZE = args.batch_size +################################################# +########## FILENAME VARIABLE #################### +################################################# +# check for output dir +if not os.path.exists("outputs/"): + os.makedirs("outputs/") + +GEONAME_FN = args.geoname_input +DATASET_NAME = args.geoname_input.split("/")[-1] +GEONAMES_HIERARCHY_FN = args.inclusion_fn +ADJACENCY_REL_FILENAME = args.adjacency_fn +COOC_FN = args.wikipedia_cooc_fn + +PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format( + GEONAME_FN.split("/")[-1], + EPOCHS, + NGRAM_SIZE, + ACCURACY_TOLERANCE) + +REL_CODE="" +if args.adjacency: + PREFIX_OUTPUT_FN += "_A" + REL_CODE+= "A" +if args.inclusion: + PREFIX_OUTPUT_FN += "_I" + REL_CODE+= "I" +if args.wikipedia_cooc: + PREFIX_OUTPUT_FN += "_C" + REL_CODE+= "C" + +MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) +INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) +HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) + + +meta_data = MetaDataSerializer( + DATASET_NAME, + REL_CODE, + COOC_SAMPLING, + ADJACENCY_SAMPLING, + NGRAM_SIZE, + ACCURACY_TOLERANCE, + EPOCHS, + EMBEDDING_DIM, 
+ WORDVEC_ITER, + INDEX_FN, + MODEL_OUTPUT_FN, + HISTORY_FN +) +meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) + + +###Â PUT DATASRC + GENERATOR + +index = NgramIndex.load(args.ngram_index_fn) + +train_src = [] +test_src = [] + +class_encoder = LabelEncoder() + +if args.wikipedia_cooc: + train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4,use_healpix=False)) + test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4,use_healpix=False)) + +if args.adjacency: + a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False) + a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False) + train_src.append(a_train) + test_src.append(a_test) + +if args.inclusion: + i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv") + i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv") + train_src.append(i_train) + test_src.append(i_test) +#Adjacency + +print("Number of classes:",class_encoder.get_num_classes()) + +d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE) +d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE) + +num_words = len(index.index_ngram) + +############################################################################################# +################################# NGRAM EMBEDDINGS ########################################## +############################################################################################# + +embedding_weights = load_embedding(args.embedding_fn) +EMBEDDING_DIM = len(embedding_weights[0]) + +############################################################################################# +################################# MODEL DEFINITION ########################################## +############################################################################################# + +from keras import regularizers +#### + +input_1 = Input(shape=(index.max_len,)) +input_2 = Input(shape=(index.max_len,)) + +embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True) + +x1 = embedding_layer(input_1) +x2 = embedding_layer(input_2) + +#Â Each LSTM learn on a permutation of the input toponyms +biLSTM = Bidirectional(GRU(128,activation="pentanh", recurrent_activation="pentanh")) +x1 = biLSTM(x1) +x2 = biLSTM(x2) + +x = concatenate([x1,x2])#,x3]) + +x1 = Dense(500,activation="relu")(x) +x1 = Dropout(0.3)(x1) +x1 = Dense(500,activation="relu")(x1) +x1 = Dropout(0.3)(x1) + +x2 = Dense(500,activation="relu")(x) +x2 = Dropout(0.3)(x2) +x2 = Dense(500,activation="relu")(x2) +x2 = Dropout(0.3)(x2) + +#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D) + +output_lon = Dense(1,activation="sigmoid")(x1) +output_lat = Dense(1,activation="sigmoid")(x2) + +output_coord = concatenate([output_lon,output_lat],name="output_coord") + +##### +model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 + +model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)}) + +model.summary() +############################################################################################# +################################# TRAINING LAUNCH ########################################### +############################################################################################# + +checkpoint = 
ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1, + save_best_only=True, mode='auto', period=1) + +epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") + + +history = model.fit_generator(generator=d_train, + validation_data=d_test, + verbose=True, + epochs=EPOCHS, + callbacks=[checkpoint,epoch_timer]) + + +hist_df = pd.DataFrame(history.history) +hist_df.to_csv(HISTORY_FN) + +model.save(MODEL_OUTPUT_FN) + +#Â Erase Model Checkpoint file +if os.path.exists(MODEL_OUTPUT_FN + ".part"): + os.remove(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/desamb_eval.py b/desamb_eval.py index ab90a480f8350e7b55226a513946091fa758a293..34f04ad63ff99ef6ec0a9c9611fd1368d77a670e 100644 --- a/desamb_eval.py +++ b/desamb_eval.py @@ -52,11 +52,15 @@ prefixes = [x.rstrip(".h5") for x in glob(args.models_directory+"/*.h5")] final_output = [] for prefix in prefixes: - df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index") - data = json.load(open(prefix+".json")) - data["acccuracy@100km"] = (df.dist<100).sum()/len(df) - data["acccuracy@50km"] = (df.dist<50).sum()/len(df) - data["acccuracy@25km"] = (df.dist<25).sum()/len(df) - final_output.append(data) + try: + df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index") + data = json.load(open(prefix+".json")) + data["acccuracy@100km"] = (df.dist<100).sum()/len(df) + data["acccuracy@50km"] = (df.dist<50).sum()/len(df) + data["acccuracy@25km"] = (df.dist<25).sum()/len(df) + final_output.append(data) + except: + print("BUMP!") + pd.DataFrame(final_output).to_csv("{0}_RESULT.csv".format(EVAL_DATASET_FN.rstrip(".csv"))) \ No newline at end of file diff --git a/desamb_eval_runs.sh b/desamb_eval_runs.sh index a45dd2c97e8c3f7dd306cc6f5f9f73786f5a6e36..20eb682c6a8e00856a6d20d05100452d913b13d0 100644 --- a/desamb_eval_runs.sh +++ b/desamb_eval_runs.sh @@ -1,3 +1,3 @@ python3 desamb_eval.py -g ../data/geocoding_evaluation/fr_dataset_ambiguity_sample50percent.csv outputs/FR_RESULT -#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/US\ FR\ results -#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/US\ FR\ results +python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/USFR_WORD +python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/USFR_WORD diff --git a/lib/data_generator.py b/lib/data_generator.py index fb871467542858a421f3ef2d33ad644825cae9cb..58cbad4a3bf095e52772cbcea3168e5860d8489d 100644 --- a/lib/data_generator.py +++ b/lib/data_generator.py @@ -178,14 +178,13 @@ class Inclusion(DataSource): class CoOccurrences(DataSource): - def __init__(self, filename, label_encoder,sampling=3,resolution = 1): + def __init__(self, filename, label_encoder,sampling=3,resolution = 256,use_healpix=False): super().__init__("Co-Occurrence data",filename) - self.is_there_healpix = True + self.is_there_healpix = use_healpix # LOAD DATA - try: - self.data_src = pd.read_csv(filename) - except: - self.data_src = pd.read_csv(filename,sep="\t") + + self.data_src = pd.read_csv(filename,sep="\t") + # CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA if not "healpix_{0}".format(resolution) in self.data_src.columns: raise KeyError("healpix_{0} column does not exists ! 
".format(resolution)) @@ -272,7 +271,6 @@ class DataGenerator(keras.utils.Sequence): self.num_classes = class_encoder.get_num_classes() self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix - def __len__(self): 'Denotes the number of batches per epoch' return int(np.floor(self.len / self.batch_size)) @@ -281,7 +279,7 @@ class DataGenerator(keras.utils.Sequence): if self.is_there_healpix and self.only_healpix: return [X[:,0],X[:,1]],y2 - if self.is_there_healpix: + elif self.is_there_healpix: return [X[:,0],X[:,1]],[y,y2] else: return [X[:,0],X[:,1]],y @@ -299,7 +297,6 @@ class DataGenerator(keras.utils.Sequence): self.datasrc_index += 1 self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix - if self.datasrc_index >= len(self.data_src): self.return_(X,y,y2) @@ -332,7 +329,10 @@ def load_embedding(model_fn,dim_vector=100): N = len(model.wv.vocab) M = np.zeros((N,dim_vector)) for i in range(N): - M[i] = model.wv[str(i)] + try: + M[i] = model.wv[str(i)] + except KeyError: + pass return M if __name__ == "__main__": diff --git a/lib/data_generatorv3.py b/lib/data_generatorv3.py new file mode 100644 index 0000000000000000000000000000000000000000..cd17cdba4fafea2ab43f49fa778d88374b66dbe9 --- /dev/null +++ b/lib/data_generatorv3.py @@ -0,0 +1,354 @@ +import os +from gzip import GzipFile + +import keras +from keras.utils import to_categorical +import numpy as np +import pandas as pd + +from .geo import zero_one_encoding + +from helpers import parse_title_wiki,read_geonames +from gensim.models.keyedvectors import KeyedVectors + +from sklearn.preprocessing import LabelEncoder + + +def wc_l(filename,gzip=True): + lc = 0 + if not gzip: + f = open(filename) + if gzip: + f = GzipFile(filename) + while f.readline(): + lc += 1 + f.close() + return lc + +class SamplingProbabilities: + def __init__(self): + self.count = {} + + def get_probs(self,item): + if not item in self.count: + self.count[item] = 0 + self.count[item]+=1 + return 1/self.count[item] + def __call__(self,a): + return self.get_probs(a) + + +class DataSource(object): + def __init__(self,name,input_filename): + self.name = name + assert os.path.exists(input_filename) + self.input_filename = input_filename + self.len = 0 + + self.is_there_healpix = False + + def __next__(self): + raise NotImplementedError() + + def __iter__(self): + return self + + def __len__(self): + return self.len + + def __reset__(self): + raise NotImplementedError() + + def isOver(self): + raise NotImplementedError() + +class Adjacency(DataSource): + def __init__(self,filename,geonames_filename,sampling=3,len_=None,gzip=True): + DataSource.__init__(self,"Adjacency SRC",filename) + + assert os.path.exists(geonames_filename) + self.geonames_data_dict = {row.geonameid:row.name for row in read_geonames(geonames_filename).itertuples()} + + self.gzip = gzip + if not self.gzip: + self.data_src = open(self.input_filename,'rb') + else: + self.data_src = GzipFile(self.input_filename,'rb') + + if len_: + self.len = len_*sampling + else: + self.len = wc_l(filename,gzip=gzip) + + self.data_src.readline() # header line + + self.sampling = sampling + if self.sampling: + self.probs_storage = SamplingProbabilities() + + self.topo = None + self.context_topo_context = [] + self.curr_probs = None + self.lat, self.lon = None, None + + + self.i = 0 + self.is_over = False + + def __next__(self): + if self.i >= len(self.context_topo_context): + line = self.data_src.readline() + if not line: + self.is_over = True + raise StopIteration + line = 
line.decode("utf-8").rstrip("\n") + _,geonameid, adjacent_geoname_id,latitude,longitude = tuple(line.split(",")) + + self.topo = int(geonameid) + self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")] + if self.sampling: + self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context] + self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs) + self.lat, self.lon = float(latitude),float(longitude) + + self.i = 0 + + self.i += 1 + return (self.geonames_data_dict[self.topo], + self.geonames_data_dict[self.context_topo_context[self.i-1]], + self.lat,self.lon) + + def __reset__(self): + if not self.gzip: + self.data_src = open(self.input_filename,'rb') + else: + self.data_src = GzipFile(self.input_filename,'rb') + + self.data_src.readline() # header line + self.is_over = False + + def isOver(self): + return self.is_over + + +class Inclusion(DataSource): + def __init__(self, geonames_filename,hierarchy_filename,mask_ids=None): + super().__init__("Inclusion SRC",hierarchy_filename) + assert os.path.exists(geonames_filename) + self.geonames_data_dict = {row.geonameid:(row.name,row.latitude,row.longitude) for row in read_geonames(geonames_filename).itertuples()} + + self.data_src = pd.read_csv(self.input_filename, + sep="\t", + header=None, + names="parentId,childId,type".split(",") + ).fillna("") + + if mask_ids: + self.data_src = self.data_src[self.data_src.childId.isin(mask_ids)] + self.data_src= self.data_src[self.data_src.childId.isin(self.geonames_data_dict)] + self.data_src= self.data_src[self.data_src.parentId.isin(self.geonames_data_dict)] + + self.data_src = self.data_src["childId parentId".split()].values.tolist() + self.len = len(self.data_src) + + self.i = 0 + + self.is_over = False + + def __next__(self): + if self.i+1 >= self.len: + self.eof = True + raise StopIteration + else: + self.i += 1 + tup_ = tuple(self.data_src[self.i-1]) + return (self.geonames_data_dict[tup_[0]][0], + self.geonames_data_dict[tup_[1]][0], + self.geonames_data_dict[tup_[0]][2], + self.geonames_data_dict[tup_[0]][1]) + + def __reset__(self): + self.i = 0 + self.is_over = False + + def isOver(self): + return (self.i == self.len) + + + + +class CoOccurrences(DataSource): + def __init__(self, filename, label_encoder,sampling=3,resolution = 256,use_healpix=False): + super().__init__("Co-Occurrence data",filename) + self.is_there_healpix = use_healpix + # LOAD DATA + + self.data_src = pd.read_csv(filename,sep="\t") + + # CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA + if not "healpix_{0}".format(resolution) in self.data_src.columns: + raise KeyError("healpix_{0} column does not exists ! 
".format(resolution)) + + # PARSE TOPONYMS + self.data_src["title"] = self.data_src.title.apply(parse_title_wiki) + try: + self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki) + except: + pass + + # LOOP parameter + self.sampling = sampling + if self.sampling: + self.probs_storage = SamplingProbabilities() + + # LOOP INDICES + self.i = 0 + self.j = 0 + self.is_over = False + self.len = len(self.data_src)*(self.sampling-1) + + + # BUFFER VARIABLE + self.topo = None + self.context_topo_context = [] + self.curr_probs = None + self.lat, self.lon = None, None + + + self.resolution = resolution + self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist() + + self.class_encoder = label_encoder + self.class_encoder.fit(self.classes) + + self.healpix = None + + def __next__(self): + if self.isOver() or self.i*self.sampling == self.len: + self.is_over = True + raise StopIteration + + if self.j >= len(self.context_topo_context): + line = self.data_src.iloc[self.i] + + self.topo = line.title + self.context_topo_context = [x for x in str(line.interlinks).split("|")] + N = len(self.context_topo_context) + triple = [] + for i in range(N): + if i+1 == N: + break + triple.append((self.context_topo_context[i],self.context_topo_context[i+1])) + + + self.context_topo_context = triple + np.random.shuffle(self.context_topo_context) + self.lat, self.lon = line.latitude,line.longitude + + self.healpix = line["healpix_{0}".format(self.resolution)] + + self.i += 1 + self.j = 0 + + self.j += 1 + return (self.topo, + *self.context_topo_context[self.j-1], + self.lat,self.lon,self.class_encoder.transform([self.healpix])[0]) + + def __reset__(self): + self.i = 0 + self.is_over = False + + def isOver(self): + return self.is_over + +class DataGenerator(keras.utils.Sequence): + 'Generates data for Keras' + def __init__(self,data_sources,ngram_index,class_encoder,**kwargs): + 'Initialization' + self.data_src = data_sources + self.ngram_index = ngram_index + + self.batch_size = kwargs.get("batch_size",1000) + self.only_healpix = kwargs.get("only_healpix",False) + + self.len = sum([len(d) for d in self.data_src]) + self.datasrc_index = 0 + + self.num_classes = class_encoder.get_num_classes() + + self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix + def __len__(self): + 'Denotes the number of batches per epoch' + return int(np.floor(self.len / self.batch_size)) + + def return_(self,X,y,y2=None): + if self.is_there_healpix and self.only_healpix: + return [X[:,0],X[:,1],X[:,2]],y2 + + elif self.is_there_healpix: + return [X[:,0],X[:,1],X[:,2]],[y,y2] + else: + return [X[:,0],X[:,1],X[:,2]],y + + def __getitem__(self, index): + 'Generate one batch of data' + X = np.empty((self.batch_size,3,self.ngram_index.max_len),dtype=np.int32) #Â toponym + y = np.empty((self.batch_size,2),dtype=float) #lat lon coord + + y2=None # For healpix + if self.is_there_healpix: + y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class + + if self.data_src[self.datasrc_index].isOver(): + self.datasrc_index += 1 + self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix + + if self.datasrc_index >= len(self.data_src): + self.return_(X,y,y2) + + for i in range(self.batch_size): + if self.data_src[self.datasrc_index].isOver(): + return self.return_(X,y,y2) + try: + topo, topo_context_1,topo_context_2, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__() + except StopIteration as e: + return self.return_(X,y,y2) + 
+ X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context_1),self.ngram_index.encode(topo_context_2)] + y[i] = [*zero_one_encoding(longitude,latitude)] + if self.is_there_healpix: + y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32' +) + + #y[i] = [longitude,latitude] + return self.return_(X,y,y2) + + def on_epoch_end(self): + 'Updates indexes after each epoch' + [d.__reset__() for d in self.data_src] + self.datasrc_index = 0 + + + +def load_embedding(model_fn,dim_vector=100): + model = KeyedVectors.load(model_fn) + N = len(model.wv.vocab) + M = np.zeros((N,dim_vector)) + for i in range(N): + try: + M[i] = model.wv[str(i)] + except KeyError: + pass + return M + +if __name__ == "__main__": + # All adj nb of line :7955000-1 + from lib.ngram_index import NgramIndex + from tqdm import tqdm + ng = NgramIndex.load("../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json") + c= CoOccurrences("../data/wikipedia/cooccurrence_FR.txt_test.csv",sampling=3) + a = Adjacency("/home/jacques/sample_adjacency.txt",geonames_filename="../data/geonamesData/allCountries.txt",gzip=False,sampling=10) + i= Inclusion(geonames_filename="../data/geonamesData/allCountries.txt",hierarchy_filename="../data/geonamesData/hierarchy.txt") + d= DataGenerator([c,a,i],ng) + for x in tqdm(range(len(d))):d[i] diff --git a/lib/geo.py b/lib/geo.py index 1bd63965db7a0cfdb65cbc1c01db59938364743e..18407051dc3616b160ee4458b00b95e10d071c12 100644 --- a/lib/geo.py +++ b/lib/geo.py @@ -26,9 +26,9 @@ def tf_deg2rad(deg): def latlon2healpix( lat , lon , res ): lat = np.radians(lat) lon = np.radians(lon) - xs = ( np.cos(lat) * np.cos(lon) )# - ys = ( np.cos(lat) * np.sin(lon) )# -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates - zs = ( np.sin(lat) )# + xs = ( np.cos(lat) * np.cos(lon) ) # + ys = ( np.cos(lat) * np.sin(lon) ) # -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates + zs = ( np.sin(lat) ) # return healpy.vec2pix( int(res) , xs , ys , zs ) def haversine_tf(y_true,y_pred): diff --git a/lib/ngram_index.py b/lib/ngram_index.py index b69fcfea40513df3816ec32859c86101f6851440..47a5a70005533a2106c49c3a6f0d0a9da8b7d2b4 100644 --- a/lib/ngram_index.py +++ b/lib/ngram_index.py @@ -75,7 +75,7 @@ class NgramIndex(): """ ngrams = word.lower().replace(" ","$") ngrams = list(self.ngram_gen.split(ngrams)) - ngrams = [ng for ng in ngrams if ng.count("$")<self.size-1] + ngrams = [ng for ng in ngrams if ng.count("$")<2] if not self.loaded: [self.add(ng) for ng in ngrams if not ng in self.ngram_index] return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len) @@ -125,7 +125,8 @@ class NgramIndex(): N = len(self.ngram_index) embedding_matrix = np.zeros((N,dim)) for i in range(N): - embedding_matrix[i] = model.wv[str(i)] + if str(i) in model.wv: + embedding_matrix[i] = model.wv[str(i)] return embedding_matrix def save(self,fn): diff --git a/lib/run.py b/lib/run.py index 5c652b4f492cd7a38486b34ec1e66abf874c0404..f2a7b79bc6cb5534b748326ee0db155057f24c35 100644 --- a/lib/run.py +++ b/lib/run.py @@ -146,7 +146,8 @@ class Run(object): out_proc = subprocess.PIPE if log_filename: - out_proc = open(log_filename,'w') + out_proc = open(log_filename,'a') + print(4) process = subprocess.Popen(self.get_command().split(),stdout=out_proc) _, _ = process.communicate() #Â We don't care of the output (if so, we use the log_filename argument) @@ -209,8 +210,10 @@ class GridSearchModel: log_filename 
: str, optional log filename, by default None """ + i=0 for task in self.tasks: - task.run(log_filename=log_filename) + task.run(log_filename=log_filename+"_"+str(i)) + i+=1 if __name__ == "__main__": diff --git a/lib/torch_generator.py b/lib/torch_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..718d169d92c6003ba223258eb67052f34a1bb13d --- /dev/null +++ b/lib/torch_generator.py @@ -0,0 +1,49 @@ +import torch +from keras.preprocessing.sequence import pad_sequences +import numpy as np + +def chunks(lst, n): + """Yield successive n-sized chunks from lst.""" + for i in range(0, len(lst), n): + yield lst[i:i + n] + +class SentenceDataset(torch.utils.data.Dataset): + 'Characterizes a dataset for PyTorch' + def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32): + 'Initialization' + self.sentences = dataframe["sentence"].values + self.labels = dataframe["label"].values + self.tokenizer = tokenizer + self.max_len = max_len + + self.batch_size = batch_size + a = np.arange(len(dataframe)) + np.random.shuffle(a) + self.batch_tokenization = list(chunks(a,batch_size)) + assert(len(self.batch_tokenization[0])==batch_size) + self.current_batch_id = 0 + self.boundaries = (0,0+batch_size) + self.current_batch_tokenized = self.tokenize(self.current_batch_id) + + def tokenize(self,batch_index): + X = [ self.tokenizer.encode(self.sentences[x],add_special_tokens = True,max_length=512) for x in self.batch_tokenization[batch_index]]# Tokenizer + X = pad_sequences(X, maxlen=self.max_len, dtype="long", value=0, truncating="post", padding="post").tolist() + return X + + def __len__(self): + 'Denotes the total number of samples' + return len(self.sentences) + def __getitem__(self, index): + 'Generates one sample of data' + if not index < self.boundaries[1] or not index >= self.boundaries[0]: + self.current_batch_id = index//self.batch_size + self.current_batch_tokenized = self.tokenize(self.current_batch_id) + self.boundaries= (self.current_batch_id*self.batch_size,self.current_batch_id*self.batch_size + self.batch_size) + # Load data and get label + + index_in_batch = index-self.boundaries[0] + #print(self.boundaries,index_in_batch) + X = self.current_batch_tokenized[index_in_batch] + M = [int(token_id > 0) for token_id in X] # attention mask + y = self.labels[index] + return torch.tensor(np.array(X)),torch.tensor(np.array(M)),torch.tensor(np.array(y)) \ No newline at end of file diff --git a/lib/utils.py b/lib/utils.py index 5af3a90cf702cf25f0635c649ac3abae93a269c7..82531b38dea65bd305a04e37d2f07d3fae0fbd77 100644 --- a/lib/utils.py +++ b/lib/utils.py @@ -3,6 +3,8 @@ import math import argparse import os import json +import time +import datetime # Data Structure import numpy as np @@ -101,6 +103,7 @@ class ConfigurationReader(object): class MetaDataSerializer(object): def __init__(self, + model_name, dataset_name, rel_code, cooc_sample_size, @@ -113,6 +116,7 @@ class MetaDataSerializer(object): index_fn, keras_model_fn, train_test_history_fn): + self.model_name = model_name self.dataset_name = dataset_name self.rel_code = rel_code self.cooc_sample_size = cooc_sample_size @@ -128,6 +132,7 @@ class MetaDataSerializer(object): def save(self,fn): json.dump({ + "model_name":self.model_name, "dataset_name" : self.dataset_name, "rel_code" : self.rel_code, "cooc_sample_size" : self.cooc_sample_size, @@ -193,4 +198,23 @@ class Chronometer: duration = time.time() - self.__task_begin_timestamp[task_name] del self.__task_begin_timestamp[task_name] - return duration \ No newline at 
end of file + return duration + + + # Function to calculate the accuracy of our predictions vs labels +def flat_accuracy(preds, labels): + pred_flat = np.argmax(preds, axis=1).flatten() + labels_flat = labels.flatten() + return np.sum(pred_flat == labels_flat) / len(labels_flat) + + + +def format_time(elapsed): + ''' + Takes a time in seconds and returns a string hh:mm:ss + ''' + # Round to the nearest second. + elapsed_rounded = int(round((elapsed))) + + # Format as hh:mm:ss + return str(datetime.timedelta(seconds=elapsed_rounded)) \ No newline at end of file diff --git a/lib/word_index.py b/lib/word_index.py new file mode 100644 index 0000000000000000000000000000000000000000..974a8abb05f86c5c465ff87bd8c2a4f82e10dacf --- /dev/null +++ b/lib/word_index.py @@ -0,0 +1,180 @@ +import json + +import numpy as np + +from ngram import NGram + +# Machine learning +from gensim.models import Word2Vec + +class WordIndex(): + """ + Class used for encoding words in ngram representation + """ + def __init__(self,loaded = False): + """ + Constructor + + Parameters + ---------- + loaded : bool + if loaded from external file + """ + self.word_index = {"":0} + self.index_word = {0:""} + self.cpt = 0 + self.max_len = 0 + + self.loaded = loaded + + def split_and_add(self,word): + """ + Split word in multiple ngram and add each one of them to the index + + Parameters + ---------- + word : str + a word + """ + grams = word.lower().split(" ") + [self.add(subword) for subword in grams ] + self.max_len = max(self.max_len,len(grams)) + + def add(self,subword): + """ + Add a ngram to the index + + Parameters + ---------- + ngram : str + ngram + """ + if not subword in self.word_index: + self.cpt+=1 + self.word_index[subword]=self.cpt + self.index_word[self.cpt]=subword + + + def encode(self,word): + """ + Return a ngram representation of a word + + Parameters + ---------- + word : str + a word + + Returns + ------- + list of int + listfrom shapely.geometry import Point,box + of ngram index + """ + subwords = [w.lower() for w in word.split(" ")] + if not self.loaded: + [self.add(ng) for ng in subwords if not ng in self.word_index] + if self.max_len < len(subwords): + self.max_len = max(self.max_len,len(subwords)) + return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len) + + def complete(self,ngram_encoding,MAX_LEN,filling_item=0): + """ + Complete a ngram encoded version of word with void ngram. It's necessary for neural network. + + Parameters + ---------- + ngram_encoding : list of int + first encoding of a word + MAX_LEN : int + desired length of the encoding + filling_item : int, optional + ngram index you wish to use, by default 0 + + Returns + ------- + list of int + list of ngram index + """ + if self.loaded and len(ngram_encoding) >=MAX_LEN: + return ngram_encoding[:MAX_LEN] + assert len(ngram_encoding) <= MAX_LEN + diff = MAX_LEN - len(ngram_encoding) + ngram_encoding.extend([filling_item]*diff) + return ngram_encoding + + def get_embedding_layer(self,texts,dim=100,**kwargs): + """ + Return an embedding matrix for each ngram using encoded texts. Using gensim.Word2vec model. 
+ + Parameters + ---------- + texts : list of [list of int] + list of encoded word + dim : int, optional + embedding dimension, by default 100 + + Returns + ------- + np.array + embedding matrix + """ + model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs) + N = len(self.word_index) + embedding_matrix = np.zeros((N,dim)) + for i in range(N): + if str(i) in model.wv: + embedding_matrix[i] = model.wv[str(i)] + return embedding_matrix + + def save(self,fn): + """ + + Save the NgramIndex + + Parameters + ---------- + fn : str + output filename + """ + data = { + "word_index": self.word_index, + "cpt_state": self.cpt, + "max_len_state": self.max_len + } + json.dump(data,open(fn,'w')) + + @staticmethod + def load(fn): + """ + + Load a NgramIndex state from a file. + + Parameters + ---------- + fn : str + input filename + + Returns + ------- + NgramIndex + ngram index + + Raises + ------ + KeyError + raised if a required field does not appear in the input file + """ + try: + data = json.load(open(fn)) + except json.JSONDecodeError: + print("Data file must be a JSON") + for key in ["word_index","cpt_state","max_len_state"]: + if not key in data: + raise KeyError("{0} field cannot be found in given file".format(key)) + new_obj = WordIndex(loaded=True) + new_obj.word_index = data["word_index"] + new_obj.index_word = {v:k for k,v in new_obj.word_index.items()} + new_obj.cpt = data["cpt_state"] + new_obj.max_len = data["max_len_state"] + return new_obj + diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py index 6fb4930e11e91b5d25d2d8c2316b5b1d133dbce7..6ed8169cd5549ffbbaa02d4ecc0b58e0b698311e 100644 --- a/predict_toponym_coordinates.py +++ b/predict_toponym_coordinates.py @@ -3,6 +3,7 @@ import os import tensorflow as tf import keras.backend as K from lib.ngram_index import NgramIndex +from lib.word_index import WordIndex import numpy as np from tensorflow.python.keras.backend import set_session @@ -64,7 +65,7 @@ class Geocoder(object): # graph = tf.compat.v1.get_default_graph() # set_session(sess) self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()}) - self.ngram_encoder = NgramIndex.load(ngram_index_file) + self.ngram_encoder = WordIndex.load(ngram_index_file) def get_coord(self,toponym,context_toponym): global sess diff --git a/region_embedding.py b/region_model.py similarity index 100% rename from region_embedding.py rename to region_model.py diff --git a/run_train.py b/run_train.py index 987eaf8d7a9fca12c828756e614ca6983cce70e8..3030694f6f0b5df0589c5509ed7bab17d68cd0e8 100644 --- a/run_train.py +++ b/run_train.py @@ -14,11 +14,11 @@ for rel in rels: c_f = "--wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt" # Init GridsearchModel grid = GridSearchModel(\ - "python3 combination_embeddings.py", + "python3 combination_embeddings_baselines.py", **OrderedDict({ # necessary because some args have to be given in a certain order - "rel":["-i -a",("-i -w "+c_f),"-a -w","-a -i -w"], + "rel":["-w "+c_f,("-i -w "+c_f),"-a -w "+c_f,"-a -i -w "+c_f], # ,"-a -i -w "+c_f ,"-i -a" "-n":[4], - "--ngram-word2vec-iter" :[1], + "--ngram-word2vec-iter" :[10], "-e":[100], "geoname_fn":"../data/geonamesData/US_FR.txt".split(), "hierarchy_fn":"../data/geonamesData/hierarchy.txt".split() @@ -27,6 +27,6 @@ grid = GridSearchModel(\ print("########### THE FOLLOWING COMMAND(S) WILL BE EXECUTED ###########" ) [print(task.get_command()) for task in grid.tasks] 
print("#################################################################") -grid.run("outputs/log_RUN_TEXAS_IDFrance.txt") +grid.run("outputs/log_{0}".format("FR_baseline")) #["-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt","-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -a","-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -i"] \ No newline at end of file diff --git a/scripts/embeddingngram.py b/scripts/embeddingngram.py index 000e8f21e9ebe0c4e32db7ce06d72caba86c6883..ac5ec02210db62d51d351c071c158671ad935703 100644 --- a/scripts/embeddingngram.py +++ b/scripts/embeddingngram.py @@ -34,12 +34,13 @@ p= [wiki_labels.extend(x.split("|")) for x in df_cooc["interlinks"].values] del df_geo del df_cooc +N = 5 -ng = NgramIndex(4) +ng = NgramIndex(N) p = [ng.split_and_add(x) for x in tqdm(geonames_label)] p = [ng.split_and_add(x) for x in tqdm(wiki_labels)] -ng.save("4gramWiki+Geonames_index.json") +ng.save("{0}gramWiki+Geonames_index.json".format(N)) geonames_label.extend(wiki_labels) @@ -51,8 +52,8 @@ class MySentences(object): for w in self.texts: yield [str(x)for x in ng.encode(w)] -model = Word2Vec(MySentences(geonames_label), size=100, window=5, min_count=1, workers=4) -model.save("embedding4gramWiki+Geonames.bin") +model = Word2Vec(MySentences(geonames_label), size=100, window=5, min_count=1, workers=4,sg=1) +model.save("embedding{0}gramWiki+Geonames.bin".format(5)) diff --git a/svm_predict_hp.py b/svm_predict_hp.py new file mode 100644 index 0000000000000000000000000000000000000000..c836859e186a468189b297a1a185e2f1e387880f --- /dev/null +++ b/svm_predict_hp.py @@ -0,0 +1,36 @@ +import numpy as np + +from joblib import dump, load +from tensorflow.keras.utils import to_categorical + +from lib.geo import latlon2healpix +from lib.ngram_index import NgramIndex + + +def parse_bow(x,index): + return np.sum(to_categorical(x,num_classes=index.cpt+1),axis=0) + +def is_in(lat,lon,hp_predicted,hp_nside): + hp_truth = latlon2healpix(lat,lon,hp_nside) + return hp_truth == hp_predicted + +class HealpixGeocoder(object): + + def __init__(self,model_fn,ngram_index_filename): + self.model = load(model_fn) + self.ng_index = NgramIndex.load(ngram_index_filename) + + def geocode(self,phrase1,phrase2): + if not phrase1 or not phrase2: + return None + vec = parse_bow(np.array(self.ng_index.encode(phrase1)),self.ng_index)+\ + parse_bow(np.array(self.ng_index.encode(phrase2)),self.ng_index) + return self.model.predict([vec])[0] + + def geocode_multi(self,phrases1,phrases2): + vecs = np.array([ parse_bow(np.array(self.ng_index.encode(ph)),self.ng_index) for ph in phrases1 if ph]) + vecs += np.array([ parse_bow(np.array(self.ng_index.encode(ph)),self.ng_index) for ph in phrases2 if ph]) + return self.model.predict(vecs) + +hp = HealpixGeocoder("SVMLINEAR_US_FR_AC.bin","outputs/US_FR.txt_100_4_0.002__A_C_index") +hp.geocode("paris","montpellier") diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py index 750803b53419142d1310970468f4b576a741c8ef..edacbaa85ca2aff468114ecfe7a1aaae1a9fadcd 100644 --- a/train_test_split_cooccurrence_data.py +++ b/train_test_split_cooccurrence_data.py @@ -1,6 +1,7 @@ import argparse import pandas as pd +import numpy as np import geopandas as gpd import logging @@ -13,12 +14,13 @@ logging.basicConfig( from sklearn.model_selection import train_test_split from shapely.geometry import Point -from lib.geo import Grid +from lib.geo import Grid,latlon2healpix from tqdm import tqdm parser = argparse.ArgumentParser() 
parser.add_argument("cooccurrence_file") +parser.add_argument("-s",action="store_true") args = parser.parse_args()#("data/wikipedia/cooccurrence_FR.txt".split())#("data/geonamesData/FR.txt".split()) @@ -29,31 +31,31 @@ COOC_FN = args.cooccurrence_file logging.info("Load Cooc DATA data...") cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("") -cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) -cooc_data = gpd.GeoDataFrame(cooc_data) +# cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1) +# cooc_data = gpd.GeoDataFrame(cooc_data) logging.info("Cooc data loaded!") -#Â World Shape bounds -world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) -world["nn"] = 1 -dissolved = world.dissolve(by="nn").iloc[0].geometry +# #Â World Shape bounds +# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) +# world["nn"] = 1 +# dissolved = world.dissolve(by="nn").iloc[0].geometry -#Creating Grid -logging.info("Initializing Grid (360,180)...") -g = Grid(*dissolved.bounds,[360,180]) -logging.info("Fit Data to the Grid...") -g.fit_data(cooc_data) -logging.info("Placing place into the grid...") -[g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))] +# #Creating Grid +# logging.info("Initializing Grid (360,180)...") +# g = Grid(*dissolved.bounds,[360,180]) +# logging.info("Fit Data to the Grid...") +# g.fit_data(cooc_data) +# logging.info("Placing place into the grid...") +# [g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))] -#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME -logging.info("Associate a cell number to each place in the Geoname Dataframe") -def foo(g,id_): - for ix,cell in enumerate(g.cells): - if id_ in cell.list_object: - return ix +# #ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME +# logging.info("Associate a cell number to each place in the Geoname Dataframe") +# def foo(g,id_): +# for ix,cell in enumerate(g.cells): +# if id_ in cell.list_object: +# return ix -cooc_data["cat"] = cooc_data.title.apply(lambda x:foo(g,x)) +cooc_data["cat"] = cooc_data.apply(lambda x:latlon2healpix(x.latitude,x.longitude,64),axis=1) # TRAIN AND TEST SPLIT logging.info("Split Between Train and Test") @@ -66,20 +68,27 @@ while 1: break i+=1 -for i in range(i+1,len(g.cells)): +for i in np.unique(cooc_data.cat.values): try: - x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i]) + if not args.s: + x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i]) + else: + x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i].sample(frac=0.1)) + X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test)) except Exception as e: print(e) #print("Error",len(filtered[filtered.cat == i])) -del X_train["geometry"] -del X_train["nn"] +# del X_train["geometry"] +# del X_train["nn"] del X_train["cat"] del X_test["cat"] -del X_test["geometry"] -del X_test["nn"] +# del X_test["geometry"] +# del X_test["nn"] # SAVING THE DATA logging.info("Saving Output !") -X_train.to_csv(COOC_FN+"_train.csv") -X_test.to_csv(COOC_FN+"_test.csv") +suffix ="" +if args.s: + suffix = "10per" +X_train.to_csv(COOC_FN+suffix+"_train.csv") +X_test.to_csv(COOC_FN+suffix+"_test.csv")