Commit 10945496 authored by Jacques Fize

UPDATE, DEBUG, AND CLEANING of the repo

parent 8b047924
Showing with 2026 additions and 64 deletions
@@ -152,3 +152,6 @@ log*
temp*
subset*
time*
/data*
\ No newline at end of file
bert.py 0 → 100644
# REQUIREMENTS: pandas keras tensorflow torch numpy tqdm transformers
"""
Based on the article: https://mccormickml.com/2019/07/22/BERT-fine-tuning/
by Chris McCormick
"""
import os
import sys
import time
import random
import argparse
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from tqdm import tqdm
tqdm.pandas()
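# tqdm.pandas() registers DataFrame.progress_apply, used by the (currently commented-out) tokenization block below.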
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round((elapsed)))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
parser = argparse.ArgumentParser()
parser.add_argument("train" ,help="TSV with two columns : 'sentence' and 'label'")
parser.add_argument("test",help="TSV with two columns : 'sentence' and 'label'")
parser.add_argument("outputdir",help="TSV with two columns : 'sentence' and 'label'")
parser.add_argument("-e","--epochs",type=int,default=5)
parser.add_argument("-b","--batch_size",default=32,type=int)
args = parser.parse_args()#("-b 32 -e 10 cooc_adj_bert_train.csv cooc_adj_bert_test.csv output_bert_allcooc_adjsampling3radius20km_batch32_epoch10".split())
if not os.path.exists(args.train) or not os.path.exists(args.test):
raise FileNotFoundError("Train or Test filepath is incorrect !")
# Number of training epochs (authors recommend between 2 and 4)
epochs = args.epochs
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
batch_size = args.batch_size
# OUTPUT DIR
output_dir = args.outputdir
if not os.path.exists(args.outputdir):
raise FileNotFoundError("{0} directory does not exist!".format(args.outputdir))
if not os.path.isdir(args.outputdir):
raise NotADirectoryError("{0} is not a directory".format(args.outputdir))
df_train = pd.read_csv(args.train, sep="\t")
df_test = pd.read_csv(args.test, sep="\t")
# Get the GPU device name.
device_name = tf.test.gpu_device_name()
# The device name should look like the following:
if device_name == '/device:GPU:0':
print('Found GPU at: {}'.format(device_name))
else:
raise SystemError('GPU device not found')
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
# Load the BERT tokenizer.
print('Loading {0} tokenizer...'.format("bert-base-multilingual-cased"))
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',do_lower_case=False)
"""
print("Tokenize Input Data")
df_train["input_ids"] = df_train.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True))
df_test["input_ids"] = df_test.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True))
# Set the maximum sequence length.
# took the size of the largest sentence
MAX_LEN = df_train.input_ids.apply(len).max()+2
print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))
df_train["input_ids"] = pad_sequences(df_train.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist()
df_test["input_ids"] = pad_sequences(df_test.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist()
df_train["attention_mask"] = df_train.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x] )
df_test["attention_mask"] = df_test.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x])
train_inputs = torch.tensor(np.array(df_train.input_ids.values.tolist()))
del df_train["input_ids"]
validation_inputs = torch.tensor(np.array(df_test.input_ids.values.tolist()))
del df_test["input_ids"]
train_labels = torch.tensor(np.array(df_train.label.values.tolist()))
del df_train["label"]
validation_labels = torch.tensor(np.array(df_test.label.values.tolist()))
del df_test["label"]
train_masks = torch.tensor(np.array(df_train.attention_mask.values.tolist()))
del df_train["attention_mask"]
validation_masks = torch.tensor(np.array(df_test.attention_mask.values.tolist()))
del df_test["attention_mask"]
"""
from lib.torch_generator import SentenceDataset
# Create the DataLoader for training set.
train_data = SentenceDataset(df_train,tokenizer,batch_size=batch_size)
#train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size)#,sampler=train_sampler,)
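# NOTE: with the RandomSampler commented out, batches are served in dataset order; any shuffling is presumably handled inside SentenceDataset (lib.torch_generator), which is not shown here.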
"""
del train_inputs
del train_masks
del train_labels
"""
# Create the DataLoader for validation set.
validation_data = SentenceDataset(df_test,tokenizer,batch_size=batch_size)
#validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size)#, sampler=validation_sampler)
"""
del validation_inputs
del validation_masks
del validation_labels
"""
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
"bert-base-multilingual-cased", # Use the 12-layer BERT model, with an uncased vocab.
num_labels = max(df_test.label.max(),df_train.label.max())+1, # The number of output labels--2 for binary classification.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Tell pytorch to run this model on the GPU.
model.cuda()
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_data) * epochs
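# NOTE: this assumes len(train_data) already counts batches (SentenceDataset receives batch_size); if it counts samples instead, total_steps should be len(train_dataloader) * epochs for the scheduler below.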
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Store the average loss after each epoch so I can plot them.
loss_values = []
history = []
# For each epoch...
for epoch_i in range(0, epochs):
epoch_data={}
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
# Measure how long the training epoch takes.
t0 = time.time()
# Reset the total loss for this epoch.
total_loss = 0
# Put the model into training mode.
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
# Progress update every 100 batches.
if step % 100 == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t0)
# Report progress. (sys.stdout is used to avoid unnecessary newlines)
sys.stdout.write('\r Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# Unpack this training batch from the dataloader.
#
# As I unpack the batch, I'll also copy each tensor to the GPU using the
# `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
# Always clear any previously calculated gradients before performing a
# backward pass. PyTorch doesn't do this automatically because
# accumulating the gradients is "convenient while training RNNs".
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
model.zero_grad()
# Perform a forward pass (evaluate the model on this training batch).
# This will return the loss (rather than the model output) because I
# have provided the `labels`.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
# The call to `model` always returns a tuple, so I need to pull the
# loss value out of the tuple.
loss = outputs[0]
# Accumulate the training loss over all of the batches so that I can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
total_loss += loss.item()
# Perform a backward pass to calculate the gradients.
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
# Update the learning rate.
scheduler.step()
# Calculate the average loss over the training data.
avg_train_loss = total_loss / len(train_dataloader)
# Store the loss value for plotting the learning curve.
loss_values.append(avg_train_loss)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epoch took: {:}".format(format_time(time.time() - t0)))
epoch_data["loss"]=avg_train_loss
epoch_data["epoch_duration"] = time.time() - t0
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure the performance on
# the validation set.
print("")
print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
# Tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch
for batch in validation_dataloader:
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from dataloader
b_input_ids, b_input_mask, b_labels = batch
# Telling the model not to compute or store gradients, saving memory and
# speeding up validation
with torch.no_grad():
# Forward pass, calculate logit predictions.
# This will return the logits rather than the loss because we have
# not provided labels.
# token_type_ids is the same as the "segment ids", which
# differentiates sentence 1 and 2 in 2-sentence tasks.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask)
# Get the "logits" output by the model. The "logits" are the output
# values prior to applying an activation function like the softmax.
logits = outputs[0]
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences.
tmp_eval_accuracy = flat_accuracy(logits, label_ids)
# Accumulate the total accuracy.
eval_accuracy += tmp_eval_accuracy
# Track the number of batches
nb_eval_steps += 1
# Report the final accuracy for this validation run.
print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print(" Validation took: {:}".format(format_time(time.time() - t0)))
epoch_data["accuracy"] = eval_accuracy/nb_eval_steps
epoch_data["validation_duration"] = time.time() - t0
history.append(epoch_data)
print("")
print("Training complete!")
print("Save History")
pd.DataFrame(history).to_csv(output_dir+"/history_bert.csv",sep="\t")
# Create output directory if needed
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print("Saving model to %s" % output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
\ No newline at end of file
@@ -47,7 +47,7 @@ def get_new_ids(cooc_data,id_first_value):
Returns
-------
dict
new ids for each toponym
new ids for each toponym
"""
topo_id = {}
id_ = id_first_value
@@ -76,12 +76,13 @@ args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
MODEL_NAME = "Bi-LSTM_NGRAM"
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = 1 #args.ngram_word2vec_iter
WORDVEC_ITER = args.ngram_word2vec_iter
EMBEDDING_DIM = 256
#################################################
########## FILENAME VARIABLE ####################
@@ -121,6 +122,7 @@ HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
from lib.utils import MetaDataSerializer
meta_data = MetaDataSerializer(
MODEL_NAME,
DATASET_NAME,
REL_CODE,
COOC_SAMPLING_NUMBER,
@@ -209,6 +211,8 @@ if args.wikipedia_cooc:
cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
if not "title" in train_cooc_indices:
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
......
# Base module
import re
import os
import json
# Structure
import pandas as pd
import numpy as np
import geopandas as gpd
import tensorflow as tf
# Geometry
from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader
from lib.metrics import lat_accuracy,lon_accuracy
# Logging
from tqdm import tqdm
import logging
from helpers import parse_title_wiki,EpochTimer
logging.getLogger('gensim').setLevel(logging.WARNING)
def get_new_ids(cooc_data,id_first_value):
"""
Return new ids from cooccurrence data
Parameters
----------
cooc_data : pd.DataFrame
cooccurrence data
id_first_value : int
id beginning value
Returns
-------
dict
new ids for each toponym
"""
topo_id = {}
id_ = id_first_value
for title in cooc_data.title.values:
if not title in topo_id:
id_+=1
topo_id[id_]=title
for interlinks in cooc_data.interlinks.values:
for interlink in interlinks.split("|"):
if not interlink in topo_id:
id_+=1
topo_id[id_]=interlink
return topo_id
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
.parse_args()#("-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt -n 4 --ngram-word2vec-iter 10 -e 100 ../data/geonamesData/US_FR.txt ../data/geonamesData/hierarchy.txt".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
MODEL_NAME = "BASELINE"
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = args.ngram_word2vec_iter
EMBEDDING_DIM = 256
#################################################
########## FILENAME VARIABLE ####################
#################################################
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
GEONAME_FN,
ITER_ADJACENCY,
REGION_SUFFIX_FN)
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
REGION_SUFFIX_FN)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
from lib.utils import MetaDataSerializer
meta_data = MetaDataSerializer(
MODEL_NAME,
DATASET_NAME,
REL_CODE,
COOC_SAMPLING_NUMBER,
ITER_ADJACENCY,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
print(REL_CODE)
#############################################################################################
################################# LOAD DATA #################################################
#############################################################################################
# LOAD Geonames DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
logging.info("Geonames data loaded!")
# SELECT ENTRIES whose feature class is A or P (areas and populated places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
#CLEAR RAM
del geoname_data
# IF REGION
if args.admin_code_1 != "None":
filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
#############################################################################################
################################# RETRIEVE RELATIONSHIPS ####################################
#############################################################################################
# INITIALIZE RELATION STORE
rel_store = []
# Retrieve adjacency relationships
if args.adjacency:
logging.info("Retrieve adjacency relationships ! ")
if not os.path.exists(ADJACENCY_REL_FILENAME):
bounds = get_bounds(filtered) # Required to get adjacency relationships
rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
else:
logging.info("Open and load data from previous computation!")
rel_store=json.load(open(ADJACENCY_REL_FILENAME))
logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
# Retrieve inclusion relationships
if args.inclusion:
logging.info("Retrieve inclusion relationships ! ")
cpt_rel = len(rel_store)
rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
if args.wikipedia_cooc:
logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
cooc_data = pd.read_csv(COOC_FN,sep="\t")
cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
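# Invert the mapping: wikipediatitle_id gives, for each Wikipedia title, the new geonameid-style identifier assigned by get_new_ids.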
title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
if not "title" in train_cooc_indices:
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
logging.info("Merged with Geonames data !")
# EXTRACT rel
logging.info("Extracting cooccurrence relationships")
cpt=0
for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
cpt+=1
rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
logging.info("Extract {0} cooccurrence relationships !".format(cpt))
# STORE ID to name
geoname2name = dict(filtered["geonameid name".split()].values)
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
index = NgramIndex(NGRAM_SIZE)
# Identify all ngram available
filtered.name.apply(lambda x : index.split_and_add(x))
if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
if args.wikipedia_cooc:
geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
# SAVE THE INDEX TO REUSE THE MODEL
index.save(INDEX_FN)
logging.info("Done !")
#############################################################################################
################################# ENCODE COORDINATES ########################################
#############################################################################################
from lib.geo import latlon2healpix
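# latlon2healpix (lib.geo, not shown here) is assumed to map a (latitude, longitude) pair to its HEALPix cell index at nside=128, turning coordinate prediction into a classification target for the scikit-learn baselines below.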
# Encode each geonames entry coordinates
geoname_vec = {row.geonameid : latlon2healpix(row.latitude,row.longitude,128) for row in filtered.itertuples()}
# CLEAR RAM
del filtered
EMBEDDING_DIM = 256
num_words = len(index.index_ngram) # necessary for the embedding matrix
logging.info("Preparing Input and Output data...")
#############################################################################################
################################# BUILD TRAIN/TEST DATASETS #################################
#############################################################################################
X_train,y_train = [],[]
X_test,y_test = [],[]
from joblib import Parallel,delayed
from tensorflow.keras.utils import to_categorical
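# parse_bow builds a bag-of-ngrams vector for one toponym pair: each ngram index is one-hot encoded (vocabulary size = index.cpt + 1) and the one-hot rows are summed.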
def parse_bow(x):
return np.sum(to_categorical(x,num_classes=index.cpt+1),axis=0)
for couple in rel_store:
geonameId_1,geonameId_2 = couple[0],couple[1]
if not geonameId_1 in geoname2encodedname:
continue
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
X_train.append(top1 + top2)
y_train.append(geoname_vec[geonameId_1])
else:
X_test.append(top1 + top2)
y_test.append(geoname_vec[geonameId_1])
# NUMPYZE inputs and output lists
X_train = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_train))
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_test))
X_test = np.array(X_test)
y_test = np.array(y_test)
logging.info("Data prepared !")
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
from scipy.sparse import csr_matrix
from sklearn import svm
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
X_train = csr_matrix(X_train)
X_test = csr_matrix(X_test)
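# Convert the dense bag-of-ngrams matrices to sparse CSR format, presumably to keep memory usage manageable; they are densified again only where the Naive Bayes classifiers require it.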
print(REL_CODE)
oupt = open("log_baseline_US_FR_{0}.txt".format(REL_CODE),'a')
oupt.write("------")
from joblib import dump
import sys
f=True
for kernel in ["rbf","linear","poly"]:
clf = svm.SVC(kernel=kernel)
clf.fit(X_train,y_train)
if kernel =="linear" and f:
dump(clf,"SVMLINEAR_US_FR_{0}.bin".format(REL_CODE))
sys.exit()
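# NOTE: as written, the script exits right after fitting and dumping the linear-kernel SVM, so its evaluation, the poly kernel, and the Naive Bayes / tree / random forest runs below are never reached.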
y_pred = clf.predict(X_test)
oupt.write("Results for : "+"SVM with the kernel "+kernel)
oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"]))
oupt.flush()
for alg in (GaussianNB,MultinomialNB):
clf = alg()
clf.fit(X_train.toarray(),y_train)
y_pred = clf.predict(X_test.toarray())
oupt.write("Results for : "+"NaiveBayes with the alg "+alg.__name__)
oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])+"\n")
oupt.flush()
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
oupt.write("Results for : "+"Decision Tree classifier")
oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"]))
oupt.flush()
clf = RandomForestClassifier(max_depth=8, random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
oupt.write("Results for : "+"Random Forest classifier")
oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"]))
oupt.flush()
oupt.close()
\ No newline at end of file
# Base module
import re
import os
import json
# Structure
import pandas as pd
import numpy as np
import geopandas as gpd
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
from keras.models import Model
from keras import backend as K
from keras.callbacks import ModelCheckpoint
import tensorflow as tf
# Geometry
from shapely.geometry import Point
# Custom module
from helpers import read_geonames
from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
from lib.ngram_index import NgramIndex
from lib.word_index import WordIndex
from lib.utils import ConfigurationReader
from lib.metrics import lat_accuracy,lon_accuracy
# Logging
from tqdm import tqdm
import logging
from helpers import parse_title_wiki,EpochTimer
logging.getLogger('gensim').setLevel(logging.WARNING)
def get_new_ids(cooc_data,id_first_value):
"""
Return new ids from cooccurrence data
Parameters
----------
cooc_data : pd.DataFrame
cooccurrence data
id_first_value : int
id beginning value
Returns
-------
dict
new ids for each toponym
"""
topo_id = {}
id_ = id_first_value
for title in cooc_data.title.values:
if not title in topo_id:
id_+=1
topo_id[id_]=title
for interlinks in cooc_data.interlinks.values:
for interlink in interlinks.split("|"):
if not interlink in topo_id:
id_+=1
topo_id[id_]=interlink
return topo_id
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
.parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
MODEL_NAME = "Bi-LSTM_WORD"
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = args.ngram_word2vec_iter
EMBEDDING_DIM = 256
#################################################
########## FILENAME VARIABLE ####################
#################################################
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
GEONAME_FN,
ITER_ADJACENCY,
REGION_SUFFIX_FN)
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}_{5}".format(MODEL_NAME,
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
REGION_SUFFIX_FN)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
from lib.utils import MetaDataSerializer
meta_data = MetaDataSerializer(
MODEL_NAME,
DATASET_NAME,
REL_CODE,
COOC_SAMPLING_NUMBER,
ITER_ADJACENCY,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
#############################################################################################
################################# LOAD DATA #################################################
#############################################################################################
# LOAD Geonames DATA
logging.info("Load Geonames data...")
geoname_data = read_geonames(GEONAME_FN).fillna("")
train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
logging.info("Geonames data loaded!")
# SELECT ENTRIES whose feature class is A or P (areas and populated places)
filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
#CLEAR RAM
del geoname_data
# IF REGION
if args.admin_code_1 != "None":
filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
#############################################################################################
################################# RETRIEVE RELATIONSHIPS ####################################
#############################################################################################
# INITIALIZE RELATION STORE
rel_store = []
# Retrieve adjacency relationships
if args.adjacency:
logging.info("Retrieve adjacency relationships ! ")
if not os.path.exists(ADJACENCY_REL_FILENAME):
bounds = get_bounds(filtered) # Required to get adjacency relationships
rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
else:
logging.info("Open and load data from previous computation!")
rel_store=json.load(open(ADJACENCY_REL_FILENAME))
logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
# Retrieve inclusion relationships
if args.inclusion:
logging.info("Retrieve inclusion relationships ! ")
cpt_rel = len(rel_store)
rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
if args.wikipedia_cooc:
logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
cooc_data = pd.read_csv(COOC_FN,sep="\t")
cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
if not "title" in train_cooc_indices:
train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
logging.info("Merged with Geonames data !")
# EXTRACT rel
logging.info("Extracting cooccurrence relationships")
cpt=0
for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
cpt+=1
rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
logging.info("Extract {0} cooccurrence relationships !".format(cpt))
# STORE ID to name
geoname2name = dict(filtered["geonameid name".split()].values)
# ENCODING NAME USING N-GRAM SPLITTING
logging.info("Encoding toponyms to ngram...")
index = WordIndex()
# Identify all ngram available
filtered.name.apply(lambda x : index.split_and_add(x))
if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
if args.wikipedia_cooc:
geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
# SAVE THE INDEX TO REUSE THE MODEL
index.save(INDEX_FN)
logging.info("Done !")
#############################################################################################
################################# ENCODE COORDINATES ########################################
#############################################################################################
# Encode each geonames entry coordinates
geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}
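# zero_one_encoding (lib.geo, not shown here) is assumed to rescale (longitude, latitude) into [0, 1] so the sigmoid output layers of the model below can regress them directly.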
# CLEAR RAM
del filtered
EMBEDDING_DIM = 256
num_words = len(index.index_word) # necessary for the embedding matrix
logging.info("Preparing Input and Output data...")
#############################################################################################
################################# BUILD TRAIN/TEST DATASETS #################################
#############################################################################################
X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
for couple in rel_store:
geonameId_1,geonameId_2 = couple[0],couple[1]
if not geonameId_1 in geoname2encodedname:
continue
top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
X_1_train.append(top1)
X_2_train.append(top2)
y_lon_train.append(geoname_vec[geonameId_1][0])
y_lat_train.append(geoname_vec[geonameId_1][1])
else:
X_1_test.append(top1)
X_2_test.append(top2)
y_lon_test.append(geoname_vec[geonameId_1][0])
y_lat_test.append(geoname_vec[geonameId_1][1])
# NUMPYZE inputs and output lists
X_1_train = np.array(X_1_train)
X_2_train = np.array(X_2_train)
y_lat_train = np.array(y_lat_train)
y_lon_train = np.array(y_lon_train)
X_1_test = np.array(X_1_test)
X_2_test = np.array(X_2_test)
y_lat_test = np.array(y_lat_test)
y_lon_test = np.array(y_lon_test)
logging.info("Data prepared !")
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
logging.info("Generating N-GRAM Embedding...")
embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
logging.info("Embedding generated !")
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learns on a permutation of the input toponyms
x1 = Bidirectional(LSTM(98))(x1)
x2 = Bidirectional(LSTM(98))(x2)
x = concatenate([x1,x2])#,x3])
x1 = Dense(500,activation="relu")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
# x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
# x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
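# Two sigmoid heads regress the 0-1 encoded longitude and latitude with MSE; lat_accuracy/lon_accuracy (lib.metrics, not shown here) are assumed to report the share of predictions within the configured tolerance.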
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit(x=[X_1_train,X_2_train],
y=[y_lon_train,y_lat_train],
verbose=True, batch_size=100,
epochs=EPOCHS,
validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]),
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
@@ -3,9 +3,10 @@ import os
# Structure
import pandas as pd
import numpy as np
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
@@ -140,7 +141,7 @@ num_words = len(index.index_ngram)
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])
#############################################################################################
################################# MODEL DEFINITION ##########################################
@@ -153,49 +154,33 @@ input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
x1 = Dropout(0.1)(embedding_layer(input_1))
x2 = Dropout(0.1)(embedding_layer(input_2))
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learns on a permutation of the input toponyms
biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
biLSTM = Bidirectional(LSTM(64,activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x2,x1])#,x3])
aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
x1 = Dense(5000,
activation="relu",
kernel_regularizer=regularizers.l2(0.01)
)(x)
x1 = Dropout(0.3)(x1)
x1 = Dense(5000,
activation="relu",
kernel_regularizer=regularizers.l2(0.01)
)(x1)
x1 = Dropout(0.3)(x1)
x2 = Dense(5000,
activation="relu",
kernel_regularizer=regularizers.l2(0.01)
)(x)
x2 = Dropout(0.3)(x2)
x2 = Dense(5000,
activation="relu",
kernel_regularizer=regularizers.l2(0.01)
)(x2)
x2 = Dropout(0.3)(x2)
x1 = Dense(1000,activation="pentanh")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(1000,activation="pentanh")(x1)
# x1 = Dropout(0.3)(x1)
x2 = Dense(1000,activation="pentanh")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(1000,activation="pentanh")(x2)
# x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
output = concatenate([output_lon,output_lat],name="output_layer")
model = Model(inputs = [input_1,input_2], outputs = [output,aux_layer])#input_3
model.compile(loss={"output_layer":haversine_tf_1circle,"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy","output_layer":accuracy_k(ACCURACY_TOLERANCE)})
model = Model(inputs = [input_1,input_2], outputs = output)#input_3
model.compile(loss={"output_layer":haversine_tf_1circle}, optimizer='adam',metrics={"output_layer":accuracy_k(ACCURACY_TOLERANCE)})
#############################################################################################
################################# TRAINING LAUNCH ###########################################
......
# Base module
import os
# Structure
import pandas as pd
import numpy as np
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
from lib.metrics import lat_accuracy,lon_accuracy
from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
meta_data = MetaDataSerializer(
DATASET_NAME,
REL_CODE,
COOC_SAMPLING,
ADJACENCY_SAMPLING,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)
train_src = []
test_src = []
class_encoder = LabelEncoder()
if args.wikipedia_cooc:
train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4,use_healpix=False))
test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4,use_healpix=False))
if args.adjacency:
a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
train_src.append(a_train)
test_src.append(a_test)
if args.inclusion:
i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
train_src.append(i_train)
test_src.append(i_test)
#Adjacency
print("Number of classes:",class_encoder.get_num_classes())
d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
num_words = len(index.index_ngram)
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers
####
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learns on a permutation of the input toponyms
biLSTM = Bidirectional(GRU(128,activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x1,x2])#,x3])
x1 = Dense(500,activation="relu")(x)
x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x)
x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
x2 = Dropout(0.3)(x2)
#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)
output_lon = Dense(1,activation="sigmoid")(x1)
output_lat = Dense(1,activation="sigmoid")(x2)
output_coord = concatenate([output_lon,output_lat],name="output_coord")
#####
model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
model.summary()
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=d_train,
validation_data=d_test,
verbose=True,
epochs=EPOCHS,
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
@@ -52,11 +52,15 @@ prefixes = [x.rstrip(".h5") for x in glob(args.models_directory+"/*.h5")]
final_output = []
for prefix in prefixes:
df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
data = json.load(open(prefix+".json"))
data["acccuracy@100km"] = (df.dist<100).sum()/len(df)
data["acccuracy@50km"] = (df.dist<50).sum()/len(df)
data["acccuracy@25km"] = (df.dist<25).sum()/len(df)
final_output.append(data)
try:
df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
data = json.load(open(prefix+".json"))
data["acccuracy@100km"] = (df.dist<100).sum()/len(df)
data["acccuracy@50km"] = (df.dist<50).sum()/len(df)
data["acccuracy@25km"] = (df.dist<25).sum()/len(df)
final_output.append(data)
except Exception as e:
print("Skipping {0}: {1}".format(prefix, e))
pd.DataFrame(final_output).to_csv("{0}_RESULT.csv".format(EVAL_DATASET_FN.rstrip(".csv")))
\ No newline at end of file
python3 desamb_eval.py -g ../data/geocoding_evaluation/fr_dataset_ambiguity_sample50percent.csv outputs/FR_RESULT
#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/US\ FR\ results
#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/US\ FR\ results
python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/USFR_WORD
python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/USFR_WORD
@@ -178,14 +178,13 @@ class Inclusion(DataSource):
class CoOccurrences(DataSource):
def __init__(self, filename, label_encoder,sampling=3,resolution = 1):
def __init__(self, filename, label_encoder,sampling=3,resolution = 256,use_healpix=False):
super().__init__("Co-Occurrence data",filename)
self.is_there_healpix = True
self.is_there_healpix = use_healpix
# LOAD DATA
try:
self.data_src = pd.read_csv(filename)
except:
self.data_src = pd.read_csv(filename,sep="\t")
self.data_src = pd.read_csv(filename,sep="\t")
# CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA
if not "healpix_{0}".format(resolution) in self.data_src.columns:
raise KeyError("healpix_{0} column does not exists ! ".format(resolution))
@@ -272,7 +271,6 @@ class DataGenerator(keras.utils.Sequence):
self.num_classes = class_encoder.get_num_classes()
self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(self.len / self.batch_size))
@@ -281,7 +279,7 @@ class DataGenerator(keras.utils.Sequence):
if self.is_there_healpix and self.only_healpix:
return [X[:,0],X[:,1]],y2
if self.is_there_healpix:
elif self.is_there_healpix:
return [X[:,0],X[:,1]],[y,y2]
else:
return [X[:,0],X[:,1]],y
@@ -299,7 +297,6 @@ class DataGenerator(keras.utils.Sequence):
self.datasrc_index += 1
self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
if self.datasrc_index >= len(self.data_src):
self.return_(X,y,y2)
@@ -332,7 +329,10 @@ def load_embedding(model_fn,dim_vector=100):
N = len(model.wv.vocab)
M = np.zeros((N,dim_vector))
for i in range(N):
M[i] = model.wv[str(i)]
try:
M[i] = model.wv[str(i)]
except KeyError:
pass
return M
if __name__ == "__main__":
......
import os
from gzip import GzipFile
import keras
from keras.utils import to_categorical
import numpy as np
import pandas as pd
from .geo import zero_one_encoding
from helpers import parse_title_wiki,read_geonames
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import LabelEncoder
def wc_l(filename,gzip=True):
lc = 0
if not gzip:
f = open(filename)
if gzip:
f = GzipFile(filename)
while f.readline():
lc += 1
f.close()
return lc
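# SamplingProbabilities assigns each item a weight of 1/count, so toponyms that have already been drawn often are progressively down-weighted when sampling context neighbours.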
class SamplingProbabilities:
def __init__(self):
self.count = {}
def get_probs(self,item):
if not item in self.count:
self.count[item] = 0
self.count[item]+=1
return 1/self.count[item]
def __call__(self,a):
return self.get_probs(a)
class DataSource(object):
def __init__(self,name,input_filename):
self.name = name
assert os.path.exists(input_filename)
self.input_filename = input_filename
self.len = 0
self.is_there_healpix = False
def __next__(self):
raise NotImplementedError()
def __iter__(self):
return self
def __len__(self):
return self.len
def __reset__(self):
raise NotImplementedError()
def isOver(self):
raise NotImplementedError()
class Adjacency(DataSource):
def __init__(self,filename,geonames_filename,sampling=3,len_=None,gzip=True):
DataSource.__init__(self,"Adjacency SRC",filename)
assert os.path.exists(geonames_filename)
self.geonames_data_dict = {row.geonameid:row.name for row in read_geonames(geonames_filename).itertuples()}
self.gzip = gzip
if not self.gzip:
self.data_src = open(self.input_filename,'rb')
else:
self.data_src = GzipFile(self.input_filename,'rb')
if len_:
self.len = len_*sampling
else:
self.len = wc_l(filename,gzip=gzip)
self.data_src.readline() # header line
self.sampling = sampling
if self.sampling:
self.probs_storage = SamplingProbabilities()
self.topo = None
self.context_topo_context = []
self.curr_probs = None
self.lat, self.lon = None, None
self.i = 0
self.is_over = False
def __next__(self):
if self.i >= len(self.context_topo_context):
line = self.data_src.readline()
if not line:
self.is_over = True
raise StopIteration
line = line.decode("utf-8").rstrip("\n")
_,geonameid, adjacent_geoname_id,latitude,longitude = tuple(line.split(","))
self.topo = int(geonameid)
self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")]
if self.sampling:
self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,p=np.asarray(self.curr_probs)/np.sum(self.curr_probs))
self.lat, self.lon = float(latitude),float(longitude)
self.i = 0
self.i += 1
return (self.geonames_data_dict[self.topo],
self.geonames_data_dict[self.context_topo_context[self.i-1]],
self.lat,self.lon)
def __reset__(self):
if not self.gzip:
self.data_src = open(self.input_filename,'rb')
else:
self.data_src = GzipFile(self.input_filename,'rb')
self.data_src.readline() # header line
self.is_over = False
def isOver(self):
return self.is_over
class Inclusion(DataSource):
def __init__(self, geonames_filename,hierarchy_filename,mask_ids=None):
super().__init__("Inclusion SRC",hierarchy_filename)
assert os.path.exists(geonames_filename)
self.geonames_data_dict = {row.geonameid:(row.name,row.latitude,row.longitude) for row in read_geonames(geonames_filename).itertuples()}
self.data_src = pd.read_csv(self.input_filename,
sep="\t",
header=None,
names="parentId,childId,type".split(",")
).fillna("")
if mask_ids:
self.data_src = self.data_src[self.data_src.childId.isin(mask_ids)]
self.data_src= self.data_src[self.data_src.childId.isin(self.geonames_data_dict)]
self.data_src= self.data_src[self.data_src.parentId.isin(self.geonames_data_dict)]
self.data_src = self.data_src["childId parentId".split()].values.tolist()
self.len = len(self.data_src)
self.i = 0
self.is_over = False
def __next__(self):
if self.i+1 >= self.len:
self.eof = True
raise StopIteration
else:
self.i += 1
tup_ = tuple(self.data_src[self.i-1])
return (self.geonames_data_dict[tup_[0]][0],
self.geonames_data_dict[tup_[1]][0],
self.geonames_data_dict[tup_[0]][2],
self.geonames_data_dict[tup_[0]][1])
def __reset__(self):
self.i = 0
self.is_over = False
def isOver(self):
return self.is_over
class CoOccurrences(DataSource):
def __init__(self, filename, label_encoder,sampling=3,resolution = 256,use_healpix=False):
super().__init__("Co-Occurrence data",filename)
self.is_there_healpix = use_healpix
# LOAD DATA
self.data_src = pd.read_csv(filename,sep="\t")
# CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA
if not "healpix_{0}".format(resolution) in self.data_src.columns:
raise KeyError("healpix_{0} column does not exists ! ".format(resolution))
# PARSE TOPONYMS
self.data_src["title"] = self.data_src.title.apply(parse_title_wiki)
try:
self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki)
except AttributeError:
# Some dumps do not ship an "interlinks" column; skip parsing in that case.
pass
# LOOP parameter
self.sampling = sampling
if self.sampling:
self.probs_storage = SamplingProbabilities()
# LOOP INDICES
self.i = 0
self.j = 0
self.is_over = False
self.len = len(self.data_src)*(self.sampling-1)
# BUFFER VARIABLE
self.topo = None
self.context_topo_context = []
self.curr_probs = None
self.lat, self.lon = None, None
self.resolution = resolution
self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
self.class_encoder = label_encoder
self.class_encoder.fit(self.classes)
self.healpix = None
def __next__(self):
if self.isOver() or self.i*self.sampling == self.len:
self.is_over = True
raise StopIteration
if self.j >= len(self.context_topo_context):
line = self.data_src.iloc[self.i]
self.topo = line.title
self.context_topo_context = [x for x in str(line.interlinks).split("|")]
N = len(self.context_topo_context)
triple = []
for i in range(N):
if i+1 == N:
break
triple.append((self.context_topo_context[i],self.context_topo_context[i+1]))
self.context_topo_context = triple
np.random.shuffle(self.context_topo_context)
self.lat, self.lon = line.latitude,line.longitude
self.healpix = line["healpix_{0}".format(self.resolution)]
self.i += 1
self.j = 0
self.j += 1
return (self.topo,
*self.context_topo_context[self.j-1],
self.lat,self.lon,self.class_encoder.transform([self.healpix])[0])
def __reset__(self):
self.i = 0
self.is_over = False
def isOver(self):
return self.is_over
class DataGenerator(keras.utils.Sequence):
'Generates data for Keras'
def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
'Initialization'
self.data_src = data_sources
self.ngram_index = ngram_index
self.batch_size = kwargs.get("batch_size",1000)
self.only_healpix = kwargs.get("only_healpix",False)
self.len = sum([len(d) for d in self.data_src])
self.datasrc_index = 0
self.num_classes = class_encoder.get_num_classes()
self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
def __len__(self):
'Denotes the number of batches per epoch'
return int(np.floor(self.len / self.batch_size))
def return_(self,X,y,y2=None):
if self.is_there_healpix and self.only_healpix:
return [X[:,0],X[:,1],X[:,2]],y2
elif self.is_there_healpix:
return [X[:,0],X[:,1],X[:,2]],[y,y2]
else:
return [X[:,0],X[:,1],X[:,2]],y
def __getitem__(self, index):
'Generate one batch of data'
X = np.empty((self.batch_size,3,self.ngram_index.max_len),dtype=np.int32) # toponym
y = np.empty((self.batch_size,2),dtype=float) #lat lon coord
y2=None # For healpix
if self.is_there_healpix:
y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
if self.data_src[self.datasrc_index].isOver():
self.datasrc_index += 1
if self.datasrc_index >= len(self.data_src):
# All data sources are exhausted: return the current batch as-is.
return self.return_(X,y,y2)
self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
for i in range(self.batch_size):
if self.data_src[self.datasrc_index].isOver():
return self.return_(X,y,y2)
try:
topo, topo_context_1,topo_context_2, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
except StopIteration as e:
return self.return_(X,y,y2)
X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context_1),self.ngram_index.encode(topo_context_2)]
y[i] = [*zero_one_encoding(longitude,latitude)]
if self.is_there_healpix:
y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32')
#y[i] = [longitude,latitude]
return self.return_(X,y,y2)
def on_epoch_end(self):
'Updates indexes after each epoch'
[d.__reset__() for d in self.data_src]
self.datasrc_index = 0
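# Minimal usage sketch (hypothetical names: `cooc_src`, `adj_src`, `ng`, `enc`):
# the generator yields ([toponym, context1, context2], targets) batches and,
# being a keras.utils.Sequence, can be fed straight to fit (or fit_generator
# on older Keras versions), assuming `enc` exposes get_num_classes().
#
#   gen = DataGenerator([cooc_src, adj_src], ng, enc, batch_size=1000)
#   model.fit(gen, epochs=5)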
def load_embedding(model_fn,dim_vector=100):
model = KeyedVectors.load(model_fn)
N = len(model.wv.vocab)
M = np.zeros((N,dim_vector))
for i in range(N):
try:
M[i] = model.wv[str(i)]
except KeyError:
pass
return M
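# Usage sketch (hypothetical path): assumes a gensim model saved to disk whose
# vocabulary keys are the stringified indices produced by the ngram index.
#
#   M = load_embedding("word2vec_ngram_model.kv", dim_vector=100)
#   M.shape  # (vocabulary size, 100)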
if __name__ == "__main__":
# All adj nb of line :7955000-1
from lib.ngram_index import NgramIndex
from tqdm import tqdm
ng = NgramIndex.load("../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json")
c= CoOccurrences("../data/wikipedia/cooccurrence_FR.txt_test.csv",sampling=3)
a = Adjacency("/home/jacques/sample_adjacency.txt",geonames_filename="../data/geonamesData/allCountries.txt",gzip=False,sampling=10)
i= Inclusion(geonames_filename="../data/geonamesData/allCountries.txt",hierarchy_filename="../data/geonamesData/hierarchy.txt")
d= DataGenerator([c,a,i],ng)
for x in tqdm(range(len(d))):d[x]
......@@ -26,9 +26,9 @@ def tf_deg2rad(deg):
def latlon2healpix( lat , lon , res ):
lat = np.radians(lat)
lon = np.radians(lon)
xs = ( np.cos(lat) * np.cos(lon) )#
ys = ( np.cos(lat) * np.sin(lon) )# -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
zs = ( np.sin(lat) )#
xs = ( np.cos(lat) * np.cos(lon) ) #
ys = ( np.cos(lat) * np.sin(lon) ) # -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
zs = ( np.sin(lat) ) #
return healpy.vec2pix( int(res) , xs , ys , zs )
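# Quick check (illustrative): healpy.vec2pix(nside, x, y, z) maps the unit
# vector of a lat/lon pair to a HEALPix cell id, e.g.
#   cell = latlon2healpix(48.8566, 2.3522, 256)   # Paris, nside=256
#   0 <= cell < healpy.nside2npix(256)            # 786432 cells at this resolution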
def haversine_tf(y_true,y_pred):
......
......@@ -75,7 +75,7 @@ class NgramIndex():
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
ngrams = [ng for ng in ngrams if ng.count("$")<self.size-1]
ngrams = [ng for ng in ngrams if ng.count("$")<2]
if not self.loaded:
[self.add(ng) for ng in ngrams if not ng in self.ngram_index]
return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
......@@ -125,7 +125,8 @@ class NgramIndex():
N = len(self.ngram_index)
embedding_matrix = np.zeros((N,dim))
for i in range(N):
embedding_matrix[i] = model.wv[str(i)]
if str(i) in model.wv:
embedding_matrix[i] = model.wv[str(i)]
return embedding_matrix
def save(self,fn):
......
......@@ -146,7 +146,8 @@ class Run(object):
out_proc = subprocess.PIPE
if log_filename:
out_proc = open(log_filename,'w')
out_proc = open(log_filename,'a')
process = subprocess.Popen(self.get_command().split(),stdout=out_proc)
_, _ = process.communicate() # We don't care of the output (if so, we use the log_filename argument)
......@@ -209,8 +210,10 @@ class GridSearchModel:
log_filename : str, optional
log filename, by default None
"""
i=0
for task in self.tasks:
task.run(log_filename=log_filename)
task.run(log_filename=log_filename+"_"+str(i))
i+=1
if __name__ == "__main__":
......
import torch
from keras.preprocessing.sequence import pad_sequences
import numpy as np
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i:i + n]
class SentenceDataset(torch.utils.data.Dataset):
'Characterizes a dataset for PyTorch'
def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32):
'Initialization'
self.sentences = dataframe["sentence"].values
self.labels = dataframe["label"].values
self.tokenizer = tokenizer
self.max_len = max_len
self.batch_size = batch_size
a = np.arange(len(dataframe))
np.random.shuffle(a)
self.batch_tokenization = list(chunks(a,batch_size))
assert len(self.batch_tokenization[0]) == batch_size
self.current_batch_id = 0
self.boundaries = (0,0+batch_size)
self.current_batch_tokenized = self.tokenize(self.current_batch_id)
def tokenize(self,batch_index):
X = [ self.tokenizer.encode(self.sentences[x],add_special_tokens = True,max_length=512) for x in self.batch_tokenization[batch_index]]# Tokenizer
X = pad_sequences(X, maxlen=self.max_len, dtype="long", value=0, truncating="post", padding="post").tolist()
return X
def __len__(self):
'Denotes the total number of samples'
return len(self.sentences)
def __getitem__(self, index):
'Generates one sample of data'
if index >= self.boundaries[1] or index < self.boundaries[0]:
self.current_batch_id = index//self.batch_size
self.current_batch_tokenized = self.tokenize(self.current_batch_id)
self.boundaries= (self.current_batch_id*self.batch_size,self.current_batch_id*self.batch_size + self.batch_size)
# Load data and get label
index_in_batch = index-self.boundaries[0]
#print(self.boundaries,index_in_batch)
X = self.current_batch_tokenized[index_in_batch]
M = [int(token_id > 0) for token_id in X] # attention mask
y = self.labels[index]
return torch.tensor(np.array(X)),torch.tensor(np.array(M)),torch.tensor(np.array(y))
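# Minimal usage sketch (names are illustrative; assumes a DataFrame `df` with
# "sentence" and "label" columns and a HuggingFace tokenizer already loaded):
#
#   dataset = SentenceDataset(df, tokenizer, max_len=96, batch_size=32)
#   loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=False)
#   input_ids, attention_mask, labels = next(iter(loader))
#
# shuffle=False matters here: tokenisation is cached one batch at a time, so the
# sampler should visit indices in contiguous blocks (the class already shuffles
# sentences internally when building its batch index).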
\ No newline at end of file
......@@ -3,6 +3,8 @@ import math
import argparse
import os
import json
import time
import datetime
# Data Structure
import numpy as np
......@@ -101,6 +103,7 @@ class ConfigurationReader(object):
class MetaDataSerializer(object):
def __init__(self,
model_name,
dataset_name,
rel_code,
cooc_sample_size,
......@@ -113,6 +116,7 @@ class MetaDataSerializer(object):
index_fn,
keras_model_fn,
train_test_history_fn):
self.model_name = model_name
self.dataset_name = dataset_name
self.rel_code = rel_code
self.cooc_sample_size = cooc_sample_size
......@@ -128,6 +132,7 @@ class MetaDataSerializer(object):
def save(self,fn):
json.dump({
"model_name":self.model_name,
"dataset_name" : self.dataset_name,
"rel_code" : self.rel_code,
"cooc_sample_size" : self.cooc_sample_size,
......@@ -193,4 +198,23 @@ class Chronometer:
duration = time.time() - self.__task_begin_timestamp[task_name]
del self.__task_begin_timestamp[task_name]
return duration
\ No newline at end of file
return duration
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round((elapsed)))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
\ No newline at end of file
import json
import numpy as np
from ngram import NGram
# Machine learning
from gensim.models import Word2Vec
class WordIndex():
"""
Class used for encoding words in ngram representation
"""
def __init__(self,loaded = False):
"""
Constructor
Parameters
----------
loaded : bool
if loaded from external file
"""
self.word_index = {"":0}
self.index_word = {0:""}
self.cpt = 0
self.max_len = 0
self.loaded = loaded
def split_and_add(self,word):
"""
Split a word into whitespace-separated subwords and add each of them to the index
Parameters
----------
word : str
a word
"""
grams = word.lower().split(" ")
[self.add(subword) for subword in grams ]
self.max_len = max(self.max_len,len(grams))
def add(self,subword):
"""
Add a subword (whitespace token) to the index
Parameters
----------
subword : str
the subword to add
"""
if not subword in self.word_index:
self.cpt+=1
self.word_index[subword]=self.cpt
self.index_word[self.cpt]=subword
def encode(self,word):
"""
Return the index representation of a word (one index per whitespace-separated subword)
Parameters
----------
word : str
a word
Returns
-------
list of int
list of word indices
"""
subwords = [w.lower() for w in word.split(" ")]
if not self.loaded:
[self.add(ng) for ng in subwords if not ng in self.word_index]
self.max_len = max(self.max_len,len(subwords))
return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len)
def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
"""
Pad an encoded word with a filling index up to the desired length, as required for fixed-size neural network inputs.
Parameters
----------
ngram_encoding : list of int
first encoding of a word
MAX_LEN : int
desired length of the encoding
filling_item : int, optional
ngram index you wish to use, by default 0
Returns
-------
list of int
list of ngram index
"""
if self.loaded and len(ngram_encoding) >=MAX_LEN:
return ngram_encoding[:MAX_LEN]
assert len(ngram_encoding) <= MAX_LEN
diff = MAX_LEN - len(ngram_encoding)
ngram_encoding.extend([filling_item]*diff)
return ngram_encoding
def get_embedding_layer(self,texts,dim=100,**kwargs):
"""
Return an embedding matrix for each word index, trained on the encoded texts with a gensim Word2Vec model.
Parameters
----------
texts : list of [list of int]
list of encoded word
dim : int, optional
embedding dimension, by default 100
Returns
-------
np.array
embedding matrix
"""
model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
N = len(self.word_index)
embedding_matrix = np.zeros((N,dim))
for i in range(N):
if str(i) in model.wv:
embedding_matrix[i] = model.wv[str(i)]
return embedding_matrix
def save(self,fn):
"""
Save the WordIndex
Parameters
----------
fn : str
output filename
"""
data = {
"word_index": self.word_index,
"cpt_state": self.cpt,
"max_len_state": self.max_len
}
json.dump(data,open(fn,'w'))
@staticmethod
def load(fn):
"""
Load a WordIndex state from a file.
Parameters
----------
fn : str
input filename
Returns
-------
WordIndex
word index
Raises
------
KeyError
raised if a required field does not appear in the input file
"""
try:
data = json.load(open(fn))
except json.JSONDecodeError:
# Fail loudly instead of continuing with an undefined `data` variable.
raise ValueError("Data file must be a valid JSON file")
for key in ["word_index","cpt_state","max_len_state"]:
if not key in data:
raise KeyError("{0} field cannot be found in given file".format(key))
new_obj = WordIndex(loaded=True)
new_obj.word_index = data["word_index"]
new_obj.index_word = {v:k for k,v in new_obj.word_index.items()}
new_obj.cpt = data["cpt_state"]
new_obj.max_len = data["max_len_state"]
return new_obj
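# Usage sketch (illustrative): words are split on whitespace, each subword gets
# an integer id, and encodings are padded to the longest sequence seen so far.
#
#   wi = WordIndex()
#   wi.split_and_add("new york")
#   wi.encode("new york")              # -> [1, 2]
#   wi.save("word_index.json")         # hypothetical path
#   wi2 = WordIndex.load("word_index.json")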
......@@ -3,6 +3,7 @@ import os
import tensorflow as tf
import keras.backend as K
from lib.ngram_index import NgramIndex
from lib.word_index import WordIndex
import numpy as np
from tensorflow.python.keras.backend import set_session
......@@ -64,7 +65,7 @@ class Geocoder(object):
# graph = tf.compat.v1.get_default_graph()
# set_session(sess)
self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()})
self.ngram_encoder = NgramIndex.load(ngram_index_file)
self.ngram_encoder = WordIndex.load(ngram_index_file)
def get_coord(self,toponym,context_toponym):
global sess
......
File moved
......@@ -14,11 +14,11 @@ for rel in rels:
c_f = "--wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt"
# Init GridsearchModel
grid = GridSearchModel(\
"python3 combination_embeddings.py",
"python3 combination_embeddings_baselines.py",
**OrderedDict({ # necessary because some args have to be given in a certain order
"rel":["-i -a",("-i -w "+c_f),"-a -w","-a -i -w"],
"rel":["-w "+c_f,("-i -w "+c_f),"-a -w "+c_f,"-a -i -w "+c_f], # ,"-a -i -w "+c_f ,"-i -a"
"-n":[4],
"--ngram-word2vec-iter" :[1],
"--ngram-word2vec-iter" :[10],
"-e":[100],
"geoname_fn":"../data/geonamesData/US_FR.txt".split(),
"hierarchy_fn":"../data/geonamesData/hierarchy.txt".split()
......@@ -27,6 +27,6 @@ grid = GridSearchModel(\
print("########### THE FOLLOWING COMMAND(S) WILL BE EXECUTED ###########" )
[print(task.get_command()) for task in grid.tasks]
print("#################################################################")
grid.run("outputs/log_RUN_TEXAS_IDFrance.txt")
grid.run("outputs/log_{0}".format("FR_baseline"))
#["-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt","-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -a","-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -i"]
\ No newline at end of file