diff --git a/evaluate_baseline.py b/evaluate_baseline.py index 2058636fabf79c4f71c59ef26f7eeb3131413e2b..ed07554f29b68a7d2d5f21546c28acf5bff49117 100644 --- a/evaluate_baseline.py +++ b/evaluate_baseline.py @@ -1,6 +1,6 @@ from joblib import load import pandas as pd -from lib.utils_geo import haversine_pd,latlon2healpix +from lib.utils_geo import latlon2healpix import argparse import os diff --git a/evaluate_geocoder.py b/evaluate_geocoder.py index e57c2b3013c58e5049f47256fb0cd4fcc9331238..a47a68a746ed9bdeeec0b666ca36ea0f7990f81e 100644 --- a/evaluate_geocoder.py +++ b/evaluate_geocoder.py @@ -45,13 +45,20 @@ def accuracy_at_k(geocoding_df,geocoder,k=100): geocoding_df["distanceKM"] = haversine_pd(geocoding_df.longitude,geocoding_df.latitude,geocoding_df.pred_longitude,geocoding_df.pred_latitude) return (geocoding_df.distanceKM <k).sum()/len(geocoding_df) +def median_distance_error(geocoding_df,geocoder): + lons,lats = geocoder.get_coords(geocoding_df.toponym.values,geocoding_df.toponym_context.values) + geocoding_df["pred_latitude"] = lats + geocoding_df["pred_longitude"] = lons + geocoding_df["distanceKM"] = haversine_pd(geocoding_df.longitude,geocoding_df.latitude,geocoding_df.pred_longitude,geocoding_df.pred_latitude) + return geocoding_df.distanceKM.median() + res_ = [] for mod in tqdm(model_available): index_fn = MODELS_DIR + mod +"_index" model_fn = MODELS_DIR + mod +".h5" g = Geocoder(model_fn, index_fn) - res_.append([mod,accuracy_at_k(geocoding_df,g,100),accuracy_at_k(geocoding_df,g,50),accuracy_at_k(geocoding_df,g,20)]) + res_.append([mod,accuracy_at_k(geocoding_df,g,161),accuracy_at_k(geocoding_df,g,100),accuracy_at_k(geocoding_df,g,50),accuracy_at_k(geocoding_df,g,20),median_distance_error(geocoding_df,g)]) -pd.DataFrame(res_,columns="dataset accuracy@100km accuracy@50km accuracy@20km".split()).to_csv(OUTPUT_FN,sep="\t",index=None) +pd.DataFrame(res_,columns="dataset accuracy@161km accuracy@100km accuracy@50km accuracy@20km 
MDE".split()).to_csv(OUTPUT_FN,sep="\t",index=None) diff --git a/generate_dataset.py b/generate_dataset.py index 90224c8997d3b3ab370ff29e5e5d8ee05cd96be1..39e688cb186ffb1c0fc027ccf42e0d1376b4015f 100644 --- a/generate_dataset.py +++ b/generate_dataset.py @@ -3,7 +3,7 @@ import argparse import pandas as pd import numpy as np -from helpers import read_geonames +from lib.helpers import read_geonames from lib.utils_geo import latlon2healpix from tqdm import tqdm @@ -166,6 +166,10 @@ print('# cooc_pairs: ', len(cooc_pairs)) print('# adjacent_pairs: ', len(adjacent_pairs)) print('# inclusion_pairs: ', len(inclusion_pairs)) +#ADD metadata +cooc_pairs["sampling"] = args.cooc_sampling +adjacent_pairs["sampling"] = args.adj_sampling + # SAVE DATA inclusion_pairs.to_csv("{0}_inclusion.csv".format(PREFIX), sep="\t") adjacent_pairs.to_csv("{0}_adjacent.csv".format(PREFIX), sep="\t") diff --git a/helpers.py b/lib/helpers.py similarity index 90% rename from helpers.py rename to lib/helpers.py index b6cf164a1d711d7968010f4d126b3ea1088d17de..ae19955f450a2bed75a3e8bcd2e56f8560dacfe4 100644 --- a/helpers.py +++ b/lib/helpers.py @@ -177,7 +177,7 @@ import time import os class EpochTimer(Callback): - def __init__(self,log_filename): + def __init__(self,log_filename,is_p,is_c,is_i,sampling_p,sampling_c): self.epoch = 0 self.timer = time.time() # if os.path.exists(log_filename): @@ -185,7 +185,9 @@ # self.epoch = pd.read_csv(log_filename).Epoch.max() # else: self.output = open(log_filename,'a') - self.output.write("{0},{1},{2},{3},{4},{5}\n".format("Epoch","Execution Time","Loss","Val_Loss","Accuracy","Accuracy_val")) + header_config = "is_P,is_C,is_I,sampling_P,sampling_C" + self.config_model = ",".join(list(map(str,[is_p,is_c,is_i,sampling_p,sampling_c]))) + self.output.write("{0},{1},{2},{3},{4},{5},{6}\n".format("Epoch","Execution Time","Loss","Val_Loss","Accuracy","Accuracy_val",header_config)) self.output.flush() def on_epoch_begin(self,epoch, logs={}): @@ -193,7 +195,7 @@ 
class EpochTimer(Callback): def on_epoch_end(self, epoch, logs={}): end_time = time.time() - self.timer - self.output.write("{0},{1},{2},{3},{4},{5}\n".format(self.epoch,end_time,logs["loss"],logs["val_loss"],logs["compute_metric"],logs["val_compute_metric"])) + self.output.write("{0},{1},{2},{3},{4},{5},{6}\n".format(self.epoch,end_time,logs["loss"],logs["val_loss"],logs["compute_metric"],logs["val_compute_metric"],self.config_model)) self.output.flush() self.epoch += 1 diff --git a/lib/utils_geo.py b/lib/utils_geo.py index 00b8a1a2fba238cd5b01ad3710a0dd8b4f2aec8d..3ee3833b441f3939b9b7b47d75f8a54fbdb9b404 100644 --- a/lib/utils_geo.py +++ b/lib/utils_geo.py @@ -11,7 +11,7 @@ from tqdm import tqdm import pandas as pd, numpy as np from numba import njit -from helpers import read_geonames +from .helpers import read_geonames from tqdm import tqdm from joblib import Parallel,delayed diff --git a/train_baseline.py b/train_baseline.py index 653c326902d4ea92f457fb8ec74f825cc4b6b6b3..f3609727c303f50ba359601a830f9c87f69bb962 100644 --- a/train_baseline.py +++ b/train_baseline.py @@ -69,7 +69,7 @@ classifier_dict = { parameters = { "naive-bayes":[{"alpha":[0,1]}], - "svm":[{"kernel":["rbf","poly"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}], + "svm":[{"kernel":["linear","rbf","poly"], 'gamma': [1e-1,1e-2,1e-3, 1,10,100]}], "sgd":[{"penalty":["l1","l2"],"loss":["hinge","modified_huber","log"]}], "knn":[{"n_neighbors":list(range(4,8)),"p":[1,2]}], "decision-tree": [{"criterion":["gini","entropy"]}], diff --git a/train_geocoder.py b/train_geocoder.py index eaa5193b252bfaf2c3e5b598e24d9ab66378ea72..a89d2988e17ba37312786009008b4e5194ad3830 100644 --- a/train_geocoder.py +++ b/train_geocoder.py @@ -17,7 +17,7 @@ from lib.ngram_index import NgramIndex from lib.word_index import WordIndex from lib.utils import ConfigurationReader from lib.utils_geo import accuracy_k,haversine_tf_1circle -from helpers import EpochTimer +from lib.helpers import EpochTimer from lib.datageneratorv4 import 
DataGenerator # Logging @@ -66,11 +66,11 @@ EMBEDDING_FN = "outputs/{0}_embedding.npy".format(PREFIX_OUTPUT_FN) PREFIX_OUTPUT_FN+="_{0}".format(EPOCHS) if args.adjacency: - PREFIX_OUTPUT_FN += "_A" + PREFIX_OUTPUT_FN += "_P" if args.inclusion: PREFIX_OUTPUT_FN += "_I" if args.wikipedia: - PREFIX_OUTPUT_FN += "_P" + PREFIX_OUTPUT_FN += "_C" MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) @@ -82,15 +82,20 @@ HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) ############################################################################################# data_used = [] - +sampling_adj = None +sampling_cooc = None if args.wikipedia: data_used.append(pd.read_csv(COOC_FN,sep="\t")) + if "sampling" in data_used[-1].columns: + sampling_cooc = data_used[-1]["sampling"].unique()[0] if args.inclusion: data_used.append(pd.read_csv(INCLUSION_FN,sep="\t")) if args.adjacency: data_used.append(pd.read_csv(ADJACENT_FN, sep="\t")) + if "sampling" in data_used[-1].columns: + sampling_adj = data_used[-1]["sampling"].unique()[0] if len(data_used) <1: print("No Type of toponyms indicated. 
Stopping the program...") @@ -205,8 +210,8 @@ print(model.summary()) checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN , monitor='loss', verbose=1, save_best_only=save_best_only, mode='auto', period=1) -epoch_timer = EpochTimer(HISTORY_FN) +epoch_timer = EpochTimer(HISTORY_FN,is_p=args.adjacency,is_c=args.wikipedia,is_i=args.inclusion,sampling_c=sampling_cooc,sampling_p=sampling_adj) history = model.fit(training_generator,verbose=True, diff --git a/wikipediageocoding.ipynb b/wikipediageocoding.ipynb index 91dc00b7003a06f915635ab958208601c9022a9c..4b1febdf3d94f8b76be5a8c2338dfb51c0ecdeaa 100644 --- a/wikipediageocoding.ipynb +++ b/wikipediageocoding.ipynb @@ -37,6 +37,13 @@ "model_ngram_index_path = \"data/FR20_cooc_4_30_P_index.txt\"" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 3,