From d6f858af94a181745573add6380fb3652bd0a3dd Mon Sep 17 00:00:00 2001
From: Ludovic Moncla <ludovic.moncla@insa-lyon.fr>
Date: Mon, 13 Sep 2021 06:55:04 +0000
Subject: [PATCH] Upload New File

---
 evaluate_geocoder_wikipage.py | 89 +++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 evaluate_geocoder_wikipage.py

diff --git a/evaluate_geocoder_wikipage.py b/evaluate_geocoder_wikipage.py
new file mode 100644
index 0000000..49333c3
--- /dev/null
+++ b/evaluate_geocoder_wikipage.py
@@ -0,0 +1,89 @@
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from lib.utils_geo import haversine_pd
+import warnings
+from pandas.core.common import SettingWithCopyWarning
+from lib.geocoder.our_geocoder import Geocoder
+import argparse
+import glob
+
+parser = argparse.ArgumentParser()
+# Command line: three positional paths plus an optional integer -k (default 4).
+parser.add_argument("models_dir")  # directory scanned below for "*.h5" model files
+parser.add_argument("coocurrence_dataset")  # tab-separated dataset loaded with pandas below
+parser.add_argument("output_filename")  # tab-separated results file written at the end of the script
+parser.add_argument("-k",default=4,type=int)  # forwarded to geocode_wikipages as the number of sampled interlinks
+
+# Parsed at import time, so this module can only be loaded with valid command-line arguments.
+args = parser.parse_args()
+# Registers DataFrame.progress_apply, which geocode_wikipages relies on.
+tqdm.pandas()
+warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
+
+
+def heuristic_mean(geocoder,toponym, context_toponyms):
+    """Return [mean longitude, mean latitude] of the geocoder's predictions for `toponym` paired with each context toponym."""
+    # Drop context entries equal to the toponym itself; if nothing is left, pair the toponym with itself.
+    contexts = [ctx for ctx in context_toponyms if ctx != toponym] or [toponym]
+    pairs = np.asarray([[toponym, ctx] for ctx in contexts])
+    predictions = pd.DataFrame(pairs, columns=["t", "tc"])
+    lons, lats = geocoder.get_coords(pairs[:, 0], pairs[:, 1])
+    predictions = predictions.assign(lon=lons, lat=lats)
+    return [predictions["lon"].mean(), predictions["lat"].mean()]
+
+def accuracy_at_k(geocoding_df,k=100):
+    """Return the fraction of rows whose haversine_pd distance between true and predicted point is strictly below `k` (presumably km); the distances are (re)written to the `distanceKM` column."""
+    geocoding_df["distanceKM"] = haversine_pd(*(geocoding_df[col] for col in ("longitude", "latitude", "pred_longitude", "pred_latitude")))
+    return geocoding_df["distanceKM"].lt(k).sum() / len(geocoding_df)
+
+def median_distance_error(geocoding_df):
+    """Return the median haversine_pd distance between true and predicted points; the distances are (re)written to the `distanceKM` column."""
+    geocoding_df["distanceKM"] = haversine_pd(*(geocoding_df[col] for col in ("longitude", "latitude", "pred_longitude", "pred_latitude")))
+    return geocoding_df["distanceKM"].median()
+
+def geocode_wikipages(df,geo,k=None):
+    """Geocode every row of `df` (columns `title`, `interlinks`) with `geo`; return an array holding one [lon, lat] row per page.
+
+    With a truthy `k`, `k` interlinks drawn with replacement (fixed seed 42) are used as context; otherwise all interlinks are used.
+    """
+    import random
+    rng = random.Random(42)  # private RNG: same draws as seeding the global one, without resetting its state for other code
+    pick = (lambda links: rng.choices(links, k=k)) if k else (lambda links: links)
+    found_coords = df.progress_apply(lambda x: heuristic_mean(geo, x["title"], pick(x["interlinks"])), axis=1).values
+    return np.asarray(found_coords.tolist())
+
+
+# Normalise the models directory so that it always ends with exactly one "/".
+MODELS_DIR = args.models_dir.rstrip("/") + "/"
+COOC_DATASET_FN = args.coocurrence_dataset
+OUTPUT_FN = args.output_filename
+k_cooc_used = args.k
+
+# Tab-separated dataset; `interlinks` holds "|"-separated toponyms, turned into lists here.
+df = pd.read_csv(COOC_DATASET_FN,sep="\t")
+df["interlinks"] = df.interlinks.apply(lambda x: x.split("|"))
+
+# Model name = file name without the ".h5" suffix. The suffix is sliced off because str.rstrip(".h5")
+# strips any trailing ".", "h" or "5" characters and would mangle names such as "graph5" or "osm_h".
+model_available = [mod.split("/")[-1][:-len(".h5")] for mod in glob.glob(MODELS_DIR+"*.h5")]
+print("Models that will be evaluated :")
+for model_fn in model_available:
+    print("\t*",model_fn)
+
+# Evaluate each model on the same dataframe; the prediction columns are overwritten on every pass.
+res_ = []
+for mod in tqdm(model_available):
+    index_fn = MODELS_DIR + mod +"_index"
+    model_fn = MODELS_DIR + mod +".h5"
+    g = Geocoder(model_fn, index_fn)
+    found_coords = geocode_wikipages(df,g, k_cooc_used)
+    df.loc[:, "pred_longitude"] = found_coords[:, 0]
+    df.loc[:, "pred_latitude"] = found_coords[:, 1]
+    res_.append([mod,accuracy_at_k(df,161),accuracy_at_k(df,100),accuracy_at_k(df,50),accuracy_at_k(df,20),median_distance_error(df)])
+
+pd.DataFrame(res_,columns="dataset accuracy@161km accuracy@100km accuracy@50km accuracy@20km MDE".split()).to_csv(OUTPUT_FN,sep="\t",index=None)
+
+
+
+
-- 
GitLab