Skip to content
Snippets Groups Projects
Commit a2bf8cec authored by Fize Jacques's avatar Fize Jacques
Browse files

add wikipedia geocoder notebook

parent 0be81e05
No related branches found
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
```
%% Output
/Users/jacquesfize/opt/anaconda3/envs/my_env/lib/python3.7/site-packages/tqdm/std.py:699: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version
from pandas import Panel
%% Cell type:code id: tags:
``` python
# Tab-separated file read into the working DataFrame (title, interlinks, longitude, latitude)
cooccurrence_FN = "data/cooccurrence_FR.txt"
# Number of interlinks drawn per page when geocoding (passed as `k` to geocode_wikipages)
k_cooc_used = 4
# Both paths are handed to the Geocoder constructor: model file and its n-gram index
model_path = "data/FR20_cooc_4_30_P.h5"
model_ngram_index_path = "data/FR20_cooc_4_30_P_index.txt"
```
%% Cell type:code id: tags:
``` python
# Testing subset: only the first 100 rows are parsed. Drop `nrows` for a full run.
# Reading them directly avoids parsing the whole file first and gives a frame that is
# not a slice of a larger one, so later column assignments do not act on a view.
df = pd.read_csv(cooccurrence_FN, sep="\t", nrows=100)
# Each page stores its interlinks as a single "|"-separated string; split it into a list
df["interlinks"] = df["interlinks"].apply(lambda links: links.split("|"))
```
%% Cell type:code id: tags:
``` python
# Preview the first rows to check the parsed columns
df.head()
```
%% Output
title interlinks \
0 Auvergne [Le Puy-en-Velay, Clermont-Ferrand, Réserve na...
1 Alpes-de-Haute-Provence [Annot, Uvernet-Fours, Canton de Manosque-Nord...
2 Alpes-Maritimes [Gréolières, Valdeblore, Sophia Antipolis, Vil...
3 Bas-Rhin [Bœrsch, Basse-Alsace, Pyrénées-Atlantiques, I...
4 Bouches-du-Rhône [Saint-Rémy-de-Provence, Échangeur de Frais-Va...
longitude latitude
0 3.300000 45.700000
1 6.240000 44.095278
2 7.166667 43.833333
3 7.783333 48.816667
4 5.083333 43.500000
%% Cell type:code id: tags:
``` python
from lib.geocoder.our_geocoder import Geocoder
# Build the geocoder from the model file and its n-gram index
# NOTE(review): presumably loads the trained model into memory — confirm in lib.geocoder.our_geocoder
geo = Geocoder(model_path,model_ngram_index_path)
```
%% Cell type:code id: tags:
``` python
from lib.utils_geo import haversine_pd
def heuristic_mean(geocoder, toponym, context_toponyms):
    """Estimate a toponym's coordinates by averaging its pairwise predictions.

    The toponym is paired with every context toponym that differs from it,
    all pairs are geocoded in one ``geocoder.get_coords`` call, and the
    predicted longitudes and latitudes are averaged.

    Parameters
    ----------
    geocoder
        Object exposing ``get_coords(toponyms, context_toponyms)`` that
        returns a ``(longitudes, latitudes)`` pair, one value per input pair.
    toponym
        Place name to geocode.
    context_toponyms
        Iterable of candidate context place names; entries equal to
        ``toponym`` are ignored.

    Returns
    -------
    list
        ``[mean_longitude, mean_latitude]``. Both values are NaN when no
        usable context toponym remains (empty input, or every entry equal
        to ``toponym``).
    """
    pairs = [[toponym, context] for context in context_toponyms if toponym != context]
    if not pairs:
        # An empty pair list would become a 1-D array and the column slicing
        # below would raise; report "no prediction" instead.
        return [np.nan, np.nan]

    input_ = np.asarray(pairs)
    res_geocode = pd.DataFrame(input_, columns=["t", "tc"])
    lons, lats = geocoder.get_coords(input_[:, 0], input_[:, 1])
    res_geocode["lon"] = lons
    res_geocode["lat"] = lats
    return [res_geocode["lon"].mean(), res_geocode["lat"].mean()]
def accuracy_at_k(geocoding_df, k=100):
    """Return the fraction of rows whose prediction error is below ``k``.

    The error between (``longitude``, ``latitude``) and
    (``pred_longitude``, ``pred_latitude``) is computed with ``haversine_pd``
    and stored in a ``distanceKM`` column of ``geocoding_df``; the frame is
    modified in place.
    NOTE(review): the unit is presumably kilometres, judging by the column
    name — confirm against ``haversine_pd``.
    """
    true_lon, true_lat = geocoding_df.longitude, geocoding_df.latitude
    pred_lon, pred_lat = geocoding_df.pred_longitude, geocoding_df.pred_latitude
    geocoding_df["distanceKM"] = haversine_pd(true_lon, true_lat, pred_lon, pred_lat)

    within_threshold = geocoding_df["distanceKM"] < k
    return within_threshold.sum() / len(geocoding_df)
def geocode_wikipages(df, k=None):
    """Geocode every page of ``df`` from its title and interlinks.

    Each row is passed to ``heuristic_mean`` together with the module-level
    geocoder ``geo``. Rows are processed with ``progress_apply``, so
    ``tqdm.pandas()`` must have been called beforehand.

    Parameters
    ----------
    df
        DataFrame with a ``title`` column and an ``interlinks`` column
        holding a list of linked page names.
    k
        When falsy (``None`` or ``0``), all interlinks of a page are used.
        Otherwise ``k`` interlinks are drawn with replacement for each page.

    Returns
    -------
    numpy.ndarray
        One ``[longitude, latitude]`` row per page, in the order of ``df``.
    """
    import random

    # A private generator seeded with 42 yields the same sequence of draws as
    # random.seed(42) followed by random.choices, without resetting the
    # process-wide random state for the rest of the notebook.
    rng = random.Random(42)

    def _context(links):
        # Full link list, or a fixed-size sample drawn with replacement
        return links if not k else rng.choices(links, k=k)

    found_coords = df.progress_apply(
        lambda row: heuristic_mean(geo, row.title, _context(row.interlinks)),
        axis=1,
    ).values
    # progress_apply yields an object array of 2-item lists; make it numeric 2-D
    return np.asarray(found_coords.tolist())
```
%% Cell type:code id: tags:
``` python
# Geocode every page, drawing k_cooc_used interlinks per page as context
found_coords = geocode_wikipages(df, k=k_cooc_used)
```
%% Output
%% Cell type:code id: tags:
``` python
# found_coords has one row per page: column 0 is the predicted longitude, column 1 the predicted latitude
df.loc[:,"pred_longitude"] = found_coords[:,0]
df.loc[:,"pred_latitude"] = found_coords[:,1]
```
%% Cell type:code id: tags:
``` python
# Share of pages predicted within thresholds of 100, 50 and 20 (same order as the output tuple)
tuple(accuracy_at_k(df, threshold) for threshold in (100, 50, 20))
```
%% Output
(0.96, 0.81, 0.26)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment