diff --git a/wikipediageocoding.ipynb b/wikipediageocoding.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7dceba34a5e6dee0243154dd899fa114a5dfe3de --- /dev/null +++ b/wikipediageocoding.ipynb @@ -0,0 +1,273 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jacquesfize/opt/anaconda3/envs/my_env/lib/python3.7/site-packages/tqdm/std.py:699: FutureWarning: The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version\n", + " from pandas import Panel\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from tqdm.notebook import tqdm\n", + "tqdm.pandas()\n", + "\n", + "import warnings\n", + "from pandas.core.common import SettingWithCopyWarning\n", + "warnings.simplefilter(action=\"ignore\", category=SettingWithCopyWarning)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "cooccurrence_FN = \"data/cooccurrence_FR.txt\"\n", + "k_cooc_used = 4\n", + "model_path = \"data/FR20_cooc_4_30_P.h5\"\n", + "model_ngram_index_path = \"data/FR20_cooc_4_30_P_index.txt\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(cooccurrence_FN,sep=\"\\t\")\n", + "df = df.head(100) # For testing\n", + "df[\"interlinks\"] = df.interlinks.apply(lambda x: x.split(\"|\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>title</th>\n", + " <th>interlinks</th>\n", + " <th>longitude</th>\n", + " <th>latitude</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>Auvergne</td>\n", + " <td>[Le Puy-en-Velay, Clermont-Ferrand, Réserve na...</td>\n", + " <td>3.300000</td>\n", + " <td>45.700000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>Alpes-de-Haute-Provence</td>\n", + " <td>[Annot, Uvernet-Fours, Canton de Manosque-Nord...</td>\n", + " <td>6.240000</td>\n", + " <td>44.095278</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>Alpes-Maritimes</td>\n", + " <td>[Gréolières, Valdeblore, Sophia Antipolis, Vil...</td>\n", + " <td>7.166667</td>\n", + " <td>43.833333</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>Bas-Rhin</td>\n", + " <td>[Bœrsch, Basse-Alsace, Pyrénées-Atlantiques, I...</td>\n", + " <td>7.783333</td>\n", + " <td>48.816667</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>Bouches-du-Rhône</td>\n", + " <td>[Saint-Rémy-de-Provence, Échangeur de Frais-Va...</td>\n", + " <td>5.083333</td>\n", + " <td>43.500000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " title interlinks \\\n", + "0 Auvergne [Le Puy-en-Velay, Clermont-Ferrand, Réserve na... \n", + "1 Alpes-de-Haute-Provence [Annot, Uvernet-Fours, Canton de Manosque-Nord... \n", + "2 Alpes-Maritimes [Gréolières, Valdeblore, Sophia Antipolis, Vil... \n", + "3 Bas-Rhin [Bœrsch, Basse-Alsace, Pyrénées-Atlantiques, I... \n", + "4 Bouches-du-Rhône [Saint-Rémy-de-Provence, Échangeur de Frais-Va... \n", + "\n", + " longitude latitude \n", + "0 3.300000 45.700000 \n", + "1 6.240000 44.095278 \n", + "2 7.166667 43.833333 \n", + "3 7.783333 48.816667 \n", + "4 5.083333 43.500000 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.geocoder.our_geocoder import Geocoder\n", + "geo = Geocoder(model_path,model_ngram_index_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from lib.utils_geo import haversine_pd\n", + "\n", + "def heuristic_mean(geocoder,toponym, context_toponyms):\n", + " input_ = np.asarray([[toponym,t1] for t1 in context_toponyms if toponym != t1])\n", + " res_geocode = pd.DataFrame(input_,columns=\"t tc\".split())\n", + " lons,lats = geocoder.get_coords(input_[:,0],input_[:,1])\n", + " res_geocode[\"lon\"] = lons\n", + " res_geocode[\"lat\"] = lats\n", + " return [res_geocode[\"lon\"].mean(),res_geocode[\"lat\"].mean()]\n", + "\n", + "def accuracy_at_k(geocoding_df,k=100):\n", + " geocoding_df[\"distanceKM\"] = haversine_pd(geocoding_df.longitude,geocoding_df.latitude,geocoding_df.pred_longitude,geocoding_df.pred_latitude)\n", + " return (geocoding_df.distanceKM <k).sum()/len(geocoding_df)\n", + "\n", + "def geocode_wikipages(df,k=None):\n", + " import random\n", + " random.seed(42)\n", + " if not k:\n", + " found_coords = df.progress_apply(lambda x: heuristic_mean(geo,x.title,x.interlinks),axis=1).values\n", + " else:\n", + " found_coords = df.progress_apply(lambda x: heuristic_mean(geo,x.title,random.choices(x.interlinks,k=k)),axis=1).values\n", + " \n", + " found_coords = np.asarray(found_coords.tolist())\n", + " return found_coords" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5374160c45964b9c99bef2f2fb51d920", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "found_coords = geocode_wikipages(df,k_cooc_used)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[:,\"pred_longitude\"] = found_coords[:,0]\n", + "df.loc[:,\"pred_latitude\"] = found_coords[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.96, 0.81, 0.26)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "accuracy_at_k(df,100),accuracy_at_k(df,50),accuracy_at_k(df,20)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (my_env)", + "language": "python", + "name": "my_env" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}