# generate_dataset.py
import argparse
import os
import pandas as pd
import numpy as np
from helpers import read_geonames
from lib.utils_geo import latlon2healpix
from tqdm import tqdm
from sklearn.model_selection import train_test_split
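
# Command-line interface. Rough usage sketch (arguments are defined just below):
#   python generate_dataset.py <geonames_dataset> <wikipedia_dataset> <geonames_hierarchy_data> \
#       [--cooc-sampling N] [--adj-sampling N] [--adj-nside N] [--split-nside N] \
#       [--split-method {per_pair,per_place}]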
parser = argparse.ArgumentParser()
parser.add_argument("geonames_dataset")
parser.add_argument("wikipedia_dataset")
parser.add_argument("geonames_hierarchy_data")
parser.add_argument("--cooc-sampling", default=4, type=int)
parser.add_argument("--adj-sampling", default=4, type=int)
parser.add_argument("--adj-nside", default=128, type=int)
parser.add_argument("--split-nside", default=128, type=int)
parser.add_argument("--split-method", default="per_pair", type=str, choices="per_pair per_place".split())
# Example for local debugging:
# args = parser.parse_args("../data/geonamesData/FR.txt ../data/wikipedia/cooccurrence_FR.txt ../data/geonamesData/hierarchy.txt".split())
args = parser.parse_args()
# Output file prefix, e.g. "FR" for ".../FR.txt"
PREFIX = os.path.splitext(os.path.basename(args.geonames_dataset))[0]
# LOAD DATA
geonames_data = read_geonames(args.geonames_dataset)
wikipedia_data = pd.read_csv(args.wikipedia_dataset, sep="\t")
geonames_hierarchy_data = pd.read_csv(args.geonames_hierarchy_data, sep="\t", header=None,
                                      names="parentId,childId,type".split(",")).fillna("")
# Add IDs for the Wikipedia Cooc Dataset
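# Wikipedia co-occurrence entries have no geonames identifier, so each one gets
# a fresh ID starting just above the largest existing geonameid; the two ID
# ranges therefore never collide.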
min_id = geonames_data.geonameid.max() + 1
max_id = min_id + len(wikipedia_data)
wikipedia_data["geonameid"] = np.arange(min_id, max_id)
# Healpix cell id computation
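# latlon2healpix (from lib.utils_geo) maps a (latitude, longitude) pair to the
# index of the HEALPix cell containing it; nside sets the resolution (HEALPix
# divides the sphere into 12 * nside**2 equal-area cells).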
geonames_data["adj_split"] = geonames_data.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.adj_nside),
                                                 axis=1)


def get_adjacent_pairs(dataframe, sampling_nb):
    """
    Return pairs of place toponyms that are geographically adjacent, i.e. that
    fall within the same HEALPix cell.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        geonames data
    sampling_nb : int
        number of adjacent places drawn per place

    Returns
    -------
    list of list
        [[ID, place toponym, adjacent place toponym, latitude, longitude], ...]
    """
    new_pairs = []
    for ix, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Get Adjacent Toponym Pairs"):
        healpix_cell = row.adj_split
        topo_prin = row["name"]
        lat, lon = row.latitude, row.longitude
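        # Candidate neighbours are all places that fall in the same HEALPix
        # cell; np.random.choice samples with replacement, and the place itself
        # may be drawn as its own "neighbour".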
        within_cell = dataframe[dataframe.adj_split == healpix_cell]["name"].values
        selected = np.random.choice(within_cell, sampling_nb)
        new_pairs.extend([[row.geonameid, topo_prin, sel, lat, lon] for sel in selected])
    return new_pairs


def get_cooccurrence_pairs(dataframe, sampling_nb):
    """
    Return pairs of place toponyms that appear on the same Wikipedia page.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        wikipedia cooccurrence data
    sampling_nb : int
        number of co-occurring toponyms drawn per page

    Returns
    -------
    list of list
        [[ID, place toponym, co-occurring place toponym, latitude, longitude], ...]
    """
    new_pairs = []
    dataframe["interlinks"] = dataframe.interlinks.apply(lambda x: np.random.choice(x.split("|"), sampling_nb))
    for ix, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Get Cooccurrent Toponym Pairs"):
        topo_prin = row.title
        lat, lon = row.latitude, row.longitude
        new_pairs.extend([[row.geonameid, topo_prin, sel, lat, lon] for sel in row["interlinks"]])
    return new_pairs


def get_inclusion_pairs(geoname_df, hierarchy_df):
    """
    Return pairs of place toponyms linked by an inclusion relationship,
    e.g. (Paris, France): the geometry of Paris is included in that of France.

    Parameters
    ----------
    geoname_df : pandas.DataFrame
        geonames data
    hierarchy_df : pandas.DataFrame
        geonames hierarchy data

    Returns
    -------
    list of list
        [[ID, place toponym, parent place toponym, latitude, longitude], ...]
    """
    geonames_ids = set(geoname_df.geonameid.values)
    id_label = dict(geoname_df["geonameid name".split()].values)
    id_lat = dict(geoname_df["geonameid latitude".split()].values)
    id_lon = dict(geoname_df["geonameid longitude".split()].values)
    filter_mask = (hierarchy_df.childId.isin(geonames_ids) & hierarchy_df.parentId.isin(geonames_ids))
    pairs_id = hierarchy_df[filter_mask]["childId parentId".split()].values.tolist()
    return [[p[0], id_label[p[0]], id_label[p[1]], id_lat[p[0]], id_lon[p[0]]] for p in pairs_id]


# EXTRACT PAIRS FROM INPUT DATA
cooc_pairs = pd.DataFrame(get_cooccurrence_pairs(wikipedia_data, args.cooc_sampling),
                          columns="ID toponym toponym_context latitude longitude".split())
adjacent_pairs = pd.DataFrame(get_adjacent_pairs(geonames_data, args.adj_sampling),
                              columns="ID toponym toponym_context latitude longitude".split())
inclusion_pairs = pd.DataFrame(get_inclusion_pairs(geonames_data, geonames_hierarchy_data),
                               columns="ID toponym toponym_context latitude longitude".split())
# FOR EACH PAIR, COMPUTE THE HEALPIX CELL ID OF THE ASSOCIATED COORDINATES
cooc_pairs["hp_split"] = cooc_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside), axis=1)
adjacent_pairs["hp_split"] = adjacent_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside),
                                                  axis=1)
inclusion_pairs["hp_split"] = inclusion_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside),
                                                    axis=1)
# SPLIT DATASETS BETWEEN TRAIN AND TEST GEOGRAPHICALLY
field = "hp_split"
if args.split_method == "per_place":
    field = "ID"
for df in [cooc_pairs, adjacent_pairs]:
    df_train, _ = train_test_split(df, stratify=df[field].values, test_size=0.33)
    df["split"] = "test"
    df.loc[df_train.index.values, "split"] = "train"
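# Inclusion pairs are split at random, without stratification.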
inc_train, _ = train_test_split(inclusion_pairs, test_size=0.33)
inclusion_pairs["split"] = "test"
inclusion_pairs.loc[inc_train.index.values, "split"] = "train"
# SAVE DATA
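# Each dataset is written as a tab-separated file named <PREFIX>_<relation>.csv;
# note that pandas also writes the dataframe index as the first column.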
inclusion_pairs.to_csv("{0}_inclusion.csv".format(PREFIX), sep="\t")
adjacent_pairs.to_csv("{0}_adjacent.csv".format(PREFIX), sep="\t")
cooc_pairs.to_csv("{0}_cooc.csv".format(PREFIX), sep="\t")
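
# Example run (paths taken from the debugging line above, purely illustrative):
#   python generate_dataset.py ../data/geonamesData/FR.txt \
#       ../data/wikipedia/cooccurrence_FR.txt ../data/geonamesData/hierarchy.txt
# would produce FR_inclusion.csv, FR_adjacent.csv and FR_cooc.csv in the
# current working directory.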