# 1_extractDataFromWikidata.py
# Author: Jacques Fize (commit 35f9959b)
# Extracts geolocated entities (with an English Wikipedia sitelink)
# from a gzipped Wikidata JSON dump into a TSV file.
import json
import gzip
import argparse
import re
import pandas as pd
from joblib import Parallel, delayed
# To avoid progressbar issue
from tqdm import tqdm as tqdm_base
def tqdm(*args, **kwargs):
    """Create a tqdm progress bar, first closing any stale instances.

    Works around duplicated/garbled bars when tqdm is invoked several
    times in the same interpreter session: every still-registered bar
    is deregistered before a fresh one is returned.
    """
    if hasattr(tqdm_base, '_instances'):
        # Copy the registry first: _decr_instances mutates it.
        for stale_bar in list(tqdm_base._instances):
            tqdm_base._decr_instances(stale_bar)
    return tqdm_base(*args, **kwargs)
# Command-line interface: input dump path and output TSV path.
parser = argparse.ArgumentParser()
parser.add_argument("wikidata_json_dump_filename",
                    help="Wikipedia JSON dump compressed with gzip (*.gz)")
parser.add_argument("output_filename")
args = parser.parse_args()

# Prepare Output File. Opened in UTF-8 explicitly: Wikipedia titles are
# frequently non-ASCII and a locale-dependent default encoding would
# raise UnicodeEncodeError on such rows. Kept open for the whole run
# (job() appends to it); closed implicitly at interpreter exit.
output = open(args.output_filename, 'w', encoding="utf-8")
output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
    "ID_WIKIDATA", "title", "url", "latitude", "longitude", "classes"))
def job(line):
    """Extract one geolocated entity from a raw Wikidata dump line.

    Parameters
    ----------
    line : bytes
        One line of the Wikidata JSON dump (one entity per line,
        comma-terminated inside the enclosing JSON array).

    For every entity that carries a coordinate claim (P625), an
    instance-of claim (P31) and an English Wikipedia sitelink, writes a
    TSV row (id, title, url, latitude, longitude, "_"-joined class ids)
    to the module-level `output` file. Otherwise returns silently.
    """
    line = line.decode("utf-8")
    # Cheap substring pre-filter: skip entities lacking coordinates or
    # instance-of claims before paying for a full JSON parse.
    if "\"P625\"" not in line or "\"P31\"" not in line:
        return
    try:
        data = json.loads(line.strip(",\n"))
    except json.JSONDecodeError:
        # First line of the dump is "[" and last is "]" - not entities.
        return
    try:
        if "sitelinks" in data and "claims" in data:
            if "enwiki" in data["sitelinks"]:
                id_ = data["id"]
                coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
                title = data["sitelinks"]["enwiki"]["title"]
                url = "https://en.wikipedia.org/wiki/{0}".format(title.replace(" ", "_"))
                lat = coords_data["latitude"]
                lon = coords_data["longitude"]
                classes_ = ""
                for claimP31 in data["claims"]["P31"]:
                    classes_ = classes_ + "_" + str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
                output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                    id_, title, url, lat, lon, classes_.strip("_")))
    except (KeyError, IndexError, TypeError):
        # Malformed claims (e.g. "somevalue"/"novalue" snaks have no
        # "datavalue") - skip the entity rather than crash the worker.
        return
Parallel(n_jobs=8,backend="multiprocessing")(delayed(job)(line)for line in tqdm(gzip.GzipFile(args.wikidata_json_dump_filename),unit_scale=True,unit_divisor=1000))
"""
grep -v "ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses" selectedPages.csv > selectedPages2.csv
{ echo -n 'ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses\n'; cat selectedPages2.csv; } > selectedPages3.csv
import pandas as pd
df = pd.read_csv("test.txt.new",sep="\t")
df
df.latitude
df
df.columns
nano test.txt.new
!nano test.txt.new
!nano test.txt.new
df = pd.read_csv("test.txt.new",sep="\t")
df.latitude
import geopandas as gpd
gdf = gpd.read_file("data/france/france_metro.geojson")
from shapely.geometry import Point
df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1)
df["geom"]=df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1)
gdf
gdf.iloc[0].geometry
france = gdf.iloc[0].geometry
from tqdm import tqdm
tqdm.pandas()
df.geom.progress_apply(lambda x : france.contains(x))
france.convex_hull
ff =france.convex_hull
df.geom.progress_apply(lambda x : ff.contains(x))
is_in_france = df.geom.progress_apply(lambda x : ff.contains(x))
df_new = df[is_in_france].copy()
df_new
del df_new["geom"]
df_new.to_csv("data/wikidata/sample/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_FRANCE")
!cp test.txt.new data/wikidata/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv
"""
# A run is done in ~1,18 hours (i7 2.8ghz, 16Gb RAM)