diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..769c55f833fc02c195e1134d4ebf9d2d061a47e6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,142 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + + +#### CUSTOM + +data/* +deprecated/* +*.ipynb_checkpoints +notebooks/* +outputs/* +temp/* +WikipediaExtract/* + +*.DS_Store \ No newline at end of file diff --git a/1_extractDataFromWikidata.py b/1_extractDataFromWikidata.py new file mode 100644 index 0000000000000000000000000000000000000000..64c777c54ad8fbbfa7c7ef6878b973014bad6410 --- /dev/null +++ b/1_extractDataFromWikidata.py @@ -0,0 +1,92 @@ +import json +import gzip +import argparse +import re + +import pandas as pd + +from joblib import Parallel, delayed + +# To avoid progressbar issue +from tqdm import tqdm as tqdm_base +def tqdm(*args, **kwargs): + if hasattr(tqdm_base, '_instances'): + for instance in list(tqdm_base._instances): + tqdm_base._decr_instances(instance) + return tqdm_base(*args, **kwargs) + + +parser = argparse.ArgumentParser() +parser.add_argument("wikidata_json_dump_filename",help="Wikipedia JSON dump compressed with gzip (*.gz)") +parser.add_argument("output_filename") + +args = parser.parse_args() + +# Prepare Output File +output = open(args.output_filename,'w') +output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format("ID_WIKIDATA","title","url","latitude","longitude","classes")) + +def job(line): + line = line.decode("utf-8") + + if not "\"P625\"" in line or not "\"P31\"" in line: + return + try: + data = json.loads(line.strip(",\n")) + if "sitelinks" in data and "claims" in data: + if "enwiki" in data["sitelinks"]: + id_ = data["id"] + 
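# Rough shape of a matching entity in the Wikidata JSON dump (values are illustrative);
# this is why the code below reads P625 ("coordinate location") for the coordinates and
# P31 ("instance of") for the classes:
# {"id": "Q90",
#  "sitelinks": {"enwiki": {"title": "Paris"}},
#  "claims": {"P625": [{"mainsnak": {"datavalue": {"value": {"latitude": 48.85, "longitude": 2.35}}}}],
#             "P31": [{"mainsnak": {"datavalue": {"value": {"id": "Q515"}}}}]}}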
coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"] + title = data["sitelinks"]["enwiki"]["title"] + url = "https://en.wikipedia.org/wiki/{0}".format(title.replace(" ","_")) + lat = coords_data["latitude"] + lon = coords_data["longitude"] + classes_ = "" + for claimP31 in data["claims"]["P31"]: + classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"]) + output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_"))) + except Exception: # First Line is "['" and last line is "]'" + pass + + +Parallel(n_jobs=8,backend="multiprocessing")(delayed(job)(line)for line in tqdm(gzip.GzipFile(args.wikidata_json_dump_filename),unit_scale=True,unit_divisor=1000)) + + +""" +grep -v "ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses" selectedPages.csv > selectedPages2.csv +{ echo -n 'ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses\n'; cat selectedPages2.csv; } > selectedPages3.csv + + +import pandas as pd +df = pd.read_csv("test.txt.new",sep="\t") +df +df.latitude +df +df.columns +nano test.txt.new +!nano test.txt.new +!nano test.txt.new +df = pd.read_csv("test.txt.new",sep="\t") +df.latitude +import geopandas as gpd +gdf = gpd.read_file("data/france/france_metro.geojson") +from shapely.geometry import Point +df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1) +df["geom"]=df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1) +gdf +gdf.iloc[0].geometry +france = gdf.iloc[0].geometry +from tqdm import tqdm +tqdm.pandas() +df.geom.progress_apply(lambda x : france.contains(x)) +france.convex_hull +ff =france.convex_hull +df.geom.progress_apply(lambda x : ff.contains(x)) +is_in_france = df.geom.progress_apply(lambda x : ff.contains(x)) +df_new = df[is_in_france].copy() +df_new +del df_new["geom"] +df_new.to_csv("data/wikidata/sample/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_FRANCE") +!cp test.txt.new data/wikidata/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv +""" +# A run is done in ~1,18 hours (i7 2.8ghz, 16Gb RAM) \ No newline at end of file diff --git a/2_extractLearningDataset.py b/2_extractLearningDataset.py new file mode 100644 index 0000000000000000000000000000000000000000..725d82a71f794569a150b9a5aef396b1c8c74b54 --- /dev/null +++ b/2_extractLearningDataset.py @@ -0,0 +1,46 @@ +import gzip +import json +import re + +import argparse + +import pandas as pd + +from joblib import Parallel,delayed +from tqdm import tqdm + +parser = argparse.ArgumentParser() + +parser.add_argument("wikipedia_archive_filename",help="Filename of the Wikipedia corpus parsed with gensim") +parser.add_argument("wikidata_extraction",help="Output from the previous step") +parser.add_argument("output_file") + +args = parser.parse_args() + +try: + df_extract_wikidata = pd.read_csv(args.wikidata_extraction)#,header=None,names=["ID_WIKIDATA","title","url","latitude","longitude","classes"]) +except: + df_extract_wikidata = pd.read_csv(args.wikidata_extraction,sep="\t")#,header=None,names=["ID_WIKIDATA","title","url","latitude","longitude","classes"]) + +titles = set(df_extract_wikidata.title.values) + +coords_lat = dict(df_extract_wikidata["title latitude".split()].values) +coords_lon = dict(df_extract_wikidata["title longitude".split()].values) +class_ = dict(df_extract_wikidata["title classes".split()].values) + +output = open(args.output_file,'w') + + + + +def job(line): + line = line.decode("utf-8") + data = json.loads(line) + if data["title"] in titles: + title = 
data["title"] + data["lat"] = coords_lat[title] + data["lon"] = coords_lon[title] + data["classes"] = class_[title] + output.write(json.dumps(data)+"\n") + +Parallel(n_jobs = 8,backend="multiprocessing")(delayed(job)(line) for line in tqdm(gzip.GzipFile(args.wikipedia_archive_filename,'rb'),total=5980533)) \ No newline at end of file diff --git a/3_samplingLearningDataset.py b/3_samplingLearningDataset.py new file mode 100644 index 0000000000000000000000000000000000000000..2d512ce71cf324213f2e6e67ce74f495b28d35e0 --- /dev/null +++ b/3_samplingLearningDataset.py @@ -0,0 +1,48 @@ +import subprocess,os,json +import numpy as np +import time + +import logging +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) + +from tqdm import tqdm + +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("corpus_filename") +parser.add_argument("sampling_size",type=int) +parser.add_argument("output_filename") + +args= parser.parse_args() + +CORPUS_FILENAME = args.corpus_filename +SAMPLE_SIZE = args.sampling_size + +# Compute the size of input corpus +logging.info("Computing the corpus size...") +wc_l = subprocess.check_output(["wc","-l", CORPUS_FILENAME ]) +NUMBER_OF_INPUT=int(wc_l.strip().split()[-2]) +logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT)) + +# Sampling +logging.info("Sampling...") +arr_ = np.arange(NUMBER_OF_INPUT) +sample = np.random.choice(arr_,SAMPLE_SIZE) + +# Prepare Output file +output = open(args.output_filename,'w') + +# Writing in the output file +logging.info("Writing the output...") +for ix,line in tqdm(enumerate(open(CORPUS_FILENAME)),total=NUMBER_OF_INPUT): + if ix in sample: + output.write(line) + +logging.info("Done !") + + diff --git a/4_embeddings_lat_lon_type.py b/4_embeddings_lat_lon_type.py new file mode 100644 index 0000000000000000000000000000000000000000..e674e508052df416b5378c3b79a6d6cd0de3451b --- /dev/null +++ b/4_embeddings_lat_lon_type.py @@ -0,0 +1,215 @@ +#Â Basic module +import time +import random +import json +import os +import sys +import argparse + +# Data module +import numpy as np +import pandas as pd + +from joblib import Parallel,delayed + +# Keras basic +import keras +from keras import backend as K +from keras.initializers import Constant + +# preprocessing +from sklearn import preprocessing +from keras.preprocessing import sequence +from keras.preprocessing.text import Tokenizer +from keras.preprocessing.sequence import pad_sequences +from keras.utils import to_categorical +from keras.preprocessing.text import text_to_word_sequence + +# Neural Network Model and layers class +from keras.layers import Dense, Input, GlobalAveragePooling1D, Embedding, LSTM, Bidirectional, Conv1D, GRU +from keras.models import Model + +#Â Neural network model and visualisation function +from models import getModel,BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model +from helpers import plot_accuracy_from_history, save_embedding + +#Â Utils +from utils import CoordinatesEncoder,TokenizerCustom,_split + + +# Logging +import logging +from chrono import Chronometer +logging.basicConfig( + format='[%(asctime)s][%(levelname)s] %(message)s ', + datefmt='%m/%d/%Y %I:%M:%S %p', + level=logging.INFO + ) +chrono = Chronometer() + +#Â Visualisation +import matplotlib.pyplot as plt +from tqdm import tqdm + +parser = argparse.ArgumentParser() + +parser.add_argument("input") +parser.add_argument("--glove_dir",default="data/glove") + 
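# Example invocation (the input path is illustrative; it mirrors the commented-out
# default passed to parse_args further down):
#   python3 4_embeddings_lat_lon_type.py WikipediaExtract/WikipediaExtract_filtered_300.json --embedding_dimension 100 --epochs 100 -v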
+parser.add_argument("--max_sequence_length",type=int, default=15) +parser.add_argument("--max_num_words",type=int, default=400000) + +parser.add_argument("--embedding_dimension",type=int, default=100) + +parser.add_argument("--batch_size",type=int, default=100) +parser.add_argument("--epochs",type=int, default=100) + +parser.add_argument("-v",action="store_true",help="Display Keras training verbose") + + +def clean(x): + return x.lower().replace("\n","").replace("\'\'\'","").replace("\'\'","") + +def split_data(input_data): + """ + Split the corpus into different list, each one corresponding to a feature(coordinates, type, textual data) + + Parameters + ---------- + input_data : _io.TextIOWrapper + File instance + + Returns + ------- + tuple + lists of locations : name, coordinates, types and text data + """ + listLocations, listText, listCoords, listTypes = [], [], [], [] + for line in input_data: + data = json.loads(line.strip("\n")) + listLocations.append(data["title"]) + listText.append(clean(data["section_texts"][0])) # get intro + # listTypes.append(data["classes"] if data["classes"] else "Q5624766") # Default Populated Places + listTypes.append(dict_class_translate[data["classes"]] + if data["classes"] in dict_class_translate else "populated place") + listCoords.append([float(data["lat"]), float(data["lon"])]) + return listLocations, listCoords, listTypes, listText + + + +# PARSE ARGS +args = parser.parse_args()#("./WikipediaExtract/WikipediaExtract_filtered_300.json".split()) + +GLOVE_DIR = args.glove_dir +MAX_SEQUENCE_LENGTH = args.max_sequence_length # Size of the context window for word2vec CBOW +MAX_NUM_WORDS = args.max_num_words # Number of words in the vocabulary +EMBEDDING_DIM = args.embedding_dimension # Dimensionality for the word embeddings +BATCH_SIZE = args.batch_size # Size of the training batches of instances +EPOCHS = args.epochs +CORPUS_FILENAME = args.input + +# SEARCH FOR UNIQUE TYPES +logging.info("Collecting class name") + +# For class translation +dict_class_translate = json.load(open("data/wikidata/class_data/WikiClass2DPClass.json")) + +classlabels = set([]) +for line in tqdm(open(CORPUS_FILENAME), desc="Reading the line "): + data = json.loads(line.strip("\n")) + classlabels.add(dict_class_translate[data["classes"]] + if data["classes"] in dict_class_translate else "populated place") + #classlabels.add(data["classes"] if data["classes"] else "Q5624766") +classlabels = list(classlabels) + +# LOAD DATA +logging.info("Loading the data...") +chrono.start("data_loading") + +_, listCoords, listTypes, listText = split_data(open(CORPUS_FILENAME)) + +logging.info("Data Loaded in {0} seconds!".format(chrono.stop("data_loading"))) + +logging.info("Extract Vocab") +# Extract Vocab +vocab_ = set([" "]) +tokenizer = Tokenizer() +tokenizer.fit_on_texts(listText) +tokenizer.word_index[" "] = np.max(list(tokenizer.index_word.keys()))+1 +vocab_ = vocab_.union(tokenizer.word_index.keys()) +logging.info("The vocabulary contains {0} words".format(len(list(vocab_)))) + +logging.info("Initialize Tokenizer/ClassEncoder/CoordinateEncoder...") +# Tokenizer +#tokenizer = TokenizerCustom(list(vocab_)) +max_key_tokenizer = np.max(list(tokenizer.index_word.keys())) +num_words = min(MAX_NUM_WORDS, len(tokenizer.word_index)) + 1 +# Coordinate Encoder +coordinate_encoder = CoordinatesEncoder(2,2) +# CLASS ENCODER type-->int +type_encoder = preprocessing.LabelEncoder() +type_encoder.fit(classlabels) + + +logging.info("Parsing data for the neural network...") +chrono.start("parse_data") +X = 
tokenizer.texts_to_sequences(listText) + +#X = X[:500] # Sample for tests + +listCoords = np.array(listCoords) +y_lat = listCoords[:, 0] +y_lon = listCoords[:, 1] + + +new_X,Y_type,Y_coord = [],[],[] +for ix,x in tqdm(enumerate(X),total=len(X)): + text_sequence_splited = _split(x,MAX_SEQUENCE_LENGTH,0) + + # Sub-Sequences vectors + new_X.extend(text_sequence_splited) + + # coordinate vectors + new_coordinates=coordinate_encoder.vector_flatten(y_lat[ix],y_lon[ix]) + Y_coord.extend([new_coordinates]*len(text_sequence_splited)) + + #type vectors + type_code = type_encoder.transform([listTypes[ix]])[0] + arr_ = np.zeros(len(type_encoder.classes_)) + arr_[type_code]=1 + Y_type.extend([arr_]*len(text_sequence_splited)) + + +Y_coord = np.array(Y_coord) +Y_type = np.array(Y_type) +new_X = np.array(new_X) + +logging.info("Data Parsed in {0} seconds!".format(chrono.stop("parse_data"))) + +for model_f in [BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model]: + name, model = getModel(model_f,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder,coordinate_encoder) + logging.info("Training the {0} model...".format(name)) + chrono.start("model_training") + name = name + \ + "_{0}dim_{1}epoch_{2}batch".format(EMBEDDING_DIM, EPOCHS, BATCH_SIZE) + + history = model.fit(x=new_X, + y=[Y_type,Y_coord],#Y_lat,Y_lon], + validation_split=0.33, + epochs=EPOCHS, + batch_size=BATCH_SIZE, + verbose=(1 if args.v else 0), + workers = 4, + #use_multiprocessing=True + ) + + logging.info("Model {0} have been trained in {1} seconds!".format(name,chrono.stop("model_training"))) + hist_df = pd.DataFrame(history.history) + hist_df.to_csv("outputs/{0}.csv".format(name)) + for layer_name in "place_type coord".split(): + plt.clf() + plot_accuracy_from_history( + name, hist_df, layer_name, "outputs/{0}.png".format(name), "") + save_embedding(model, tokenizer, 0, "outputs/{0}.txt".format(name)) + + diff --git a/README.md b/README.md index 0eb5eab21761132cef34520dd53d5fe9b72af33a..72ba7df9b5cfb872f979eb8fce9511d74d40fa9d 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,20 @@ -# Place Embedding +# INSTALL BASEMAP +```bash +brew install geos +pip3 install https://github.com/matplotlib/basemap/archive/master.zip +``` + +# GET DATA + +## Process Wikipedia + +python3 -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz + +##Â Process Wikidata + +python3 extractInfoWikidata.py + +## Fuse Data for training + +python3 extractsubWikipedia.py \ No newline at end of file diff --git a/chrono.py b/chrono.py new file mode 100644 index 0000000000000000000000000000000000000000..8e0adfd61f63449bce232df919402448735db4b6 --- /dev/null +++ b/chrono.py @@ -0,0 +1,57 @@ +import time + +class Chronometer(): + def __init__(self): + self.__task_begin_timestamp = {} + + def start(self,task_name): + """ + Start a new task chronometer + + Parameters + ---------- + task_name : str + task id + + Raises + ------ + ValueError + if a running task already exists with that name + """ + if task_name in self.__task_begin_timestamp: + raise ValueError("A running task exists with the name {0}!".format(task_name)) + self.__task_begin_timestamp[task_name] = time.time() + + def stop(self,task_name): + """ + Stop and return the duration of the task + + Parameters + ---------- + task_name : str + task id + + Returns + ------- + float + duration of the task in seconds + + Raises + ------ + ValueError + if no task exist with the id `task_name` + """ + if not task_name in self.__task_begin_timestamp: + raise 
ValueError("The {0} task does not exist!".format(task_name)) + duration = time.time() - self.__task_begin_timestamp[task_name] + del self.__task_begin_timestamp[task_name] + return duration + +if __name__ == "__main__": + chrono = Chronometer() + chrono.start("test") + chrono.start("test2") + time.sleep(3) + print(chrono.stop("test")) + time.sleep(3) + print(chrono.stop("test2")) \ No newline at end of file diff --git a/evalgeonamesembeddings.py b/evalgeonamesembeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..bf2bc7d2caea005829ddf6c56609f71ccc1fc323 --- /dev/null +++ b/evalgeonamesembeddings.py @@ -0,0 +1,70 @@ +# Evaluation process +import gensim +import glob +import re +import gensim +import random +from helpers import * +from scipy.spatial.distance import cosine +from shapely.geometry import Point +from scipy.stats.stats import pearsonr + +import pandas as pd +import geopandas as gpd + +from tqdm import tqdm + +NPAIR = 100000 +fns = glob.glob("data/embeddings/*.bin") + +def get_data(fn): + data = [int(x) for x in re.findall("\d+",fn)] + if not len(data) == 4: + return {"embedding_size":data[0], + "walk_length":data[1], + "number_of_walks":data[2], + "word2vec_window_size":data[3], + "filepath":fn, + "noise":data[4] + } + #raise Exception("filename should have 4 integers") + return { + "embedding_size":data[0], + "walk_length":data[1], + "number_of_walks":data[2], + "word2vec_window_size":data[3], + "filepath":fn + } + +df = read_geonames("./data/geonamesData/FR.txt") +df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1) + +# Create GeoDataFrame for faster spatial comparison operations +gdf = gpd.GeoDataFrame(df) + +# Select a sample that concerns the departement "La Manche" +manche_gdf = gdf[gdf.admin2_code == "50"] + +df =pd.DataFrame([get_data(fn) for fn in fns]) + +def get_pearsons(model): + manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid + coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values) + places = list(coords.keys()) + geodesic_d = [] + embeddings_d = [] + for i in tqdm(range(NPAIR),disable=True): + placeA=random.choice(places) + placeB=random.choice(places) + geodesic_d.append(coords[placeA].distance(coords[placeB])) + embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)])) + return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value + +df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0]) + +df.plot.scatter(x="walk_length", y="pearson") +plt.show() +df.plot.scatter(x="number_of_walks", y="pearson") +plt.show() +df.plot.scatter(x="word2vec_window_size", y="pearson") +plt.show() \ No newline at end of file diff --git a/geonames_embedding.py b/geonames_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..e5c6305d1bd5e5d2fe7a927792dcc765ea1bf198 --- /dev/null +++ b/geonames_embedding.py @@ -0,0 +1,228 @@ +# PYTHON MODULE +import math +import random +from argparse import ArgumentParser +from multiprocessing import cpu_count +from argparse import RawTextHelpFormatter + +# COMMON DATA STRUCTURE MODULE +import pandas as pd +import numpy as np +import networkx as nx + +# SPATIAL DATA MANIPULATION +import geopandas as gpd +import osrm +osrm.RequestConfig.host = "jacquesfize.com:5000" +from shapely.geometry import Point + +# DISTANCE MODULE +from scipy.spatial.distance import cosine +from scipy.stats.stats import pearsonr + +# Machine Learning 
MODULE +from node2vec import Node2Vec +import gensim + +# VISUALISATION MODULE +import matplotlib.pyplot as plt +from tqdm import tqdm +tqdm.pandas() + +# PERSONAL FUNCTION +from helpers import * + +parser = ArgumentParser(description='Generate a spatial embedding of places using Geonames data', formatter_class=RawTextHelpFormatter) + +parser.add_argument("input") + +parser.add_argument("--nbcpu",type=int,default=cpu_count()) + +parser.add_argument("--vector-size",type=int,default=64,help="Output Vector Dimension") +parser.add_argument("--walk-length",type=int,default=30, help="Size of the walk generated during the Node2vec algorithm") +parser.add_argument("--num-walks",type=int,default=200, help="Number of walk generated during the Node2vec algorithm") +parser.add_argument("--word2vec-window-size",type=int,default=30, help="Window size used in the Word2vec algorithm") + +parser.add_argument("--buffer-size",type=float,default=0.03,help="Buffer size to transform Point in Polygon. Used for adjacency matrix computation.") +parser.add_argument("-d",action="store_true",help="Integrate the distance weight between vertices") +parser.add_argument("--dist",choices=["euclidean","itinerary"],default="itinerary",help="""Two distance functions are available: + - Euclidean : Euclidean distance between the two places centroids + - Itinerary : Compute the itinerary distance between two places using an OSRM service +""") + +parser.add_argument("--noise",action="store_true") +parser.add_argument("--noise-size",type=int,default=500) + +args = parser.parse_args() + +# INPUT DATA +GEONAMES_FN = args.input + +# PARALLELISM OPTION +NUMBER_OF_CPU_USED = args.nbcpu + +# Graph Embedding parameter +VECTOR_SIZE = args.vector_size +WALK_LENGTH = args.walk_length +NUMBER_OF_WALK = args.num_walks +WORD2VEC_WINDOW = args.word2vec_window_size + +# GRAPH WEIGHT PARAMETER +IS_DISTANCE = args.d +DISTANCE = args.dist +# if simulation of new toponyms +GEO_DISTANCE_COEF = 0.5 +EMBEDDING_DISTANCE_COEF = 0.5 + +# New toponym simulation +IS_NOISE = args.noise +NUMBER_OF_NODE_DESPATIALIZED = args.noise_size + + +# DISTANCE CACHE STORAGE +from sqlitedict import SqliteDict +distance_dict = SqliteDict('./data/distance_dict.sqlite', autocommit=True) + +# LOAD GEONAMES DATA +df = read_geonames(GEONAMES_FN) +df["geometry"] = df["latitude longitude".split()].progress_apply(lambda x:Point(x.longitude,x.latitude),axis=1) + +# Create GeoDataFrame for faster spatial comparison operations +gdf = gpd.GeoDataFrame(df) + +# Select a sample that concerns the departement "La Manche" +manche_gdf = gdf[gdf.admin2_code == "50"] +manche_gdf["geometry"]=manche_gdf.geometry.buffer(0.03) +manche_gdf.plot() +# plt.show() + +# Build a Adjacency matrix to generate the graph used for the embedding generation +N = len(manche_gdf) +adjacency_matrix = np.zeros((N,N)) +geometries = manche_gdf.geometry.values +for i in tqdm(range(N)): + for j in range(i,N): + adjacency_matrix[i,j] = geometries[i].intersects(geometries[j]) + adjacency_matrix[j,i] = adjacency_matrix[i,j] +plt.clf() +plt.imshow(adjacency_matrix);plt.colorbar() +# plt.show() + +# Mapping id between matrix and geonameid +manche_gdf["code_matrix"]=np.arange(N) +geoname_id2idxmat = dict(manche_gdf["geonameid code_matrix".split()].values) +idxmat2geoname_id = {v:k for k,v in geoname_id2idxmat.items()} + +# Add adjacent entity found in the Geodataframe +def get_adjacent_entity(x): + idxs = np.nonzero(adjacency_matrix[geoname_id2idxmat[x]])[0] + return [idxmat2geoname_id[idx] for idx in idxs if not 
idxmat2geoname_id[idx] == x] # take not itself + +manche_gdf["adjacent_entity"]=manche_gdf.geonameid.apply(get_adjacent_entity) + +# Code for getting the distance using the road network (not euclidean) PART 1 +manche_gdf["geometry_centroid"]=manche_gdf.centroid +coords = dict(manche_gdf["geonameid geometry_centroid".split()].values) + +# Code for getting the distance using the road network (not euclidean) PART 2 + +# Run ORSM SERVER +#https://hub.docker.com/r/osrm/osrm-backend/ +#docker run -t -p 5000:5000 -v $(pwd):/data osrm/osrm-backend osrm-extract -p /opt/car.lua /data/road.pbf +#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-partition /data/road.osrm +#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-customize /data/road.osrm +#docker run -t -i -p 5000:5000 -v "${PWD}:/data" osrm/osrm-backend osrm-routed --algorithm mld /data/road.osrm +# Check Also : https://github.com/ustroetz/python-osrm#route + +# Test: curl 'http://<yourserver>:5000/route/v1/driving/49.38,-1.37;49,-1.37?steps=true' +def getTupCoords(id_): + return [coords[id_].x,coords[id_].y] + +def getDistance(id_1,id_2): + try: + return osrm.simple_route(getTupCoords(id_1), getTupCoords(id_2), output="route", overview="full",steps=False,geometry="wkt")[0]["distance"] + except IndexError: + return -1 + +def signature(id_1,id2): + return "_".join([str(id_)for id_ in sorted([id_1,id2])]) + +def getDistanceSDict(id_1,id_2,sqlite_dict): + hash_ = signature(id_1,id_2) + if not hash_ in sqlite_dict: + sqlite_dict[hash_]=getDistance(id_1,id_2) + return sqlite_dict[hash_] +from joblib import Parallel,delayed # for parallel job computation + +def job(G,row,adjacent): + new_edge = (row.geonameid,adjacent) + if not G.has_edge(*new_edge): + if IS_DISTANCE and DISTANCE == "itinerary": + return (*new_edge,getDistanceSDict(new_edge[0],new_edge[1],distance_dict)) + elif IS_DISTANCE and DISTANCE == "euclidean": + raise NotImplementedError() + else: + return 1 +# Using Route Distance +G = nx.Graph() +for ix,row in tqdm(manche_gdf["geonameid adjacent_entity".split()].iterrows(),total=len(manche_gdf)): + new_edges = Parallel(n_jobs=4,backend="threading")(delayed(job)(G,row,adjacent) for adjacent in row.adjacent_entity) + for edge in new_edges: + if edge: + G.add_edge(edge[0],edge[1],weight=edge[2]) + +# Data for graph projection +lon_dict= dict(manche_gdf["geonameid longitude".split()].values) +lat_dict= dict(manche_gdf["geonameid latitude".split()].values) +pos= {n:[lon_dict[n],lat_dict[n]]for n in G.nodes()} + +nx.draw(G,pos=pos,node_size=1) +# plt.show() + +for ed in list(G.edges()): + G[ed[0]][ed[1]]["weight"]+=1 # problem when G[ed[0]][ed[1]]["weight"]==0: + +if IS_NOISE: + H = G.copy() + edges,weights = zip(*nx.get_edge_attributes(H,'weight').items()) + sample = random.sample(list(H.nodes()),NUMBER_OF_NODE_DESPATIALIZED) + H.remove_nodes_from(sample) + + pos= {n:[lon_dict[n],lat_dict[n]] for n in H.nodes()} + nx.draw(H,pos=pos,node_size=1) + # plt.show() + + label_dict = dict(manche_gdf["geonameid name".split()].values) + embeddings = dict(pd.read_msgpack("data/embeddings/geonamesFRWithEmbeddings.msg")["geonameid embedding".split()].values) + + ids,emb = zip(*embeddings.items()) + id2geonameid = dict(enumerate(ids)) + geonameid2id = {id_:ix for ix, id_ in enumerate(ids) } + emb_matrix = np.asarray(emb) + + from sklearn.metrics.pairwise import cosine_similarity + sim_matrix = cosine_similarity(emb) + top_n = np.argsort(sim_matrix)[:,-3:] + + for ix,n in enumerate(sample): + top_i = [id2geonameid[top] for top in 
top_n[geonameid2id[n]]] + weights_i = [sim_matrix[geonameid2id[n]][top]for top in top_n[geonameid2id[n]]] + for ij,top_ij in enumerate(top_i): + H.add_edge(n,top_ij,weight=weights_i[ij]) + +G = H.copy() + +node2vec = Node2Vec(G, dimensions=VECTOR_SIZE, walk_length=WALK_LENGTH, num_walks=NUMBER_OF_WALK, workers=NUMBER_OF_CPU_USED,temp_folder="temp") # Use temp_folder for big graphs +model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_CPU_USED) + +# Saving the embedding model +if not IS_NOISE: + model.save("manche_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE, + walk_l = WALK_LENGTH, + num_walk = NUMBER_OF_WALK, + window = WORD2VEC_WINDOW))#,noise = NUMBER_OF_NODE_DESPATIALIZED)) +else: + model.save("manche_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE, + walk_l = WALK_LENGTH, + num_walk = NUMBER_OF_WALK, + window = WORD2VEC_WINDOW,noise = NUMBER_OF_NODE_DESPATIALIZED)) \ No newline at end of file diff --git a/helpers.py b/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..19ac5551fff18870e8e76f2cb18dcf4394358d3c --- /dev/null +++ b/helpers.py @@ -0,0 +1,90 @@ +import pandas as pd +import matplotlib.pyplot as plt +import os + +def read_geonames(file): + """ + Return a dataframe that contains Geonames data. + + Parameters + ---------- + file : str + path of the Geonames Csv file + + Returns + ------- + pd.DataFrame + geonames data + """ + dtypes_dict = { + 0: int, # geonameid + 1: str, # name + 2: str, # asciiname + 3: str, # alternatenames + 4: float, # latitude + 5: float, # longitude + 6: str, # feature class + 7: str, # feature code + 8: str, # country code + 9: str, # cc2 + 10: str, # admin1 code + 11: str, # admin2 code + 12: str, # admin3 code + 13: str, # admin4 code + 14: int, # population + 15: str, # elevation + 16: int, # dem (digital elevation model) + 17: str, # timezone + 18: str # modification date yyyy-MM-dd + } + rename_cols = { + 0:"geonameid", # geonameid + 1:"name", # name + 2:"asciiname", # asciiname + 3:"alternatenames", # alternatenames + 4:"latitude", # latitude + 5:"longitude", # longitude + 6:"feature_class", # feature class + 7:"feature_class", # feature code + 8:"country_code", # country code + 9:"cc2", # cc2 + 10:"admin1_code", # admin1 code + 11:"admin2_code", # admin2 code + 12:"admin3_code", # admin3 code + 13:"admin4_code", # admin4 code + 14:"population", # population + 15:"elevation", # elevation + 16:"dem", # dem (digital elevation model) + 17:"timezone", # timezone + 18:"modification_date" # modification date yyyy-MM-dd + } + data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False) + data.rename(columns=rename_cols,inplace=True) + return data + +def plot_accuracy_from_history(model_name,history_data,output_layer_name,outpu_filename,parameter_string,output_dirname="outputs",validation=True,show=False): + # Plot training & validation loss values + plt.gcf() + plt.gca() + plt.plot(history_data['{0}_accuracy'.format(output_layer_name)].values,label="Train Data") + if validation: + plt.plot(history_data['val_{0}_accuracy'.format(output_layer_name)].values,label = "Test Data") + plt.title('Layer {0} accuracy'.format(output_layer_name)) + plt.ylabel('Accuracy') + plt.xlabel('Epoch') + plt.ylim((0,1.1)) #1.1 if accuracy = 1 + plt.legend() + plt.savefig("outputs/{0}_{1}_{2}.png".format(model_name,parameter_string,output_layer_name,)) + if show : + plt.show() + + +def 
save_embedding(model,tokenizer,layer_idx,fn): + embedding_matrix = model.get_weights()[0] + with open(os.path.join(fn), 'w') as f: + for word, i in tokenizer.word_index.items(): + f.write(word) + for i in embedding_matrix[i]: f.write(' ' + repr(i)) + f.write('\n') + + diff --git a/models.py b/models.py new file mode 100644 index 0000000000000000000000000000000000000000..fa259fb03f4382e3312964d4efdd8d5d8963ea0b --- /dev/null +++ b/models.py @@ -0,0 +1,43 @@ + +from keras import Model +from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding, GRU, GlobalAveragePooling1D, Dropout + +# name,model_2=MPC_model(MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder) +# model_2.fit(x=new_X,y=[Y_type,Y_lat,Y_lon],validation_split=0.33,epochs=EPOCHS,batch_size=BATCH_SIZE,verbose=1) + + +def getModel(model_func,max_sequence_length,embedding_dim,num_words,class_encoder,coordinate_encoder): + sequence_input = Input(shape=(max_sequence_length,), dtype='int32') + embedding_layer = Embedding(num_words, embedding_dim, input_length=max_sequence_length, trainable=True) + x = embedding_layer(sequence_input) + + name,x = model_func(x) + + placetype_layer = Dense(len(class_encoder.classes_), activation='softmax',name="place_type")(x) # From the transformation, attempt to predict the place type + coordinates_layer = Dense(coordinate_encoder.oneDimensionOutputSize(), activation='softmax',name="coord")(x) + + model = Model(sequence_input, [ placetype_layer , coordinates_layer] ) + model.compile(loss=['categorical_crossentropy','categorical_crossentropy'], optimizer='adam',metrics=["accuracy"]) + return name,model + +def WEAverage_model(x): + name = "WordEmb_Average" + x = GlobalAveragePooling1D()(x) + return name,x + +def MPC_WEAverage_model(x): + name = "MPC_WordEmb_Average" + x = GlobalAveragePooling1D()(x) + x = Dense(2000)(x) + x = Dense(2000)(x) + return name,x + +def BI_LSTM_model(x): + name = "Bi-LSTM" + x = (Bidirectional(LSTM(64)))(x) + return name,x + +def BI_GRU_model(x): + name = "Bi-GRU" + x = (Bidirectional(GRU(64)))(x) + return name, x diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c3f0d19a8b7b000c536884c28b1b5c96d2d909d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +pyroutelib3 +node2vec +osrm +geopandas +pandas +numpy +tqdm +networkx +matplotlib +joblib +gensim \ No newline at end of file diff --git a/scripts/classificationEmbeddings.py b/scripts/classificationEmbeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..2943861f3727b0838de8d77dea231d685c7cc15b --- /dev/null +++ b/scripts/classificationEmbeddings.py @@ -0,0 +1,32 @@ + +import pandas as pd +df= pd.read_csv("dbpediaPlaceClassification.csv") +import numpy as np +def loadGloveModel(gloveFile): + print("Loading Glove Model") + f = open(gloveFile,'r') + model = {} + for line in f: + splitLine = line.split() + word = splitLine[0] + embedding = np.array([float(val) for val in splitLine[1:]]) + model[word] = embedding + print("Done.",len(model)," words loaded!") + return model +model = loadGloveModel("data/glove/glove.6B.100d.txt") +def getEmb(x,model): + emb = np.zeros(100) + for word in x.split(): + word =word.lower() + if word in model: + emb+=model[word] + return emb + +df["embeddings"] = df["Place"].apply(lambda x : getEmb(x,model)) +df.to_msgpack("dbpediaPlaceEmbedding.msg") + +import json +data = json.load(open("classname.json")) +df2 = pd.DataFrame(data.items(),columns="WID label".split()) +df2["embeddings"] = 
df2["label"].apply(lambda x:getEmb(x,model)) +df2.to_msgpack("classnameEmbedding.msg") \ No newline at end of file diff --git a/scripts/evaluation-dbpedia-types.py b/scripts/evaluation-dbpedia-types.py new file mode 100644 index 0000000000000000000000000000000000000000..d1ef557279a1e7c062b45e8d8581c1b3b2137abc --- /dev/null +++ b/scripts/evaluation-dbpedia-types.py @@ -0,0 +1,61 @@ +import os +import random +import numpy as np +from owlready2 import * +from vincenty import vincenty +from scipy.stats.stats import pearsonr + +BASE_DIR = '' +GLOVE_DIR = os.path.join(BASE_DIR, '.') +NUM_PAIRS = 100000 + +print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.') +embeddings_index = {} +listText = [] +with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f: + for line in f: + word, coefs = line.split(maxsplit=1) + coefs = np.fromstring(coefs, 'f', sep=' ') + embeddings_index[word] = coefs + listText.append(word) +print('Found %s word vectors.' % len(embeddings_index)) + +print('Collecting data from DBPedia ontology, downloaded from http://downloads.dbpedia.org/2014/dbpedia_2014.owl.bz2.') +onto = get_ontology("dbpedia_2014.owl") +onto.load() +def retrive_desc( concept , old_names=[] ): + desc = list( concept.descendants( include_self=False) ) + names = list( [ re.sub(r'.+\.', '', repr(concept)) + "/" + re.sub(r'.+\.', '', repr(c)) for c in desc ] ) + names = [x for x in names if x not in old_names] + desc = [ desc[x] for x in range(len(names)) if names[x] not in old_names ] + new_desc = list(desc) + for i in desc: + n1, d1 = retrive_desc(i, names + old_names) + for j in range(len(n1)): + new_desc.append( d1[j] ) + names.append( re.sub(r'.+\.', '', repr(concept)) + "/" + n1[j] ) + return names, new_desc +names, _ = retrive_desc(onto.Place) +names = [ n.lower() for n in set(names) if re.sub(r'.+/', '', n.lower()) in embeddings_index ] # check if the name of the place type exists in the embeddings matrix + +print('Generating pairs of place names.') +name_pairs = [] +similarity_pairs = [] +distance_pairs = [] +for num in range(NUM_PAIRS): + name1 = random.choice(names) + name2 = random.choice(names) + if name1 == name2: continue + name_pairs.append( (name1,name2) ) + dist = 0.0 + n1 = name1.split('/') + n2 = name2.split('/') + for i in range(min(len(n1),len(n2))): + if n1[i] == n2[i]: dist += 1.0 + else: break + similarity_pairs.append(dist) # Similarity between the place types, given by the number of ancestors in common + distance_pairs.append( np.sqrt(np.sum((embeddings_index[re.sub(r'.+/', '', name1)] - embeddings_index[re.sub(r'.+/', '', name2)])**2)) ) # Euclidean distance between the embeddings + +result = pearsonr( distance_pairs , similarity_pairs) # Compute Pearson correlation and associated p-value +print(result) + diff --git a/scripts/evaluation-geonames.py b/scripts/evaluation-geonames.py new file mode 100644 index 0000000000000000000000000000000000000000..3f3b847f8d9f980d2a29c4bfef574f85d44235fb --- /dev/null +++ b/scripts/evaluation-geonames.py @@ -0,0 +1,62 @@ +import os +import random +import numpy as np +from vincenty import vincenty +from scipy.stats.stats import pearsonr + +BASE_DIR = '' +GLOVE_DIR = os.path.join(BASE_DIR, 'data/glove') +NUM_PAIRS = 100000 + +print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.') +embeddings_index = {} +listText = [] +with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f: + for line in f: + word, coefs = line.split(maxsplit=1) + coefs = np.fromstring(coefs, 'f', 
sep=' ') + embeddings_index[word] = coefs + listText.append(word) +print('Found %s word vectors.' % len(embeddings_index)) + +print('Collecting data from geonames downloaded from http://download.geonames.org/export/dump/allCountries.zip.') +file = open("data/geonamesData/allCountries.txt", "r") +placenames = { } +for line in file: + line = line.split("\t") + name = line[1].lower() + if " " in name or not(name in embeddings_index): # check if the main name exists in the embeddings matrix + names = line[3].split(",") + for n in names: + n = n.strip().lower() + if not(" " in n) and (n in embeddings_index): # if not, check if any of the alternative names exists in the embeddings matrix + name = n + break + if " " in name or not(name in embeddings_index): continue + placenames.update( { name : (float(line[4]), float(line[5])) } ) + +from scipy.spatial.distance import cosine +from tqdm import tqdm + +print('Generating pairs of place names.') +NUM_PAIRS = 1000 +name_pairs = [] +geo_distance_pairs = [] +distance_pairs = [] +for num in tqdm(range(NUM_PAIRS)): + name1 = random.choice(list(placenames.keys())) + name2 = random.choice(list(placenames.keys())) + if name1 == name2: continue + name_pairs.append( (name1,name2) ) + try: + distance_pairs.append(cosine(embeddings_index[name1], embeddings_index[name2])) + geo_distance_pairs.append( vincenty(placenames[name1], placenames[name2]) ) # Geospatial distance between the place names, given by Vincenty's geodetic formulae # Cosine distance between the embeddings + except: + pass +geo_distance_pairs= np.array(geo_distance_pairs).astype(float) +distance_pairs = np.nan_to_num(distance_pairs,nan=np.nanmax(distance_pairs)) +geo_distance_pairs = np.nan_to_num(geo_distance_pairs,nan=1) + + +result = pearsonr( geo_distance_pairs , distance_pairs) # Compute Pearson correlation and associated p-value +print(result) \ No newline at end of file diff --git a/scripts/extractWikidataClasseName.py b/scripts/extractWikidataClasseName.py new file mode 100644 index 0000000000000000000000000000000000000000..10740194a303f026f53ff997124853e573f5f73e --- /dev/null +++ b/scripts/extractWikidataClasseName.py @@ -0,0 +1,53 @@ +import json +import argparse +import time + +from SPARQLWrapper import SPARQLWrapper,JSON +from urllib.request import HTTPError + +from tqdm import tqdm + +parser = argparse.ArgumentParser() +parser.add_argument("available_class_filename",help="JSON file that contains an array of string. Each string is a Wikidata id (e.g. Q30)") +parser.add_argument("output_filename") +args = parser.parse_args() + +ids= json.load(open(args.available_class_filename)) + +def get_label(id_wikidata): + sparql = SPARQLWrapper("https://query.wikidata.org/sparql") + sparql.setQuery(""" + PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> + PREFIX wd: <http://www.wikidata.org/entity/> + select * + where { + wd:"""+id_wikidata+ """ rdfs:label ?label . 
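# keep only the English label of the class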
+ FILTER (langMatches( lang(?label), "EN" ) ) + } + LIMIT 1""" ) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + time.sleep(0.1) + try: + return results["results"]["bindings"][0]["label"]["value"] + except: + return "" + + +t = 0 +dict_results = {} +progress_bar = tqdm(total=len(ids)) +while t<len(ids): + try: + dict_results[ids[t]]=get_label(ids[t]) + except HTTPError as e: + time.sleep(1) + continue + progress_bar.update(1) + t+=1 +progress_bar.close() + +json.dump(dict_results,open(args.output_filename,'w')) + + + diff --git a/scripts/filterDataWithtopNclasse.py b/scripts/filterDataWithtopNclasse.py new file mode 100644 index 0000000000000000000000000000000000000000..e516a3db60e9b72dce11ffe1c4bd0c210c51dd09 --- /dev/null +++ b/scripts/filterDataWithtopNclasse.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# coding: utf-8 + +import pandas as pd +import numpy as np + +TOPN = 100 + +df = pd.read_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv",sep="\t",names="ID title url latitude longitude instance_of".split())[1:] +df = df.fillna("") +df["instance_of"] = df.instance_of.apply(lambda x: str(x).split("_")) + + +count = {} +for list_type in df.instance_of.values: + #print(list_type) + for type_ in list_type: + if not type_ in count: count[type_]=0 + count[type_]+=1 + +# modify count for CLASS IMPORTANT BUT NOT FREQUENT BECAUSE OF A HIGH GRANULARITY +to_increase = [ + "Q6256",#country + "Q5119",#capital + "Q27554677",#former capital + "Q10864048", # ADM. DIV. 1 + "Q13220204", # ADM. DIV. 2 + "Q13220204", # ADM. DIV. 3 + "Q14757767", # ADM. DIV. 4 + "Q82794" # geographic region +] +inf_ = np.max(list(count.values())) +for type_ in to_increase: + count[type_] = inf_+1 + + +print("Dataframe contains",len(df),"entities") + + + +count_df = df.from_dict(count,orient="index").reset_index().sort_values(0,ascending=False) + +class_filtered = set(count_df.head(TOPN)["index"].values) +#Q15640612 #5 +#Q22927291 #6 + + + +df = df[df.instance_of.apply(lambda x: sum(True for i in x if i in class_filtered)>0)] + +def getMostFrequentClass(x): + idx = np.argsort([count[i] for i in x])[-1] + return x[idx] + +df["type"] = df.instance_of.apply(getMostFrequentClass) +df.to_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_filteredTop{0}class".format(TOPN)) + + + diff --git a/scripts/getEmbeddingGeonamesPlacenames.py b/scripts/getEmbeddingGeonamesPlacenames.py new file mode 100644 index 0000000000000000000000000000000000000000..0493858b80ee67795aef73b89fd7bddaf6e24c84 --- /dev/null +++ b/scripts/getEmbeddingGeonamesPlacenames.py @@ -0,0 +1,59 @@ +import fasttext +print("Load Model Fasttext FR") +model = fasttext.load_model("./data/fasttext_FR/wiki.fr.bin") +print("Model Loaded !") + +import pandas as pd +def read_geonames(file): + dtypes_dict = { + 0: int, # geonameid + 1: str, # name + 2: str, # asciiname + 3: str, # alternatenames + 4: float, # latitude + 5: float, # longitude + 6: str, # feature class + 7: str, # feature code + 8: str, # country code + 9: str, # cc2 + 10: str, # admin1 code + 11: str, # admin2 code + 12: str, # admin3 code + 13: str, # admin4 code + 14: int, # population + 15: str, # elevation + 16: int, # dem (digital elevation model) + 17: str, # timezone + 18: str # modification date yyyy-MM-dd + } + rename_cols = { + 0:"geonameid", # geonameid + 1:"name", # name + 2:"asciiname", # asciiname + 3:"alternatenames", # alternatenames + 4:"latitude", # latitude + 5:"longitude", # longitude + 6:"feature_class", # feature class + 7:"feature_class", # feature code + 
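# note: column 7 is the Geonames feature *code*; renaming it to "feature_class" (the same
# label as column 6) leaves two identically named columns after rename, so "feature_code"
# was probably intended here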
8:"country_code", # country code + 9:"cc2", # cc2 + 10:"admin1_code", # admin1 code + 11:"admin2_code", # admin2 code + 12:"admin3_code", # admin3 code + 13:"admin4_code", # admin4 code + 14:"population", # population + 15:"elevation", # elevation + 16:"dem", # dem (digital elevation model) + 17:"timezone", # timezone + 18:"modification_date" # modification date yyyy-MM-dd + } + data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False) + data.rename(columns=rename_cols,inplace=True) + return data +data = read_geonames("./data/geonamesData/FR.txt") +data= data.fillna("") +data = data[data.admin2_code == "50"] + +data["embedding"] = data["name"].apply(lambda x : model[x]) +print(data) +data.to_msgpack("geonamesFRWithEmbeddings.msg") \ No newline at end of file diff --git a/scripts/getWikidataTypesNames.py b/scripts/getWikidataTypesNames.py new file mode 100644 index 0000000000000000000000000000000000000000..342fc469da7a287bdf997a8cb68c410b714e8f04 --- /dev/null +++ b/scripts/getWikidataTypesNames.py @@ -0,0 +1,42 @@ +import subprocess,os,json +import numpy as np +import time + +import json +ids= json.load(open("classavailable.json")) + +from SPARQLWrapper import SPARQLWrapper,JSON + +def get_label(id_wikidata): + sparql = SPARQLWrapper("https://query.wikidata.org/sparql") + sparql.setQuery(""" + PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> + PREFIX wd: <http://www.wikidata.org/entity/> + select * + where { + wd:"""+id_wikidata+ """ rdfs:label ?label . + FILTER (langMatches( lang(?label), "EN" ) ) + } + LIMIT 1""" ) + sparql.setReturnFormat(JSON) + results = sparql.query().convert() + time.sleep(0.1) + try: + return results["results"]["bindings"][0]["label"]["value"] + except: + return "" + +from urllib.request import HTTPError +from tqdm import tqdm +t = 0 +dict_results = {} +pbar = tqdm(total=len(ids)) +while t<len(ids): + try: + dict_results[ids[t]]=get_label(ids[t]) + except HTTPError as e: + time.sleep(1) + continue + pbar.update(1) + t+=1 +pbar.close() \ No newline at end of file diff --git a/scripts/run_.sh b/scripts/run_.sh new file mode 100755 index 0000000000000000000000000000000000000000..a614a8bd9938a00d552ed4a530d151de7f49770c --- /dev/null +++ b/scripts/run_.sh @@ -0,0 +1,7 @@ +python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 100 --walk-length 30 --num-walks 200 --word2vec-window-size 30 +python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 200 --walk-length 30 --num-walks 200 --word2vec-window-size 30 +python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 300 --walk-length 30 --num-walks 200 --word2vec-window-size 30 +python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 400 --walk-length 30 --num-walks 200 --word2vec-window-size 30 +python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 500 --walk-length 30 --num-walks 200 --word2vec-window-size 30 +python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 600 --walk-length 30 --num-walks 200 --word2vec-window-size 30 +python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 700 --walk-length 30 --num-walks 200 --word2vec-window-size 30 \ No newline at end of file diff --git a/utils.py b/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d41f4e0ddb988363aad8411da5c854e23cb37d1e --- /dev/null +++ b/utils.py @@ -0,0 +1,69 @@ +import math +import 
numpy as np + +from stop_words import get_stop_words +from nltk.tokenize import word_tokenize + + + +class TokenizerCustom(): + def __init__(self,vocab): + self.word_index = {vocab[i]:i for i in range(len(vocab))} + self.index_word = {i:vocab[i] for i in range(len(vocab))} + self.N = len(self.index_word) + def texts_to_sequences(self,listText): + seqs = [] + for text in listText: + seqs.append([self.word_index[word] for word in word_tokenize(text) if word in self.word_index]) + return seqs + +class CoordinatesEncoder: + def __init__(self,cell_size_lat=0.5,cell_size_lon=0.5): + self.min_lon = -180 + self.max_lon = -(self.min_lon) #Â Symetric + self.min_lat = -90 + self.max_lat = -(self.min_lat) # Symetric + + self.ecart_lat = self.max_lat-self.min_lat + self.ecart_lon = self.max_lon-self.min_lon + + self.cell_size_lat = cell_size_lat + self.cell_size_lon = cell_size_lon + + self.unit_size_lat = self.ecart_lat/self.cell_size_lat + self.unit_size_lon = self.ecart_lon/self.cell_size_lon + + def encode(self,lat,lon): + return ( + math.floor(((lat+self.max_lat)/self.ecart_lat)*self.unit_size_lat), + math.floor(((lon+self.max_lon)/self.ecart_lon)*(self.unit_size_lon)) + ) + + def number_lat_cell(self): + return int(self.unit_size_lat) + + def number_lon_cell(self): + return int(self.unit_size_lon) + + def oneDimensionOutputSize(self): + return self.number_lat_cell()*self.number_lon_cell() + + def vector(self,lat,lon): + lat_v,lon_v=np.zeros(self.number_lat_cell()),np.zeros(self.number_lon_cell()) + new_coords = self.encode(lat,lon) + lat_v[int(new_coords[0])] = 1 + lon_v[int(new_coords[1])] = 1 + return lat_v,lon_v + def vector_flatten(self,lat,lon): + vec = np.zeros(self.oneDimensionOutputSize()) # 2D Dense softmax isn't possible + new_coords = self.encode(lat,lon) + pos = self.number_lat_cell()*(new_coords[0])+new_coords[1] + vec[pos] = 1 #lon * lon size + return vec + +def _split(lst,n,complete_chunk_value): + chunks = [lst[i:i + n] for i in range(0, len(lst), n)] + if not chunks:return chunks + if len(chunks[-1]) != n: + chunks[-1].extend([complete_chunk_value]*(n-len(chunks[-1]))) + return np.array(chunks) \ No newline at end of file
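A minimal usage sketch for the helpers defined in utils.py above, assuming the same 2x2 degree grid that 4_embeddings_lat_lon_type.py uses; the coordinates and the token sequence are illustrative values:

from utils import CoordinatesEncoder, _split

encoder = CoordinatesEncoder(cell_size_lat=2, cell_size_lon=2)
# 180/2 latitude cells x 360/2 longitude cells = 90 * 180 = 16200 output classes
assert encoder.oneDimensionOutputSize() == 90 * 180

# One-hot vector marking the grid cell that contains the point (roughly Paris here)
cell_vector = encoder.vector_flatten(48.85, 2.35)
assert cell_vector.shape == (16200,) and cell_vector.sum() == 1

# Split a token-id sequence into fixed-size chunks, padding the last chunk with 0
chunks = _split(list(range(7)), 3, 0)
# -> array([[0, 1, 2], [3, 4, 5], [6, 0, 0]])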