Commit 35f9959b authored by Fize Jacques

Initial Commit

parent 76f085e4
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
#### CUSTOM
data/*
deprecated/*
*.ipynb_checkpoints
notebooks/*
outputs/*
temp/*
WikipediaExtract/*
*.DS_Store
import json
import gzip
import argparse
import re
import pandas as pd
from joblib import Parallel, delayed
# To avoid progressbar issue
from tqdm import tqdm as tqdm_base
def tqdm(*args, **kwargs):
if hasattr(tqdm_base, '_instances'):
for instance in list(tqdm_base._instances):
tqdm_base._decr_instances(instance)
return tqdm_base(*args, **kwargs)
parser = argparse.ArgumentParser()
parser.add_argument("wikidata_json_dump_filename",help="Wikipedia JSON dump compressed with gzip (*.gz)")
parser.add_argument("output_filename")
args = parser.parse_args()
# Prepare the output file (worker processes inherit this handle and write to it concurrently)
output = open(args.output_filename,'w')
output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format("ID_WIKIDATA","title","url","latitude","longitude","classes"))
def job(line):
line = line.decode("utf-8")
if not "\"P625\"" in line or not "\"P31\"" in line:
return
try:
data = json.loads(line.strip(",\n"))
if "sitelinks" in data and "claims" in data:
if "enwiki" in data["sitelinks"]:
id_ = data["id"]
coords_data = data["claims"]["P625"][0]["mainsnak"]["datavalue"]["value"]
title = data["sitelinks"]["enwiki"]["title"]
url = "https://en.wikipedia.org/wiki/{0}".format(title.replace(" ","_"))
lat = coords_data["latitude"]
lon = coords_data["longitude"]
classes_ = ""
for claimP31 in data["claims"]["P31"]:
classes_ = classes_ + "_"+ str(claimP31["mainsnak"]["datavalue"]["value"]["id"])
output.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(id_,title,url,lat,lon,classes_.strip("_")))
    except Exception:  # the dump's first line ("[") and last line ("]") are not valid JSON objects and are skipped
pass
Parallel(n_jobs=8, backend="multiprocessing")(delayed(job)(line) for line in tqdm(gzip.GzipFile(args.wikidata_json_dump_filename), unit_scale=True, unit_divisor=1000))
"""
grep -v "ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses" selectedPages.csv > selectedPages2.csv
{ echo -n 'ID_WIKIDATA\ttitle\turl\tlatitude\tlongitude\tclasses\n'; cat selectedPages2.csv; } > selectedPages3.csv
import pandas as pd
df = pd.read_csv("test.txt.new",sep="\t")
df
df.latitude
df
df.columns
nano test.txt.new
!nano test.txt.new
!nano test.txt.new
df = pd.read_csv("test.txt.new",sep="\t")
df.latitude
import geopandas as gpd
gdf = gpd.read_file("data/france/france_metro.geojson")
from shapely.geometry import Point
df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1)
df["geom"]=df["latitude longitude".split()].apply(lambda x : Point(x.longitude,x.latitude),axis=1)
gdf
gdf.iloc[0].geometry
france = gdf.iloc[0].geometry
from tqdm import tqdm
tqdm.pandas()
df.geom.progress_apply(lambda x : france.contains(x))
france.convex_hull
ff =france.convex_hull
df.geom.progress_apply(lambda x : ff.contains(x))
is_in_france = df.geom.progress_apply(lambda x : ff.contains(x))
df_new = df[is_in_france].copy()
df_new
del df_new["geom"]
df_new.to_csv("data/wikidata/sample/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_FRANCE")
!cp test.txt.new data/wikidata/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv
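
A condensed sketch of the session above (same file paths and logic), kept here for reference:

    import pandas as pd
    import geopandas as gpd
    from shapely.geometry import Point

    df = pd.read_csv("test.txt.new", sep="\t")
    france = gpd.read_file("data/france/france_metro.geojson").iloc[0].geometry
    hull = france.convex_hull  # cheaper containment test than the full polygon
    in_france = df.apply(lambda row: hull.contains(Point(row.longitude, row.latitude)), axis=1)
    df[in_france].to_csv("data/wikidata/sample/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_FRANCE")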
"""
# A full run takes ~1.18 hours (i7 2.8 GHz, 16 GB RAM)
import gzip
import json
import re
import argparse
import pandas as pd
from joblib import Parallel,delayed
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument("wikipedia_archive_filename",help="Filename of the Wikipedia corpus parsed with gensim")
parser.add_argument("wikidata_extraction",help="Output from the previous step")
parser.add_argument("output_file")
args = parser.parse_args()
try:
    df_extract_wikidata = pd.read_csv(args.wikidata_extraction)
except Exception:
    # fall back to the tab-separated format produced by the previous step
    # (expected columns: ID_WIKIDATA, title, url, latitude, longitude, classes)
    df_extract_wikidata = pd.read_csv(args.wikidata_extraction, sep="\t")
titles = set(df_extract_wikidata.title.values)
coords_lat = dict(df_extract_wikidata["title latitude".split()].values)
coords_lon = dict(df_extract_wikidata["title longitude".split()].values)
class_ = dict(df_extract_wikidata["title classes".split()].values)
output = open(args.output_file,'w')
def job(line):
line = line.decode("utf-8")
data = json.loads(line)
if data["title"] in titles:
title = data["title"]
data["lat"] = coords_lat[title]
data["lon"] = coords_lon[title]
data["classes"] = class_[title]
output.write(json.dumps(data)+"\n")
Parallel(n_jobs = 8,backend="multiprocessing")(delayed(job)(line) for line in tqdm(gzip.GzipFile(args.wikipedia_archive_filename,'rb'),total=5980533))
import subprocess,os,json
import numpy as np
import time
import logging
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
from tqdm import tqdm
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("corpus_filename")
parser.add_argument("sampling_size",type=int)
parser.add_argument("output_filename")
args= parser.parse_args()
CORPUS_FILENAME = args.corpus_filename
SAMPLE_SIZE = args.sampling_size
# Compute the size of input corpus
logging.info("Computing the corpus size...")
wc_l = subprocess.check_output(["wc","-l", CORPUS_FILENAME ])
NUMBER_OF_INPUT=int(wc_l.strip().split()[-2])
logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT))
# Sampling
logging.info("Sampling...")
arr_ = np.arange(NUMBER_OF_INPUT)
sample = set(np.random.choice(arr_, SAMPLE_SIZE, replace=False))  # without replacement, so exactly SAMPLE_SIZE lines; a set gives O(1) membership tests below
# Prepare Output file
output = open(args.output_filename,'w')
# Writing in the output file
logging.info("Writing the output...")
for ix,line in tqdm(enumerate(open(CORPUS_FILENAME)),total=NUMBER_OF_INPUT):
if ix in sample:
output.write(line)
logging.info("Done !")
# Basic module
import time
import random
import json
import os
import sys
import argparse
# Data module
import numpy as np
import pandas as pd
from joblib import Parallel,delayed
# Keras basic
import keras
from keras import backend as K
from keras.initializers import Constant
# preprocessing
from sklearn import preprocessing
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
# Neural Network Model and layers class
from keras.layers import Dense, Input, GlobalAveragePooling1D, Embedding, LSTM, Bidirectional, Conv1D, GRU
from keras.models import Model
# Neural network model and visualisation function
from models import getModel,BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model
from helpers import plot_accuracy_from_history, save_embedding
# Utils
from utils import CoordinatesEncoder,TokenizerCustom,_split
# Logging
import logging
from chrono import Chronometer
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
chrono = Chronometer()
# Visualisation
import matplotlib.pyplot as plt
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument("input")
parser.add_argument("--glove_dir",default="data/glove")
parser.add_argument("--max_sequence_length",type=int, default=15)
parser.add_argument("--max_num_words",type=int, default=400000)
parser.add_argument("--embedding_dimension",type=int, default=100)
parser.add_argument("--batch_size",type=int, default=100)
parser.add_argument("--epochs",type=int, default=100)
parser.add_argument("-v",action="store_true",help="Display Keras training verbose")
def clean(x):
return x.lower().replace("\n","").replace("\'\'\'","").replace("\'\'","")
def split_data(input_data):
"""
Split the corpus into different list, each one corresponding to a feature(coordinates, type, textual data)
Parameters
----------
input_data : _io.TextIOWrapper
File instance
Returns
-------
tuple
lists of locations : name, coordinates, types and text data
"""
listLocations, listText, listCoords, listTypes = [], [], [], []
for line in input_data:
data = json.loads(line.strip("\n"))
listLocations.append(data["title"])
listText.append(clean(data["section_texts"][0])) # get intro
# listTypes.append(data["classes"] if data["classes"] else "Q5624766") # Default Populated Places
listTypes.append(dict_class_translate[data["classes"]]
if data["classes"] in dict_class_translate else "populated place")
listCoords.append([float(data["lat"]), float(data["lon"])])
return listLocations, listCoords, listTypes, listText
# PARSE ARGS
args = parser.parse_args()#("./WikipediaExtract/WikipediaExtract_filtered_300.json".split())
GLOVE_DIR = args.glove_dir
MAX_SEQUENCE_LENGTH = args.max_sequence_length  # Length of the token sub-sequences fed to the network
MAX_NUM_WORDS = args.max_num_words # Number of words in the vocabulary
EMBEDDING_DIM = args.embedding_dimension # Dimensionality for the word embeddings
BATCH_SIZE = args.batch_size # Size of the training batches of instances
EPOCHS = args.epochs
CORPUS_FILENAME = args.input
# SEARCH FOR UNIQUE TYPES
logging.info("Collecting class name")
# For class translation
dict_class_translate = json.load(open("data/wikidata/class_data/WikiClass2DPClass.json"))
classlabels = set([])
for line in tqdm(open(CORPUS_FILENAME), desc="Reading the line "):
data = json.loads(line.strip("\n"))
classlabels.add(dict_class_translate[data["classes"]]
if data["classes"] in dict_class_translate else "populated place")
#classlabels.add(data["classes"] if data["classes"] else "Q5624766")
classlabels = list(classlabels)
# LOAD DATA
logging.info("Loading the data...")
chrono.start("data_loading")
_, listCoords, listTypes, listText = split_data(open(CORPUS_FILENAME))
logging.info("Data Loaded in {0} seconds!".format(chrono.stop("data_loading")))
logging.info("Extract Vocab")
# Extract Vocab
vocab_ = set([" "])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(listText)
tokenizer.word_index[" "] = np.max(list(tokenizer.index_word.keys()))+1
vocab_ = vocab_.union(tokenizer.word_index.keys())
logging.info("The vocabulary contains {0} words".format(len(list(vocab_))))
logging.info("Initialize Tokenizer/ClassEncoder/CoordinateEncoder...")
# Tokenizer
#tokenizer = TokenizerCustom(list(vocab_))
max_key_tokenizer = np.max(list(tokenizer.index_word.keys()))
num_words = min(MAX_NUM_WORDS, len(tokenizer.word_index)) + 1
# Coordinate Encoder
coordinate_encoder = CoordinatesEncoder(2,2)
# CLASS ENCODER type-->int
type_encoder = preprocessing.LabelEncoder()
type_encoder.fit(classlabels)
logging.info("Parsing data for the neural network...")
chrono.start("parse_data")
X = tokenizer.texts_to_sequences(listText)
#X = X[:500] # Sample for tests
listCoords = np.array(listCoords)
y_lat = listCoords[:, 0]
y_lon = listCoords[:, 1]
new_X,Y_type,Y_coord = [],[],[]
for ix,x in tqdm(enumerate(X),total=len(X)):
text_sequence_splited = _split(x,MAX_SEQUENCE_LENGTH,0)
# Sub-Sequences vectors
new_X.extend(text_sequence_splited)
# coordinate vectors
new_coordinates=coordinate_encoder.vector_flatten(y_lat[ix],y_lon[ix])
Y_coord.extend([new_coordinates]*len(text_sequence_splited))
#type vectors
type_code = type_encoder.transform([listTypes[ix]])[0]
arr_ = np.zeros(len(type_encoder.classes_))
arr_[type_code]=1
Y_type.extend([arr_]*len(text_sequence_splited))
Y_coord = np.array(Y_coord)
Y_type = np.array(Y_type)
new_X = np.array(new_X)
logging.info("Data Parsed in {0} seconds!".format(chrono.stop("parse_data")))
for model_f in [BI_GRU_model, BI_LSTM_model, MPC_WEAverage_model, WEAverage_model]:
name, model = getModel(model_f,MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder,coordinate_encoder)
logging.info("Training the {0} model...".format(name))
chrono.start("model_training")
name = name + \
"_{0}dim_{1}epoch_{2}batch".format(EMBEDDING_DIM, EPOCHS, BATCH_SIZE)
history = model.fit(x=new_X,
y=[Y_type,Y_coord],#Y_lat,Y_lon],
validation_split=0.33,
epochs=EPOCHS,
batch_size=BATCH_SIZE,
verbose=(1 if args.v else 0),
workers = 4,
#use_multiprocessing=True
)
logging.info("Model {0} have been trained in {1} seconds!".format(name,chrono.stop("model_training")))
hist_df = pd.DataFrame(history.history)
hist_df.to_csv("outputs/{0}.csv".format(name))
for layer_name in "place_type coord".split():
plt.clf()
plot_accuracy_from_history(
name, hist_df, layer_name, "outputs/{0}.png".format(name), "")
save_embedding(model, tokenizer, 0, "outputs/{0}.txt".format(name))
# Place Embedding
# INSTALL BASEMAP
```bash
brew install geos
pip3 install https://github.com/matplotlib/basemap/archive/master.zip
```
# GET DATA
## Process Wikipedia
```bash
python3 -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz
```
## Process Wikidata
```bash
python3 extractInfoWikidata.py <wikidata_json_dump_filename> <output_filename>
```
## Fuse Data for training
```bash
python3 extractsubWikipedia.py <wikipedia_archive_filename> <wikidata_extraction> <output_file>
```
import time
class Chronometer():
def __init__(self):
self.__task_begin_timestamp = {}
def start(self,task_name):
"""
Start a new task chronometer
Parameters
----------
task_name : str
task id
Raises
------
ValueError
if a running task already exists with that name
"""
if task_name in self.__task_begin_timestamp:
raise ValueError("A running task exists with the name {0}!".format(task_name))
self.__task_begin_timestamp[task_name] = time.time()
def stop(self,task_name):
"""
Stop and return the duration of the task
Parameters
----------
task_name : str
task id
Returns
-------
float
duration of the task in seconds
Raises
------
ValueError
if no task exist with the id `task_name`
"""
if not task_name in self.__task_begin_timestamp:
raise ValueError("The {0} task does not exist!".format(task_name))
duration = time.time() - self.__task_begin_timestamp[task_name]
del self.__task_begin_timestamp[task_name]
return duration
if __name__ == "__main__":
chrono = Chronometer()
chrono.start("test")
chrono.start("test2")
time.sleep(3)
print(chrono.stop("test"))
time.sleep(3)
print(chrono.stop("test2"))
# Evaluation process
import gensim
import glob
import re
import matplotlib.pyplot as plt
import random
from helpers import *
from scipy.spatial.distance import cosine
from shapely.geometry import Point
from scipy.stats.stats import pearsonr
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
NPAIR = 100000
fns = glob.glob("data/embeddings/*.bin")
def get_data(fn):
    """Parse the embedding hyperparameters encoded in the filename (4 integers, 5 when noise was added)."""
    data = [int(x) for x in re.findall(r"\d+", fn)]
    if len(data) == 5:  # the fifth integer is the noise size
        return {"embedding_size": data[0],
                "walk_length": data[1],
                "number_of_walks": data[2],
                "word2vec_window_size": data[3],
                "filepath": fn,
                "noise": data[4]}
    return {"embedding_size": data[0],
            "walk_length": data[1],
            "number_of_walks": data[2],
            "word2vec_window_size": data[3],
            "filepath": fn}
df = read_geonames("./data/geonamesData/FR.txt")
df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1)
# Create GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)
# Select a sample that concerns the departement "La Manche"
manche_gdf = gdf[gdf.admin2_code == "50"]
df =pd.DataFrame([get_data(fn) for fn in fns])
def get_pearsons(model):
manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid
coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values)
places = list(coords.keys())
geodesic_d = []
embeddings_d = []
for i in tqdm(range(NPAIR),disable=True):
placeA=random.choice(places)
placeB=random.choice(places)
        geodesic_d.append(coords[placeA].distance(coords[placeB]))  # planar distance in degrees between the two centroids
embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)]))
return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value
df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0])
df.plot.scatter(x="walk_length", y="pearson")
plt.show()
df.plot.scatter(x="number_of_walks", y="pearson")
plt.show()
df.plot.scatter(x="word2vec_window_size", y="pearson")
plt.show()
# PYTHON MODULE
import math
import random
from argparse import ArgumentParser
from multiprocessing import cpu_count
from argparse import RawTextHelpFormatter
# COMMON DATA STRUCTURE MODULE
import pandas as pd
import numpy as np
import networkx as nx
# SPATIAL DATA MANIPULATION
import geopandas as gpd
import osrm
osrm.RequestConfig.host = "jacquesfize.com:5000"
from shapely.geometry import Point
# DISTANCE MODULE
from scipy.spatial.distance import cosine
from scipy.stats.stats import pearsonr
# Machine Learning MODULE
from node2vec import Node2Vec
import gensim
# VISUALISATION MODULE
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
# PERSONAL FUNCTION
from helpers import *
parser = ArgumentParser(description='Generate a spatial embedding of places using Geonames data', formatter_class=RawTextHelpFormatter)
parser.add_argument("input")
parser.add_argument("--nbcpu",type=int,default=cpu_count())
parser.add_argument("--vector-size",type=int,default=64,help="Output Vector Dimension")
parser.add_argument("--walk-length",type=int,default=30, help="Size of the walk generated during the Node2vec algorithm")
parser.add_argument("--num-walks",type=int,default=200, help="Number of walk generated during the Node2vec algorithm")
parser.add_argument("--word2vec-window-size",type=int,default=30, help="Window size used in the Word2vec algorithm")
parser.add_argument("--buffer-size",type=float,default=0.03,help="Buffer size to transform Point in Polygon. Used for adjacency matrix computation.")
parser.add_argument("-d",action="store_true",help="Integrate the distance weight between vertices")
parser.add_argument("--dist",choices=["euclidean","itinerary"],default="itinerary",help="""Two distance functions are available:
- Euclidean : Euclidean distance between the two places centroids
- Itinerary : Compute the itinerary distance between two places using an OSRM service
""")
parser.add_argument("--noise",action="store_true")
parser.add_argument("--noise-size",type=int,default=500)
args = parser.parse_args()
# INPUT DATA
GEONAMES_FN = args.input
# PARALLELISM OPTION
NUMBER_OF_CPU_USED = args.nbcpu
# Graph Embedding parameter
VECTOR_SIZE = args.vector_size
WALK_LENGTH = args.walk_length
NUMBER_OF_WALK = args.num_walks
WORD2VEC_WINDOW = args.word2vec_window_size
# GRAPH WEIGHT PARAMETER
IS_DISTANCE = args.d
DISTANCE = args.dist
# if simulation of new toponyms
GEO_DISTANCE_COEF = 0.5
EMBEDDING_DISTANCE_COEF = 0.5
# New toponym simulation
IS_NOISE = args.noise
NUMBER_OF_NODE_DESPATIALIZED = args.noise_size
# DISTANCE CACHE STORAGE
from sqlitedict import SqliteDict
distance_dict = SqliteDict('./data/distance_dict.sqlite', autocommit=True)
# LOAD GEONAMES DATA
df = read_geonames(GEONAMES_FN)
df["geometry"] = df["latitude longitude".split()].progress_apply(lambda x:Point(x.longitude,x.latitude),axis=1)
# Create GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)
# Select a sample that concerns the departement "La Manche"
manche_gdf = gdf[gdf.admin2_code == "50"]
manche_gdf["geometry"]=manche_gdf.geometry.buffer(0.03)
manche_gdf.plot()
# plt.show()
# Build an adjacency matrix used to generate the graph on which the embedding is computed
N = len(manche_gdf)
adjacency_matrix = np.zeros((N,N))
geometries = manche_gdf.geometry.values
for i in tqdm(range(N)):
for j in range(i,N):
adjacency_matrix[i,j] = geometries[i].intersects(geometries[j])
adjacency_matrix[j,i] = adjacency_matrix[i,j]
plt.clf()
plt.imshow(adjacency_matrix);plt.colorbar()
# plt.show()
# Mapping id between matrix and geonameid
manche_gdf["code_matrix"]=np.arange(N)
geoname_id2idxmat = dict(manche_gdf["geonameid code_matrix".split()].values)
idxmat2geoname_id = {v:k for k,v in geoname_id2idxmat.items()}
# Add adjacent entity found in the Geodataframe
def get_adjacent_entity(x):
idxs = np.nonzero(adjacency_matrix[geoname_id2idxmat[x]])[0]
    return [idxmat2geoname_id[idx] for idx in idxs if not idxmat2geoname_id[idx] == x]  # exclude the entity itself
manche_gdf["adjacent_entity"]=manche_gdf.geonameid.apply(get_adjacent_entity)
# Code for getting the distance using the road network (not euclidean) PART 1
manche_gdf["geometry_centroid"]=manche_gdf.centroid
coords = dict(manche_gdf["geonameid geometry_centroid".split()].values)
# Code for getting the distance using the road network (not euclidean) PART 2
# Run ORSM SERVER
#https://hub.docker.com/r/osrm/osrm-backend/
#docker run -t -p 5000:5000 -v $(pwd):/data osrm/osrm-backend osrm-extract -p /opt/car.lua /data/road.pbf
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-partition /data/road.osrm
#docker run -t -v "${PWD}:/data" osrm/osrm-backend osrm-customize /data/road.osrm
#docker run -t -i -p 5000:5000 -v "${PWD}:/data" osrm/osrm-backend osrm-routed --algorithm mld /data/road.osrm
# Check Also : https://github.com/ustroetz/python-osrm#route
# Test: curl 'http://<yourserver>:5000/route/v1/driving/49.38,-1.37;49,-1.37?steps=true'
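# Hedged illustration (not part of the original pipeline): the same route query can be sent to the
# OSRM HTTP API directly, without the python-osrm wrapper. The helper name, host and port below are
# assumptions; OSRM expects coordinates in lon,lat order and returns distances in metres.
def osrm_route_distance(lon1, lat1, lon2, lat2, host="http://localhost:5000"):
    import requests  # local import so this optional sketch does not add a hard dependency
    url = "{0}/route/v1/driving/{1},{2};{3},{4}".format(host, lon1, lat1, lon2, lat2)
    response = requests.get(url, params={"overview": "false"}).json()
    return response["routes"][0]["distance"]  # raises KeyError/IndexError if no route is found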
def getTupCoords(id_):
return [coords[id_].x,coords[id_].y]
def getDistance(id_1,id_2):
try:
return osrm.simple_route(getTupCoords(id_1), getTupCoords(id_2), output="route", overview="full",steps=False,geometry="wkt")[0]["distance"]
except IndexError:
return -1
def signature(id_1, id_2):
    return "_".join([str(id_) for id_ in sorted([id_1, id_2])])
def getDistanceSDict(id_1,id_2,sqlite_dict):
hash_ = signature(id_1,id_2)
if not hash_ in sqlite_dict:
sqlite_dict[hash_]=getDistance(id_1,id_2)
return sqlite_dict[hash_]
from joblib import Parallel,delayed # for parallel job computation
def job(G, row, adjacent):
    new_edge = (row.geonameid, adjacent)
    if not G.has_edge(*new_edge):
        if IS_DISTANCE and DISTANCE == "itinerary":
            return (*new_edge, getDistanceSDict(new_edge[0], new_edge[1], distance_dict))
        elif IS_DISTANCE and DISTANCE == "euclidean":
            raise NotImplementedError()
        else:
            return (*new_edge, 1)  # unweighted edge; returning a bare 1 would break the edge unpacking below
# Using Route Distance
G = nx.Graph()
for ix,row in tqdm(manche_gdf["geonameid adjacent_entity".split()].iterrows(),total=len(manche_gdf)):
new_edges = Parallel(n_jobs=4,backend="threading")(delayed(job)(G,row,adjacent) for adjacent in row.adjacent_entity)
for edge in new_edges:
if edge:
G.add_edge(edge[0],edge[1],weight=edge[2])
# Data for graph projection
lon_dict= dict(manche_gdf["geonameid longitude".split()].values)
lat_dict= dict(manche_gdf["geonameid latitude".split()].values)
pos= {n:[lon_dict[n],lat_dict[n]]for n in G.nodes()}
nx.draw(G,pos=pos,node_size=1)
# plt.show()
for ed in list(G.edges()):
G[ed[0]][ed[1]]["weight"]+=1 # problem when G[ed[0]][ed[1]]["weight"]==0:
if IS_NOISE:
H = G.copy()
edges,weights = zip(*nx.get_edge_attributes(H,'weight').items())
sample = random.sample(list(H.nodes()),NUMBER_OF_NODE_DESPATIALIZED)
H.remove_nodes_from(sample)
pos= {n:[lon_dict[n],lat_dict[n]] for n in H.nodes()}
nx.draw(H,pos=pos,node_size=1)
# plt.show()
label_dict = dict(manche_gdf["geonameid name".split()].values)
embeddings = dict(pd.read_msgpack("data/embeddings/geonamesFRWithEmbeddings.msg")["geonameid embedding".split()].values)
ids,emb = zip(*embeddings.items())
id2geonameid = dict(enumerate(ids))
geonameid2id = {id_:ix for ix, id_ in enumerate(ids) }
emb_matrix = np.asarray(emb)
from sklearn.metrics.pairwise import cosine_similarity
sim_matrix = cosine_similarity(emb)
top_n = np.argsort(sim_matrix)[:,-3:]
for ix,n in enumerate(sample):
top_i = [id2geonameid[top] for top in top_n[geonameid2id[n]]]
weights_i = [sim_matrix[geonameid2id[n]][top]for top in top_n[geonameid2id[n]]]
for ij,top_ij in enumerate(top_i):
H.add_edge(n,top_ij,weight=weights_i[ij])
G = H.copy()
node2vec = Node2Vec(G, dimensions=VECTOR_SIZE, walk_length=WALK_LENGTH, num_walks=NUMBER_OF_WALK, workers=NUMBER_OF_CPU_USED,temp_folder="temp") # Use temp_folder for big graphs
model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_CPU_USED)
# Saving the embedding model
if not IS_NOISE:
model.save("manche_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
walk_l = WALK_LENGTH,
num_walk = NUMBER_OF_WALK,
window = WORD2VEC_WINDOW))#,noise = NUMBER_OF_NODE_DESPATIALIZED))
else:
model.save("manche_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
walk_l = WALK_LENGTH,
num_walk = NUMBER_OF_WALK,
window = WORD2VEC_WINDOW,noise = NUMBER_OF_NODE_DESPATIALIZED))
import pandas as pd
import matplotlib.pyplot as plt
import os
def read_geonames(file):
"""
Return a dataframe that contains Geonames data.
Parameters
----------
file : str
path of the Geonames Csv file
Returns
-------
pd.DataFrame
geonames data
"""
dtypes_dict = {
0: int, # geonameid
1: str, # name
2: str, # asciiname
3: str, # alternatenames
4: float, # latitude
5: float, # longitude
6: str, # feature class
7: str, # feature code
8: str, # country code
9: str, # cc2
10: str, # admin1 code
11: str, # admin2 code
12: str, # admin3 code
13: str, # admin4 code
14: int, # population
15: str, # elevation
16: int, # dem (digital elevation model)
17: str, # timezone
18: str # modification date yyyy-MM-dd
}
rename_cols = {
0:"geonameid", # geonameid
1:"name", # name
2:"asciiname", # asciiname
3:"alternatenames", # alternatenames
4:"latitude", # latitude
5:"longitude", # longitude
6:"feature_class", # feature class
7:"feature_class", # feature code
8:"country_code", # country code
9:"cc2", # cc2
10:"admin1_code", # admin1 code
11:"admin2_code", # admin2 code
12:"admin3_code", # admin3 code
13:"admin4_code", # admin4 code
14:"population", # population
15:"elevation", # elevation
16:"dem", # dem (digital elevation model)
17:"timezone", # timezone
18:"modification_date" # modification date yyyy-MM-dd
}
data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
data.rename(columns=rename_cols,inplace=True)
return data
def plot_accuracy_from_history(model_name, history_data, output_layer_name, output_filename, parameter_string, output_dirname="outputs", validation=True, show=False):
# Plot training & validation loss values
plt.gcf()
plt.gca()
plt.plot(history_data['{0}_accuracy'.format(output_layer_name)].values,label="Train Data")
if validation:
plt.plot(history_data['val_{0}_accuracy'.format(output_layer_name)].values,label = "Test Data")
plt.title('Layer {0} accuracy'.format(output_layer_name))
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.ylim((0,1.1)) #1.1 if accuracy = 1
plt.legend()
plt.savefig("outputs/{0}_{1}_{2}.png".format(model_name,parameter_string,output_layer_name,))
if show :
plt.show()
def save_embedding(model, tokenizer, layer_idx, fn):
    embedding_matrix = model.get_weights()[layer_idx]
    with open(os.path.join(fn), 'w') as f:
        for word, i in tokenizer.word_index.items():
            f.write(word)
            for value in embedding_matrix[i]:
                f.write(' ' + repr(value))
            f.write('\n')
from keras import Model
from keras.layers import Input, Dense, Bidirectional, LSTM, Embedding, GRU, GlobalAveragePooling1D, Dropout
# name,model_2=MPC_model(MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,num_words,type_encoder)
# model_2.fit(x=new_X,y=[Y_type,Y_lat,Y_lon],validation_split=0.33,epochs=EPOCHS,batch_size=BATCH_SIZE,verbose=1)
def getModel(model_func,max_sequence_length,embedding_dim,num_words,class_encoder,coordinate_encoder):
sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
embedding_layer = Embedding(num_words, embedding_dim, input_length=max_sequence_length, trainable=True)
x = embedding_layer(sequence_input)
name,x = model_func(x)
placetype_layer = Dense(len(class_encoder.classes_), activation='softmax',name="place_type")(x) # From the transformation, attempt to predict the place type
coordinates_layer = Dense(coordinate_encoder.oneDimensionOutputSize(), activation='softmax',name="coord")(x)
model = Model(sequence_input, [ placetype_layer , coordinates_layer] )
model.compile(loss=['categorical_crossentropy','categorical_crossentropy'], optimizer='adam',metrics=["accuracy"])
return name,model
def WEAverage_model(x):
name = "WordEmb_Average"
x = GlobalAveragePooling1D()(x)
return name,x
def MPC_WEAverage_model(x):
name = "MPC_WordEmb_Average"
x = GlobalAveragePooling1D()(x)
x = Dense(2000)(x)
x = Dense(2000)(x)
return name,x
def BI_LSTM_model(x):
name = "Bi-LSTM"
x = (Bidirectional(LSTM(64)))(x)
return name,x
def BI_GRU_model(x):
name = "Bi-GRU"
x = (Bidirectional(GRU(64)))(x)
return name, x
import pandas as pd
df= pd.read_csv("dbpediaPlaceClassification.csv")
import numpy as np
def loadGloveModel(gloveFile):
print("Loading Glove Model")
f = open(gloveFile,'r')
model = {}
for line in f:
splitLine = line.split()
word = splitLine[0]
embedding = np.array([float(val) for val in splitLine[1:]])
model[word] = embedding
print("Done.",len(model)," words loaded!")
return model
model = loadGloveModel("data/glove/glove.6B.100d.txt")
def getEmb(x,model):
emb = np.zeros(100)
for word in x.split():
word =word.lower()
if word in model:
emb+=model[word]
return emb
df["embeddings"] = df["Place"].apply(lambda x : getEmb(x,model))
df.to_msgpack("dbpediaPlaceEmbedding.msg")
import json
data = json.load(open("classname.json"))
df2 = pd.DataFrame(data.items(),columns="WID label".split())
df2["embeddings"] = df2["label"].apply(lambda x:getEmb(x,model))
df2.to_msgpack("classnameEmbedding.msg")
import os
import random
import numpy as np
from owlready2 import *
from vincenty import vincenty
from scipy.stats.stats import pearsonr
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, '.')
NUM_PAIRS = 100000
print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.')
embeddings_index = {}
listText = []
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f:
for line in f:
word, coefs = line.split(maxsplit=1)
coefs = np.fromstring(coefs, 'f', sep=' ')
embeddings_index[word] = coefs
listText.append(word)
print('Found %s word vectors.' % len(embeddings_index))
print('Collecting data from DBPedia ontology, downloaded from http://downloads.dbpedia.org/2014/dbpedia_2014.owl.bz2.')
onto = get_ontology("dbpedia_2014.owl")
onto.load()
def retrive_desc( concept , old_names=[] ):
desc = list( concept.descendants( include_self=False) )
names = list( [ re.sub(r'.+\.', '', repr(concept)) + "/" + re.sub(r'.+\.', '', repr(c)) for c in desc ] )
names = [x for x in names if x not in old_names]
desc = [ desc[x] for x in range(len(names)) if names[x] not in old_names ]
new_desc = list(desc)
for i in desc:
n1, d1 = retrive_desc(i, names + old_names)
for j in range(len(n1)):
new_desc.append( d1[j] )
names.append( re.sub(r'.+\.', '', repr(concept)) + "/" + n1[j] )
return names, new_desc
names, _ = retrive_desc(onto.Place)
names = [ n.lower() for n in set(names) if re.sub(r'.+/', '', n.lower()) in embeddings_index ] # check if the name of the place type exists in the embeddings matrix
print('Generating pairs of place names.')
name_pairs = []
similarity_pairs = []
distance_pairs = []
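# Example of the similarity measure used below (actual class names depend on the ontology version):
# "place/populatedplace/city" vs "place/populatedplace/town" share the path prefix
# ["place", "populatedplace"], giving a similarity of 2.0, while names from disjoint
# branches only share the root "place" and get 1.0.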
for num in range(NUM_PAIRS):
name1 = random.choice(names)
name2 = random.choice(names)
if name1 == name2: continue
name_pairs.append( (name1,name2) )
dist = 0.0
n1 = name1.split('/')
n2 = name2.split('/')
for i in range(min(len(n1),len(n2))):
if n1[i] == n2[i]: dist += 1.0
else: break
similarity_pairs.append(dist) # Similarity between the place types, given by the number of ancestors in common
distance_pairs.append( np.sqrt(np.sum((embeddings_index[re.sub(r'.+/', '', name1)] - embeddings_index[re.sub(r'.+/', '', name2)])**2)) ) # Euclidean distance between the embeddings
result = pearsonr( distance_pairs , similarity_pairs) # Compute Pearson correlation and associated p-value
print(result)
import os
import random
import numpy as np
from vincenty import vincenty
from scipy.stats.stats import pearsonr
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'data/glove')
NUM_PAIRS = 100000
print('Indexing word vectors downloaded from http://nlp.stanford.edu/data/glove.6B.zip.')
embeddings_index = {}
listText = []
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.new.txt')) as f:
for line in f:
word, coefs = line.split(maxsplit=1)
coefs = np.fromstring(coefs, 'f', sep=' ')
embeddings_index[word] = coefs
listText.append(word)
print('Found %s word vectors.' % len(embeddings_index))
print('Collecting data from geonames downloaded from http://download.geonames.org/export/dump/allCountries.zip.')
file = open("data/geonamesData/allCountries.txt", "r")
placenames = { }
for line in file:
line = line.split("\t")
name = line[1].lower()
if " " in name or not(name in embeddings_index): # check if the main name exists in the embeddings matrix
names = line[3].split(",")
for n in names:
n = n.strip().lower()
if not(" " in n) and (n in embeddings_index): # if not, check if any of the alternative names exists in the embeddings matrix
name = n
break
if " " in name or not(name in embeddings_index): continue
placenames.update( { name : (float(line[4]), float(line[5])) } )
from scipy.spatial.distance import cosine
from tqdm import tqdm
print('Generating pairs of place names.')
NUM_PAIRS = 1000
name_pairs = []
geo_distance_pairs = []
distance_pairs = []
for num in tqdm(range(NUM_PAIRS)):
name1 = random.choice(list(placenames.keys()))
name2 = random.choice(list(placenames.keys()))
if name1 == name2: continue
name_pairs.append( (name1,name2) )
    try:
        distance_pairs.append(cosine(embeddings_index[name1], embeddings_index[name2]))  # Cosine distance between the embeddings
        geo_distance_pairs.append(vincenty(placenames[name1], placenames[name2]))  # Geospatial distance between the place names, given by Vincenty's geodetic formulae
    except Exception:
        pass
geo_distance_pairs= np.array(geo_distance_pairs).astype(float)
distance_pairs = np.nan_to_num(distance_pairs,nan=np.nanmax(distance_pairs))
geo_distance_pairs = np.nan_to_num(geo_distance_pairs,nan=1)
result = pearsonr( geo_distance_pairs , distance_pairs) # Compute Pearson correlation and associated p-value
print(result)
import json
import argparse
import time
from SPARQLWrapper import SPARQLWrapper,JSON
from urllib.error import HTTPError
from tqdm import tqdm
parser = argparse.ArgumentParser()
parser.add_argument("available_class_filename",help="JSON file that contains an array of string. Each string is a Wikidata id (e.g. Q30)")
parser.add_argument("output_filename")
args = parser.parse_args()
ids= json.load(open(args.available_class_filename))
def get_label(id_wikidata):
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
select *
where {
wd:"""+id_wikidata+ """ rdfs:label ?label .
FILTER (langMatches( lang(?label), "EN" ) )
}
LIMIT 1""" )
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
time.sleep(0.1)
try:
return results["results"]["bindings"][0]["label"]["value"]
    except (IndexError, KeyError):  # no English label returned
return ""
t = 0
dict_results = {}
progress_bar = tqdm(total=len(ids))
while t<len(ids):
try:
dict_results[ids[t]]=get_label(ids[t])
except HTTPError as e:
time.sleep(1)
continue
progress_bar.update(1)
t+=1
progress_bar.close()
json.dump(dict_results,open(args.output_filename,'w'))
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
TOPN = 100
df = pd.read_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv",sep="\t",names="ID title url latitude longitude instance_of".split())[1:]
df = df.fillna("")
df["instance_of"] = df.instance_of.apply(lambda x: str(x).split("_"))
count = {}
for list_type in df.instance_of.values:
#print(list_type)
for type_ in list_type:
if not type_ in count: count[type_]=0
count[type_]+=1
# Boost the count of classes that are important but infrequent because of their high granularity
to_increase = [
"Q6256",#country
"Q5119",#capital
"Q27554677",#former capital
"Q10864048", # ADM. DIV. 1
"Q13220204", # ADM. DIV. 2
"Q13220204", # ADM. DIV. 3
"Q14757767", # ADM. DIV. 4
"Q82794" # geographic region
]
inf_ = np.max(list(count.values()))
for type_ in to_increase:
count[type_] = inf_+1
print("Dataframe contains",len(df),"entities")
count_df = pd.DataFrame.from_dict(count, orient="index").reset_index().sort_values(0, ascending=False)
class_filtered = set(count_df.head(TOPN)["index"].values)
#Q15640612 #5
#Q22927291 #6
df = df[df.instance_of.apply(lambda x: sum(True for i in x if i in class_filtered)>0)]
def getMostFrequentClass(x):
idx = np.argsort([count[i] for i in x])[-1]
return x[idx]
df["type"] = df.instance_of.apply(getMostFrequentClass)
df.to_csv("data/wikidataIDWikipediaURLofPlaceInEngliseWiki.tsv_filteredTop{0}class".format(TOPN))
import fasttext
print("Load Model Fasttext FR")
model = fasttext.load_model("./data/fasttext_FR/wiki.fr.bin")
print("Model Loaded !")
import pandas as pd
def read_geonames(file):
dtypes_dict = {
0: int, # geonameid
1: str, # name
2: str, # asciiname
3: str, # alternatenames
4: float, # latitude
5: float, # longitude
6: str, # feature class
7: str, # feature code
8: str, # country code
9: str, # cc2
10: str, # admin1 code
11: str, # admin2 code
12: str, # admin3 code
13: str, # admin4 code
14: int, # population
15: str, # elevation
16: int, # dem (digital elevation model)
17: str, # timezone
18: str # modification date yyyy-MM-dd
}
rename_cols = {
0:"geonameid", # geonameid
1:"name", # name
2:"asciiname", # asciiname
3:"alternatenames", # alternatenames
4:"latitude", # latitude
5:"longitude", # longitude
6:"feature_class", # feature class
7:"feature_class", # feature code
8:"country_code", # country code
9:"cc2", # cc2
10:"admin1_code", # admin1 code
11:"admin2_code", # admin2 code
12:"admin3_code", # admin3 code
13:"admin4_code", # admin4 code
14:"population", # population
15:"elevation", # elevation
16:"dem", # dem (digital elevation model)
17:"timezone", # timezone
18:"modification_date" # modification date yyyy-MM-dd
}
data = pd.read_csv(file, sep="\t", header = None, quoting=3,dtype=dtypes_dict,na_values='', keep_default_na=False,error_bad_lines=False)
data.rename(columns=rename_cols,inplace=True)
return data
data = read_geonames("./data/geonamesData/FR.txt")
data= data.fillna("")
data = data[data.admin2_code == "50"]
data["embedding"] = data["name"].apply(lambda x : model[x])
print(data)
data.to_msgpack("geonamesFRWithEmbeddings.msg")
import subprocess,os,json
import numpy as np
import time
import json
ids= json.load(open("classavailable.json"))
from SPARQLWrapper import SPARQLWrapper,JSON
def get_label(id_wikidata):
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX wd: <http://www.wikidata.org/entity/>
select *
where {
wd:"""+id_wikidata+ """ rdfs:label ?label .
FILTER (langMatches( lang(?label), "EN" ) )
}
LIMIT 1""" )
sparql.setReturnFormat(JSON)
results = sparql.query().convert()
time.sleep(0.1)
try:
return results["results"]["bindings"][0]["label"]["value"]
    except (IndexError, KeyError):  # no English label returned
return ""
from urllib.error import HTTPError
from tqdm import tqdm
t = 0
dict_results = {}
pbar = tqdm(total=len(ids))
while t<len(ids):
try:
dict_results[ids[t]]=get_label(ids[t])
except HTTPError as e:
time.sleep(1)
continue
pbar.update(1)
t+=1
pbar.close()
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 100 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 200 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 300 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 400 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 500 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 600 --walk-length 30 --num-walks 200 --word2vec-window-size 30
python3 geonames_embedding.py data/geonamesData/FR.txt -d --noise --noise-size 700 --walk-length 30 --num-walks 200 --word2vec-window-size 30