diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..24a8e87939aa53cdd833f6be7610cb4972e063ad
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.png filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
index d004fc118703ea29ec3f728c3f4d1220ccf8d795..589ecbd98650129ffab8c9cb9447c47966228249 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,4 +143,6 @@ WikipediaExtract/*
 
 test_comb.sh
 .vscode/*
-notes.md
\ No newline at end of file
+notes.md
+
+.idea/*
\ No newline at end of file
diff --git a/3_samplingLearningDataset.py b/3_samplingLearningDataset.py
deleted file mode 100644
index 2d512ce71cf324213f2e6e67ce74f495b28d35e0..0000000000000000000000000000000000000000
--- a/3_samplingLearningDataset.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import subprocess,os,json
-import numpy as np
-import time
-
-import logging
-logging.basicConfig(
-    format='[%(asctime)s][%(levelname)s] %(message)s ',
-    datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
-    )
-
-from tqdm import tqdm
-
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("corpus_filename")
-parser.add_argument("sampling_size",type=int)
-parser.add_argument("output_filename")
-
-args= parser.parse_args()
-
-CORPUS_FILENAME = args.corpus_filename
-SAMPLE_SIZE = args.sampling_size
-
-# Compute the size of input corpus
-logging.info("Computing the corpus size...")
-wc_l = subprocess.check_output(["wc","-l", CORPUS_FILENAME ])
-NUMBER_OF_INPUT=int(wc_l.strip().split()[-2])
-logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT))
-
-# Sampling
-logging.info("Sampling...")
-arr_ = np.arange(NUMBER_OF_INPUT)
-sample = np.random.choice(arr_,SAMPLE_SIZE)
-
-# Prepare Output file
-output = open(args.output_filename,'w')
-
-# Writing in the output file
-logging.info("Writing the output...")
-for ix,line in tqdm(enumerate(open(CORPUS_FILENAME)),total=NUMBER_OF_INPUT):
-    if ix in sample:
-        output.write(line)
-
-logging.info("Done !")
-
-
diff --git a/README.md b/README.md
index 84d51789db8d8b0f64c18d74dd52ecb20a6e27fb..c9fc710b441da4c643d446a6f900945164e0e60e 100644
--- a/README.md
+++ b/README.md
@@ -3,18 +3,22 @@
 - Python3.6+
 - OS-free (all dependencies work on Windows!)
 
+It is strongly advised to use Anaconda in a Windows environment!
+
 ## Install dependencies
 
     pip3 install -r requirements.txt
 
+For Anaconda users:
+
+    while read requirement; do conda install --yes $requirement; done < requirements.txt
+
 # Different approaches execution
 
 ## Embedding using places Wikipedia pages
 
-Three scripts need to be used :
- * 1_extractDataFromWikidata.py
- * 2_extractLearningDataset.py
- * 4_embeddings_lat_lon_type.py
+
+
 ### Step 1: Parse Wikipedia data!
@@ -62,6 +66,8 @@ The different outputs (one for each neural network architecture) are put in the `
 ## Geonames place embedding
 
+
+
 First, download the Geonames dump here: https://download.geonames.org/export/dump/
 
 *N.B.* We advise you to take only the data from one country! (The adjacency graph needs a lot of RAM.)
@@ -87,6 +93,7 @@ Gensim word2vec format is saved in the execution directory.
 
 ## Embedding : train using concatenation of close places
 
+
 ### Prepare required data
 
diff --git a/chrono.py b/chrono.py
deleted file mode 100644
index 8e0adfd61f63449bce232df919402448735db4b6..0000000000000000000000000000000000000000
--- a/chrono.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import time
-
-class Chronometer():
-    def __init__(self):
-        self.__task_begin_timestamp = {}
-
-    def start(self,task_name):
-        """
-        Start a new task chronometer
-
-        Parameters
-        ----------
-        task_name : str
-            task id
-
-        Raises
-        ------
-        ValueError
-            if a running task already exists with that name
-        """
-        if task_name in self.__task_begin_timestamp:
-            raise ValueError("A running task exists with the name {0}!".format(task_name))
-        self.__task_begin_timestamp[task_name] = time.time()
-
-    def stop(self,task_name):
-        """
-        Stop and return the duration of the task
-
-        Parameters
-        ----------
-        task_name : str
-            task id
-
-        Returns
-        -------
-        float
-            duration of the task in seconds
-
-        Raises
-        ------
-        ValueError
-            if no task exist with the id `task_name`
-        """
-        if not task_name in self.__task_begin_timestamp:
-            raise ValueError("The {0} task does not exist!".format(task_name))
-        duration = time.time() - self.__task_begin_timestamp[task_name]
-        del self.__task_begin_timestamp[task_name]
-        return duration
-
-if __name__ == "__main__":
-    chrono = Chronometer()
-    chrono.start("test")
-    chrono.start("test2")
-    time.sleep(3)
-    print(chrono.stop("test"))
-    time.sleep(3)
-    print(chrono.stop("test2"))
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 7913784f82da7a7fa04aef23e23ecf864c619464..94ee452266826d6f2815bf74ddb257bc5e8ae4c4 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -62,7 +62,7 @@ def get_new_ids(cooc_data,id_first_value):
 # Logging
 import logging
-from chrono import Chronometer
+from helpers import Chronometer
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ',
     datefmt='%m/%d/%Y %I:%M:%S %p',
diff --git a/documentation/imgs/first_approach.png b/documentation/imgs/first_approach.png
new file mode 100644
index 0000000000000000000000000000000000000000..297c1a5025d993acfae6e501d88acac24dfc7e59
--- /dev/null
+++ b/documentation/imgs/first_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a243605f4d58dee8bad4a18845ab78ca2319049e633b35e6a89540add684be8
+size 298011
diff --git a/documentation/imgs/second_approach.png b/documentation/imgs/second_approach.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5e693fbaf11113de2673b366d4bf603047239c2
--- /dev/null
+++ b/documentation/imgs/second_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bab13df1e420e97a08977aa38382076491a8294d85b7daa0a10d69a36a52fc0
+size 457738
diff --git a/documentation/imgs/third_approach.png b/documentation/imgs/third_approach.png
new file mode 100644
index 0000000000000000000000000000000000000000..d96596ad9ee35b8ada81b0a68535e593ed8e1a0e
--- /dev/null
+++ b/documentation/imgs/third_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad7cbed2e748b814c38eb070d29a23f7417469169a56aeb0e660a743e00430fd
+size 31104
diff --git a/4_embeddings_lat_lon_type.py b/embeddings_lat_lon_type.py
similarity index 99%
rename from 4_embeddings_lat_lon_type.py
rename to embeddings_lat_lon_type.py
index e674e508052df416b5378c3b79a6d6cd0de3451b..d5778f6aefcfebe7e1d970e4520feb95cee8a8a8 100644
--- a/4_embeddings_lat_lon_type.py
+++ b/embeddings_lat_lon_type.py
@@ -39,7 +39,7 @@ from utils import CoordinatesEncoder,TokenizerCustom,_split
 # Logging
 import logging
-from chrono import Chronometer
+from helpers import Chronometer
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ',
     datefmt='%m/%d/%Y %I:%M:%S %p',
diff --git a/evalgeonamesembeddings.py b/evalgeonamesembeddings.py
deleted file mode 100644
index c7d346dd4a58940a1c0beb1e5f3a5782489b52bd..0000000000000000000000000000000000000000
--- a/evalgeonamesembeddings.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Evaluation process
-import gensim
-import glob
-import re
-import gensim
-import random
-from helpers import *
-from scipy.spatial.distance import cosine
-from shapely.geometry import Point
-from scipy.stats.stats import pearsonr
-
-import pandas as pd
-import geopandas as gpd
-
-from tqdm import tqdm
-
-NPAIR = 100000
-fns = glob.glob("data/embeddings/*.bin")
-
-def get_data(fn):
-    data = [int(x) for x in re.findall("\d+",fn)]
-    if not len(data) == 4:
-        return {"embedding_size":data[0],
-        "walk_length":data[1],
-        "number_of_walks":data[2],
-        "word2vec_window_size":data[3],
-        "filepath":fn,
-        "noise":data[4]
-        }
-        #raise Exception("filename should have 4 integers")
-    return {
-        "embedding_size":data[0],
-        "walk_length":data[1],
-        "number_of_walks":data[2],
-        "word2vec_window_size":data[3],
-        "filepath":fn
-    }
-
-df = read_geonames("./data/geonamesData/FR.txt")
-df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1)
-
-# Create GeoDataFrame for faster spatial comparison operations
-gdf = gpd.GeoDataFrame(df)
-
-# Select a sample that concerns the departement "La Manche"
-manche_gdf = gdf[gdf.admin2_code == "50"].copy()
-
-df =pd.DataFrame([get_data(fn) for fn in fns])
-
-def get_pearsons(model):
-    manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid
-    coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values)
-    places = list(coords.keys())
-    geodesic_d = []
-    embeddings_d = []
-    for i in tqdm(range(NPAIR),disable=True):
-        placeA=random.choice(places)
-        placeB=random.choice(places)
-        geodesic_d.append(coords[placeA].distance(coords[placeB]))
-        embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)]))
-    return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value
-
-df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0])
-df.fillna(0,inplace=True)
-df.plot.scatter(x="walk_length", y="pearson",c="noise",cmap='inferno')
-plt.show()
-df.plot.scatter(x="number_of_walks", y="pearson",c="noise",cmap='inferno')
-plt.show()
-df.plot.scatter(x="word2vec_window_size", y="pearson",c="noise",cmap='inferno')
-plt.show()
\ No newline at end of file
diff --git a/1_extractDataFromWikidata.py b/extractDataFromWikidata.py
similarity index 100%
rename from 1_extractDataFromWikidata.py
rename to extractDataFromWikidata.py
diff --git a/2_extractLearningDataset.py b/extractLearningDataset.py
similarity index 100%
rename from 2_extractLearningDataset.py
rename to extractLearningDataset.py
diff --git a/helpers.py b/helpers.py
index c1e1f34178bcd39c5f49f90e4d1fd2d9f3cbf803..554e62033471e11f2587d1f6dec879dfd75cc2e1 100644
--- a/helpers.py
+++ b/helpers.py
@@ -88,3 +88,60 @@ def save_embedding(model,tokenizer,layer_idx,fn):
         f.write('\n')
 
 
+import time
+
+class Chronometer():
+    def __init__(self):
+        self.__task_begin_timestamp = {}
+
+    def start(self,task_name):
+        """
+        Start a new task chronometer
+
+        Parameters
+        ----------
+        task_name : str
+            task id
+
+        Raises
+        ------
+        ValueError
+            if a running task already exists with that name
+        """
+        if task_name in self.__task_begin_timestamp:
+            raise ValueError("A running task exists with the name {0}!".format(task_name))
+        self.__task_begin_timestamp[task_name] = time.time()
+
+    def stop(self,task_name):
+        """
+        Stop and return the duration of the task
+
+        Parameters
+        ----------
+        task_name : str
+            task id
+
+        Returns
+        -------
+        float
+            duration of the task in seconds
+
+        Raises
+        ------
+        ValueError
+            if no task exists with the id `task_name`
+        """
+        if not task_name in self.__task_begin_timestamp:
+            raise ValueError("The {0} task does not exist!".format(task_name))
+        duration = time.time() - self.__task_begin_timestamp[task_name]
+        del self.__task_begin_timestamp[task_name]
+        return duration
+
+if __name__ == "__main__":
+    chrono = Chronometer()
+    chrono.start("test")
+    chrono.start("test2")
+    time.sleep(3)
+    print(chrono.stop("test"))
+    time.sleep(3)
+    print(chrono.stop("test2"))
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 37eb4a7372b9823f1c91d42c4f837de56bebc026..6e052dbf5cc7b3194d69043c4e40a88596c459be 100644
--- a/utils.py
+++ b/utils.py
@@ -1,24 +1,27 @@
+# Basic import
 import math
-import numpy as np
-
-from nltk.tokenize import word_tokenize
-
-import textwrap
-from ngram import NGram
-
-
 import argparse
 import os
 import json
-from tqdm import tqdm
+# Data Structure
+import numpy as np
 import geopandas as gpd
+from shapely.geometry import Point,box
+
+# NLP
+from nltk.tokenize import word_tokenize
+import textwrap
+from ngram import NGram
 
+# Machine learning
 from keras.layers import Embedding
 from gensim.models import Word2Vec
 
+# Visualisation and parallelisation
+from tqdm import tqdm
 from joblib import Parallel,delayed
-from shapely.geometry import Point,box
+
 
 
 class TokenizerCustom():
@@ -94,7 +97,7 @@ class NgramIndex():
         Returns
         -------
         list of int
-            list of ngram index
+            list of ngram index
         """
         ngrams = word.lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
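
Usage note: with chrono.py folded into helpers.py, every caller now imports the timer as `from helpers import Chronometer`, as the rewritten import hunks above show. A minimal usage sketch, mirroring the demo block that ships at the bottom of helpers.py:

    # Time two overlapping tasks with the relocated Chronometer.
    import time
    from helpers import Chronometer

    chrono = Chronometer()
    chrono.start("load")          # records a start timestamp under the name "load"
    chrono.start("train")         # independent task; a name must be unique while running
    time.sleep(1)
    print(chrono.stop("load"))    # returns the elapsed seconds and forgets the task
    time.sleep(1)
    print(chrono.stop("train"))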
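
One caveat about the deleted 3_samplingLearningDataset.py is worth recording: `np.random.choice(arr_,SAMPLE_SIZE)` samples with replacement, so duplicate indices silently shrink the effective sample, and `ix in sample` scans a NumPy array once per corpus line. If the script is ever restored, a sketch along these lines avoids both problems (`corpus.txt` and `sample.txt` are placeholder names):

    # Sample corpus lines without replacement, with O(1) membership tests.
    import numpy as np

    NUMBER_OF_INPUT = 1000  # corpus line count (the old script read it from `wc -l`)
    SAMPLE_SIZE = 100
    sample = set(np.random.choice(NUMBER_OF_INPUT, SAMPLE_SIZE, replace=False))

    with open("corpus.txt") as src, open("sample.txt", "w") as dst:
        for ix, line in enumerate(src):
            if ix in sample:
                dst.write(line)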