Commit 98d75bb6 authored by Jacques Fize

Add git lfs + change some names + add images to README

parent 637949fa
*.png filter=lfs diff=lfs merge=lfs -text
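This is the standard Git LFS filter rule for PNG images; it is the entry that the `git lfs track` command writes, e.g.:

git lfs track "*.png"

which appends exactly this line to .gitattributes so PNGs are stored as LFS pointers.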
......@@ -143,4 +143,7 @@ WikipediaExtract/*
test_comb.sh
.vscode/*
-notes.md
\ No newline at end of file
+notes.md
+.idea/*
+.vscode/*
\ No newline at end of file
import subprocess
import logging
import argparse

import numpy as np
from tqdm import tqdm

logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("corpus_filename")
parser.add_argument("sampling_size", type=int)
parser.add_argument("output_filename")
args = parser.parse_args()

CORPUS_FILENAME = args.corpus_filename
SAMPLE_SIZE = args.sampling_size

# Compute the size of the input corpus
logging.info("Computing the corpus size...")
wc_l = subprocess.check_output(["wc", "-l", CORPUS_FILENAME])
NUMBER_OF_INPUT = int(wc_l.decode().split()[0])  # wc returns bytes: "<count> <filename>"
logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT))

# Sampling: replace=False guarantees SAMPLE_SIZE *distinct* line indices,
# and a set makes the membership test below O(1) instead of O(n)
logging.info("Sampling...")
sample = set(np.random.choice(np.arange(NUMBER_OF_INPUT), SAMPLE_SIZE, replace=False))

# Write the selected lines to the output file
logging.info("Writing the output...")
with open(args.output_filename, 'w') as output:
    for ix, line in tqdm(enumerate(open(CORPUS_FILENAME)), total=NUMBER_OF_INPUT):
        if ix in sample:
            output.write(line)
logging.info("Done!")
......@@ -3,18 +3,22 @@
- Python 3.6+
- OS independent (all dependencies work on Windows!)

It is strongly advised to use Anaconda in a Windows environment!

## Install dependencies

pip3 install -r requirements.txt

For Anaconda users

while read requirement; do conda install --yes $requirement; done < requirements.txt
# Running the different approaches

## Embedding using places' Wikipedia pages

Three scripts need to be used, in the order shown below:

* 1_extractDataFromWikidata.py
* 2_extractLearningDataset.py
* 4_embeddings_lat_lon_type.py
![first_approach](documentation/imgs/first_approach.png)
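As a sketch, a run chains them in numeric order; the arguments below are placeholders, not the scripts' actual interfaces:

python3 1_extractDataFromWikidata.py <wikidata_dump> <extracted_places>
python3 2_extractLearningDataset.py <wikipedia_dump> <extracted_places> <learning_dataset>
python3 4_embeddings_lat_lon_type.py <learning_dataset>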
### Step 1: Parse Wikipedia data!
......@@ -62,6 +66,8 @@ The different outputs (one for each neural network architecture) are put in the `
## Geonames place embedding
![second_approach](documentation/imgs/second_approach.png)
First, download the Geonames dump here: https://download.geonames.org/export/dump/

*N.B.* We advise you to take only the data from one country! (The adjacency graph needs a lot of RAM.)
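A back-of-the-envelope sketch of why (the entry count below is a made-up example, not a Geonames figure):

# Memory for a dense n x n adjacency matrix with float64 weights (8 bytes per cell)
n = 500_000                   # hypothetical number of entries for a large country
print(n * n * 8 / 1e9, "GB")  # -> 2000.0 GB, hence the one-country advice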
......@@ -87,6 +93,7 @@ Gensim word2vec format is saved in the execution directory.
## Embedding : train using concatenation of close places
![second_approach](documentation/imgs/third_approach.png)
### Prepare required data
......
import time


class Chronometer():
    def __init__(self):
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError("A running task exists with the name {0}!".format(task_name))
        self.__task_begin_timestamp[task_name] = time.time()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exists with the id `task_name`
        """
        if task_name not in self.__task_begin_timestamp:
            raise ValueError("The {0} task does not exist!".format(task_name))
        duration = time.time() - self.__task_begin_timestamp[task_name]
        del self.__task_begin_timestamp[task_name]
        return duration


if __name__ == "__main__":
    chrono = Chronometer()
    chrono.start("test")
    chrono.start("test2")
    time.sleep(3)
    print(chrono.stop("test"))   # ~3 s
    time.sleep(3)
    print(chrono.stop("test2"))  # ~6 s
\ No newline at end of file
......@@ -62,7 +62,7 @@ def get_new_ids(cooc_data,id_first_value):
# Logging
import logging
-from chrono import Chronometer
+from helpers import Chronometer
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
......
documentation/imgs/first_approach.png (LFS pointer, 131 B)
documentation/imgs/second_approach.png (LFS pointer, 131 B)
documentation/imgs/third_approach.png (LFS pointer, 130 B)
......@@ -39,7 +39,7 @@ from utils import CoordinatesEncoder,TokenizerCustom,_split
# Logging
import logging
-from chrono import Chronometer
+from helpers import Chronometer
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
......
# Evaluation process
import glob
import re
import random

import gensim
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt  # needed for the plt.show() calls below
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from shapely.geometry import Point
from tqdm import tqdm

from helpers import *

NPAIR = 100000

fns = glob.glob("data/embeddings/*.bin")

def get_data(fn):
    """Extract the hyperparameters encoded as integers in an embedding filename."""
    data = [int(x) for x in re.findall(r"\d+", fn)]
    if len(data) == 5:  # a fifth integer encodes the noise level
        return {"embedding_size": data[0],
                "walk_length": data[1],
                "number_of_walks": data[2],
                "word2vec_window_size": data[3],
                "filepath": fn,
                "noise": data[4]}
    return {"embedding_size": data[0],
            "walk_length": data[1],
            "number_of_walks": data[2],
            "word2vec_window_size": data[3],
            "filepath": fn}

df = read_geonames("./data/geonamesData/FR.txt")
df["geometry"] = df["latitude longitude".split()].apply(lambda x: Point(x.longitude, x.latitude), axis=1)

# Create a GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)

# Select a sample restricted to the departement "La Manche"
manche_gdf = gdf[gdf.admin2_code == "50"].copy()

df = pd.DataFrame([get_data(fn) for fn in fns])

def get_pearsons(model):
    manche_gdf.loc[:, "geometry_centroid"] = manche_gdf.centroid
    coords = dict(manche_gdf.loc[:, "geonameid geometry_centroid".split()].values)
    places = list(coords.keys())
    geodesic_d = []
    embeddings_d = []
    for i in tqdm(range(NPAIR), disable=True):
        placeA = random.choice(places)
        placeB = random.choice(places)
        # N.B. shapely's distance() is Euclidean in degrees, not a true geodesic distance
        geodesic_d.append(coords[placeA].distance(coords[placeB]))
        embeddings_d.append(cosine(model.wv[str(placeA)], model.wv[str(placeB)]))
    return pearsonr(geodesic_d, embeddings_d)  # Pearson correlation and associated p-value

df["pearson"] = df.filepath.apply(lambda x: get_pearsons(gensim.models.KeyedVectors.load(x))[0])
df.fillna(0, inplace=True)

df.plot.scatter(x="walk_length", y="pearson", c="noise", cmap='inferno')
plt.show()
df.plot.scatter(x="number_of_walks", y="pearson", c="noise", cmap='inferno')
plt.show()
df.plot.scatter(x="word2vec_window_size", y="pearson", c="noise", cmap='inferno')
plt.show()
\ No newline at end of file
File moved
File moved
......@@ -88,3 +88,60 @@ def save_embedding(model,tokenizer,layer_idx,fn):
f.write('\n')
import time


class Chronometer():
    def __init__(self):
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError("A running task exists with the name {0}!".format(task_name))
        self.__task_begin_timestamp[task_name] = time.time()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exists with the id `task_name`
        """
        if task_name not in self.__task_begin_timestamp:
            raise ValueError("The {0} task does not exist!".format(task_name))
        duration = time.time() - self.__task_begin_timestamp[task_name]
        del self.__task_begin_timestamp[task_name]
        return duration


if __name__ == "__main__":
    chrono = Chronometer()
    chrono.start("test")
    chrono.start("test2")
    time.sleep(3)
    print(chrono.stop("test"))   # ~3 s
    time.sleep(3)
    print(chrono.stop("test2"))  # ~6 s
\ No newline at end of file
-# Basic import
-import math
-import numpy as np
-from nltk.tokenize import word_tokenize
-import textwrap
-from ngram import NGram
-import argparse
-import os
-import json
-from tqdm import tqdm
+# Data structure
+import numpy as np
+import geopandas as gpd
+from shapely.geometry import Point, box
+# NLP
+from nltk.tokenize import word_tokenize
+import textwrap
+from ngram import NGram
+# Machine learning
+from keras.layers import Embedding
+from gensim.models import Word2Vec
+# Visualisation and parallelisation
+from tqdm import tqdm
+from joblib import Parallel, delayed
class TokenizerCustom():
......@@ -94,7 +97,8 @@ class NgramIndex():
        Returns
        -------
        list of int
            list of ngram index
        """
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
......