Commit 98d75bb6 authored by Jacques Fize

Add git lfs + change some names + add images to README

parent 637949fa
*.png filter=lfs diff=lfs merge=lfs -text
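This is the standard Git LFS filter rule for PNG images; it is the entry that the `git lfs track` command writes, e.g.:

git lfs track "*.png"

which appends exactly this line to .gitattributes so PNGs are stored as LFS pointers.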
......@@ -143,4 +143,7 @@ WikipediaExtract/*
test_comb.sh
.vscode/*
-notes.md
\ No newline at end of file
+notes.md
+.idea/*
+.vscode/*
\ No newline at end of file
import subprocess
import logging
import argparse

import numpy as np
from tqdm import tqdm

logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)

parser = argparse.ArgumentParser()
parser.add_argument("corpus_filename")
parser.add_argument("sampling_size", type=int)
parser.add_argument("output_filename")
args = parser.parse_args()

CORPUS_FILENAME = args.corpus_filename
SAMPLE_SIZE = args.sampling_size

# Compute the size of the input corpus
logging.info("Computing the corpus size...")
wc_l = subprocess.check_output(["wc", "-l", CORPUS_FILENAME])
NUMBER_OF_INPUT = int(wc_l.decode().split()[0])  # wc returns bytes: "<count> <filename>"
logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT))

# Sampling: replace=False guarantees SAMPLE_SIZE *distinct* line indices,
# and a set makes the membership test below O(1) instead of O(n)
logging.info("Sampling...")
sample = set(np.random.choice(np.arange(NUMBER_OF_INPUT), SAMPLE_SIZE, replace=False))

# Write the selected lines to the output file
logging.info("Writing the output...")
with open(args.output_filename, 'w') as output:
    for ix, line in tqdm(enumerate(open(CORPUS_FILENAME)), total=NUMBER_OF_INPUT):
        if ix in sample:
            output.write(line)
logging.info("Done!")
......@@ -3,18 +3,22 @@
- Python 3.6+
- OS independent (all dependencies work on Windows!)

It is strongly advised to use Anaconda in a Windows environment!

## Install dependencies

pip3 install -r requirements.txt

For Anaconda users

while read requirement; do conda install --yes $requirement; done < requirements.txt
# Running the different approaches

## Embedding using places' Wikipedia pages

Three scripts need to be used, in the order shown below:

* 1_extractDataFromWikidata.py
* 2_extractLearningDataset.py
* 4_embeddings_lat_lon_type.py
![first_approach](documentation/imgs/first_approach.png)
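As a sketch, a run chains them in numeric order; the arguments below are placeholders, not the scripts' actual interfaces:

python3 1_extractDataFromWikidata.py <wikidata_dump> <extracted_places>
python3 2_extractLearningDataset.py <wikipedia_dump> <extracted_places> <learning_dataset>
python3 4_embeddings_lat_lon_type.py <learning_dataset>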
### Step 1: Parse Wikipedia data!
......@@ -62,6 +66,8 @@ The different outputs (one for each neural network architecture) are put in the `
## Geonames place embedding
![second_approach](documentation/imgs/second_approach.png)
First, download the Geonames dump here: https://download.geonames.org/export/dump/

*N.B.* We advise you to take only the data from one country! (The adjacency graph needs a lot of RAM.)
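A back-of-the-envelope sketch of why (the entry count below is a made-up example, not a Geonames figure):

# Memory for a dense n x n adjacency matrix with float64 weights (8 bytes per cell)
n = 500_000                   # hypothetical number of entries for a large country
print(n * n * 8 / 1e9, "GB")  # -> 2000.0 GB, hence the one-country advice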
......@@ -87,6 +93,7 @@ Gensim word2vec format is saved in the execution directory.
## Embedding : train using concatenation of close places
![second_approach](documentation/imgs/third_approach.png)
### Prepare required data
......
import time


class Chronometer():
    def __init__(self):
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError("A running task exists with the name {0}!".format(task_name))
        self.__task_begin_timestamp[task_name] = time.time()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exists with the id `task_name`
        """
        if task_name not in self.__task_begin_timestamp:
            raise ValueError("The {0} task does not exist!".format(task_name))
        duration = time.time() - self.__task_begin_timestamp[task_name]
        del self.__task_begin_timestamp[task_name]
        return duration


if __name__ == "__main__":
    chrono = Chronometer()
    chrono.start("test")
    chrono.start("test2")
    time.sleep(3)
    print(chrono.stop("test"))   # ~3 s
    time.sleep(3)
    print(chrono.stop("test2"))  # ~6 s
\ No newline at end of file
......@@ -62,7 +62,7 @@ def get_new_ids(cooc_data,id_first_value):
# Logging
import logging
-from chrono import Chronometer
+from helpers import Chronometer
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
......
documentation/imgs/first_approach.png (LFS pointer, 131 B)
documentation/imgs/second_approach.png (LFS pointer, 131 B)
documentation/imgs/third_approach.png (LFS pointer, 130 B)
......@@ -39,7 +39,7 @@ from utils import CoordinatesEncoder,TokenizerCustom,_split
# Logging
import logging
-from chrono import Chronometer
+from helpers import Chronometer
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
......
# Evaluation process
import glob
import re
import random

import gensim
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt  # needed for the plt.show() calls below
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from shapely.geometry import Point
from tqdm import tqdm

from helpers import *

NPAIR = 100000

fns = glob.glob("data/embeddings/*.bin")

def get_data(fn):
    """Extract the hyperparameters encoded as integers in an embedding filename."""
    data = [int(x) for x in re.findall(r"\d+", fn)]
    if len(data) == 5:  # a fifth integer encodes the noise level
        return {"embedding_size": data[0],
                "walk_length": data[1],
                "number_of_walks": data[2],
                "word2vec_window_size": data[3],
                "filepath": fn,
                "noise": data[4]}
    return {"embedding_size": data[0],
            "walk_length": data[1],
            "number_of_walks": data[2],
            "word2vec_window_size": data[3],
            "filepath": fn}

df = read_geonames("./data/geonamesData/FR.txt")
df["geometry"] = df["latitude longitude".split()].apply(lambda x: Point(x.longitude, x.latitude), axis=1)

# Create a GeoDataFrame for faster spatial comparison operations
gdf = gpd.GeoDataFrame(df)

# Select a sample restricted to the departement "La Manche"
manche_gdf = gdf[gdf.admin2_code == "50"].copy()

df = pd.DataFrame([get_data(fn) for fn in fns])

def get_pearsons(model):
    manche_gdf.loc[:, "geometry_centroid"] = manche_gdf.centroid
    coords = dict(manche_gdf.loc[:, "geonameid geometry_centroid".split()].values)
    places = list(coords.keys())
    geodesic_d = []
    embeddings_d = []
    for i in tqdm(range(NPAIR), disable=True):
        placeA = random.choice(places)
        placeB = random.choice(places)
        # N.B. shapely's distance() is Euclidean in degrees, not a true geodesic distance
        geodesic_d.append(coords[placeA].distance(coords[placeB]))
        embeddings_d.append(cosine(model.wv[str(placeA)], model.wv[str(placeB)]))
    return pearsonr(geodesic_d, embeddings_d)  # Pearson correlation and associated p-value

df["pearson"] = df.filepath.apply(lambda x: get_pearsons(gensim.models.KeyedVectors.load(x))[0])
df.fillna(0, inplace=True)

df.plot.scatter(x="walk_length", y="pearson", c="noise", cmap='inferno')
plt.show()
df.plot.scatter(x="number_of_walks", y="pearson", c="noise", cmap='inferno')
plt.show()
df.plot.scatter(x="word2vec_window_size", y="pearson", c="noise", cmap='inferno')
plt.show()
\ No newline at end of file
File moved
File moved
......@@ -88,3 +88,60 @@ def save_embedding(model,tokenizer,layer_idx,fn):
f.write('\n')
import time


class Chronometer():
    def __init__(self):
        self.__task_begin_timestamp = {}

    def start(self, task_name):
        """
        Start a new task chronometer

        Parameters
        ----------
        task_name : str
            task id

        Raises
        ------
        ValueError
            if a running task already exists with that name
        """
        if task_name in self.__task_begin_timestamp:
            raise ValueError("A running task exists with the name {0}!".format(task_name))
        self.__task_begin_timestamp[task_name] = time.time()

    def stop(self, task_name):
        """
        Stop and return the duration of the task

        Parameters
        ----------
        task_name : str
            task id

        Returns
        -------
        float
            duration of the task in seconds

        Raises
        ------
        ValueError
            if no task exists with the id `task_name`
        """
        if task_name not in self.__task_begin_timestamp:
            raise ValueError("The {0} task does not exist!".format(task_name))
        duration = time.time() - self.__task_begin_timestamp[task_name]
        del self.__task_begin_timestamp[task_name]
        return duration


if __name__ == "__main__":
    chrono = Chronometer()
    chrono.start("test")
    chrono.start("test2")
    time.sleep(3)
    print(chrono.stop("test"))   # ~3 s
    time.sleep(3)
    print(chrono.stop("test2"))  # ~6 s
\ No newline at end of file
-# Basic import
-import math
-import numpy as np
-from nltk.tokenize import word_tokenize
-import textwrap
-from ngram import NGram
-import argparse
-import os
-import json
-from tqdm import tqdm
+# Data structure
+import numpy as np
+import geopandas as gpd
+from shapely.geometry import Point, box
+# NLP
+from nltk.tokenize import word_tokenize
+import textwrap
+from ngram import NGram
+# Machine learning
+from keras.layers import Embedding
+from gensim.models import Word2Vec
+# Visualisation and parallelisation
+from tqdm import tqdm
+from joblib import Parallel, delayed
class TokenizerCustom():
......@@ -94,7 +97,8 @@ class NgramIndex():
        Returns
        -------
        list of int
            list of ngram index
        """
        ngrams = word.lower().replace(" ", "$")
        ngrams = list(self.ngram_gen.split(ngrams))
......