From 98d75bb61fd99f4f7ef197dada4b38c79b0d2424 Mon Sep 17 00:00:00 2001
From: jfize <jacques.fize@insa-lyon.fr>
Date: Mon, 3 Feb 2020 17:18:39 +0100
Subject: [PATCH] Add git lfs + change some names + add images to README

---
 .gitattributes                                |  1 +
 .gitignore                                    |  4 +-
 3_samplingLearningDataset.py                  | 48 -------------
 README.md                                     | 15 ++--
 chrono.py                                     | 57 ---------------
 combination_embeddings.py                     |  2 +-
 documentation/imgs/first_approach.png         |  3 +
 documentation/imgs/second_approach.png        |  3 +
 documentation/imgs/third_approach.png         |  3 +
 ..._lon_type.py => embeddings_lat_lon_type.py |  2 +-
 evalgeonamesembeddings.py                     | 70 -------------------
 ...mWikidata.py => extractDataFromWikidata.py |  0
 ...ingDataset.py => extractLearningDataset.py |  0
 helpers.py                                    | 57 +++++++++++++++
 utils.py                                     | 25 ++---
 15 files changed, 97 insertions(+), 193 deletions(-)
 create mode 100644 .gitattributes
 delete mode 100644 3_samplingLearningDataset.py
 delete mode 100644 chrono.py
 create mode 100644 documentation/imgs/first_approach.png
 create mode 100644 documentation/imgs/second_approach.png
 create mode 100644 documentation/imgs/third_approach.png
 rename 4_embeddings_lat_lon_type.py => embeddings_lat_lon_type.py (99%)
 delete mode 100644 evalgeonamesembeddings.py
 rename 1_extractDataFromWikidata.py => extractDataFromWikidata.py (100%)
 rename 2_extractLearningDataset.py => extractLearningDataset.py (100%)

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..24a8e87
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.png filter=lfs diff=lfs merge=lfs -text
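A note on the attribute line above: the `filter=lfs diff=lfs merge=lfs -text` entry is normally written by Git LFS itself rather than by hand. A minimal sketch of the usual setup (assuming Git LFS is already installed on the machine):

    # One-time setup: register the LFS filters in the user's git config
    git lfs install
    # Track all PNGs; this appends the attribute line above to .gitattributes
    git lfs track "*.png"
    # Commit the mapping so other clones fetch the images through LFS too
    git add .gitattributes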
diff --git a/.gitignore b/.gitignore
index d004fc1..589ecbd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -143,4 +143,6 @@ WikipediaExtract/*
 test_comb.sh
 
 .vscode/*
-notes.md
\ No newline at end of file
+notes.md
+
+.idea/*
\ No newline at end of file
diff --git a/3_samplingLearningDataset.py b/3_samplingLearningDataset.py
deleted file mode 100644
index 2d512ce..0000000
--- a/3_samplingLearningDataset.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import subprocess,os,json
-import numpy as np
-import time
-
-import logging
-logging.basicConfig(
-    format='[%(asctime)s][%(levelname)s] %(message)s ', 
-    datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
-    )
-
-from tqdm import tqdm
-
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("corpus_filename")
-parser.add_argument("sampling_size",type=int)
-parser.add_argument("output_filename")
-
-args= parser.parse_args()
-
-CORPUS_FILENAME = args.corpus_filename
-SAMPLE_SIZE = args.sampling_size
-
-# Compute the size of input corpus 
-logging.info("Computing the corpus size...")
-wc_l = subprocess.check_output(["wc","-l", CORPUS_FILENAME ])
-NUMBER_OF_INPUT=int(wc_l.strip().split()[-2])
-logging.info("The corpus is composed of {0} entries".format(NUMBER_OF_INPUT))
-
-# Sampling
-logging.info("Sampling...")
-arr_ = np.arange(NUMBER_OF_INPUT)
-sample = np.random.choice(arr_,SAMPLE_SIZE)
-
-# Prepare Output file
-output = open(args.output_filename,'w')
-
-# Writing in the output file
-logging.info("Writing the output...")
-for ix,line in tqdm(enumerate(open(CORPUS_FILENAME)),total=NUMBER_OF_INPUT):
-    if ix in sample:
-        output.write(line)
-
-logging.info("Done !")
-
-
diff --git a/README.md b/README.md
index 84d5178..c9fc710 100644
--- a/README.md
+++ b/README.md
@@ -3,18 +3,22 @@
  - Python3.6+
 - OS-independent (all dependencies work on Windows!)
 
+It is strongly advised to use Anaconda in a Windows environment!
+
 ## Install dependencies
 
     pip3 install -r requirements.txt
 
+For Anaconda users:
+
+    while read requirement; do conda install --yes $requirement; done < requirements.txt
+
 # Different approaches execution
 
 ## Embedding using places Wikipedia pages
 
-Three scripts need to be used : 
- * 1_extractDataFromWikidata.py
- * 2_extractLearningDataset.py
- * 4_embeddings_lat_lon_type.py
+![first_approach](documentation/imgs/first_approach.png)
+
 
 ### Step 1: Parse Wikipedia data!
 
@@ -62,6 +66,8 @@ The different outputs (one for each neural network architecture) are put in the `
 
 ## Geonames place embedding
 
+![second_approach](documentation/imgs/second_approach.png)
+
 First, download the Geonames dump here: https://download.geonames.org/export/dump/
 
 *N.B.* We advise you to take only the data from one country! (The adjacency graph needs a lot of RAM.)
@@ -87,6 +93,7 @@ Gensim word2vec format is saved in the execution directory.
 
 ## Embedding: train using concatenation of close places
 
+![third_approach](documentation/imgs/third_approach.png)
 
 ### Prepare required data
 
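As a complement to the README instructions above, here is a minimal sketch of fetching a single-country Geonames dump. The per-country archive name (e.g. FR.zip for France) and the data/geonamesData/ target directory are assumptions based on paths used elsewhere in this repository:

    # Download only the French records rather than allCountries.zip;
    # the adjacency graph mentioned above needs far less RAM this way
    wget https://download.geonames.org/export/dump/FR.zip
    # The archive contains FR.txt, a tab-separated file of Geonames entries
    unzip FR.zip -d data/geonamesData/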
diff --git a/chrono.py b/chrono.py
deleted file mode 100644
index 8e0adfd..0000000
--- a/chrono.py
+++ /dev/null
@@ -1,57 +0,0 @@
-import time
-
-class Chronometer():
-    def __init__(self):
-        self.__task_begin_timestamp = {}
-
-    def start(self,task_name):
-        """
-        Start a new task chronometer
-        
-        Parameters
-        ----------
-        task_name : str
-            task id
-        
-        Raises
-        ------
-        ValueError
-            if a running task already exists with that name
-        """
-        if task_name in self.__task_begin_timestamp:
-            raise ValueError("A running task exists with the name {0}!".format(task_name))
-        self.__task_begin_timestamp[task_name] = time.time()
-
-    def stop(self,task_name):
-        """
-        Stop and return the duration of the task
-        
-        Parameters
-        ----------
-        task_name : str
-            task id
-        
-        Returns
-        -------
-        float
-            duration of the task in seconds
-        
-        Raises
-        ------
-        ValueError
-            if no task exist with the id `task_name`
-        """
-        if not task_name in self.__task_begin_timestamp:
-             raise ValueError("The {0} task does not exist!".format(task_name))
-        duration = time.time() - self.__task_begin_timestamp[task_name]
-        del self.__task_begin_timestamp[task_name]
-        return duration
-
-if __name__ == "__main__":
-    chrono = Chronometer()
-    chrono.start("test")
-    chrono.start("test2")
-    time.sleep(3)
-    print(chrono.stop("test"))
-    time.sleep(3)
-    print(chrono.stop("test2"))
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 7913784..94ee452 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -62,7 +62,7 @@ def get_new_ids(cooc_data,id_first_value):
 
 # Logging
 import logging
-from chrono import Chronometer
+from helpers import Chronometer
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ', 
     datefmt='%m/%d/%Y %I:%M:%S %p',
diff --git a/documentation/imgs/first_approach.png b/documentation/imgs/first_approach.png
new file mode 100644
index 0000000..297c1a5
--- /dev/null
+++ b/documentation/imgs/first_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a243605f4d58dee8bad4a18845ab78ca2319049e633b35e6a89540add684be8
+size 298011
diff --git a/documentation/imgs/second_approach.png b/documentation/imgs/second_approach.png
new file mode 100644
index 0000000..e5e693f
--- /dev/null
+++ b/documentation/imgs/second_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bab13df1e420e97a08977aa38382076491a8294d85b7daa0a10d69a36a52fc0
+size 457738
diff --git a/documentation/imgs/third_approach.png b/documentation/imgs/third_approach.png
new file mode 100644
index 0000000..d96596a
--- /dev/null
+++ b/documentation/imgs/third_approach.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad7cbed2e748b814c38eb070d29a23f7417469169a56aeb0e660a743e00430fd
+size 31104
diff --git a/4_embeddings_lat_lon_type.py b/embeddings_lat_lon_type.py
similarity index 99%
rename from 4_embeddings_lat_lon_type.py
rename to embeddings_lat_lon_type.py
index e674e50..d5778f6 100644
--- a/4_embeddings_lat_lon_type.py
+++ b/embeddings_lat_lon_type.py
@@ -39,7 +39,7 @@ from utils import CoordinatesEncoder,TokenizerCustom,_split
 
 # Logging
 import logging
-from chrono import Chronometer
+from helpers import Chronometer
 logging.basicConfig(
     format='[%(asctime)s][%(levelname)s] %(message)s ', 
     datefmt='%m/%d/%Y %I:%M:%S %p',
diff --git a/evalgeonamesembeddings.py b/evalgeonamesembeddings.py
deleted file mode 100644
index c7d346d..0000000
--- a/evalgeonamesembeddings.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Evaluation process
-import gensim
-import glob
-import re
-import gensim
-import random
-from helpers import *
-from scipy.spatial.distance import cosine
-from shapely.geometry import Point
-from scipy.stats.stats import pearsonr
-
-import pandas as pd
-import geopandas as gpd
-
-from tqdm import tqdm
-
-NPAIR = 100000
-fns = glob.glob("data/embeddings/*.bin")
-
-def get_data(fn):
-    data = [int(x) for x in re.findall("\d+",fn)]
-    if not len(data) == 4:
-        return {"embedding_size":data[0],
-        "walk_length":data[1],
-        "number_of_walks":data[2],
-        "word2vec_window_size":data[3],
-        "filepath":fn,
-        "noise":data[4]
-        }
-        #raise Exception("filename should have 4 integers")
-    return {
-        "embedding_size":data[0],
-        "walk_length":data[1],
-        "number_of_walks":data[2],
-        "word2vec_window_size":data[3],
-        "filepath":fn
-    }
-    
-df = read_geonames("./data/geonamesData/FR.txt")
-df["geometry"] = df["latitude longitude".split()].apply(lambda x:Point(x.longitude,x.latitude),axis=1)
-
-# Create GeoDataFrame for faster spatial comparison operations
-gdf = gpd.GeoDataFrame(df)
-
-# Select a sample that concerns the departement "La Manche"
-manche_gdf = gdf[gdf.admin2_code == "50"].copy()
-
-df =pd.DataFrame([get_data(fn) for fn in fns])
-
-def get_pearsons(model):
-    manche_gdf.loc[:,"geometry_centroid"]=manche_gdf.centroid
-    coords = dict(manche_gdf.loc[:,"geonameid geometry_centroid".split()].values)
-    places = list(coords.keys())
-    geodesic_d = []
-    embeddings_d = []
-    for i in tqdm(range(NPAIR),disable=True):
-        placeA=random.choice(places)
-        placeB=random.choice(places)
-        geodesic_d.append(coords[placeA].distance(coords[placeB]))
-        embeddings_d.append(cosine(model.wv[str(placeA)],model.wv[str(placeB)]))
-    return pearsonr(geodesic_d , embeddings_d) # Compute Pearson correlation and associated p-value
-
-df["pearson"] = df.filepath.apply(lambda x : get_pearsons(gensim.models.KeyedVectors.load(x))[0])
-df.fillna(0,inplace=True)
-df.plot.scatter(x="walk_length", y="pearson",c="noise",cmap='inferno')
-plt.show()
-df.plot.scatter(x="number_of_walks", y="pearson",c="noise",cmap='inferno')
-plt.show()
-df.plot.scatter(x="word2vec_window_size", y="pearson",c="noise",cmap='inferno')
-plt.show()
\ No newline at end of file
diff --git a/1_extractDataFromWikidata.py b/extractDataFromWikidata.py
similarity index 100%
rename from 1_extractDataFromWikidata.py
rename to extractDataFromWikidata.py
diff --git a/2_extractLearningDataset.py b/extractLearningDataset.py
similarity index 100%
rename from 2_extractLearningDataset.py
rename to extractLearningDataset.py
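The two renames above simply drop the numeric prefixes from the pipeline scripts; a rename with identical content (similarity index 100%) is typically produced with `git mv`, for example:

    # Rename the scripts while keeping their history trackable
    git mv 1_extractDataFromWikidata.py extractDataFromWikidata.py
    git mv 2_extractLearningDataset.py extractLearningDataset.py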
diff --git a/helpers.py b/helpers.py
index c1e1f34..554e620 100644
--- a/helpers.py
+++ b/helpers.py
@@ -88,3 +88,60 @@ def save_embedding(model,tokenizer,layer_idx,fn):
             f.write('\n')
 
 
+import time
+
+class Chronometer():
+    def __init__(self):
+        self.__task_begin_timestamp = {}
+
+    def start(self,task_name):
+        """
+        Start a new task chronometer
+        
+        Parameters
+        ----------
+        task_name : str
+            task id
+        
+        Raises
+        ------
+        ValueError
+            if a running task already exists with that name
+        """
+        if task_name in self.__task_begin_timestamp:
+            raise ValueError("A running task exists with the name {0}!".format(task_name))
+        self.__task_begin_timestamp[task_name] = time.time()
+
+    def stop(self,task_name):
+        """
+        Stop and return the duration of the task
+        
+        Parameters
+        ----------
+        task_name : str
+            task id
+        
+        Returns
+        -------
+        float
+            duration of the task in seconds
+        
+        Raises
+        ------
+        ValueError
+            if no task exists with the id `task_name`
+        """
+        if task_name not in self.__task_begin_timestamp:
+            raise ValueError("The {0} task does not exist!".format(task_name))
+        duration = time.time() - self.__task_begin_timestamp[task_name]
+        del self.__task_begin_timestamp[task_name]
+        return duration
+
+if __name__ == "__main__":
+    chrono = Chronometer()
+    chrono.start("test")
+    chrono.start("test2")
+    time.sleep(3)
+    print(chrono.stop("test"))
+    time.sleep(3)
+    print(chrono.stop("test2"))
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 37eb4a7..6e052db 100644
--- a/utils.py
+++ b/utils.py
@@ -1,24 +1,27 @@
+# Basic import 
 import math
-import numpy as np
-
-from nltk.tokenize import word_tokenize
-
-import textwrap
-from ngram import NGram
-
-
 import argparse
 import os
 import json
 
-from tqdm import tqdm
+# Data Structure
+import numpy as np
 import geopandas as gpd
+from shapely.geometry import Point,box
+
+# NLP 
+from nltk.tokenize import word_tokenize
+import textwrap
+from ngram import NGram
 
+# Machine learning 
 from keras.layers import Embedding
 from gensim.models import Word2Vec
 
+# Visualisation and parallelisation
+from tqdm import tqdm
 from joblib import Parallel,delayed
-from shapely.geometry import Point,box
+
 
 
 class TokenizerCustom():
@@ -94,7 +97,7 @@ class NgramIndex():
         Returns
         -------
         list of int
-            list of ngram index
+            list of ngram indices
         """
         ngrams = word.lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
-- 
GitLab