From fb3e6b21439382b4b870c31e941e62cc60b11561 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 20 Jan 2020 15:30:35 +0100
Subject: [PATCH] Update README and a few debug fixes

---
 .gitignore                |   2 +
 README.md                 | 110 +++++++++++++++++++++++++++++++++-----
 combination_embeddings.py |   3 --
 geonames_embedding.py     |   8 +--
 requirements.txt          |   3 +-
 5 files changed, 106 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index 58819ac..9f23506 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,5 @@ WikipediaExtract/*
 
 *.DS_Store
 test_comb.sh
+
+.vscode/*
diff --git a/README.md b/README.md
index 72ba7df..de19b3a 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,106 @@
-# INSTALL BASEMAP
+# Requirements
 
-```bash
-brew install geos
-pip3 install https://github.com/matplotlib/basemap/archive/master.zip
-```
+ - Python 3.6+
+ - OS independent (all dependencies work on Windows!)
 
-# GET DATA
+## Install dependencies
 
-## Process Wikipedia
+    pip3 install -r requirements.txt
 
-python3 -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz
+# Running the different approaches
 
-## Process Wikidata
+## Embedding using Wikipedia pages of places
 
-python3 extractInfoWikidata.py
+Three scripts are needed:
+ * 1_extractDataFromWikidata.py
+ * 2_extractLearningDataset.py
+ * 4_embeddings_lat_lon_type.py
 
-## Fuse Data for training 
+### Step 1: Parse the Wikipedia data
 
-python3 extractsubWikipedia.py
\ No newline at end of file
+First, download the Wikipedia corpus in the desired language, *e.g.* `enwiki-latest-pages-articles.xml.bz2`.
+
+Then parse it with the `gensim` wiki parser (documentation [here](https://radimrehurek.com/gensim/scripts/segment_wiki.html)) using the following command:
+
+    python3 -m gensim.scripts.segment_wiki -i -f <wikipedia_dump_file> -o <output>
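+
+For example, with the English dump mentioned above:
+
+    python3 -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz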
+
+### Step 2: Select and filter entities from Wikidata
+
+We use Wikidata to identify which Wikipedia pages describe a place. Simply run the following command:
+
+    python3 1_extractDataFromWikidata.py <Wikidata Dump (.gz)> <output_filename>
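+
+For instance, with the compressed Wikidata JSON dump (both filenames are illustrative):
+
+    python3 1_extractDataFromWikidata.py latest-all.json.gz wikidata_places.json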
+
+### Step 3: Extract data from Wikipedia pages
+
+Using the two previous outputs, we extract the text data of the selected Wikipedia pages with the following command:
+
+    python3 2_extractLearningDataset.py <wikipedia_filename (output from step 1)> <wikidata_extract(output from step2)> <output_filename>
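+
+Continuing the example above (the output filename is illustrative):
+
+    python3 2_extractLearningDataset.py enwiki-latest.json.gz wikidata_places.json place_corpus.json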
+
+### Step 4: Run the embedding extraction
+
+To learn the place embeddings, use the `4_embeddings_lat_lon_type.py` script (an example invocation is given after the parameter table below).
+
+#### Available Parameters
+
+| Parameter              | Description (default value)           |
+|------------------------|---------------------------------------|
+| --max_sequence_length  | Maximum sequence length (15)          |
+| --embedding_dimension  | Embedding vector size (100)           |
+| --batch_size           | Batch size used during training (100) |
+| --epochs               | Number of epochs (100)                |
+| -v                     | Enable Keras verbose output           |
+
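+A possible invocation, assuming the script takes the dataset produced at step 3 as input (the flag values are illustrative and match the output filenames listed below):
+
+    python3 4_embeddings_lat_lon_type.py place_corpus.json --embedding_dimension 100 --epochs 20 --batch_size 1000
+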
+#### Output
+
+The different outputs (one per neural network architecture) are stored in the `outputs` directory:
+
+* outputs/Bi-GRU_100dim_20epoch_1000batch__coord.png: **coordinates accuracy plot**
+* outputs/Bi-GRU_100dim_20epoch_1000batch__place_type.png: **place type accuracy plot**
+* outputs/Bi-GRU_100dim_20epoch_1000batch.csv: **training history**
+* outputs/Bi-GRU_100dim_20epoch_1000batch.txt: **embeddings**
+
+
+## Geonames place embedding
+
+First, download a Geonames dump from https://download.geonames.org/export/dump/
+
+*N.B.* We advise you to use the data of a single country only, since building the adjacency graph requires a lot of RAM.
+
+    python3 geonames_embedding.py <geonames dump(*.txt)>
+
+### Available Parameters
+
+| Parameter              | Description                                                        |
+|------------------------|--------------------------------------------------------------------|
+| --nbcpu                | Number of CPUs used for the embedding learning phase               |
+| --vector-size          | Embedding size                                                     |
+| --walk-length          | Length of the generated walks                                      |
+| --num-walks            | Number of walks for each vertex (place)                            |
+| --word2vec-window-size | Window size used in Word2Vec                                       |
+| --buffer-size          | Buffer size used to detect adjacency relationships between places  |
+| -d                     | Integrate distances between places in the topology graph           |
+| --dist                 | Distance metric used when '-d' is set                              |
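+
+A possible invocation, using the French Geonames dump as an example (the flag values are illustrative):
+
+    python3 geonames_embedding.py FR.txt --nbcpu 4 --vector-size 64 --walk-length 30 --num-walks 10 --word2vec-window-size 5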
+
+### Output 
+
+The learned embeddings are saved in the Gensim Word2Vec format, in the execution directory.
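+
+A minimal sketch of how to reload the saved embeddings with Gensim (the filename is illustrative and follows the `<dump>_<dim>_<walk length>_<num walks>_<window>.bin` pattern used by the script):
+
+    from gensim.models import Word2Vec
+
+    # Load the model saved by geonames_embedding.py (the filename is illustrative)
+    model = Word2Vec.load("FR.txt_64_30_10_5.bin")
+
+    # Vertices are, presumably, keyed by their geonameid
+    print(model.wv.most_similar("3017382", topn=5))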
+
+## Embedding: train using the concatenation of close places
+
+This approach is implemented in `combination_embeddings.py`. Its command-line help is reproduced below, followed by an example invocation:
+
+    Toponym Combination
+
+    positional arguments:
+      geoname_input         Filepath of the Geonames file you want to use.
+      geoname_hierachy_input
+                            Filepath of the Geonames file you want to use.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -v, --verbose
+      -n NGRAM_SIZE, --ngram-size NGRAM_SIZE
+      -t TOLERANCE_VALUE, --tolerance-value TOLERANCE_VALUE
+      -e EPOCHS, --epochs EPOCHS
+      -m {CNN,LSTM}, --model {CNN,LSTM}
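+
+A possible invocation, assuming a Geonames dump `FR.txt` and its hierarchy file `FR_hierarchy.txt` (both filenames and flag values are illustrative):
+
+    python3 combination_embeddings.py FR.txt FR_hierarchy.txt -n 4 -e 100 -m LSTM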
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 1fddfe6..ef5bdc0 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -106,9 +106,6 @@ geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
 del filtered
 
 
-
-
-
 embedding_dim = 256
 num_words = len(index.index_ngram) # necessary for the embedding matrix 
 
diff --git a/geonames_embedding.py b/geonames_embedding.py
index e5c6305..8906e95 100644
--- a/geonames_embedding.py
+++ b/geonames_embedding.py
@@ -217,12 +217,12 @@ model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_
 
 # Saving the embedding model
 if not IS_NOISE:
-    model.save("manche_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
+    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
     walk_l = WALK_LENGTH,
     num_walk = NUMBER_OF_WALK,
-    window = WORD2VEC_WINDOW))#,noise = NUMBER_OF_NODE_DESPATIALIZED))
+    window = WORD2VEC_WINDOW, filename=GEONAMES_FN.split("/")[-1]))#,noise = NUMBER_OF_NODE_DESPATIALIZED))
 else:
-    model.save("manche_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
+    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
     walk_l = WALK_LENGTH,
     num_walk = NUMBER_OF_WALK,
-    window = WORD2VEC_WINDOW,noise = NUMBER_OF_NODE_DESPATIALIZED))
\ No newline at end of file
+    window = WORD2VEC_WINDOW, noise = NUMBER_OF_NODE_DESPATIALIZED, filename=GEONAMES_FN.split("/")[-1]))
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b109202..35fa742 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,5 @@ sklearn
 tensorflow
 keras
 ngram
-shapely
\ No newline at end of file
+shapely
+sqlitedict
\ No newline at end of file
-- 
GitLab