Commit fb3e6b21 authored by Fize Jacques

Update Readme and a few debug fixes

parent 935b46a8
@@ -141,3 +141,5 @@ WikipediaExtract/*
 *.DS_Store
 test_comb.sh
+.vscode/*
README (previous content, removed):

# INSTALL BASEMAP
```bash
brew install geos
pip3 install https://github.com/matplotlib/basemap/archive/master.zip
```
# GET DATA
## Process Wikipedia
    python3 -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz
## Process Wikidata
    python3 extractInfoWikidata.py
## Fuse Data for training
    python3 extractsubWikipedia.py

README (updated content):

# Requirements

- Python 3.6+
- OS independent (all dependencies work on Windows!)

## Install dependencies

    pip3 install -r requirements.txt

# Different approaches execution

## Embedding using places Wikipedia pages

Three scripts need to be used (see the steps below):

* 1_extractDataFromWikidata.py
* 2_extractLearningDataset.py
* 4_embeddings_lat_lon_type.py

### Step 1: Parse Wikipedia data

First, download the Wikipedia corpus in the wanted language, *e.g.* `enwiki-latest-pages-articles.xml.bz2`.
Then, use the `gensim` parser (documentation [here](https://radimrehurek.com/gensim/scripts/segment_wiki.html)) with the following command:
    python3 -m gensim.scripts.segment_wiki -i -f <wikipedia_dump_file> -o <output>
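Each line of the resulting file is one article serialized as JSON (with `title`, `section_titles` and `section_texts` fields). As a quick sanity check, the output can be inspected like this (the file name is only an example):

```python
import gzip
import json

# Peek at the first article of the segment_wiki output: one gzipped JSON record per line.
with gzip.open("enwiki-latest.json.gz", "rt", encoding="utf-8") as f:
    for line in f:
        article = json.loads(line)
        print(article["title"], len(article["section_titles"]))
        break
```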
### Step 2: Select and filter entities from Wikidata
We use Wikidata to identify which Wikipedia pages concern a place. Simply run the following command:

    python3 1_extractDataFromWikidata.py <Wikidata dump (.gz)> <output_filename>
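The selection logic itself lives in `1_extractDataFromWikidata.py`. Purely as an illustration of the idea (and not that script's actual implementation), filtering a standard Wikidata JSON dump for entities that carry a coordinate claim (P625) could look like this:

```python
import gzip
import json

def iter_place_entities(dump_path):
    """Illustrative sketch: yield Wikidata entities holding a coordinate (P625) claim,
    i.e. entities that most likely describe a place."""
    with gzip.open(dump_path, "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip().rstrip(",")
            if line in ("[", "]", ""):  # skip the JSON array brackets of the dump
                continue
            entity = json.loads(line)
            if "P625" in entity.get("claims", {}):
                # Keep the entity id and, when available, its English Wikipedia title.
                title = entity.get("sitelinks", {}).get("enwiki", {}).get("title")
                yield entity["id"], title
```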
### Step 3: Extract data from Wikipedia pages
Using the previous outputs, we extract text data from the selected Wikipedia pages with the following command:

    python3 2_extractLearningDataset.py <wikipedia_filename (output from step 1)> <wikidata_extract (output from step 2)> <output_filename>
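For example, chaining the outputs of the two previous steps (all file names below are purely illustrative):

    python3 2_extractLearningDataset.py enwiki-latest.json.gz wikidata_places_extract.json place_corpus.json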
### Step 4: Run embedding extraction

To learn the place embeddings, run the `4_embeddings_lat_lon_type.py` script.
#### Available Parameters

| Parameter              | Description (default value)           |
|------------------------|---------------------------------------|
| --max_sequence_length  | Maximum sequence length (15)          |
| --embedding_dimension  | Embedding vector size (100)           |
| --batch_size           | Batch size used during training (100) |
| --epochs               | Number of epochs (100)                |
| -v                     | Display the Keras verbose output      |
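As an example, a run matching the output file names listed below (any input arguments the script may require are omitted here, since they are not documented above):

    python3 4_embeddings_lat_lon_type.py --embedding_dimension 100 --epochs 20 --batch_size 1000 -v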
#### Output

The different outputs (one for each neural network architecture) are put in the `outputs` directory:
* outputs/Bi-GRU_100dim_20epoch_1000batch__coord.png: **coordinates accuracy plot**
* outputs/Bi-GRU_100dim_20epoch_1000batch__place_type.png: **place type accuracy plot**
* outputs/Bi-GRU_100dim_20epoch_1000batch.csv: **training history**
* outputs/Bi-GRU_100dim_20epoch_1000batch.txt: **embeddings**
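The training history CSV can be inspected with pandas; a minimal sketch (the exact columns depend on the metrics logged by Keras):

```python
import pandas as pd

# Load the training history of one architecture (example file name from the list above).
history = pd.read_csv("outputs/Bi-GRU_100dim_20epoch_1000batch.csv")
print(history.columns.tolist())  # metric names as logged by Keras
print(history.tail())
```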
## Geonames place embedding
First, download the Geonames dump here: https://download.geonames.org/export/dump/

*N.B.* We advise you to take the data from only one country (the adjacency graph needs a lot of RAM).

    python3 geonames_embedding.py <geonames dump (*.txt)>
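For reference, a Geonames dump is a tab-separated file without a header row; a minimal sketch for loading the columns most relevant here (`FR.txt` is only an example country file):

```python
import csv
import pandas as pd

# Geonames dumps have 19 unnamed, tab-separated columns (no header); per the official
# readme, column 0 is the geonameid, 1 the name, 4/5 the latitude/longitude.
df = pd.read_csv("FR.txt", sep="\t", header=None,
                 quoting=csv.QUOTE_NONE, low_memory=False)
df = df.rename(columns={0: "geonameid", 1: "name", 4: "latitude", 5: "longitude"})
print(df[["geonameid", "name", "latitude", "longitude"]].head())
```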
### Available Parameters

| Parameter              | Description                                                        |
|------------------------|--------------------------------------------------------------------|
| --nbcpu                | Number of CPUs used for the embedding learning phase               |
| --vector-size          | Embedding vector size                                              |
| --walk-length          | Length of the generated walks                                      |
| --num-walks            | Number of walks for each vertex (place)                            |
| --word2vec-window-size | Window size used in word2vec                                       |
| --buffer-size          | Buffer size used to detect adjacency relationships between places  |
| -d                     | Integrate distances between places in the topology graph           |
| --dist                 | Distance measure used if '-d' is set                               |
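For instance (all parameter values below are purely illustrative):

    python3 geonames_embedding.py FR.txt --nbcpu 4 --vector-size 64 --walk-length 30 --num-walks 10 --word2vec-window-size 5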
### Output

The embeddings are saved in the gensim word2vec format in the execution directory.
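A minimal sketch for loading the result with gensim (the file name below is an example; the real one encodes the run parameters, and the vocabulary keys are assumed to be Geonames identifiers):

```python
from gensim.models import Word2Vec

# Load a saved embedding model (example name: <dump name>_<dim>_<walk length>_<num walks>_<window>.bin).
model = Word2Vec.load("FR.txt_64_30_10_5.bin")

# Assuming graph vertices are Geonames ids stored as strings, inspect one place's neighbours.
some_geonameid = "3038354"  # hypothetical id
print(model.wv.most_similar(some_geonameid, topn=5))
```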
## Embedding: train using concatenation of close places

Toponym combination (command-line help):

    positional arguments:
      geoname_input         Filepath of the Geonames file you want to use.
      geoname_hierachy_input
                            Filepath of the Geonames hierarchy file you want to use.

    optional arguments:
      -h, --help            show this help message and exit
      -v, --verbose
      -n NGRAM_SIZE, --ngram-size NGRAM_SIZE
      -t TOLERANCE_VALUE, --tolerance-value TOLERANCE_VALUE
      -e EPOCHS, --epochs EPOCHS
      -m {CNN,LSTM}, --model {CNN,LSTM}
@@ -106,9 +106,6 @@ geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
del filtered
embedding_dim = 256
num_words = len(index.index_ngram) # necessary for the embedding matrix
...
@@ -217,12 +217,12 @@ model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_
 # Saving the embedding model
 if not IS_NOISE:
-    model.save("manche_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
+    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
         walk_l = WALK_LENGTH,
         num_walk = NUMBER_OF_WALK,
-        window = WORD2VEC_WINDOW))
+        window = WORD2VEC_WINDOW, filename = GEONAMES_FN.split("/")[-1]))
 else:
-    model.save("manche_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
+    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
         walk_l = WALK_LENGTH,
         num_walk = NUMBER_OF_WALK,
-        window = WORD2VEC_WINDOW, noise = NUMBER_OF_NODE_DESPATIALIZED))
+        window = WORD2VEC_WINDOW, noise = NUMBER_OF_NODE_DESPATIALIZED, filename = GEONAMES_FN.split("/")[-1]))
@@ -13,4 +13,5 @@ sklearn
 tensorflow
 keras
 ngram
 shapely
+sqlitedict