From fb3e6b21439382b4b870c31e941e62cc60b11561 Mon Sep 17 00:00:00 2001
From: Fize Jacques <jacques.fize@cirad.fr>
Date: Mon, 20 Jan 2020 15:30:35 +0100
Subject: [PATCH] Update README and a few debug fixes

---
 .gitignore                |   2 +
 README.md                 | 110 +++++++++++++++++++++++++++++++++-----
 combination_embeddings.py |   3 --
 geonames_embedding.py     |   8 +--
 requirements.txt          |   3 +-
 5 files changed, 106 insertions(+), 20 deletions(-)

diff --git a/.gitignore b/.gitignore
index 58819ac..9f23506 100644
--- a/.gitignore
+++ b/.gitignore
@@ -141,3 +141,5 @@ WikipediaExtract/*
 *.DS_Store
 
 test_comb.sh
+
+.vscode/*
diff --git a/README.md b/README.md
index 72ba7df..de19b3a 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,106 @@
-# INSTALL BASEMAP
+# Requirements
 
-```bash
-brew install geos
-pip3 install https://github.com/matplotlib/basemap/archive/master.zip
-```
+ - Python 3.6+
+ - OS independent (all dependencies also work on Windows)
 
-# GET DATA
+## Install dependencies
 
-## Process Wikipedia
+    pip3 install -r requirements.txt
 
-python3 -m gensim.scripts.segment_wiki -i -f enwiki-latest-pages-articles.xml.bz2 -o enwiki-latest.json.gz
+# Running the different approaches
 
-## Process Wikidata
+## Embedding using Wikipedia pages of places
 
-python3 extractInfoWikidata.py
+Three scripts are needed:
+ * `1_extractDataFromWikidata.py`
+ * `2_extractLearningDataset.py`
+ * `4_embeddings_lat_lon_type.py`
 
-## Fuse Data for training
+### Step 1: Parse the Wikipedia dump
 
-python3 extractsubWikipedia.py
\ No newline at end of file
+First, download the Wikipedia dump in the desired language, *e.g.* enwiki-latest-pages-articles.xml.bz2.
+
+Then, run the `gensim` parser (documentation [here](https://radimrehurek.com/gensim/scripts/segment_wiki.html)) with the following command:
+
+    python3 -m gensim.scripts.segment_wiki -i -f <wikipedia_dump_file> -o <output>
+
+### Step 2: Select and filter place entities from Wikidata
+
+We use Wikidata to identify which Wikipedia pages describe a place. Run the following command:
+
+    python3 1_extractDataFromWikidata.py <Wikidata dump (.gz)> <output_filename>
+
+### Step 3: Extract data from Wikipedia pages
+
+Using the previous outputs, extract the text data of the selected Wikipedia pages with the following command:
+
+    python3 2_extractLearningDataset.py <wikipedia_filename (output from step 1)> <wikidata_extract (output from step 2)> <output_filename>
+
+### Step 4: Run the embedding extraction
+
+To learn the place embeddings, use the `4_embeddings_lat_lon_type.py` script.
+
+#### Available Parameters
+
+| Parameter              | Description (default value)           |
+|------------------------|---------------------------------------|
+| --max_sequence_length  | Maximum sequence length (15)          |
+| --embedding_dimension  | Embedding vector size (100)           |
+| --batch_size           | Batch size used during training (100) |
+| --epochs               | Number of epochs (100)                |
+| -v                     | Enable Keras verbose output           |
+
+#### Output
+
+The different outputs (one for each neural network architecture) are written to the `outputs` directory:
+
+* `outputs/Bi-GRU_100dim_20epoch_1000batch__coord.png`: **coordinates accuracy plot**
+* `outputs/Bi-GRU_100dim_20epoch_1000batch__place_type.png`: **place type accuracy plot**
+* `outputs/Bi-GRU_100dim_20epoch_1000batch.csv`: **training history**
+* `outputs/Bi-GRU_100dim_20epoch_1000batch.txt`: **embeddings**
+
+
+## Geonames place embedding
+
+First, download a Geonames dump from https://download.geonames.org/export/dump/
+
+*N.B.* We advise you to use the data of a single country only: building the adjacency graph needs a lot of RAM.
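+
+For example, a minimal way to fetch a single-country dump before running the script (an illustration, not part of the repository; France is used here, and any country code listed on the download page works the same way):
+
+    # fetch and unzip the per-country Geonames dump (FR = France)
+    wget https://download.geonames.org/export/dump/FR.zip
+    unzip FR.zip
+    # quick sanity check: FR.txt is the tab-separated file passed to geonames_embedding.py below
+    wc -l FR.txt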
+
+Then run the embedding script:
+
+    python3 geonames_embedding.py <geonames dump (*.txt)>
+
+### Available Parameters
+
+| Parameter              | Description                                                        |
+|------------------------|--------------------------------------------------------------------|
+| --nbcpu                | Number of CPUs used during the embedding learning phase            |
+| --vector-size          | Embedding vector size                                              |
+| --walk-length          | Length of the generated random walks                               |
+| --num-walks            | Number of walks generated for each vertex (place)                  |
+| --word2vec-window-size | Window size used in Word2vec                                       |
+| --buffer-size          | Buffer size used to detect adjacency relationships between places  |
+| -d                     | Integrate distances between places in the topology graph           |
+| --dist                 | Distance measure used when '-d' is set                             |
+
+### Output
+
+The embeddings are saved in the Gensim word2vec format in the execution directory.
+
+## Embedding: train using the concatenation of close places
+
+The command-line interface of this approach is the following:
+
+    Toponym Combination
+
+    positional arguments:
+      geoname_input         Filepath of the Geonames file you want to use.
+      geoname_hierachy_input
+                            Filepath of the Geonames file you want to use.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -v, --verbose
+      -n NGRAM_SIZE, --ngram-size NGRAM_SIZE
+      -t TOLERANCE_VALUE, --tolerance-value TOLERANCE_VALUE
+      -e EPOCHS, --epochs EPOCHS
+      -m {CNN,LSTM}, --model {CNN,LSTM}
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 1fddfe6..ef5bdc0 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -106,9 +106,6 @@
 geoname_vec = dict(filtered["geonameid cell_vec".split()].values)
 del filtered
 
-
-
-
 embedding_dim = 256
 num_words = len(index.index_ngram) # necessary for the embedding matrix
 
diff --git a/geonames_embedding.py b/geonames_embedding.py
index e5c6305..8906e95 100644
--- a/geonames_embedding.py
+++ b/geonames_embedding.py
@@ -217,12 +217,12 @@ model = node2vec.fit(window=WORD2VEC_WINDOW, min_count=1, batch_words=NUMBER_OF_
 
 # Saving the embedding model
 if not IS_NOISE:
-    model.save("manche_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
+    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}.bin".format(dim = VECTOR_SIZE,
     walk_l = WALK_LENGTH,
     num_walk = NUMBER_OF_WALK,
-    window = WORD2VEC_WINDOW))#,noise = NUMBER_OF_NODE_DESPATIALIZED))
+    window = WORD2VEC_WINDOW,filename=GEONAMES_FN.split("/")[-1] ))#,noise = NUMBER_OF_NODE_DESPATIALIZED))
 else:
-    model.save("manche_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
+    model.save("{filename}_{dim}_{walk_l}_{num_walk}_{window}_{noise}.bin".format(dim = VECTOR_SIZE,
     walk_l = WALK_LENGTH,
     num_walk = NUMBER_OF_WALK,
-    window = WORD2VEC_WINDOW,noise = NUMBER_OF_NODE_DESPATIALIZED))
\ No newline at end of file
+    window = WORD2VEC_WINDOW,noise = NUMBER_OF_NODE_DESPATIALIZED,filename=GEONAMES_FN.split("/")[-1]))
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b109202..35fa742 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,4 +13,5 @@ sklearn
 tensorflow
 keras
 ngram
-shapely
\ No newline at end of file
+shapely
+sqlitedict
\ No newline at end of file
--
GitLab
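
For reference, a hypothetical invocation of the toponym-combination training matching the help text added to the README above (the script name is inferred from `combination_embeddings.py` in the diff; the option values and the Geonames file arguments are illustrative placeholders, not documented defaults):

    python3 combination_embeddings.py -n 4 -t 0.002 -e 100 -m LSTM <geonames dump (*.txt)> <geonames hierarchy (*.txt)>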