From e7c14b5d417c5782c93f88d8e33c964a5bf9b24c Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Wed, 23 Sep 2020 16:50:48 +0200
Subject: [PATCH] UPD

---
 combination_embeddingsv4.py                    | 309 ++++--------------
 generate_dataset.py                            | 148 +++++++++
 lib/datageneratorv4.py                         |  24 ++
 .../toponym_combination_embedding_v3.json      |  18 +
 4 files changed, 251 insertions(+), 248 deletions(-)
 create mode 100644 generate_dataset.py
 create mode 100644 lib/datageneratorv4.py
 create mode 100644 parser_config/toponym_combination_embedding_v3.json

diff --git a/combination_embeddingsv4.py b/combination_embeddingsv4.py
index b1d3745..cfbd49d 100644
--- a/combination_embeddingsv4.py
+++ b/combination_embeddingsv4.py
@@ -1,78 +1,35 @@
 # Base module
-import re
 import os
-import json
+import sys

 # Structure
 import pandas as pd
 import numpy as np
-import geopandas as gpd

 # DEEPL module
 from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
 from keras.models import Model
-from keras import backend as K
 from keras.callbacks import ModelCheckpoint
-import tensorflow as tf
-
-# Geometry
-from shapely.geometry import Point
-
 # Custom module
-from helpers import read_geonames
-from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
 from lib.utils import ConfigurationReader
-from lib.metrics import lat_accuracy,lon_accuracy
-from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
-
+from lib.geo import accuracy_k,haversine_tf_1circle
+from helpers import EpochTimer

 # Logging
-from tqdm import tqdm
 import logging
-from helpers import parse_title_wiki,EpochTimer
-
 logging.getLogger('gensim').setLevel(logging.WARNING)
-
-def get_new_ids(cooc_data,id_first_value):
-    """
-    Return new ids from cooccurrence data
-
-    Parameters
-    ----------
-    cooc_data : pd.DataFrame
-        cooccurrence da
-    id_first_value : int
-        id beginning value
-
-    Returns
-    -------
-    dict
-        new ids for each toponyms
-    """
-    topo_id = {}
-    id_ = id_first_value
-    for title in cooc_data.title.values:
-        if not title in topo_id:
-            id_+=1
-            topo_id[id_]=title
-    for interlinks in cooc_data.interlinks.values:
-        for interlink in interlinks.split("|"):
-            if not interlink in topo_id:
-                id_+=1
-                topo_id[id_]=interlink
-    return topo_id
-
-# LOGGING CONF
-logging.basicConfig(
+logging.basicConfig( # LOGGING CONF
     format='[%(asctime)s][%(levelname)s] %(message)s ',
     datefmt='%m/%d/%Y %I:%M:%S %p',
     level=logging.INFO
     )

-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+# COMMAND ARGS
+args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
+    .parse_args("FR FR_inclusion.csv FR_adjacent.csv FR_cooc.csv -i -a -w".split())#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())

 #
 #################################################
@@ -82,235 +39,108 @@
 MODEL_NAME = "Bi-LSTM_NGRAM"
 NGRAM_SIZE = args.ngram_size
 ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
-ITER_ADJACENCY = args.adjacency_iteration
-COOC_SAMPLING_NUMBER = args.cooc_sample_size
 WORDVEC_ITER = args.ngram_word2vec_iter
-EMBEDDING_DIM = 256
+EMBEDDING_DIM = args.dimension

 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
-GEONAME_FN = args.geoname_input
-DATASET_NAME = args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
-REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
-ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
-        GEONAME_FN,
-        ITER_ADJACENCY,
-        REGION_SUFFIX_FN)
-
-COOC_FN = args.wikipedia_cooc_fn
-PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
-    GEONAME_FN.split("/")[-1],
-    EPOCHS,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    REGION_SUFFIX_FN)
-
-REL_CODE=""
+INCLUSION_FN = args.geoname_inclusion
+ADJACENT_FN = args.geonames_adjacent
+COOC_FN = args.wikipedia_cooc
+
+DATASET_NAME = args.dataset_name
+
+PREFIX_OUTPUT_FN = DATASET_NAME
+PREFIX_OUTPUT_FN+="_{0}".format(NGRAM_SIZE)
+PREFIX_OUTPUT_FN+="_{0}".format(EPOCHS)
+
 if args.adjacency:
     PREFIX_OUTPUT_FN += "_A"
-    REL_CODE+= "A"
 if args.inclusion:
     PREFIX_OUTPUT_FN += "_I"
-    REL_CODE+= "I"
 if args.wikipedia_cooc:
     PREFIX_OUTPUT_FN += "_C"
-    REL_CODE+= "C"

 MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
 INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
 HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)

-from lib.utils import MetaDataSerializer
-
-meta_data = MetaDataSerializer(
-    MODEL_NAME,
-    DATASET_NAME,
-    REL_CODE,
-    COOC_SAMPLING_NUMBER,
-    ITER_ADJACENCY,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    EPOCHS,
-    EMBEDDING_DIM,
-    WORDVEC_ITER,
-    INDEX_FN,
-    MODEL_OUTPUT_FN,
-    HISTORY_FN
-)
-meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
-
 #############################################################################################
 ################################# LOAD DATA #################################################
 #############################################################################################

-# LOAD Geonames DATA
-logging.info("Load Geonames data...")
-geoname_data = read_geonames(GEONAME_FN).fillna("")
-
-train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
-test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
-
-logging.info("Geonames data loaded!")
-
-# SELECT ENTRY with class == to A and P (Areas and Populated Places)
-filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
-#CLEAR RAM
-del geoname_data
+data_used = []
+if args.wikipedia:
+    data_used.append(pd.read_csv(COOC_FN,sep="\t"))

-# IF REGION
-if args.admin_code_1 != "None":
-    filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
+if args.inclusion:
+    data_used.append(pd.read_csv(INCLUSION_FN,sep="\t"))

-# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
-filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+if args.adjacency:
+    data_used.append(pd.read_csv(ADJACENT_FN, sep="\t"))

+if len(data_used) < 1:
+    print("No toponym pair type selected (use -i, -a and/or -w). Stopping the program...")
+    sys.exit(1)
+
+pairs_of_toponym = pd.concat(data_used)

 #############################################################################################
 ################################# RETRIEVE RELATIONSHIPS ####################################
 #############################################################################################
-
-# INITIALIZE RELATION STORE
-rel_store = []
-
-# Retrieve adjacency relationships
-if args.adjacency:
-    logging.info("Retrieve adjacency relationships ! ")
-
-    if not os.path.exists(ADJACENCY_REL_FILENAME):
-        bounds = get_bounds(filtered) # Required to get adjacency relationships
-        rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
-        json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
-    else:
-        logging.info("Open and load data from previous computation!")
-        rel_store=json.load(open(ADJACENCY_REL_FILENAME))
-
-    logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
-
-# Retrieve inclusion relationships
-if args.inclusion:
-    logging.info("Retrieve inclusion relationships ! ")
-
-    cpt_rel = len(rel_store)
-    rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
-
-    logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
-
-
-if args.wikipedia_cooc:
-    logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
-
-    cooc_data = pd.read_csv(COOC_FN,sep="\t")
-    cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
-    cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
-    id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
-    wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
-    title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
-    cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
-    filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
-    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
-    if not "title" in train_cooc_indices:
-        train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
-    train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
-    test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
-
-    logging.info("Merged with Geonames data !")
-
-    # EXTRACT rel
-    logging.info("Extracting cooccurrence relationships")
-    cpt=0
-    for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
-        for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
-            cpt+=1
-            rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
-    logging.info("Extract {0} cooccurrence relationships !".format(cpt))
-
-
-# STORE ID to name
-geoname2name = dict(filtered["geonameid name".split()].values)
-
 # ENCODING NAME USING N-GRAM SPLITTING
 logging.info("Encoding toponyms to ngram...")
 index = NgramIndex(NGRAM_SIZE)

 # Identify all ngram available
-filtered.name.apply(lambda x : index.split_and_add(x))
-if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
+pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x))
+pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x))
-geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
-
-if args.wikipedia_cooc:
-    geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
+num_words = len(index.index_ngram) # necessary for the embedding matrix

 # SAVE THE INDEX TO REUSE THE MODEL
 index.save(INDEX_FN)
-
 logging.info("Done !")
-
 #############################################################################################
-################################# ENCODE COORDINATES ########################################
+################################# NGRAM EMBEDDINGS ##########################################
 #############################################################################################
-
-
-# Encode each geonames entry coordinates
-geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}
-# CLEAR RAM
-del filtered
-
-
-EMBEDDING_DIM = 256
-num_words = len(index.index_ngram) # necessary for the embedding matrix
-
-logging.info("Preparing Input and Output data...")
-
+logging.info("Generating N-GRAM Embedding...")
+embedding_weights = index.get_embedding_layer(np.concatenate((pairs_of_toponym.toponym.values,pairs_of_toponym.toponym_context.values)),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
+logging.info("Embedding generated !")

 #############################################################################################
 ################################# BUILD TRAIN/TEST DATASETS #################################
 #############################################################################################
+logging.info("Preparing Input and Output data...")

-X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
-X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
+X_1_train,X_2_train=[],[]
+X_1_test,X_2_test=[],[]
 y_train,y_test = [],[]

-for couple in rel_store:
-    geonameId_1,geonameId_2 = couple[0],couple[1]
-    if not geonameId_1 in geoname2encodedname:
-        continue
-    top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
-    if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
-
-        X_1_train.append(top1)
-        X_2_train.append(top2)
-
-        y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
-        #y_lon_train.append(geoname_vec[geonameId_1][0])
-        #y_lat_train.append(geoname_vec[geonameId_1][1])
-
+for couple in pairs_of_toponym["toponym toponym_context split longitude latitude".split()].itertuples():
+    top,top_c,split_ = couple[1], couple[2], couple[3]
+    coord = zero_one_encoding(couple[-2],couple[-1]) # 0 and 1 encoding
+    enc_top, enc_top_c = index.encode(top),index.encode(top_c)
+    if split_ == "train":
+        X_1_train.append(enc_top)
+        X_2_train.append(enc_top_c)
+        y_train.append(list(coord))
     else:
-        X_1_test.append(top1)
-        X_2_test.append(top2)
+        X_1_test.append(enc_top)
+        X_2_test.append(enc_top_c)
+        y_test.append(list(coord))

-        y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]])
-        #y_lon_test.append(geoname_vec[geonameId_1][0])
-        #y_lat_test.append(geoname_vec[geonameId_1][1])
-
-# NUMPYZE inputs and output lists
+# "NUMPYZE" inputs and output lists
 X_1_train = np.array(X_1_train)
 X_2_train = np.array(X_2_train)
-y_lat_train = np.array(y_lat_train)
-y_lon_train = np.array(y_lon_train)
 y_train = np.array(y_train)

 X_1_test = np.array(X_1_test)
 X_2_test = np.array(X_2_test)
-y_lat_test = np.array(y_lat_test)
-y_lon_test = np.array(y_lon_test)
 y_test = np.array(y_test)

 logging.info("Data prepared !")
@@ -320,20 +150,11 @@ logging.info("Data prepared !")
 if not os.path.exists("outputs/"):
     os.makedirs("outputs/")

-#############################################################################################
-################################# NGRAM EMBEDDINGS ##########################################
-#############################################################################################
-
-
-logging.info("Generating N-GRAM Embedding...")
-embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
-logging.info("Embedding generated !")

 #############################################################################################
 ################################# MODEL DEFINITION ##########################################
 #############################################################################################
-
 input_1 = Input(shape=(index.max_len,))
 input_2 = Input(shape=(index.max_len,))
@@ -343,20 +164,15 @@ x1 = embedding_layer(input_1)
 x2 = embedding_layer(input_2)

 # Each LSTM learns on a permutation of the input toponyms
-x1 = Bidirectional(LSTM(98))(x1)
-x2 = Bidirectional(LSTM(98))(x2)
-
-x = concatenate([x1,x2])#,x3])
+x1 = Bidirectional(LSTM(100))(x1)
+x2 = Bidirectional(LSTM(100))(x2)
+x = concatenate([x1,x2])

 x1 = Dense(500,activation="relu")(x)
-# x1 = Dropout(0.3)(x1)
 x1 = Dense(500,activation="relu")(x1)
-# x1 = Dropout(0.3)(x1)

 x2 = Dense(500,activation="relu")(x)
-# x2 = Dropout(0.3)(x2)
 x2 = Dense(500,activation="relu")(x2)
-# x2 = Dropout(0.3)(x2)

 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
@@ -364,14 +180,8 @@ output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 output_coord = concatenate([output_lon,output_lat],name="output_coord")

 model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
-
 model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})

-# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
-
-# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
-
-
 #############################################################################################
 ################################# TRAINING LAUNCH ###########################################
 #############################################################################################
@@ -383,7 +193,7 @@ epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
 history = model.fit(x=[X_1_train,X_2_train],
-    y=y_train,#[y_lon_train,y_lat_train],
+    y=y_train,
     verbose=True, batch_size=100,
     epochs=EPOCHS,
     validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]),
@@ -397,5 +207,8 @@ model.save(MODEL_OUTPUT_FN)
 # Erase Model Checkpoint file
 if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    import shutil
-    shutil.rmtree(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
+    try:
+        import shutil
+        shutil.rmtree(MODEL_OUTPUT_FN + ".part")
+    except OSError: # the checkpoint is a file or a directory depending on the Keras version
+        os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
diff --git a/generate_dataset.py b/generate_dataset.py
new file mode 100644
index 0000000..4e873df
--- /dev/null
+++ b/generate_dataset.py
@@ -0,0 +1,148 @@
+import argparse
+
+import pandas as pd
+import numpy as np
+
+from helpers import read_geonames
+from lib.geo import latlon2healpix
+
+from tqdm import tqdm
+from sklearn.model_selection import train_test_split
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("geonames_dataset")
+parser.add_argument("wikipedia_dataset")
+parser.add_argument("geonames_hierarchy_data")
+parser.add_argument("--cooc-sampling", default=4, type=int)
+parser.add_argument("--adj-sampling", default=4, type=int)
+parser.add_argument("--adj-nside", default=128, type=int)
+parser.add_argument("--split-nside", default=128, type=int)
+parser.add_argument("--split-method", default="per_pair", type=str, choices="per_pair per_place".split())
+
+args = parser.parse_args()#("../data/geonamesData/FR.txt ../data/wikipedia/cooccurrence_FR.txt ../data/geonamesData/hierarchy.txt".split())
+
+PREFIX = args.geonames_dataset.split("/")[-1].split(".")[0] # Ouch !
+
+# LOAD DATA
+geonames_data = read_geonames(args.geonames_dataset)
+wikipedia_data = pd.read_csv(args.wikipedia_dataset, sep="\t")
+geonames_hierarchy_data = pd.read_csv(args.geonames_hierarchy_data, sep="\t", header=None,
+                                      names="parentId,childId,type".split(",")).fillna("")
+
+# Add IDs for the Wikipedia Cooc Dataset
+min_id = geonames_data.geonameid.max() + 1
+max_id = min_id + len(wikipedia_data)
+wikipedia_data["geonameid"] = np.arange(min_id, max_id)
+
+# Healpix cell id computation
+geonames_data["adj_split"] = geonames_data.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.adj_nside),
+                                                 axis=1)
+
+
+def get_adjacent_pairs(dataframe, sampling_nb):
+    """
+    Return pairs of place toponyms that are geographically adjacent.
+
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+        geonames data
+    sampling_nb : int
+        number of adjacent places drawn for each place
+
+    Returns
+    -------
+    list of list
+        [[ID, place toponym, adjacent place toponym, latitude, longitude], ...]
+    """
+    new_pairs = []
+    for ix, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Get Adjacent Toponym Pairs"):
+        healpix_cell = row.adj_split
+        topo_prin = row["name"]
+        lat, lon = row.latitude, row.longitude
+        within_cell = dataframe[dataframe.adj_split == healpix_cell]["name"].values
+        selected = np.random.choice(within_cell, sampling_nb)
+        new_pairs.extend([[row.geonameid, topo_prin, sel, lat, lon] for sel in selected])
+    return new_pairs
+
+
+def get_cooccurrence_pairs(dataframe, sampling_nb):
+    """
+    Return pairs of place toponyms that appear in the same Wikipedia page.
+
+    Parameters
+    ----------
+    dataframe : pandas.DataFrame
+        wikipedia cooccurrence data
+    sampling_nb : int
+        number of co-occurring places drawn for each place
+
+    Returns
+    -------
+    list of list
+        [[ID, place toponym, co-occurring place toponym, latitude, longitude], ...]
+    """
+    new_pairs = []
+    dataframe["interlinks"] = dataframe.interlinks.apply(lambda x: np.random.choice(x.split("|"), sampling_nb))
+    for ix, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Get Cooccurrent Toponym Pairs"):
+        topo_prin = row.title
+        lat, lon = row.latitude, row.longitude
+        new_pairs.extend([[row.geonameid, topo_prin, sel, lat, lon] for sel in row["interlinks"]])
+    return new_pairs
+
+
+def get_inclusion_pairs(geoname_df, hierarchy_df):
+    """
+    Return pairs of place toponyms that share an inclusion relationship. Ex. Paris, France (Paris geometry is included in France geometry)
+
+    Parameters
+    ----------
+    geoname_df : pandas.DataFrame
+        geonames data
+    hierarchy_df : pandas.DataFrame
+        geonames hierarchy data
+
+    Returns
+    -------
+    list of list
+        [[ID, place toponym, parent place toponym, latitude, longitude], ...]
+    """
+    geonamesIDS = set(geoname_df.geonameid.values)
+    id_label = dict(geonames_data["geonameid name".split()].values)
+    id_lat = dict(geonames_data["geonameid latitude".split()].values)
+    id_lon = dict(geonames_data["geonameid longitude".split()].values)
+    filter_mask = (hierarchy_df.childId.isin(geonamesIDS) & hierarchy_df.parentId.isin(geonamesIDS))
+    pairs_id = hierarchy_df[filter_mask]["childId parentId".split()].values.tolist()
+    return [[p[0], id_label[p[0]], id_label[p[1]], id_lat[p[0]], id_lon[p[0]]] for p in pairs_id]
+
+# EXTRACT PAIRS FROM INPUT DATA
+cooc_pairs = pd.DataFrame(get_cooccurrence_pairs(wikipedia_data, args.cooc_sampling),
+                          columns="ID toponym toponym_context latitude longitude".split())
+adjacent_pairs = pd.DataFrame(get_adjacent_pairs(geonames_data, args.adj_sampling),
+                              columns="ID toponym toponym_context latitude longitude".split())
+inclusion_pairs = pd.DataFrame(get_inclusion_pairs(geonames_data, geonames_hierarchy_data),
+                               columns="ID toponym toponym_context latitude longitude".split())
+
+# FOR EACH PAIR, COMPUTE THE HEALPIX CELL ID OF THE ASSOCIATED COORDINATES
+cooc_pairs["hp_split"] = cooc_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside), axis=1)
+adjacent_pairs["hp_split"] = adjacent_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside),
+                                                  axis=1)
+inclusion_pairs["hp_split"] = inclusion_pairs.apply(lambda x: latlon2healpix(x.latitude, x.longitude, args.split_nside),
+                                                    axis=1)
+
+# SPLIT DATASETS BETWEEN TRAIN AND TEST GEOGRAPHICALLY
+field = "hp_split"
+if args.split_method == "per_place":
+    field = "ID"
+
+for df in [cooc_pairs, adjacent_pairs]:
+    df_train, _ = train_test_split(df, stratify=df[field].values, test_size=0.33)
+    df["split"] = "test"
+    df.loc[df_train.index.values, "split"] = "train"
+
+inc_train, _ = train_test_split(inclusion_pairs, test_size=0.33)
+inclusion_pairs["split"] = "test"
+inclusion_pairs.loc[inc_train.index.values, "split"] = "train"
+
+# SAVE DATA
+inclusion_pairs.to_csv("{0}_inclusion.csv".format(PREFIX), sep="\t")
+adjacent_pairs.to_csv("{0}_adjacent.csv".format(PREFIX), sep="\t")
+cooc_pairs.to_csv("{0}_cooc.csv".format(PREFIX), sep="\t")
diff --git a/lib/datageneratorv4.py b/lib/datageneratorv4.py
new file mode 100644
index 0000000..f413a63
--- /dev/null
+++ b/lib/datageneratorv4.py
@@ -0,0 +1,24 @@
+import os
+from gzip import GzipFile
+
+import keras
+from keras.utils import to_categorical
+import numpy as np
+import pandas as pd
+
+from .geo import zero_one_encoding
+
+from helpers import parse_title_wiki,read_geonames
+from gensim.models.keyedvectors import KeyedVectors
+
+from sklearn.preprocessing import LabelEncoder
+
+
+class DataGenerator(keras.utils.Sequence):
+    def __init__(self,*dataset):
+        pass
+    def __next__(self):
+        pass
+    def isOver(self):
+        pass
+
diff --git a/parser_config/toponym_combination_embedding_v3.json b/parser_config/toponym_combination_embedding_v3.json
new file mode 100644
index 0000000..37f9ae5
--- /dev/null
+++ b/parser_config/toponym_combination_embedding_v3.json
@@ -0,0 +1,18 @@
+{
+    "description": "Toponym Combination",
+    "args": [
+        { "short": "dataset_name", "help": "Name of the dataset, used as prefix for the output files." },
+        { "short": "geoname_inclusion", "help": "Filepath of the inclusion toponym pairs file produced by generate_dataset.py." },
+        { "short": "geonames_adjacent", "help": "Filepath of the adjacent toponym pairs file produced by generate_dataset.py." },
+        { "long": "wikipedia_cooc", "help": "Filepath of the co-occurrence toponym pairs file produced by generate_dataset.py." },
+        { "short": "-v", "long": "--verbose", "action": "store_true" },
+        { "short": "-i", "long": "--inclusion", "action": "store_true" },
+        { "short": "-a", "long": "--adjacency", "action": "store_true" },
+        { "short": "-w", "long": "--wikipedia", "action": "store_true" },
+        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
+        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
+        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 100 },
+        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
+        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }
+    ]
+}
\ No newline at end of file
-- 
GitLab
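
For reference, given the argparse interface of generate_dataset.py and the v3 parser configuration introduced above, the intended two-step workflow would presumably look as follows. The data paths are taken from the commented-out examples in the diff, and the python3 invocation is an assumption, not part of the patch:

    # 1. Build the toponym pair files; with FR.txt as input this writes FR_inclusion.csv, FR_adjacent.csv and FR_cooc.csv
    python3 generate_dataset.py ../data/geonamesData/FR.txt ../data/wikipedia/cooccurrence_FR.txt ../data/geonamesData/hierarchy.txt --cooc-sampling 4 --adj-sampling 4 --split-method per_pair

    # 2. Train the Bi-LSTM n-gram model on those pairs, with all three relation types enabled
    python3 combination_embeddingsv4.py FR FR_inclusion.csv FR_adjacent.csv FR_cooc.csv -i -a -w

Note that combination_embeddingsv4.py currently hard-codes this argument string in its parse_args() call, so the second command only takes effect once parse_args() is again called without arguments.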