From 1094549646e178d9720aa35d3064688eabf26833 Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Mon, 7 Sep 2020 10:24:19 +0200
Subject: [PATCH] Update, debug, and clean up the repo

---
 .gitignore                             |   3 +
 bert.py                                | 388 ++++++++++++++++++++++++
 combination_embeddings.py              |   8 +-
 combination_embeddings_baselines.py    | 360 +++++++++++++++++++++++
 combination_embeddings_word.py         | 390 +++++++++++++++++++++++++
 combination_embeddingsv2.py            |  47 +--
 combination_embeddingsv3.py            | 216 ++++++++++++++
 desamb_eval.py                         |  16 +-
 desamb_eval_runs.sh                    |   4 +-
 lib/data_generator.py                  |  20 +-
 lib/data_generatorv3.py                | 354 ++++++++++++++++++++++
 lib/geo.py                             |   6 +-
 lib/ngram_index.py                     |   5 +-
 lib/run.py                             |   7 +-
 lib/torch_generator.py                 |  49 ++++
 lib/utils.py                           |  26 +-
 lib/word_index.py                      | 180 ++++++++++++
 predict_toponym_coordinates.py         |   3 +-
 region_embedding.py => region_model.py |   0
 run_train.py                           |   8 +-
 scripts/embeddingngram.py              |   9 +-
 svm_predict_hp.py                      |  36 +++
 train_test_split_cooccurrence_data.py  |  67 +++--
 23 files changed, 2105 insertions(+), 97 deletions(-)
 create mode 100644 bert.py
 create mode 100644 combination_embeddings_baselines.py
 create mode 100644 combination_embeddings_word.py
 create mode 100644 combination_embeddingsv3.py
 create mode 100644 lib/data_generatorv3.py
 create mode 100644 lib/torch_generator.py
 create mode 100644 lib/word_index.py
 rename region_embedding.py => region_model.py (100%)
 create mode 100644 svm_predict_hp.py

diff --git a/.gitignore b/.gitignore
index 0fe655a..10ad02e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -152,3 +152,6 @@ log*
 temp*
 subset*
 time*
+
+
+/data*
\ No newline at end of file
diff --git a/bert.py b/bert.py
new file mode 100644
index 0000000..df3e134
--- /dev/null
+++ b/bert.py
@@ -0,0 +1,388 @@
+# REQUIREMENTS: pandas keras tensorflow torch numpy transformers tqdm
+
+"""
+Based on the article https://mccormickml.com/2019/07/22/BERT-fine-tuning/
+by Chris McCormick.
+
+"""
+import os
+import sys
+import time
+import random
+import argparse
+import datetime
+
+import pandas as pd
+import numpy as np
+
+import tensorflow as tf
+import torch
+
+from tqdm import tqdm
+tqdm.pandas()
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from keras.preprocessing.sequence import pad_sequences
+from transformers import BertTokenizer
+from transformers import BertForSequenceClassification, AdamW, BertConfig
+from transformers import get_linear_schedule_with_warmup
+
+def flat_accuracy(preds, labels):
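+    # e.g. flat_accuracy(np.array([[0.1, 0.9], [0.8, 0.2]]), np.array([1, 1])) == 0.5  (illustrative)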
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
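+    # e.g. format_time(3661.4) -> "1:01:01"  (illustrative)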
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument("train" ,help="TSV with two columns : 'sentence' and 'label'")
+parser.add_argument("test",help="TSV with two columns : 'sentence' and 'label'")
+parser.add_argument("outputdir",help="TSV with two columns : 'sentence' and 'label'")
+parser.add_argument("-e","--epochs",type=int,default=5)
+parser.add_argument("-b","--batch_size",default=32,type=int)
+
+args = parser.parse_args()#("-b 32 -e 10 cooc_adj_bert_train.csv cooc_adj_bert_test.csv output_bert_allcooc_adjsampling3radius20km_batch32_epoch10".split())
+
+if not os.path.exists(args.train) or not os.path.exists(args.test):
+    raise FileNotFoundError("Train or Test filepath is incorrect !")
+
+# Number of training epochs (authors recommend between 2 and 4)
+epochs = args.epochs
+
+# The DataLoader needs to know the batch size for training, so I specify it here.
+# For fine-tuning BERT on a specific task, the authors recommend a batch size of
+# 16 or 32.
+
+batch_size = args.batch_size
+
+# OUTPUT DIR
+output_dir = args.outputdir
+
+if not os.path.exists(args.outputdir):
+    raise FileNotFoundError("{0} directory does not exists ! ".format(args.output_dir))
+if not os.path.isdir(args.outputdir):
+    raise NotADirectoryError("{0} is not a directory".format(args.outputdir))
+
+df_train = pd.read_csv(args.train, sep="\t")
+df_test = pd.read_csv(args.test, sep="\t")
+
+# Get the GPU device name.
+device_name = tf.test.gpu_device_name()
+
+# The device name should look like the following:
+if device_name == '/device:GPU:0':
+    print('Found GPU at: {}'.format(device_name))
+else:
+    raise SystemError('GPU device not found')
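+# NOTE: if no GPU is found the script stops at the check above, so the CPU fallback below is never reached.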
+
+# If there's a GPU available...
+if torch.cuda.is_available():    
+
+    # Tell PyTorch to use the GPU.    
+    device = torch.device("cuda")
+    print('There are %d GPU(s) available.' % torch.cuda.device_count())
+    print('We will use the GPU:', torch.cuda.get_device_name(0))
+# If not...
+else:
+    print('No GPU available, using the CPU instead.')
+    device = torch.device("cpu")
+
+
+# Load the BERT tokenizer.
+print('Loading {0} tokenizer...'.format("bert-base-multilingual-cased"))
+tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased',do_lower_case=False)
+
+"""
+print("Tokenize Input Data")
+df_train["input_ids"] = df_train.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True))
+df_test["input_ids"] = df_test.sentence.progress_apply(lambda x: tokenizer.encode(x,add_special_tokens = True))
+
+
+# Set the maximum sequence length.
+# took the size of the largest sentence
+MAX_LEN = df_train.input_ids.apply(len).max()+2
+
+print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
+print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))
+
+
+df_train["input_ids"] =  pad_sequences(df_train.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist()
+df_test["input_ids"] =  pad_sequences(df_test.input_ids.values, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post").tolist()
+
+df_train["attention_mask"] = df_train.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x] )
+df_test["attention_mask"] = df_test.input_ids.apply(lambda x: [int(token_id > 0) for token_id in x])
+
+train_inputs = torch.tensor(np.array(df_train.input_ids.values.tolist()))
+del df_train["input_ids"]
+validation_inputs = torch.tensor(np.array(df_test.input_ids.values.tolist()))
+del df_test["input_ids"]
+
+train_labels = torch.tensor(np.array(df_train.label.values.tolist()))
+del df_train["label"]
+validation_labels = torch.tensor(np.array(df_test.label.values.tolist()))
+del df_test["label"]
+
+train_masks = torch.tensor(np.array(df_train.attention_mask.values.tolist()))
+del df_train["attention_mask"]
+validation_masks = torch.tensor(np.array(df_test.attention_mask.values.tolist()))
+del df_test["attention_mask"]
+"""
+
+from lib.torch_generator import SentenceDataset
+# Create the DataLoader for training set.
+train_data = SentenceDataset(df_train,tokenizer,batch_size=batch_size)
+#train_sampler = RandomSampler(train_data)
+train_dataloader = DataLoader(train_data,  batch_size=batch_size)#,sampler=train_sampler,)
+"""
+del train_inputs
+del train_masks
+del train_labels
+"""
+# Create the DataLoader for validation set.
+validation_data = SentenceDataset(df_test,tokenizer,batch_size=batch_size)
+#validation_sampler = SequentialSampler(validation_data)
+validation_dataloader = DataLoader(validation_data, batch_size=batch_size)#, sampler=validation_sampler)
+"""
+del validation_inputs
+del validation_masks 
+del validation_labels
+"""
+# Load BertForSequenceClassification, the pretrained BERT model with a single 
+# linear classification layer on top. 
+model = BertForSequenceClassification.from_pretrained(
+    "bert-base-multilingual-cased", # Use the 12-layer BERT model, with an uncased vocab.
+    num_labels = max(df_test.label.max(),df_train.label.max())+1, # The number of output labels--2 for binary classification.
+                    # You can increase this for multi-class tasks.   
+    output_attentions = False, # Whether the model returns attentions weights.
+    output_hidden_states = False, # Whether the model returns all hidden-states.
+)
+
+# Tell pytorch to run this model on the GPU.
+model.cuda()
+
+optimizer = AdamW(model.parameters(),
+                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
+                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
+                )
+
+
+
+# Total number of training steps is number of batches * number of epochs.
+total_steps = len(train_data) * epochs
+
+# Create the learning rate scheduler.
+scheduler = get_linear_schedule_with_warmup(optimizer, 
+                                            num_warmup_steps = 0, # Default value in run_glue.py
+                                            num_training_steps = total_steps)
+
+
+
+# Set the seed value all over the place to make this reproducible.
+seed_val = 42
+
+random.seed(seed_val)
+np.random.seed(seed_val)
+torch.manual_seed(seed_val)
+torch.cuda.manual_seed_all(seed_val)
+
+# Store the average loss after each epoch so I can plot them.
+loss_values = []
+
+history = []
+# For each epoch...
+for epoch_i in range(0, epochs):
+    epoch_data={}
+    
+    # ========================================
+    #               Training
+    # ========================================
+    
+    # Perform one full pass over the training set.
+
+    print("")
+    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+    print('Training...')
+
+    # Measure how long the training epoch takes.
+    t0 = time.time()
+
+    # Reset the total loss for this epoch.
+    total_loss = 0
+
+    # Put the model into training mode.
+    model.train()
+
+    # For each batch of training data...
+    for step, batch in enumerate(train_dataloader):
+
+        # Progress update every 100 batches.
+        if step % 100 == 0 and not step == 0:
+            # Calculate elapsed time in minutes.
+            elapsed = format_time(time.time() - t0)
+            
+            # Report progress (sys.stdout.write avoids an unnecessary newline).
+            sys.stdout.write('\r  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
+
+        # Unpack this training batch from the dataloader. 
+        #
+        # As I unpack the batch, I'll also copy each tensor to the GPU using the 
+        # `to` method.
+        #
+        # `batch` contains three pytorch tensors:
+        #   [0]: input ids 
+        #   [1]: attention masks
+        #   [2]: labels 
+        b_input_ids = batch[0].to(device)
+        b_input_mask = batch[1].to(device)
+        b_labels = batch[2].to(device)
+
+        # Always clear any previously calculated gradients before performing a
+        # backward pass. PyTorch doesn't do this automatically because 
+        # accumulating the gradients is "convenient while training RNNs". 
+        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+        model.zero_grad()        
+
+        # Perform a forward pass (evaluate the model on this training batch).
+        # This will return the loss (rather than the model output) because I
+        # have provided the `labels`.
+        # The documentation for this `model` function is here: 
+        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+        outputs = model(b_input_ids, 
+                    token_type_ids=None, 
+                    attention_mask=b_input_mask, 
+                    labels=b_labels)
+        
+        # The call to `model` always returns a tuple, so I need to pull the 
+        # loss value out of the tuple.
+        loss = outputs[0]
+
+        # Accumulate the training loss over all of the batches so that I can
+        # calculate the average loss at the end. `loss` is a Tensor containing a
+        # single value; the `.item()` function just returns the Python value 
+        # from the tensor.
+        
+        total_loss += loss.item()
+
+        # Perform a backward pass to calculate the gradients.
+        loss.backward()
+
+        # Clip the norm of the gradients to 1.0.
+        # This is to help prevent the "exploding gradients" problem.
+        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+
+        # Update parameters and take a step using the computed gradient.
+        # The optimizer dictates the "update rule"--how the parameters are
+        # modified based on their gradients, the learning rate, etc.
+        optimizer.step()
+
+        # Update the learning rate.
+        scheduler.step()
+
+    # Calculate the average loss over the training data.
+    avg_train_loss = total_loss / len(train_dataloader)            
+    
+    # Store the loss value for plotting the learning curve.
+    loss_values.append(avg_train_loss)
+
+    print("")
+    print("  Average training loss: {0:.2f}".format(avg_train_loss))
+    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
+    epoch_data["loss"]=avg_train_loss
+    epoch_data["epoch_duration"] = time.time() - t0
+
+    # ========================================
+    #               Validation
+    # ========================================
+    # After the completion of each training epoch, measure the performance on
+    # the validation set.
+
+    print("")
+    print("Running Validation...")
+
+    t0 = time.time()
+
+    # Put the model in evaluation mode--the dropout layers behave differently
+    # during evaluation.
+    model.eval()
+
+    # Tracking variables 
+    eval_loss, eval_accuracy = 0, 0
+    nb_eval_steps, nb_eval_examples = 0, 0
+
+    # Evaluate data for one epoch
+    for batch in validation_dataloader:
+        
+        # Add batch to GPU
+        batch = tuple(t.to(device) for t in batch)
+        
+        # Unpack the inputs from dataloader
+        b_input_ids, b_input_mask, b_labels = batch
+        
+        # Telling the model not to compute or store gradients, saving memory and
+        # speeding up validation
+        with torch.no_grad():        
+
+            # Forward pass, calculate logit predictions.
+            # This will return the logits rather than the loss because we have
+            # not provided labels.
+            # token_type_ids is the same as the "segment ids", which 
+            # differentiates sentence 1 and 2 in 2-sentence tasks.
+            # The documentation for this `model` function is here: 
+            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+            outputs = model(b_input_ids, 
+                            token_type_ids=None, 
+                            attention_mask=b_input_mask)
+        
+        # Get the "logits" output by the model. The "logits" are the output
+        # values prior to applying an activation function like the softmax.
+        logits = outputs[0]
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+        
+        # Calculate the accuracy for this batch of test sentences.
+        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
+        
+        # Accumulate the total accuracy.
+        eval_accuracy += tmp_eval_accuracy
+
+        # Track the number of batches
+        nb_eval_steps += 1
+
+    
+    # Report the final accuracy for this validation run.
+    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
+    print("  Validation took: {:}".format(format_time(time.time() - t0)))
+    epoch_data["accuracy"] = eval_accuracy/nb_eval_steps
+    epoch_data["validation_duration"] = time.time() - t0
+    history.append(epoch_data)
+print("")
+print("Training complete!")
+
+print("Save History")
+pd.DataFrame(history).to_csv(output_dir+"/history_bert.csv",sep="\t")
+
+
+
+# Create output directory if needed
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+
+print("Saving model to %s" % output_dir)
+
+# Save a trained model, configuration and tokenizer using `save_pretrained()`.
+# They can then be reloaded using `from_pretrained()`
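+# Minimal reload sketch (illustrative, assuming the same output_dir):
+#   model = BertForSequenceClassification.from_pretrained(output_dir)
+#   tokenizer = BertTokenizer.from_pretrained(output_dir)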
+model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+model_to_save.save_pretrained(output_dir)
+tokenizer.save_pretrained(output_dir)
\ No newline at end of file
diff --git a/combination_embeddings.py b/combination_embeddings.py
index 599ad1a..094578e 100644
--- a/combination_embeddings.py
+++ b/combination_embeddings.py
@@ -47,7 +47,7 @@ def get_new_ids(cooc_data,id_first_value):
     Returns
     -------
     dict
-        new ids for each toponyms
+        new ids for each toponym
     """
     topo_id = {}
     id_ = id_first_value
@@ -76,12 +76,13 @@ args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")
 #################################################
 ############# MODEL TRAINING PARAMETER ##########
 #################################################
+MODEL_NAME = "Bi-LSTM_NGRAM"
 NGRAM_SIZE = args.ngram_size
 ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
 ITER_ADJACENCY = args.adjacency_iteration
 COOC_SAMPLING_NUMBER = args.cooc_sample_size
-WORDVEC_ITER = 1 #args.ngram_word2vec_iter
+WORDVEC_ITER = args.ngram_word2vec_iter
 EMBEDDING_DIM = 256
 #################################################
 ########## FILENAME VARIABLE ####################
@@ -121,6 +122,7 @@ HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
 from lib.utils import MetaDataSerializer
 
 meta_data = MetaDataSerializer(
+    MODEL_NAME,
     DATASET_NAME,
     REL_CODE,
     COOC_SAMPLING_NUMBER,
@@ -209,6 +211,8 @@ if args.wikipedia_cooc:
     cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
     filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
     train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
+    if not "title" in train_cooc_indices:
+        train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
     train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
     test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
 
diff --git a/combination_embeddings_baselines.py b/combination_embeddings_baselines.py
new file mode 100644
index 0000000..b4c495b
--- /dev/null
+++ b/combination_embeddings_baselines.py
@@ -0,0 +1,360 @@
+# Base module 
+import re
+import os
+import json
+
+# Structure
+import pandas as pd
+import numpy as np
+import geopandas as gpd
+
+import tensorflow as tf
+
+# Geometry
+from shapely.geometry import Point
+
+# Custom module
+from helpers import read_geonames
+from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.ngram_index import NgramIndex
+from lib.utils import ConfigurationReader
+from lib.metrics import lat_accuracy,lon_accuracy
+
+# Logging
+from tqdm import tqdm
+import logging
+from helpers import parse_title_wiki,EpochTimer
+
+logging.getLogger('gensim').setLevel(logging.WARNING)
+
+def get_new_ids(cooc_data,id_first_value):
+    """
+    Return new ids from cooccurrence data
+    
+    Parameters
+    ----------
+    cooc_data : pd.DataFrame
+        cooccurrence data
+    id_first_value : int
+        id beginning value
+    
+    Returns
+    -------
+    dict
+        new ids for each toponym
+    """
+    topo_id = {}
+    id_ = id_first_value
+    for title in cooc_data.title.values:
+        if not title in topo_id:
+            id_+=1
+            topo_id[id_]=title
+    for interlinks in cooc_data.interlinks.values:
+        for interlink in interlinks.split("|"):
+            if not interlink in topo_id:
+                id_+=1
+                topo_id[id_]=interlink
+    return topo_id
+
+# LOGGING CONF
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s ', 
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO  
+    )
+
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
+    .parse_args()#("-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt -n 4 --ngram-word2vec-iter 10 -e 100 ../data/geonamesData/US_FR.txt ../data/geonamesData/hierarchy.txt".split())
+
+#
+#################################################
+############# MODEL TRAINING PARAMETER ##########
+#################################################
+MODEL_NAME = "BASELINE"
+NGRAM_SIZE = args.ngram_size
+ACCURACY_TOLERANCE = args.tolerance_value
+EPOCHS = args.epochs
+ITER_ADJACENCY = args.adjacency_iteration
+COOC_SAMPLING_NUMBER = args.cooc_sample_size
+WORDVEC_ITER = args.ngram_word2vec_iter
+EMBEDDING_DIM = 256
+#################################################
+########## FILENAME VARIABLE ####################
+#################################################
+GEONAME_FN = args.geoname_input
+DATASET_NAME = args.geoname_input.split("/")[-1]
+GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
+ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
+        GEONAME_FN,
+        ITER_ADJACENCY,
+        REGION_SUFFIX_FN)
+
+COOC_FN = args.wikipedia_cooc_fn
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    REGION_SUFFIX_FN)
+
+REL_CODE=""
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+    REL_CODE+= "A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+    REL_CODE+= "I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+    REL_CODE+= "C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
+
+from lib.utils import MetaDataSerializer
+
+meta_data = MetaDataSerializer(
+    MODEL_NAME,
+    DATASET_NAME,
+    REL_CODE,
+    COOC_SAMPLING_NUMBER,
+    ITER_ADJACENCY,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    EPOCHS,
+    EMBEDDING_DIM,
+    WORDVEC_ITER,
+    INDEX_FN,
+    MODEL_OUTPUT_FN,
+    HISTORY_FN
+)
+meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
+print(REL_CODE)
+
+#############################################################################################
+################################# LOAD DATA #################################################
+#############################################################################################
+
+# LOAD  Geonames DATA
+logging.info("Load Geonames data...")
+geoname_data = read_geonames(GEONAME_FN).fillna("")
+
+train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
+test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
+
+logging.info("Geonames data loaded!")
+
+# SELECT ENTRY with class == to A and P (Areas and Populated Places)
+filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
+#CLEAR RAM
+del geoname_data
+
+
+# IF REGION
+if args.admin_code_1 != "None":
+    filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
+
+# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
+filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+
+
+
+#############################################################################################
+################################# RETRIEVE RELATIONSHIPS ####################################
+#############################################################################################
+
+
+# INITIALIZE RELATION STORE
+rel_store = []
+
+# Retrieve adjacency relationships
+if args.adjacency:
+    logging.info("Retrieve adjacency relationships ! ")
+
+    if not os.path.exists(ADJACENCY_REL_FILENAME):
+        bounds = get_bounds(filtered) # Required to get adjacency relationships
+        rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
+        json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
+    else:
+        logging.info("Open and load data from previous computation!")
+        rel_store=json.load(open(ADJACENCY_REL_FILENAME))
+
+    logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
+
+# Retrieve inclusion relationships
+if args.inclusion:
+    logging.info("Retrieve inclusion relationships ! ")
+
+    cpt_rel = len(rel_store)
+    rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
+
+    logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
+
+
+
+if args.wikipedia_cooc:
+    logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
+    
+    cooc_data = pd.read_csv(COOC_FN,sep="\t")
+    cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
+    cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
+    id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
+    wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
+    title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
+    cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
+    filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
+    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
+    if not "title" in train_cooc_indices:
+        train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
+    train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
+    test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
+
+    logging.info("Merged with Geonames data !")
+
+    # EXTRACT rel
+    logging.info("Extracting cooccurrence relationships")
+    cpt=0
+    for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
+        for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
+            cpt+=1
+            rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
+    logging.info("Extract {0} cooccurrence relationships !".format(cpt))
+
+
+# STORE ID to name
+geoname2name = dict(filtered["geonameid name".split()].values)
+
+# ENCODING NAME USING N-GRAM SPLITTING
+logging.info("Encoding toponyms to ngram...")
+index = NgramIndex(NGRAM_SIZE)
+
+# Identify all available n-grams
+filtered.name.apply(lambda x : index.split_and_add(x))
+if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
+
+geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
+
+if args.wikipedia_cooc:
+    geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
+
+# SAVE THE INDEX TO REUSE THE MODEL
+index.save(INDEX_FN)
+
+logging.info("Done !")
+
+
+#############################################################################################
+################################# ENCODE COORDINATES ########################################
+#############################################################################################
+
+from lib.geo import latlon2healpix
+
+# Encode each geonames entry coordinates
+geoname_vec = {row.geonameid : latlon2healpix(row.latitude,row.longitude,128) for row in filtered.itertuples()}
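+# Each coordinate pair is mapped to a HEALPix cell id, which the classifiers below treat as a class label.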
+# CLEAR RAM
+del filtered
+
+
+EMBEDDING_DIM = 256
+num_words = len(index.index_ngram) # necessary for the embedding matrix 
+
+logging.info("Preparing Input and Output data...")
+
+
+#############################################################################################
+################################# BUILD TRAIN/TEST DATASETS #################################
+#############################################################################################
+
+X_train,y_train = [],[]
+X_test,y_test = [],[]
+
+from joblib import Parallel,delayed
+from tensorflow.keras.utils import to_categorical
+
+def parse_bow(x):
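+    # Bag-of-ngrams encoding, e.g. parse_bow([2, 5, 2]) -> a vector of length index.cpt+1
+    # with value 2 at position 2 and 1 at position 5 (illustrative, assumes index.cpt >= 5)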
+    return np.sum(to_categorical(x,num_classes=index.cpt+1),axis=0)
+
+for couple in rel_store:
+    geonameId_1,geonameId_2 = couple[0],couple[1]
+    if not geonameId_1 in geoname2encodedname:
+        continue
+    top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
+    if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
+        
+        X_train.append(top1 + top2)
+        y_train.append(geoname_vec[geonameId_1])
+    
+    else:
+        X_test.append(top1 + top2)
+        y_test.append(geoname_vec[geonameId_1])
+
+# NUMPYZE inputs and output lists
+X_train = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_train))
+X_train = np.array(X_train)
+y_train = np.array(y_train)
+
+X_test = Parallel(n_jobs=4,backend="multiprocessing")(delayed(parse_bow)(x) for x in tqdm(X_test))
+X_test = np.array(X_test)
+y_test = np.array(y_test)
+
+logging.info("Data prepared !")
+
+
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+from scipy.sparse import csr_matrix
+from sklearn import svm
+from sklearn.naive_bayes import GaussianNB,MultinomialNB
+from sklearn.metrics import classification_report
+from sklearn import tree
+from sklearn.ensemble import RandomForestClassifier
+
+X_train = csr_matrix(X_train)
+X_test = csr_matrix(X_test)
+
+print(REL_CODE)
+oupt = open("log_baseline_US_FR_{0}.txt".format(REL_CODE),'a')
+oupt.write("------")
+
+from joblib import dump
+import sys
+f=True
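+# NOTE: with f left at True, the loop below dumps the linear-kernel SVM and exits early,
+# so the poly kernel, Naive Bayes, decision tree and random forest runs are skipped.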
+
+for kernel in ["rbf","linear","poly"]:
+    clf = svm.SVC(kernel=kernel)
+    clf.fit(X_train,y_train)
+    if kernel =="linear" and f:
+        dump(clf,"SVMLINEAR_US_FR_{0}.bin".format(REL_CODE))
+        sys.exit()
+    y_pred  = clf.predict(X_test)
+    oupt.write("Results for : "+"SVM with the kernel "+kernel)
+    oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"]))
+    oupt.flush()
+
+for alg in (GaussianNB,MultinomialNB):
+    clf = alg()
+    clf.fit(X_train.toarray(),y_train)
+    y_pred  = clf.predict(X_test.toarray())
+    oupt.write("Results for : "+"NaiveBayes with the alg "+alg.__name__)
+    oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"])+"\n")
+    oupt.flush()
+
+clf = tree.DecisionTreeClassifier()
+clf.fit(X_train,y_train)
+y_pred  = clf.predict(X_test)
+oupt.write("Results for : "+"Decision Tree classifier")
+oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"]))
+oupt.flush()
+
+clf = RandomForestClassifier(max_depth=8, random_state=0)
+clf.fit(X_train,y_train)
+y_pred  = clf.predict(X_test)
+oupt.write("Results for : "+"Random Forest classifier")
+oupt.write(str(classification_report(y_test,y_pred,output_dict =True)["accuracy"]))
+oupt.flush()
+
+oupt.close()
\ No newline at end of file
diff --git a/combination_embeddings_word.py b/combination_embeddings_word.py
new file mode 100644
index 0000000..762780d
--- /dev/null
+++ b/combination_embeddings_word.py
@@ -0,0 +1,390 @@
+# Base module 
+import re
+import os
+import json
+
+# Structure
+import pandas as pd
+import numpy as np
+import geopandas as gpd
+
+# DEEPL module
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
+from keras.models import Model
+from keras import backend as K
+from keras.callbacks import ModelCheckpoint
+
+import tensorflow as tf
+
+# Geometry
+from shapely.geometry import Point
+
+# Custom module
+from helpers import read_geonames
+from lib.geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
+from lib.utils import ConfigurationReader
+from lib.metrics import lat_accuracy,lon_accuracy
+
+# Logging
+from tqdm import tqdm
+import logging
+from helpers import parse_title_wiki,EpochTimer
+
+logging.getLogger('gensim').setLevel(logging.WARNING)
+
+def get_new_ids(cooc_data,id_first_value):
+    """
+    Return new ids from cooccurrence data
+    
+    Parameters
+    ----------
+    cooc_data : pd.DataFrame
+        cooccurrence data
+    id_first_value : int
+        id beginning value
+    
+    Returns
+    -------
+    dict
+        new ids for each toponym
+    """
+    topo_id = {}
+    id_ = id_first_value
+    for title in cooc_data.title.values:
+        if not title in topo_id:
+            id_+=1
+            topo_id[id_]=title
+    for interlinks in cooc_data.interlinks.values:
+        for interlink in interlinks.split("|"):
+            if not interlink in topo_id:
+                id_+=1
+                topo_id[id_]=interlink
+    return topo_id
+
+# LOGGING CONF
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s ', 
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO  
+    )
+
+args = ConfigurationReader("./parser_config/toponym_combination_embedding.json")\
+    .parse_args()#("-w --wikipedia-cooc-fn subsetCoocALL.csv ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
+
+#
+#################################################
+############# MODEL TRAINING PARAMETER ##########
+#################################################
+MODEL_NAME = "Bi-LSTM_WORD"
+NGRAM_SIZE = args.ngram_size
+ACCURACY_TOLERANCE = args.tolerance_value
+EPOCHS = args.epochs
+ITER_ADJACENCY = args.adjacency_iteration
+COOC_SAMPLING_NUMBER = args.cooc_sample_size
+WORDVEC_ITER = args.ngram_word2vec_iter
+EMBEDDING_DIM = 256
+
+
+#################################################
+########## FILENAME VARIABLE ####################
+#################################################
+
+GEONAME_FN = args.geoname_input
+DATASET_NAME = args.geoname_input.split("/")[-1]
+GEONAMES_HIERARCHY_FN = args.geoname_hierachy_input
+REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
+ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
+        GEONAME_FN,
+        ITER_ADJACENCY,
+        REGION_SUFFIX_FN)
+
+COOC_FN = args.wikipedia_cooc_fn
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}_{5}".format(MODEL_NAME,
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    REGION_SUFFIX_FN)
+
+REL_CODE=""
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+    REL_CODE+= "A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+    REL_CODE+= "I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+    REL_CODE+= "C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
+
+from lib.utils import MetaDataSerializer
+
+meta_data = MetaDataSerializer(
+    MODEL_NAME,
+    DATASET_NAME,
+    REL_CODE,
+    COOC_SAMPLING_NUMBER,
+    ITER_ADJACENCY,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    EPOCHS,
+    EMBEDDING_DIM,
+    WORDVEC_ITER,
+    INDEX_FN,
+    MODEL_OUTPUT_FN,
+    HISTORY_FN
+)
+meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
+
+#############################################################################################
+################################# LOAD DATA #################################################
+#############################################################################################
+
+# LOAD  Geonames DATA
+logging.info("Load Geonames data...")
+geoname_data = read_geonames(GEONAME_FN).fillna("")
+
+train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values)
+test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values)
+
+logging.info("Geonames data loaded!")
+
+# SELECT ENTRY with class == to A and P (Areas and Populated Places)
+filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() # Only take area and populated places
+#CLEAR RAM
+del geoname_data
+
+
+# IF REGION
+if args.admin_code_1 != "None":
+    filtered = filtered[filtered.admin1_code == args.admin_code_1].copy()
+
+# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS
+filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD
+
+
+
+#############################################################################################
+################################# RETRIEVE RELATIONSHIPS ####################################
+#############################################################################################
+
+
+# INITIALIZE RELATION STORE
+rel_store = []
+
+# Retrieve adjacency relationships
+if args.adjacency:
+    logging.info("Retrieve adjacency relationships ! ")
+
+    if not os.path.exists(ADJACENCY_REL_FILENAME):
+        bounds = get_bounds(filtered) # Required to get adjacency relationships
+        rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY))
+        json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w'))
+    else:
+        logging.info("Open and load data from previous computation!")
+        rel_store=json.load(open(ADJACENCY_REL_FILENAME))
+
+    logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store)))
+
+# Retrieve inclusion relationships
+if args.inclusion:
+    logging.info("Retrieve inclusion relationships ! ")
+
+    cpt_rel = len(rel_store)
+    rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN))
+
+    logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel))
+
+
+
+if args.wikipedia_cooc:
+    logging.info("Load Wikipedia Cooccurrence data and merge with geonames")
+    
+    cooc_data = pd.read_csv(COOC_FN,sep="\t")
+    cooc_data["title"] = cooc_data.title.apply(parse_title_wiki)
+    cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki)
+    id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max())
+    wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()}
+    title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))}
+    cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x])
+    filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy()))
+    train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t")
+    if not "title" in train_cooc_indices:
+        train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv")
+    train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
+    test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values))
+
+    logging.info("Merged with Geonames data !")
+
+    # EXTRACT rel
+    logging.info("Extracting cooccurrence relationships")
+    cpt=0
+    for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"):
+        for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER):
+            cpt+=1
+            rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]])
+    logging.info("Extract {0} cooccurrence relationships !".format(cpt))
+
+
+# STORE ID to name
+geoname2name = dict(filtered["geonameid name".split()].values)
+
+# ENCODING NAMES USING WORD SPLITTING
+logging.info("Encoding toponyms to word sequences...")
+index = WordIndex()
+
+# Identify all available words
+filtered.name.apply(lambda x : index.split_and_add(x))
+if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id]
+
+geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association
+
+if args.wikipedia_cooc:
+    geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()})
+
+# SAVE THE INDEX TO REUSE THE MODEL
+index.save(INDEX_FN)
+
+logging.info("Done !")
+
+
+#############################################################################################
+################################# ENCODE COORDINATES ########################################
+#############################################################################################
+
+
+
+# Encode each geonames entry coordinates
+geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()}
+# CLEAR RAM
+del filtered
+
+
+EMBEDDING_DIM = 256
+num_words = len(index.index_word) # necessary for the embedding matrix 
+
+logging.info("Preparing Input and Output data...")
+
+
+#############################################################################################
+################################# BUILD TRAIN/TEST DATASETS #################################
+#############################################################################################
+
+X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[]
+X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[]
+
+for couple in rel_store:
+    geonameId_1,geonameId_2 = couple[0],couple[1]
+    if not geonameId_1 in geoname2encodedname:
+        continue
+    top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2]
+    if geonameId_1 in train_indices: #and geonameId_2 in train_indices:
+        
+        X_1_train.append(top1)
+        X_2_train.append(top2)
+
+        y_lon_train.append(geoname_vec[geonameId_1][0])
+        y_lat_train.append(geoname_vec[geonameId_1][1])
+    
+    else:
+        X_1_test.append(top1)
+        X_2_test.append(top2)
+
+        y_lon_test.append(geoname_vec[geonameId_1][0])
+        y_lat_test.append(geoname_vec[geonameId_1][1])
+
+# NUMPYZE inputs and output lists
+X_1_train = np.array(X_1_train)
+X_2_train = np.array(X_2_train)
+y_lat_train = np.array(y_lat_train)
+y_lon_train = np.array(y_lon_train)
+
+X_1_test = np.array(X_1_test)
+X_2_test = np.array(X_2_test)
+y_lat_test = np.array(y_lat_test)
+y_lon_test = np.array(y_lon_test)
+
+logging.info("Data prepared !")
+
+
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+#############################################################################################
+################################# WORD EMBEDDINGS ###########################################
+#############################################################################################
+
+
+logging.info("Generating N-GRAM Embedding...")
+embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER)
+logging.info("Embedding generated !")
+
+#############################################################################################
+################################# MODEL DEFINITION ##########################################
+#############################################################################################
+
+
+input_1 = Input(shape=(index.max_len,))
+input_2 = Input(shape=(index.max_len,))
+
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
+
+x1 = embedding_layer(input_1)
+x2 = embedding_layer(input_2)
+
+# Each Bi-LSTM learns on one permutation of the input toponyms
+x1 = Bidirectional(LSTM(98))(x1)
+x2 = Bidirectional(LSTM(98))(x2)
+
+x = concatenate([x1,x2])#,x3])
+
+x1 = Dense(500,activation="relu")(x)
+# x1 = Dropout(0.3)(x1)
+x1 = Dense(500,activation="relu")(x1)
+# x1 = Dropout(0.3)(x1)
+
+x2 = Dense(500,activation="relu")(x)
+# x2 = Dropout(0.3)(x2)
+x2 = Dense(500,activation="relu")(x2)
+# x2 = Dropout(0.3)(x2)
+
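+# The two sigmoid outputs predict longitude and latitude rescaled to [0,1] by zero_one_encoding above.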
+output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
+output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
+
+model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
+
+model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
+
+
+#############################################################################################
+################################# TRAINING LAUNCH ###########################################
+#############################################################################################
+
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
+
+epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
+
+history = model.fit(x=[X_1_train,X_2_train],
+    y=[y_lon_train,y_lat_train],
+    verbose=True, batch_size=100,
+    epochs=EPOCHS,
+    validation_data=([X_1_test,X_2_test],[y_lon_test,y_lat_test]),
+    callbacks=[checkpoint,epoch_timer])
+
+
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv(HISTORY_FN)
+
+model.save(MODEL_OUTPUT_FN)
+
+# Erase Model Checkpoint file
+if os.path.exists(MODEL_OUTPUT_FN + ".part"):
+    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
diff --git a/combination_embeddingsv2.py b/combination_embeddingsv2.py
index ef1de05..96fefc7 100644
--- a/combination_embeddingsv2.py
+++ b/combination_embeddingsv2.py
@@ -3,9 +3,10 @@ import os
 
 # Structure
 import pandas as pd
+import numpy as np
 
 # DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
 from keras.models import Model
 from keras.callbacks import ModelCheckpoint
 from tensorflow.keras.layers import Lambda
@@ -140,7 +141,7 @@ num_words = len(index.index_ngram)
 #############################################################################################
 
 embedding_weights = load_embedding(args.embedding_fn) 
-
+EMBEDDING_DIM = len(embedding_weights[0])
 
 #############################################################################################
 ################################# MODEL DEFINITION ##########################################
@@ -153,49 +154,33 @@ input_2 = Input(shape=(index.max_len,))
 
 embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
 
-x1 = Dropout(0.1)(embedding_layer(input_1))
-x2 = Dropout(0.1)(embedding_layer(input_2))
+x1 = embedding_layer(input_1)
+x2 = embedding_layer(input_2)
 
 # Each LSTM learn on a permutation of the input toponyms
-biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
+biLSTM = Bidirectional(LSTM(64,activation="pentanh", recurrent_activation="pentanh"))
 x1 = biLSTM(x1)
 x2 = biLSTM(x2)
 x = concatenate([x2,x1])#,x3])
 
-aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
-
-x1 = Dense(5000,
-    activation="relu",
-    kernel_regularizer=regularizers.l2(0.01)
-    )(x)
-x1 = Dropout(0.3)(x1)
-x1 = Dense(5000,
-    activation="relu",
-    kernel_regularizer=regularizers.l2(0.01)
-    )(x1)
-x1 = Dropout(0.3)(x1)
-
-x2 = Dense(5000,
-    activation="relu",
-    kernel_regularizer=regularizers.l2(0.01)
-    )(x)
-x2 = Dropout(0.3)(x2)
-x2 = Dense(5000,
-    activation="relu",
-    kernel_regularizer=regularizers.l2(0.01)
-    )(x2)
-x2 = Dropout(0.3)(x2)
+x1 = Dense(1000,activation="pentanh")(x)
+# x1 = Dropout(0.3)(x1)
+x1 = Dense(1000,activation="pentanh")(x1)
+# x1 = Dropout(0.3)(x1)
 
+x2 = Dense(1000,activation="pentanh")(x)
+# x2 = Dropout(0.3)(x2)
+x2 = Dense(1000,activation="pentanh")(x2)
+# x2 = Dropout(0.3)(x2)
 
 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 
 output = concatenate([output_lon,output_lat],name="output_layer")
 
-model = Model(inputs = [input_1,input_2], outputs = [output,aux_layer])#input_3
-
-model.compile(loss={"output_layer":haversine_tf_1circle,"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy","output_layer":accuracy_k(ACCURACY_TOLERANCE)})
+model = Model(inputs = [input_1,input_2], outputs = output)#input_3
 
+model.compile(loss={"output_layer":haversine_tf_1circle}, optimizer='adam',metrics={"output_layer":accuracy_k(ACCURACY_TOLERANCE)})
 
 #############################################################################################
 ################################# TRAINING LAUNCH ###########################################
diff --git a/combination_embeddingsv3.py b/combination_embeddingsv3.py
new file mode 100644
index 0000000..ccb1031
--- /dev/null
+++ b/combination_embeddingsv3.py
@@ -0,0 +1,216 @@
+# Base module 
+import os
+
+# Structure
+import pandas as pd
+import numpy as np
+
+# DEEPL module
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint
+from tensorflow.keras.layers import Lambda
+import keras.backend as K 
+import tensorflow as tf 
+from lib.custom_layer import *
+
+# Custom module
+from lib.ngram_index import NgramIndex
+from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
+from lib.metrics import lat_accuracy,lon_accuracy
+from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
+
+# Logging
+import logging
+
+logging.getLogger('gensim').setLevel(logging.WARNING)
+
+from helpers import EpochTimer
+
+# LOGGING CONF
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s ', 
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO  
+    )
+
+args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
+    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+
+#.parse_args("-w  --wikipedia-cooc-fn  subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+
+#
+#################################################
+############# MODEL TRAINING PARAMETER ##########
+#################################################
+NGRAM_SIZE = args.ngram_size
+ACCURACY_TOLERANCE = args.k_value
+EPOCHS = args.epochs
+ADJACENCY_SAMPLING = args.adjacency_sample
+COOC_SAMPLING = args.cooc_sample
+WORDVEC_ITER = 50
+EMBEDDING_DIM = args.dimension
+BATCH_SIZE = args.batch_size
+#################################################
+########## FILENAME VARIABLE ####################
+#################################################
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+GEONAME_FN = args.geoname_input
+DATASET_NAME = args.geoname_input.split("/")[-1]
+GEONAMES_HIERARCHY_FN = args.inclusion_fn
+ADJACENCY_REL_FILENAME = args.adjacency_fn
+COOC_FN = args.wikipedia_cooc_fn
+
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE)
+
+REL_CODE=""
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+    REL_CODE+= "A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+    REL_CODE+= "I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+    REL_CODE+= "C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
+
+
+meta_data = MetaDataSerializer(
+    DATASET_NAME,
+    REL_CODE,
+    COOC_SAMPLING,
+    ADJACENCY_SAMPLING,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    EPOCHS,
+    EMBEDDING_DIM,
+    WORDVEC_ITER,
+    INDEX_FN,
+    MODEL_OUTPUT_FN,
+    HISTORY_FN
+)
+meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
+
+
+### PUT DATASRC + GENERATOR
+
+index = NgramIndex.load(args.ngram_index_fn)
+
+train_src = []
+test_src = []
+
+class_encoder = LabelEncoder()
+
+if args.wikipedia_cooc:
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=COOC_SAMPLING,use_healpix=False))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=COOC_SAMPLING,use_healpix=False))
+
+if args.adjacency:
+    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    train_src.append(a_train)
+    test_src.append(a_test)
+
+if args.inclusion:
+    i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
+    i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
+    train_src.append(i_train)
+    test_src.append(i_test)
+#Adjacency
+
+print("Number of classes:",class_encoder.get_num_classes())
+
+d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE) 
+d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE) 
+
+num_words = len(index.index_ngram)  
+
+#############################################################################################
+################################# NGRAM EMBEDDINGS ##########################################
+#############################################################################################
+
+embedding_weights = load_embedding(args.embedding_fn) 
+EMBEDDING_DIM = len(embedding_weights[0])
+
+#############################################################################################
+################################# MODEL DEFINITION ##########################################
+#############################################################################################
+
+from keras import regularizers
+####
+
+input_1 = Input(shape=(index.max_len,))
+input_2 = Input(shape=(index.max_len,))
+
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
+
+x1 = embedding_layer(input_1)
+x2 = embedding_layer(input_2)
+
+# A shared bidirectional GRU processes both input toponyms
+biLSTM = Bidirectional(GRU(128,activation="pentanh", recurrent_activation="pentanh"))
+x1 = biLSTM(x1)
+x2 = biLSTM(x2)
+
+x = concatenate([x1,x2])#,x3])
+
+x1 = Dense(500,activation="relu")(x)
+x1 = Dropout(0.3)(x1)
+x1 = Dense(500,activation="relu")(x1)
+x1 = Dropout(0.3)(x1)
+
+x2 = Dense(500,activation="relu")(x)
+x2 = Dropout(0.3)(x2)
+x2 = Dense(500,activation="relu")(x2)
+x2 = Dropout(0.3)(x2)
+
+#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)
+
+output_lon = Dense(1,activation="sigmoid")(x1)
+output_lat = Dense(1,activation="sigmoid")(x2)
+
+output_coord = concatenate([output_lon,output_lat],name="output_coord")
+
+#####
+model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
+
+model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
+
+model.summary()
+#############################################################################################
+################################# TRAINING LAUNCH ###########################################
+#############################################################################################
+
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
+
+epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
+
+
+history = model.fit_generator(generator=d_train,
+    validation_data=d_test,
+    verbose=True,
+    epochs=EPOCHS,
+    callbacks=[checkpoint,epoch_timer])
+
+
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv(HISTORY_FN)
+
+model.save(MODEL_OUTPUT_FN)
+
+# Erase Model Checkpoint file
+if os.path.exists(MODEL_OUTPUT_FN + ".part"):
+    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
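
For reference, the two-branch topology defined above can be reproduced in isolation. The sketch below is only an approximation: it keeps the shared frozen embedding, the shared bidirectional GRU and the two sigmoid coordinate heads, but replaces the custom pentanh activation with the GRU defaults and the haversine_tf_1circle loss with plain MSE, so it is illustrative rather than a drop-in replacement.

from keras.layers import Input, Embedding, Bidirectional, GRU, Dense, Dropout, concatenate
from keras.models import Model

def build_coord_model(num_words, embedding_dim, max_len):
    # Two toponym inputs share one frozen embedding and one bidirectional GRU encoder
    in_1 = Input(shape=(max_len,))
    in_2 = Input(shape=(max_len,))
    emb = Embedding(num_words, embedding_dim, input_length=max_len, trainable=False)
    rnn = Bidirectional(GRU(128))
    h = concatenate([rnn(emb(in_1)), rnn(emb(in_2))])

    def head(x):
        x = Dense(500, activation="relu")(x)
        x = Dropout(0.3)(x)
        x = Dense(500, activation="relu")(x)
        return Dropout(0.3)(x)

    out_lon = Dense(1, activation="sigmoid")(head(h))  # longitude, rescaled to [0,1]
    out_lat = Dense(1, activation="sigmoid")(head(h))  # latitude, rescaled to [0,1]
    output_coord = concatenate([out_lon, out_lat], name="output_coord")

    model = Model(inputs=[in_1, in_2], outputs=output_coord)
    model.compile(loss="mse", optimizer="adam")  # stand-in for haversine_tf_1circle
    return model
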
diff --git a/desamb_eval.py b/desamb_eval.py
index ab90a48..34f04ad 100644
--- a/desamb_eval.py
+++ b/desamb_eval.py
@@ -52,11 +52,15 @@ prefixes = [x.rstrip(".h5") for x in glob(args.models_directory+"/*.h5")]
 
 final_output = []
 for prefix in prefixes:
-    df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
-    data = json.load(open(prefix+".json"))
-    data["acccuracy@100km"] = (df.dist<100).sum()/len(df)
-    data["acccuracy@50km"] = (df.dist<50).sum()/len(df)
-    data["acccuracy@25km"] = (df.dist<25).sum()/len(df)
-    final_output.append(data)
+    try:
+        df = eval_model(EVAL_DATASET_FN,prefix + ".h5",prefix + "_index")
+        data = json.load(open(prefix+".json"))
+        data["accuracy@100km"] = (df.dist<100).sum()/len(df)
+        data["accuracy@50km"] = (df.dist<50).sum()/len(df)
+        data["accuracy@25km"] = (df.dist<25).sum()/len(df)
+        final_output.append(data)
+    except Exception as e:
+        print("Evaluation failed for {0}: {1}".format(prefix,e))
+    
 
 pd.DataFrame(final_output).to_csv("{0}_RESULT.csv".format(EVAL_DATASET_FN.rstrip(".csv")))
\ No newline at end of file
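
The three accuracy@k figures computed above are simple threshold counts over the dist column (in kilometres) returned by eval_model; a small helper along these lines, not part of the patch, would remove the repetition:

def accuracy_at_k(df, k_km):
    # Fraction of evaluated toponyms located within k_km kilometres of the ground truth
    return float((df.dist < k_km).sum()) / len(df)

# data["accuracy@100km"] = accuracy_at_k(df, 100)
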
diff --git a/desamb_eval_runs.sh b/desamb_eval_runs.sh
index a45dd2c..20eb682 100644
--- a/desamb_eval_runs.sh
+++ b/desamb_eval_runs.sh
@@ -1,3 +1,3 @@
 python3 desamb_eval.py -g ../data/geocoding_evaluation/fr_dataset_ambiguity_sample50percent.csv outputs/FR_RESULT 
-#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/US\ FR\ results
-#python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/US\ FR\ results
+python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_cooc_test.csv outputs/USFR_WORD
+python3 desamb_eval.py -g ../data/geocoding_evaluation/us_fr_dataset_ambiguity.csv outputs/USFR_WORD
diff --git a/lib/data_generator.py b/lib/data_generator.py
index fb87146..58cbad4 100644
--- a/lib/data_generator.py
+++ b/lib/data_generator.py
@@ -178,14 +178,13 @@ class Inclusion(DataSource):
 
 
 class CoOccurrences(DataSource):
-    def __init__(self, filename, label_encoder,sampling=3,resolution = 1):
+    def __init__(self, filename, label_encoder,sampling=3,resolution = 256,use_healpix=False):
         super().__init__("Co-Occurrence data",filename)
-        self.is_there_healpix = True
+        self.is_there_healpix = use_healpix
         # LOAD DATA
-        try:
-            self.data_src = pd.read_csv(filename)
-        except:
-            self.data_src = pd.read_csv(filename,sep="\t")
+
+        self.data_src = pd.read_csv(filename,sep="\t")
+
         # CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA
         if not "healpix_{0}".format(resolution) in self.data_src.columns:
             raise KeyError("healpix_{0} column does not exists ! ".format(resolution))
@@ -272,7 +271,6 @@ class DataGenerator(keras.utils.Sequence):
         self.num_classes = class_encoder.get_num_classes()
 
         self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
-
     def __len__(self):
         'Denotes the number of batches per epoch'
         return int(np.floor(self.len / self.batch_size))
@@ -281,7 +279,7 @@ class DataGenerator(keras.utils.Sequence):
         if self.is_there_healpix and self.only_healpix:
             return [X[:,0],X[:,1]],y2
 
-        if self.is_there_healpix:
+        elif self.is_there_healpix:
             return [X[:,0],X[:,1]],[y,y2]
         else:
             return [X[:,0],X[:,1]],y
@@ -299,7 +297,6 @@ class DataGenerator(keras.utils.Sequence):
                 self.datasrc_index += 1
                 self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
 
-
         if self.datasrc_index >= len(self.data_src):
             self.return_(X,y,y2)
         
@@ -332,7 +329,10 @@ def load_embedding(model_fn,dim_vector=100):
     N = len(model.wv.vocab)
     M = np.zeros((N,dim_vector))
     for i in range(N):
-        M[i] = model.wv[str(i)]
+        try:
+            M[i] = model.wv[str(i)]
+        except KeyError:
+            pass
     return M
 
 if __name__ == "__main__":
diff --git a/lib/data_generatorv3.py b/lib/data_generatorv3.py
new file mode 100644
index 0000000..cd17cdb
--- /dev/null
+++ b/lib/data_generatorv3.py
@@ -0,0 +1,354 @@
+import os
+from gzip import GzipFile
+
+import keras
+from keras.utils import to_categorical
+import numpy as np
+import pandas as pd
+
+from .geo import zero_one_encoding
+
+from helpers import parse_title_wiki,read_geonames
+from gensim.models.keyedvectors import KeyedVectors
+
+from sklearn.preprocessing import LabelEncoder
+
+
+def wc_l(filename,gzip=True):
+    lc = 0
+    if not gzip:
+        f = open(filename)
+    if gzip:
+        f = GzipFile(filename)
+    while f.readline():
+        lc += 1 
+    f.close()       
+    return lc
+
+class SamplingProbabilities:
+    def __init__(self):
+        self.count = {}
+    
+    def get_probs(self,item):
+        if not item in self.count:
+            self.count[item] = 0
+        self.count[item]+=1
+        return 1/self.count[item]
+    def __call__(self,a):
+        return self.get_probs(a)
+        
+
+class DataSource(object):
+    def __init__(self,name,input_filename):
+        self.name = name
+        assert os.path.exists(input_filename)
+        self.input_filename = input_filename
+        self.len = 0
+
+        self.is_there_healpix = False
+
+    def __next__(self):
+        raise NotImplementedError()
+
+    def __iter__(self):
+        return self
+    
+    def __len__(self):
+        return self.len
+    
+    def __reset__(self):
+        raise NotImplementedError()
+
+    def isOver(self):
+        raise NotImplementedError()
+
+class Adjacency(DataSource):
+    def __init__(self,filename,geonames_filename,sampling=3,len_=None,gzip=True):
+        DataSource.__init__(self,"Adjacency SRC",filename)
+
+        assert os.path.exists(geonames_filename)
+        self.geonames_data_dict = {row.geonameid:row.name for row in read_geonames(geonames_filename).itertuples()}
+        
+        self.gzip = gzip
+        if not self.gzip:
+            self.data_src = open(self.input_filename,'rb')
+        else:
+            self.data_src = GzipFile(self.input_filename,'rb')
+
+        if len_:
+            self.len = len_*sampling
+        else:
+            self.len = wc_l(filename,gzip=gzip)
+        
+        self.data_src.readline() # header line
+
+        self.sampling = sampling
+        if self.sampling:
+            self.probs_storage = SamplingProbabilities() 
+
+        self.topo = None
+        self.context_topo_context = []
+        self.curr_probs = None
+        self.lat, self.lon = None, None
+
+
+        self.i = 0
+        self.is_over = False
+    
+    def __next__(self):
+        if  self.i >= len(self.context_topo_context):
+            line = self.data_src.readline()
+            if not line:
+                self.is_over = True
+                raise StopIteration
+            line = line.decode("utf-8").rstrip("\n")
+            _,geonameid, adjacent_geoname_id,latitude,longitude = tuple(line.split(","))
+
+            self.topo = int(geonameid)
+            self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")]
+            if self.sampling:
+                self.curr_probs = np.array([self.probs_storage(x) for x in self.context_topo_context],dtype=float)
+                self.curr_probs /= self.curr_probs.sum()
+                # weighted sampling: probabilities must be normalised and passed via the `p` keyword
+                self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,p=self.curr_probs)
+            self.lat, self.lon = float(latitude),float(longitude)
+
+            self.i = 0
+        
+        self.i += 1
+        return (self.geonames_data_dict[self.topo],
+        self.geonames_data_dict[self.context_topo_context[self.i-1]],
+        self.lat,self.lon)
+
+    def __reset__(self):
+        if not self.gzip:
+            self.data_src = open(self.input_filename,'rb')
+        else:
+            self.data_src = GzipFile(self.input_filename,'rb')
+
+        self.data_src.readline() # header line
+        self.is_over = False
+
+    def isOver(self):
+        return self.is_over
+
+
+class Inclusion(DataSource):
+    def __init__(self, geonames_filename,hierarchy_filename,mask_ids=None):
+        super().__init__("Inclusion SRC",hierarchy_filename)
+        assert os.path.exists(geonames_filename)
+        self.geonames_data_dict = {row.geonameid:(row.name,row.latitude,row.longitude) for row in read_geonames(geonames_filename).itertuples()}
+        
+        self.data_src = pd.read_csv(self.input_filename,
+            sep="\t",
+            header=None,
+            names="parentId,childId,type".split(",")
+        ).fillna("")
+        
+        if mask_ids:
+            self.data_src = self.data_src[self.data_src.childId.isin(mask_ids)]
+        self.data_src= self.data_src[self.data_src.childId.isin(self.geonames_data_dict)]
+        self.data_src= self.data_src[self.data_src.parentId.isin(self.geonames_data_dict)]
+
+        self.data_src = self.data_src["childId parentId".split()].values.tolist()
+        self.len = len(self.data_src)
+
+        self.i = 0
+
+        self.is_over = False
+
+    def __next__(self):
+        if self.i+1 >= self.len:
+            self.is_over = True
+            raise StopIteration
+        else:
+            self.i += 1
+            tup_ = tuple(self.data_src[self.i-1])
+            return (self.geonames_data_dict[tup_[0]][0],
+            self.geonames_data_dict[tup_[1]][0],
+            self.geonames_data_dict[tup_[0]][2],
+            self.geonames_data_dict[tup_[0]][1])
+
+    def __reset__(self):
+        self.i = 0
+        self.is_over = False
+    
+    def isOver(self):
+        return (self.i == self.len)
+    
+
+
+
+class CoOccurrences(DataSource):
+    def __init__(self, filename, label_encoder,sampling=3,resolution = 256,use_healpix=False):
+        super().__init__("Co-Occurrence data",filename)
+        self.is_there_healpix = use_healpix
+        # LOAD DATA
+
+        self.data_src = pd.read_csv(filename,sep="\t")
+
+        # CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA
+        if not "healpix_{0}".format(resolution) in self.data_src.columns:
+            raise KeyError("healpix_{0} column does not exist!".format(resolution))
+        
+        # PARSE TOPONYMS
+        self.data_src["title"] = self.data_src.title.apply(parse_title_wiki)
+        try:
+            self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki)
+        except Exception: # e.g. no interlinks column in this dataset
+            pass
+
+        # LOOP parameter
+        self.sampling = sampling
+        if self.sampling:
+            self.probs_storage = SamplingProbabilities()
+            
+        # LOOP INDICES
+        self.i = 0
+        self.j = 0
+        self.is_over = False
+        self.len = len(self.data_src)*(self.sampling-1)
+
+        
+        # BUFFER VARIABLE
+        self.topo = None
+        self.context_topo_context = []
+        self.curr_probs = None
+        self.lat, self.lon = None, None
+
+
+        self.resolution = resolution
+        self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
+
+        self.class_encoder = label_encoder
+        self.class_encoder.fit(self.classes)
+
+        self.healpix = None
+
+    def __next__(self):
+        if self.isOver() or self.i*self.sampling == self.len:
+            self.is_over = True
+            raise StopIteration 
+
+        if self.j >= len(self.context_topo_context):
+            line = self.data_src.iloc[self.i]
+
+            self.topo = line.title
+            # Build pairs of consecutive toponyms from the interlinks field
+            interlinks = str(line.interlinks).split("|")
+            self.context_topo_context = list(zip(interlinks, interlinks[1:]))
+            np.random.shuffle(self.context_topo_context)
+            self.lat, self.lon = line.latitude,line.longitude
+            
+            self.healpix = line["healpix_{0}".format(self.resolution)]
+            
+            self.i += 1
+            self.j = 0
+        
+        self.j += 1
+        return (self.topo,
+        *self.context_topo_context[self.j-1],
+        self.lat,self.lon,self.class_encoder.transform([self.healpix])[0])
+
+    def __reset__(self):
+        self.i = 0
+        self.j = 0
+        self.context_topo_context = []
+        self.is_over = False
+    
+    def isOver(self):
+        return self.is_over
+    
+class DataGenerator(keras.utils.Sequence):
+    'Generates data for Keras'
+    def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
+        'Initialization'
+        self.data_src = data_sources
+        self.ngram_index = ngram_index
+
+        self.batch_size = kwargs.get("batch_size",1000)
+        self.only_healpix = kwargs.get("only_healpix",False)
+        
+        self.len = sum([len(d) for d in self.data_src])
+        self.datasrc_index = 0
+
+        self.num_classes = class_encoder.get_num_classes()
+
+        self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
+    def __len__(self):
+        'Denotes the number of batches per epoch'
+        return int(np.floor(self.len / self.batch_size))
+
+    def return_(self,X,y,y2=None):
+        if self.is_there_healpix and self.only_healpix:
+            return [X[:,0],X[:,1],X[:,2]],y2
+
+        elif self.is_there_healpix:
+            return [X[:,0],X[:,1],X[:,2]],[y,y2]
+        else:
+            return [X[:,0],X[:,1],X[:,2]],y
+
+    def __getitem__(self, index):
+        'Generate one batch of data'
+        X = np.empty((self.batch_size,3,self.ngram_index.max_len),dtype=np.int32) # toponym
+        y = np.empty((self.batch_size,2),dtype=float) #lat lon coord
+
+        y2=None # For healpix
+        if self.is_there_healpix:
+            y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
+
+        if self.data_src[self.datasrc_index].isOver():
+            self.datasrc_index += 1
+            # Stop early if every data source is exhausted (avoids indexing past the last source)
+            if self.datasrc_index >= len(self.data_src):
+                return self.return_(X,y,y2)
+            self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
+
+        
+        for i in range(self.batch_size):
+            if self.data_src[self.datasrc_index].isOver():
+                return self.return_(X,y,y2)
+            try:
+                topo, topo_context_1,topo_context_2, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
+            except StopIteration as e:
+                return self.return_(X,y,y2)
+            
+            X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context_1),self.ngram_index.encode(topo_context_2)]
+            y[i] =  [*zero_one_encoding(longitude,latitude)]
+            if self.is_there_healpix:
+                y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32')
+
+            #y[i] = [longitude,latitude]
+        return self.return_(X,y,y2)
+
+    def on_epoch_end(self):
+        'Updates indexes after each epoch'
+        [d.__reset__() for d in self.data_src]
+        self.datasrc_index = 0
+
+
+    
+def load_embedding(model_fn,dim_vector=100):
+    model = KeyedVectors.load(model_fn)
+    N = len(model.wv.vocab)
+    M = np.zeros((N,dim_vector))
+    for i in range(N):
+        try:
+            M[i] = model.wv[str(i)]
+        except KeyError:
+            pass
+    return M
+
+if __name__ == "__main__":
+    # All adj nb of line : 7955000-1
+    # NOTE: this smoke test does not pass the label_encoder / class_encoder arguments
+    # required by CoOccurrences and DataGenerator; an encoder instance must be supplied
+    # (and the paths below adapted) before it can run.
+    from lib.ngram_index import NgramIndex
+    from tqdm import tqdm
+    ng = NgramIndex.load("../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json")
+    c = CoOccurrences("../data/wikipedia/cooccurrence_FR.txt_test.csv",sampling=3)
+    a = Adjacency("/home/jacques/sample_adjacency.txt",geonames_filename="../data/geonamesData/allCountries.txt",gzip=False,sampling=10)
+    i = Inclusion(geonames_filename="../data/geonamesData/allCountries.txt",hierarchy_filename="../data/geonamesData/hierarchy.txt")
+    d = DataGenerator([c,a,i],ng)
+    for x in tqdm(range(len(d))): d[x]
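
Unlike the two-input generator in lib/data_generator.py, this v3 CoOccurrences source yields pairs of consecutive toponyms taken from the interlinks field, which is what feeds the three-input batches built in DataGenerator.__getitem__. A purely illustrative example of that pairing:

interlinks = "Paris|Lyon|Marseille|Toulouse"
toponyms = interlinks.split("|")
pairs = list(zip(toponyms, toponyms[1:]))
# [('Paris', 'Lyon'), ('Lyon', 'Marseille'), ('Marseille', 'Toulouse')]
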
diff --git a/lib/geo.py b/lib/geo.py
index 1bd6396..1840705 100644
--- a/lib/geo.py
+++ b/lib/geo.py
@@ -26,9 +26,9 @@ def tf_deg2rad(deg):
 def latlon2healpix( lat , lon , res ):
     lat = np.radians(lat)
     lon = np.radians(lon)
-    xs = ( np.cos(lat) * np.cos(lon) )#
-    ys = ( np.cos(lat) * np.sin(lon) )# -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
-    zs = ( np.sin(lat) )#
+    xs = ( np.cos(lat) * np.cos(lon) ) #
+    ys = ( np.cos(lat) * np.sin(lon) ) # -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    zs = ( np.sin(lat) ) # 
     return healpy.vec2pix( int(res) , xs , ys , zs )
 
 def haversine_tf(y_true,y_pred):
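
latlon2healpix maps a latitude/longitude pair to a HEALPix pixel index at a given nside resolution; it is the function now used to build the "cat" column in train_test_split_cooccurrence_data.py (nside=64). A short usage sketch, with coordinates chosen only for illustration:

from lib.geo import latlon2healpix

paris = latlon2healpix(48.85, 2.35, 64)    # pixel index of Paris at nside=64
nearby = latlon2healpix(48.86, 2.34, 64)   # a point roughly one kilometre away
print(paris, paris == nearby)              # nearby points generally fall in the same pixel
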
diff --git a/lib/ngram_index.py b/lib/ngram_index.py
index b69fcfe..47a5a70 100644
--- a/lib/ngram_index.py
+++ b/lib/ngram_index.py
@@ -75,7 +75,7 @@ class NgramIndex():
         """
         ngrams = word.lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
-        ngrams = [ng for ng in ngrams if ng.count("$")<self.size-1]
+        ngrams = [ng for ng in ngrams if ng.count("$")<2]
         if not self.loaded:
             [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
         return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
@@ -125,7 +125,8 @@ class NgramIndex():
         N = len(self.ngram_index)
         embedding_matrix = np.zeros((N,dim))
         for i in range(N):
-            embedding_matrix[i] = model.wv[str(i)]
+            if str(i) in model.wv:
+                embedding_matrix[i] = model.wv[str(i)]
         return embedding_matrix
 
     def save(self,fn):
diff --git a/lib/run.py b/lib/run.py
index 5c652b4..f2a7b79 100644
--- a/lib/run.py
+++ b/lib/run.py
@@ -146,7 +146,8 @@ class Run(object):
 
         out_proc = subprocess.PIPE
         if log_filename:
-            out_proc = open(log_filename,'w')
+            out_proc = open(log_filename,'a')
         process = subprocess.Popen(self.get_command().split(),stdout=out_proc)
         _, _ = process.communicate() # We don't care of the output (if so, we use the log_filename argument)
 
@@ -209,8 +210,10 @@ class GridSearchModel:
         log_filename : str, optional
             log filename, by default None
         """
+        i=0
         for task in self.tasks:
-            task.run(log_filename=log_filename)
+            task.run(log_filename=log_filename+"_"+str(i))
+            i+=1
 
     
 if __name__ == "__main__":
diff --git a/lib/torch_generator.py b/lib/torch_generator.py
new file mode 100644
index 0000000..718d169
--- /dev/null
+++ b/lib/torch_generator.py
@@ -0,0 +1,49 @@
+import torch
+from keras.preprocessing.sequence import pad_sequences
+import numpy as np 
+
+def chunks(lst, n):
+    """Yield successive n-sized chunks from lst."""
+    for i in range(0, len(lst), n):
+        yield lst[i:i + n]
+
+class SentenceDataset(torch.utils.data.Dataset):
+    'Characterizes a dataset for PyTorch'
+    def __init__(self, dataframe,tokenizer,max_len=96,batch_size=32):
+        'Initialization'
+        self.sentences = dataframe["sentence"].values
+        self.labels = dataframe["label"].values
+        self.tokenizer = tokenizer
+        self.max_len = max_len
+
+        self.batch_size = batch_size
+        a = np.arange(len(dataframe))
+        np.random.shuffle(a)
+        self.batch_tokenization = list(chunks(a,batch_size))
+        assert(len(self.batch_tokenization[0])==batch_size)
+        self.current_batch_id = 0
+        self.boundaries = (0,0+batch_size)
+        self.current_batch_tokenized = self.tokenize(self.current_batch_id)
+
+    def tokenize(self,batch_index):
+        X = [ self.tokenizer.encode(self.sentences[x],add_special_tokens = True,max_length=512) for x in self.batch_tokenization[batch_index]]# Tokenizer
+        X = pad_sequences(X, maxlen=self.max_len, dtype="long", value=0, truncating="post", padding="post").tolist()
+        return X
+
+    def __len__(self):
+        'Denotes the total number of samples'
+        return len(self.sentences)
+    def __getitem__(self, index):
+        'Generates one sample of data'
+        if index >= self.boundaries[1] or index < self.boundaries[0]:
+            self.current_batch_id = index//self.batch_size
+            self.current_batch_tokenized = self.tokenize(self.current_batch_id)
+            self.boundaries= (self.current_batch_id*self.batch_size,self.current_batch_id*self.batch_size + self.batch_size)
+        # Load data and get label
+        
+        index_in_batch = index-self.boundaries[0]
+        #print(self.boundaries,index_in_batch)
+        X = self.current_batch_tokenized[index_in_batch]
+        M = [int(token_id > 0) for token_id in X] # attention mask
+        # pick the label of the (shuffled) sentence actually returned, not of the raw index
+        y = self.labels[self.batch_tokenization[self.current_batch_id][index_in_batch]]
+        return torch.tensor(np.array(X)),torch.tensor(np.array(M)),torch.tensor(np.array(y))
\ No newline at end of file
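
One possible way to drive SentenceDataset from a DataLoader is sketched below; the TSV path and the pretrained model name are placeholders. Each contiguous block of indices is tokenized once and cached, so the DataLoader should read indices in order (the shuffling already happens inside the dataset):

import pandas as pd
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from lib.torch_generator import SentenceDataset

df = pd.read_csv("train.tsv", sep="\t")  # expects 'sentence' and 'label' columns
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
dataset = SentenceDataset(df, tokenizer, max_len=96, batch_size=32)

loader = DataLoader(dataset, batch_size=32, shuffle=False)  # keep order: blocks are tokenized lazily
for input_ids, attention_mask, labels in loader:
    break  # feed the tensors to a BertForSequenceClassification model here
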
diff --git a/lib/utils.py b/lib/utils.py
index 5af3a90..82531b3 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -3,6 +3,8 @@ import math
 import argparse
 import os
 import json
+import time
+import datetime
 
 # Data Structure
 import numpy as np
@@ -101,6 +103,7 @@ class ConfigurationReader(object):
 
 class MetaDataSerializer(object):
     def __init__(self,
+    model_name,
     dataset_name,
     rel_code,
     cooc_sample_size,
@@ -113,6 +116,7 @@ class MetaDataSerializer(object):
     index_fn,
     keras_model_fn,
     train_test_history_fn):
+        self.model_name = model_name
         self.dataset_name = dataset_name
         self.rel_code = rel_code
         self.cooc_sample_size = cooc_sample_size
@@ -128,6 +132,7 @@ class MetaDataSerializer(object):
     
     def save(self,fn):
         json.dump({
+        "model_name":self.model_name,
         "dataset_name" : self.dataset_name,
         "rel_code" : self.rel_code,
         "cooc_sample_size" : self.cooc_sample_size,
@@ -193,4 +198,23 @@ class Chronometer:
         duration = time.time() - self.__task_begin_timestamp[task_name]
         del self.__task_begin_timestamp[task_name]
 
-        return duration
\ No newline at end of file
+        return duration
+
+
+# Function to calculate the accuracy of our predictions vs labels
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+
+
+
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
+    
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
\ No newline at end of file
diff --git a/lib/word_index.py b/lib/word_index.py
new file mode 100644
index 0000000..974a8ab
--- /dev/null
+++ b/lib/word_index.py
@@ -0,0 +1,180 @@
+import json
+
+import numpy as np
+
+from ngram import NGram
+
+# Machine learning 
+from gensim.models import Word2Vec
+
+class WordIndex():
+    """
+    Class used for encoding strings as sequences of word indices (word-level analogue of NgramIndex)
+    """
+    def __init__(self,loaded = False):
+        """
+        Constructor
+        
+        Parameters
+        ----------
+        loaded : bool
+            if loaded from external file
+        """
+        self.word_index = {"":0}
+        self.index_word = {0:""}
+        self.cpt = 0
+        self.max_len = 0
+
+        self.loaded = loaded
+
+    def split_and_add(self,word):
+        """
+        Split a string on spaces and add each word to the index
+        
+        Parameters
+        ----------
+        word : str
+            a word
+        """
+        grams = word.lower().split(" ")
+        [self.add(subword) for subword in grams ]
+        self.max_len = max(self.max_len,len(grams))
+
+    def add(self,subword):
+        """
+        Add a word to the index
+        
+        Parameters
+        ----------
+        subword : str
+            word to register
+        """
+        if not subword in self.word_index:
+            self.cpt+=1
+            self.word_index[subword]=self.cpt
+            self.index_word[self.cpt]=subword
+        
+
+    def encode(self,word):
+        """
+        Return the word-index representation of a string
+        
+        Parameters
+        ----------
+        word : str
+            a word
+        
+        Returns
+        -------
+        list of int
+            list of word indices, padded to max_len
+        """
+        subwords = [w.lower() for w in word.split(" ")]
+        if not self.loaded:
+            [self.add(ng) for ng in subwords if not ng in self.word_index]
+        if self.max_len < len(subwords):
+            self.max_len = max(self.max_len,len(subwords))
+        return self.complete([self.word_index[ng] for ng in subwords if ng in self.word_index],self.max_len)
+
+    def complete(self,ngram_encoding,MAX_LEN,filling_item=0):
+        """
+        Pad an encoded sequence with a filling index up to MAX_LEN (the network expects fixed-length inputs).
+        
+        Parameters
+        ----------
+        ngram_encoding : list of int
+            initial encoding of a word
+        MAX_LEN : int
+            desired length of the encoding
+        filling_item : int, optional
+            index used for padding, by default 0
+        
+        Returns
+        -------
+        list of int
+            padded list of word indices
+        """
+        if self.loaded and len(ngram_encoding) >=MAX_LEN:
+            return ngram_encoding[:MAX_LEN]
+        assert len(ngram_encoding) <= MAX_LEN
+        diff = MAX_LEN - len(ngram_encoding)
+        ngram_encoding.extend([filling_item]*diff)  
+        return ngram_encoding
+    
+    def get_embedding_layer(self,texts,dim=100,**kwargs):
+        """
+        Train a gensim Word2Vec model on the encoded texts and return the embedding matrix (one row per word index).
+        
+        Parameters
+        ----------
+        texts : list of [list of int]
+            list of encoded words
+        dim : int, optional
+            embedding dimension, by default 100
+        
+        Returns
+        -------
+        np.array
+            embedding matrix
+        """
+        model = Word2Vec([[str(w) for w in t] for t in texts], size=dim,window=5, min_count=1, workers=4,**kwargs)
+        N = len(self.word_index)
+        embedding_matrix = np.zeros((N,dim))
+        for i in range(N):
+            if str(i) in model.wv:
+                embedding_matrix[i] = model.wv[str(i)]
+        return embedding_matrix
+
+    def save(self,fn):
+        """
+
+        Save the WordIndex
+        
+        Parameters
+        ----------
+        fn : str
+            output filename
+        """
+        data = {
+            "word_index": self.word_index,
+            "cpt_state": self.cpt,
+            "max_len_state": self.max_len
+        }
+        json.dump(data,open(fn,'w'))
+
+    @staticmethod
+    def load(fn):
+        """
+        
+        Load a WordIndex state from a file.
+        
+        Parameters
+        ----------
+        fn : str
+            input filename
+        
+        Returns
+        -------
+        WordIndex
+            word index
+        
+        Raises
+        ------
+        KeyError
+            raised if a required field does not appear in the input file
+        """
+        try:
+            data = json.load(open(fn))
+        except json.JSONDecodeError:
+            raise ValueError("Data file must be a JSON")
+        for key in ["word_index","cpt_state","max_len_state"]:
+            if not key in data:
+                raise KeyError("{0} field cannot be found in given file".format(key))
+        new_obj = WordIndex(loaded=True)
+        new_obj.word_index = data["word_index"]
+        new_obj.index_word = {v:k for k,v in new_obj.word_index.items()}
+        new_obj.cpt = data["cpt_state"]
+        new_obj.max_len = data["max_len_state"]
+        return new_obj
+
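
A quick round-trip with WordIndex, the word-level counterpart of NgramIndex that predict_toponym_coordinates.py now loads (filenames illustrative):

from lib.word_index import WordIndex

idx = WordIndex()
idx.split_and_add("new york city")   # max_len becomes 3
idx.split_and_add("paris")

encoded = idx.encode("new york")     # e.g. [1, 2, 0] -- padded with the empty-word index 0
idx.save("word_index.json")

idx2 = WordIndex.load("word_index.json")
assert idx2.encode("new york") == encoded
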
diff --git a/predict_toponym_coordinates.py b/predict_toponym_coordinates.py
index 6fb4930..6ed8169 100644
--- a/predict_toponym_coordinates.py
+++ b/predict_toponym_coordinates.py
@@ -3,6 +3,7 @@ import os
 import tensorflow as tf
 import keras.backend as K
 from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
 import numpy as np
 
 from tensorflow.python.keras.backend import set_session
@@ -64,7 +65,7 @@ class Geocoder(object):
         # graph = tf.compat.v1.get_default_graph()
         # set_session(sess)
         self.keras_model = load_model(keras_model_fn,custom_objects={"accuracy_at_k_lat":lat_accuracy(),"accuracy_at_k_lon":lon_accuracy()})
-        self.ngram_encoder = NgramIndex.load(ngram_index_file)
+        self.ngram_encoder = WordIndex.load(ngram_index_file)
 
     def get_coord(self,toponym,context_toponym):
         global sess
diff --git a/region_embedding.py b/region_model.py
similarity index 100%
rename from region_embedding.py
rename to region_model.py
diff --git a/run_train.py b/run_train.py
index 987eaf8..3030694 100644
--- a/run_train.py
+++ b/run_train.py
@@ -14,11 +14,11 @@ for rel in rels:
 c_f = "--wikipedia-cooc-fn ../data/wikipedia/cooccurrence_US_FR.txt"
 # Init GridsearchModel
 grid = GridSearchModel(\
-    "python3 combination_embeddings.py",
+    "python3 combination_embeddings_baselines.py",
     **OrderedDict({ # necessary because some args have to be given in a certain order
-    "rel":["-i -a",("-i -w "+c_f),"-a -w","-a -i -w"],
+    "rel":["-w "+c_f,("-i -w "+c_f),"-a -w "+c_f,"-a -i -w "+c_f], # ,"-a -i -w "+c_f ,"-i -a"
     "-n":[4],
-    "--ngram-word2vec-iter" :[1],
+    "--ngram-word2vec-iter" :[10],
     "-e":[100],
     "geoname_fn":"../data/geonamesData/US_FR.txt".split(),
     "hierarchy_fn":"../data/geonamesData/hierarchy.txt".split()
@@ -27,6 +27,6 @@ grid = GridSearchModel(\
 print("########### THE FOLLOWING COMMAND(S) WILL BE EXECUTED ###########" )
 [print(task.get_command()) for task in grid.tasks]
 print("#################################################################")
-grid.run("outputs/log_RUN_TEXAS_IDFrance.txt")
+grid.run("outputs/log_{0}".format("FR_baseline"))
 
 #["-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt","-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -a","-w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -i"]
\ No newline at end of file
diff --git a/scripts/embeddingngram.py b/scripts/embeddingngram.py
index 000e8f2..ac5ec02 100644
--- a/scripts/embeddingngram.py
+++ b/scripts/embeddingngram.py
@@ -34,12 +34,13 @@ p= [wiki_labels.extend(x.split("|")) for x in df_cooc["interlinks"].values]
 del df_geo
 del df_cooc
 
+N = 5
 
 
-ng = NgramIndex(4)
+ng = NgramIndex(N)
 p = [ng.split_and_add(x) for x in tqdm(geonames_label)]
 p = [ng.split_and_add(x) for x in tqdm(wiki_labels)]
-ng.save("4gramWiki+Geonames_index.json")
+ng.save("{0}gramWiki+Geonames_index.json".format(N))
 
 geonames_label.extend(wiki_labels)
 
@@ -51,8 +52,8 @@ class MySentences(object):
         for w in self.texts:
             yield [str(x)for x in ng.encode(w)]
 
-model = Word2Vec(MySentences(geonames_label), size=100, window=5, min_count=1, workers=4)
-model.save("embedding4gramWiki+Geonames.bin")
+model = Word2Vec(MySentences(geonames_label), size=100, window=5, min_count=1, workers=4,sg=1)
+model.save("embedding{0}gramWiki+Geonames.bin".format(N))
 
 
 
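
The added sg=1 flag switches gensim from CBOW to skip-gram, which is commonly preferred when many tokens (here, n-gram ids) are rare. The saved model is the file that load_embedding in lib/data_generator.py reads back into an embedding matrix, e.g.:

from gensim.models import KeyedVectors

# Illustrative: vectors are keyed by stringified n-gram ids
model = KeyedVectors.load("embedding5gramWiki+Geonames.bin")
vector_of_ngram_42 = model.wv["42"]
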
diff --git a/svm_predict_hp.py b/svm_predict_hp.py
new file mode 100644
index 0000000..c836859
--- /dev/null
+++ b/svm_predict_hp.py
@@ -0,0 +1,36 @@
+import numpy as np
+
+from joblib import dump, load
+from tensorflow.keras.utils import to_categorical
+
+from lib.geo import latlon2healpix
+from lib.ngram_index import NgramIndex
+
+
+def parse_bow(x,index):
+    return np.sum(to_categorical(x,num_classes=index.cpt+1),axis=0)
+
+def is_in(lat,lon,hp_predicted,hp_nside):
+    hp_truth = latlon2healpix(lat,lon,hp_nside)
+    return hp_truth == hp_predicted
+
+class HealpixGeocoder(object):
+    
+    def __init__(self,model_fn,ngram_index_filename):
+        self.model = load(model_fn)
+        self.ng_index = NgramIndex.load(ngram_index_filename)
+    
+    def geocode(self,phrase1,phrase2):
+        if not phrase1 or not phrase2:
+            return None
+        vec = parse_bow(np.array(self.ng_index.encode(phrase1)),self.ng_index)+\
+            parse_bow(np.array(self.ng_index.encode(phrase2)),self.ng_index)
+        return self.model.predict([vec])[0]
+    
+    def geocode_multi(self,phrases1,phrases2):
+        # phrases1 and phrases2 are assumed aligned; pairs with a missing phrase are dropped
+        pairs = [(p1,p2) for p1,p2 in zip(phrases1,phrases2) if p1 and p2]
+        vecs = np.array([parse_bow(np.array(self.ng_index.encode(p1)),self.ng_index)
+                         + parse_bow(np.array(self.ng_index.encode(p2)),self.ng_index)
+                         for p1,p2 in pairs])
+        return self.model.predict(vecs)
+
+if __name__ == "__main__":
+    hp = HealpixGeocoder("SVMLINEAR_US_FR_AC.bin","outputs/US_FR.txt_100_4_0.002__A_C_index")
+    hp.geocode("paris","montpellier")
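
Assuming the SVM above was trained on HEALPix classes at a known nside, a prediction can be checked against ground-truth coordinates with is_in; the nside value below is a placeholder:

from svm_predict_hp import HealpixGeocoder, is_in

geocoder = HealpixGeocoder("SVMLINEAR_US_FR_AC.bin", "outputs/US_FR.txt_100_4_0.002__A_C_index")
predicted = geocoder.geocode("paris", "montpellier")
print(is_in(48.85, 2.35, predicted, hp_nside=64))  # True if Paris lies in the predicted pixel
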
diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py
index 750803b..edacbaa 100644
--- a/train_test_split_cooccurrence_data.py
+++ b/train_test_split_cooccurrence_data.py
@@ -1,6 +1,7 @@
 import argparse
 
 import pandas as pd
+import numpy as np
 import geopandas as gpd
 
 import logging
@@ -13,12 +14,13 @@ logging.basicConfig(
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point
 
-from lib.geo import Grid
+from lib.geo import Grid,latlon2healpix
 
 from tqdm import tqdm 
 
 parser = argparse.ArgumentParser()
 parser.add_argument("cooccurrence_file")
+parser.add_argument("-s",action="store_true",help="split on a 10 percent sample of the co-occurrence data")
 
 args = parser.parse_args()#("data/wikipedia/cooccurrence_FR.txt".split())#("data/geonamesData/FR.txt".split())
 
@@ -29,31 +31,31 @@ COOC_FN = args.cooccurrence_file
 
 logging.info("Load Cooc DATA data...")
 cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("")
-cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
-cooc_data = gpd.GeoDataFrame(cooc_data)
+# cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
+# cooc_data = gpd.GeoDataFrame(cooc_data)
 logging.info("Cooc data loaded!")
 
-# World Shape bounds
-world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
-world["nn"] = 1
-dissolved = world.dissolve(by="nn").iloc[0].geometry
+# # World Shape bounds
+# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
+# world["nn"] = 1
+# dissolved = world.dissolve(by="nn").iloc[0].geometry
 
-#Creating Grid
-logging.info("Initializing Grid (360,180)...")
-g = Grid(*dissolved.bounds,[360,180])
-logging.info("Fit Data to the Grid...")
-g.fit_data(cooc_data)
-logging.info("Placing place into the grid...")
-[g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))]
+# #Creating Grid
+# logging.info("Initializing Grid (360,180)...")
+# g = Grid(*dissolved.bounds,[360,180])
+# logging.info("Fit Data to the Grid...")
+# g.fit_data(cooc_data)
+# logging.info("Placing place into the grid...")
+# [g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))]
 
-#ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
-logging.info("Associate a cell number to each place in the Geoname Dataframe")
-def foo(g,id_):
-    for ix,cell in enumerate(g.cells):
-        if id_ in cell.list_object:
-            return ix
+# #ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
+# logging.info("Associate a cell number to each place in the Geoname Dataframe")
+# def foo(g,id_):
+#     for ix,cell in enumerate(g.cells):
+#         if id_ in cell.list_object:
+#             return ix
 
-cooc_data["cat"] = cooc_data.title.apply(lambda x:foo(g,x))
+cooc_data["cat"] = cooc_data.apply(lambda x:latlon2healpix(x.latitude,x.longitude,64),axis=1)
 
 # TRAIN AND TEST SPLIT
 logging.info("Split Between Train and Test")
@@ -66,20 +68,27 @@ while 1:
         break
     i+=1
 
-for i in range(i+1,len(g.cells)):
+for i in np.unique(cooc_data.cat.values):
     try:
-        x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i])
+        if not args.s:
+            x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i])
+        else:
+            x_train,x_test = train_test_split(cooc_data[cooc_data.cat == i].sample(frac=0.1))
+
         X_train,X_test = pd.concat((X_train,x_train)),pd.concat((X_test,x_test))
     except Exception as e:
         print(e) #print("Error",len(filtered[filtered.cat == i]))
 
-del X_train["geometry"]
-del X_train["nn"]
+# del X_train["geometry"]
+# del X_train["nn"]
 del X_train["cat"]
 del X_test["cat"]
-del X_test["geometry"]
-del X_test["nn"]
+# del X_test["geometry"]
+# del X_test["nn"]
 # SAVING THE DATA
 logging.info("Saving Output !")
-X_train.to_csv(COOC_FN+"_train.csv")
-X_test.to_csv(COOC_FN+"_test.csv")
+suffix =""
+if args.s:
+    suffix = "10per"
+X_train.to_csv(COOC_FN+suffix+"_train.csv")
+X_test.to_csv(COOC_FN+suffix+"_test.csv")
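
The split above is stratified per HEALPix cell (nside=64) so that train and test cover the same spatial regions. A self-contained sketch of the same pattern on toy data, with coordinates chosen so that every row falls into a single cell:

import pandas as pd
from sklearn.model_selection import train_test_split
from lib.geo import latlon2healpix

toy = pd.DataFrame({
    "title": ["Paris A", "Paris B", "Paris C", "Paris D"],
    "latitude": [48.85, 48.86, 48.84, 48.87],
    "longitude": [2.35, 2.34, 2.36, 2.33],
})
toy["cat"] = toy.apply(lambda r: latlon2healpix(r.latitude, r.longitude, 64), axis=1)

train_parts, test_parts = [], []
for _, group in toy.groupby("cat"):
    tr, te = train_test_split(group, test_size=0.25)
    train_parts.append(tr)
    test_parts.append(te)
X_train, X_test = pd.concat(train_parts), pd.concat(test_parts)
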
-- 
GitLab