From 565887da06f51a00f1d938f8ee5cd310d3a7ea28 Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Mon, 6 Apr 2020 17:25:24 +0200
Subject: [PATCH] Add latest modifications (HEALPix)

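The model now learns an auxiliary task: predicting the HEALPix cell a
toponym falls in, next to the (longitude, latitude) regression output.

* lib/geo.py: latlon2healpix() maps a (lat, lon) pair to a HEALPix
  cell index; haversine_tf_1circle() computes the haversine loss on a
  unit sphere (radius 1) to keep loss values small.
* lib/utils.py: a minimal LabelEncoder maps HEALPix cell ids to
  contiguous class indices.
* lib/data_generator.py (moved from the repository root):
  CoOccurrences yields each sample's HEALPix cell and DataGenerator
  returns a second, one-hot encoded target array.
* combination_embeddingsv2.py: both toponym inputs now share a single
  bidirectional LSTM with the "pentanh" activation (lib/custom_layer.py),
  and an auxiliary softmax head ("aux_layer") is trained with
  categorical cross-entropy alongside the coordinate output.
* scripts/gethealpix.py: precomputes healpix_256/healpix_64/healpix_32
  columns for a tab-separated dataset with latitude and longitude
  columns:

      python scripts/gethealpix.py input.tsv output.tsv

A minimal sketch of the new class pipeline (the city coordinates below
are illustrative examples, not project data):

    from lib.geo import latlon2healpix
    from lib.utils import LabelEncoder

    cells = [latlon2healpix(48.85, 2.35, 64),  # Paris
             latlon2healpix(45.76, 4.84, 64)]  # Lyon
    encoder = LabelEncoder()
    encoder.fit(cells)
    print(encoder.transform(cells), encoder.get_num_classes())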
---
 .gitignore                                 |  2 +-
 combination_embeddingsv2.py                | 63 +++++++++++-------
 lib/custom_layer.py                        | 31 ++++++++++++
 data_generator.py => lib/data_generator.py | 51 +++++++++++++------
 lib/geo.py                                 | 37 ++++++++++++++
 lib/ngram_index.py                         |  1 +
 lib/utils.py                               | 19 +++++++
 requirements.txt                           |  1 +
 scripts/gethealpix.py                      | 31 ++++++++++++
 9 files changed, 185 insertions(+), 51 deletions(-)
 create mode 100644 lib/custom_layer.py
 rename data_generator.py => lib/data_generator.py (87%)
 create mode 100644 scripts/gethealpix.py

diff --git a/.gitignore b/.gitignore
index 951b144..0fe655a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -147,7 +147,7 @@ notes.md
 .idea*
 other/*
 test*
-nohup.out
+nohup.out*
 log*
 temp*
 subset*
diff --git a/combination_embeddingsv2.py b/combination_embeddingsv2.py
index dcde16a..ef1de05 100644
--- a/combination_embeddingsv2.py
+++ b/combination_embeddingsv2.py
@@ -5,19 +5,20 @@ import os
 import pandas as pd
 
 # DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
 from keras.models import Model
 from keras.callbacks import ModelCheckpoint
 from tensorflow.keras.layers import Lambda
 import keras.backend as K 
 import tensorflow as tf 
+from lib.custom_layer import * # registers the "pentanh" activation via keras custom objects
 
 # Custom module
 from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer
+from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
 from lib.metrics import lat_accuracy,lon_accuracy
-from data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
-from lib.geo import haversine_tf,accuracy_k
+from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
 
 # Logging
 import logging
@@ -34,7 +35,7 @@ logging.basicConfig(
     )
 
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
 
 #.parse_args("-w  --wikipedia-cooc-fn  subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
 
@@ -109,9 +110,11 @@ index = NgramIndex.load(args.ngram_index_fn)
 train_src = []
 test_src = []
 
+class_encoder = LabelEncoder()
 if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv",sampling=4))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv",sampling=4))
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
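+    # train and test share the encoder so HEALPix cells map to the same class ids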
 
 if args.adjacency:
     a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
@@ -128,8 +130,8 @@ if args.inclusion:
 
 
 
-d_train = DataGenerator(train_src,index,batch_size=BATCH_SIZE) 
-d_test = DataGenerator(test_src,index,batch_size=BATCH_SIZE) 
+d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE) 
+d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE) 
 
 num_words = len(index.index_ngram)  
 
@@ -149,58 +151,52 @@ from keras import regularizers
 input_1 = Input(shape=(index.max_len,))
 input_2 = Input(shape=(index.max_len,))
 
-embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=True) # pretrained weights removed, so keep the embeddings trainable
 
-x1 = embedding_layer(input_1)
-x2 = embedding_layer(input_2)
+x1 = Dropout(0.1)(embedding_layer(input_1))
+x2 = Dropout(0.1)(embedding_layer(input_2))
 
-# Each LSTM learn on a permutation of the input toponyms
+# The same bidirectional LSTM encodes both input toponyms
-x1 = Bidirectional(LSTM(98))(x1)
-x2 = Bidirectional(LSTM(98))(x2)
+biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
+x1 = biLSTM(x1)
+x2 = biLSTM(x2)
+x = concatenate([x2,x1])#,x3])
 
-x = concatenate([x1,x2])#,x3])
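+# auxiliary head: classify the HEALPix cell the toponym belongs to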
+aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
 
-x1 = Dense(500,
+x1 = Dense(5000,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x)
 x1 = Dropout(0.3)(x1)
-x1 = Dense(500,
+x1 = Dense(5000,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x1)
 x1 = Dropout(0.3)(x1)
 
-x2 = Dense(500,
+x2 = Dense(5000,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x)
 x2 = Dropout(0.3)(x2)
-x2 = Dense(500,
+x2 = Dense(5000,
     activation="relu",
     kernel_regularizer=regularizers.l2(0.01)
     )(x2)
 x2 = Dropout(0.3)(x2)
 
+
 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
-from keras.layers import Lambda
-
-def to_wgs84_lat(lat):
-    return ((lat*180)-90)
-def to_wgs84_lon(lon):
-    return ((lon*360)-180)
-
-#output_lon = Lambda(to_wgs84_lon)(output_lon)
-#output_lat = Lambda(to_wgs84_lat)(output_lat) Still between 0 and 1 to avoid loss value explosion
-
-
 
 output = concatenate([output_lon,output_lat],name="output_layer")
 
-model = Model(inputs = [input_1,input_2], outputs = output)#input_3
+model = Model(inputs = [input_1,input_2], outputs = [output,aux_layer])#input_3
 
-model.compile(loss=haversine_tf, optimizer='adam',metrics=[accuracy_k(ACCURACY_TOLERANCE)])
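+# two losses: unit-sphere haversine on the coordinates, categorical cross-entropy on the HEALPix cell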
+model.compile(loss={"output_layer":haversine_tf_1circle,"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy","output_layer":accuracy_k(ACCURACY_TOLERANCE)})
 
 
 #############################################################################################
diff --git a/lib/custom_layer.py b/lib/custom_layer.py
new file mode 100644
index 0000000..7204573
--- /dev/null
+++ b/lib/custom_layer.py
@@ -0,0 +1,31 @@
+# This Layer implementation comes from https://github.com/barbarainacioc/toponym-resolution/blob/master/system/nn_model.py
+
+import keras
+from keras import backend as K
+from keras.engine.topology import Layer
+
+
+
+class Pentanh(Layer):
+    """
+    Implementation of the "Penalized Tanh" activation function presented in:
+        Xu, Bing, Ruitong Huang, and Mu Li. "Revise saturated activation functions". arXiv preprint arXiv:1602.05980, 2016.
+    
+    Code author: Ana Bárbara Cardoso, https://github.com/barbarainacioc/toponym-resolution/blob/master/system/nn_model.py
+    """
+
+    def __init__(self, **kwargs):
+        super(Pentanh, self).__init__(**kwargs)
+        self.supports_masking = True
+        self.__name__ = 'pentanh'
+
+    def call(self, inputs):
+        return K.switch(K.greater(inputs,0), K.tanh(inputs), 0.25 * K.tanh(inputs)) # tanh(x) for x > 0, 0.25*tanh(x) otherwise
+
+    def get_config(self):
+        return super(Pentanh, self).get_config()
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+keras.utils.generic_utils.get_custom_objects().update({'pentanh': Pentanh()}) # register globally so activation="pentanh" resolves by name
\ No newline at end of file
diff --git a/data_generator.py b/lib/data_generator.py
similarity index 87%
rename from data_generator.py
rename to lib/data_generator.py
index 12d38bb..5183017 100644
--- a/data_generator.py
+++ b/lib/data_generator.py
@@ -2,10 +2,11 @@ import os
 from gzip import GzipFile
 
 import keras
+from keras.utils import to_categorical
 import numpy as np
 import pandas as pd
 
-from lib.geo import zero_one_encoding
+from .geo import zero_one_encoding
 
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
@@ -20,8 +21,6 @@ def wc_l(filename,gzip=True):
         lc += 1 
     f.close()       
     return lc
-    
-
 
 class SamplingProbabilities:
     def __init__(self):
@@ -183,8 +182,8 @@ class Inclusion(DataSource):
             tup_ = tuple(self.data_src[self.i-1])
             return (self.geonames_data_dict[tup_[0]][0],
             self.geonames_data_dict[tup_[1]][0],
-            self.geonames_data_dict[tup_[0]][1],
-            self.geonames_data_dict[tup_[0]][2])
+            self.geonames_data_dict[tup_[0]][2],
+            self.geonames_data_dict[tup_[0]][1])
 
     def __reset__(self):
         self.i = 0
@@ -194,9 +193,10 @@ class Inclusion(DataSource):
         return (self.i == self.len)
     
 
+# label encoding relies on the lib.utils.LabelEncoder instance passed by the caller
 
 class CoOccurrences(DataSource):
-    def __init__(self, filename, sampling=3):
+    def __init__(self, filename, label_encoder,sampling=3):
         super().__init__("Co-Occurrence data",filename)
 
         try:
@@ -225,6 +225,16 @@
         self.curr_probs = None
         self.lat, self.lon = None, None
 
+
+        self.resolution = 64 # fixed for now; matches the healpix_64 column produced by scripts/gethealpix.py
+        self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
+
+        self.class_encoder = label_encoder
+        self.class_encoder.fit(self.classes)
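+        # fit() only adds unseen labels, so one encoder can be shared across sources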
+
+        self.healpix = None
+
     def __next__(self):
         if self.isOver() or self.i*self.sampling == self.len:
             self.is_over = True
@@ -239,13 +248,14 @@ class CoOccurrences(DataSource):
                 self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
-                self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
+                self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,p=self.curr_probs)
             self.lat, self.lon = line.latitude,line.longitude
+            self.healpix = line["healpix_{0}".format(self.resolution)]
             self.i += 1
             self.j = 0
         
         self.j += 1
         return (self.topo,
         self.context_topo_context[self.j-1],
-        self.lat,self.lon)
+        self.lat,self.lon,self.class_encoder.transform([self.healpix])[0])
 
     def __reset__(self):
         self.i = 0
@@ -259,7 +269,7 @@ class CoOccurrences(DataSource):
 
 class DataGenerator(keras.utils.Sequence):
     'Generates data for Keras'
-    def __init__(self,data_sources,ngram_index,**kwargs):
+    def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
         'Initialization'
         self.data_src = data_sources
         self.ngram_index = ngram_index
@@ -270,6 +280,8 @@ class DataGenerator(keras.utils.Sequence):
         self.len = sum([len(d) for d in self.data_src])
         self.datasrc_index = 0
 
+        self.num_classes = class_encoder.get_num_classes()
+
         #self.on_epoch_end()
 
     def __len__(self):
@@ -278,31 +290,35 @@ class DataGenerator(keras.utils.Sequence):
 
     def __getitem__(self, index):
         'Generate one batch of data'
-        X = np.empty((self.batch_size,2,self.ngram_index.max_len))
-        y = np.empty((self.batch_size,2),dtype=float)
+        X = np.empty((self.batch_size,2,self.ngram_index.max_len),dtype=np.int32) # toponym
+        y = np.empty((self.batch_size,2),dtype=float) #lat lon coord
+        y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
         if self.data_src[self.datasrc_index].isOver():
                 self.datasrc_index += 1
         if self.datasrc_index >= len(self.data_src):
-            return X,y
+            return X,[y,y2]
         
         for i in range(self.batch_size):
             if self.data_src[self.datasrc_index].isOver():
-                return X, y
+                return X, [y,y2]
             try:
-                topo, topo_context,latitude,longitude = self.data_src[self.datasrc_index].__next__()
+                topo, topo_context, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
             except StopIteration as e:
-                return X, y
+                return X, [y,y2]
             
             X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
-            y[i] = [*zero_one_encoding(longitude,latitude)]
+            y[i] =  [*zero_one_encoding(longitude,latitude)]
+            y2[i] = to_categorical(healpix_class, num_classes=self.num_classes,
+                                   dtype='int32')
+
             #y[i] = [longitude,latitude]
-        return [X[:,0],X[:,1]], y#[y[:,0],y[:,1]]
+        return [X[:,0],X[:,1]], [y,y2]#[y[:,0],y[:,1]]
 
     def on_epoch_end(self):
         'Updates indexes after each epoch'
         [d.__reset__() for d in self.data_src]
         self.datasrc_index = 0
 
 def load_embedding(model_fn,dim_vector=100):
     model = KeyedVectors.load(model_fn)
     N = len(model.wv.vocab)
diff --git a/lib/geo.py b/lib/geo.py
index 6841247..1bd6396 100644
--- a/lib/geo.py
+++ b/lib/geo.py
@@ -4,6 +4,7 @@ import numpy as np
 import pandas as pd
 
 from shapely.geometry import Point,box 
+import healpy
 
 from tqdm import tqdm
 
@@ -21,6 +22,15 @@ def tf_deg2rad(deg):
     pi_on_180 = 0.017453292519943295
     return deg * pi_on_180
 
+# convert a (lat, lon) pair in degrees to the HEALPix cell index at resolution res (the healpy nside parameter)
+def latlon2healpix(lat, lon, res):
+    lat = np.radians(lat)
+    lon = np.radians(lon)
+    xs = np.cos(lat) * np.cos(lon)  # spherical to Cartesian coordinates:
+    ys = np.cos(lat) * np.sin(lon)  # https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    zs = np.sin(lat)
+    return healpy.vec2pix(int(res), xs, ys, zs)
+
 def haversine_tf(y_true,y_pred):
     """
     Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates
@@ -48,6 +58,33 @@ def haversine_tf(y_true,y_pred):
     
     return 6367 * 2 * tf.math.asin(K.sqrt(a))
 
+def haversine_tf_1circle(y_true,y_pred):
+    """
+    Return the great-circle distance between (lon1,lat1) and (lon2,lat2) on a unit sphere (radius 1)
+    
+    Parameters
+    ----------
+    lon1 : numeric or array-like (pandas Dataframe works also)
+        longitude of first coordinates (y_true[:,0])
+    lat1 :  numeric or array-like (pandas Dataframe works also)
+        latitude of first coordinates (y_true[:,1])
+    lon2 :  numeric or array-like (pandas Dataframe works also)
+        longitude of second coordinates (y_pred[:,0])
+    lat2 :  numeric or array-like (pandas Dataframe works also)
+        latitude of second coordinates (y_pred[:,1])
+    
+    Returns
+    -------
+    float or array-like
+        distance(s) value(s) on the unit sphere, which keeps the loss value small
+    """
+    lon1, lat1, lon2, lat2 = map(tf_deg2rad, [y_true[:,0], y_true[:,1], y_pred[:,0], y_pred[:,1]])
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+    a = K.sin(dlat/2.0)**2 + K.cos(lat1) * K.cos(lat2) * K.sin(dlon/2.0)**2
+    
+    return 1 * 2 * tf.math.asin(K.sqrt(a))
+
 def to_wgs84_lat(lat):
     return ((lat*180)-90)
 def to_wgs84_lon(lon):
diff --git a/lib/ngram_index.py b/lib/ngram_index.py
index 0cef529..b69fcfe 100644
--- a/lib/ngram_index.py
+++ b/lib/ngram_index.py
@@ -75,6 +75,7 @@ class NgramIndex():
         """
         ngrams = word.lower().replace(" ","$")
         ngrams = list(self.ngram_gen.split(ngrams))
+        ngrams = [ng for ng in ngrams if ng.count("$")<self.size-1] # drop ngrams that are mostly "$" padding
         if not self.loaded:
             [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
         return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
diff --git a/lib/utils.py b/lib/utils.py
index c3fb872..5af3a90 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -16,6 +16,25 @@ from ngram import NGram
 # Visualisation and parallelisation
 from tqdm import tqdm
 
+class LabelEncoder(): # minimal scikit-learn-style encoder mapping labels to contiguous integer ids
+    def __init__(self):
+        self.dict_ = {}
+        self.cpt = 0
+    
+    def fit_transform(self,list_element):
+        self.fit(list_element)
+        return self.transform(list_element)
+
+    def fit(self,list_element):
+        for l in list_element:
+            if not l in self.dict_:
+                self.dict_[l] = self.cpt
+                self.cpt+=1
+    def transform(self,list_element):
+        return [self.dict_[l] for l in list_element]
+    
+    def get_num_classes(self):
+        return self.cpt
 
 class TokenizerCustom():
     def __init__(self,vocab):
diff --git a/requirements.txt b/requirements.txt
index 4eb9e43..7802df3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ nltk
 folium
 flask
 numba
+healpy
\ No newline at end of file
diff --git a/scripts/gethealpix.py b/scripts/gethealpix.py
new file mode 100644
index 0000000..387cacf
--- /dev/null
+++ b/scripts/gethealpix.py
@@ -0,0 +1,31 @@
+"""Precompute HEALPix cell ids (healpix_256 / healpix_64 / healpix_32 columns) for a TSV dataset."""
+# usage: python scripts/gethealpix.py <input_file> <output_file>
+import pandas as pd
+
+from tqdm import tqdm
+tqdm.pandas()
+import argparse
+
+import numpy as np
+import healpy
+# convert a (lat, lon) pair in degrees to the HEALPix cell index at resolution res (the healpy nside parameter)
+def latlon2healpix(lat, lon, res):
+    lat = np.radians(lat)
+    lon = np.radians(lon)
+    xs = np.cos(lat) * np.cos(lon)  # spherical to Cartesian coordinates:
+    ys = np.cos(lat) * np.sin(lon)  # https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    zs = np.sin(lat)
+    return healpy.vec2pix(int(res), xs, ys, zs)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("input_file")
+parser.add_argument("output_file")
+
+args = parser.parse_args()
+
+df = pd.read_csv(args.input_file,sep="\t") 
+df["healpix_256"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=256),axis=1)
+df["healpix_64"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=64),axis=1)
+df["healpix_32"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=32),axis=1)
+
+df.to_csv(args.output_file,sep="\t",index=False)
\ No newline at end of file
-- 
GitLab