From 565887da06f51a00f1d938f8ee5cd310d3a7ea28 Mon Sep 17 00:00:00 2001
From: Jacques Fize <jacques.fize@insa-lyon.fr>
Date: Mon, 6 Apr 2020 17:25:24 +0200
Subject: [PATCH] Add latest modifications (Healpix)

---
 .gitignore                                 |  2 +-
 combination_embeddingsv2.py                | 58 ++++++++++------------
 lib/custom_layer.py                        | 31 ++++++++++++
 data_generator.py => lib/data_generator.py | 48 ++++++++++++------
 lib/geo.py                                 | 37 ++++++++++++++
 lib/ngram_index.py                         |  1 +
 lib/utils.py                               | 19 +++++++
 requirements.txt                           |  1 +
 scripts/gethealpix.py                      | 31 ++++++++++++
 9 files changed, 179 insertions(+), 49 deletions(-)
 create mode 100644 lib/custom_layer.py
 rename data_generator.py => lib/data_generator.py (87%)
 create mode 100644 scripts/gethealpix.py

diff --git a/.gitignore b/.gitignore
index 951b144..0fe655a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -147,7 +147,7 @@ notes.md
 .idea*
 other/*
 test*
-nohup.out
+nohup.out*
 log*
 temp*
 subset*
diff --git a/combination_embeddingsv2.py b/combination_embeddingsv2.py
index dcde16a..ef1de05 100644
--- a/combination_embeddingsv2.py
+++ b/combination_embeddingsv2.py
@@ -5,19 +5,20 @@ import os
 import pandas as pd
 
 # DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
 from keras.models import Model
 from keras.callbacks import ModelCheckpoint
 from tensorflow.keras.layers import Lambda
 import keras.backend as K
 import tensorflow as tf
+from lib.custom_layer import *
 
 # Custom module
 from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer
+from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
 from lib.metrics import lat_accuracy,lon_accuracy
-from data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
-from lib.geo import haversine_tf,accuracy_k
+from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
 
 # Logging
 import logging
@@ -34,7 +35,7 @@ logging.basicConfig(
 )
 
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
 
 #.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
 
@@ -109,9 +110,10 @@ index = NgramIndex.load(args.ngram_index_fn)
 train_src = []
 test_src = []
 
+class_encoder = LabelEncoder()
 if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv",sampling=4))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv",sampling=4))
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
 
 if args.adjacency:
     a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
@@ -128,8 +130,8 @@ if args.inclusion:
 
-d_train = DataGenerator(train_src,index,batch_size=BATCH_SIZE)
-d_test = DataGenerator(test_src,index,batch_size=BATCH_SIZE)
+d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
+d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
 
 num_words = len(index.index_ngram)
@@ -149,58 +151,50 @@ from keras import regularizers
 input_1 = Input(shape=(index.max_len,))
 input_2 = Input(shape=(index.max_len,))
 
-embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
 
-x1 = embedding_layer(input_1)
-x2 = embedding_layer(input_2)
+x1 = Dropout(0.1)(embedding_layer(input_1))
+x2 = Dropout(0.1)(embedding_layer(input_2))
 
 # Each LSTM learn on a permutation of the input toponyms
-x1 = Bidirectional(LSTM(98))(x1)
-x2 = Bidirectional(LSTM(98))(x2)
+biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
+x1 = biLSTM(x1)
+x2 = biLSTM(x2)
+x = concatenate([x2,x1])#,x3])
 
-x = concatenate([x1,x2])#,x3])
+aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
 
-x1 = Dense(500,
+x1 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x)
 x1 = Dropout(0.3)(x1)
-x1 = Dense(500,
+x1 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x1)
 x1 = Dropout(0.3)(x1)
 
-x2 = Dense(500,
+x2 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x)
 x2 = Dropout(0.3)(x2)
-x2 = Dense(500,
+x2 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x2)
 x2 = Dropout(0.3)(x2)
+
 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
 
-from keras.layers import Lambda
-
-def to_wgs84_lat(lat):
-    return ((lat*180)-90)
-def to_wgs84_lon(lon):
-    return ((lon*360)-180)
-
-#output_lon = Lambda(to_wgs84_lon)(output_lon)
-#output_lat = Lambda(to_wgs84_lat)(output_lat) Still between 0 and 1 to avoid loss value explosion
-
-
 output = concatenate([output_lon,output_lat],name="output_layer")
 
-model = Model(inputs = [input_1,input_2], outputs = output)#input_3
+model = Model(inputs = [input_1,input_2], outputs = [output,aux_layer])#input_3
 
-model.compile(loss=haversine_tf, optimizer='adam',metrics=[accuracy_k(ACCURACY_TOLERANCE)])
+model.compile(loss={"output_layer":haversine_tf_1circle,"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy","output_layer":accuracy_k(ACCURACY_TOLERANCE)})
 
#############################################################################################
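Note: the compile call above wires two jointly trained heads: "output_layer" regresses (lon, lat) scaled into [0,1] under the unit-sphere haversine loss, while "aux_layer" classifies the HEALPix cell under categorical cross-entropy. A minimal sketch of that two-head pattern follows; the sizes, the plain-tanh LSTM, and the mse stand-in loss are illustrative only (the script itself uses "pentanh" activations, 5000-unit relu towers before each coordinate output, and haversine_tf_1circle):

    # Two-head sketch; L, C, V, D are hypothetical sizes
    from keras.layers import Input, Embedding, Dropout, Bidirectional, LSTM, Dense, concatenate
    from keras.models import Model

    L, C, V, D = 30, 1500, 10000, 100   # seq length, healpix classes, vocab, embedding dim
    i1, i2 = Input(shape=(L,)), Input(shape=(L,))
    emb = Embedding(V, D, input_length=L, trainable=False)
    rnn = Bidirectional(LSTM(32))       # shared between both toponym inputs
    x = concatenate([rnn(Dropout(0.1)(emb(i2))), rnn(Dropout(0.1)(emb(i1)))])

    coords = concatenate([Dense(1, activation="sigmoid")(x),
                          Dense(1, activation="sigmoid")(x)], name="output_layer")
    aux = Dense(C, activation="softmax", name="aux_layer")(x)

    model = Model(inputs=[i1, i2], outputs=[coords, aux])
    model.compile(optimizer="adam",
                  loss={"output_layer": "mse",  # stand-in for haversine_tf_1circle
                        "aux_layer": "categorical_crossentropy"})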
diff --git a/lib/custom_layer.py b/lib/custom_layer.py
new file mode 100644
index 0000000..7204573
--- /dev/null
+++ b/lib/custom_layer.py
@@ -0,0 +1,31 @@
+# This layer implementation comes from the source cited in the docstring below.
+
+import keras
+from keras import backend as K
+from keras.engine.topology import Layer
+
+
+class Pentanh(Layer):
+    """
+    Implementation of the "Penalized Tanh" activation function presented in:
+    Xu, Bing, Ruitong Huang, and Mu Li. "Revise saturated activation functions". arXiv preprint arXiv:1602.05980, 2016.
+
+    Code author: Ana Bárbara Cardoso
+    https://github.com/barbarainacioc/toponym-resolution/blob/master/system/nn_model.py
+    """
+
+    def __init__(self, **kwargs):
+        super(Pentanh, self).__init__(**kwargs)
+        self.supports_masking = True
+        self.__name__ = 'pentanh'
+
+    def call(self, inputs):
+        return K.switch(K.greater(inputs,0), K.tanh(inputs), 0.25 * K.tanh(inputs))
+
+    def get_config(self):
+        return super(Pentanh, self).get_config()
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+keras.utils.generic_utils.get_custom_objects().update({'pentanh': Pentanh()})
\ No newline at end of file
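Note: the final line registers the activation under the key 'pentanh' in Keras' custom-objects table, so any module that imports lib.custom_layer first can refer to it by name, which is exactly how combination_embeddingsv2.py builds its LSTMs. Pentanh is tanh(x) for positive inputs and 0.25 * tanh(x) otherwise. A usage sketch:

    # importing the module has the side effect of registering 'pentanh'
    import lib.custom_layer  # noqa: F401
    from keras.layers import LSTM, Bidirectional

    biLSTM = Bidirectional(LSTM(32, activation="pentanh", recurrent_activation="pentanh"))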
diff --git a/data_generator.py b/lib/data_generator.py
similarity index 87%
rename from data_generator.py
rename to lib/data_generator.py
index 12d38bb..5183017 100644
--- a/data_generator.py
+++ b/lib/data_generator.py
@@ -2,10 +2,11 @@ import os
 from gzip import GzipFile
 
 import keras
+from keras.utils import to_categorical
 import numpy as np
 import pandas as pd
 
-from lib.geo import zero_one_encoding
+from .geo import zero_one_encoding
 
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
@@ -20,8 +21,6 @@ def wc_l(filename,gzip=True):
         lc += 1
     f.close()
     return lc
-
-
 
 class SamplingProbabilities:
     def __init__(self):
@@ -183,8 +182,8 @@ class Inclusion(DataSource):
         tup_ = tuple(self.data_src[self.i-1])
         return (self.geonames_data_dict[tup_[0]][0],
             self.geonames_data_dict[tup_[1]][0],
-            self.geonames_data_dict[tup_[0]][1],
-            self.geonames_data_dict[tup_[0]][2])
+            self.geonames_data_dict[tup_[0]][2],
+            self.geonames_data_dict[tup_[0]][1])
 
     def __reset__(self):
         self.i = 0
@@ -194,9 +193,10 @@ class Inclusion(DataSource):
         return (self.i == self.len)
 
 
+from sklearn.preprocessing import LabelEncoder
 class CoOccurrences(DataSource):
-    def __init__(self, filename, sampling=3):
+    def __init__(self, filename, label_encoder,sampling=3):
         super().__init__("Co-Occurrence data",filename)
 
         try:
@@ -225,6 +225,15 @@ class CoOccurrences(DataSource):
         self.curr_probs = None
         self.lat, self.lon = None, None
 
+
+        self.resolution = 64 #fixed for now
+        self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
+
+        self.class_encoder = label_encoder
+        self.class_encoder.fit(self.classes)
+
+        self.healpix = None
+
     def __next__(self):
         if self.isOver() or self.i*self.sampling == self.len:
             self.is_over = True
@@ -239,13 +248,14 @@ class CoOccurrences(DataSource):
         self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
         self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
         self.lat, self.lon = line.latitude,line.longitude
+        self.healpix = line["healpix_{0}".format(self.resolution)]
 
         self.i += 1
         self.j = 0
 
         self.j += 1
         return (self.topo,
             self.context_topo_context[self.j-1],
-            self.lat,self.lon)
+            self.lat,self.lon,self.class_encoder.transform([self.healpix])[0])
 
     def __reset__(self):
         self.i = 0
@@ -259,7 +269,7 @@ class CoOccurrences(DataSource):
 
 class DataGenerator(keras.utils.Sequence):
     'Generates data for Keras'
-    def __init__(self,data_sources,ngram_index,**kwargs):
+    def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
         'Initialization'
         self.data_src = data_sources
         self.ngram_index = ngram_index
@@ -270,6 +280,8 @@ class DataGenerator(keras.utils.Sequence):
         self.len = sum([len(d) for d in self.data_src])
         self.datasrc_index = 0
 
+        self.num_classes = class_encoder.get_num_classes()
+
         #self.on_epoch_end()
 
     def __len__(self):
@@ -278,31 +290,35 @@ class DataGenerator(keras.utils.Sequence):
 
     def __getitem__(self, index):
         'Generate one batch of data'
-        X = np.empty((self.batch_size,2,self.ngram_index.max_len))
-        y = np.empty((self.batch_size,2),dtype=float)
+        X = np.empty((self.batch_size,2,self.ngram_index.max_len),dtype=np.int32) # toponym
+        y = np.empty((self.batch_size,2),dtype=float) #lat lon coord
+        y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
 
         if self.data_src[self.datasrc_index].isOver():
             self.datasrc_index += 1
         if self.datasrc_index >= len(self.data_src):
-            return X,y
+            return X,[y,y2]
 
         for i in range(self.batch_size):
            if self.data_src[self.datasrc_index].isOver():
                return X, y
            try:
-                topo, topo_context,latitude,longitude = self.data_src[self.datasrc_index].__next__()
+                topo, topo_context, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
            except StopIteration as e:
-                return X, y
+                return X, [y,y2]
 
            X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
-           y[i] = [*zero_one_encoding(longitude,latitude)]
+           y[i] = [*zero_one_encoding(longitude,latitude)]
+           y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32'
+)
+
            #y[i] = [longitude,latitude]
 
-        return [X[:,0],X[:,1]], y#[y[:,0],y[:,1]]
+        return [X[:,0],X[:,1]], [y,y2]#[y[:,0],y[:,1]]
 
     def on_epoch_end(self):
         'Updates indexes after each epoch'
         [d.__reset__() for d in self.data_src]
         self.datasrc_index = 0
-    
+
 def load_embedding(model_fn,dim_vector=100):
     model = KeyedVectors.load(model_fn)
     N = len(model.wv.vocab)
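Note: with the auxiliary target, __getitem__ now yields a pair of input arrays and a pair of target arrays, matching the model's [output, aux_layer] outputs. A hedged sketch of inspecting one batch (the variable names are illustrative, not from the repository):

    (x_topo, x_context), (y_coord, y_cell) = d_train[0]
    print(x_topo.shape)   # (batch_size, ngram_index.max_len): int32 n-gram codes
    print(y_coord.shape)  # (batch_size, 2): (lon, lat) mapped into [0,1] by zero_one_encoding
    print(y_cell.shape)   # (batch_size, num_classes): one-hot healpix cell via to_categorical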
diff --git a/lib/geo.py b/lib/geo.py
index 6841247..1bd6396 100644
--- a/lib/geo.py
+++ b/lib/geo.py
@@ -4,6 +4,7 @@ import numpy as np
 import pandas as pd
 
 from shapely.geometry import Point,box
+import healpy
 
 from tqdm import tqdm
 
@@ -21,6 +22,15 @@ def tf_deg2rad(deg):
    pi_on_180 = 0.017453292519943295
    return deg * pi_on_180
 
+# convert lat and lon to a healpix code encoding a region, with a given resolution
+def latlon2healpix( lat , lon , res ):
+    lat = np.radians(lat)
+    lon = np.radians(lon)
+    xs = ( np.cos(lat) * np.cos(lon) )
+    ys = ( np.cos(lat) * np.sin(lon) ) # -> sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    zs = ( np.sin(lat) )
+    return healpy.vec2pix( int(res) , xs , ys , zs )
+
 def haversine_tf(y_true,y_pred):
    """
    Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates
@@ -48,6 +58,33 @@ def haversine_tf(y_true,y_pred):
 
    return 6367 * 2 * tf.math.asin(K.sqrt(a))
 
+def haversine_tf_1circle(y_true,y_pred):
+    """
+    Return the great-circle distance, on a unit sphere, between (lon1,lat1) and (lon2,lat2) coordinates
+
+    Parameters
+    ----------
+    lon1 : numeric or array-like (pandas Dataframe works also)
+        longitude of first coordinates
+    lat1 : numeric or array-like (pandas Dataframe works also)
+        latitude of first coordinates
+    lon2 : numeric or array-like (pandas Dataframe works also)
+        longitude of second coordinates
+    lat2 : numeric or array-like (pandas Dataframe works also)
+        latitude of second coordinates
+
+    Returns
+    -------
+    float or array-like
+        distance(s) value(s)
+    """
+    lon1, lat1, lon2, lat2 = map(tf_deg2rad, [y_true[:,0], y_true[:,1], y_pred[:,0], y_pred[:,1]])
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+    a = K.sin(dlat/2.0)**2 + K.cos(lat1) * K.cos(lat2) * K.sin(dlon/2.0)**2
+
+    return 1 * 2 * tf.math.asin(K.sqrt(a))
+
 def to_wgs84_lat(lat):
    return ((lat*180)-90)
 def to_wgs84_lon(lon):
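Note: the res argument of latlon2healpix is the HEALPix nside parameter (a power of two), which partitions the sphere into 12 * nside^2 equal-area cells, i.e. 49152 classes at the nside=64 resolution fixed in CoOccurrences. A quick sanity check, assuming healpy is installed (the Lyon coordinates are just an example):

    import numpy as np
    import healpy

    print(healpy.nside2npix(64))      # 12 * 64**2 = 49152 cells

    # same conversion as latlon2healpix, for lat=45.76, lon=4.83 (Lyon)
    lat, lon = np.radians(45.76), np.radians(4.83)
    vec = (np.cos(lat) * np.cos(lon), np.cos(lat) * np.sin(lon), np.sin(lat))
    print(healpy.vec2pix(64, *vec))   # cell index in [0, 49151]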
diff --git a/lib/ngram_index.py b/lib/ngram_index.py
index 0cef529..b69fcfe 100644
--- a/lib/ngram_index.py
+++ b/lib/ngram_index.py
@@ -75,6 +75,7 @@ class NgramIndex():
        """
        ngrams = word.lower().replace(" ","$")
        ngrams = list(self.ngram_gen.split(ngrams))
+        ngrams = [ng for ng in ngrams if ng.count("$")<self.size-1]
        if not self.loaded:
            [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
        return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
diff --git a/lib/utils.py b/lib/utils.py
index c3fb872..5af3a90 100644
--- a/lib/utils.py
+++ b/lib/utils.py
@@ -16,6 +16,25 @@ from ngram import NGram
 # Visualisation and parallelisation
 from tqdm import tqdm
 
+class LabelEncoder():
+    def __init__(self):
+        self.dict_ = {}
+        self.cpt = 0
+
+    def fit_transform(self,list_element):
+        self.fit(list_element)
+        return self.transform(list_element)
+
+    def fit(self,list_element):
+        for l in list_element:
+            if not l in self.dict_:
+                self.dict_[l] = self.cpt
+                self.cpt+=1
+    def transform(self,list_element):
+        return [self.dict_[l] for l in list_element]
+
+    def get_num_classes(self):
+        return self.cpt
 
 class TokenizerCustom():
    def __init__(self,vocab):
diff --git a/requirements.txt b/requirements.txt
index 4eb9e43..7802df3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,3 +19,4 @@ nltk
 folium
 flask
 numba
+healpy
\ No newline at end of file
diff --git a/scripts/gethealpix.py b/scripts/gethealpix.py
new file mode 100644
index 0000000..387cacf
--- /dev/null
+++ b/scripts/gethealpix.py
@@ -0,0 +1,31 @@
+
+
+import pandas as pd
+
+from tqdm import tqdm
+tqdm.pandas()
+import argparse
+
+import numpy as np
+import healpy
+# convert lat and lon to a healpix code encoding a region, with a given resolution
+def latlon2healpix( lat , lon , res ):
+    lat = np.radians(lat)
+    lon = np.radians(lon)
+    xs = ( np.cos(lat) * np.cos(lon) )
+    ys = ( np.cos(lat) * np.sin(lon) ) # -> sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    zs = ( np.sin(lat) )
+    return healpy.vec2pix( int(res) , xs , ys , zs )
+
+parser = argparse.ArgumentParser()
+parser.add_argument("input_file")
+parser.add_argument("output_file")
+
+args = parser.parse_args()
+
+df = pd.read_csv(args.input_file,sep="\t")
+df["healpix_256"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=256),axis=1)
+df["healpix_64"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=64),axis=1)
+df["healpix_32"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=32),axis=1)
+
+df.to_csv(args.output_file,sep="\t",index=False)
\ No newline at end of file
-- 
GitLab
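Note: the LabelEncoder added to lib/utils.py (a minimal stand-in for sklearn's) maps arbitrary hashable labels, here healpix cell ids, onto contiguous class indices; get_num_classes() then sizes both aux_layer and the to_categorical targets. A usage sketch with made-up cell ids:

    from lib.utils import LabelEncoder

    enc = LabelEncoder()
    enc.fit([49123, 112, 49123, 7])   # duplicate cells collapse onto one class id
    print(enc.transform([112, 7]))    # [1, 2]
    print(enc.get_num_classes())      # 3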