Commit 565887da authored by Jacques Fize

Add last Modification (Healpix)

parent 749acccb
@@ -147,7 +147,7 @@ notes.md
 .idea*
 other/*
 test*
-nohup.out
+nohup.out*
 log*
 temp*
 subset*
...
@@ -5,19 +5,20 @@ import os
 import pandas as pd
 # DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
 from keras.models import Model
 from keras.callbacks import ModelCheckpoint
 from tensorflow.keras.layers import Lambda
 import keras.backend as K
 import tensorflow as tf
+from lib.custom_layer import *
 # Custom module
 from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer
+from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
 from lib.metrics import lat_accuracy,lon_accuracy
-from data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
-from lib.geo import haversine_tf,accuracy_k
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
 # Logging
 import logging
@@ -34,7 +35,7 @@ logging.basicConfig(
 )
 args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
 #.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
@@ -109,9 +110,10 @@ index = NgramIndex.load(args.ngram_index_fn)
 train_src = []
 test_src = []
+class_encoder = LabelEncoder()
 if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv",sampling=4))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv",sampling=4))
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
 if args.adjacency:
     a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
@@ -128,8 +130,8 @@ if args.inclusion:
-d_train = DataGenerator(train_src,index,batch_size=BATCH_SIZE)
-d_test = DataGenerator(test_src,index,batch_size=BATCH_SIZE)
+d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
+d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
 num_words = len(index.index_ngram)
@@ -149,58 +151,50 @@ from keras import regularizers
 input_1 = Input(shape=(index.max_len,))
 input_2 = Input(shape=(index.max_len,))
-embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
-x1 = embedding_layer(input_1)
-x2 = embedding_layer(input_2)
+x1 = Dropout(0.1)(embedding_layer(input_1))
+x2 = Dropout(0.1)(embedding_layer(input_2))
 # Each LSTM learns on a permutation of the input toponyms
-x1 = Bidirectional(LSTM(98))(x1)
-x2 = Bidirectional(LSTM(98))(x2)
-x = concatenate([x1,x2])#,x3])
+biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
+x1 = biLSTM(x1)
+x2 = biLSTM(x2)
+x = concatenate([x2,x1])#,x3])
+aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
-x1 = Dense(500,
+x1 = Dense(5000,
     activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x)
 x1 = Dropout(0.3)(x1)
-x1 = Dense(500,
+x1 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x1)
 x1 = Dropout(0.3)(x1)
-x2 = Dense(500,
+x2 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x)
 x2 = Dropout(0.3)(x2)
-x2 = Dense(500,
+x2 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x2)
 x2 = Dropout(0.3)(x2)
 output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
 output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
-from keras.layers import Lambda
-def to_wgs84_lat(lat):
-    return ((lat*180)-90)
-def to_wgs84_lon(lon):
-    return ((lon*360)-180)
-#output_lon = Lambda(to_wgs84_lon)(output_lon)
-#output_lat = Lambda(to_wgs84_lat)(output_lat)
+# Still between 0 and 1 to avoid loss value explosion
 output = concatenate([output_lon,output_lat],name="output_layer")
-model = Model(inputs = [input_1,input_2], outputs = output)#input_3
+model = Model(inputs = [input_1,input_2], outputs = [output,aux_layer])#input_3
-model.compile(loss=haversine_tf, optimizer='adam',metrics=[accuracy_k(ACCURACY_TOLERANCE)])
+model.compile(loss={"output_layer":haversine_tf_1circle,"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy","output_layer":accuracy_k(ACCURACY_TOLERANCE)})
 #############################################################################################
...
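Note: the substantive change in the hunk above is the auxiliary head. The model now emits two outputs, the (lon,lat) pair and a softmax over HEALPix cells, and Keras routes one loss per output by matching the dict keys to the output layer names. A minimal sketch of that pattern, not the repository's model (toy shapes, 10 classes, and an mse stand-in for haversine_tf_1circle are assumptions):

# Two-headed Keras model with per-output losses, keyed by layer name.
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

inp = Input(shape=(16,))
h = Dense(32, activation="relu")(inp)
coords = Dense(2, activation="sigmoid", name="output_layer")(h)  # lon/lat scaled to [0,1]
cells = Dense(10, activation="softmax", name="aux_layer")(h)     # HEALPix cell, one-hot

model = Model(inputs=inp, outputs=[coords, cells])
model.compile(optimizer="adam",
              loss={"output_layer": "mse",  # the commit uses haversine_tf_1circle here
                    "aux_layer": "categorical_crossentropy"},
              metrics={"aux_layer": "accuracy"})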
+# This Layer implementation comes from
+import keras
+from keras import backend as K
+from keras.engine.topology import Layer
+
+class Pentanh(Layer):
+    """
+    Implementation of the "Penalized Tanh" activation function presented in:
+    Xu, Bing, Ruitong Huang, and Mu Li. "Revise saturated activation functions". arXiv preprint arXiv:1602.05980, 2016.
+    Code author: Ana Bárbara Cardoso https://github.com/barbarainacioc/toponym-resolution/blob/master/system/nn_model.py
+    """
+    def __init__(self, **kwargs):
+        super(Pentanh, self).__init__(**kwargs)
+        self.supports_masking = True
+        self.__name__ = 'pentanh'
+
+    def call(self, inputs):
+        return K.switch(K.greater(inputs,0), K.tanh(inputs), 0.25 * K.tanh(inputs))
+
+    def get_config(self):
+        return super(Pentanh, self).get_config()
+
+    def compute_output_shape(self, input_shape):
+        return input_shape
+
+keras.utils.generic_utils.get_custom_objects().update({'pentanh': Pentanh()})
\ No newline at end of file
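Note: this new file is what makes activation="pentanh" in the training script resolvable. Pentanh is tanh(x) for positive inputs and 0.25*tanh(x) otherwise, and the final get_custom_objects().update(...) call registers it under the string 'pentanh' at import time. A hedged usage sketch (module path inferred from the `from lib.custom_layer import *` line above):

import lib.custom_layer  # side effect: registers 'pentanh' as a custom object
from keras.layers import LSTM, Bidirectional

encoder = Bidirectional(LSTM(32, activation="pentanh", recurrent_activation="pentanh"))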
@@ -2,10 +2,11 @@ import os
 from gzip import GzipFile
 import keras
+from keras.utils import to_categorical
 import numpy as np
 import pandas as pd
-from lib.geo import zero_one_encoding
+from .geo import zero_one_encoding
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
@@ -20,8 +21,6 @@ def wc_l(filename,gzip=True):
         lc += 1
     f.close()
     return lc

 class SamplingProbabilities:
     def __init__(self):
@@ -183,8 +182,8 @@ class Inclusion(DataSource):
         tup_ = tuple(self.data_src[self.i-1])
         return (self.geonames_data_dict[tup_[0]][0],
                 self.geonames_data_dict[tup_[1]][0],
-                self.geonames_data_dict[tup_[0]][1],
-                self.geonames_data_dict[tup_[0]][2])
+                self.geonames_data_dict[tup_[0]][2],
+                self.geonames_data_dict[tup_[0]][1])

     def __reset__(self):
         self.i = 0
@@ -194,9 +193,10 @@ class Inclusion(DataSource):
         return (self.i == self.len)

+from sklearn.preprocessing import LabelEncoder

 class CoOccurrences(DataSource):
-    def __init__(self, filename, sampling=3):
+    def __init__(self, filename, label_encoder,sampling=3):
         super().__init__("Co-Occurrence data",filename)
         try:
@@ -225,6 +225,15 @@ class CoOccurrences(DataSource):
         self.curr_probs = None
         self.lat, self.lon = None, None
+        self.resolution = 64 #fixed for now
+        self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
+        self.class_encoder = label_encoder
+        self.class_encoder.fit(self.classes)
+        self.healpix = None

     def __next__(self):
         if self.isOver() or self.i*self.sampling == self.len:
             self.is_over = True
@@ -239,13 +248,14 @@ class CoOccurrences(DataSource):
             self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
             self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
             self.lat, self.lon = line.latitude,line.longitude
+            self.healpix = line["healpix_{0}".format(self.resolution)]
             self.i += 1
             self.j = 0
         self.j += 1
         return (self.topo,
                 self.context_topo_context[self.j-1],
-                self.lat,self.lon)
+                self.lat,self.lon,self.class_encoder.transform([self.healpix])[0])

     def __reset__(self):
         self.i = 0
@@ -259,7 +269,7 @@ class CoOccurrences(DataSource):
 class DataGenerator(keras.utils.Sequence):
     'Generates data for Keras'
-    def __init__(self,data_sources,ngram_index,**kwargs):
+    def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
         'Initialization'
         self.data_src = data_sources
         self.ngram_index = ngram_index
@@ -270,6 +280,8 @@ class DataGenerator(keras.utils.Sequence):
         self.len = sum([len(d) for d in self.data_src])
         self.datasrc_index = 0
+        self.num_classes = class_encoder.get_num_classes()
         #self.on_epoch_end()

     def __len__(self):
@@ -278,31 +290,35 @@ class DataGenerator(keras.utils.Sequence):
     def __getitem__(self, index):
         'Generate one batch of data'
-        X = np.empty((self.batch_size,2,self.ngram_index.max_len))
-        y = np.empty((self.batch_size,2),dtype=float)
+        X = np.empty((self.batch_size,2,self.ngram_index.max_len),dtype=np.int32) # toponyms
+        y = np.empty((self.batch_size,2),dtype=float) # lat lon coord
+        y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
         if self.data_src[self.datasrc_index].isOver():
             self.datasrc_index += 1
             if self.datasrc_index >= len(self.data_src):
-                return X,y
+                return X,[y,y2]
         for i in range(self.batch_size):
             if self.data_src[self.datasrc_index].isOver():
                 return X, y
             try:
-                topo, topo_context,latitude,longitude = self.data_src[self.datasrc_index].__next__()
+                topo, topo_context, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
             except StopIteration as e:
-                return X, y
+                return X, [y,y2]
             X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
             y[i] = [*zero_one_encoding(longitude,latitude)]
+            y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32')
             #y[i] = [longitude,latitude]
-        return [X[:,0],X[:,1]], y#[y[:,0],y[:,1]]
+        return [X[:,0],X[:,1]], [y,y2]#[y[:,0],y[:,1]]

     def on_epoch_end(self):
         'Updates indexes after each epoch'
         [d.__reset__() for d in self.data_src]
         self.datasrc_index = 0

 def load_embedding(model_fn,dim_vector=100):
     model = KeyedVectors.load(model_fn)
     N = len(model.wv.vocab)
...
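Note: with the auxiliary head, the generator now yields two targets per batch: inputs [X_toponym, X_context], each of shape (batch, max_len) holding n-gram ids, and targets [y, y2], where y holds the (lon,lat) pair encoded into [0,1] and y2 the one-hot HEALPix class. For a single class id, to_categorical behaves like this:

from keras.utils import to_categorical
to_categorical(3, num_classes=6, dtype='int32')
# -> array([0, 0, 0, 1, 0, 0], dtype=int32)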
@@ -4,6 +4,7 @@ import numpy as np
 import pandas as pd
 from shapely.geometry import Point,box
+import healpy
 from tqdm import tqdm
@@ -21,6 +22,15 @@ def tf_deg2rad(deg):
     pi_on_180 = 0.017453292519943295
     return deg * pi_on_180

+# convert lat and lon to a healpix code encoding a region, with a given resolution
+def latlon2healpix( lat , lon , res ):
+    lat = np.radians(lat)
+    lon = np.radians(lon)
+    xs = ( np.cos(lat) * np.cos(lon) )
+    ys = ( np.cos(lat) * np.sin(lon) ) # -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    zs = ( np.sin(lat) )
+    return healpy.vec2pix( int(res) , xs , ys , zs )

 def haversine_tf(y_true,y_pred):
     """
     Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates
@@ -48,6 +58,33 @@ def haversine_tf(y_true,y_pred):
     return 6367 * 2 * tf.math.asin(K.sqrt(a))

+def haversine_tf_1circle(y_true,y_pred):
+    """
+    Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates, on a sphere of radius 1
+    Parameters
+    ----------
+    lon1 : numeric or array-like (pandas DataFrame works also)
+        longitude of first coordinates
+    lat1 : numeric or array-like (pandas DataFrame works also)
+        latitude of first coordinates
+    lon2 : numeric or array-like (pandas DataFrame works also)
+        longitude of second coordinates
+    lat2 : numeric or array-like (pandas DataFrame works also)
+        latitude of second coordinates
+    Returns
+    -------
+    float or array-like
+        distance(s) value(s)
+    """
+    lon1, lat1, lon2, lat2 = map(tf_deg2rad, [y_true[:,0], y_true[:,1], y_pred[:,0], y_pred[:,1]])
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+    a = K.sin(dlat/2.0)**2 + K.cos(lat1) * K.cos(lat2) * K.sin(dlon/2.0)**2
+    return 1 * 2 * tf.math.asin(K.sqrt(a))

 def to_wgs84_lat(lat):
     return ((lat*180)-90)
 def to_wgs84_lon(lon):
...
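Note on the two additions: latlon2healpix converts a WGS84 point to a unit vector and lets healpy.vec2pix map it to a cell of the HEALPix tessellation at nside=res, which splits the sphere into 12*nside^2 equal-area cells (49152 at nside=64). haversine_tf_1circle is the same haversine formula as haversine_tf but on a sphere of radius 1 instead of 6367 km, so the loss stays in [0, pi], consistent with the "avoid loss value explosion" comment in the model file. A small sanity-check sketch (healpy's default RING ordering is assumed):

import numpy as np
import healpy

print(healpy.nside2npix(64))  # 12 * 64**2 = 49152 cells
# Paris, roughly (lat=48.85, lon=2.35), as a unit vector then a cell id
lat, lon = np.radians(48.85), np.radians(2.35)
vec = (np.cos(lat) * np.cos(lon), np.cos(lat) * np.sin(lon), np.sin(lat))
print(healpy.vec2pix(64, *vec))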
@@ -75,6 +75,7 @@ class NgramIndex():
     """
     ngrams = word.lower().replace(" ","$")
     ngrams = list(self.ngram_gen.split(ngrams))
+    ngrams = [ng for ng in ngrams if ng.count("$")<self.size-1]
     if not self.loaded:
         [self.add(ng) for ng in ngrams if not ng in self.ngram_index]
     return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
...
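Note: the added line drops n-grams dominated by the "$" marker that stands in for spaces and word padding; for 4-grams, anything with three or more "$" is discarded. A pure-Python approximation (the real class delegates splitting to the `ngram` package, so the exact padding scheme here is an assumption):

def char_ngrams(word, size=4, pad="$"):
    s = pad * (size - 1) + word.lower().replace(" ", pad) + pad * (size - 1)
    grams = [s[i:i + size] for i in range(len(s) - size + 1)]
    # the commit's filter: keep grams with fewer than size-1 pad characters
    return [g for g in grams if g.count(pad) < size - 1]

char_ngrams("nice")  # drops '$$$n' and 'e$$$', keeps '$$ni' through 'ce$$'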
@@ -16,6 +16,25 @@ from ngram import NGram
 # Visualisation and parallelisation
 from tqdm import tqdm

+class LabelEncoder():
+    def __init__(self):
+        self.dict_ = {}
+        self.cpt = 0
+
+    def fit_transform(self,list_element):
+        self.fit(list_element)
+        return self.transform(list_element)
+
+    def fit(self,list_element):
+        for l in list_element:
+            if not l in self.dict_:
+                self.dict_[l] = self.cpt
+                self.cpt += 1
+
+    def transform(self,list_element):
+        return [self.dict_[l] for l in list_element]
+
+    def get_num_classes(self):
+        return self.cpt

 class TokenizerCustom():
     def __init__(self,vocab):
...
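Note: unlike sklearn's LabelEncoder, this hand-rolled one can be fit repeatedly and only assigns ids to unseen labels, which is why a single shared instance is passed to every CoOccurrences source and then to the DataGenerator. Its behaviour, as implemented above:

enc = LabelEncoder()
enc.fit([1024, 77, 9])   # 1024 -> 0, 77 -> 1, 9 -> 2
enc.fit([77, 555])       # only 555 is new -> 3
enc.transform([9, 555])  # [2, 3]
enc.get_num_classes()    # 4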
@@ -19,3 +19,4 @@ nltk
 folium
 flask
 numba
+healpy
\ No newline at end of file
+import pandas as pd
+from tqdm import tqdm
+tqdm.pandas()
+
+import argparse
+import numpy as np
+import healpy
+
+# convert lat and lon to a healpix code encoding a region, with a given resolution
+def latlon2healpix( lat , lon , res ):
+    lat = np.radians(lat)
+    lon = np.radians(lon)
+    xs = ( np.cos(lat) * np.cos(lon) )
+    ys = ( np.cos(lat) * np.sin(lon) ) # -> Sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    zs = ( np.sin(lat) )
+    return healpy.vec2pix( int(res) , xs , ys , zs )
+
+parser = argparse.ArgumentParser()
+parser.add_argument("input_file")
+parser.add_argument("output_file")
+args = parser.parse_args()
+
+df = pd.read_csv(args.input_file,sep="\t")
+df["healpix_256"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=256),axis=1)
+df["healpix_64"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=64),axis=1)
+df["healpix_32"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=32),axis=1)
+df.to_csv(args.output_file,sep="\t",index=False)
\ No newline at end of file
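Note: this standalone script precomputes the healpix_256, healpix_64 and healpix_32 columns (786432, 49152 and 12288 possible cells respectively) that CoOccurrences expects to find in its tab-separated input; the training code currently reads only healpix_64. The script's filename is not shown in the diff, so assuming it is saved as add_healpix.py, an invocation would look like:

python add_healpix.py cooc_data.csv cooc_data_healpix.csv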