Commit 565887da authored by Jacques Fize

Add latest modifications (HEALPix)

parent 749acccb
@@ -147,7 +147,7 @@ notes.md
.idea*
other/*
test*
-nohup.out
+nohup.out*
log*
temp*
subset*
@@ -5,19 +5,20 @@ import os
import pandas as pd
# DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
+from lib.custom_layer import *
# Custom module
from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer
+from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
from lib.metrics import lat_accuracy,lon_accuracy
-from data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
-from lib.geo import haversine_tf,accuracy_k
+from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
import logging
@@ -34,7 +35,7 @@ logging.basicConfig(
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
.parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
@@ -109,9 +110,10 @@ index = NgramIndex.load(args.ngram_index_fn)
train_src = []
test_src = []
+class_encoder = LabelEncoder()

if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv",sampling=4))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv",sampling=4))
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
if args.adjacency:
    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
@@ -128,8 +130,8 @@ if args.inclusion:
-d_train = DataGenerator(train_src,index,batch_size=BATCH_SIZE)
-d_test = DataGenerator(test_src,index,batch_size=BATCH_SIZE)
+d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
+d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
num_words = len(index.index_ngram)
@@ -149,58 +151,50 @@ from keras import regularizers
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
-embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)

-x1 = embedding_layer(input_1)
-x2 = embedding_layer(input_2)
+x1 = Dropout(0.1)(embedding_layer(input_1))
+x2 = Dropout(0.1)(embedding_layer(input_2))
# Each LSTM learns on a permutation of the input toponyms
-x1 = Bidirectional(LSTM(98))(x1)
-x2 = Bidirectional(LSTM(98))(x2)
+biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
+x1 = biLSTM(x1)
+x2 = biLSTM(x2)

-x = concatenate([x2,x1])#,x3])
+x = concatenate([x1,x2])#,x3])
+aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
-x1 = Dense(500,
+x1 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x)
x1 = Dropout(0.3)(x1)
-x1 = Dense(500,
+x1 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x1)
x1 = Dropout(0.3)(x1)
-x2 = Dense(500,
+x2 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x)
x2 = Dropout(0.3)(x2)
-x2 = Dense(500,
+x2 = Dense(5000,
    activation="relu",
    kernel_regularizer=regularizers.l2(0.01)
    )(x2)
x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
-from keras.layers import Lambda
-def to_wgs84_lat(lat):
-    return ((lat*180)-90)
-def to_wgs84_lon(lon):
-    return ((lon*360)-180)
-#output_lon = Lambda(to_wgs84_lon)(output_lon)
-#output_lat = Lambda(to_wgs84_lat)(output_lat) Still between 0 and 1 to avoid loss value explosion
output = concatenate([output_lon,output_lat],name="output_layer")
-model = Model(inputs = [input_1,input_2], outputs = output)#input_3
+model = Model(inputs = [input_1,input_2], outputs = [output,aux_layer])#input_3

-model.compile(loss=haversine_tf, optimizer='adam',metrics=[accuracy_k(ACCURACY_TOLERANCE)])
+model.compile(loss={"output_layer":haversine_tf_1circle,"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy","output_layer":accuracy_k(ACCURACY_TOLERANCE)})
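For context, a minimal sketch of how this two-headed model could be trained with the generators built above; the epoch count and checkpoint filename are illustrative assumptions, not part of this commit:

# Hedged sketch: epochs value and checkpoint path are assumed, not from this commit.
checkpoint = ModelCheckpoint("checkpoint_healpix.h5", monitor="loss", save_best_only=True)
history = model.fit_generator(
    d_train,                  # yields ([X_toponym, X_context], [y_coords, y_healpix])
    validation_data=d_test,
    epochs=25,
    callbacks=[checkpoint],
)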
#############################################################################################
# This layer implementation comes from the toponym-resolution repository cited in the docstring below.
import keras
from keras import backend as K
from keras.engine.topology import Layer
class Pentanh(Layer):
    """
    Implementation of the "Penalized Tanh" activation function presented in:
    Xu, Bing, Ruitong Huang, and Mu Li. "Revise saturated activation functions". arXiv preprint arXiv:1602.05980, 2016.
    Code author: Ana Bárbara Cardoso https://github.com/barbarainacioc/toponym-resolution/blob/master/system/nn_model.py
    """

    def __init__(self, **kwargs):
        super(Pentanh, self).__init__(**kwargs)
        self.supports_masking = True
        self.__name__ = 'pentanh'

    def call(self, inputs):
        return K.switch(K.greater(inputs, 0), K.tanh(inputs), 0.25 * K.tanh(inputs))

    def get_config(self):
        return super(Pentanh, self).get_config()

    def compute_output_shape(self, input_shape):
        return input_shape

keras.utils.generic_utils.get_custom_objects().update({'pentanh': Pentanh()})
\ No newline at end of file
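Because the module registers the activation under the name 'pentanh' at import time, the training script can refer to it as a plain string once lib.custom_layer has been imported; a small usage sketch:

# Usage sketch: the string "pentanh" resolves through Keras' custom-object registry.
from keras.layers import LSTM, Bidirectional
encoder = Bidirectional(LSTM(32, activation="pentanh", recurrent_activation="pentanh"))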
@@ -2,10 +2,11 @@ import os
from gzip import GzipFile
import keras
+from keras.utils import to_categorical
import numpy as np
import pandas as pd
-from lib.geo import zero_one_encoding
+from .geo import zero_one_encoding
from helpers import parse_title_wiki,read_geonames
from gensim.models.keyedvectors import KeyedVectors
@@ -20,8 +21,6 @@ def wc_l(filename,gzip=True):
        lc += 1
    f.close()
    return lc
class SamplingProbabilities:
    def __init__(self):
@@ -183,8 +182,8 @@ class Inclusion(DataSource):
        tup_ = tuple(self.data_src[self.i-1])
        return (self.geonames_data_dict[tup_[0]][0],
                self.geonames_data_dict[tup_[1]][0],
-                self.geonames_data_dict[tup_[0]][1],
-                self.geonames_data_dict[tup_[0]][2])
+                self.geonames_data_dict[tup_[0]][2],
+                self.geonames_data_dict[tup_[0]][1])
    def __reset__(self):
        self.i = 0
@@ -194,9 +193,10 @@
        return (self.i == self.len)

+from sklearn.preprocessing import LabelEncoder

class CoOccurrences(DataSource):
-    def __init__(self, filename, sampling=3):
+    def __init__(self, filename, label_encoder,sampling=3):
        super().__init__("Co-Occurrence data",filename)
        try:
@@ -225,6 +225,15 @@ class CoOccurrences(DataSource):
        self.curr_probs = None
        self.lat, self.lon = None, None

+        self.resolution = 64 # fixed for now
+        self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
+        self.class_encoder = label_encoder
+        self.class_encoder.fit(self.classes)
+        self.healpix = None
    def __next__(self):
        if self.isOver() or self.i*self.sampling == self.len:
            self.is_over = True
@@ -239,13 +248,14 @@ class CoOccurrences(DataSource):
            self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
            self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,p=self.curr_probs)
            self.lat, self.lon = line.latitude,line.longitude
+            self.healpix = line["healpix_{0}".format(self.resolution)]
            self.i += 1
            self.j = 0
        self.j += 1
        return (self.topo,
                self.context_topo_context[self.j-1],
-                self.lat,self.lon)
+                self.lat,self.lon,self.class_encoder.transform([self.healpix])[0])
    def __reset__(self):
        self.i = 0
@@ -259,7 +269,7 @@ class CoOccurrences(DataSource):
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'

-    def __init__(self,data_sources,ngram_index,**kwargs):
+    def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
        'Initialization'
        self.data_src = data_sources
        self.ngram_index = ngram_index
@@ -270,6 +280,8 @@ class DataGenerator(keras.utils.Sequence):
        self.len = sum([len(d) for d in self.data_src])
        self.datasrc_index = 0
+        self.num_classes = class_encoder.get_num_classes()
        #self.on_epoch_end()
    def __len__(self):
@@ -278,31 +290,35 @@ class DataGenerator(keras.utils.Sequence):
    def __getitem__(self, index):
        'Generate one batch of data'
-        X = np.empty((self.batch_size,2,self.ngram_index.max_len))
-        y = np.empty((self.batch_size,2),dtype=float)
+        X = np.empty((self.batch_size,2,self.ngram_index.max_len),dtype=np.int32) # toponym
+        y = np.empty((self.batch_size,2),dtype=float) # lat lon coord
+        y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
        if self.data_src[self.datasrc_index].isOver():
            self.datasrc_index += 1
            if self.datasrc_index >= len(self.data_src):
-                return X,y
+                return X,[y,y2]
        for i in range(self.batch_size):
            if self.data_src[self.datasrc_index].isOver():
                return X, [y,y2]
            try:
-                topo, topo_context,latitude,longitude = self.data_src[self.datasrc_index].__next__()
+                topo, topo_context, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
            except StopIteration as e:
-                return X, y
+                return X, [y,y2]
            X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
            y[i] = [*zero_one_encoding(longitude,latitude)]
+            y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32')
        #y[i] = [longitude,latitude]
-        return [X[:,0],X[:,1]], y#[y[:,0],y[:,1]]
+        return [X[:,0],X[:,1]], [y,y2]#[y[:,0],y[:,1]]
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        [d.__reset__() for d in self.data_src]
        self.datasrc_index = 0
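After this change, each batch pairs the two encoded toponym inputs with both targets; the shapes below are an illustrative sketch (batch size and dimensions depend on the configuration):

# Illustrative batch structure, assuming batch_size=32:
# inputs, targets = generator[0]
# inputs[0].shape   -> (32, max_len)         encoded target toponyms
# inputs[1].shape   -> (32, max_len)         encoded context toponyms
# targets[0].shape  -> (32, 2)               (lon, lat) scaled to [0, 1]
# targets[1].shape  -> (32, num_classes)     one-hot healpix cell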
def load_embedding(model_fn,dim_vector=100):
    model = KeyedVectors.load(model_fn)
    N = len(model.wv.vocab)
@@ -4,6 +4,7 @@ import numpy as np
import pandas as pd
from shapely.geometry import Point,box
+import healpy
from tqdm import tqdm
@@ -21,6 +22,15 @@ def tf_deg2rad(deg):
    pi_on_180 = 0.017453292519943295
    return deg * pi_on_180
+# convert lat and lon to a healpix code encoding a region, with a given resolution
+def latlon2healpix( lat , lon , res ):
+    lat = np.radians(lat)
+    lon = np.radians(lon)
+    # sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
+    xs = ( np.cos(lat) * np.cos(lon) )
+    ys = ( np.cos(lat) * np.sin(lon) )
+    zs = ( np.sin(lat) )
+    return healpy.vec2pix( int(res) , xs , ys , zs )
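A quick usage sketch (coordinates are illustrative): healpy.vec2pix with nside=res partitions the sphere into 12*res**2 cells, so the returned class id is always below that bound.

# Illustrative call: nside=64 gives 12 * 64**2 = 49152 possible cells.
cell = latlon2healpix(lat=48.85, lon=2.35, res=64)
assert 0 <= cell < 12 * 64 ** 2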
def haversine_tf(y_true,y_pred):
    """
    Return the geodesic distance between (lon1,lat1) and (lon2,lat2) coordinates
@@ -48,6 +58,33 @@ def haversine_tf(y_true,y_pred):
    return 6367 * 2 * tf.math.asin(K.sqrt(a))
+def haversine_tf_1circle(y_true,y_pred):
+    """
+    Return the great-circle distance between (lon1,lat1) and (lon2,lat2) coordinates on the unit sphere (radius 1)
+
+    Parameters
+    ----------
+    lon1 : numeric or array-like (pandas Dataframe works also)
+        longitude of first coordinates
+    lat1 : numeric or array-like (pandas Dataframe works also)
+        latitude of first coordinates
+    lon2 : numeric or array-like (pandas Dataframe works also)
+        longitude of second coordinates
+    lat2 : numeric or array-like (pandas Dataframe works also)
+        latitude of second coordinates
+
+    Returns
+    -------
+    float or array-like
+        distance(s) value(s)
+    """
+    lon1, lat1, lon2, lat2 = map(tf_deg2rad, [y_true[:,0], y_true[:,1], y_pred[:,0], y_pred[:,1]])
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+    a = K.sin(dlat/2.0)**2 + K.cos(lat1) * K.cos(lat2) * K.sin(dlon/2.0)**2
+    return 1 * 2 * tf.math.asin(K.sqrt(a))
def to_wgs84_lat(lat):
    return ((lat*180)-90)

def to_wgs84_lon(lon):
@@ -75,6 +75,7 @@ class NgramIndex():
"""
ngrams = word.lower().replace(" ","$")
ngrams = list(self.ngram_gen.split(ngrams))
ngrams = [ng for ng in ngrams if ng.count("$")<self.size-1]
if not self.loaded:
[self.add(ng) for ng in ngrams if not ng in self.ngram_index]
return self.complete([self.ngram_index[ng] for ng in ngrams if ng in self.ngram_index],self.max_len)
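The added line discards n-grams made almost entirely of the '$' filler character (spaces are replaced by '$' two lines earlier); for example, assuming 4-grams (self.size == 4), any n-gram containing three or more '$' is dropped. A standalone illustration:

# Standalone sketch of the filter for size = 4:
ngrams = ["pari", "is$$", "s$$$"]
kept = [ng for ng in ngrams if ng.count("$") < 4 - 1]
print(kept)  # ['pari', 'is$$']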
@@ -16,6 +16,25 @@ from ngram import NGram
# Visualisation and parallelisation
from tqdm import tqdm
+class LabelEncoder():
+    def __init__(self):
+        self.dict_ = {}
+        self.cpt = 0
+
+    def fit_transform(self,list_element):
+        self.fit(list_element)
+        return self.transform(list_element)
+
+    def fit(self,list_element):
+        for l in list_element:
+            if not l in self.dict_:
+                self.dict_[l] = self.cpt
+                self.cpt += 1
+
+    def transform(self,list_element):
+        return [self.dict_[l] for l in list_element]
+
+    def get_num_classes(self):
+        return self.cpt
class TokenizerCustom():
    def __init__(self,vocab):
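The new LabelEncoder assigns integer ids in first-seen order; a hypothetical round trip:

# Hypothetical round trip with the LabelEncoder added above:
enc = LabelEncoder()
enc.fit([42, 7, 42, 13])        # ids: 42 -> 0, 7 -> 1, 13 -> 2
print(enc.transform([7, 13]))   # [1, 2]
print(enc.get_num_classes())    # 3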
@@ -19,3 +19,4 @@ nltk
folium
flask
numba
+healpy
\ No newline at end of file
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
import argparse
import numpy as np
import healpy
# convert lat and lon to a healpix code encoding a region, with a given resolution
def latlon2healpix( lat , lon , res ):
    lat = np.radians(lat)
    lon = np.radians(lon)
    # sphere coordinates: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates
    xs = ( np.cos(lat) * np.cos(lon) )
    ys = ( np.cos(lat) * np.sin(lon) )
    zs = ( np.sin(lat) )
    return healpy.vec2pix( int(res) , xs , ys , zs )
parser = argparse.ArgumentParser()
parser.add_argument("input_file")
parser.add_argument("output_file")
args = parser.parse_args()
df = pd.read_csv(args.input_file,sep="\t")
df["healpix_256"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=256),axis=1)
df["healpix_64"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=64),axis=1)
df["healpix_32"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=32),axis=1)
df.to_csv(args.output_file,sep="\t",index=False)
\ No newline at end of file
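This new pre-processing script (its file name is not shown in this view) appends healpix class columns at three resolutions to a tab-separated file carrying latitude and longitude columns; a hypothetical invocation:

# Hypothetical script name; input must be tab-separated with `latitude` and `longitude` columns.
# python add_healpix_cols.py allCountries.txt allCountries_healpix.txt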