Commit 8b047924 authored by Jacques Fize

ADD region prediction network code

parent 565887da
@@ -11,6 +11,9 @@ from .geo import zero_one_encoding
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
+from sklearn.preprocessing import LabelEncoder

 def wc_l(filename,gzip=True):
     lc = 0
     if not gzip:
@@ -40,7 +43,9 @@ class DataSource(object):
         self.name = name
         assert os.path.exists(input_filename)
         self.input_filename = input_filename
         self.len = 0
+        self.is_there_healpix = False

     def __next__(self):
         raise NotImplementedError()
@@ -112,29 +117,6 @@ class Adjacency(DataSource):
         return (self.geonames_data_dict[self.topo],
                 self.geonames_data_dict[self.context_topo_context[self.i-1]],
                 self.lat,self.lon)
-
-    def __nextv2__(self):
-        if self.i >= len(self.context_topo_context):
-            line = self.data_src.readline()
-            if not line:
-                self.is_over = True
-                raise StopIteration
-            line = line.decode("utf-8").rstrip("\n")
-            geonameid, adjacent_geoname_id,latitude,longitude = tuple(line.split(","))
-            self.topo = int(geonameid)
-            self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")]
-            if self.sampling:
-                self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
-                self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
-            self.lat, self.lon = float(latitude),float(longitude)
-            self.i = 0
-
-        self.i += 1
-        return (self.topo,
-                self.context_topo_context[self.i-1],
-                self.lat,self.lon)

     def __reset__(self):
         if not self.gzip:
@@ -193,40 +175,48 @@ class Inclusion(DataSource):
         return (self.i == self.len)

-from sklearn.preprocessing import LabelEncoder

 class CoOccurrences(DataSource):
-    def __init__(self, filename, label_encoder,sampling=3):
+    def __init__(self, filename, label_encoder,sampling=3,resolution = 1):
         super().__init__("Co-Occurrence data",filename)
+        self.is_there_healpix = True

+        # LOAD DATA
         try:
             self.data_src = pd.read_csv(filename)
         except:
             self.data_src = pd.read_csv(filename,sep="\t")

+        # CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA
+        if not "healpix_{0}".format(resolution) in self.data_src.columns:
+            raise KeyError("healpix_{0} column does not exists ! ".format(resolution))

+        # PARSE TOPONYMS
         self.data_src["title"] = self.data_src.title.apply(parse_title_wiki)
         try:
             self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki)
         except:
             pass

+        # LOOP parameter
+        self.sampling = sampling
+        if self.sampling:
+            self.probs_storage = SamplingProbabilities()

+        # LOOP INDICES
         self.i = 0
         self.j = 0
         self.is_over = False
-        self.sampling = sampling
         self.len = len(self.data_src)*self.sampling
-        if self.sampling:
-            self.probs_storage = SamplingProbabilities()

+        # BUFFER VARIABLE
         self.topo = None
         self.context_topo_context = []
         self.curr_probs = None
         self.lat, self.lon = None, None

-        self.resolution = 64 #fixed for now
+        self.resolution = resolution
         self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
         self.class_encoder = label_encoder
@@ -248,7 +238,9 @@ class CoOccurrences(DataSource):
             self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
             self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
         self.lat, self.lon = line.latitude,line.longitude
         self.healpix = line["healpix_{0}".format(self.resolution)]

         self.i += 1
         self.j = 0
@@ -264,9 +256,6 @@ class CoOccurrences(DataSource):
     def isOver(self):
         return self.is_over

 class DataGenerator(keras.utils.Sequence):
     'Generates data for Keras'
     def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
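The guard added in CoOccurrences.__init__ makes the new resolution argument explicit: construction fails fast unless a precomputed healpix_{resolution} column exists in the co-occurrence CSV. A minimal sketch of that contract, with hypothetical data (not from the commit):

import pandas as pd

df = pd.DataFrame({
    "title": ["Paris", "Lyon"],
    "latitude": [48.85, 45.76],
    "longitude": [2.35, 4.84],
    "healpix_1": [7, 7],   # hypothetical cell ids at resolution 1
})

resolution = 1
if not "healpix_{0}".format(resolution) in df.columns:
    raise KeyError("healpix_{0} column does not exists ! ".format(resolution))
classes = df["healpix_{0}".format(resolution)].unique().tolist()   # -> [7]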
@@ -275,49 +264,68 @@ class DataGenerator(keras.utils.Sequence):
         self.ngram_index = ngram_index
         self.batch_size = kwargs.get("batch_size",1000)
+        self.only_healpix = kwargs.get("only_healpix",False)

         self.len = sum([len(d) for d in self.data_src])
         self.datasrc_index = 0
         self.num_classes = class_encoder.get_num_classes()
-        #self.on_epoch_end()
+        self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix

     def __len__(self):
         'Denotes the number of batches per epoch'
         return int(np.floor(self.len / self.batch_size))

+    def return_(self,X,y,y2=None):
+        if self.is_there_healpix and self.only_healpix:
+            return [X[:,0],X[:,1]],y2
+        if self.is_there_healpix:
+            return [X[:,0],X[:,1]],[y,y2]
+        else:
+            return [X[:,0],X[:,1]],y

     def __getitem__(self, index):
         'Generate one batch of data'
         X = np.empty((self.batch_size,2,self.ngram_index.max_len),dtype=np.int32) # toponym
         y = np.empty((self.batch_size,2),dtype=float) #lat lon coord
-        y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
+        y2=None # For healpix
+        if self.is_there_healpix:
+            y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class

         if self.data_src[self.datasrc_index].isOver():
             self.datasrc_index += 1
+            self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
             if self.datasrc_index >= len(self.data_src):
-                return X,[y,y2]
+                self.return_(X,y,y2)

         for i in range(self.batch_size):
             if self.data_src[self.datasrc_index].isOver():
-                return X, y
+                return self.return_(X,y,y2)
             try:
                 topo, topo_context, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
             except StopIteration as e:
-                return X, [y,y2]
+                return self.return_(X,y,y2)

             X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
             y[i] = [*zero_one_encoding(longitude,latitude)]
-            y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32'
-            )
+            if self.is_there_healpix:
+                y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32'
+                )
             #y[i] = [longitude,latitude]

-        return [X[:,0],X[:,1]], [y,y2]#[y[:,0],y[:,1]]
+        return self.return_(X,y,y2)

     def on_epoch_end(self):
         'Updates indexes after each epoch'
         [d.__reset__() for d in self.data_src]
         self.datasrc_index = 0

 def load_embedding(model_fn,dim_vector=100):
     model = KeyedVectors.load(model_fn)
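The three batch layouts produced by the new return_ helper are the crux of this change: the generator can now feed a coordinates-only model, a joint coordinates-plus-region model, or (with only_healpix=True) a pure region classifier. One caveat visible in the diff: the branch taken when all sources are exhausted calls self.return_(X,y,y2) without returning it, so execution falls through to the batch loop. Below is a standalone sketch of the dispatch with mock arrays; shapes follow __getitem__, all values are hypothetical:

import numpy as np

batch_size, max_len, num_classes = 4, 10, 12
X = np.zeros((batch_size, 2, max_len), dtype=np.int32)   # encoded (toponym, context) pairs
y = np.zeros((batch_size, 2), dtype=float)               # zero-one encoded coordinates
y2 = np.zeros((batch_size, num_classes), dtype=float)    # one-hot healpix cells

def return_(X, y, y2=None, is_there_healpix=True, only_healpix=False):
    if is_there_healpix and only_healpix:
        return [X[:, 0], X[:, 1]], y2          # region classification only
    if is_there_healpix:
        return [X[:, 0], X[:, 1]], [y, y2]     # joint coordinates + region
    return [X[:, 0], X[:, 1]], y               # coordinates only

inputs, targets = return_(X, y, y2, only_healpix=True)
assert targets.shape == (batch_size, num_classes)

The rest of the commit adds the new region-prediction training script, which drives DataGenerator in exactly this only_healpix mode: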
# Base module
import os
# Structure
import pandas as pd
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
from lib.metrics import lat_accuracy,lon_accuracy
from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "REGION_{0}_{1}_{2}_{3}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
meta_data = MetaDataSerializer(
DATASET_NAME,
REL_CODE,
COOC_SAMPLING,
ADJACENCY_SAMPLING,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)
train_src = []
test_src = []
class_encoder = LabelEncoder()
if args.wikipedia_cooc:
train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
if args.adjacency:
a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
train_src.append(a_train)
test_src.append(a_test)
if args.inclusion:
i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
train_src.append(i_train)
test_src.append(i_test)
#Adjacency
d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True)
d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True)
num_words = len(index.index_ngram)
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learns on a permutation of the input toponyms
biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x1,x2])#,x3])
#x = Dense(class_encoder.get_num_classes()*2,activation="relu")(x)
aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
model = Model(inputs = [input_1,input_2], outputs = aux_layer)#input_3
model.compile(loss={"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy"})
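# Shape note: both inputs share the frozen n-gram embedding and a single
# bidirectional LSTM (siamese-style), and the concatenated states feed one
# softmax over HEALPix cells. "pentanh" is presumably the penalized tanh
# activation registered by `from lib.custom_layer import *` above
# (assumption, not verified in this diff).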
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=d_train,
validation_data=d_test,
verbose=True,
epochs=EPOCHS,
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
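Once training finishes, the saved .h5 can be reloaded for region prediction. A hedged inference sketch: the file names are hypothetical instances of the patterns above, and load_model may additionally need the custom "pentanh" activation passed via custom_objects if it is not globally registered (assumption):

import numpy as np
from keras.models import load_model
from lib.ngram_index import NgramIndex

MODEL_FN = "outputs/REGION_allCountries.txt_100_4_100_C.h5"      # hypothetical
INDEX_FN = "outputs/REGION_allCountries.txt_100_4_100_C_index"   # hypothetical

index = NgramIndex.load(INDEX_FN)
model = load_model(MODEL_FN)

# Encode a (toponym, context toponym) pair the same way DataGenerator does,
# assuming NgramIndex.encode pads to index.max_len as used in __getitem__.
t1 = np.array([index.encode("Paris")])
t2 = np.array([index.encode("France")])

probs = model.predict([t1, t2])        # softmax over HEALPix cells
predicted_cell = int(probs.argmax())   # class id in the LabelEncoder's encoding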
@@ -27,5 +27,6 @@ df = pd.read_csv(args.input_file,sep="\t")
 df["healpix_256"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=256),axis=1)
 df["healpix_64"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=64),axis=1)
 df["healpix_32"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=32),axis=1)
+df["healpix_1"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=1),axis=1)
 df.to_csv(args.output_file,sep="\t",index=False)
\ No newline at end of file
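latlon2healpix comes from the project's helpers and is not part of this diff; a minimal equivalent can be sketched with healpy, under the assumption that res is used as the HEALPix nside parameter (all values in the script, 256/64/32/1, are valid nsides):

import healpy as hp

def latlon2healpix(lat, lon, res):
    # Map a lat/lon point (degrees) to the id of the HEALPix cell that
    # contains it; `res` is treated as nside (assumption).
    return int(hp.ang2pix(res, lon, lat, lonlat=True))

latlon2healpix(lat=48.85, lon=2.35, res=32)   # cell id at nside=32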