diff --git a/lib/data_generator.py b/lib/data_generator.py
index 5183017b47d0a81c3bf2cfd2c0399ece4c34c761..fb871467542858a421f3ef2d33ad644825cae9cb 100644
--- a/lib/data_generator.py
+++ b/lib/data_generator.py
@@ -11,6 +11,9 @@ from .geo import zero_one_encoding
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
 
+from sklearn.preprocessing import LabelEncoder
+
+
 def wc_l(filename,gzip=True):
     lc = 0
     if not gzip:
@@ -40,7 +43,9 @@ class DataSource(object):
         self.name = name
         assert os.path.exists(input_filename)
         self.input_filename = input_filename
-        self.len = 0
+        self.len = 0
+
+        self.is_there_healpix = False
 
     def __next__(self):
         raise NotImplementedError()
@@ -112,29 +117,6 @@ class Adjacency(DataSource):
         return (self.geonames_data_dict[self.topo],
                 self.geonames_data_dict[self.context_topo_context[self.i-1]],
                 self.lat,self.lon)
-
-    def __nextv2__(self):
-        if self.i >= len(self.context_topo_context):
-            line = self.data_src.readline()
-            if not line:
-                self.is_over = True
-                raise StopIteration
-            line = line.decode("utf-8").rstrip("\n")
-            geonameid, adjacent_geoname_id,latitude,longitude = tuple(line.split(","))
-
-            self.topo = int(geonameid)
-            self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")]
-            if self.sampling:
-                self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
-                self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
-            self.lat, self.lon = float(latitude),float(longitude)
-
-            self.i = 0
-
-        self.i += 1
-        return (self.topo,
-                self.context_topo_context[self.i-1],
-                self.lat,self.lon)
 
     def __reset__(self):
         if not self.gzip:
@@ -193,40 +175,48 @@ class Inclusion(DataSource):
         return (self.i == self.len)
 
-from sklearn.preprocessing import LabelEncoder
+
 class CoOccurrences(DataSource):
-    def __init__(self, filename, label_encoder,sampling=3):
+    def __init__(self, filename, label_encoder,sampling=3,resolution = 1):
         super().__init__("Co-Occurrence data",filename)
-
+        self.is_there_healpix = True
+        # LOAD DATA
         try:
             self.data_src = pd.read_csv(filename)
         except:
             self.data_src = pd.read_csv(filename,sep="\t")
-
+        # CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA
+        if not "healpix_{0}".format(resolution) in self.data_src.columns:
+            raise KeyError("healpix_{0} column does not exist!".format(resolution))
".format(resolution)) + + # PARSE TOPONYMS self.data_src["title"] = self.data_src.title.apply(parse_title_wiki) try: self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki) except: pass + # LOOP parameter + self.sampling = sampling + if self.sampling: + self.probs_storage = SamplingProbabilities() + + # LOOP INDICES self.i = 0 self.j = 0 self.is_over = False - - self.sampling = sampling self.len = len(self.data_src)*self.sampling - if self.sampling: - self.probs_storage = SamplingProbabilities() + # BUFFER VARIABLE self.topo = None self.context_topo_context = [] self.curr_probs = None self.lat, self.lon = None, None - self.resolution = 64 #fixed for now + self.resolution = resolution self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist() self.class_encoder = label_encoder @@ -248,7 +238,9 @@ class CoOccurrences(DataSource): self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context] self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs) self.lat, self.lon = line.latitude,line.longitude + self.healpix = line["healpix_{0}".format(self.resolution)] + self.i += 1 self.j = 0 @@ -264,9 +256,6 @@ class CoOccurrences(DataSource): def isOver(self): return self.is_over - - - class DataGenerator(keras.utils.Sequence): 'Generates data for Keras' def __init__(self,data_sources,ngram_index,class_encoder,**kwargs): @@ -275,49 +264,68 @@ class DataGenerator(keras.utils.Sequence): self.ngram_index = ngram_index self.batch_size = kwargs.get("batch_size",1000) + self.only_healpix = kwargs.get("only_healpix",False) - self.len = sum([len(d) for d in self.data_src]) self.datasrc_index = 0 self.num_classes = class_encoder.get_num_classes() - #self.on_epoch_end() + self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix def __len__(self): 'Denotes the number of batches per epoch' return int(np.floor(self.len / self.batch_size)) + def return_(self,X,y,y2=None): + if self.is_there_healpix and self.only_healpix: + return [X[:,0],X[:,1]],y2 + + if self.is_there_healpix: + return [X[:,0],X[:,1]],[y,y2] + else: + return [X[:,0],X[:,1]],y + def __getitem__(self, index): 'Generate one batch of data' X = np.empty((self.batch_size,2,self.ngram_index.max_len),dtype=np.int32) #Â toponym y = np.empty((self.batch_size,2),dtype=float) #lat lon coord - y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class + + y2=None # For healpix + if self.is_there_healpix: + y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class + if self.data_src[self.datasrc_index].isOver(): self.datasrc_index += 1 + self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix + + if self.datasrc_index >= len(self.data_src): - return X,[y,y2] + self.return_(X,y,y2) for i in range(self.batch_size): if self.data_src[self.datasrc_index].isOver(): - return X, y + return self.return_(X,y,y2) try: topo, topo_context, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__() except StopIteration as e: - return X, [y,y2] + return self.return_(X,y,y2) X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)] y[i] = [*zero_one_encoding(longitude,latitude)] - y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32' + if self.is_there_healpix: + y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32' ) #y[i] = [longitude,latitude] - return [X[:,0],X[:,1]], [y,y2]#[y[:,0],y[:,1]] + 
diff --git a/region_embedding.py b/region_embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..906a422ba6ac0d8ecd4003325f8462f6fb70ec44
--- /dev/null
+++ b/region_embedding.py
@@ -0,0 +1,199 @@
+# Base module
+import os
+
+# Structure
+import pandas as pd
+
+# DEEPL module
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint
+from tensorflow.keras.layers import Lambda
+import keras.backend as K
+import tensorflow as tf
+from lib.custom_layer import *
+
+# Custom module
+from lib.ngram_index import NgramIndex
+from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
+from lib.metrics import lat_accuracy,lon_accuracy
+from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
+
+# Logging
+import logging
+
+logging.getLogger('gensim').setLevel(logging.WARNING)
+
+from helpers import EpochTimer
+
+# LOGGING CONF
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s ',
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO
+    )
+
+args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
+    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+
+#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+
+#
+#################################################
+############# MODEL TRAINING PARAMETER ##########
+#################################################
+NGRAM_SIZE = args.ngram_size
+ACCURACY_TOLERANCE = args.k_value
+EPOCHS = args.epochs
+ADJACENCY_SAMPLING = args.adjacency_sample
+COOC_SAMPLING = args.cooc_sample
+WORDVEC_ITER = 50
+EMBEDDING_DIM = args.dimension
+BATCH_SIZE = args.batch_size
+#################################################
+########## FILENAME VARIABLE ####################
+#################################################
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+GEONAME_FN = args.geoname_input
+DATASET_NAME = args.geoname_input.split("/")[-1]
+GEONAMES_HIERARCHY_FN = args.inclusion_fn
+ADJACENCY_REL_FILENAME = args.adjacency_fn
+COOC_FN = args.wikipedia_cooc_fn
+
+PREFIX_OUTPUT_FN = "REGION_{0}_{1}_{2}_{3}".format(
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE)
+
+REL_CODE=""
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+    REL_CODE+= "A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+    REL_CODE+= "I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+    REL_CODE+= "C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
+
+
+meta_data = MetaDataSerializer(
+    DATASET_NAME,
+    REL_CODE,
+    COOC_SAMPLING,
+    ADJACENCY_SAMPLING,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    EPOCHS,
+    EMBEDDING_DIM,
+    WORDVEC_ITER,
+    INDEX_FN,
+    MODEL_OUTPUT_FN,
+    HISTORY_FN
+)
+meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
+
+
+### PUT DATASRC + GENERATOR
+
+index = NgramIndex.load(args.ngram_index_fn)
+
+train_src = []
+test_src = []
+
+class_encoder = LabelEncoder()
+if args.wikipedia_cooc:
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
+
+if args.adjacency:
+    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    train_src.append(a_train)
+    test_src.append(a_test)
+
+if args.inclusion:
+    i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
+    i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
+    train_src.append(i_train)
+    test_src.append(i_test)
+#Adjacency
+
+
+
+d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True)
+d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True)
+
+num_words = len(index.index_ngram)
+
+#############################################################################################
+################################# NGRAM EMBEDDINGS ##########################################
+#############################################################################################
+
+embedding_weights = load_embedding(args.embedding_fn)
+
+
+#############################################################################################
+################################# MODEL DEFINITION ##########################################
+#############################################################################################
+
+from keras import regularizers
+
+input_1 = Input(shape=(index.max_len,))
+input_2 = Input(shape=(index.max_len,))
+
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
+
+x1 = embedding_layer(input_1)
+x2 = embedding_layer(input_2)
+
+# Each LSTM learns on a permutation of the input toponyms
+biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
+x1 = biLSTM(x1)
+x2 = biLSTM(x2)
+x = concatenate([x1,x2])#,x3])
+
+#x = Dense(class_encoder.get_num_classes()*2,activation="relu")(x)
+
+
+aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
+
+model = Model(inputs = [input_1,input_2], outputs = aux_layer)#input_3
+
+model.compile(loss={"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy"})
+
+
+#############################################################################################
+################################# TRAINING LAUNCH ###########################################
+#############################################################################################
+
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
+
+epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
+
+
+history = model.fit_generator(generator=d_train,
+    validation_data=d_test,
+    verbose=True,
+    epochs=EPOCHS,
+    callbacks=[checkpoint,epoch_timer])
+
+
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv(HISTORY_FN)
+
+model.save(MODEL_OUTPUT_FN)
+
+# Erase Model Checkpoint file
+if os.path.exists(MODEL_OUTPUT_FN + ".part"):
+    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
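
A note on the resolution = 1 default that region_embedding.py relies on (CoOccurrences is built without an explicit resolution): in HEALPix the number of cells is npix = 12 * nside^2, so at nside = 1 there are exactly 12 base cells and the aux_layer softmax predicts at most 12 classes. A small sketch, under the assumption that the res/resolution values used in this patch are HEALPix nside values, using healpy and the same to_categorical call as the generator:

    import healpy as hp
    from keras.utils import to_categorical

    nside = 1                          # default CoOccurrences resolution (assumed to be an nside)
    num_cells = hp.nside2npix(nside)   # 12 * nside**2 -> 12 base cells
    print(num_cells)                   # 12

    healpix_class = 7                  # hypothetical encoded class id
    y2 = to_categorical(healpix_class, num_classes=num_cells, dtype='int32')
    print(y2.shape)                    # (12,)
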
diff --git a/scripts/gethealpix.py b/scripts/gethealpix.py
index 387cacfbd700a175ec96b1957cb9dff819dc77c1..6e572fdb256e92c9a6690df2b7e806e1b11e1573 100644
--- a/scripts/gethealpix.py
+++ b/scripts/gethealpix.py
@@ -27,5 +27,6 @@ df = pd.read_csv(args.input_file,sep="\t")
 df["healpix_256"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=256),axis=1)
 df["healpix_64"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=64),axis=1)
 df["healpix_32"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=32),axis=1)
+df["healpix_1"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=1),axis=1)
 
 df.to_csv(args.output_file,sep="\t",index=False)
\ No newline at end of file
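
latlon2healpix is imported from elsewhere in the repository and its implementation is not part of this diff; a rough equivalent built on healpy, assuming res is the HEALPix nside and RING ordering, would look like the sketch below (the real helper may differ, e.g. NESTED ordering or a colatitude-based formulation):

    import healpy as hp

    def latlon2healpix(lat, lon, res):
        # Index of the HEALPix cell (nside = res, RING ordering) containing
        # the point at the given latitude/longitude in degrees.
        return hp.ang2pix(res, lon, lat, lonlat=True)

    print(latlon2healpix(lat=45.76, lon=4.84, res=1))  # one of the 12 nside=1 cells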