Commit 8b047924 authored by Jacques Fize

ADD region prediction network code

parent 565887da
@@ -11,6 +11,9 @@ from .geo import zero_one_encoding
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
+from sklearn.preprocessing import LabelEncoder

 def wc_l(filename,gzip=True):
     lc = 0
     if not gzip:
@@ -40,7 +43,9 @@ class DataSource(object):
         self.name = name
         assert os.path.exists(input_filename)
         self.input_filename = input_filename
         self.len = 0
+        self.is_there_healpix = False

     def __next__(self):
         raise NotImplementedError()
@@ -112,29 +117,6 @@ class Adjacency(DataSource):
         return (self.geonames_data_dict[self.topo],
                 self.geonames_data_dict[self.context_topo_context[self.i-1]],
                 self.lat,self.lon)
-
-    def __nextv2__(self):
-        if self.i >= len(self.context_topo_context):
-            line = self.data_src.readline()
-            if not line:
-                self.is_over = True
-                raise StopIteration
-            line = line.decode("utf-8").rstrip("\n")
-            geonameid, adjacent_geoname_id,latitude,longitude = tuple(line.split(","))
-            self.topo = int(geonameid)
-            self.context_topo_context = [int(x) for x in adjacent_geoname_id.split("|")]
-            if self.sampling:
-                self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
-                self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
-            self.lat, self.lon = float(latitude),float(longitude)
-            self.i = 0
-
-        self.i += 1
-        return (self.topo,
-                self.context_topo_context[self.i-1],
-                self.lat,self.lon)

     def __reset__(self):
         if not self.gzip:
@@ -193,40 +175,48 @@ class Inclusion(DataSource):
         return (self.i == self.len)

-from sklearn.preprocessing import LabelEncoder

 class CoOccurrences(DataSource):
-    def __init__(self, filename, label_encoder,sampling=3):
+    def __init__(self, filename, label_encoder,sampling=3,resolution = 1):
         super().__init__("Co-Occurrence data",filename)
+        self.is_there_healpix = True

+        # LOAD DATA
         try:
             self.data_src = pd.read_csv(filename)
         except:
             self.data_src = pd.read_csv(filename,sep="\t")

+        # CHECK IF THE HEALPIX RESOLUTION DATA APPEARS IN THE DATA
+        if not "healpix_{0}".format(resolution) in self.data_src.columns:
+            raise KeyError("healpix_{0} column does not exists ! ".format(resolution))

+        # PARSE TOPONYMS
         self.data_src["title"] = self.data_src.title.apply(parse_title_wiki)
         try:
             self.data_src["interlinks"] = self.data_src.interlinks.apply(parse_title_wiki)
         except:
             pass

+        # LOOP parameter
+        self.sampling = sampling
+        if self.sampling:
+            self.probs_storage = SamplingProbabilities()

+        # LOOP INDICES
         self.i = 0
         self.j = 0
         self.is_over = False
-        self.sampling = sampling
         self.len = len(self.data_src)*self.sampling
-        if self.sampling:
-            self.probs_storage = SamplingProbabilities()

+        # BUFFER VARIABLE
         self.topo = None
         self.context_topo_context = []
         self.curr_probs = None
         self.lat, self.lon = None, None

-        self.resolution = 64 #fixed for now
+        self.resolution = resolution
         self.classes = self.data_src["healpix_{0}".format(self.resolution)].unique().tolist()
         self.class_encoder = label_encoder
@@ -248,7 +238,9 @@ class CoOccurrences(DataSource):
             self.curr_probs = [self.probs_storage(x) for x in self.context_topo_context]
             self.context_topo_context = np.random.choice(self.context_topo_context,self.sampling,self.curr_probs)
         self.lat, self.lon = line.latitude,line.longitude
         self.healpix = line["healpix_{0}".format(self.resolution)]

         self.i += 1
         self.j = 0
@@ -264,9 +256,6 @@ class CoOccurrences(DataSource):
     def isOver(self):
         return self.is_over

 class DataGenerator(keras.utils.Sequence):
     'Generates data for Keras'
     def __init__(self,data_sources,ngram_index,class_encoder,**kwargs):
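The guard added in CoOccurrences.__init__ makes the new resolution argument explicit: construction fails fast unless a precomputed healpix_{resolution} column exists in the co-occurrence CSV. A minimal sketch of that contract, with hypothetical data (not from the commit):

import pandas as pd

df = pd.DataFrame({
    "title": ["Paris", "Lyon"],
    "latitude": [48.85, 45.76],
    "longitude": [2.35, 4.84],
    "healpix_1": [7, 7],   # hypothetical cell ids at resolution 1
})

resolution = 1
if not "healpix_{0}".format(resolution) in df.columns:
    raise KeyError("healpix_{0} column does not exists ! ".format(resolution))
classes = df["healpix_{0}".format(resolution)].unique().tolist()   # -> [7]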
@@ -275,49 +264,68 @@ class DataGenerator(keras.utils.Sequence):
         self.ngram_index = ngram_index
         self.batch_size = kwargs.get("batch_size",1000)
+        self.only_healpix = kwargs.get("only_healpix",False)

         self.len = sum([len(d) for d in self.data_src])
         self.datasrc_index = 0
         self.num_classes = class_encoder.get_num_classes()
-        #self.on_epoch_end()
+        self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix

     def __len__(self):
         'Denotes the number of batches per epoch'
         return int(np.floor(self.len / self.batch_size))

+    def return_(self,X,y,y2=None):
+        if self.is_there_healpix and self.only_healpix:
+            return [X[:,0],X[:,1]],y2
+        if self.is_there_healpix:
+            return [X[:,0],X[:,1]],[y,y2]
+        else:
+            return [X[:,0],X[:,1]],y

     def __getitem__(self, index):
         'Generate one batch of data'
         X = np.empty((self.batch_size,2,self.ngram_index.max_len),dtype=np.int32) # toponym
         y = np.empty((self.batch_size,2),dtype=float) #lat lon coord
-        y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class
+        y2=None # For healpix
+        if self.is_there_healpix:
+            y2 = np.empty((self.batch_size,self.num_classes),dtype=float) # healpix class

         if self.data_src[self.datasrc_index].isOver():
             self.datasrc_index += 1
+            self.is_there_healpix = self.data_src[self.datasrc_index].is_there_healpix
             if self.datasrc_index >= len(self.data_src):
-                return X,[y,y2]
+                self.return_(X,y,y2)

         for i in range(self.batch_size):
             if self.data_src[self.datasrc_index].isOver():
-                return X, y
+                return self.return_(X,y,y2)
             try:
                 topo, topo_context, latitude, longitude, healpix_class = self.data_src[self.datasrc_index].__next__()
             except StopIteration as e:
-                return X, [y,y2]
+                return self.return_(X,y,y2)

             X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
             y[i] = [*zero_one_encoding(longitude,latitude)]
-            y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32'
-            )
+            if self.is_there_healpix:
+                y2[i] = to_categorical(healpix_class, num_classes=self.num_classes, dtype='int32'
+                )
             #y[i] = [longitude,latitude]

-        return [X[:,0],X[:,1]], [y,y2]#[y[:,0],y[:,1]]
+        return self.return_(X,y,y2)

     def on_epoch_end(self):
         'Updates indexes after each epoch'
         [d.__reset__() for d in self.data_src]
         self.datasrc_index = 0

 def load_embedding(model_fn,dim_vector=100):
     model = KeyedVectors.load(model_fn)
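The three batch layouts produced by the new return_ helper are the crux of this change: the generator can now feed a coordinates-only model, a joint coordinates-plus-region model, or (with only_healpix=True) a pure region classifier. One caveat visible in the diff: the branch taken when all sources are exhausted calls self.return_(X,y,y2) without returning it, so execution falls through to the batch loop. Below is a standalone sketch of the dispatch with mock arrays; shapes follow __getitem__, all values are hypothetical:

import numpy as np

batch_size, max_len, num_classes = 4, 10, 12
X = np.zeros((batch_size, 2, max_len), dtype=np.int32)   # encoded (toponym, context) pairs
y = np.zeros((batch_size, 2), dtype=float)               # zero-one encoded coordinates
y2 = np.zeros((batch_size, num_classes), dtype=float)    # one-hot healpix cells

def return_(X, y, y2=None, is_there_healpix=True, only_healpix=False):
    if is_there_healpix and only_healpix:
        return [X[:, 0], X[:, 1]], y2          # region classification only
    if is_there_healpix:
        return [X[:, 0], X[:, 1]], [y, y2]     # joint coordinates + region
    return [X[:, 0], X[:, 1]], y               # coordinates only

inputs, targets = return_(X, y, y2, only_healpix=True)
assert targets.shape == (batch_size, num_classes)

The rest of the commit adds the new region-prediction training script, which drives DataGenerator in exactly this only_healpix mode: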
# Base module
import os
# Structure
import pandas as pd
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
from lib.metrics import lat_accuracy,lon_accuracy
from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "REGION_{0}_{1}_{2}_{3}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
meta_data = MetaDataSerializer(
DATASET_NAME,
REL_CODE,
COOC_SAMPLING,
ADJACENCY_SAMPLING,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)
train_src = []
test_src = []
class_encoder = LabelEncoder()
if args.wikipedia_cooc:
train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
if args.adjacency:
a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
train_src.append(a_train)
test_src.append(a_test)
if args.inclusion:
i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
train_src.append(i_train)
test_src.append(i_test)
#Adjacency
d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True)
d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE,only_healpix=True)
num_words = len(index.index_ngram)
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learns on a permutation of the input toponyms
biLSTM = Bidirectional(LSTM(32,activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x1,x2])#,x3])
#x = Dense(class_encoder.get_num_classes()*2,activation="relu")(x)
aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(x)
model = Model(inputs = [input_1,input_2], outputs = aux_layer)#input_3
model.compile(loss={"aux_layer":"categorical_crossentropy"}, optimizer='adam',metrics={"aux_layer":"accuracy"})
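# Shape note: both inputs share the frozen n-gram embedding and a single
# bidirectional LSTM (siamese-style), and the concatenated states feed one
# softmax over HEALPix cells. "pentanh" is presumably the penalized tanh
# activation registered by `from lib.custom_layer import *` above
# (assumption, not verified in this diff).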
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=d_train,
validation_data=d_test,
verbose=True,
epochs=EPOCHS,
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
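Once training finishes, the saved .h5 can be reloaded for region prediction. A hedged inference sketch: the file names are hypothetical instances of the patterns above, and load_model may additionally need the custom "pentanh" activation passed via custom_objects if it is not globally registered (assumption):

import numpy as np
from keras.models import load_model
from lib.ngram_index import NgramIndex

MODEL_FN = "outputs/REGION_allCountries.txt_100_4_100_C.h5"      # hypothetical
INDEX_FN = "outputs/REGION_allCountries.txt_100_4_100_C_index"   # hypothetical

index = NgramIndex.load(INDEX_FN)
model = load_model(MODEL_FN)

# Encode a (toponym, context toponym) pair the same way DataGenerator does,
# assuming NgramIndex.encode pads to index.max_len as used in __getitem__.
t1 = np.array([index.encode("Paris")])
t2 = np.array([index.encode("France")])

probs = model.predict([t1, t2])        # softmax over HEALPix cells
predicted_cell = int(probs.argmax())   # class id in the LabelEncoder's encoding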
@@ -27,5 +27,6 @@ df = pd.read_csv(args.input_file,sep="\t")
 df["healpix_256"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=256),axis=1)
 df["healpix_64"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=64),axis=1)
 df["healpix_32"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=32),axis=1)
+df["healpix_1"] = df.progress_apply(lambda row:latlon2healpix(lat=row.latitude,lon=row.longitude,res=1),axis=1)
 df.to_csv(args.output_file,sep="\t",index=False)
\ No newline at end of file
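latlon2healpix comes from the project's helpers and is not part of this diff; a minimal equivalent can be sketched with healpy, under the assumption that res is used as the HEALPix nside parameter (all values in the script, 256/64/32/1, are valid nsides):

import healpy as hp

def latlon2healpix(lat, lon, res):
    # Map a lat/lon point (degrees) to the id of the HEALPix cell that
    # contains it; `res` is treated as nside (assumption).
    return int(hp.ang2pix(res, lon, lat, lonlat=True))

latlon2healpix(lat=48.85, lon=2.35, res=32)   # cell id at nside=32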