diff --git a/combination_embeddingsv2.py b/combination_embeddingsv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e68c836df2948a5a2315a0213db62c6a3bf8717d
--- /dev/null
+++ b/combination_embeddingsv2.py
@@ -0,0 +1,178 @@
+# Base module
+import os
+
+# Data structures
+import pandas as pd
+
+# Deep learning modules
+from keras.layers import Dense, Input, Embedding, concatenate, Bidirectional, LSTM
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint
+
+# Custom modules
+from lib.ngram_index import NgramIndex
+from lib.utils import ConfigurationReader, MetaDataSerializer
+from lib.metrics import lat_accuracy, lon_accuracy
+from data_generator import DataGenerator, CoOccurrences, load_embedding
+
+# Logging
+import logging
+
+logging.getLogger('gensim').setLevel(logging.WARNING)
+
+# LOGGING CONF
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s',
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO
+    )
+
+args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
+    .parse_args()  # ("-w -e 100 ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
+
+#################################################
+############ MODEL TRAINING PARAMETERS #########
+#################################################
+NGRAM_SIZE = args.ngram_size
+ACCURACY_TOLERANCE = args.tolerance_value
+EPOCHS = args.epochs
+ITER_ADJACENCY = args.adjacency_iteration
+COOC_SAMPLING_NUMBER = args.cooc_sample_size
+WORDVEC_ITER = args.ngram_word2vec_iter
+EMBEDDING_DIM = 100  # must match the dimension of the pretrained n-gram embeddings
+#################################################
+############ FILENAME VARIABLES ################
+#################################################
+# Check for the output directory
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+GEONAME_FN = "ALL"  # args.geoname_input
+DATASET_NAME = "ALL"  # args.geoname_input.split("/")[-1]
+GEONAMES_HIERARCHY_FN = ""  # args.geoname_hierachy_input
+REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
+ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
+    GEONAME_FN,
+    ITER_ADJACENCY,
+    REGION_SUFFIX_FN)
+
+COOC_FN = args.wikipedia_cooc_fn
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    REGION_SUFFIX_FN)
+
+# Encode the relations used (Adjacency/Inclusion/Co-occurrence) in the output filenames
+REL_CODE = ""
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+    REL_CODE += "A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+    REL_CODE += "I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+    REL_CODE += "C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
+
+meta_data = MetaDataSerializer(
+    DATASET_NAME,
+    REL_CODE,
+    COOC_SAMPLING_NUMBER,
+    ITER_ADJACENCY,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    EPOCHS,
+    EMBEDDING_DIM,
+    WORDVEC_ITER,
+    INDEX_FN,
+    MODEL_OUTPUT_FN,
+    HISTORY_FN
+)
+meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
+
+### DATA SOURCES + GENERATORS
+
+index = NgramIndex.load(args.ngram_index_fn)
+c_train = CoOccurrences(COOC_FN + "_train.csv", sampling=COOC_SAMPLING_NUMBER)
+c_test = CoOccurrences(COOC_FN + "_test.csv", sampling=COOC_SAMPLING_NUMBER)
+
+BATCH_SIZE = 1000
+d_train = DataGenerator([c_train], index, batch_size=BATCH_SIZE)
+d_test = DataGenerator([c_test], index, batch_size=BATCH_SIZE)
+
+num_words = len(index.index_ngram)
+
+#############################################################################################
+################################# NGRAM EMBEDDINGS ##########################################
+#############################################################################################
+
+embedding_weights = load_embedding(args.embedding_fn)
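+
+# Editor's assumption: load_embedding is expected to return one EMBEDDING_DIM-sized
+# vector per n-gram in the NgramIndex; the Embedding layer below rejects the weight
+# matrix otherwise, so fail early with a clear message.
+assert len(embedding_weights) == num_words, "Embedding file does not match the n-gram index"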
+
+#############################################################################################
+################################# MODEL DEFINITION ##########################################
+#############################################################################################
+
+input_1 = Input(shape=(index.max_len,))
+input_2 = Input(shape=(index.max_len,))
+
+embedding_layer = Embedding(num_words, EMBEDDING_DIM, input_length=index.max_len, weights=[embedding_weights], trainable=False)  # , trainable=True)
+
+x1 = embedding_layer(input_1)
+x2 = embedding_layer(input_2)
+
+# Each LSTM learns from one ordering of the input toponym pair
+x1 = Bidirectional(LSTM(98))(x1)
+x2 = Bidirectional(LSTM(98))(x2)
+
+x = concatenate([x1, x2])  # , x3])
+
+# Two dense branches: one for the longitude output, one for the latitude output
+x1 = Dense(500, activation="relu")(x)
+# x1 = Dropout(0.3)(x1)
+x1 = Dense(500, activation="relu")(x1)
+# x1 = Dropout(0.3)(x1)
+
+x2 = Dense(500, activation="relu")(x)
+# x2 = Dropout(0.3)(x2)
+x2 = Dense(500, activation="relu")(x2)
+# x2 = Dropout(0.3)(x2)
+
+# Sigmoid outputs: coordinates are regressed in the [0,1] range
+output_lon = Dense(1, activation="sigmoid", name="Output_LON")(x1)
+output_lat = Dense(1, activation="sigmoid", name="Output_LAT")(x2)
+
+model = Model(inputs=[input_1, input_2], outputs=[output_lon, output_lat])  # input_3
+
+model.compile(loss=['mean_squared_error', 'mean_squared_error'],
+              optimizer='rmsprop',
+              metrics={"Output_LON": lon_accuracy(), "Output_LAT": lat_accuracy()})
+
+#############################################################################################
+################################# TRAINING LAUNCH ###########################################
+#############################################################################################
+
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+                             save_best_only=True, mode='auto', period=1)
+
+history = model.fit_generator(generator=d_train,
+                              validation_data=d_test,
+                              verbose=True,
+                              epochs=EPOCHS,
+                              callbacks=[checkpoint])
+
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv(HISTORY_FN)
+
+model.save(MODEL_OUTPUT_FN)
+
+# Erase the model checkpoint file
+if os.path.exists(MODEL_OUTPUT_FN + ".part"):
+    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
diff --git a/data_generator.py b/data_generator.py
index e17c29f49dd965a5121ba8ec9d9d21aac143d463..1456966015f3955963dec8b3b8569dc95cf3135f 100644
--- a/data_generator.py
+++ b/data_generator.py
@@ -5,6 +5,8 @@ import keras
 import numpy as np
 import pandas as pd
 
+from lib.geo import zero_one_encoding
+
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
 
@@ -267,8 +269,10 @@ class DataGenerator(keras.utils.Sequence):
             return X, y
 
         X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
-        y[i] = [longitude,latitude]
-        return X, y
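+        # zero_one_encoding is assumed to rescale (longitude, latitude) into
+        # [0,1] so the sigmoid outputs of the model can regress them directly.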
"ngram_index_fn", "help": "Filepath of the NgramIndex file you want to use." }, + { "short": "embedding_fn", "help": "Filepath of the Embedding file you want to use." }, + { "short": "-v", "long": "--verbose", "action": "store_true" }, + { "short": "-i", "long": "--inclusion", "action": "store_true" }, + { "short": "-a", "long": "--adjacency", "action": "store_true" }, + { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" }, + { "long": "--wikipedia-cooc-fn","help":"Cooccurrence data filename"}, + { "long": "--adjacency-fn","help":"Adjacency data filename"}, + { "long": "--cooc-sample-size", "type": "int", "default": 3 }, + {"long": "--adjacency-iteration", "type":"int","default":1}, + { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 }, + { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 }, + { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 }, + { "short": "-e", "long": "--epochs", "type": "int", "default": 100 }, + { "short": "-d", "long": "--dimension", "type": "int", "default": 256 }, + { "long": "--admin_code_1", "default": "None" } + ] +} \ No newline at end of file