diff --git a/combination_embeddingsv2.py b/combination_embeddingsv2.py
deleted file mode 100644
index 96fefc70e3eab286d6bf833bd9fd0c950c82ff3e..0000000000000000000000000000000000000000
--- a/combination_embeddingsv2.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Base module
-import os
-
-# Structure
-import pandas as pd
-import numpy as np
-
-# DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
-from keras.models import Model
-from keras.callbacks import ModelCheckpoint
-from tensorflow.keras.layers import Lambda
-import keras.backend as K
-import tensorflow as tf
-from lib.custom_layer import *
-
-# Custom module
-from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
-from lib.metrics import lat_accuracy,lon_accuracy
-from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
-from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
-
-# Logging
-import logging
-
-logging.getLogger('gensim').setLevel(logging.WARNING)
-
-from helpers import EpochTimer
-
-# LOGGING CONF
-logging.basicConfig(
-    format='[%(asctime)s][%(levelname)s] %(message)s ',
-    datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
-    )
-
-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
-
-#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
-
-#
-#################################################
-############# MODEL TRAINING PARAMETER ##########
-#################################################
-NGRAM_SIZE = args.ngram_size
-ACCURACY_TOLERANCE = args.k_value
-EPOCHS = args.epochs
-ADJACENCY_SAMPLING = args.adjacency_sample
-COOC_SAMPLING = args.cooc_sample
-WORDVEC_ITER = 50
-EMBEDDING_DIM = args.dimension
-BATCH_SIZE = args.batch_size
-#################################################
-########## FILENAME VARIABLE ####################
-#################################################
-# check for output dir
-if not os.path.exists("outputs/"):
-    os.makedirs("outputs/")
-
-GEONAME_FN = args.geoname_input
-DATASET_NAME = args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = args.inclusion_fn
-ADJACENCY_REL_FILENAME = args.adjacency_fn
-COOC_FN = args.wikipedia_cooc_fn
-
-PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
-    GEONAME_FN.split("/")[-1],
-    EPOCHS,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE)
-
-REL_CODE=""
-if args.adjacency:
-    PREFIX_OUTPUT_FN += "_A"
-    REL_CODE+= "A"
-if args.inclusion:
-    PREFIX_OUTPUT_FN += "_I"
-    REL_CODE+= "I"
-if args.wikipedia_cooc:
-    PREFIX_OUTPUT_FN += "_C"
-    REL_CODE+= "C"
-
-MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
-INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
-HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
-
-
-meta_data = MetaDataSerializer(
-    DATASET_NAME,
-    REL_CODE,
-    COOC_SAMPLING,
-    ADJACENCY_SAMPLING,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    EPOCHS,
-    EMBEDDING_DIM,
-    WORDVEC_ITER,
-    INDEX_FN,
-    MODEL_OUTPUT_FN,
-    HISTORY_FN
-)
-meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
-
-
-### PUT DATASRC + GENERATOR
-
-index = NgramIndex.load(args.ngram_index_fn)
-
-train_src = []
-test_src = []
-
-class_encoder = LabelEncoder()
-if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
-
-if args.adjacency:
-    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
-    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
-    train_src.append(a_train)
-    test_src.append(a_test)
-
-if args.inclusion:
-    i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
-    i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
-    train_src.append(i_train)
-    test_src.append(i_test)
-#Adjacency
-
-
-
-d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
-d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
-
-num_words = len(index.index_ngram)
-
-#############################################################################################
-################################# NGRAM EMBEDDINGS ##########################################
-#############################################################################################
-
-embedding_weights = load_embedding(args.embedding_fn)
-EMBEDDING_DIM = len(embedding_weights[0])
-
-#############################################################################################
-################################# MODEL DEFINITION ##########################################
-#############################################################################################
-
-from keras import regularizers
-
-input_1 = Input(shape=(index.max_len,))
-input_2 = Input(shape=(index.max_len,))
-
-embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
-
-x1 = embedding_layer(input_1)
-x2 = embedding_layer(input_2)
-
-# Each LSTM learn on a permutation of the input toponyms
-biLSTM = Bidirectional(LSTM(64,activation="pentanh", recurrent_activation="pentanh"))
-x1 = biLSTM(x1)
-x2 = biLSTM(x2)
-x = concatenate([x2,x1])#,x3])
-
-x1 = Dense(1000,activation="pentanh")(x)
-# x1 = Dropout(0.3)(x1)
-x1 = Dense(1000,activation="pentanh")(x1)
-# x1 = Dropout(0.3)(x1)
-
-x2 = Dense(1000,activation="pentanh")(x)
-# x2 = Dropout(0.3)(x2)
-x2 = Dense(1000,activation="pentanh")(x2)
-# x2 = Dropout(0.3)(x2)
-
-output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
-output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
-
-output = concatenate([output_lon,output_lat],name="output_layer")
-
-model = Model(inputs = [input_1,input_2], outputs = output)#input_3
-
-model.compile(loss={"output_layer":haversine_tf_1circle}, optimizer='adam',metrics={"output_layer":accuracy_k(ACCURACY_TOLERANCE)})
-
-#############################################################################################
-################################# TRAINING LAUNCH ###########################################
-#############################################################################################
-
-checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
-    save_best_only=True, mode='auto', period=1)
-
-epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
-
-
-history = model.fit_generator(generator=d_train,
-    validation_data=d_test,
-    verbose=True,
-    epochs=EPOCHS,
-    callbacks=[checkpoint,epoch_timer])
-
-
-hist_df = pd.DataFrame(history.history)
-hist_df.to_csv(HISTORY_FN)
-
-model.save(MODEL_OUTPUT_FN)
-
-# Erase Model Checkpoint file
-if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
diff --git a/combination_embeddingsv3.py b/combination_embeddingsv3.py
deleted file mode 100644
index ccb10318413925cac5f23238ee30f2b42d44e94d..0000000000000000000000000000000000000000
--- a/combination_embeddingsv3.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Base module
-import os
-
-# Structure
-import pandas as pd
-import numpy as np
-
-# DEEPL module
-from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
-from keras.models import Model
-from keras.callbacks import ModelCheckpoint
-from tensorflow.keras.layers import Lambda
-import keras.backend as K
-import tensorflow as tf
-from lib.custom_layer import *
-
-# Custom module
-from lib.ngram_index import NgramIndex
-from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
-from lib.metrics import lat_accuracy,lon_accuracy
-from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
-from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
-
-# Logging
-import logging
-
-logging.getLogger('gensim').setLevel(logging.WARNING)
-
-from helpers import EpochTimer
-
-# LOGGING CONF
-logging.basicConfig(
-    format='[%(asctime)s][%(levelname)s] %(message)s ',
-    datefmt='%m/%d/%Y %I:%M:%S %p',
-    level=logging.INFO
-    )
-
-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
-
-#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
-
-#
-#################################################
-############# MODEL TRAINING PARAMETER ##########
-#################################################
-NGRAM_SIZE = args.ngram_size
-ACCURACY_TOLERANCE = args.k_value
-EPOCHS = args.epochs
-ADJACENCY_SAMPLING = args.adjacency_sample
-COOC_SAMPLING = args.cooc_sample
-WORDVEC_ITER = 50
-EMBEDDING_DIM = args.dimension
-BATCH_SIZE = args.batch_size
-#################################################
-########## FILENAME VARIABLE ####################
-#################################################
-# check for output dir
-if not os.path.exists("outputs/"):
-    os.makedirs("outputs/")
-
-GEONAME_FN = args.geoname_input
-DATASET_NAME = args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = args.inclusion_fn
-ADJACENCY_REL_FILENAME = args.adjacency_fn
-COOC_FN = args.wikipedia_cooc_fn
-
-PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
-    GEONAME_FN.split("/")[-1],
-    EPOCHS,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE)
-
-REL_CODE=""
-if args.adjacency:
-    PREFIX_OUTPUT_FN += "_A"
-    REL_CODE+= "A"
-if args.inclusion:
-    PREFIX_OUTPUT_FN += "_I"
-    REL_CODE+= "I"
-if args.wikipedia_cooc:
-    PREFIX_OUTPUT_FN += "_C"
-    REL_CODE+= "C"
-
-MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
-INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
-HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
-
-
-meta_data = MetaDataSerializer(
-    DATASET_NAME,
-    REL_CODE,
-    COOC_SAMPLING,
-    ADJACENCY_SAMPLING,
-    NGRAM_SIZE,
-    ACCURACY_TOLERANCE,
-    EPOCHS,
-    EMBEDDING_DIM,
-    WORDVEC_ITER,
-    INDEX_FN,
-    MODEL_OUTPUT_FN,
-    HISTORY_FN
-)
-meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
-
-
-### PUT DATASRC + GENERATOR
-
-index = NgramIndex.load(args.ngram_index_fn)
-
-train_src = []
-test_src = []
-
-class_encoder = LabelEncoder()
-
-if args.wikipedia_cooc:
-    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4,use_healpix=False))
-    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4,use_healpix=False))
-
-if args.adjacency:
-    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
-    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
-    train_src.append(a_train)
-    test_src.append(a_test)
-
-if args.inclusion:
-    i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
-    i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
-    train_src.append(i_train)
-    test_src.append(i_test)
-#Adjacency
-
-print("Number of classes:",class_encoder.get_num_classes())
-
-d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
-d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
-
-num_words = len(index.index_ngram)
-
-#############################################################################################
-################################# NGRAM EMBEDDINGS ##########################################
-#############################################################################################
-
-embedding_weights = load_embedding(args.embedding_fn)
-EMBEDDING_DIM = len(embedding_weights[0])
-
-#############################################################################################
-################################# MODEL DEFINITION ##########################################
-#############################################################################################
-
-from keras import regularizers
-####
-
-input_1 = Input(shape=(index.max_len,))
-input_2 = Input(shape=(index.max_len,))
-
-embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
-
-x1 = embedding_layer(input_1)
-x2 = embedding_layer(input_2)
-
-# Each LSTM learn on a permutation of the input toponyms
-biLSTM = Bidirectional(GRU(128,activation="pentanh", recurrent_activation="pentanh"))
-x1 = biLSTM(x1)
-x2 = biLSTM(x2)
-
-x = concatenate([x1,x2])#,x3])
-
-x1 = Dense(500,activation="relu")(x)
-x1 = Dropout(0.3)(x1)
-x1 = Dense(500,activation="relu")(x1)
-x1 = Dropout(0.3)(x1)
-
-x2 = Dense(500,activation="relu")(x)
-x2 = Dropout(0.3)(x2)
-x2 = Dense(500,activation="relu")(x2)
-x2 = Dropout(0.3)(x2)
-
-#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)
-
-output_lon = Dense(1,activation="sigmoid")(x1)
-output_lat = Dense(1,activation="sigmoid")(x2)
-
-output_coord = concatenate([output_lon,output_lat],name="output_coord")
-
-#####
-model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
-
-model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
-
-model.summary()
-#############################################################################################
-################################# TRAINING LAUNCH ###########################################
-#############################################################################################
-
-checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
-    save_best_only=True, mode='auto', period=1)
-
-epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
-
-
-history = model.fit_generator(generator=d_train,
-    validation_data=d_test,
-    verbose=True,
-    epochs=EPOCHS,
-    callbacks=[checkpoint,epoch_timer])
-
-
-hist_df = pd.DataFrame(history.history)
-hist_df.to_csv(HISTORY_FN)
-
-model.save(MODEL_OUTPUT_FN)
-
-# Erase Model Checkpoint file
-if os.path.exists(MODEL_OUTPUT_FN + ".part"):
-    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
diff --git a/get_all_adjacency_rel.py b/scripts/get_all_adjacency_rel.py
similarity index 100%
rename from get_all_adjacency_rel.py
rename to scripts/get_all_adjacency_rel.py