diff --git a/combination_embeddingsv3.py b/combination_embeddingsv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccb10318413925cac5f23238ee30f2b42d44e94d
--- /dev/null
+++ b/combination_embeddingsv3.py
@@ -0,0 +1,216 @@
+# Base module
+import os
+
+# Structure
+import pandas as pd
+import numpy as np
+
+# DEEPL module
+from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint
+from tensorflow.keras.layers import Lambda
+import keras.backend as K
+import tensorflow as tf
+from lib.custom_layer import *
+
+# Custom module
+from lib.ngram_index import NgramIndex
+from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
+from lib.metrics import lat_accuracy,lon_accuracy
+from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
+from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
+
+# Logging
+import logging
+
+logging.getLogger('gensim').setLevel(logging.WARNING)
+
+from helpers import EpochTimer
+
+# LOGGING CONF
+logging.basicConfig(
+    format='[%(asctime)s][%(levelname)s] %(message)s ',
+    datefmt='%m/%d/%Y %I:%M:%S %p',
+    level=logging.INFO
+    )
+
+args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
+    .parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+
+#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
+
+#
+#################################################
+############# MODEL TRAINING PARAMETER ##########
+#################################################
+NGRAM_SIZE = args.ngram_size
+ACCURACY_TOLERANCE = args.k_value
+EPOCHS = args.epochs
+ADJACENCY_SAMPLING = args.adjacency_sample
+COOC_SAMPLING = args.cooc_sample
+WORDVEC_ITER = 50
+EMBEDDING_DIM = args.dimension
+BATCH_SIZE = args.batch_size
+#################################################
+########## FILENAME VARIABLE ####################
+#################################################
+# check for output dir
+if not os.path.exists("outputs/"):
+    os.makedirs("outputs/")
+
+GEONAME_FN = args.geoname_input
+DATASET_NAME = args.geoname_input.split("/")[-1]
+GEONAMES_HIERARCHY_FN = args.inclusion_fn
+ADJACENCY_REL_FILENAME = args.adjacency_fn
+COOC_FN = args.wikipedia_cooc_fn
+
+PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
+    GEONAME_FN.split("/")[-1],
+    EPOCHS,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE)
+
+REL_CODE=""
+if args.adjacency:
+    PREFIX_OUTPUT_FN += "_A"
+    REL_CODE+= "A"
+if args.inclusion:
+    PREFIX_OUTPUT_FN += "_I"
+    REL_CODE+= "I"
+if args.wikipedia_cooc:
+    PREFIX_OUTPUT_FN += "_C"
+    REL_CODE+= "C"
+
+MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
+INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
+HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
+
+
+meta_data = MetaDataSerializer(
+    DATASET_NAME,
+    REL_CODE,
+    COOC_SAMPLING,
+    ADJACENCY_SAMPLING,
+    NGRAM_SIZE,
+    ACCURACY_TOLERANCE,
+    EPOCHS,
+    EMBEDDING_DIM,
+    WORDVEC_ITER,
+    INDEX_FN,
+    MODEL_OUTPUT_FN,
+    HISTORY_FN
+)
+meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
+
+
+### PUT DATASRC + GENERATOR
+
+index = NgramIndex.load(args.ngram_index_fn)
+
+train_src = []
+test_src = []
+
+class_encoder = LabelEncoder()
+
+if args.wikipedia_cooc:
+    train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4,use_healpix=False))
+    test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4,use_healpix=False))
+
+if args.adjacency:
+    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
+    train_src.append(a_train)
+    test_src.append(a_test)
+
+if args.inclusion:
+    i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
+    i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
+    train_src.append(i_train)
+    test_src.append(i_test)
+#Adjacency
+
+print("Number of classes:",class_encoder.get_num_classes())
+
+d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
+d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
+
+num_words = len(index.index_ngram)
+
+#############################################################################################
+################################# NGRAM EMBEDDINGS ##########################################
+#############################################################################################
+
+embedding_weights = load_embedding(args.embedding_fn)
+EMBEDDING_DIM = len(embedding_weights[0])
+
+#############################################################################################
+################################# MODEL DEFINITION ##########################################
+#############################################################################################
+
+from keras import regularizers
+####
+
+input_1 = Input(shape=(index.max_len,))
+input_2 = Input(shape=(index.max_len,))
+
+embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
+
+x1 = embedding_layer(input_1)
+x2 = embedding_layer(input_2)
+
+# Each LSTM learns on a permutation of the input toponyms
+biLSTM = Bidirectional(GRU(128,activation="pentanh", recurrent_activation="pentanh"))
+x1 = biLSTM(x1)
+x2 = biLSTM(x2)
+
+x = concatenate([x1,x2])#,x3])
+
+x1 = Dense(500,activation="relu")(x)
+x1 = Dropout(0.3)(x1)
+x1 = Dense(500,activation="relu")(x1)
+x1 = Dropout(0.3)(x1)
+
+x2 = Dense(500,activation="relu")(x)
+x2 = Dropout(0.3)(x2)
+x2 = Dense(500,activation="relu")(x2)
+x2 = Dropout(0.3)(x2)
+
+#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)
+
+output_lon = Dense(1,activation="sigmoid")(x1)
+output_lat = Dense(1,activation="sigmoid")(x2)
+
+output_coord = concatenate([output_lon,output_lat],name="output_coord")
+
+#####
+model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
+
+model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
+
+model.summary()
+#############################################################################################
+################################# TRAINING LAUNCH ###########################################
+#############################################################################################
+
+checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
+    save_best_only=True, mode='auto', period=1)
+
+epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
+
+
+history = model.fit_generator(generator=d_train,
+    validation_data=d_test,
+    verbose=True,
+    epochs=EPOCHS,
+    callbacks=[checkpoint,epoch_timer])
+
+
+hist_df = pd.DataFrame(history.history)
+hist_df.to_csv(HISTORY_FN)
+
+model.save(MODEL_OUTPUT_FN)
+
+# Erase Model Checkpoint file
+if os.path.exists(MODEL_OUTPUT_FN + ".part"):
+    os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
diff --git a/train_test_split_cooccurrence_data.py b/train_test_split_cooccurrence_data.py
index edacbaa85ca2aff468114ecfe7a1aaae1a9fadcd..4438aaf6a210b6b95d442a87e6866e833397a175
--- a/train_test_split_cooccurrence_data.py
+++ b/train_test_split_cooccurrence_data.py
@@ -14,7 +14,7 @@ logging.basicConfig(
 
 from sklearn.model_selection import train_test_split
 from shapely.geometry import Point
-from lib.geo import Grid,latlon2healpix
+from lib.geo import latlon2healpix
 
 from tqdm import tqdm
 
@@ -27,33 +27,10 @@ args = parser.parse_args()#("data/wikipedia/cooccurrence_FR.txt".split())#("data
 # LOAD DATAgeopandas
 COOC_FN = args.cooccurrence_file
-
-
 logging.info("Load Cooc DATA data...")
 cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("")
-# cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
-# cooc_data = gpd.GeoDataFrame(cooc_data)
 logging.info("Cooc data loaded!")
-# # World Shape bounds
-# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
-# world["nn"] = 1
-# dissolved = world.dissolve(by="nn").iloc[0].geometry
-
-# #Creating Grid
-# logging.info("Initializing Grid (360,180)...")
-# g = Grid(*dissolved.bounds,[360,180])
-# logging.info("Fit Data to the Grid...")
-# g.fit_data(cooc_data)
-# logging.info("Placing place into the grid...")
-# [g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))]
-
-# #ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
-# logging.info("Associate a cell number to each place in the Geoname Dataframe")
-# def foo(g,id_):
-#     for ix,cell in enumerate(g.cells):
-#         if id_ in cell.list_object:
-#             return ix
 
 cooc_data["cat"] = cooc_data.apply(lambda x:latlon2healpix(x.latitude,x.longitude,64),axis=1)
 
@@ -79,12 +56,9 @@ for i in np.unique(cooc_data.cat.values):
     except Exception as e:
         print(e)
         #print("Error",len(filtered[filtered.cat == i]))
-# del X_train["geometry"]
-# del X_train["nn"]
 del X_train["cat"]
 del X_test["cat"]
-# del X_test["geometry"]
-# del X_test["nn"]
+
 # SAVING THE DATA
 logging.info("Saving Output !")
 suffix =""