diff --git a/train_geocoder.py b/train_geocoder.py
index 71ff546eba853c4fdbf0c8eeff263021df737c15..4b7bfc3c94a20e19b44cac248b98538c44774ce2 100644
--- a/train_geocoder.py
+++ b/train_geocoder.py
@@ -1,78 +1,44 @@
 # Base module
-import re
 import os
-import json
+import sys
 
 # Structure
 import pandas as pd
 import numpy as np
-import geopandas as gpd
 
 # DEEPL module
 from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout
 from keras.models import Model
-from keras import backend as K
 from keras.callbacks import ModelCheckpoint
-import tensorflow as tf
-
-# Geometry
-from shapely.geometry import Point
-
 # Custom module
-from helpers import read_geonames
-from lib.utils_geo import Grid,zero_one_encoding, get_adjacency_rels, get_geonames_inclusion_rel,get_bounds
+from lib.utils_geo import zero_one_encoding
 from lib.ngram_index import NgramIndex
+from lib.word_index import WordIndex
 from lib.utils import ConfigurationReader
-from lib.metrics import lat_accuracy,lon_accuracy
-from lib.utils_geo import haversine_tf,accuracy_k,haversine_tf_1circle
-
+from lib.utils_geo import accuracy_k,haversine_tf_1circle
+from helpers import EpochTimer
+from lib.datageneratorv4 import DataGenerator
 
 # Logging
-from tqdm import tqdm
 import logging
-from helpers import parse_title_wiki,EpochTimer
-
 logging.getLogger('gensim').setLevel(logging.WARNING)
-
-def get_new_ids(cooc_data,id_first_value):
-    """
-    Return new ids from cooccurrence data
-
-    Parameters
-    ----------
-    cooc_data : pd.DataFrame
-        cooccurrence da
-    id_first_value : int
-        id beginning value
-
-    Returns
-    -------
-    dict
-        new ids for each toponyms
-    """
-    topo_id = {}
-    id_ = id_first_value
-    for title in cooc_data.title.values:
-        if not title in topo_id:
-            id_+=1
-            topo_id[id_]=title
-    for interlinks in cooc_data.interlinks.values:
-        for interlink in interlinks.split("|"):
-            if not interlink in topo_id:
-                id_+=1
-                topo_id[id_]=interlink
-    return topo_id
-
-# LOGGING CONF
-logging.basicConfig(
+logging.basicConfig( # LOGGING CONF
     format='[%(asctime)s][%(levelname)s] %(message)s ', 
     datefmt='%m/%d/%Y %I:%M:%S %p',
     level=logging.INFO
     )
 
-args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
-    .parse_args()#("-i -a -w --wikipedia-cooc-fn ../data/wikipedia/cooccurrence_FR.txt -n 4 --ngram-word2vec-iter 1 -e 100 ../data/geonamesData/FR.txt ../data/geonamesData/hierarchy.txt".split())
+import tensorflow as tf
+try:
+    physical_devices = tf.config.list_physical_devices('GPU')
+    tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)
+except:
+    print("NO GPU FOUND...")
+
+# COMMAND ARGS
+args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\
+    .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split())
 
 #
 #################################################
@@ -82,236 +48,99 @@ MODEL_NAME = "Bi-LSTM_NGRAM"
 NGRAM_SIZE = args.ngram_size
 ACCURACY_TOLERANCE = args.tolerance_value
 EPOCHS = args.epochs
-ITER_ADJACENCY = args.adjacency_iteration
-COOC_SAMPLING_NUMBER = args.cooc_sample_size
 WORDVEC_ITER = args.ngram_word2vec_iter
-EMBEDDING_DIM = 256
+EMBEDDING_DIM = args.dimension
 #################################################
 ########## FILENAME VARIABLE ####################
 #################################################
-GEONAME_FN = args.geoname_input
-DATASET_NAME = args.geoname_input.split("/")[-1]
-GEONAMES_HIERARCHY_FN = 
args.geoname_hierachy_input -REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1 -ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format( - GEONAME_FN, - ITER_ADJACENCY, - REGION_SUFFIX_FN) - -COOC_FN = args.wikipedia_cooc_fn -PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format( - GEONAME_FN.split("/")[-1], - EPOCHS, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - REGION_SUFFIX_FN) - -REL_CODE="" +INCLUSION_FN = args.geoname_inclusion +ADJACENT_FN = args.geonames_adjacent +COOC_FN = args.wikipedia_cooc + +DATASET_NAME = args.dataset_name + +PREFIX_OUTPUT_FN = DATASET_NAME +PREFIX_OUTPUT_FN+="_{0}".format(NGRAM_SIZE) +EMBEDDING_FN = "outputs/{0}_embedding.npy".format(PREFIX_OUTPUT_FN) +PREFIX_OUTPUT_FN+="_{0}".format(EPOCHS) + if args.adjacency: PREFIX_OUTPUT_FN += "_A" - REL_CODE+= "A" if args.inclusion: PREFIX_OUTPUT_FN += "_I" - REL_CODE+= "I" -if args.wikipedia_cooc: - PREFIX_OUTPUT_FN += "_C" - REL_CODE+= "C" +if args.wikipedia: + PREFIX_OUTPUT_FN += "_P" MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN) INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN) HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN) -from lib.utils import MetaDataSerializer - -meta_data = MetaDataSerializer( - MODEL_NAME, - DATASET_NAME, - REL_CODE, - COOC_SAMPLING_NUMBER, - ITER_ADJACENCY, - NGRAM_SIZE, - ACCURACY_TOLERANCE, - EPOCHS, - EMBEDDING_DIM, - WORDVEC_ITER, - INDEX_FN, - MODEL_OUTPUT_FN, - HISTORY_FN -) -meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN)) ############################################################################################# ################################# LOAD DATA ################################################# ############################################################################################# -# LOAD Geonames DATA -logging.info("Load Geonames data...") -geoname_data = read_geonames(GEONAME_FN).fillna("") - -train_indices = set(pd.read_csv(GEONAME_FN+"_train.csv").geonameid.values) -test_indices = set(pd.read_csv(GEONAME_FN+"_test.csv").geonameid.values) +data_used = [] -logging.info("Geonames data loaded!") +if args.wikipedia: + data_used.append(pd.read_csv(COOC_FN,sep="\t")) -# SELECT ENTRY with class == to A and P (Areas and Populated Places) -filtered = geoname_data[geoname_data.feature_class.isin("A P".split())].copy() #Â Only take area and populated places -#CLEAR RAM -del geoname_data - - -# IF REGION -if args.admin_code_1 != "None": - filtered = filtered[filtered.admin1_code == args.admin_code_1].copy() +if args.inclusion: + data_used.append(pd.read_csv(INCLUSION_FN,sep="\t")) -# GET BOUNDS AND REDUCE DATA AVAILABLE FIELDS -filtered = filtered["geonameid name longitude latitude".split()] # KEEP ONLY ID LABEL AND COORD +if args.adjacency: + data_used.append(pd.read_csv(ADJACENT_FN, sep="\t")) +if len(data_used) <1: + print("No Type of toponyms indicated. Stopping the program...") + sys.exit(1) +pairs_of_toponym = pd.concat(data_used) ############################################################################################# ################################# RETRIEVE RELATIONSHIPS #################################### ############################################################################################# - -# INITIALIZE RELATION STORE -rel_store = [] - -# Retrieve adjacency relationships -if args.adjacency: - logging.info("Retrieve adjacency relationships ! 
") - - if not os.path.exists(ADJACENCY_REL_FILENAME): - bounds = get_bounds(filtered) # Required to get adjacency relationships - rel_store.extend(get_adjacency_rels(filtered,bounds,[360,180],ITER_ADJACENCY)) - json.dump(rel_store,open(ADJACENCY_REL_FILENAME,'w')) - else: - logging.info("Open and load data from previous computation!") - rel_store=json.load(open(ADJACENCY_REL_FILENAME)) - - logging.info("{0} adjacency relationships retrieved ! ".format(len(rel_store))) - -# Retrieve inclusion relationships -if args.inclusion: - logging.info("Retrieve inclusion relationships ! ") - - cpt_rel = len(rel_store) - rel_store.extend(get_geonames_inclusion_rel(filtered,GEONAMES_HIERARCHY_FN)) - - logging.info("{0} inclusion relationships retrieved ! ".format(len(rel_store)-cpt_rel)) - - - -if args.wikipedia_cooc: - logging.info("Load Wikipedia Cooccurrence data and merge with geonames") - - cooc_data = pd.read_csv(COOC_FN,sep="\t") - cooc_data["title"] = cooc_data.title.apply(parse_title_wiki) - cooc_data["interlinks"] = cooc_data.interlinks.apply(parse_title_wiki) - id_wikipediatitle = get_new_ids(cooc_data,filtered.geonameid.max()) - wikipediatitle_id = {v:k for k,v in id_wikipediatitle.items()} - title_coord = {row.title: (row.longitude,row.latitude) for _,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))} - cooc_data["geonameid"] = cooc_data.title.apply(lambda x: wikipediatitle_id[x]) - filtered = pd.concat((filtered,cooc_data["geonameid title longitude latitude".split()].rename(columns={"title":"name"}).copy())) - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv",sep="\t"), pd.read_csv(COOC_FN+"_test.csv",sep="\t") - if not "title" in train_cooc_indices: - train_cooc_indices,test_cooc_indices = pd.read_csv(COOC_FN+"_train.csv"), pd.read_csv(COOC_FN+"_test.csv") - train_indices = train_indices.union(set(train_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - test_indices = test_indices.union(set(test_cooc_indices.title.apply(lambda x: wikipediatitle_id[parse_title_wiki(x)]).values)) - - logging.info("Merged with Geonames data !") - - # EXTRACT rel - logging.info("Extracting cooccurrence relationships") - cpt=0 - for ix, row in tqdm(cooc_data.iterrows(),total=len(cooc_data),desc="Extracting Wikipedia Cooccurrence"): - for inter in np.random.choice(row.interlinks.split("|"),COOC_SAMPLING_NUMBER): - cpt+=1 - rel_store.extend([[row.geonameid,wikipediatitle_id[inter]]]) - logging.info("Extract {0} cooccurrence relationships !".format(cpt)) - - -# STORE ID to name -geoname2name = dict(filtered["geonameid name".split()].values) - # ENCODING NAME USING N-GRAM SPLITTING logging.info("Encoding toponyms to ngram...") index = NgramIndex(NGRAM_SIZE) +if args.tokenization_method == "word-level": + index = WordIndex() +if args.tokenization_method == "bert": + index = NgramIndex(NGRAM_SIZE,bert_tokenization=True) # Identify all ngram available -filtered.name.apply(lambda x : index.split_and_add(x)) -if args.wikipedia_cooc:[index.split_and_add(k) for k in wikipediatitle_id] - -geoname2encodedname = {row.geonameid : index.encode(row.name) for row in filtered.itertuples()} #init a dict with the 'geonameid' --> 'encoded toponym' association +pairs_of_toponym.toponym.apply(lambda x : index.split_and_add(x)) +pairs_of_toponym.toponym_context.apply(lambda x : index.split_and_add(x)) -if args.wikipedia_cooc: - geoname2encodedname.update({v:index.encode(k) for k,v in wikipediatitle_id.items()}) +num_words = len(index.index_ngram) # necessary for the 
embedding matrix # SAVE THE INDEX TO REUSE THE MODEL index.save(INDEX_FN) - logging.info("Done !") - ############################################################################################# -################################# ENCODE COORDINATES ######################################## +################################# NGRAM EMBEDDINGS ########################################## ############################################################################################# - -# Encode each geonames entry coordinates -geoname_vec = {row.geonameid : zero_one_encoding(row.longitude,row.latitude) for row in filtered.itertuples()} -# CLEAR RAM -del filtered - - -EMBEDDING_DIM = 256 -num_words = len(index.index_ngram) # necessary for the embedding matrix - -logging.info("Preparing Input and Output data...") - +if os.path.exists(EMBEDDING_FN): + logging.info("Load previous N-GRAM Embedding...") + embedding_weights = np.load(EMBEDDING_FN) + logging.info("Embedding loaded ! ") +else: + logging.info("Generating N-GRAM Embedding...") + embedding_weights = index.get_embedding_layer([index.encode(p) for p in np.concatenate((pairs_of_toponym.toponym.unique(),pairs_of_toponym.toponym_context.unique()))],dim= EMBEDDING_DIM,iter=WORDVEC_ITER) + np.save(EMBEDDING_FN,embedding_weights) + logging.info("Embedding generated !") ############################################################################################# ################################# BUILD TRAIN/TEST DATASETS ################################# ############################################################################################# +logging.info("Preparing Input and Output data...") -X_1_train,X_2_train,y_lat_train,y_lon_train=[],[],[],[] -X_1_test,X_2_test,y_lat_test,y_lon_test=[],[],[],[] -y_train,y_test = [],[] - -for couple in rel_store: - geonameId_1,geonameId_2 = couple[0],couple[1] - if not geonameId_1 in geoname2encodedname: - continue - top1,top2 = geoname2encodedname[geonameId_1],geoname2encodedname[geonameId_2] - if geonameId_1 in train_indices: #and geonameId_2 in train_indices: - - X_1_train.append(top1) - X_2_train.append(top2) - - y_train.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) - #y_lon_train.append(geoname_vec[geonameId_1][0]) - #y_lat_train.append(geoname_vec[geonameId_1][1]) - - else: - X_1_test.append(top1) - X_2_test.append(top2) - - y_test.append([geoname_vec[geonameId_1][0],geoname_vec[geonameId_1][1]]) - #y_lon_test.append(geoname_vec[geonameId_1][0]) - #y_lat_test.append(geoname_vec[geonameId_1][1]) - -# NUMPYZE inputs and output lists -X_1_train = np.array(X_1_train) -X_2_train = np.array(X_2_train) -y_lat_train = np.array(y_lat_train) -y_lon_train = np.array(y_lon_train) -y_train = np.array(y_train) - -X_1_test = np.array(X_1_test) -X_2_test = np.array(X_2_test) -y_lat_test = np.array(y_lat_test) -y_lon_test = np.array(y_lon_test) -y_test = np.array(y_test) +training_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "train"],index) +validation_generator = DataGenerator(pairs_of_toponym[pairs_of_toponym.split == "test"],index) logging.info("Data prepared !") @@ -320,20 +149,11 @@ logging.info("Data prepared !") if not os.path.exists("outputs/"): os.makedirs("outputs/") -############################################################################################# -################################# NGRAM EMBEDDINGS ########################################## -############################################################################################# - - 
-logging.info("Generating N-GRAM Embedding...") -embedding_weights = index.get_embedding_layer(geoname2encodedname.values(),dim= EMBEDDING_DIM,iter=WORDVEC_ITER) -logging.info("Embedding generated !") ############################################################################################# ################################# MODEL DEFINITION ########################################## ############################################################################################# - input_1 = Input(shape=(index.max_len,)) input_2 = Input(shape=(index.max_len,)) @@ -342,36 +162,40 @@ embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len, x1 = embedding_layer(input_1) x2 = embedding_layer(input_2) +if not args.previous_state: #Â Each LSTM learn on a permutation of the input toponyms -x1 = Bidirectional(LSTM(98))(x1) -x2 = Bidirectional(LSTM(98))(x2) - -x = concatenate([x1,x2])#,x3]) - -x1 = Dense(500,activation="relu")(x) -# x1 = Dropout(0.3)(x1) -x1 = Dense(500,activation="relu")(x1) -# x1 = Dropout(0.3)(x1) - -x2 = Dense(500,activation="relu")(x) -# x2 = Dropout(0.3)(x2) -x2 = Dense(500,activation="relu")(x2) -# x2 = Dropout(0.3)(x2) - -output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) -output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) - -output_coord = concatenate([output_lon,output_lat],name="output_coord") - -model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 - -model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)}) - -# model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3 - -# model.compile(loss=['mean_squared_error','mean_squared_error'], optimizer='adam',metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()}) - - + if args.lstm_layer == 2: + x1 = Bidirectional(LSTM(100))(x1) + x2 = Bidirectional(LSTM(100))(x2) + x = concatenate([x1,x2]) + else: + lstm_unique_layer = Bidirectional(LSTM(100)) + x1 = lstm_unique_layer(x1) + x2 = lstm_unique_layer(x2) + x = concatenate([x1,x2]) + + x1 = Dense(500,activation="relu")(x) + x1 = Dense(500,activation="relu")(x1) + + x2 = Dense(500,activation="relu")(x) + x2 = Dense(500,activation="relu")(x2) + + output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1) + output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2) + + output_coord = concatenate([output_lon,output_lat],name="output_coord") + + model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3 + model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)}) + +else: + if not os.path.exists(args.previous_state): + print("Model previous state was not found ! 
") + sys.exit(1) + print("Load Previous state of the model...") + model = tf.keras.models.load_model(args.previous_state,custom_objects={"haversine_tf_1circle":haversine_tf_1circle,"compute_metric":accuracy_k(100)}) +print("Neural Network Architecture : ") +print(model.summary()) ############################################################################################# ################################# TRAINING LAUNCH ########################################### ############################################################################################# @@ -382,12 +206,11 @@ checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose= epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv") -history = model.fit(x=[X_1_train,X_2_train], - y=y_train,#[y_lon_train,y_lat_train], - verbose=True, batch_size=100, - epochs=EPOCHS, - validation_data=([X_1_test,X_2_test],y_test),#[y_lon_test,y_lat_test]), - callbacks=[checkpoint,epoch_timer]) + +history = model.fit(training_generator,verbose=True, + validation_data=validation_generator, + callbacks=[checkpoint,epoch_timer],epochs=EPOCHS) + hist_df = pd.DataFrame(history.history) @@ -397,5 +220,8 @@ model.save(MODEL_OUTPUT_FN) #Â Erase Model Checkpoint file if os.path.exists(MODEL_OUTPUT_FN + ".part"): - import shutil - shutil.rmtree(MODEL_OUTPUT_FN + ".part") \ No newline at end of file + try: + import shutil + shutil.rmtree(MODEL_OUTPUT_FN + ".part") + except: #Â Depends on Keras version + os.remove(MODEL_OUTPUT_FN + ".part") \ No newline at end of file diff --git a/train_geocoder_v2.py b/train_geocoder_v2.py deleted file mode 100644 index 4b7bfc3c94a20e19b44cac248b98538c44774ce2..0000000000000000000000000000000000000000 --- a/train_geocoder_v2.py +++ /dev/null @@ -1,227 +0,0 @@ -# Base module -import os -import sys - -#Â Structure -import pandas as pd -import numpy as np - -#Â DEEPL module -from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM, Dropout -from keras.models import Model -from keras.callbacks import ModelCheckpoint - -#Â Custom module -from lib.utils_geo import zero_one_encoding -from lib.ngram_index import NgramIndex -from lib.word_index import WordIndex -from lib.utils import ConfigurationReader -from lib.utils_geo import accuracy_k,haversine_tf_1circle -from helpers import EpochTimer -from lib.datageneratorv4 import DataGenerator - -# Logging -import logging -logging.getLogger('gensim').setLevel(logging.WARNING) -logging.basicConfig( # LOGGING CONF - format='[%(asctime)s][%(levelname)s] %(message)s ', - datefmt='%m/%d/%Y %I:%M:%S %p', - level=logging.INFO - ) - -import tensorflow as tf -try: - physical_devices = tf.config.list_physical_devices('GPU') - tf.config.experimental.set_memory_growth(physical_devices[0], enable=True) -except: - print("NO GPU FOUND...") - -#Â COMMAND ARGS -args = ConfigurationReader("./parser_config/toponym_combination_embedding_v3.json")\ - .parse_args()#("IGN ../data/IGN/IGN_inclusion.csv ../data/IGN/IGN_adjacent_corrected.csv ../data/IGN/IGN_cooc.csv -i -w -a -n 4 --ngram-word2vec-iter 1".split()) - -# -################################################# -############# MODEL TRAINING PARAMETER ########## -################################################# -MODEL_NAME = "Bi-LSTM_NGRAM" -NGRAM_SIZE = args.ngram_size -ACCURACY_TOLERANCE = args.tolerance_value -EPOCHS = args.epochs -WORDVEC_ITER = args.ngram_word2vec_iter -EMBEDDING_DIM = args.dimension -################################################# -########## FILENAME VARIABLE 