Skip to content
Snippets Groups Projects
Commit 1576e3e5 authored by Jacques Fize's avatar Jacques Fize
Browse files

Change files organisation

parent 6fe5e31c
No related branches found
No related tags found
No related merge requests found
# Base module
import os
# Structure
import pandas as pd
import numpy as np
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
from lib.metrics import lat_accuracy,lon_accuracy
from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
meta_data = MetaDataSerializer(
DATASET_NAME,
REL_CODE,
COOC_SAMPLING,
ADJACENCY_SAMPLING,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)
train_src = []
test_src = []
class_encoder = LabelEncoder()
if args.wikipedia_cooc:
train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4))
test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4))
if args.adjacency:
a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
train_src.append(a_train)
test_src.append(a_test)
if args.inclusion:
i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
train_src.append(i_train)
test_src.append(i_test)
#Adjacency
d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
num_words = len(index.index_ngram)
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learn on a permutation of the input toponyms
biLSTM = Bidirectional(LSTM(64,activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x2,x1])#,x3])
x1 = Dense(1000,activation="pentanh")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(1000,activation="pentanh")(x1)
# x1 = Dropout(0.3)(x1)
x2 = Dense(1000,activation="pentanh")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(1000,activation="pentanh")(x2)
# x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
output = concatenate([output_lon,output_lat],name="output_layer")
model = Model(inputs = [input_1,input_2], outputs = output)#input_3
model.compile(loss={"output_layer":haversine_tf_1circle}, optimizer='adam',metrics={"output_layer":accuracy_k(ACCURACY_TOLERANCE)})
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=d_train,
validation_data=d_test,
verbose=True,
epochs=EPOCHS,
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
# Base module
import os
# Structure
import pandas as pd
import numpy as np
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
from lib.metrics import lat_accuracy,lon_accuracy
from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer
# LOGGING CONF
logging.basicConfig(
format='[%(asctime)s][%(levelname)s] %(message)s ',
datefmt='%m/%d/%Y %I:%M:%S %p',
level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
.parse_args()#("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#.parse_args("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
os.makedirs("outputs/")
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
GEONAME_FN.split("/")[-1],
EPOCHS,
NGRAM_SIZE,
ACCURACY_TOLERANCE)
REL_CODE=""
if args.adjacency:
PREFIX_OUTPUT_FN += "_A"
REL_CODE+= "A"
if args.inclusion:
PREFIX_OUTPUT_FN += "_I"
REL_CODE+= "I"
if args.wikipedia_cooc:
PREFIX_OUTPUT_FN += "_C"
REL_CODE+= "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
meta_data = MetaDataSerializer(
DATASET_NAME,
REL_CODE,
COOC_SAMPLING,
ADJACENCY_SAMPLING,
NGRAM_SIZE,
ACCURACY_TOLERANCE,
EPOCHS,
EMBEDDING_DIM,
WORDVEC_ITER,
INDEX_FN,
MODEL_OUTPUT_FN,
HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)
train_src = []
test_src = []
class_encoder = LabelEncoder()
if args.wikipedia_cooc:
train_src.append(CoOccurrences(COOC_FN + "_train.csv",class_encoder,sampling=4,use_healpix=False))
test_src.append(CoOccurrences(COOC_FN + "_test.csv",class_encoder,sampling=4,use_healpix=False))
if args.adjacency:
a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv",GEONAME_FN,sampling=ADJACENCY_SAMPLING,gzip=False)
train_src.append(a_train)
test_src.append(a_test)
if args.inclusion:
i_train = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_train.csv")
i_test = Inclusion(GEONAME_FN,GEONAMES_HIERARCHY_FN+"_test.csv")
train_src.append(i_train)
test_src.append(i_test)
#Adjacency
print("Number of classes:",class_encoder.get_num_classes())
d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
num_words = len(index.index_ngram)
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers
####
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learn on a permutation of the input toponyms
biLSTM = Bidirectional(GRU(128,activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x1,x2])#,x3])
x1 = Dense(500,activation="relu")(x)
x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x)
x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
x2 = Dropout(0.3)(x2)
#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)
output_lon = Dense(1,activation="sigmoid")(x1)
output_lat = Dense(1,activation="sigmoid")(x2)
output_coord = concatenate([output_lon,output_lat],name="output_coord")
#####
model = Model(inputs = [input_1,input_2], outputs = output_coord)#input_3
model.compile(loss={"output_coord":haversine_tf_1circle}, optimizer='adam',metrics={"output_coord":accuracy_k(ACCURACY_TOLERANCE)})
model.summary()
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
save_best_only=True, mode='auto', period=1)
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=d_train,
validation_data=d_test,
verbose=True,
epochs=EPOCHS,
callbacks=[checkpoint,epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
os.remove(MODEL_OUTPUT_FN + ".part")
\ No newline at end of file
File moved
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment