Commit 810b1dcf authored by Jacques Fize

add new_version of train script

parent 196cc045
# Base module
import os
# Structure
import pandas as pd
import numpy as np
# DEEPL module
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM,Dropout,GRU
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Lambda
import keras.backend as K
import tensorflow as tf
from lib.custom_layer import *
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer,LabelEncoder
from lib.metrics import lat_accuracy,lon_accuracy
from lib.data_generator import DataGenerator,CoOccurrences,load_embedding,Inclusion,Adjacency
from lib.geo import haversine_tf,accuracy_k,haversine_tf_1circle
# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
from helpers import EpochTimer
# LOGGING CONF
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s ',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
    .parse_args()
# Example argument strings:
# ("-i --inclusion-fn ../data/geonamesData/hierarchy.txt ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
# ("-w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin".split())
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.k_value
EPOCHS = args.epochs
ADJACENCY_SAMPLING = args.adjacency_sample
COOC_SAMPLING = args.cooc_sample
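# Presumably the word2vec iteration count used to pretrain the ngram embeddings;
# hard-coded here and only recorded in the run metadata below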
WORDVEC_ITER = 50
EMBEDDING_DIM = args.dimension
BATCH_SIZE = args.batch_size
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
    os.makedirs("outputs/")
GEONAME_FN = args.geoname_input
DATASET_NAME = args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = args.inclusion_fn
ADJACENCY_REL_FILENAME = args.adjacency_fn
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}".format(
    GEONAME_FN.split("/")[-1],
    EPOCHS,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE)
REL_CODE = ""
if args.adjacency:
    PREFIX_OUTPUT_FN += "_A"
    REL_CODE += "A"
if args.inclusion:
    PREFIX_OUTPUT_FN += "_I"
    REL_CODE += "I"
if args.wikipedia_cooc:
    PREFIX_OUTPUT_FN += "_C"
    REL_CODE += "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
meta_data = MetaDataSerializer(
    DATASET_NAME,
    REL_CODE,
    COOC_SAMPLING,
    ADJACENCY_SAMPLING,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    EPOCHS,
    EMBEDDING_DIM,
    WORDVEC_ITER,
    INDEX_FN,
    MODEL_OUTPUT_FN,
    HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### DATA SOURCES + GENERATORS
index = NgramIndex.load(args.ngram_index_fn)
train_src = []
test_src = []
class_encoder = LabelEncoder()
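# Each enabled relation contributes a train/test pair of data sources:
# Wikipedia co-occurrences (args.wikipedia_cooc), Geonames adjacency (args.adjacency),
# and Geonames hierarchical inclusion (args.inclusion).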
if args.wikipedia_cooc:
    # NOTE: sampling is hard-coded to 4 here; COOC_SAMPLING (args.cooc_sample) is only recorded in the metadata
    train_src.append(CoOccurrences(COOC_FN + "_train.csv", class_encoder, sampling=4, use_healpix=False))
    test_src.append(CoOccurrences(COOC_FN + "_test.csv", class_encoder, sampling=4, use_healpix=False))
if args.adjacency:
    a_train = Adjacency(ADJACENCY_REL_FILENAME + "_train.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    a_test = Adjacency(ADJACENCY_REL_FILENAME + "_test.csv", GEONAME_FN, sampling=ADJACENCY_SAMPLING, gzip=False)
    train_src.append(a_train)
    test_src.append(a_test)
if args.inclusion:
    i_train = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_train.csv")
    i_test = Inclusion(GEONAME_FN, GEONAMES_HIERARCHY_FN + "_test.csv")
    train_src.append(i_train)
    test_src.append(i_test)
print("Number of classes:",class_encoder.get_num_classes())
d_train = DataGenerator(train_src,index,class_encoder,batch_size=BATCH_SIZE)
d_test = DataGenerator(test_src,index,class_encoder,batch_size=BATCH_SIZE)
num_words = len(index.index_ngram)
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
EMBEDDING_DIM = len(embedding_weights[0])
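# NOTE: EMBEDDING_DIM is overridden by the width of the loaded matrix,
# so the args.dimension value is effectively ignored from here on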
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
from keras import regularizers
####
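# Architecture sketch: the two co-occurring toponyms enter as ngram-id sequences,
# share a frozen embedding layer and a bidirectional recurrent encoder, and two
# separate dense towers regress longitude and latitude.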
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
# Shared, frozen embedding layer; load_embedding() above provides the pretrained ngram vectors
# (without weights=..., the frozen layer would keep its random initialization)
embedding_layer = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_weights], input_length=index.max_len, trainable=False)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# The same bidirectional recurrent encoder processes both input toponyms (shared weights);
# "pentanh" is a custom penalized-tanh activation, presumably registered by lib.custom_layer
biLSTM = Bidirectional(GRU(128, activation="pentanh", recurrent_activation="pentanh"))
x1 = biLSTM(x1)
x2 = biLSTM(x2)
x = concatenate([x1, x2])
x1 = Dense(500,activation="relu")(x)
x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x)
x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
x2 = Dropout(0.3)(x2)
#aux_layer = Dense(class_encoder.get_num_classes(),activation="softmax",name="aux_layer")(D)
output_lon = Dense(1,activation="sigmoid")(x1)
output_lat = Dense(1,activation="sigmoid")(x2)
output_coord = concatenate([output_lon,output_lat],name="output_coord")
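# Both outputs pass through sigmoids, so the targets are presumably lon/lat normalized to [0,1];
# the haversine_tf_1circle loss then scores predicted vs. true coordinates by great-circle distance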
#####
model = Model(inputs=[input_1, input_2], outputs=output_coord)
model.compile(loss={"output_coord": haversine_tf_1circle}, optimizer='adam', metrics={"output_coord": accuracy_k(ACCURACY_TOLERANCE)})
model.summary()
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
                             save_best_only=True, mode='auto', period=1)
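# Best weights are staged in a ".part" file so an interrupted run never clobbers a
# previously saved final model; the staging file is removed after model.save() below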
epoch_timer = EpochTimer("outputs/"+PREFIX_OUTPUT_FN+"_epoch_timer_output.csv")
history = model.fit_generator(generator=d_train,
                              validation_data=d_test,
                              verbose=True,
                              epochs=EPOCHS,
                              callbacks=[checkpoint, epoch_timer])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    os.remove(MODEL_OUTPUT_FN + ".part")
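A minimal launch sketch for this script (the filename train_geocoder_v2.py is hypothetical; the flags and paths reuse the commented parse_args example above):

    python3 train_geocoder_v2.py -w --wikipedia-cooc-fn subsetCoocALLv2.csv ../data/geonamesData/allCountries.txt ../data/embeddings/word2vec4gram/4gramWiki+geonames_index.json ../data/embeddings/word2vec4gram/embedding4gramWiki+Geonames.bin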
@@ -14,7 +14,7 @@ logging.basicConfig(
from sklearn.model_selection import train_test_split
from shapely.geometry import Point
-from lib.geo import Grid,latlon2healpix
+from lib.geo import latlon2healpix
from tqdm import tqdm
@@ -27,33 +27,10 @@ args = parser.parse_args()#("data/wikipedia/cooccurrence_FR.txt".split())#("data
# LOAD DATA
COOC_FN = args.cooccurrence_file
logging.info("Load Cooc DATA data...")
cooc_data = pd.read_csv(COOC_FN,sep="\t").fillna("")
# cooc_data["geometry"] = cooc_data["longitude latitude".split()].apply(lambda x: Point(x.longitude,x.latitude),axis=1)
# cooc_data = gpd.GeoDataFrame(cooc_data)
logging.info("Cooc data loaded!")
# # World Shape bounds
# world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# world["nn"] = 1
# dissolved = world.dissolve(by="nn").iloc[0].geometry
# #Creating Grid
# logging.info("Initializing Grid (360,180)...")
# g = Grid(*dissolved.bounds,[360,180])
# logging.info("Fit Data to the Grid...")
# g.fit_data(cooc_data)
# logging.info("Placing place into the grid...")
# [g+(row.title,row.latitude,row.longitude) for ix,row in tqdm(cooc_data.iterrows(),total=len(cooc_data))]
# #ASSOCIATE CELL NUMBER TO EACH PLACE IN THE GEONAME DATAFRAME
# logging.info("Associate a cell number to each place in the Geoname Dataframe")
# def foo(g,id_):
# for ix,cell in enumerate(g.cells):
# if id_ in cell.list_object:
# return ix
cooc_data["cat"] = cooc_data.apply(lambda x:latlon2healpix(x.latitude,x.longitude,64),axis=1)
@@ -79,12 +56,9 @@ for i in np.unique(cooc_data.cat.values):
    except Exception as e:
        print(e)  # print("Error",len(filtered[filtered.cat == i]))
-# del X_train["geometry"]
-# del X_train["nn"]
del X_train["cat"]
del X_test["cat"]
-# del X_test["geometry"]
-# del X_test["nn"]
# SAVING THE DATA
logging.info("Saving Output !")
suffix =""