Commit 9d554416 authored by Jacques Fize

Add new learning script that enables working with big data

parent 12946ad6
# Base module
import os
# Structure
import pandas as pd
# Deep learning modules
from keras.layers import Dense, Input, Embedding,concatenate,Bidirectional,LSTM
from keras.models import Model
from keras.callbacks import ModelCheckpoint
# Custom module
from lib.ngram_index import NgramIndex
from lib.utils import ConfigurationReader, MetaDataSerializer
from lib.metrics import lat_accuracy,lon_accuracy
from data_generator import DataGenerator,CoOccurrences,load_embedding
# Logging
import logging
logging.getLogger('gensim').setLevel(logging.WARNING)
# LOGGING CONF
logging.basicConfig(
    format='[%(asctime)s][%(levelname)s] %(message)s ',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.INFO
)
args = ConfigurationReader("./parser_config/toponym_combination_embedding_v2.json")\
    .parse_args()  # ("-w -e 100 ../data/geonamesData/allCountries.txt ../data/geonamesData/hierarchy.txt".split())
#
#################################################
############# MODEL TRAINING PARAMETER ##########
#################################################
NGRAM_SIZE = args.ngram_size
ACCURACY_TOLERANCE = args.tolerance_value
EPOCHS = args.epochs
ITER_ADJACENCY = args.adjacency_iteration
COOC_SAMPLING_NUMBER = args.cooc_sample_size
WORDVEC_ITER = args.ngram_word2vec_iter
EMBEDDING_DIM = 100
#################################################
########## FILENAME VARIABLE ####################
#################################################
# check for output dir
if not os.path.exists("outputs/"):
    os.makedirs("outputs/")
GEONAME_FN = "ALL"  # args.geoname_input
DATASET_NAME = "ALL"  # args.geoname_input.split("/")[-1]
GEONAMES_HIERARCHY_FN = ""  # args.geoname_hierachy_input
REGION_SUFFIX_FN = "" if args.admin_code_1 == "None" else "_" + args.admin_code_1
ADJACENCY_REL_FILENAME = "{0}_{1}{2}adjacency.json".format(
    GEONAME_FN,
    ITER_ADJACENCY,
    REGION_SUFFIX_FN)
COOC_FN = args.wikipedia_cooc_fn
PREFIX_OUTPUT_FN = "{0}_{1}_{2}_{3}_{4}".format(
    GEONAME_FN.split("/")[-1],
    EPOCHS,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    REGION_SUFFIX_FN)
REL_CODE = ""
if args.adjacency:
    PREFIX_OUTPUT_FN += "_A"
    REL_CODE += "A"
if args.inclusion:
    PREFIX_OUTPUT_FN += "_I"
    REL_CODE += "I"
if args.wikipedia_cooc:
    PREFIX_OUTPUT_FN += "_C"
    REL_CODE += "C"
MODEL_OUTPUT_FN = "outputs/{0}.h5".format(PREFIX_OUTPUT_FN)
INDEX_FN = "outputs/{0}_index".format(PREFIX_OUTPUT_FN)
HISTORY_FN = "outputs/{0}.csv".format(PREFIX_OUTPUT_FN)
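# Worked example of the naming scheme above (illustrative values, not from
# the commit): with EPOCHS=100, NGRAM_SIZE=4, ACCURACY_TOLERANCE=0.002,
# admin_code_1="None" and all three relation flags set, PREFIX_OUTPUT_FN is
# "ALL_100_4_0.002__A_I_C", so the model is saved to
# "outputs/ALL_100_4_0.002__A_I_C.h5".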
meta_data = MetaDataSerializer(
    DATASET_NAME,
    REL_CODE,
    COOC_SAMPLING_NUMBER,
    ITER_ADJACENCY,
    NGRAM_SIZE,
    ACCURACY_TOLERANCE,
    EPOCHS,
    EMBEDDING_DIM,
    WORDVEC_ITER,
    INDEX_FN,
    MODEL_OUTPUT_FN,
    HISTORY_FN
)
meta_data.save("outputs/{0}.json".format(PREFIX_OUTPUT_FN))
### PUT DATASRC + GENERATOR
index = NgramIndex.load(args.ngram_index_fn)
c_train = CoOccurrences(COOC_FN + "_train.csv",sampling=3)
c_test = CoOccurrences(COOC_FN + "_test.csv",sampling=3)
BATCH_SIZE = 1000
d_train = DataGenerator([c_train],index,batch_size=BATCH_SIZE)
d_test = DataGenerator([c_test],index,batch_size=BATCH_SIZE)
num_words = len(index.index_ngram)
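# Sanity-check sketch (not in the original script): given the DataGenerator
# change in this commit, a batch should unpack into two integer-encoded
# toponym matrices and two coordinate target vectors, matching the
# two-input / two-output model defined below. The exact shapes are an
# assumption based on batch_size and index.max_len.
(x_left, x_right), (y_lon, y_lat) = d_train[0]
assert x_left.shape == (BATCH_SIZE, index.max_len)
assert x_right.shape == (BATCH_SIZE, index.max_len)
assert y_lon.shape == (BATCH_SIZE,) and y_lat.shape == (BATCH_SIZE,)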
#############################################################################################
################################# NGRAM EMBEDDINGS ##########################################
#############################################################################################
embedding_weights = load_embedding(args.embedding_fn)
#############################################################################################
################################# MODEL DEFINITION ##########################################
#############################################################################################
input_1 = Input(shape=(index.max_len,))
input_2 = Input(shape=(index.max_len,))
embedding_layer = Embedding(num_words, EMBEDDING_DIM,input_length=index.max_len,weights=[embedding_weights],trainable=False)#, trainable=True)
x1 = embedding_layer(input_1)
x2 = embedding_layer(input_2)
# Each LSTM learns on a permutation of the input toponyms
x1 = Bidirectional(LSTM(98))(x1)
x2 = Bidirectional(LSTM(98))(x2)
x = concatenate([x1,x2])#,x3])
x1 = Dense(500,activation="relu")(x)
# x1 = Dropout(0.3)(x1)
x1 = Dense(500,activation="relu")(x1)
# x1 = Dropout(0.3)(x1)
x2 = Dense(500,activation="relu")(x)
# x2 = Dropout(0.3)(x2)
x2 = Dense(500,activation="relu")(x2)
# x2 = Dropout(0.3)(x2)
output_lon = Dense(1,activation="sigmoid",name="Output_LON")(x1)
output_lat = Dense(1,activation="sigmoid",name="Output_LAT")(x2)
model = Model(inputs = [input_1,input_2], outputs = [output_lon,output_lat])#input_3
model.compile(loss=['mean_squared_error','mean_squared_error'],
              optimizer='rmsprop',
              metrics={"Output_LON":lon_accuracy(),"Output_LAT":lat_accuracy()})
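# Illustration only (unused below): lat_accuracy and lon_accuracy come from
# lib.metrics, whose source is not part of this commit. A plausible
# tolerance-based accuracy factory, counting the fraction of predictions
# within ACCURACY_TOLERANCE of the target in the [0,1]-encoded coordinate
# space, could look like this sketch:
from keras import backend as K

def coord_accuracy(tolerance=ACCURACY_TOLERANCE):
    def accuracy(y_true, y_pred):
        # Fraction of predictions whose absolute error is within the tolerance.
        return K.mean(K.cast(K.less_equal(K.abs(y_true - y_pred), tolerance), "float32"))
    return accuracy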
#############################################################################################
################################# TRAINING LAUNCH ###########################################
#############################################################################################
checkpoint = ModelCheckpoint(MODEL_OUTPUT_FN + ".part", monitor='loss', verbose=1,
                             save_best_only=True, mode='auto', period=1)
history = model.fit_generator(generator=d_train,
                              validation_data=d_test,
                              verbose=True,
                              epochs=EPOCHS,
                              callbacks=[checkpoint])
hist_df = pd.DataFrame(history.history)
hist_df.to_csv(HISTORY_FN)
model.save(MODEL_OUTPUT_FN)
# Erase Model Checkpoint file
if os.path.exists(MODEL_OUTPUT_FN + ".part"):
    os.remove(MODEL_OUTPUT_FN + ".part")
@@ -5,6 +5,8 @@
 import keras
 import numpy as np
 import pandas as pd
+from lib.geo import zero_one_encoding
 from helpers import parse_title_wiki,read_geonames
 from gensim.models.keyedvectors import KeyedVectors
@@ -267,8 +269,8 @@ class DataGenerator(keras.utils.Sequence):
             return X, y

         X[i] = [ self.ngram_index.encode(topo),self.ngram_index.encode(topo_context)]
-        y[i] = [longitude,latitude]
-        return X, y
+        y[i] = [*zero_one_encoding(longitude,latitude)]
+        return [X[:,0],X[:,1]], [y[:,0],y[:,1]]

     def on_epoch_end(self):
         'Updates indexes after each epoch'
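The diff above changes the DataGenerator so that targets are rescaled with zero_one_encoding before training, which matches the sigmoid activations on the model's two outputs. The function lives in lib.geo and its source is not part of this commit; a plausible sketch, assuming simple min-max scaling of the geographic coordinate ranges:

def zero_one_encoding(longitude, latitude):
    # Map longitude from [-180, 180] and latitude from [-90, 90] into [0, 1].
    return (longitude + 180.0) / 360.0, (latitude + 90.0) / 180.0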
{
    "description": "Toponym Combination",
    "args": [
        { "short": "ngram_index_fn", "help": "Filepath of the NgramIndex file you want to use." },
        { "short": "embedding_fn", "help": "Filepath of the Embedding file you want to use." },
        { "short": "-v", "long": "--verbose", "action": "store_true" },
        { "short": "-i", "long": "--inclusion", "action": "store_true" },
        { "short": "-a", "long": "--adjacency", "action": "store_true" },
        { "short": "-w", "long": "--wikipedia-cooc", "action": "store_true" },
        { "long": "--wikipedia-cooc-fn", "help": "Cooccurrence data filename" },
        { "long": "--adjacency-fn", "help": "Adjacency data filename" },
        { "long": "--cooc-sample-size", "type": "int", "default": 3 },
        { "long": "--adjacency-iteration", "type": "int", "default": 1 },
        { "short": "-n", "long": "--ngram-size", "type": "int", "default": 2 },
        { "long": "--ngram-word2vec-iter", "type": "int", "default": 50 },
        { "short": "-t", "long": "--tolerance-value", "type": "float", "default": 0.002 },
        { "short": "-e", "long": "--epochs", "type": "int", "default": 100 },
        { "short": "-d", "long": "--dimension", "type": "int", "default": 256 },
        { "long": "--admin_code_1", "default": "None" }
    ]
}
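ConfigurationReader (from lib.utils) consumes this JSON spec; its implementation is not shown in this commit. A minimal sketch of how such a spec might map onto argparse, assuming the "short"/"long"/"action"/"type"/"default"/"help" keys translate one-to-one to add_argument parameters:

import argparse
import json

def build_parser(config_path):
    """Hypothetical re-implementation: build an argparse parser from a JSON arg spec."""
    with open(config_path) as f:
        spec = json.load(f)
    parser = argparse.ArgumentParser(description=spec["description"])
    type_map = {"int": int, "float": float}
    for arg in spec["args"]:
        # Positional args only define "short" (no leading dash); flags define both.
        names = [n for n in (arg.get("short"), arg.get("long")) if n]
        kwargs = {k: arg[k] for k in ("help", "action", "default") if k in arg}
        if "type" in arg:
            kwargs["type"] = type_map[arg["type"]]
        parser.add_argument(*names, **kwargs)
    return parser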