From ded9f813c97d34fb2557827fa2d3341ed9870d2c Mon Sep 17 00:00:00 2001
From: lmoncla <ludovic.moncla@insa-lyon.fr>
Date: Fri, 17 Sep 2021 11:30:52 +0200
Subject: [PATCH] add main and args to training_bertFineTuning.py

---
 bert_settings.conf               |  6 +-
 experimentsClassicClassifiers.py |  4 +-
 script.txt                       | 12 ----
 tmp_preprocess_data.py           | 16 +++++-
 training_bertFineTuning.py       | 97 +++++++++++++++++++++++++++++---
 5 files changed, 108 insertions(+), 27 deletions(-)
 delete mode 100644 script.txt

diff --git a/bert_settings.conf b/bert_settings.conf
index ccba612..baf830d 100644
--- a/bert_settings.conf
+++ b/bert_settings.conf
@@ -1,13 +1,11 @@
 [general]
-dataPath = Data/dataframe_with_ensemble_domaine_enccre.csv
 columnText = contentWithoutClass
 columnClass = ensemble_domaine_enccre
-minOfInstancePerClass = 200
+minOfInstancePerClass = 50
 maxOfInstancePerClass = 1500
 
 [model]
-
 tokeniser = bert-base-multilingual-cased
 #tokeniser = camembert-base
 model = bert-base-multilingual-cased
@@ -15,5 +13,3 @@ model = bert-base-multilingual-cased
 max_len_sequences = 256
 batch_size = 32
 epochs = 4
-pathModel = ' '
-modelName = ' '
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 17a5409..f5d6ade 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -75,11 +75,13 @@ doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
 print("size after resampling, ",len(df))
 
 #prepare data
-df = df[df[columnClass] != 'unclassified']
+#df = df[df[columnClass] != 'unclassified']
 y = df[columnClass]
 
 print(df.head())
 
+print(df[columnClass].head())
+
 train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
 encoder = preprocessing.LabelEncoder()
 train_y = encoder.fit_transform(train_y)
diff --git a/script.txt b/script.txt
deleted file mode 100644
index bcde25b..0000000
--- a/script.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-pip install -r requirements.txt
-python tmp_preprocess_data.py
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 50 800
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 100 1500
-python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 300 500
-python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 300 1500
-python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 50 2000
-python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 50 500
diff --git a/tmp_preprocess_data.py b/tmp_preprocess_data.py
index bc852b8..30b0d75 100644
--- a/tmp_preprocess_data.py
+++ b/tmp_preprocess_data.py
@@ -24,15 +24,27 @@ import pandas as pd
 
 # Reading data and preprocessings steps
-#preprocessor = Preprocessor()
+
 print("load dataset")
 df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
 #df = df_original.copy()
+print("len(df)",len(df))
+
+
 print("remove blank rows")
 df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
+print("len(df)",len(df))
+
+print("remove small articles < 15 words")
+#preprocessor = Preprocessor()
+#preprocessor.removeArticlesByTokensNumbers(df, 'content', 25)
+df = df.loc[(df['nb_word']>=15)]
+print("len(df)",len(df))
+
+
 df.reset_index(drop=True, inplace=True)
@@ -90,4 +102,4 @@ tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count'])
 tosave.to_excel("normClass_artfl.xlsx")
 print(df_original.shape)
-'''
+'''
\ No newline at end of file
diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
index 285be2d..d52ec22 100644
--- a/training_bertFineTuning.py
+++ b/training_bertFineTuning.py
@@ -10,7 +10,7 @@ import time
 import datetime
 import random
 import os
-
+import argparse
 
 
 def flat_accuracy(preds, labels):
@@ -19,9 +19,6 @@ def flat_accuracy(preds, labels):
     return np.sum(pred_flat == labels_flat) / len(labels_flat)
 
 
-
-
-
 def format_time(elapsed):
     '''
     Takes a time in seconds and returns a string hh:mm:ss
@@ -50,9 +47,6 @@ def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_si
         print('No GPU available, using the CPU instead.')
         device = torch.device("cpu")
 
-
-
-
     ############################################################################################################
     ########################## Model: Tokenization & Input Formatting ###################################################################
     ###########################################################################################################
@@ -398,3 +392,92 @@ model_save_name = config.get('model','modelName')
 path = config.get('model','path')
 #torch.save(model.state_dict(), os.path.join(path,model_save_name))
 torch.save(model, os.path.join(path,model_save_name))'''
+
+
+
+
+
+
+
+
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("input_dataset")
+    parser.add_argument("conf_file")
+    parser.add_argument("output_path")
+
+    args = parser.parse_args()
+
+    INPUT_DATASET = args.input_dataset
+    CONF_FILE = args.conf_file
+    OUTPUT_PATH = args.output_path
+
+    config = configparser.ConfigParser()
+    config.read(CONF_FILE)
+
+    #dataPath = config.get('general','dataPath')
+    columnText = config.get('general','columnText')
+    columnClass = config.get('general','columnClass')
+
+    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+
+    chosen_tokeniser = config.get('model','tokeniser')
+    chosen_model = config.get('model','model')
+
+    max_len = int(config.get('model','max_len_sequences'))
+    batch_size = int(config.get('model','batch_size'))
+    epochs = int(config.get('model','epochs'))
+
+
+    df = pd.read_csv(INPUT_DATASET)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+    #df = df[df[columnClass] != 'unclassified']
+
+
+    y = df[columnClass]
+    numberOfClasses = y.nunique()
+    encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(y)
+
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+
+    sentences = train_x[columnText].values
+    labels = train_y.tolist()
+
+
+    #call train method
+
+    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)
+
+
+    #save the model
+    model_save_name = chosen_model+"_b"+str(batch_size)+"_e"+str(epochs)  # batch_size and epochs are ints, cast to str before concatenation
+
+    torch.save(model, os.path.join(OUTPUT_PATH,model_save_name))
+
+    #print the model parameters
+    params = list(model.named_parameters())
+
+    print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+    print('==== Embedding Layer ====\n')
+
+    for p in params[0:5]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== First Transformer ====\n')
+
+    for p in params[5:21]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== Output Layer ====\n')
+
+    for p in params[-4:]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
-- 
GitLab
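Usage sketch for the new entry point, following the conventions of the removed script.txt (the dataset path and the models/ output directory are illustrative examples, not fixed by the patch):

python training_bertFineTuning.py data/dataframe_with_ensemble_domaine_enccre.csv bert_settings.conf models/

The config file supplies columnText, columnClass and the resampling bounds from [general] plus the model settings from [model]; the fine-tuned model is saved under the output directory with a name built as <model>_b<batch_size>_e<epochs>, e.g. bert-base-multilingual-cased_b32_e4 with the settings above.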