From ded9f813c97d34fb2557827fa2d3341ed9870d2c Mon Sep 17 00:00:00 2001
From: lmoncla <ludovic.moncla@insa-lyon.fr>
Date: Fri, 17 Sep 2021 11:30:52 +0200
Subject: [PATCH] add main and args to training_bertFineTuning.py

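Add a __main__ entry point to training_bertFineTuning.py: the dataset,
the configuration file and the output directory are now passed on the
command line, the training parameters (tokeniser, model,
max_len_sequences, batch_size, epochs) are read from the conf file, and
the fine-tuned model is saved under the given output path. Also lower
minOfInstancePerClass from 200 to 50 in bert_settings.conf, remove the
obsolete pathModel/modelName keys, delete script.txt, and drop articles
shorter than 15 words in tmp_preprocess_data.py.

Example invocation (the dataset path is taken from the old script.txt;
the models/ output directory is illustrative):

    python training_bertFineTuning.py data/dataframe_with_ensemble_domaine_enccre.csv bert_settings.conf models/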
---
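Since the model is serialized whole with torch.save(model, ...), it can
be reloaded for inference along these lines; the file name follows the
<model>_b<batch_size>_e<epochs> pattern built in the new __main__ with
the values from bert_settings.conf, and the models/ directory is
illustrative:

    import torch
    # load the fully serialized fine-tuned model and switch to evaluation mode
    model = torch.load("models/bert-base-multilingual-cased_b32_e4")
    model.eval()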
 bert_settings.conf               |  6 +-
 experimentsClassicClassifiers.py |  4 +-
 script.txt                       | 12 ----
 tmp_preprocess_data.py           | 16 +++++-
 training_bertFineTuning.py       | 97 +++++++++++++++++++++++++++++---
 5 files changed, 108 insertions(+), 27 deletions(-)
 delete mode 100644 script.txt

diff --git a/bert_settings.conf b/bert_settings.conf
index ccba612..baf830d 100644
--- a/bert_settings.conf
+++ b/bert_settings.conf
@@ -1,13 +1,11 @@
 [general]
-dataPath = Data/dataframe_with_ensemble_domaine_enccre.csv
 columnText = contentWithoutClass
 columnClass = ensemble_domaine_enccre
-minOfInstancePerClass = 200
+minOfInstancePerClass = 50
 maxOfInstancePerClass = 1500
 
 
 [model]
-
 tokeniser = bert-base-multilingual-cased
 #tokeniser = camembert-base
 model =  bert-base-multilingual-cased
@@ -15,5 +13,3 @@ model =  bert-base-multilingual-cased
 max_len_sequences = 256
 batch_size = 32
 epochs = 4
-pathModel = ' '
-modelName = ' '
diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py
index 17a5409..f5d6ade 100644
--- a/experimentsClassicClassifiers.py
+++ b/experimentsClassicClassifiers.py
@@ -75,11 +75,13 @@ doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
 print("size after resampling, ",len(df))
 
 #prepare data
-df = df[df[columnClass] != 'unclassified']
+#df = df[df[columnClass] != 'unclassified']
 y  = df[columnClass]
 
 print(df.head())
 
+print(df[columnClass].head())
+
 train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
 encoder = preprocessing.LabelEncoder()
 train_y = encoder.fit_transform(train_y)
diff --git a/script.txt b/script.txt
deleted file mode 100644
index bcde25b..0000000
--- a/script.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-pip install -r requirements.txt
-python tmp_preprocess_data.py 
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 50 1500 
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 50 800     
-python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 100 1500   
-python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 300 500            
-python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 300 1500
-python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 50 2000
-python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 50 500
diff --git a/tmp_preprocess_data.py b/tmp_preprocess_data.py
index bc852b8..30b0d75 100644
--- a/tmp_preprocess_data.py
+++ b/tmp_preprocess_data.py
@@ -24,15 +24,27 @@ import pandas as pd
 
 # Reading data and preprocessings steps
 
-#preprocessor = Preprocessor()
+
 
 print("load dataset")
 
 df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
 #df = df_original.copy()
 
+print("len(df)",len(df))
+
+
 print("remove blank rows")
 df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
+print("len(df)",len(df))
+
+print("remove small articles < 15 words")
+#preprocessor = Preprocessor()
+#preprocessor.removeArticlesByTokensNumbers(df, 'content', 25)
+df = df.loc[(df['nb_word']>=15)]
+print("len(df)",len(df))
+
+
 df.reset_index(drop=True, inplace=True)
 
 
@@ -90,4 +102,4 @@ tosave = pd.DataFrame.from_dict(d_3, orient='index',  columns=[ 'Count'])
 tosave.to_excel("normClass_artfl.xlsx")
 
 print(df_original.shape)
-'''
+'''
\ No newline at end of file
diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
index 285be2d..d52ec22 100644
--- a/training_bertFineTuning.py
+++ b/training_bertFineTuning.py
@@ -10,7 +10,7 @@ import time
 import datetime
 import random
 import os
-
+import argparse
 
 
 def flat_accuracy(preds, labels):
@@ -19,9 +19,6 @@ def flat_accuracy(preds, labels):
     return np.sum(pred_flat == labels_flat) / len(labels_flat)
 
 
-
-
-
 def format_time(elapsed):
     '''
     Takes a time in seconds and returns a string hh:mm:ss
@@ -50,9 +47,6 @@ def training_bertFineTuning(chosen_model,  sentences, labels, max_len,  batch_si
         print('No GPU available, using the CPU instead.')
         device = torch.device("cpu")
 
-
-
-
 ############################################################################################################
 ########################## Model: Tokenization & Input Formatting ###################################################################
 ###########################################################################################################
@@ -398,3 +392,92 @@ model_save_name = config.get('model','modelName')
 path = config.get('model','path')
 #torch.save(model.state_dict(), os.path.join(path,model_save_name))
 torch.save(model, os.path.join(path,model_save_name))'''
+
+
+
+
+
+
+
+
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("input_dataset")
+    parser.add_argument("conf_file")
+    parser.add_argument("output_path")
+
+    args = parser.parse_args()
+
+    INPUT_DATASET = args.input_dataset
+    CONF_FILE = args.conf_file
+    OUTPUT_PATH = args.output_path
+
+    config = configparser.ConfigParser()
+    config.read(CONF_FILE)
+
+    #dataPath = config.get('general','dataPath')
+    columnText = config.get('general','columnText')
+    columnClass = config.get('general','columnClass')
+
+    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+
+    chosen_tokeniser = config.get('model','tokeniser')
+    chosen_model = config.get('model','model')
+
+    max_len = int(config.get('model','max_len_sequences'))
+    batch_size = int(config.get('model','batch_size'))
+    epochs = int(config.get('model','epochs'))
+
+
+    df = pd.read_csv(INPUT_DATASET)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+    #df = df[df[columnClass] != 'unclassified']
+
+
+    y  = df[columnClass]
+    numberOfClasses = y.nunique()
+    encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(y)
+
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+
+    sentences = train_x[columnText].values
+    labels = train_y.tolist()
+
+
+    #call train method
+
+    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)
+
+
+    #save the model
+    model_save_name = chosen_model + "_b" + str(batch_size) + "_e" + str(epochs)
+
+    torch.save(model, os.path.join(OUTPUT_PATH,model_save_name))
+
+    #print the model parameters
+    params = list(model.named_parameters())
+
+    print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+    print('==== Embedding Layer ====\n')
+
+    for p in params[0:5]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== First Transformer ====\n')
+
+    for p in params[5:21]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== Output Layer ====\n')
+
+    for p in params[-4:]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
-- 
GitLab