Commit ded9f813 authored by Ludovic Moncla

add main and args to training_bertFineTuning.py

parent 44ec22f5
[general]
dataPath = Data/dataframe_with_ensemble_domaine_enccre.csv
columnText = contentWithoutClass
columnClass = ensemble_domaine_enccre
-minOfInstancePerClass = 200
+minOfInstancePerClass = 50
maxOfInstancePerClass = 1500
[model]
tokeniser = bert-base-multilingual-cased
#tokeniser = camembert-base
model = bert-base-multilingual-cased
@@ -15,5 +13,3 @@ model = bert-base-multilingual-cased
max_len_sequences = 256
batch_size = 32
epochs = 4
-pathModel = ' '
-modelName = ' '
@@ -75,11 +75,13 @@ doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))
print("size after resampling, ",len(df))
#prepare data
-df = df[df[columnClass] != 'unclassified']
+#df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
print(df.head())
print(df[columnClass].head())
train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
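# note: only train_y is encoded here; the test labels presumably get the same mapping
# later in the script, e.g. test_y = encoder.transform(test_y)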
pip install -r requirements.txt
python tmp_preprocess_data.py
python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 50 800
python experimentsClassicClassifiers.py data/dataframe_with_ensemble_domaine_enccre.csv contentWithoutClass ensemble_domaine_enccre 100 1500
python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/dataframe_with_domaine_enccre.csv contentWithoutClass domaine_enccre 300 500
python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 300 1500
python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 50 2000
python experimentsClassicClassifiers.py data/dataframe_with_normClass.csv contentWithoutClass normClass 50 500
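Per the commit message, a main entry point and CLI arguments were added to training_bertFineTuning.py. Based on the argparse setup in the new __main__ block below, the fine-tuning script can presumably be run in the same style, passing the dataset, a config file (such as the [general]/[model] file above), and an output directory as positional arguments; the config filename and output directory here are placeholders:

python training_bertFineTuning.py data/dataframe_with_ensemble_domaine_enccre.csv bert.conf models/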
@@ -24,15 +24,27 @@ import pandas as pd
# Reading data and preprocessing steps
#preprocessor = Preprocessor()
print("load dataset")
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
#df = df_original.copy()
print("len(df)",len(df))
print("remove blank rows")
df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
print("len(df)",len(df))
print("remove small articles < 15 words")
#preprocessor = Preprocessor()
#preprocessor.removeArticlesByTokensNumbers(df, 'content', 25)
df = df.loc[(df['nb_word']>=15)]
print("len(df)",len(df))
df.reset_index(drop=True, inplace=True)
@@ -90,4 +102,4 @@ tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count'])
tosave.to_excel("normClass_artfl.xlsx")
print(df_original.shape)
'''
'''
\ No newline at end of file
@@ -10,7 +10,7 @@ import time
import datetime
import random
import os
import argparse
def flat_accuracy(preds, labels):
@@ -19,9 +19,6 @@ def flat_accuracy(preds, labels):
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
@@ -50,9 +47,6 @@ def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_si
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
############################################################################################################
########################## Model: Tokenization & Input Formatting ###################################################################
###########################################################################################################
@@ -398,3 +392,92 @@ model_save_name = config.get('model','modelName')
path = config.get('model','path')
#torch.save(model.state_dict(), os.path.join(path,model_save_name))
torch.save(model, os.path.join(path,model_save_name))'''
if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("input_dataset")
    parser.add_argument("conf_file")
    parser.add_argument("output_path")

    args = parser.parse_args()

    INPUT_DATASET = args.input_dataset
    CONF_FILE = args.conf_file
    OUTPUT_PATH = args.output_path

    config = configparser.ConfigParser()
    config.read(CONF_FILE)

    #dataPath = config.get('general','dataPath')
    columnText = config.get('general','columnText')
    columnClass = config.get('general','columnClass')
    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))

    chosen_tokeniser = config.get('model','tokeniser')
    chosen_model = config.get('model','model')
    max_len = int(config.get('model','max_len_sequences'))
    batch_size = int(config.get('model','batch_size'))
    epochs = int(config.get('model','epochs'))

    df = pd.read_csv(INPUT_DATASET)
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
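    # remove_weak_classes and resample_classes are helpers defined elsewhere in this repo;
    # judging by the config keys they consume, they presumably drop classes with fewer than
    # minOfInstancePerClass examples and cap each remaining class at maxOfInstancePerClass.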
    #df = df[df[columnClass] != 'unclassified']

    y = df[columnClass]
    numberOfClasses = y.nunique()

    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)

    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)

    sentences = train_x[columnText].values
    labels = train_y.tolist()

    # call train method
    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)

    # save the model (batch_size and epochs are ints, so cast them before concatenating)
    model_save_name = chosen_model + "_b" + str(batch_size) + "_e" + str(epochs)
    torch.save(model, os.path.join(OUTPUT_PATH, model_save_name))
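    # note: torch.save is given the full model object rather than a state_dict, so it can
    # presumably be restored later (with the model class importable) via, e.g.:
    #   model = torch.load(os.path.join(OUTPUT_PATH, model_save_name))
    #   model.eval()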
    # print the model parameters
    params = list(model.named_parameters())

    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))