Commit 5eb0043a authored by Ludovic Moncla

Merge branch 'branch_dev_bert_exp' into 'master'

Branch dev bert exp

See merge request !5
parents 28895891 66234a84
To run the classifiers, use the following command:
python experimentsClassicClassifiers.py <dataset_tsv_file> <content_column_name> <labels_column_name> <min_sample_per_class> <max_sample_per_class>
To run classification with pre-trained models, use the following command:
cd experiments/
python bert_experiments.py <model_Name> <classifier>
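For example, to extract features with multilingual BERT and classify them with an SVM (the model and classifier keys are the ones accepted by bert_experiments.py, and the dataset path is read from parameters.conf):

python bert_experiments.py bert svm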
# Acknowledgment
The authors are grateful to the ASLAN project (ANR-10-LABX-0081) of the Université de Lyon for its financial support within the French program "Investments for the Future" operated by the National Research Agency (ANR).
[general]
dataPath = Data/dataframe_with_ensemble_domaine_enccre.csv
columnText = contentWithoutClass
columnClass = ensemble_domaine_enccre
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500
[model]
tokeniser = bert-base-multilingual-cased
#tokeniser = camembert-base
model = bert-base-multilingual-cased
#model = camembert-base
max_len_sequences = 256
batch_size = 32
epochs = 4
pathModel = ' '
modelName = ' '
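These settings are read by main.py with configparser (main.py expects the file to be named bert_settings.conf). The sketch below mirrors what main.py does and shows how the [general] and [model] keys map onto the training hyper-parameters:

import configparser

config = configparser.ConfigParser()
config.read('bert_settings.conf')

# [general]: data location, text/label columns and class-size thresholds
dataPath = config.get('general', 'dataPath')
columnText = config.get('general', 'columnText')
columnClass = config.get('general', 'columnClass')

# [model]: pretrained checkpoint and fine-tuning hyper-parameters
chosen_model = config.get('model', 'model')              # bert-base-multilingual-cased or camembert-base
max_len = int(config.get('model', 'max_len_sequences'))  # 256
batch_size = int(config.get('model', 'batch_size'))      # 32
epochs = int(config.get('model', 'epochs'))              # 4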
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
    # classification_report expects (y_true, y_pred)
    report = classification_report(true_labels_, pred_labels_, output_dict=True)
classes = [str(e) for e in encoder.transform(encoder.classes_)]
classesName = encoder.classes_
accuracy = report['accuracy']
weighted_avg = report['weighted avg']
precision = []
recall = []
f1 = []
support = []
dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
for c in classes:
precision.append(report[c]['precision'])
recall.append(report[c]['recall'])
f1.append(report[c]['f1-score'])
support.append(report[c]['support'])
cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)
dff['className'] = classesName
dff['precision'] = precision
dff['recall'] = recall
dff['f1-score'] = f1
dff['support'] = support
dff['FP'] = FP
dff['FN'] = FN
dff['TP'] = TP
dff['TN'] = TN
return dff, accuracy, weighted_avg
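A minimal usage sketch for this helper, with toy label values that are not from the corpus: fit a LabelEncoder on the gold class names, transform the predictions with the same encoder, and pass both integer arrays in.

from sklearn import preprocessing

# Toy gold labels and predictions (illustrative class names only)
true_names = ['Histoire', 'Géographie', 'Histoire', 'Droit', 'Géographie']
pred_names = ['Histoire', 'Histoire', 'Histoire', 'Droit', 'Géographie']

encoder = preprocessing.LabelEncoder()
true_labels_ = encoder.fit_transform(true_names)
pred_labels_ = encoder.transform(pred_names)

df_report, accuracy, weighted_avg = evaluate_bertFineTuning(pred_labels_, true_labels_, encoder)
print(df_report[['className', 'precision', 'recall', 'f1-score', 'support']])
print('accuracy:', accuracy)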
import pandas as pd
import numpy as np
import torch
import transformers as ppb
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import statistics
import os
import sys
import argparse
import configparser
from transformers import CamembertModel, CamembertTokenizer
from transformers import FlaubertModel, FlaubertTokenizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
#classifier, label_list, test_x, valid_y, title = "Confusion matrix"):
precision = []
recall = []
f1 = []
support = []
weighted_avg = None
accuracy = None
df = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
    # classification_report expects (y_true, y_pred)
    report = classification_report(valid_y, y_pred, output_dict=True)
for c in classes:
precision.append(report[c]['precision'])
recall.append(report[c]['recall'])
f1.append(report[c]['f1-score'])
support.append(report[c]['support'])
accuracy = report['accuracy']
weighted_avg = report['weighted avg']
cnf_matrix = confusion_matrix(valid_y, y_pred)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)
df['className'] = classesName
df['precision'] = precision
df['recall'] = recall
df['f1-score'] = f1
df['support'] = support
df['FP'] = FP
df['FN'] = FN
df['TP'] = TP
df['TN'] = TN
#disp = plot_confusion_matrix(classifier, test_x, valid_y,
# display_labels= label_list,
# cmap=plt.cm.Blues,
# normalize=None)
#disp.ax_.set_title(title)
#print(title)
#print(disp.confusion_matrix)
#plt.show()
plt.rcParams["font.size"] = 3
plot_confusion_matrix(clf, X_test, y_test)
plt.savefig(pathSave)
return df, accuracy, weighted_avg
def create_dict(df, classColumnName):
return dict(df[classColumnName].value_counts())
def remove_weak_classes(df, classColumnName, threshold):
dictOfClassInstances = create_dict(df,classColumnName)
dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
keys = [*dictionary]
df_tmp = df[~ df[classColumnName].isin(keys)]
#df = df[df[columnTarget] not in keys]
#df = df.merge(df_tmp, how = 'outer' ,indicator=True)
df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
return df
def split_class(df, columnProcessed):
i = 0
new_df = pd.DataFrame(columns= df.columns)
for index, row in df.iterrows():
#cls = re.split(';', row[columnProcessed])
cls = filter(None, row[columnProcessed].split(';'))
cls = list(cls)
#cls = re.findall(r"[\w']+", row [columnProcessed])
r = row
for categ in cls:
r[columnProcessed] = categ
#new_df.append(r, ignore_index = True)
new_df.loc[i] = r
i = i + 1
return new_df
def resample_classes(df, classColumnName, numberOfInstances):
# numberOfInstances first elements
#return df.groupby(classColumnName).apply(lambda x: x[:numberOfInstances][df.columns])
#random numberOfInstances elements
replace = False # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
return df.groupby(classColumnName, as_index=False).apply(fn)
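A quick illustration of the class-filtering helpers above, on a toy dataframe with a hypothetical class column named 'domaine':

import pandas as pd

# Toy data: class 'A' has 3 rows, class 'B' only 1
toy = pd.DataFrame({'domaine': ['A', 'A', 'A', 'B'],
                    'text': ['t1', 't2', 't3', 't4']})

toy = remove_weak_classes(toy, 'domaine', 2)   # drops class 'B' (< 2 instances)
toy = resample_classes(toy, 'domaine', 2)      # keeps at most 2 random rows per class
print(toy)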
def select_classifier(argument):
classifiers = {
'lr' :LogisticRegression(),
'sgd' :SGDClassifier(),
'svm' :SVC() ,
'decisionTree' :DecisionTreeClassifier(),
'rfc' :RandomForestClassifier(),
'knn' : KNeighborsClassifier()
}
param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) }
param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
param_grid_lr = { "penalty":['none',"l2"]}
param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]}
param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }
grid_params = {
'lr': param_grid_lr,
'sgd': param_grid_sgd ,
'svm': param_grid_svm,
'decisionTree': param_grid_decisionTree,
'rfc': param_grid_rfc ,
'knn': param_grid_knn,
}
return classifiers.get(argument), grid_params.get(argument)
if __name__ == "__main__":
print('ok')
parser = argparse.ArgumentParser()
parser.add_argument("modelName", help="bert or distilBert or camembert or flaubert")
parser.add_argument("classifier", help="lr or knn or rfc or decisionTree or sgd or svm")
args = parser.parse_args()
arg = args.modelName
classifier = args.classifier
config = configparser.ConfigParser()
config.read('parameters.conf')
minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
dataPath = config.get('data','dataPath')
columnText = config.get('data','columnText')
columnClass = config.get('data','columnClass')
if not os.path.exists('reports'):
os.makedirs('reports')
if not os.path.exists(os.path.join('reports', columnClass)):
os.makedirs(os.path.join('reports', columnClass))
dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
os.makedirs(os.path.join('reports', columnClass, dir_name_report))
# read data
print(dataPath)
df = pd.read_csv(dataPath)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
print(df.head())
print(df.shape)
#encode labels
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)
sentences = df['firstParagraph']
labels = y.tolist()
# Features Extraction
#Bert
model_class_bert, tokenizer_class_bert, pretrained_weights_bert = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
tokenizer_bert = tokenizer_class_bert.from_pretrained(pretrained_weights_bert)
model_bert = model_class_bert.from_pretrained(pretrained_weights_bert)
#DistilBert
model_class_distilBert, tokenizer_class_distilBert, pretrained_weights_distilBert = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer_distilBert = tokenizer_class_distilBert.from_pretrained(pretrained_weights_distilBert)
model_distilBert = model_class_distilBert.from_pretrained(pretrained_weights_distilBert)
#Camembert
camembert_tokenizer = CamembertTokenizer.from_pretrained("camembert/camembert-base")
camembert = CamembertModel.from_pretrained("camembert/camembert-base")
#Flaubert
flaubert, log = FlaubertModel.from_pretrained('flaubert/flaubert_base_cased', output_loading_info=True)
flaubert_tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased', do_lowercase=False)
models = {
'bert': model_bert,
'distilbert': model_distilBert ,
'camembert': camembert,
'flaubert': flaubert
}
tokenizers = {
'bert': tokenizer_bert,
'distilbert': tokenizer_distilBert ,
'camembert': camembert_tokenizer,
'flaubert': flaubert_tokenizer
}
if arg == 'flaubert':
model = flaubert
tokenizer = flaubert_tokenizer
elif arg == 'camembert':
model = camembert
tokenizer = camembert_tokenizer
elif arg == 'distilbert':
model = model_distilBert
tokenizer = tokenizer_distilBert
elif arg == 'bert':
model = model_bert
tokenizer = tokenizer_bert
tokenized = sentences.apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length = 512, truncation = True)))
# padding the sequences
max_len = 0
for i in tokenized.values:
if len(i) > max_len:
max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
# attention mask
attention_mask = np.where(padded != 0, 1, 0)
# get features
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(attention_mask)
with torch.no_grad():
last_hidden_states = model(input_ids, attention_mask=attention_mask)
features = last_hidden_states[0][:,0,:].numpy()
print(features.shape)
train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
# classification
clf, grid_param = select_classifier(classifier)
print(features)
clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
clf.fit(train_x, train_y)
#evaluation
y_pred = clf.predict(test_x)
report, accuracy, weighted_avg = evaluate_model(clf, test_x, test_y, y_pred, test_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, arg+ '_' + classifier+'.pdf'))
report.to_csv(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier +'.csv'))
with open(os.path.join('reports', columnClass, dir_name_report, arg + '_' + classifier+'.txt'), 'w') as f:
sys.stdout = f # Change the standard output to the file we created.
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
#sys.stdout = sys.stdout # Reset the standard output to its original value
sys.stdout = sys.__stdout__
[general]
minOfInstancePerClass = 1200
maxOfInstancePerClass = 7
[data]
dataPath = ../Data/dataframe_with_ensemble_domaine_enccre.csv
columnText = contentWithoutClass
columnClass = ensemble_domaine_enccre
transformers==4.3.2
sentencepiece
scikit-learn
pandas
numpy
torch==1.8.1
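Assuming this list is saved as the repository's requirements.txt (the filename is not shown in the diff), the dependencies install in the usual way:

pip install -r requirements.txt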
main.py 0 → 100644
import os
import pandas as pd
import numpy as np
import torch
import configparser
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from training_bertFineTuning import training_bertFineTuning
from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
from evaluate_bertFineTuning import evaluate_bertFineTuning
def create_dict(df, classColumnName):
return dict(df[classColumnName].value_counts())
def remove_weak_classes(df, classColumnName, threshold):
dictOfClassInstances = create_dict(df,classColumnName)
dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
keys = [*dictionary]
df_tmp = df[~ df[classColumnName].isin(keys)]
df = pd.concat([df,df_tmp]).drop_duplicates(keep=False)
return df
def resample_classes(df, classColumnName, numberOfInstances):
#random numberOfInstances elements
replace = False # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
return df.groupby(classColumnName, as_index=False).apply(fn)
def main():
config = configparser.ConfigParser()
config.read('bert_settings.conf')
dataPath = config.get('general','dataPath')
columnText = config.get('general','columnText')
columnClass = config.get('general','columnClass')
minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
chosen_tokeniser = config.get('model','tokeniser')
chosen_model = config.get('model','model')
max_len = int(config.get('model','max_len_sequences'))
batch_size = int(config.get('model','batch_size'))
epochs = int(config.get('model','epochs'))
df = pd.read_csv(dataPath)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
numberOfClasses = y.nunique()
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(y)
train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
sentences = train_x[columnText].values
labels = train_y.tolist()
#call train method
model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)
#save the model
    model_save_name = config.get('model','modelName')
    path = config.get('model','pathModel')
    torch.save(model, os.path.join(path, model_save_name))
#print the model parameters
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
    #call predict method on the held-out test split
    sentences_to_predict = test_x[columnText].values
    labels_to_predict = test_y.tolist()
    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, labels_to_predict, max_len, batch_size = 32)
    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)
#call Evaluate
result_df, accuracy , weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)
print(result_df)
print(accuracy)
print(weighted_avg)
if __name__ == "__main__":
main()
import torch
import pandas as pd
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size = 32):
if chosen_model == 'bert-base-multilingual-cased' :
print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
elif chosen_model == 'camembert-base':
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids_test = []
# For every sentence...
for sent in sentences_to_predict:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encoded_sent = tokenizer.encode(
sent, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
)
input_ids_test.append(encoded_sent)
# Pad our input tokens
padded_test = []
for i in input_ids_test:
if len(i) > max_len:
padded_test.extend([i[:max_len]])
else:
padded_test.extend([i + [0] * (max_len - len(i))])
input_ids_test = np.array(padded_test)
# Create attention masks
attention_masks = []
# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_test:
seq_mask = [float(i>0) for i in seq]
attention_masks.append(seq_mask)
# Convert to tensors.
prediction_inputs = torch.tensor(input_ids_test)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
# Create the DataLoader.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
return prediction_dataloader
def predict_class_bertFineTuning(model, prediction_dataloader):
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
    # Move the model to the selected device and put it in evaluation mode
    model.to(device)
    model.eval()
# Tracking variables
predictions_test , true_labels = [], []
# Predict
for batch in prediction_dataloader:
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from the dataloader
b_input_ids, b_input_mask, b_labels = batch
# Telling the model not to compute or store gradients, saving memory and
# speeding up prediction
with torch.no_grad():
# Forward pass, calculate logit predictions
outputs = model(b_input_ids, token_type_ids=None,
attention_mask=b_input_mask)
logits = outputs[0]
#print(logits)
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
#print(logits)
# Store predictions and true labels
predictions_test.append(logits)
true_labels.append(label_ids)
print(' DONE.')
pred_labels = []
for i in range(len(true_labels)):
# The predictions for this batch are a 2-column ndarray (one column for "0"
# and one column for "1"). Pick the label with the highest value and turn this
# in to a list of 0s and 1s.
pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
pred_labels.append(pred_labels_i)
pred_labels_ = [item for sublist in pred_labels for item in sublist]
true_labels_ = [item for sublist in true_labels for item in sublist]
    return pred_labels_, true_labels_
def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):
if chosen_model == 'bert-base-multilingual-cased' :
print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
elif chosen_model == 'camembert-base':
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids_test = []
# For every sentence...
for sent in sentences_to_predict:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encoded_sent = tokenizer.encode(
sent, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
)
input_ids_test.append(encoded_sent)
    # The padding / masking / forward-pass logic below is a minimal completion
    # (the original function stopped before building its input tensors): pad to
    # the longest encoded sentence, mask the padding, and run one forward pass.
    max_len = max(len(ids) for ids in input_ids_test)
    padded = [ids + [0] * (max_len - len(ids)) for ids in input_ids_test]
    attention_masks = [[int(token_id > 0) for token_id in ids] for ids in padded]
    device = next(model.parameters()).device
    b_input_ids = torch.tensor(padded).to(device)
    b_input_mask = torch.tensor(attention_masks).to(device)
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)
    logits = outputs[0]
    # Return the predicted class index for each input sentence
    return np.argmax(logits.detach().cpu().numpy(), axis=1)
import torch
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, CamembertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import time
import datetime
import random
import os
def flat_accuracy(preds, labels):
pred_flat = np.argmax(preds, axis=1).flatten()
labels_flat = labels.flatten()
return np.sum(pred_flat == labels_flat) / len(labels_flat)
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round((elapsed)))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs = 4):
# If there's a GPU available...
if torch.cuda.is_available():
# Tell PyTorch to use the GPU.
device = torch.device("cuda")
print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
print('No GPU available, using the CPU instead.')
device = torch.device("cpu")
############################################################################################################
########################## Model: Tokenization & Input Formatting ###################################################################
###########################################################################################################
if chosen_model == 'bert-base-multilingual-cased' :
print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True)
elif chosen_model == 'camembert-base':
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
# For every sentence...
for sent in sentences:
# `encode` will:
# (1) Tokenize the sentence.
# (2) Prepend the `[CLS]` token to the start.
# (3) Append the `[SEP]` token to the end.
# (4) Map tokens to their IDs.
encoded_sent = tokenizer.encode(
sent, # Sentence to encode.
add_special_tokens = True, # Add '[CLS]' and '[SEP]'
# This function also supports truncation and conversion
# to pytorch tensors, but I need to do padding, so I
# can't use these features.
#max_length = 128, # Truncate all sentences.
#return_tensors = 'pt', # Return pytorch tensors.
)
# Add the encoded sentence to the list.
input_ids.append(encoded_sent)
padded = []
for i in input_ids:
if len(i) > max_len:
padded.extend([i[:max_len]])
else:
padded.extend([i + [0] * (max_len - len(i))])
padded = np.array(padded)
# Create attention masks
attention_masks = []
# For each sentence...
for sent in padded:
# Create the attention mask.
# - If a token ID is 0, then it's padding, set the mask to 0.
# - If a token ID is > 0, then it's a real token, set the mask to 1.
att_mask = [int(token_id > 0) for token_id in sent]
# Store the attention mask for this sentence.
attention_masks.append(att_mask)
# Use 90% for training and 10% for validation.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.1, stratify = labels )
# Do the same for the masks.
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1, stratify = labels)
# Convert all inputs and labels into torch tensors, the required datatype
# for my model.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)
# The DataLoader needs to know the batch size for training, so I specify it here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
# Create the DataLoader for training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
print(' Selecting a model .....')
numberOfClasses = len(set(labels))
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
if chosen_model == 'bert-base-multilingual-cased':
model = BertForSequenceClassification.from_pretrained(
chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
)
elif chosen_model == 'camembert-base':
model = CamembertForSequenceClassification.from_pretrained(
chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
# You can increase this for multi-class tasks.
output_attentions = False, # Whether the model returns attentions weights.
output_hidden_states = False, # Whether the model returns all hidden-states.
)
    # Move the model to the selected device (GPU if available, otherwise CPU).
    model.to(device)
#Note: AdamW is a class from the huggingface library (as opposed to pytorch)
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
eps = 1e-8 # args.adam_epsilon - default is 1e-8.
)
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0, # Default value in run_glue.py
num_training_steps = total_steps)
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Store the average loss after each epoch so I can plot them.
loss_values = []
# For each epoch...
for epoch_i in range(0, epochs):
# ========================================
# Training
# ========================================
# Perform one full pass over the training set.
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
print('Training...')
# Measure how long the training epoch takes.
t0 = time.time()
# Reset the total loss for this epoch.
total_loss = 0
# Put the model into training mode.
model.train()
# For each batch of training data...
for step, batch in enumerate(train_dataloader):
# Progress update every 40 batches.
if step % 40 == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t0)
# Report progress.
print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
# Unpack this training batch from the dataloader.
#
# As I unpack the batch, I'll also copy each tensor to the GPU using the
# `to` method.
#
# `batch` contains three pytorch tensors:
# [0]: input ids
# [1]: attention masks
# [2]: labels
b_input_ids = batch[0].to(device)
b_input_mask = batch[1].to(device)
b_labels = batch[2].to(device)
# Always clear any previously calculated gradients before performing a
# backward pass. PyTorch doesn't do this automatically because
# accumulating the gradients is "convenient while training RNNs".
# (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
model.zero_grad()
# Perform a forward pass (evaluate the model on this training batch).
# This will return the loss (rather than the model output) because I
# have provided the `labels`.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask,
labels=b_labels)
# The call to `model` always returns a tuple, so I need to pull the
# loss value out of the tuple.
loss = outputs[0]
# Accumulate the training loss over all of the batches so that I can
# calculate the average loss at the end. `loss` is a Tensor containing a
# single value; the `.item()` function just returns the Python value
# from the tensor.
total_loss += loss.item()
# Perform a backward pass to calculate the gradients.
loss.backward()
# Clip the norm of the gradients to 1.0.
# This is to help prevent the "exploding gradients" problem.
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
# Update parameters and take a step using the computed gradient.
# The optimizer dictates the "update rule"--how the parameters are
# modified based on their gradients, the learning rate, etc.
optimizer.step()
# Update the learning rate.
scheduler.step()
# Calculate the average loss over the training data.
avg_train_loss = total_loss / len(train_dataloader)
# Store the loss value for plotting the learning curve.
loss_values.append(avg_train_loss)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Training epoch took: {:}".format(format_time(time.time() - t0)))
# ========================================
# Validation
# ========================================
# After the completion of each training epoch, measure the performance on
# the validation set.
print("")
print("Running Validation...")
t0 = time.time()
# Put the model in evaluation mode--the dropout layers behave differently
# during evaluation.
model.eval()
# Tracking variables
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
# Evaluate data for one epoch
for batch in validation_dataloader:
# Add batch to GPU
batch = tuple(t.to(device) for t in batch)
# Unpack the inputs from dataloader
b_input_ids, b_input_mask, b_labels = batch
# Telling the model not to compute or store gradients, saving memory and
# speeding up validation
with torch.no_grad():
# Forward pass, calculate logit predictions.
# This will return the logits rather than the loss because we have
# not provided labels.
# token_type_ids is the same as the "segment ids", which
# differentiates sentence 1 and 2 in 2-sentence tasks.
# The documentation for this `model` function is here:
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
outputs = model(b_input_ids,
token_type_ids=None,
attention_mask=b_input_mask)
# Get the "logits" output by the model. The "logits" are the output
# values prior to applying an activation function like the softmax.
logits = outputs[0]
# Move logits and labels to CPU
logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()
# Calculate the accuracy for this batch of test sentences.
tmp_eval_accuracy = flat_accuracy(logits, label_ids)
# Accumulate the total accuracy.
eval_accuracy += tmp_eval_accuracy
# Track the number of batches
nb_eval_steps += 1
# Report the final accuracy for this validation run.
print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
print(" Validation took: {:}".format(format_time(time.time() - t0)))
print("")
print("Training complete!")
return model
'''print('Saving Model....')
model_save_name = config.get('model','modelName')
path = config.get('model','path')
#torch.save(model.state_dict(), os.path.join(path,model_save_name))
torch.save(model, os.path.join(path,model_save_name))'''