diff --git a/evaluate_bertFineTuning.py b/evaluate_bertFineTuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c9b52bb3dac78506110b0d49716fe97e18e10bf
--- /dev/null
+++ b/evaluate_bertFineTuning.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import classification_report
+
+
+def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder):
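+    # Build a per-class report (precision, recall, f1-score, support, FP, FN, TP, TN)
+    # together with the overall accuracy and weighted averages.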
+    report = classification_report(true_labels_, pred_labels_, output_dict=True)
+
+    classes = [str(e) for e in encoder.transform(encoder.classes_)]
+    classesName = encoder.classes_
+
+    accuracy = report['accuracy']
+    weighted_avg = report['weighted avg']
+
+    precision = []
+    recall = []
+    f1 = []
+    support = []
+    dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
+    for c in classes:
+        precision.append(report[c]['precision'])
+        recall.append(report[c]['recall'])
+        f1.append(report[c]['f1-score'])
+        support.append(report[c]['support'])
+
+    cnf_matrix = confusion_matrix(true_labels_, pred_labels_)
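+    # Per-class counts derived from the confusion matrix: column sums minus the
+    # diagonal give false positives, row sums minus the diagonal give false
+    # negatives, the diagonal gives true positives and the rest are true negatives.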
+    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
+    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
+    TP = np.diag(cnf_matrix)
+    TN = cnf_matrix.sum() - (FP + FN + TP)
+
+    dff['className'] = classesName
+    dff['precision'] = precision
+    dff['recall'] = recall
+    dff['f1-score'] = f1
+    dff['support'] = support
+    dff['FP'] = FP
+    dff['FN'] = FN
+    dff['TP'] = TP
+    dff['TN'] = TN
+
+    return dff, accuracy, weighted_avg
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..8301acc2f929d750e0cea915a905a721ab8150fb
--- /dev/null
+++ b/main.py
@@ -0,0 +1,120 @@
+import os
+import configparser
+
+import pandas as pd
+import numpy as np
+import torch
+from sklearn import preprocessing
+from sklearn.model_selection import train_test_split
+
+from training_bertFineTuning import training_bertFineTuning
+from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
+from evaluate_bertFineTuning import evaluate_bertFineTuning
+
+
+def create_dict(df, classColumnName):
+    return dict(df[classColumnName].value_counts())
+
+def remove_weak_classes(df, classColumnName, threshold):
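+    # Keep only the rows whose class has at least `threshold` instances.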
+
+    dictOfClassInstances = create_dict(df,classColumnName)
+
+
+    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
+    keys = [*dictionary]
+    df_tmp = df[~ df[classColumnName].isin(keys)]
+    df =  pd.concat([df,df_tmp]).drop_duplicates(keep=False)
+    return df
+
+
+def resample_classes(df, classColumnName, numberOfInstances):
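+    # Downsample each class so that none keeps more than numberOfInstances rows.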
+
+    # Randomly sample at most numberOfInstances rows per class, without replacement.
+    replace = False
+
+    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
+    return df.groupby(classColumnName, as_index=False).apply(fn)
+
+
+
+def main():
+
+    config = configparser.ConfigParser()
+    config.read('bert_settings.conf')
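+    # The settings file is expected to look roughly like this (section and key
+    # names match the get() calls below; the values are only illustrative):
+    #
+    #   [general]
+    #   dataPath = data/corpus.csv
+    #   columnText = text
+    #   columnClass = label
+    #   minOfInstancePerClass = 50
+    #   maxOfInstancePerClass = 1500
+    #
+    #   [model]
+    #   tokeniser = bert-base-multilingual-cased
+    #   model = bert-base-multilingual-cased
+    #   max_len_sequences = 256
+    #   batch_size = 16
+    #   epochs = 4
+    #   modelName = bert_model.pt
+    #   path = ./models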
+
+    dataPath = config.get('general','dataPath')
+    columnText = config.get('general','columnText')
+    columnClass = config.get('general','columnClass')
+
+    minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
+    maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+
+    chosen_tokeniser = config.get('model','tokeniser')
+    chosen_model = config.get('model','model')
+
+    max_len = int(config.get('model','max_len_sequences'))
+    batch_size = int(config.get('model','batch_size'))
+    epochs = int(config.get('model','epochs'))
+
+    df = pd.read_csv(dataPath)
+    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
+    df = resample_classes(df, columnClass, maxOfInstancePerClass)
+    df = df[df[columnClass] != 'unclassified']
+
+
+    y  = df[columnClass]
+    numberOfClasses = y.nunique()
+    encoder = preprocessing.LabelEncoder()
+    y = encoder.fit_transform(y)
+
+
+    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+
+    sentences = train_x[columnText].values
+    labels = train_y.tolist()
+
+
+    #call train method
+
+    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)
+    #save the model
+    model_save_name = config.get('model','modelName')
+    path = config.get('model','path')
+    torch.save(model, os.path.join(path,model_save_name))
+
+    #print the model parameters
+    params = list(model.named_parameters())
+
+    print('The BERT model has {:} different named parameters.\n'.format(len(params)))
+
+    print('==== Embedding Layer ====\n')
+
+    for p in params[0:5]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== First Transformer ====\n')
+
+    for p in params[5:21]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    print('\n==== Output Layer ====\n')
+
+    for p in params[-4:]:
+        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
+
+    #call predict method
+    sentences_to_predict = test_x[columnText].values
+    labels_to_predict = test_y.tolist()
+    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, labels_to_predict, max_len, batch_size)
+    predicted_class, true_labels = predict_class_bertFineTuning(model, prediction_dataloader)
+
+    #call Evaluate
+    result_df, accuracy , weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)
+
+    print(result_df)
+    print(accuracy)
+    print(weighted_avg)
+
+
+
+if __name__ == "__main__":
+    main()
diff --git a/predict_bertFineTuning.py b/predict_bertFineTuning.py
new file mode 100644
index 0000000000000000000000000000000000000000..4276122d0b88159c6631fe2dd2db9d14603558c3
--- /dev/null
+++ b/predict_bertFineTuning.py
@@ -0,0 +1,168 @@
+import torch
+
+import pandas as pd
+
+import numpy as np
+
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from transformers import BertTokenizer, CamembertTokenizer
+
+def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size = 32):
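+    # Tokenize the sentences, truncate/pad them to max_len, build the matching
+    # attention masks and wrap everything in a sequential DataLoader for prediction.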
+
+    if chosen_model == 'bert-base-multilingual-cased' :
+        print('Loading Bert Tokenizer...')
+        # This checkpoint is cased, so keep the original casing of the input.
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=False)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
+
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids_test = []
+    # For every sentence...
+    for sent in sentences_to_predict:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+                            sent,                      # Sentence to encode.
+                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
+                    )
+
+        input_ids_test.append(encoded_sent)
+
+    # Pad our input tokens
+    padded_test = []
+    for i in input_ids_test:
+        # Truncate long sequences and pad short ones to exactly max_len tokens.
+        if len(i) > max_len:
+            padded_test.append(i[:max_len])
+        else:
+            padded_test.append(i + [0] * (max_len - len(i)))
+    input_ids_test = np.array(padded_test)
+
+    # Create attention masks
+    attention_masks = []
+
+    # Create a mask of 1s for each token followed by 0s for padding
+    for seq in input_ids_test:
+        seq_mask = [float(i>0) for i in seq]
+        attention_masks.append(seq_mask)
+
+    # Convert to tensors.
+    prediction_inputs = torch.tensor(input_ids_test)
+    prediction_masks = torch.tensor(attention_masks)
+    prediction_labels = torch.tensor(labels)
+
+
+    # Create the DataLoader.
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)
+
+    return prediction_dataloader
+
+
+
+def predict_class_bertFineTuning(model, prediction_dataloader):
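+    # Run the fine-tuned model over the prediction DataLoader and return the flat
+    # lists of predicted and true label ids.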
+
+
+    # If there's a GPU available...
+    if torch.cuda.is_available():
+
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
+
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
+
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+
+        # If not...
+    else:
+        print('No GPU available, using the CPU instead.')
+        device = torch.device("cpu")
+
+    # Make sure the model is on the same device as the batches, then put it in
+    # evaluation mode.
+    model.to(device)
+    model.eval()
+
+    # Tracking variables
+    predictions_test , true_labels = [], []
+
+    # Predict
+    for batch in prediction_dataloader:
+        # Add batch to GPU
+        batch = tuple(t.to(device) for t in batch)
+
+        # Unpack the inputs from the dataloader
+        b_input_ids, b_input_mask, b_labels = batch
+
+        # Telling the model not to compute or store gradients, saving memory and
+        # speeding up prediction
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+
+        logits = outputs[0]
+        #print(logits)
+
+        # Move logits and labels to CPU
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+        #print(logits)
+
+        # Store predictions and true labels
+        predictions_test.append(logits)
+        true_labels.append(label_ids)
+
+    print('    DONE.')
+
+    pred_labels = []
+
+    for i in range(len(true_labels)):
+        # The predictions for each batch are an ndarray with one column per class.
+        # Pick the label with the highest logit for every example.
+        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
+        pred_labels.append(pred_labels_i)
+
+    # Flatten the per-batch arrays into flat lists of predicted and true labels.
+    pred_labels_ = [item for sublist in pred_labels for item in sublist]
+    true_labels_ = [item for sublist in true_labels for item in sublist]
+    return pred_labels_, true_labels_
+
+
+def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict):
+    # Predict one sentence at a time and return the raw logits for each sentence.
+    if chosen_model == 'bert-base-multilingual-cased' :
+        print('Loading Bert Tokenizer...')
+        # This checkpoint is cased, so keep the original casing of the input.
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=False)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
+
+    # Run the model on the same device as its parameters.
+    device = next(model.parameters()).device
+    model.eval()
+
+    logits_list = []
+    # For every sentence...
+    for sent in sentences_to_predict:
+        # `encode` tokenizes the sentence, adds '[CLS]' and '[SEP]' and maps the
+        # tokens to their word IDs.
+        encoded_sent = tokenizer.encode(sent, add_special_tokens = True)
+
+        b_input_ids = torch.tensor([encoded_sent]).to(device)
+        b_input_mask = torch.tensor([[int(token_id > 0) for token_id in encoded_sent]]).to(device)
+
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+
+        logits = outputs[0]
+        logits_list.append(logits.detach().cpu().numpy())
+
+    return logits_list
diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py
index 72a5929c733d95aebeab139af781661c184e984b..285be2d9a72d13d6cc693ad2a1c373b571e5fe86 100644
--- a/training_bertFineTuning.py
+++ b/training_bertFineTuning.py
@@ -2,464 +2,399 @@ import torch
 import pandas as pd
 import numpy as np
 from sklearn import preprocessing
 from sklearn.model_selection import train_test_split
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from transformers import BertTokenizer, CamembertTokenizer
 from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification
 from transformers import get_linear_schedule_with_warmup
 import time
 import datetime
 import random
+import os
 
 
 
+def flat_accuracy(preds, labels):
+    pred_flat = np.argmax(preds, axis=1).flatten()
+    labels_flat = labels.flatten()
+    return np.sum(pred_flat == labels_flat) / len(labels_flat)
 
 
-###########################################################################
-########################## Utils Functions ################################
-###########################################################################
-
-def create_dict(df, classColumnName):
-    return dict(df[classColumnName].value_counts())
-
-def remove_weak_classes(df, classColumnName, threshold):
-
-    dictOfClassInstances = create_dict(df,classColumnName)
-
-
-    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold }
-    keys = [*dictionary]
-    df_tmp = df[~ df[classColumnName].isin(keys)]
-    df =  pd.concat([df,df_tmp]).drop_duplicates(keep=False)
-    return df
-
-
-def resample_classes(df, classColumnName, numberOfInstances):
-
-    #random numberOfInstances elements
-    replace = False  # with replacement
-
-    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
-    return df.groupby(classColumnName, as_index=False).apply(fn)
-
-##############################################################################################################
-########################## Setup GPU #########################################################################
-##############################################################################################################
-
-# If there's a GPU available...
-if torch.cuda.is_available():
 
-    # Tell PyTorch to use the GPU.
-    device = torch.device("cuda")
 
-    print('There are %d GPU(s) available.' % torch.cuda.device_count())
 
-    print('We will use the GPU:', torch.cuda.get_device_name(0))
+def format_time(elapsed):
+    '''
+    Takes a time in seconds and returns a string hh:mm:ss
+    '''
+    # Round to the nearest second.
+    elapsed_rounded = int(round((elapsed)))
 
-# If not...
-else:
-    print('No GPU available, using the CPU instead.')
-    device = torch.device("cpu")
+    # Format as hh:mm:ss
+    return str(datetime.timedelta(seconds=elapsed_rounded))
 
 
+def training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs = 4):
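+    # Tokenize and pad the training sentences, fine-tune the chosen pretrained
+    # model on a 90/10 train/validation split, and return the trained model.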
 
+    # If there's a GPU available...
+    if torch.cuda.is_available():
 
-#############################################################################################################
-########################## parameters ###################################################################
-###########################################################################################################
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
 
-config = configparser.ConfigParser()
-config.read('settings.conf')
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
 
-dataPath = config.get('general','dataPath')
-columnText = config.get('general','columnText')
-columnClass = config.get('general','columnClass')
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
 
-minOfInstancePerClass = int(config.get('general','minOfInstancePerClass'))
-maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass'))
+        # If not...
+    else:
+        print('No GPU available, using the CPU instead.')
+        device = torch.device("cpu")
 
-chosen_tokeniser = config.get('model','tokeniser')
-chosen_model = config.get('model','model')
 
-max_len = int(config.get('model','max_len_sequences'))
 
 
-#############################################################################################################
-########################## Load Data ###################################################################
+############################################################################################################
+########################## Model: Tokenization & Input Formatting ###################################################################
 ###########################################################################################################
 
 
+    if chosen_model == 'bert-base-multilingual-cased' :
+        print('Loading Bert Tokenizer...')
+        # This checkpoint is cased, so keep the original casing of the input.
+        tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=False)
+    elif chosen_model == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True)
 
 
-df = pd.read_csv(dataPath)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
-df = resample_classes(df, columnClass, maxOfInstancePerClass)
-df = df[df[columnClass] != 'unclassified']
-
 
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids = []
 
+    # For every sentence...
+    for sent in sentences:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+                            sent,                      # Sentence to encode.
+                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
 
+                            # This function also supports truncation and conversion
+                            # to pytorch tensors, but I need to do padding, so I
+                            # can't use these features.
+                            #max_length = 128,          # Truncate all sentences.
+                            #return_tensors = 'pt',     # Return pytorch tensors.
+                            )
 
-y  = df[columnClass]
-numberOfClasses = y.nunique()
-encoder = preprocessing.LabelEncoder()
-y = encoder.fit_transform(y)
+        # Add the encoded sentence to the list.
+        input_ids.append(encoded_sent)
 
 
 
-sentences = train_x[columnText].values
-labels = train_y.tolist()
 
+    padded = []
+    for i in input_ids:
 
+        if len(i) > max_len:
+            padded.extend([i[:max_len]])
+        else:
+            padded.extend([i + [0] * (max_len - len(i))])
 
-############################################################################################################
-########################## Model: Tokenization & Input Formatting ###################################################################
-###########################################################################################################
-
-
-# Load the BERT tokenizer.
-print('Loading BERT tokenizer...')
-tokenizer = BertTokenizer.from_pretrained(tokeniser_bert, do_lower_case=True)
 
+    padded  = np.array(padded)
 
- # Tokenize all of the sentences and map the tokens to thier word IDs.
-input_ids = []
 
-# For every sentence...
-for sent in sentences:
-    # `encode` will:
-    #   (1) Tokenize the sentence.
-    #   (2) Prepend the `[CLS]` token to the start.
-    #   (3) Append the `[SEP]` token to the end.
-    #   (4) Map tokens to their IDs.
-    encoded_sent = tokenizer.encode(
-                        sent,                      # Sentence to encode.
-                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
 
-                        # This function also supports truncation and conversion
-                        # to pytorch tensors, but I need to do padding, so I
-                        # can't use these features.
-                        #max_length = 128,          # Truncate all sentences.
-                        #return_tensors = 'pt',     # Return pytorch tensors.
-                   )
+    # Create attention masks
+    attention_masks = []
 
-    # Add the encoded sentence to the list.
-    input_ids.append(encoded_sent)
+    # For each sentence...
+    for sent in padded:
 
+        # Create the attention mask.
+        #   - If a token ID is 0, then it's padding, set the mask to 0.
+        #   - If a token ID is > 0, then it's a real token, set the mask to 1.
+        att_mask = [int(token_id > 0) for token_id in sent]
 
+        # Store the attention mask for this sentence.
+        attention_masks.append(att_mask)
 
 
-padded = []
-for i in input_ids:
+    # Use 90% for training and 10% for validation.
+    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.1, stratify = labels )
+    # Do the same for the masks.
+    train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1, stratify = labels)
 
-  if len(i) > max_len:
-    padded.extend([i[:max_len]])
-  else:
-    padded.extend([i + [0] * (max_len - len(i))])
 
+    # Convert all inputs and labels into torch tensors, the required datatype
+    # for my model.
+    train_inputs = torch.tensor(train_inputs)
+    validation_inputs = torch.tensor(validation_inputs)
 
-padded = input_ids = np.array(padded)
+    train_labels = torch.tensor(train_labels)
+    validation_labels = torch.tensor(validation_labels)
 
+    train_masks = torch.tensor(train_masks)
+    validation_masks = torch.tensor(validation_masks)
 
 
- # Create attention masks
-attention_masks = []
 
-# For each sentence...
-for sent in padded:
 
-    # Create the attention mask.
-    #   - If a token ID is 0, then it's padding, set the mask to 0.
-    #   - If a token ID is > 0, then it's a real token, set the mask to 1.
-    att_mask = [int(token_id > 0) for token_id in sent]
+    # The DataLoader needs to know the batch size for training, so I specify it here.
+    # For fine-tuning BERT on a specific task, the authors recommend a batch size of
+    # 16 or 32.
 
-    # Store the attention mask for this sentence.
-    attention_masks.append(att_mask)
 
+    # Create the DataLoader for training set.
+    train_data = TensorDataset(train_inputs, train_masks, train_labels)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
 
-# Use 90% for training and 10% for validation.
-train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels,
-                                                            random_state=2018, test_size=0.1, stratify = labels )
-# Do the same for the masks.
-train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels,
-                                             random_state=2018, test_size=0.1, stratify = labels)
+    # Create the DataLoader for validation set.
+    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
+    validation_sampler = SequentialSampler(validation_data)
+    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
 
 
-# Convert all inputs and labels into torch tensors, the required datatype
-# for my model.
-train_inputs = torch.tensor(train_inputs)
-validation_inputs = torch.tensor(validation_inputs)
 
-train_labels = torch.tensor(train_labels)
-validation_labels = torch.tensor(validation_labels)
 
-train_masks = torch.tensor(train_masks)
-validation_masks = torch.tensor(validation_masks)
 
+    print(' Selecting a model .....')
 
+    numberOfClasses = len(set(labels))
 
-from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 
-# The DataLoader needs to know the batch size for training, so I specify it here.
-# For fine-tuning BERT on a specific task, the authors recommend a batch size of
-# 16 or 32.
+    # Load BertForSequenceClassification, the pretrained BERT model with a single
+    # linear classification layer on top.
+    if chosen_model == 'bert-base-multilingual-cased':
+        model = BertForSequenceClassification.from_pretrained(
+            chosen_model, # The 12-layer multilingual cased BERT model.
+            num_labels = numberOfClasses, # One output unit per class.
+            output_attentions = False, # Whether the model returns attentions weights.
+            output_hidden_states = False, # Whether the model returns all hidden-states.
+            )
+    elif chosen_model == 'camembert-base':
 
-batch_size = int(config.get('model','batch_size'))
+        model = CamembertForSequenceClassification.from_pretrained(
+            chosen_model, # The 12-layer French CamemBERT model.
+            num_labels = numberOfClasses, # One output unit per class.
+            output_attentions = False, # Whether the model returns attentions weights.
+            output_hidden_states = False, # Whether the model returns all hidden-states.
+            )
 
-# Create the DataLoader for training set.
-train_data = TensorDataset(train_inputs, train_masks, train_labels)
-train_sampler = RandomSampler(train_data)
-train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
 
-# Create the DataLoader for validation set.
-validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
-validation_sampler = SequentialSampler(validation_data)
-validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
+    # Move the model to the selected device (GPU when available, otherwise CPU).
+    model.to(device)
 
 
+    # Note: AdamW is the implementation from the huggingface library (as opposed
+    # to pytorch); the 'W' stands for the weight-decay fix.
+    optimizer = AdamW(model.parameters(),
+                    lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
+                    eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
+                    )
 
 
-############################################################################################################
-########################## Model: Training ###################################################################
-###########################################################################################################
 
 
-print(' Selecting a model .....')
+    # Total number of training steps is number of batches * number of epochs.
+    total_steps = len(train_dataloader) * epochs
 
+    # Create the learning rate scheduler.
+    scheduler = get_linear_schedule_with_warmup(optimizer,
+                                            num_warmup_steps = 0, # Default value in run_glue.py
+                                            num_training_steps = total_steps)
 
 
-# Load BertForSequenceClassification, the pretrained BERT model with a single
-# linear classification layer on top.
 
-model = BertForSequenceClassification.from_pretrained(
-    chosen_model, # Use the 12-layer BERT model, with an uncased vocab.
-    num_labels = numberOfClasses, # The number of output labels--2 for binary classification.
-                    # You can increase this for multi-class tasks.
-    output_attentions = False, # Whether the model returns attentions weights.
-    output_hidden_states = False, # Whether the model returns all hidden-states.
-)
 
-# Tell pytorch to run this model on the GPU.
-model.cuda()
+    # This training code is based on the `run_glue.py` script here:
+    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
 
 
-#Note: AdamW is a class from the huggingface library (as opposed to pytorch)
-# I believe the 'W' stands for 'Weight Decay fix"
-optimizer = AdamW(model.parameters(),
-                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
-                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
-                )
+    # Set the seed value all over the place to make this reproducible.
+    seed_val = 42
 
+    random.seed(seed_val)
+    np.random.seed(seed_val)
+    torch.manual_seed(seed_val)
+    torch.cuda.manual_seed_all(seed_val)
 
+    # Store the average loss after each epoch so I can plot them.
+    loss_values = []
 
-# Number of training epochs (authors recommend between 2 and 4)
-epochs = int(config.get('model','epochs'))
+    # For each epoch...
+    for epoch_i in range(0, epochs):
 
-# Total number of training steps is number of batches * number of epochs.
-total_steps = len(train_dataloader) * epochs
+        # ========================================
+        #               Training
+        # ========================================
 
-# Create the learning rate scheduler.
-scheduler = get_linear_schedule_with_warmup(optimizer,
-                                            num_warmup_steps = 0, # Default value in run_glue.py
-                                            num_training_steps = total_steps)
+        # Perform one full pass over the training set.
 
+        print("")
+        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+        print('Training...')
 
-def flat_accuracy(preds, labels):
-    pred_flat = np.argmax(preds, axis=1).flatten()
-    labels_flat = labels.flatten()
-    return np.sum(pred_flat == labels_flat) / len(labels_flat)
+        # Measure how long the training epoch takes.
+        t0 = time.time()
 
+        # Reset the total loss for this epoch.
+        total_loss = 0
 
+        # Put the model into training mode.
+        model.train()
 
+        # For each batch of training data...
+        for step, batch in enumerate(train_dataloader):
 
+            # Progress update every 40 batches.
+            if step % 40 == 0 and not step == 0:
+                # Calculate elapsed time in minutes.
+                elapsed = format_time(time.time() - t0)
 
-def format_time(elapsed):
-    '''
-    Takes a time in seconds and returns a string hh:mm:ss
-    '''
-    # Round to the nearest second.
-    elapsed_rounded = int(round((elapsed)))
+                # Report progress.
+                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
 
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
+            # Unpack this training batch from the dataloader.
+            #
+            # As I unpack the batch, I'll also copy each tensor to the GPU using the
+            # `to` method.
+            #
+            # `batch` contains three pytorch tensors:
+            #   [0]: input ids
+            #   [1]: attention masks
+            #   [2]: labels
+            b_input_ids = batch[0].to(device)
+            b_input_mask = batch[1].to(device)
+            b_labels = batch[2].to(device)
 
+            # Always clear any previously calculated gradients before performing a
+            # backward pass. PyTorch doesn't do this automatically because
+            # accumulating the gradients is "convenient while training RNNs".
+            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
+            model.zero_grad()
 
+            # Perform a forward pass (evaluate the model on this training batch).
+            # This will return the loss (rather than the model output) because I
+            # have provided the `labels`.
+            # The documentation for this `model` function is here:
+            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+            outputs = model(b_input_ids,
+                        token_type_ids=None,
+                        attention_mask=b_input_mask,
+                        labels=b_labels)
 
+            # The call to `model` always returns a tuple, so I need to pull the
+            # loss value out of the tuple.
+            loss = outputs[0]
 
-# This training code is based on the `run_glue.py` script here:
-# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
+            # Accumulate the training loss over all of the batches so that I can
+            # calculate the average loss at the end. `loss` is a Tensor containing a
+            # single value; the `.item()` function just returns the Python value
+            # from the tensor.
+            total_loss += loss.item()
 
+            #  Perform a backward pass to calculate the gradients.
+            loss.backward()
 
-# Set the seed value all over the place to make this reproducible.
-seed_val = 42
+            # Clip the norm of the gradients to 1.0.
+            # This is to help prevent the "exploding gradients" problem.
+            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
 
-random.seed(seed_val)
-np.random.seed(seed_val)
-torch.manual_seed(seed_val)
-torch.cuda.manual_seed_all(seed_val)
+            # Update parameters and take a step using the computed gradient.
+            # The optimizer dictates the "update rule"--how the parameters are
+            # modified based on their gradients, the learning rate, etc.
+            optimizer.step()
 
-# Store the average loss after each epoch so I can plot them.
-loss_values = []
+            # Update the learning rate.
+            scheduler.step()
 
-# For each epoch...
-for epoch_i in range(0, epochs):
+        # Calculate the average loss over the training data.
+        avg_train_loss = total_loss / len(train_dataloader)
 
-    # ========================================
-    #               Training
-    # ========================================
+        # Store the loss value for plotting the learning curve.
+        loss_values.append(avg_train_loss)
 
-    # Perform one full pass over the training set.
+        print("")
+        print("  Average training loss: {0:.2f}".format(avg_train_loss))
+        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
 
-    print("")
-    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
-    print('Training...')
-
-    # Measure how long the training epoch takes.
-    t0 = time.time()
-
-    # Reset the total loss for this epoch.
-    total_loss = 0
-
-    # Put the model into training mode.
-    model.train()
-
-    # For each batch of training data...
-    for step, batch in enumerate(train_dataloader):
-
-        # Progress update every 40 batches.
-        if step % 40 == 0 and not step == 0:
-            # Calculate elapsed time in minutes.
-            elapsed = format_time(time.time() - t0)
-
-            # Report progress.
-            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
-
-        # Unpack this training batch from the dataloader.
-        #
-        # As I unpack the batch, I'll also copy each tensor to the GPU using the
-        # `to` method.
-        #
-        # `batch` contains three pytorch tensors:
-        #   [0]: input ids
-        #   [1]: attention masks
-        #   [2]: labels
-        b_input_ids = batch[0].to(device)
-        b_input_mask = batch[1].to(device)
-        b_labels = batch[2].to(device)
-
-        # Always clear any previously calculated gradients before performing a
-        # backward pass. PyTorch doesn't do this automatically because
-        # accumulating the gradients is "convenient while training RNNs".
-        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
-        model.zero_grad()
-
-        # Perform a forward pass (evaluate the model on this training batch).
-        # This will return the loss (rather than the model output) because I
-        # have provided the `labels`.
-        # The documentation for this `model` function is here:
-        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
-        outputs = model(b_input_ids,
-                    token_type_ids=None,
-                    attention_mask=b_input_mask,
-                    labels=b_labels)
-
-        # The call to `model` always returns a tuple, so I need to pull the
-        # loss value out of the tuple.
-        loss = outputs[0]
-
-        # Accumulate the training loss over all of the batches so that I can
-        # calculate the average loss at the end. `loss` is a Tensor containing a
-        # single value; the `.item()` function just returns the Python value
-        # from the tensor.
-        total_loss += loss.item()
-
-        # Perform a backward pass to calculate the gradients.
-        loss.backward()
-
-        # Clip the norm of the gradients to 1.0.
-        # This is to help prevent the "exploding gradients" problem.
-        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
-
-        # Update parameters and take a step using the computed gradient.
-        # The optimizer dictates the "update rule"--how the parameters are
-        # modified based on their gradients, the learning rate, etc.
-        optimizer.step()
-
-        # Update the learning rate.
-        scheduler.step()
-
-    # Calculate the average loss over the training data.
-    avg_train_loss = total_loss / len(train_dataloader)
-
-    # Store the loss value for plotting the learning curve.
-    loss_values.append(avg_train_loss)
+        # ========================================
+        #               Validation
+        # ========================================
+        # After the completion of each training epoch, measure the performance on
+        # the validation set.
 
-    print("")
-    print("  Average training loss: {0:.2f}".format(avg_train_loss))
-    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
+        print("")
+        print("Running Validation...")
 
-    # ========================================
-    #               Validation
-    # ========================================
-    # After the completion of each training epoch, measure the performance on
-    # the validation set.
+        t0 = time.time()
 
-    print("")
-    print("Running Validation...")
+        # Put the model in evaluation mode--the dropout layers behave differently
+        # during evaluation.
+        model.eval()
 
-    t0 = time.time()
+        # Tracking variables
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
 
-    # Put the model in evaluation mode--the dropout layers behave differently
-    # during evaluation.
-    model.eval()
+        # Evaluate data for one epoch
+        for batch in validation_dataloader:
 
-    # Tracking variables
-    eval_loss, eval_accuracy = 0, 0
-    nb_eval_steps, nb_eval_examples = 0, 0
+            # Add batch to GPU
+            batch = tuple(t.to(device) for t in batch)
 
-    # Evaluate data for one epoch
-    for batch in validation_dataloader:
+            # Unpack the inputs from dataloader
+            b_input_ids, b_input_mask, b_labels = batch
 
-        # Add batch to GPU
-        batch = tuple(t.to(device) for t in batch)
+            # Telling the model not to compute or store gradients, saving memory and
+            # speeding up validation
+            with torch.no_grad():
 
-        # Unpack the inputs from dataloader
-        b_input_ids, b_input_mask, b_labels = batch
+                # Forward pass, calculate logit predictions.
+                # This will return the logits rather than the loss because we have
+                # not provided labels.
+                # token_type_ids is the same as the "segment ids", which
+                # differentiates sentence 1 and 2 in 2-sentence tasks.
+                # The documentation for this `model` function is here:
+                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
+                outputs = model(b_input_ids,
+                                token_type_ids=None,
+                                attention_mask=b_input_mask)
 
-        # Telling the model not to compute or store gradients, saving memory and
-        # speeding up validation
-        with torch.no_grad():
+            # Get the "logits" output by the model. The "logits" are the output
+            # values prior to applying an activation function like the softmax.
+            logits = outputs[0]
 
-            # Forward pass, calculate logit predictions.
-            # This will return the logits rather than the loss because we have
-            # not provided labels.
-            # token_type_ids is the same as the "segment ids", which
-            # differentiates sentence 1 and 2 in 2-sentence tasks.
-            # The documentation for this `model` function is here:
-            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
-            outputs = model(b_input_ids,
-                            token_type_ids=None,
-                            attention_mask=b_input_mask)
+            # Move logits and labels to CPU
+            logits = logits.detach().cpu().numpy()
+            label_ids = b_labels.to('cpu').numpy()
 
-        # Get the "logits" output by the model. The "logits" are the output
-        # values prior to applying an activation function like the softmax.
-        logits = outputs[0]
+            # Calculate the accuracy for this batch of test sentences.
+            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
 
-        # Move logits and labels to CPU
-        logits = logits.detach().cpu().numpy()
-        label_ids = b_labels.to('cpu').numpy()
+            # Accumulate the total accuracy.
+            eval_accuracy += tmp_eval_accuracy
 
-        # Calculate the accuracy for this batch of test sentences.
-        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
+            # Track the number of batches
+            nb_eval_steps += 1
 
-        # Accumulate the total accuracy.
-        eval_accuracy += tmp_eval_accuracy
+        # Report the final accuracy for this validation run.
+        print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
+        print("  Validation took: {:}".format(format_time(time.time() - t0)))
 
-        # Track the number of batches
-        nb_eval_steps += 1
+    print("")
+    print("Training complete!")
+    return model
 
-    # Report the final accuracy for this validation run.
-    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
-    print("  Validation took: {:}".format(format_time(time.time() - t0)))
 
-print("")
-print("Training complete!")
+'''print('Saving Model....')
+model_save_name = config.get('model','modelName')
+path = config.get('model','path')
+#torch.save(model.state_dict(), os.path.join(path,model_save_name))
+torch.save(model, os.path.join(path,model_save_name))'''