"""Fine-tune a BERT classifier on a labelled corpus, then evaluate it.

The pipeline reads its settings from bert_settings.conf, filters the
corpus, fine-tunes the model on a training split, saves it, and reports
metrics on the held-out test split.
"""

import os
import configparser

import numpy as np
import pandas as pd
import torch
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from training_bertFineTuning import training_bertFineTuning
from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader
from evaluate_bertFineTuning import evaluate_bertFineTuning


def create_dict(df, classColumnName):
    # Map each class label to its number of instances.
    return dict(df[classColumnName].value_counts())


def remove_weak_classes(df, classColumnName, threshold):
    # Keep only the rows whose class has at least `threshold` instances.
    dictOfClassInstances = create_dict(df, classColumnName)
    keys = [k for k, v in dictOfClassInstances.items() if v >= threshold]
    return df[df[classColumnName].isin(keys)]


def resample_classes(df, classColumnName, numberOfInstances):
    # Downsample each class to at most `numberOfInstances` rows,
    # drawn at random without replacement.
    replace = False
    fn = lambda obj: obj.loc[np.random.choice(obj.index, min(len(obj), numberOfInstances), replace=replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)


def main():
    config = configparser.ConfigParser()
    config.read('bert_settings.conf')

    dataPath = config.get('general', 'dataPath')
    columnText = config.get('general', 'columnText')
    columnClass = config.get('general', 'columnClass')

    minOfInstancePerClass = int(config.get('general', 'minOfInstancePerClass'))
    maxOfInstancePerClass = int(config.get('general', 'maxOfInstancePerClass'))

    chosen_tokeniser = config.get('model', 'tokeniser')
    chosen_model = config.get('model', 'model')

    max_len = int(config.get('model', 'max_len_sequences'))
    batch_size = int(config.get('model', 'batch_size'))
    epochs = int(config.get('model', 'epochs'))

    # Load the corpus and filter it: drop under-represented classes,
    # cap over-represented ones, and discard unlabelled rows.
    df = pd.read_csv(dataPath)
    df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
    df = resample_classes(df, columnClass, maxOfInstancePerClass)
    df = df[df[columnClass] != 'unclassified']

    y = df[columnClass]
    numberOfClasses = y.nunique()

    # Encode the class labels as integers.
    encoder = preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)

    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify=y)

    sentences = train_x[columnText].values
    labels = train_y.tolist()

    # Fine-tune the model on the training split.
    model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs)

    # Save the fine-tuned model.
    model_save_name = config.get('model', 'modelName')
    path = config.get('model', 'path')
    torch.save(model, os.path.join(path, model_save_name))

    # Print the model parameters.
    params = list(model.named_parameters())
    print('The BERT model has {:} different named parameters.\n'.format(len(params)))

    print('==== Embedding Layer ====\n')
    for p in params[0:5]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== First Transformer ====\n')
    for p in params[5:21]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    print('\n==== Output Layer ====\n')
    for p in params[-4:]:
        print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

    # Predict on the held-out test split.
    sentences_to_predict = test_x[columnText].values
    labels_to_predict = test_y.tolist()
    prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, labels_to_predict, max_len, batch_size=32)
    predicted_class, true_labels = predict_class_bertFineTuning(chosen_model, model, prediction_dataloader)

    # Evaluate the predictions against the true test labels.
    result_df, accuracy, weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder)

    print(result_df)
    print(accuracy)
    print(weighted_avg)


if __name__ == "__main__":
    main()
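
# For reference, a minimal bert_settings.conf that this script can parse.
# The section and key names are taken from the config.get() calls above;
# every value below is an illustrative assumption, not a setting shipped
# with the project.
#
#   [general]
#   dataPath = data/corpus.csv
#   columnText = text
#   columnClass = class
#   minOfInstancePerClass = 50
#   maxOfInstancePerClass = 1500
#
#   [model]
#   tokeniser = bert-base-multilingual-cased
#   model = bert-base-multilingual-cased
#   max_len_sequences = 256
#   batch_size = 32
#   epochs = 4
#   modelName = bert_model.pt
#   path = models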