From 9eb58e8aeeac61a7d96cbfbf5f17d92357a43d73 Mon Sep 17 00:00:00 2001 From: Ludovic Moncla <moncla.ludovic@gmail.com> Date: Mon, 11 Jul 2022 09:01:43 +0200 Subject: [PATCH] deleted --- ClassPreprocessor.py | 50 --- bert_settings.conf | 15 - classifiers.py | 41 --- data_preparation.py | 51 --- data_preprocessing.py | 134 -------- evaluate_bertFineTuning.py | 54 ---- evaluate_model.py | 59 ---- experimentsClassicClassifiers.py | 163 ---------- features_extractor.py | 111 ------- main.py | 120 ------- predict_bertFineTuning.py | 168 ---------- settings.conf | 10 - tmp_preprocess_data.py | 105 ------- training_bertFineTuning.py | 515 ------------------------------- 14 files changed, 1596 deletions(-) delete mode 100644 ClassPreprocessor.py delete mode 100644 bert_settings.conf delete mode 100644 classifiers.py delete mode 100644 data_preparation.py delete mode 100644 data_preprocessing.py delete mode 100644 evaluate_bertFineTuning.py delete mode 100644 evaluate_model.py delete mode 100644 experimentsClassicClassifiers.py delete mode 100644 features_extractor.py delete mode 100644 main.py delete mode 100644 predict_bertFineTuning.py delete mode 100644 settings.conf delete mode 100644 tmp_preprocess_data.py delete mode 100644 training_bertFineTuning.py diff --git a/ClassPreprocessor.py b/ClassPreprocessor.py deleted file mode 100644 index c861144..0000000 --- a/ClassPreprocessor.py +++ /dev/null @@ -1,50 +0,0 @@ -import pandas as pd -import numpy as np -import statistics - -def create_dict(df, classColumnName): - return dict(df[classColumnName].value_counts()) - -def remove_weak_classes(df, classColumnName, threshold): - - dictOfClassInstances = create_dict(df,classColumnName) - - - dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold } - keys = [*dictionary] - df_tmp = df[~ df[classColumnName].isin(keys)] - #df = df[df[columnTarget] not in keys] - #df = df.merge(df_tmp, how = 'outer' ,indicator=True) - df = pd.concat([df,df_tmp]).drop_duplicates(keep=False) - return df - - - -def split_class(df, columnProcessed): - i = 0 - new_df = pd.DataFrame(columns= df.columns) - for index, row in df.iterrows(): - #cls = re.split(';', row[columnProcessed]) - cls = filter(None, row[columnProcessed].split(';')) - cls = list(cls) - #cls = re.findall(r"[\w']+", row [columnProcessed]) - r = row - for categ in cls: - r[columnProcessed] = categ - #new_df.append(r, ignore_index = True) - new_df.loc[i] = r - i = i + 1 - - return new_df - -def get_median_dict(dict): - return statistics.median(dict.values()) - -def resample_classes(df, classColumnName, numberOfInstances): - # numberOfInstances first elements - #return df.groupby(classColumnName).apply(lambda x: x[:numberOfInstances][df.columns]) - #random numberOfInstances elements - replace = False # with replacement - - fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:] - return df.groupby(classColumnName, as_index=False).apply(fn) diff --git a/bert_settings.conf b/bert_settings.conf deleted file mode 100644 index 66b061e..0000000 --- a/bert_settings.conf +++ /dev/null @@ -1,15 +0,0 @@ -[general] -columnText = contentWithoutClass -columnClass = ensemble_domaine_enccre -minOfInstancePerClass = 50 -maxOfInstancePerClass = 1500 - - -[model] -model = bert -path = bert-base-multilingual-cased -#model = camembert -#path = camembert-base -max_len_sequences = 256 -batch_size = 32 -epochs = 4 diff --git a/classifiers.py b/classifiers.py deleted file mode 100644 index 
0d67584..0000000 --- a/classifiers.py +++ /dev/null @@ -1,41 +0,0 @@ - -from sklearn.naive_bayes import MultinomialNB -from sklearn.svm import SVC -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier -from sklearn.linear_model import LogisticRegression -from sklearn.linear_model import SGDClassifier -from sklearn.neighbors import KNeighborsClassifier - -import numpy as np - - -classifiers = [ - ('bayes', MultinomialNB()), - ('lr', LogisticRegression()), - ('sgd', SGDClassifier()), - ('svm', SVC() ), - #('decisionTree',DecisionTreeClassifier()), - ('rfc', RandomForestClassifier()), - #('knn', KNeighborsClassifier()) - ] - - -param_grid_svm = {'kernel':['linear','rbf']} -#param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) } -param_grid_rfc = { 'max_features': ['sqrt', 'log2'], 'max_depth' : [4,5,6,7,8]} -param_grid_lr = {"C":np.logspace(-3,3,7)} -param_grid_sgd = { "loss" : ["log", "modified_huber"]} -#param_grid_knn = {'n_neighbors' : list(range(3,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] } - - -grid_params = [ - ('bayes', None), - ('lr', param_grid_lr), - ('sgd', param_grid_sgd ), - ('svm', param_grid_svm), - #('decisionTree', param_grid_decisionTree), - ('rfc', param_grid_rfc ), - #('knn', param_grid_knn), - - ] diff --git a/data_preparation.py b/data_preparation.py deleted file mode 100644 index 4de045d..0000000 --- a/data_preparation.py +++ /dev/null @@ -1,51 +0,0 @@ -from os import path -from os.path import basename, splitext -import pandas as pd -import os -from data_process.TEIFile import TEIFile - - - - -def basename_without_ext(path): - base_name = basename(path) - stem, ext = splitext(base_name) - if stem.endswith('.tei'): - # Return base name without tei file - return stem[0:-4] - else: - return stem - - -def tei_to_csv_entry(tei_file, txt_file): - print(f"Going on {tei_file}") - tei = TEIFile(tei_file, txt_file) - print(f"Handled {tei_file}") - base_name = basename_without_ext(tei_file) - return base_name, tei._text, tei._Head, tei._author, tei._Objecttype, tei._Class, tei._normclass, tei._generatedclass, tei._englishclass, tei._attribution - - -input_path = r'./data/EDdA/' -output_name = "corpus_tei.csv" - -column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"] - -df = pd.DataFrame(columns = column_names) - -marge = 0 - - -for tome in os.listdir(input_path): - volume = tome[1:] - - for index, article in enumerate(os.listdir(input_path + tome +"/")): - filepath = os.path.join(input_path, tome, article) - base_name = basename_without_ext(filepath) - - df.loc[index+marge] = tei_to_csv_entry(filepath, ' ') - df.loc[index+marge]['articleName'] = volume+'_'+base_name - marge += index +1 - - - -df.to_csv(output_name, index=False) diff --git a/data_preprocessing.py b/data_preprocessing.py deleted file mode 100644 index 5fe379e..0000000 --- a/data_preprocessing.py +++ /dev/null @@ -1,134 +0,0 @@ -import pandas as pd -import numpy as np -from re import search -import math -from unidecode import unidecode -from sklearn.feature_extraction.text import CountVectorizer -from nltk.stem.snowball import SnowballStemmer -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize -import re -import nltk - -class Preprocessor(): - - def init(self): - pass - - - def remove_null_rows(self, df, columnName): - #df 
= df[df[columnName].notna()] - df.dropna(subset = [columnName], inplace = True) - df.reset_index(drop=True, inplace=True) - - return - - def removeMarkers(self, df, textColumn, markerColumn = 'class'): - - #remove null values or add condition if exist - #self.remove_null_rows(df, markerColumn) - #self.remove_null_rows(df, textColumn) - - for index, row in df.iterrows(): - if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]): - - marker = row[markerColumn] - marker_with_brcts = '('+ marker +')' - row[textColumn] = row[textColumn].replace(marker_with_brcts , "") - row[textColumn] = row[textColumn].replace(marker , "") - full_text = row[textColumn] - i = unidecode(full_text).find(marker_with_brcts) - goOn = False - if i != -1: - goOn = True - while goOn: - - full_text = "".join((full_text[:i],"",full_text[i+len(marker_with_brcts):])) - i = unidecode(full_text).find(marker_with_brcts) - if i == -1: - goOn = False - - - row[textColumn] = full_text - - return df - - - def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence): - - stop_words = set(stopwords.words('french')) - stemmer_fr = SnowballStemmer("french") - analyzer = CountVectorizer().build_analyzer() - - def token_fr(doc): - return (w for w in analyzer(doc) if not w in stop_words) - - stem_vectorizer_fr = CountVectorizer( stop_words= 'french', analyzer= token_fr, max_df= max_word_occurence , min_df= min_word_occurence, max_features=None) - - docs = [] - - for index, row in df.iterrows(): - docs.append(row[textColumn]) - - stem_vectorizer_fr.fit(docs) - featured_docs = stem_vectorizer_fr.transform(docs) - tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs) - - for index, tokens in enumerate(tokens_per_docs): - # join token to recreate text with new tokens - new_text = ' '.join(tokens) - df.loc[index][textColumn] = new_text - - return - - def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article): - - stop_words = set(stopwords.words('french')) - stemmer_fr = SnowballStemmer("french") - analyzer = CountVectorizer().build_analyzer() - - def token_fr(doc): - return (w for w in analyzer(doc) if not w in stop_words) - - stem_vectorizer_fr = CountVectorizer( stop_words= 'french', analyzer= token_fr) - - docs = [] - - for index, row in df.iterrows(): - docs.append(row[textColumn]) - - stem_vectorizer_fr.fit(docs) - featured_docs = stem_vectorizer_fr.transform(docs) - tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs) - - concerned_article_index = [] - for index, tokens in enumerate(tokens_per_docs): - - if len(tokens) <= min_word_per_article: - concerned_article_index.append(index) - - df = df.drop(index = concerned_article_index, inplace = True) - - return - - - def getFirstParagraph(self, df, textColumn, columnName): - new_column = [] - for index, row in df.iterrows(): - paragraphs = row[textColumn].split('\n \n') - new_column.append(paragraphs[0]) - df[columnName] = new_column - return - - def getFirstSentence(self, df, textColumn, columnName): - sent = [] - for index, row in df.iterrows(): - sentences = nltk.sent_tokenize(row[textColumn]) - sent.append(sentences[0]) - df[columnName] = sent - return - - def saveDataFrametoCSV(self, df, pathName): - df.to_csv(pathName) - - diff --git a/evaluate_bertFineTuning.py b/evaluate_bertFineTuning.py deleted file mode 100644 index 3c9b52b..0000000 --- a/evaluate_bertFineTuning.py +++ /dev/null @@ -1,54 +0,0 @@ -import matplotlib.pyplot as plt -from sklearn.metrics import plot_confusion_matrix -from 
sklearn.metrics import confusion_matrix -from sklearn.metrics import classification_report -import seaborn as sns - - - - - - - - - - -def evaluate_bertFineTuning(pred_labels_, true_labels_, encoder): - report = classification_report( pred_labels_, true_labels_, output_dict = True) - - classes = [str(e) for e in encoder.transform(encoder.classes_)] - classesName = encoder.classes_ - - accuracy = report['accuracy'] - weighted_avg = report['weighted avg'] - - precision = [] - recall = [] - f1 = [] - support = [] - dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN']) - for c in classes: - precision.append(report[c]['precision']) - recall.append(report[c]['recall']) - f1.append(report[c]['f1-score']) - support.append(report[c]['support']) - - accuracy = report['accuracy'] - weighted_avg = report['weighted avg'] - cnf_matrix = confusion_matrix(true_labels_, pred_labels_) - FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix) - FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix) - TP = np.diag(cnf_matrix) - TN = cnf_matrix.sum() - (FP + FN + TP) - - dff['className'] = classesName - dff['precision'] = precision - dff['recall'] = recall - dff['f1-score'] = f1 - dff['support'] = support - dff['FP'] = FP - dff['FN'] = FN - dff['TP'] = TP - dff['TN'] = TN - - return dff, accuracy, weighted_avg diff --git a/evaluate_model.py b/evaluate_model.py deleted file mode 100644 index f258ccd..0000000 --- a/evaluate_model.py +++ /dev/null @@ -1,59 +0,0 @@ -import matplotlib.pyplot as plt -import numpy as np -from sklearn.metrics import plot_confusion_matrix -from sklearn.metrics import confusion_matrix -from sklearn.metrics import classification_report -import pandas as pd -import seaborn as sns - - -def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave): - - #classifier, label_list, test_x, valid_y, title = "Confusion matrix"): - precision = [] - recall = [] - f1 = [] - support = [] - weighted_avg = None - accuracy = None - - df = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN']) - report = classification_report( y_pred, valid_y, output_dict = True) - for c in classes: - precision.append(report[c]['precision']) - recall.append(report[c]['recall']) - f1.append(report[c]['f1-score']) - support.append(report[c]['support']) - - accuracy = report['accuracy'] - weighted_avg = report['weighted avg'] - cnf_matrix = confusion_matrix(valid_y, y_pred) - FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix) - FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix) - TP = np.diag(cnf_matrix) - TN = cnf_matrix.sum() - (FP + FN + TP) - - df['className'] = classesName - df['precision'] = precision - df['recall'] = recall - df['f1-score'] = f1 - df['support'] = support - df['FP'] = FP - df['FN'] = FN - df['TP'] = TP - df['TN'] = TN - #disp = plot_confusion_matrix(classifier, test_x, valid_y, - # display_labels= label_list, - # cmap=plt.cm.Blues, - # normalize=None) - #disp.ax_.set_title(title) - - #print(title) - #print(disp.confusion_matrix) - - #plt.show() - plt.rcParams["font.size"] = 3 - plot_confusion_matrix(clf, X_test, y_test) - plt.savefig(pathSave) - return df, accuracy, weighted_avg - diff --git a/experimentsClassicClassifiers.py b/experimentsClassicClassifiers.py deleted file mode 100644 index f5d6ade..0000000 --- a/experimentsClassicClassifiers.py +++ /dev/null @@ -1,163 +0,0 @@ -import sys -import os -import time -import argparse -import pandas as pd -import numpy as np -from 
data_preprocessing import Preprocessor -from features_extractor import feature_extractor -from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class -from classifiers import classifiers, grid_params -from sklearn.model_selection import train_test_split -from sklearn import preprocessing -from evaluate_model import evaluate_model -from sklearn.model_selection import GridSearchCV -import configparser -import pickle - -import nltk -nltk.download('stopwords') -nltk.download('punkt') - -parser = argparse.ArgumentParser() -parser.add_argument("dataPath", help="Path of the dataframe") -parser.add_argument("columnText", help="the column name of the text that should preproceed", default = 'content') -parser.add_argument("columnClass", help="ColumnClass the column name of the classes") -parser.add_argument("minOfInstancePerClass", help="minOfInstancePerClass the minimum of instance required for each class", type=int) -parser.add_argument("maxOfInstancePerClass", help="maxOfInstancePerClass the maximum of instance required resamling classes", type=int) - -args = parser.parse_args() -dataPath = args.dataPath -columnText = args.columnText -columnClass = args.columnClass -minOfInstancePerClass = args.minOfInstancePerClass -maxOfInstancePerClass = args.maxOfInstancePerClass - -if not os.path.exists('reports'): - os.makedirs('reports') - -if not os.path.exists(os.path.join('reports', columnClass)): - os.makedirs(os.path.join('reports', columnClass)) - -# create directory in the reports directory so save the classification results -dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) -if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)): - os.makedirs(os.path.join('reports', columnClass, dir_name_report)) - - -# create directory to save and load models -if not os.path.exists('models'): - os.makedirs('models') - -# Reading data and preprocessings steps -preprocessor = Preprocessor() - -df = pd.read_csv(dataPath, sep="\t") - -df = remove_weak_classes(df, columnClass, minOfInstancePerClass) -df = resample_classes(df, columnClass, maxOfInstancePerClass) - -#Read configuration file for retreiving parameters of features extractors - -config = configparser.ConfigParser() -config.read('settings.conf') - -vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df')) -vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df')) -vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None - -doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size')) -max_epochs = int(config.get('vectorizers','max_epochs')) -doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count')) -doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed. 
-doc2vec_workers = int(config.get('vectorizers','doc2vec_workers')) - -print("size after resampling, ",len(df)) - -#prepare data -#df = df[df[columnClass] != 'unclassified'] -y = df[columnClass] - -print(df.head()) - -print(df[columnClass].head()) - -train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y ) -encoder = preprocessing.LabelEncoder() -train_y = encoder.fit_transform(train_y) -valid_y = encoder.fit_transform(test_y) - -print("size training set, ",len(train_x)) -print("size validation set, ",len(test_x)) - - -for columnInput in [columnText, 'firstParagraph']: - - print('Process: ' + columnInput) - - extractor = feature_extractor(train_x, test_x, columnInput, columnClass) - - features_techniques = [ - ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )), - ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)), - ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))] - - - - #case of full text - for feature_technique_name, features in features_techniques: - - print("**** Classifier :", feature_technique_name) - # features has the train_x and the test_x after vectorization - train_x, test_x = features - - for tmp_clf, tmp_grid_params in zip(classifiers, grid_params): - clf_name, clf = tmp_clf - grid_param_name, grid_param = tmp_grid_params - print(clf_name, clf, grid_param_name, grid_param) - model_file_name = columnInput + '_' +feature_technique_name + '_' + clf_name+ str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +".pkl" - - if clf_name != 'bayes' : - clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3, n_jobs=-1) - elif feature_technique_name == 'doc2vec': - continue - - t_begin = time.time() - - if os.path.isfile(os.path.join('./models', model_file_name)): - report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf') - with open(os.path.join('./models',columnClass, model_file_name), 'rb') as file: - clf = pickle.load(file) - else: - with open(os.path.join('./models',columnClass, model_file_name), 'wb') as file: - clf.fit(train_x, train_y) - pickle.dump(clf, file) - - t_end =time.time() - - training_time = t_end - t_begin - - y_pred = clf.predict(test_x) - - #evaluate model - file_name_report = columnInput + '_' +feature_technique_name + '_' + clf_name - - report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf') - report.to_csv(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.csv')) - with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f: - - sys.stdout = f # Change the standard output to the file we created. 
- print('accuracy : {}'.format(accuracy)) - print('weighted_Precision : {}'.format(weighted_avg['precision'])) - print('weighted_Recall : {}'.format(weighted_avg['recall'])) - print('weighted_F-score : {}'.format(weighted_avg['f1-score'])) - print('weighted_Support : {}'.format(weighted_avg['support'])) - print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))) - print('training time : {}'.format(training_time)) - try: - print('best parameters : {}'.format(clf.best_params_)) - except AttributeError: - pass - - #sys.stdout = sys.stdout # Reset the standard output to its original value - sys.stdout = sys.__stdout__ diff --git a/features_extractor.py b/features_extractor.py deleted file mode 100644 index 18bf313..0000000 --- a/features_extractor.py +++ /dev/null @@ -1,111 +0,0 @@ -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.feature_extraction.text import TfidfVectorizer -from nltk.stem.snowball import SnowballStemmer -from nltk.corpus import stopwords -from nltk.tokenize import word_tokenize -import string -import pandas as pd -import numpy as np -from gensim.models.doc2vec import Doc2Vec, TaggedDocument -from nltk.tokenize import word_tokenize -import spacy - - -class feature_extractor: - - def __init__(self, train_x, test_x, column, target): - - self.column = column - #self.data = data - #self.X = data[column] - #self.y = data[target] - - self.docs_train = train_x[column].tolist() - self.docs_test = test_x[column].tolist() - #for index, row in data.iterrows(): - # self.docs.append(row[column]) - - - def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ): - stop_words = set(stopwords.words('french')) - - stemmer_fr = SnowballStemmer("french") - - analyzer = CountVectorizer().build_analyzer() - - def stemmed_words_fr(doc): - return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words) - - stem_vectorizer_fr = CountVectorizer( stop_words = 'french', analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures) - - stem_vectorizer_fr.fit(self.docs_train) - - return stem_vectorizer_fr.transform(self.docs_train), stem_vectorizer_fr.transform(self.docs_test) - - - def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None): - - stop_words = set(stopwords.words('french')) - - stemmer_fr = SnowballStemmer("french") - - analyzer = TfidfVectorizer().build_analyzer() - - def stemmed_words_fr(doc): - return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words) - - tfidf_vectorizer = TfidfVectorizer(stop_words= 'french', analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures) - tfidf_vectorizer.fit(self.docs_train) - return tfidf_vectorizer.transform(self.docs_train), tfidf_vectorizer.transform(self.docs_test) - - - - - def doc2vec(self, max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers): - - nlp = spacy.load("fr_core_news_sm") - stopWords = set(stopwords.words('french')) - - - - def tokenize_fr_text(sentence): - - result = string.punctuation - - - # Tokeniser la phrase - doc = nlp(sentence) - # Retourner le texte de chaque token - return [X.text.lower() for X in doc if not X.text in stopWords and not X.text in result and not len(X.text) < 2] - - - #tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs_train)] - tagged_tr = [TaggedDocument(words = tokenize_fr_text(_d),tags = [str(i)]) for i, _d in enumerate(self.docs_train)] - #Tag test set - tagged_test = 
[TaggedDocument(words=tokenize_fr_text(_d), tags = [str(i)]) for i, _d in enumerate(self.docs_test)] - - model = Doc2Vec(vector_size=doc2vec_vec_size, min_count = doc2vec_min_count, dm = doc2vec_dm, workers = doc2vec_workers) - model.build_vocab(tagged_tr) - model.train(tagged_tr, total_examples=model.corpus_count, epochs = max_epochs) - - - - X_train = np.array([model.docvecs[str(i)] for i in range(len(tagged_tr))]) - X_test = np.array([model.infer_vector(tagged_test[i][0]) for i in range(len(tagged_test))]) - - return X_train, X_test - - - def text_based_features(self): - - # Classical measures - - df = pd.DataFrame(columns=['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count']) - df['char_count'] = self.data[self.column].apply(len) - df['word_count'] = self.data[self.column].apply(lambda x: len(x.split())) - df['word_density'] = df['char_count'] / (df['word_count']+1) - df['punctuation_count'] = self.data[self.column].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) - df['title_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()])) - df['upper_case_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()])) - - return df diff --git a/main.py b/main.py deleted file mode 100644 index 8301acc..0000000 --- a/main.py +++ /dev/null @@ -1,120 +0,0 @@ -import pandas as pd -import numpy as np -import configparser -from sklearn import preprocessing -from sklearn.model_selection import train_test_split - -from training_bertFineTuning import training_bertFineTuning -from predict_bertFineTuning import predict_class_bertFineTuning, generate_prediction_dataloader -from evaluate_bertFineTuning import evaluate_bertFineTuning - - - - - - -def create_dict(df, classColumnName): - return dict(df[classColumnName].value_counts()) - -def remove_weak_classes(df, classColumnName, threshold): - - dictOfClassInstances = create_dict(df,classColumnName) - - - dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold } - keys = [*dictionary] - df_tmp = df[~ df[classColumnName].isin(keys)] - df = pd.concat([df,df_tmp]).drop_duplicates(keep=False) - return df - - -def resample_classes(df, classColumnName, numberOfInstances): - - #random numberOfInstances elements - replace = False # with replacement - - fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:] - return df.groupby(classColumnName, as_index=False).apply(fn) - - - -def main(): - - config = configparser.ConfigParser() - config.read('bert_settings.conf') - - dataPath = config.get('general','dataPath') - columnText = config.get('general','columnText') - columnClass = config.get('general','columnClass') - - minOfInstancePerClass = int(config.get('general','minOfInstancePerClass')) - maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass')) - - chosen_tokeniser = config.get('model','tokeniser') - chosen_model = config.get('model','model') - - max_len = int(config.get('model','max_len_sequences')) - batch_size = int(config.get('model','batch_size')) - epochs = int(config.get('model','epochs')) - - df = pd.read_csv(dataPath) - df = remove_weak_classes(df, columnClass, minOfInstancePerClass) - df = resample_classes(df, columnClass, maxOfInstancePerClass) - df = df[df[columnClass] != 'unclassified'] - - - y = df[columnClass] - numberOfClasses = y.nunique() - encoder = 
preprocessing.LabelEncoder() - y = encoder.fit_transform(y) - - - train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y ) - - sentences = train_x[columnText].values - labels = train_y.tolist() - - - #call train method - - model = training_bertFineTuning(chosen_model, sentences, labels, max_len, batch_size, epochs) - #save the model - model_save_name = config.get('model','modelName') - path = config.get('model','path') - torch.save(model, os.path.join(path,model_save_name)) - - #print the model parameters - params = list(model.named_parameters()) - - print('The BERT model has {:} different named parameters.\n'.format(len(params))) - - print('==== Embedding Layer ====\n') - - for p in params[0:5]: - print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) - - print('\n==== First Transformer ====\n') - - for p in params[5:21]: - print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) - - print('\n==== Output Layer ====\n') - - for p in params[-4:]: - print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) - - #call predict method - prediction_dataloader = generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, max_len, batch_size = 32) - predicted_class, true_labels = predict_class_bertFineTuning(chosen_model, model, prediction_dataloader) - - #call Evaluate - result_df, accuracy , weighted_avg = evaluate_bertFineTuning(predicted_class, true_labels, encoder) - - print(result_df) - print(accuracy) - print(weighted_avg) - - - -if __name__ == "__main__": - main() diff --git a/predict_bertFineTuning.py b/predict_bertFineTuning.py deleted file mode 100644 index 4276122..0000000 --- a/predict_bertFineTuning.py +++ /dev/null @@ -1,168 +0,0 @@ -import torch - -import pandas as pd - -import numpy as np - -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -from transformers import BertTokenizer, CamembertTokenizer - -def generate_prediction_dataloader(chosen_model, sentences_to_predict, labels, batch_size = 32): - - if chosen_model == 'bert-base-multilingual-cased' : - print('Loading Bert Tokenizer...') - tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True) - elif chosen_model == 'camembert-base': - print('Loading Camembert Tokenizer...') - tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True) - - # Tokenize all of the sentences and map the tokens to thier word IDs. - input_ids_test = [] - # For every sentence... - for sent in sentences_to_predict: - # `encode` will: - # (1) Tokenize the sentence. - # (2) Prepend the `[CLS]` token to the start. - # (3) Append the `[SEP]` token to the end. - # (4) Map tokens to their IDs. - encoded_sent = tokenizer.encode( - sent, # Sentence to encode. - add_special_tokens = True, # Add '[CLS]' and '[SEP]' - ) - - input_ids_test.append(encoded_sent) - - # Pad our input tokens - padded_test = [] - for i in input_ids_test: - - if len(i) > max_len: - padded_test.extend([i[:max_len]]) - else: - padded_test.extend([i + [0] * (max_len - len(i))]) - input_ids_test = np.array(padded_test) - - # Create attention masks - attention_masks = [] - - # Create a mask of 1s for each token followed by 0s for padding - for seq in input_ids_test: - seq_mask = [float(i>0) for i in seq] - attention_masks.append(seq_mask) - - # Convert to tensors. - prediction_inputs = torch.tensor(input_ids_test) - prediction_masks = torch.tensor(attention_masks) - prediction_labels = torch.tensor(labels) - - # Set the batch size. 
- batch_size = 32 - - # Create the DataLoader. - prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) - prediction_sampler = SequentialSampler(prediction_data) - prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) - - return prediction_dataloader - - - -def predict_class_bertFineTuning(model, sentences_to_predict_dataloader): - - - # If there's a GPU available... - if torch.cuda.is_available(): - - # Tell PyTorch to use the GPU. - device = torch.device("cuda") - - print('There are %d GPU(s) available.' % torch.cuda.device_count()) - - print('We will use the GPU:', torch.cuda.get_device_name(0)) - - # If not... - else: - print('No GPU available, using the CPU instead.') - device = torch.device("cpu") - - # Put model in evaluation mode - model.eval() - - # Tracking variables - predictions_test , true_labels = [], [] - - # Predict - for batch in prediction_dataloader: - # Add batch to GPU - batch = tuple(t.to(device) for t in batch) - - # Unpack the inputs from the dataloader - b_input_ids, b_input_mask, b_labels = batch - - # Telling the model not to compute or store gradients, saving memory and - # speeding up prediction - with torch.no_grad(): - # Forward pass, calculate logit predictions - outputs = model(b_input_ids, token_type_ids=None, - attention_mask=b_input_mask) - - logits = outputs[0] - #print(logits) - - # Move logits and labels to CPU - logits = logits.detach().cpu().numpy() - label_ids = b_labels.to('cpu').numpy() - #print(logits) - - # Store predictions and true labels - predictions_test.append(logits) - true_labels.append(label_ids) - - print(' DONE.') - - pred_labels = [] - - - for i in range(len(true_labels)): - - # The predictions for this batch are a 2-column ndarray (one column for "0" - # and one column for "1"). Pick the label with the highest value and turn this - # in to a list of 0s and 1s. - pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten() - pred_labels.append(pred_labels_i) - - pred_labels_ = [item for sublist in pred_labels for item in sublist] - true_labels_ = [item for sublist in true_labels for item in sublist] - return predictions_test_, true_labels_ - - -def predict_instance_bertFineTuning(chosen_model, model, sentences_to_predict): - - if chosen_model == 'bert-base-multilingual-cased' : - print('Loading Bert Tokenizer...') - tokenizer = BertTokenizer.from_pretrained(chosen_model, do_lower_case=True) - elif chosen_model == 'camembert-base': - print('Loading Camembert Tokenizer...') - tokenizer = CamembertTokenizer.from_pretrained(chosen_model , do_lower_case=True) - - # Tokenize all of the sentences and map the tokens to thier word IDs. - input_ids_test = [] - # For every sentence... - for sent in sentences_to_predict: - # `encode` will: - # (1) Tokenize the sentence. - # (2) Prepend the `[CLS]` token to the start. - # (3) Append the `[SEP]` token to the end. - # (4) Map tokens to their IDs. - encoded_sent = tokenizer.encode( - sent, # Sentence to encode. 
- add_special_tokens = True, # Add '[CLS]' and '[SEP]' - ) - - input_ids_test.append(encoded_sent) - with torch.no_grad(): - # Forward pass, calculate logit predictions - outputs = model(b_input_ids, token_type_ids=None, - attention_mask=b_input_mask) - - logits = outputs[0] diff --git a/settings.conf b/settings.conf deleted file mode 100644 index 023215e..0000000 --- a/settings.conf +++ /dev/null @@ -1,10 +0,0 @@ -[vectorizers] -vectorization_max_df= 1.0 -vectorization_min_df= 4 -vectorization_numberOfFeatures= None -doc2vec_vec_size = 700 -max_epochs = 10 -doc2vec_min_count = 12 -doc2vec_dm = 0 -doc2vec_workers = 8 -min_word_per_article = 25 diff --git a/tmp_preprocess_data.py b/tmp_preprocess_data.py deleted file mode 100644 index 30b0d75..0000000 --- a/tmp_preprocess_data.py +++ /dev/null @@ -1,105 +0,0 @@ -#import sys -#import os -#import time -#import argparse -import pandas as pd -#import numpy as np -#from data_preprocessing import Preprocessor -#from features_extractor import feature_extractor -#from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class -#from classifiers import classifiers, grid_params -#from sklearn.model_selection import train_test_split -#from sklearn import preprocessing -#from evaluate_model import evaluate_model -#from sklearn.model_selection import GridSearchCV -#import configparser -#from re import search -#import math -#import re -#import nltk -#from ClassPreprocessor import create_dict - - -#print("Begin preprocess") - -# Reading data and preprocessings steps - - - -print("load dataset") - -df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") -#df = df_original.copy() - -print("len(df)",len(df)) - - -print("remove blank rows") -df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True) -print("len(df)",len(df)) - -print("remove small articles < 15 words") -#preprocessor = Preprocessor() -#preprocessor.removeArticlesByTokensNumbers(df, 'content', 25) -df = df.loc[(df['nb_word']>=15)] -print("len(df)",len(df)) - - -df.reset_index(drop=True, inplace=True) - - -print("filter unclassified rows") -# filtrer les articles non classés par ARTFL mais classé par ENCCRE (jeu de test) -df_unclassified = df.loc[(df['normClass']=="unclassified")] -df_classified = df.loc[(df['normClass']!="unclassified")] - - - -print("save dataframe") -df_classified.to_csv('./data/train_dataframe.tsv', sep="\t") -df_unclassified.to_csv('./data/test_dataframe.tsv', sep="\t") - -print("some stats") - -print("len(df_unclassified)",len(df_unclassified)) -print("len(df_classified)",len(df_classified)) - -''' - -#preprocessor.remove_null_rows(df_original, 'content') -print("copy") -df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy() -df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy() -df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy() - -print("split ensemble domaine enccre") -df_1 = split_class(df_1, 'ensemble_domaine_enccre') -print("save dataframe") -df_1.to_csv('./data/train_dataframe_with_ensemble_domaine_enccre.csv') - -print("split domaine enccre") -df_2 = split_class(df_2, 'domaine_enccre') -print("save dataframe") -df_2.to_csv('./data/train_dataframe_with_domaine_enccre.csv') - -print("split normclass") -df_3 = split_class(df_3, 'normClass') -print("save dataframe") -df_3.to_csv('./data/train_dataframe_with_normClass_artfl.csv') - - - -d_1 = 
create_dict(df_1, 'ensemble_domaine_enccre') -tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count']) -tosave.to_excel("ensemble_domaine_enccre.xlsx") - -d_2 = create_dict(df_2, 'domaine_enccre') -tosave = pd.DataFrame.from_dict(d_2, orient='index', columns=[ 'Count']) -tosave.to_excel("domaine_enccre.xlsx") - -d_3 = create_dict(df_3, 'normClass_artfl') -tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count']) -tosave.to_excel("normClass_artfl.xlsx") - -print(df_original.shape) -''' \ No newline at end of file diff --git a/training_bertFineTuning.py b/training_bertFineTuning.py deleted file mode 100644 index 72cd70f..0000000 --- a/training_bertFineTuning.py +++ /dev/null @@ -1,515 +0,0 @@ -import torch -import pandas as pd -import numpy as np -from sklearn import preprocessing -from sklearn.model_selection import train_test_split -from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler -from transformers import BertTokenizer, CamembertTokenizer -from transformers import BertForSequenceClassification, AdamW, BertConfig, CamembertForSequenceClassification -from transformers import get_linear_schedule_with_warmup -import time -import datetime -import random -import os -import argparse -import configparser -import csv - - - -def create_dict(df, classColumnName): - return dict(df[classColumnName].value_counts()) - - -def remove_weak_classes(df, classColumnName, threshold): - - dictOfClassInstances = create_dict(df,classColumnName) - - - dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold } - keys = [*dictionary] - df_tmp = df[~ df[classColumnName].isin(keys)] - df = pd.concat([df,df_tmp]).drop_duplicates(keep=False) - return df - - -def resample_classes(df, classColumnName, numberOfInstances): - - #random numberOfInstances elements - replace = False # with replacement - - fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:] - return df.groupby(classColumnName, as_index=False).apply(fn) - - - -def flat_accuracy(preds, labels): - pred_flat = np.argmax(preds, axis=1).flatten() - labels_flat = labels.flatten() - return np.sum(pred_flat == labels_flat) / len(labels_flat) - - -def format_time(elapsed): - ''' - Takes a time in seconds and returns a string hh:mm:ss - ''' - # Round to the nearest second. - elapsed_rounded = int(round((elapsed))) - - # Format as hh:mm:ss - return str(datetime.timedelta(seconds=elapsed_rounded)) - - -def training_bertFineTuning(chosen_model, model_path, sentences, labels, max_len, batch_size, epochs = 4): - - # If there's a GPU available... - if torch.cuda.is_available(): - - # Tell PyTorch to use the GPU. - device = torch.device("cuda") - - print('There are %d GPU(s) available.' % torch.cuda.device_count()) - - print('We will use the GPU:', torch.cuda.get_device_name(0)) - - # If not... 
- else: - print('No GPU available, using the CPU instead.') - device = torch.device("cpu") - -############################################################################################################ -########################## Model: Tokenization & Input Formatting ################################################################### -########################################################################################################### - - - if chosen_model == 'bert' : - print('Loading Bert Tokenizer...') - tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True) - elif chosen_model == 'camembert': - print('Loading Camembert Tokenizer...') - tokenizer = CamembertTokenizer.from_pretrained(model_path , do_lower_case=True) - - - - # Tokenize all of the sentences and map the tokens to thier word IDs. - input_ids = [] - - # For every sentence... - for sent in sentences: - # `encode` will: - # (1) Tokenize the sentence. - # (2) Prepend the `[CLS]` token to the start. - # (3) Append the `[SEP]` token to the end. - # (4) Map tokens to their IDs. - encoded_sent = tokenizer.encode( - str(sent), # Sentence to encode. - add_special_tokens = True, # Add '[CLS]' and '[SEP]' - - # This function also supports truncation and conversion - # to pytorch tensors, but I need to do padding, so I - # can't use these features. - #max_length = 128, # Truncate all sentences. - #return_tensors = 'pt', # Return pytorch tensors. - ) - - # Add the encoded sentence to the list. - input_ids.append(encoded_sent) - - - - - padded = [] - for i in input_ids: - - if len(i) > max_len: - padded.extend([i[:max_len]]) - else: - padded.extend([i + [0] * (max_len - len(i))]) - - - padded = np.array(padded) - - - - # Create attention masks - attention_masks = [] - - # For each sentence... - for sent in padded: - - # Create the attention mask. - # - If a token ID is 0, then it's padding, set the mask to 0. - # - If a token ID is > 0, then it's a real token, set the mask to 1. - att_mask = [int(token_id > 0) for token_id in sent] - - # Store the attention mask for this sentence. - attention_masks.append(att_mask) - - - # Use 90% for training and 10% for validation. - train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(padded, labels, random_state=2018, test_size=0.3, stratify = labels ) - # Do the same for the masks. - train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.3, stratify = labels) - - - # Convert all inputs and labels into torch tensors, the required datatype - # for my model. - train_inputs = torch.tensor(train_inputs) - validation_inputs = torch.tensor(validation_inputs) - - train_labels = torch.tensor(train_labels) - validation_labels = torch.tensor(validation_labels) - - train_masks = torch.tensor(train_masks) - validation_masks = torch.tensor(validation_masks) - - - - - # The DataLoader needs to know the batch size for training, so I specify it here. - # For fine-tuning BERT on a specific task, the authors recommend a batch size of - # 16 or 32. - - - # Create the DataLoader for training set. - train_data = TensorDataset(train_inputs, train_masks, train_labels) - train_sampler = RandomSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) - - # Create the DataLoader for validation set. 
- validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) - validation_sampler = SequentialSampler(validation_data) - validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) - - - - - - print(' Selecting a model .....') - - numberOfClasses = len(set(labels)) - - - # Load BertForSequenceClassification, the pretrained BERT model with a single - # linear classification layer on top. - if chosen_model == 'bert': - model = BertForSequenceClassification.from_pretrained( - model_path, # Use the 12-layer BERT model, with an uncased vocab. - num_labels = numberOfClasses, # The number of output labels--2 for binary classification. - # You can increase this for multi-class tasks. - output_attentions = False, # Whether the model returns attentions weights. - output_hidden_states = False, # Whether the model returns all hidden-states. - ) - elif chosen_model == 'camembert': - - model = CamembertForSequenceClassification.from_pretrained( - model_path, # Use the 12-layer BERT model, with an uncased vocab. - num_labels = numberOfClasses, # The number of output labels--2 for binary classification. - # You can increase this for multi-class tasks. - output_attentions = False, # Whether the model returns attentions weights. - output_hidden_states = False, # Whether the model returns all hidden-states. - ) - - - # Tell pytorch to run this model on the GPU. - model.cuda() - - - #Note: AdamW is a class from the huggingface library (as opposed to pytorch) - # I believe the 'W' stands for 'Weight Decay fix" - optimizer = AdamW(model.parameters(), - lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5 - eps = 1e-8 # args.adam_epsilon - default is 1e-8. - ) - - - - - # Total number of training steps is number of batches * number of epochs. - total_steps = len(train_dataloader) * epochs - - # Create the learning rate scheduler. - scheduler = get_linear_schedule_with_warmup(optimizer, - num_warmup_steps = 0, # Default value in run_glue.py - num_training_steps = total_steps) - - - - - # This training code is based on the `run_glue.py` script here: - # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128 - - - # Set the seed value all over the place to make this reproducible. - seed_val = 42 - - random.seed(seed_val) - np.random.seed(seed_val) - torch.manual_seed(seed_val) - torch.cuda.manual_seed_all(seed_val) - - # Store the average loss after each epoch so I can plot them. - loss_values = [] - - # For each epoch... - for epoch_i in range(0, epochs): - - # ======================================== - # Training - # ======================================== - - # Perform one full pass over the training set. - - print("") - print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs)) - print('Training...') - - # Measure how long the training epoch takes. - t0 = time.time() - - # Reset the total loss for this epoch. - total_loss = 0 - - # Put the model into training mode. - model.train() - - # For each batch of training data... - for step, batch in enumerate(train_dataloader): - - # Progress update every 40 batches. - if step % 40 == 0 and not step == 0: - # Calculate elapsed time in minutes. - elapsed = format_time(time.time() - t0) - - # Report progress. - print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed)) - - # Unpack this training batch from the dataloader. 
- # - # As I unpack the batch, I'll also copy each tensor to the GPU using the - # `to` method. - # - # `batch` contains three pytorch tensors: - # [0]: input ids - # [1]: attention masks - # [2]: labels - b_input_ids = batch[0].to(device) - b_input_mask = batch[1].to(device) - b_labels = batch[2].to(device) - - # Always clear any previously calculated gradients before performing a - # backward pass. PyTorch doesn't do this automatically because - # accumulating the gradients is "convenient while training RNNs". - # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch) - model.zero_grad() - - # Perform a forward pass (evaluate the model on this training batch). - # This will return the loss (rather than the model output) because I - # have provided the `labels`. - # The documentation for this `model` function is here: - # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification - outputs = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask, - labels=b_labels) - - # The call to `model` always returns a tuple, so I need to pull the - # loss value out of the tuple. - loss = outputs[0] - - # Accumulate the training loss over all of the batches so that I can - # calculate the average loss at the end. `loss` is a Tensor containing a - # single value; the `.item()` function just returns the Python value - # from the tensor. - total_loss += loss.item() - - # Perform a backward pass to calculate the gradients. - loss.backward() - - # Clip the norm of the gradients to 1.0. - # This is to help prevent the "exploding gradients" problem. - torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) - - # Update parameters and take a step using the computed gradient. - # The optimizer dictates the "update rule"--how the parameters are - # modified based on their gradients, the learning rate, etc. - optimizer.step() - - # Update the learning rate. - scheduler.step() - - # Calculate the average loss over the training data. - avg_train_loss = total_loss / len(train_dataloader) - - # Store the loss value for plotting the learning curve. - loss_values.append(avg_train_loss) - - print("") - print(" Average training loss: {0:.2f}".format(avg_train_loss)) - print(" Training epoch took: {:}".format(format_time(time.time() - t0))) - - # ======================================== - # Validation - # ======================================== - # After the completion of each training epoch, measure the performance on - # the validation set. - - print("") - print("Running Validation...") - - t0 = time.time() - - # Put the model in evaluation mode--the dropout layers behave differently - # during evaluation. - model.eval() - - # Tracking variables - eval_loss, eval_accuracy = 0, 0 - nb_eval_steps, nb_eval_examples = 0, 0 - - # Evaluate data for one epoch - for batch in validation_dataloader: - - # Add batch to GPU - batch = tuple(t.to(device) for t in batch) - - # Unpack the inputs from dataloader - b_input_ids, b_input_mask, b_labels = batch - - # Telling the model not to compute or store gradients, saving memory and - # speeding up validation - with torch.no_grad(): - - # Forward pass, calculate logit predictions. - # This will return the logits rather than the loss because we have - # not provided labels. - # token_type_ids is the same as the "segment ids", which - # differentiates sentence 1 and 2 in 2-sentence tasks. 
- # The documentation for this `model` function is here: - # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification - outputs = model(b_input_ids, - token_type_ids=None, - attention_mask=b_input_mask) - - # Get the "logits" output by the model. The "logits" are the output - # values prior to applying an activation function like the softmax. - logits = outputs[0] - - # Move logits and labels to CPU - logits = logits.detach().cpu().numpy() - label_ids = b_labels.to('cpu').numpy() - - # Calculate the accuracy for this batch of test sentences. - tmp_eval_accuracy = flat_accuracy(logits, label_ids) - - # Accumulate the total accuracy. - eval_accuracy += tmp_eval_accuracy - - # Track the number of batches - nb_eval_steps += 1 - - # Report the final accuracy for this validation run. - print(" Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps)) - print(" Validation took: {:}".format(format_time(time.time() - t0))) - - print("") - print("Training complete!") - return model - - -'''print('Saving Model....') -model_save_name = config.get('model','modelName') -path = config.get('model','path') -#torch.save(model.state_dict(), os.path.join(path,model_save_name)) -torch.save(model, os.path.join(path,model_save_name))''' - - - - - - - - - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - - parser.add_argument("input_dataset") - parser.add_argument("conf_file") - parser.add_argument("output_path") - - args = parser.parse_args() - - INPUT_DATASET = args.input_dataset - CONF_FILE = args.conf_file - OUTPUT_PATH = args.output_path - - config = configparser.ConfigParser() - config.read(CONF_FILE) - - #dataPath = config.get('general','dataPath') - columnText = config.get('general','columnText') - columnClass = config.get('general','columnClass') - - minOfInstancePerClass = int(config.get('general','minOfInstancePerClass')) - maxOfInstancePerClass = int(config.get('general','maxOfInstancePerClass')) - - model_path = config.get('model','path') - chosen_model = config.get('model','model') - - max_len = int(config.get('model','max_len_sequences')) - batch_size = int(config.get('model','batch_size')) - epochs = int(config.get('model','epochs')) - - - df = pd.read_csv(INPUT_DATASET, sep="\t") - df = remove_weak_classes(df, columnClass, minOfInstancePerClass) - df = resample_classes(df, columnClass, maxOfInstancePerClass) - #df = df[df[columnClass] != 'unclassified'] - - - y = df[columnClass] - numberOfClasses = y.nunique() - encoder = preprocessing.LabelEncoder() - y = encoder.fit_transform(y) - - - #train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y ) - - #sentences = train_x[columnText].values - sentences = df[columnText].values - #labels = train_y.tolist() - labels = y.tolist() - - - #call train method - - model = training_bertFineTuning(chosen_model,model_path, sentences, labels, max_len, batch_size, epochs) - - - #save the model - model_save_name = chosen_model+"_b"+batch_size+"_e"+epochs - - torch.save(model, os.path.join(OUTPUT_PATH,model_save_name)) - - #print the model parameters - params = list(model.named_parameters()) - - print('The BERT model has {:} different named parameters.\n'.format(len(params))) - - print('==== Embedding Layer ====\n') - - for p in params[0:5]: - print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) - - print('\n==== First Transformer ====\n') - - for p in params[5:21]: - print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size())))) - - 
print('\n==== Output Layer ====\n') - - for p in params[-4:]: - print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))