diff --git a/projet/ClassPreprocessor.py b/projet/ClassPreprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..c86114450145f40f59d035a18dd6b63040d03852 --- /dev/null +++ b/projet/ClassPreprocessor.py @@ -0,0 +1,50 @@ +import pandas as pd +import numpy as np +import statistics + +def create_dict(df, classColumnName): + return dict(df[classColumnName].value_counts()) + +def remove_weak_classes(df, classColumnName, threshold): + + dictOfClassInstances = create_dict(df,classColumnName) + + + dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold } + keys = [*dictionary] + df_tmp = df[~ df[classColumnName].isin(keys)] + #df = df[df[columnTarget] not in keys] + #df = df.merge(df_tmp, how = 'outer' ,indicator=True) + df = pd.concat([df,df_tmp]).drop_duplicates(keep=False) + return df + + + +def split_class(df, columnProcessed): + i = 0 + new_df = pd.DataFrame(columns= df.columns) + for index, row in df.iterrows(): + #cls = re.split(';', row[columnProcessed]) + cls = filter(None, row[columnProcessed].split(';')) + cls = list(cls) + #cls = re.findall(r"[\w']+", row [columnProcessed]) + r = row + for categ in cls: + r[columnProcessed] = categ + #new_df.append(r, ignore_index = True) + new_df.loc[i] = r + i = i + 1 + + return new_df + +def get_median_dict(dict): + return statistics.median(dict.values()) + +def resample_classes(df, classColumnName, numberOfInstances): + # numberOfInstances first elements + #return df.groupby(classColumnName).apply(lambda x: x[:numberOfInstances][df.columns]) + #random numberOfInstances elements + replace = False # with replacement + + fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:] + return df.groupby(classColumnName, as_index=False).apply(fn) diff --git a/projet/__init__.py b/projet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..579c7ee32703b0cea63ef2ac5b3552ca53131aed --- /dev/null +++ b/projet/__init__.py @@ -0,0 +1,2 @@ +from data_process.data_functions import read_tei, elem_to_text, basename_without_ext, tei_to_csv_entry +from data_process.TEIFile import TEIFile diff --git a/projet/classifiers.py b/projet/classifiers.py new file mode 100644 index 0000000000000000000000000000000000000000..16db401d1b54bfd1d667e2e9c7a32a1e5b298e5f --- /dev/null +++ b/projet/classifiers.py @@ -0,0 +1,39 @@ + +from sklearn.naive_bayes import MultinomialNB +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.linear_model import SGDClassifier +from sklearn.neighbors import KNeighborsClassifier + +import numpy as np + + +classifiers = [ + ('bayes', MultinomialNB()), + ('svm', SVC() ), + ('decisionTree',DecisionTreeClassifier()), + ('rfc', RandomForestClassifier()), + ('lr', LogisticRegression()), + ('sgd', SGDClassifier()), + ('knn', KNeighborsClassifier()) + ] + + +param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']} +param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth':range(5,10), 'min_samples_split': range(5,10), 'min_samples_leaf': range(1,5) } +param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] } +param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]} +param_grid_sgd = { "loss" : ["hinge", "log", 
"squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [500]} +param_grid_knn = {'n_neighbors' : list(range(1,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] } + +grid_params = [ + ('bayes', None), + ('svm', param_grid_svm), + ('decisionTree', param_grid_decisionTree), + ('rfc', param_grid_rfc ), + ('lr', param_grid_lr), + ('sgd', param_grid_sgd ), + ('knn', param_grid_knn), + ] diff --git a/projet/data_preparation.py b/projet/data_preparation.py new file mode 100644 index 0000000000000000000000000000000000000000..4de045dfe90e8257e589b8bc9200d3a6f58e2b83 --- /dev/null +++ b/projet/data_preparation.py @@ -0,0 +1,51 @@ +from os import path +from os.path import basename, splitext +import pandas as pd +import os +from data_process.TEIFile import TEIFile + + + + +def basename_without_ext(path): + base_name = basename(path) + stem, ext = splitext(base_name) + if stem.endswith('.tei'): + # Return base name without tei file + return stem[0:-4] + else: + return stem + + +def tei_to_csv_entry(tei_file, txt_file): + print(f"Going on {tei_file}") + tei = TEIFile(tei_file, txt_file) + print(f"Handled {tei_file}") + base_name = basename_without_ext(tei_file) + return base_name, tei._text, tei._Head, tei._author, tei._Objecttype, tei._Class, tei._normclass, tei._generatedclass, tei._englishclass, tei._attribution + + +input_path = r'./data/EDdA/' +output_name = "corpus_tei.csv" + +column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"] + +df = pd.DataFrame(columns = column_names) + +marge = 0 + + +for tome in os.listdir(input_path): + volume = tome[1:] + + for index, article in enumerate(os.listdir(input_path + tome +"/")): + filepath = os.path.join(input_path, tome, article) + base_name = basename_without_ext(filepath) + + df.loc[index+marge] = tei_to_csv_entry(filepath, ' ') + df.loc[index+marge]['articleName'] = volume+'_'+base_name + marge += index +1 + + + +df.to_csv(output_name, index=False) diff --git a/projet/data_preprocessing.py b/projet/data_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..5fe379ec5db0bb4b2c6e57375a3b9fe26b41d2a9 --- /dev/null +++ b/projet/data_preprocessing.py @@ -0,0 +1,134 @@ +import pandas as pd +import numpy as np +from re import search +import math +from unidecode import unidecode +from sklearn.feature_extraction.text import CountVectorizer +from nltk.stem.snowball import SnowballStemmer +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize +import re +import nltk + +class Preprocessor(): + + def init(self): + pass + + + def remove_null_rows(self, df, columnName): + #df = df[df[columnName].notna()] + df.dropna(subset = [columnName], inplace = True) + df.reset_index(drop=True, inplace=True) + + return + + def removeMarkers(self, df, textColumn, markerColumn = 'class'): + + #remove null values or add condition if exist + #self.remove_null_rows(df, markerColumn) + #self.remove_null_rows(df, textColumn) + + for index, row in df.iterrows(): + if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]): + + marker = row[markerColumn] + marker_with_brcts = '('+ marker +')' + row[textColumn] = row[textColumn].replace(marker_with_brcts , "") + row[textColumn] = row[textColumn].replace(marker , "") + full_text = row[textColumn] + i = unidecode(full_text).find(marker_with_brcts) + goOn = False + if i != -1: + goOn = True + while goOn: 
+ + full_text = "".join((full_text[:i],"",full_text[i+len(marker_with_brcts):])) + i = unidecode(full_text).find(marker_with_brcts) + if i == -1: + goOn = False + + + row[textColumn] = full_text + + return df + + + def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence): + + stop_words = set(stopwords.words('french')) + stemmer_fr = SnowballStemmer("french") + analyzer = CountVectorizer().build_analyzer() + + def token_fr(doc): + return (w for w in analyzer(doc) if not w in stop_words) + + stem_vectorizer_fr = CountVectorizer( stop_words= 'french', analyzer= token_fr, max_df= max_word_occurence , min_df= min_word_occurence, max_features=None) + + docs = [] + + for index, row in df.iterrows(): + docs.append(row[textColumn]) + + stem_vectorizer_fr.fit(docs) + featured_docs = stem_vectorizer_fr.transform(docs) + tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs) + + for index, tokens in enumerate(tokens_per_docs): + # join token to recreate text with new tokens + new_text = ' '.join(tokens) + df.loc[index][textColumn] = new_text + + return + + def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article): + + stop_words = set(stopwords.words('french')) + stemmer_fr = SnowballStemmer("french") + analyzer = CountVectorizer().build_analyzer() + + def token_fr(doc): + return (w for w in analyzer(doc) if not w in stop_words) + + stem_vectorizer_fr = CountVectorizer( stop_words= 'french', analyzer= token_fr) + + docs = [] + + for index, row in df.iterrows(): + docs.append(row[textColumn]) + + stem_vectorizer_fr.fit(docs) + featured_docs = stem_vectorizer_fr.transform(docs) + tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs) + + concerned_article_index = [] + for index, tokens in enumerate(tokens_per_docs): + + if len(tokens) <= min_word_per_article: + concerned_article_index.append(index) + + df = df.drop(index = concerned_article_index, inplace = True) + + return + + + def getFirstParagraph(self, df, textColumn, columnName): + new_column = [] + for index, row in df.iterrows(): + paragraphs = row[textColumn].split('\n \n') + new_column.append(paragraphs[0]) + df[columnName] = new_column + return + + def getFirstSentence(self, df, textColumn, columnName): + sent = [] + for index, row in df.iterrows(): + sentences = nltk.sent_tokenize(row[textColumn]) + sent.append(sentences[0]) + df[columnName] = sent + return + + def saveDataFrametoCSV(self, df, pathName): + df.to_csv(pathName) + + diff --git a/projet/data_process/TEIFile.py b/projet/data_process/TEIFile.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab1ed5bdbbc31062931fa1b340123bd7b3d9f48 --- /dev/null +++ b/projet/data_process/TEIFile.py @@ -0,0 +1,51 @@ +from data_process.data_functions import read_tei + +class TEIFile(object): + def __init__(self, filename, textfilename): + self.filename = filename + self.soup = read_tei(filename) + self._text = None + self._Head = '' + self._Objecttype = '' + self._attribution = '' + self._Class = '' + self._normclass = '' + self._englishclass = '' + self._generatedclass = '' + self._author = '' + + + if self.soup.find('index', type='head'): + self._Head = self.soup.find('index', type='head')['value'] + + if self.soup.find('index', type='objecttype'): + self._Objecttype = self.soup.find('index', type='objecttype')['value'] + + + if self.soup.find('index', type='attribution'): + self._attribution = self.soup.find('index', type='attribution')['value'] + + if self.soup.find('index', type='class') and 
self.soup.find('index', type='class').has_attr('value') :
+
+            self._Class = self.soup.find('index', type='class')['value']
+
+        if self.soup.find('index', type='normclass'):
+            self._normclass = self.soup.find('index', type='normclass')['value']
+
+        if self.soup.find('index', type='englishclass'):
+            self._englishclass = self.soup.find('index', type='englishclass')['value']
+
+        if self.soup.find('index', type='generatedclass'):
+            self._generatedclass = self.soup.find('index', type='generatedclass')['value']
+
+        if self.soup.find('index', type = 'author'):
+            self._author = self.soup.find('index', type='author')['value']
+
+
+
+        ps = self.soup.find_all('p')
+        Texts = []
+        for p in ps[1:]:
+            Texts.append(p.getText())
+
+        self._text = ' '.join(Texts)
diff --git a/projet/data_process/__pycache__/TEIFile.cpython-37.pyc b/projet/data_process/__pycache__/TEIFile.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..222b98df52d058c6f11ee8daa0a08d0b4b5a3555
Binary files /dev/null and b/projet/data_process/__pycache__/TEIFile.cpython-37.pyc differ
diff --git a/projet/data_process/__pycache__/data_functions.cpython-37.pyc b/projet/data_process/__pycache__/data_functions.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0afbca5a5d53602d2da0c2d2b167b86d53f995e9
Binary files /dev/null and b/projet/data_process/__pycache__/data_functions.cpython-37.pyc differ
diff --git a/projet/data_process/data_functions.py b/projet/data_process/data_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..c351ba35977e6ca2a9f5340bdda7e645619047fd
--- /dev/null
+++ b/projet/data_process/data_functions.py
@@ -0,0 +1,14 @@
+from bs4 import BeautifulSoup
+
+def read_tei(tei_file):
+    with open(tei_file, 'r') as tei:
+        soup = BeautifulSoup(tei, 'lxml')
+        return soup
+    raise RuntimeError('Cannot generate a soup from the input')
+
+
+def elem_to_text(elem, default=''):
+    if elem:
+        return elem.getText(separator=' ', strip=True)
+    else:
+        return default
diff --git a/projet/evaluate_model.py b/projet/evaluate_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..8abd2e9fc70578d30e11f7c54162a45521d1fe6e
--- /dev/null
+++ b/projet/evaluate_model.py
@@ -0,0 +1,70 @@
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.metrics import plot_confusion_matrix
+from sklearn.metrics import confusion_matrix
+from sklearn.metrics import classification_report
+import pandas as pd
+import seaborn as sns
+
+
+def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
+
+    #classifier, label_list, test_x, valid_y, title = "Confusion matrix"):
+    precision = []
+    recall = []
+    f1 = []
+    support = []
+    weighted_avg = None
+    accuracy = None
+
+    df = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
+    report = classification_report( valid_y, y_pred, output_dict = True)
+    for c in classes:
+        precision.append(report[c]['precision'])
+        recall.append(report[c]['recall'])
+        f1.append(report[c]['f1-score'])
+        support.append(report[c]['support'])
+
+    accuracy = report['accuracy']
+    weighted_avg = report['weighted avg']
+    cnf_matrix = confusion_matrix(valid_y, y_pred)
+    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
+    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
+    TP = np.diag(cnf_matrix)
+    TN = cnf_matrix.sum() - (FP + FN + TP)
+
+    df['className'] = classesName
+    df['precision'] = precision
+    df['recall'] = recall
+    df['f1-score'] = f1
+
df['support'] = support + df['FP'] = FP + df['FN'] = FN + df['TP'] = TP + df['TN'] = TN + #disp = plot_confusion_matrix(classifier, test_x, valid_y, + # display_labels= label_list, + # cmap=plt.cm.Blues, + # normalize=None) + #disp.ax_.set_title(title) + + #print(title) + #print(disp.confusion_matrix) + + #plt.show() + plt.rcParams["font.size"] = 3 + plot_confusion_matrix(clf, X_test, y_test) + plt.savefig(pathSave) + return df, accuracy, weighted_avg + +import seaborn as sns +import matplotlib.pyplot as plt +from sklearn.metrics import confusion_matrix + + +#y_true = [2, 0, 2, 2, 0, 1] +#y_pred = [0, 0, 2, 2, 0, 2] +#cf_matrix = confusion_matrix(y_true, y_pred) +#sns.heatmap(cf_matrix, annot=True) +#import matplotlib.pyplot as plt +#plt.show() diff --git a/projet/experimentsClassicClassifiers.py b/projet/experimentsClassicClassifiers.py new file mode 100644 index 0000000000000000000000000000000000000000..be4fd36b4a88f38aa353b51476b8f5003e45feef --- /dev/null +++ b/projet/experimentsClassicClassifiers.py @@ -0,0 +1,213 @@ +import sys +import os +import time +import argparse +import pandas as pd +from data_preprocessing import Preprocessor +from features_extractor import feature_extractor +from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class +from classifiers import classifiers, grid_params +from sklearn.model_selection import train_test_split +from sklearn import preprocessing +from evaluate_model import evaluate_model +from sklearn.model_selection import GridSearchCV +import configparser + + +parser = argparse.ArgumentParser() +parser.add_argument("dataPath", help="Path of the dataframe") +parser.add_argument("columnText", help="the column name of the text that should preproceed", default = 'content') +parser.add_argument("columnClass", help="ColumnClass the column name of the classes") +parser.add_argument("minOfInstancePerClass", help="minOfInstancePerClass the minimum of instance required for each class", type=int) +parser.add_argument("maxOfInstancePerClass", help="maxOfInstancePerClass the maximum of instance required resamling classes", type=int) + + + + +args = parser.parse_args() +dataPath = args.dataPath +columnText = args.columnText +columnClass = args.columnClass +minOfInstancePerClass = args.minOfInstancePerClass +maxOfInstancePerClass = args.maxOfInstancePerClass + + +# create directory in the reports directory so save the classification results +dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass) +if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)): + os.makedirs(os.path.join('reports', columnClass, dir_name_report)) + + +# Reading data and preprocessings steps + +preprocessor = Preprocessor() + + +df_original = pd.read_csv(dataPath) + + +df = df_original[[columnClass,columnText]].copy() +preprocessor.remove_null_rows(df, columnText) +preprocessor.remove_null_rows(df, columnClass) +#df = split_class(df, columnClass) +df = remove_weak_classes(df, columnClass, minOfInstancePerClass ) +df = resample_classes(df, columnClass, maxOfInstancePerClass) + +preprocessor.getFirstParagraph(df, columnText, 'paragraphe' ) # select first sentence of each text + +#Read configuration file for retreiving parameters of features extractors + +config = configparser.ConfigParser() +config.read('settings.conf') + + + +vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else 
float(config.get('vectorizers','vectorization_max_df')) +vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df')) +vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None +doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size')) +doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs')) +doc2vec_lr = float(config.get('vectorizers','doc2vec_lr')) + + +extractor = feature_extractor(df,columnText, columnClass) +extractor_paragraphe = feature_extractor(df,'paragraphe', columnClass) + + +features_techniques = [ +('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )), +('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)), +('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))] + +features_techniques_paragraphe = [ +('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )), +('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)), +('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))] + + + + + +#prepare data +df = df[df[columnClass] != 'unclassified'] +y = df[columnClass] + + +#case of full text + +for feature_technique_name, features in features_techniques: + train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y ) + encoder = preprocessing.LabelEncoder() + train_y = encoder.fit_transform(train_y) + valid_y = encoder.fit_transform(test_y) + + for tmp_clf, tmp_grid_params in zip(classifiers, grid_params): + clf_name, clf = tmp_clf + grid_param_name, grid_param = tmp_grid_params + print(clf_name, clf, grid_param_name, grid_param) + if clf_name == 'bayes' : + if feature_technique_name == 'doc2vec': + continue + else: + t_begin = time.time() + clf.fit(train_x, train_y) + t_end =time.time() + training_time = t_end - t_begin + + y_pred = clf.predict(test_x) + + else : + clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3) + t_begin = time.time() + clf.fit(train_x, train_y) + t_end =time.time() + training_time = t_end - t_begin + + y_pred = clf.predict(test_x) + + + + +#evaluate model + + file_name_report = feature_technique_name + '_' + clf_name + + report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf') + with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f: + + sys.stdout = f # Change the standard output to the file we created. 
+ print(report) + print('accuracy : {}'.format(accuracy)) + print('weighted_Precision : {}'.format(weighted_avg['precision'])) + print('weighted_Recall : {}'.format(weighted_avg['recall'])) + print('weighted_F-score : {}'.format(weighted_avg['f1-score'])) + print('weighted_Support : {}'.format(weighted_avg['support'])) + print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))) + print('training time : {}'.format(training_time)) + #sys.stdout = sys.stdout # Reset the standard output to its original value + sys.stdout = sys.__stdout__ + + + + + + + + + + + + + +for feature_technique_name, features in features_techniques_paragraphe: + train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y ) + encoder = preprocessing.LabelEncoder() + train_y = encoder.fit_transform(train_y) + valid_y = encoder.fit_transform(test_y) + + for tmp_clf, clf_grid_params in zip(classifiers, grid_params): + clf_name, clf = tmp_clf + grid_param_name, grid_param = tmp_grid_params + + if clf_name == 'bayes' : + if feature_technique_name == 'doc2vec': + continue + else: + t_begin = time.time() + clf.fit(train_x, train_y) + t_end =time.time() + training_time = t_end - t_begin + + y_pred = clf.predict(test_x) + + else : + clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3) + t_begin = time.time() + clf.fit(train_x, train_y) + t_end =time.time() + training_time = t_end - t_begin + + y_pred = clf.predict(test_x) + + + + +#evaluate model + + file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name + + report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe)+'.pdf') + with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe+'.txt'), 'w') as f: + sys.stdout = f # Change the standard output to the file we created. 
+ print(report) + print('accuracy : {}'.format(accuracy)) + print('weighted_Precision : {}'.format(weighted_avg['precision'])) + print('weighted_Recall : {}'.format(weighted_avg['recall'])) + print('weighted_F-score : {}'.format(weighted_avg['f1-score'])) + print('weighted_Support : {}'.format(weighted_avg['support'])) + print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))) + print('training time : {}'.format(training_time)) + sys.stdout = sys.stdout # Reset the standard output to its original value + + sys.stdout = sys.__stdout__ + diff --git a/projet/features_extractor.py b/projet/features_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..a0c99fe4cd018aff722527e727cb51a516b0f315 --- /dev/null +++ b/projet/features_extractor.py @@ -0,0 +1,101 @@ +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfVectorizer +from nltk.stem.snowball import SnowballStemmer +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize +import string +import pandas as pd +import numpy as np +from gensim.models.doc2vec import Doc2Vec, TaggedDocument +from nltk.tokenize import word_tokenize + + +class feature_extractor: + + def __init__(self, data, column, target): + + self.column = column + self.data = data + self.X = data[column] + self.y = data[target] + + self.docs = [] + for index, row in data.iterrows(): + self.docs.append(row[column]) + + + def count_vect(self, max_df= 1.0 , min_df= 1, numberOfFeatures= None ): + stop_words = set(stopwords.words('french')) + + stemmer_fr = SnowballStemmer("french") + + analyzer = CountVectorizer().build_analyzer() + + def stemmed_words_fr(doc): + return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words) + + stem_vectorizer_fr = CountVectorizer( stop_words = 'french', analyzer = stemmed_words_fr, max_df= max_df, min_df = min_df, max_features = numberOfFeatures) + + stem_vectorizer_fr.fit(self.docs) + + return stem_vectorizer_fr.transform(self.docs) + + + def tf_idf(self, max_df= 1.0 , min_df= 1, numberOfFeatures = None): + + stop_words = set(stopwords.words('french')) + + stemmer_fr = SnowballStemmer("french") + + analyzer = TfidfVectorizer().build_analyzer() + + def stemmed_words_fr(doc): + return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words) + + tfidf_vectorizer = TfidfVectorizer(stop_words= 'french', analyzer=stemmed_words_fr, max_df= max_df, min_df = min_df, max_features= numberOfFeatures) + tfidf_vectorizer.fit(self.docs) + return tfidf_vectorizer.transform(self.docs) + + + + + def doc2vec(self, max_epochs, vec_size, alpha = 0.025 , dm = 1): + tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)] + model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm =1) + + model.build_vocab(tagged_data) + + for epoch in range(max_epochs): + print('iteration {0}'.format(epoch)) + model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter) + # decrease the learning rate + model.alpha -= 0.0002 + # fix the learning rate, no decay + model.min_alpha = model.alpha + + + set_tags = list(model.docvecs.doctags) + nb_docs_small = len(set_tags) + doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size)) + + i = 0 + for t in set_tags: + doc_vec_doc2vec[i] = model.docvecs[t] + i += 1 + + return doc_vec_doc2vec + + + def text_based_features(self): + + # Classical measures + + df = pd.DataFrame(columns=['char_count', 
'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count'])
+        df['char_count'] = self.data[self.column].apply(len)
+        df['word_count'] = self.data[self.column].apply(lambda x: len(x.split()))
+        df['word_density'] = df['char_count'] / (df['word_count']+1)
+        df['punctuation_count'] = self.data[self.column].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
+        df['title_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
+        df['upper_case_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
+
+        return df
diff --git a/projet/requirements.txt b/projet/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a02be35accb36eca051132ea9af0968bec54aae
--- /dev/null
+++ b/projet/requirements.txt
@@ -0,0 +1,13 @@
+beautifulsoup4
+lxml
+Unidecode==1.2.0
+Keras==2.4.3
+Keras-Preprocessing==1.1.2
+sentence-transformers==0.4.1.2
+transformers==4.3.2
+torch==1.8.1
+torchvision==0.8.2
+tokenizers==0.10.1
+regex==2018.1.10
+tensorflow==2.2.0
+gensim==3.8.1
diff --git a/projet/script.txt b/projet/script.txt
new file mode 100644
index 0000000000000000000000000000000000000000..45f1bd6006b2e9567ec13bd92a5186c069d61b8a
--- /dev/null
+++ b/projet/script.txt
@@ -0,0 +1,15 @@
+mkdir -p reports/domaine_enccre
+mkdir -p reports/ensemble_domaine_enccre
+mkdir -p reports/normClass_artfl
+pip install -r requirements.txt
+python tmp_preprocess_data.py
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 800
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 100 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 2000
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 500
diff --git a/projet/settings.conf b/projet/settings.conf
new file mode 100644
index 0000000000000000000000000000000000000000..f1ef2be9fa2c8509e308b79fb8e8d137d295d93f
--- /dev/null
+++ b/projet/settings.conf
@@ -0,0 +1,8 @@
+[vectorizers]
+vectorization_max_df= 1.0
+vectorization_min_df= 1
+vectorization_numberOfFeatures= None
+doc2vec_vec_size = 300
+doc2vec_epochs = 10
+doc2vec_lr = 0.025
+min_word_per_article = 4
diff --git a/projet/tmp_preprocess_data.py b/projet/tmp_preprocess_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..d97c7362289bfd4396b0d24dae6874c4d77d5cfb
--- /dev/null
+++ b/projet/tmp_preprocess_data.py
@@ -0,0 +1,109 @@
+import sys
+import os
+import time
+import argparse
+import pandas as pd
+import numpy as np
+from data_preprocessing import Preprocessor
+from features_extractor import feature_extractor
+from ClassPreprocessor
import remove_weak_classes, resample_classes, create_dict, split_class +from classifiers import classifiers, grid_params +from sklearn.model_selection import train_test_split +from sklearn import preprocessing +from evaluate_model import evaluate_model +from sklearn.model_selection import GridSearchCV +import configparser +from re import search +import math +from unidecode import unidecode +import re +import nltk +from ClassPreprocessor import create_dict + + +def removeMarkers(df, textColumn, listOfMarkers): + + #remove null values or add condition if exist + #self.remove_null_rows(df, markerColumn) + #self.remove_null_rows(df, textColumn) + tmp = 0 + for index, row in df.iterrows(): + tmp += 1 + print(tmp) + if not pd.isna(row[textColumn]): + for m in listOfMarkers: + + marker = str(m) + marker_with_brcts = '('+ marker +')' + row[textColumn] = row[textColumn].replace(marker_with_brcts , "") + row[textColumn] = row[textColumn].replace(marker , "") + full_text = row[textColumn] + i = unidecode(full_text).find(marker_with_brcts) + goOn = False + if i != -1: + goOn = True + while goOn: + + full_text = "".join((full_text[:i],"",full_text[i+len(marker_with_brcts):])) + i = unidecode(full_text).find(marker_with_brcts) + if i == -1: + goOn = False + + + row[textColumn] = full_text + + return df + + + +# Reading data and preprocessings steps + +preprocessor = Preprocessor() +df = pd.read_csv('corpus_tei.csv') +listOfM = df['class'].unique() + +df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t") +preprocessor.remove_null_rows(df_original, 'content') + +df_original = removeMarkers(df_original, 'content', listOfM) + + + +df_1 = df_original[['ensemble_domaine_enccre','content']].copy() +df_2 = df_original[['domaine_enccre','content']].copy() +df_3 = df_original[['normClass_artfl','content']].copy() + +############ shall we remove articles with less n tokens ####### remove markers +preprocessor.remove_null_rows(df_1, 'content') +preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre') +preprocessor.remove_null_rows(df_2, 'content') +preprocessor.remove_null_rows(df_2, 'domaine_enccre') +preprocessor.remove_null_rows(df_3, 'content') +preprocessor.remove_null_rows(df_3, 'normClass_artfl') + +df_1 = split_class(df_1, 'ensemble_domaine_enccre') +df_2 = split_class(df_2, 'domaine_enccre') +df_3 = split_class(df_3, 'normClass_artfl') + + + + +d_1 = create_dict(df_1, 'ensemble_domaine_enccre') +tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count']) +tosave.to_excel("ensemble_domaine_enccre.xlsx") + +d_2 = create_dict(df_2, 'domaine_enccre') +tosave = pd.DataFrame.from_dict(d_2, orient='index', columns=[ 'Count']) +tosave.to_excel("domaine_enccre.xlsx") + +d_3 = create_dict(df_3, 'normClass_artfl') +tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count']) +tosave.to_excel("normClass_artfl.xlsx") + +df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv') +df_2.to_csv('dataframe_with_domaine_enccre.csv') +df_3.to_csv('dataframe_with_normClass_artfl.csv') + + + +print(df_original.shape)
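
Usage sketch (illustrative only): the snippet below shows how the modules added in this diff are meant to fit together, following the call order in experimentsClassicClassifiers.py and the arguments listed in script.txt (data/EDdA_dataframe_withContent.tsv, content, ensemble_domaine_enccre, thresholds 300/1500). It is a minimal sketch, assuming the projet/ modules are importable from the working directory and the NLTK French stopword list is available; using TF-IDF features with LogisticRegression and no grid search is a deliberate simplification, not the full experiment.

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

from data_preprocessing import Preprocessor
from ClassPreprocessor import remove_weak_classes, resample_classes
from features_extractor import feature_extractor
from classifiers import classifiers

# Load the corpus and drop rows without text or label
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep='\t')
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df, 'content')
preprocessor.remove_null_rows(df, 'ensemble_domaine_enccre')

# Drop classes with fewer than 300 articles, cap the rest at 1500 (values from script.txt)
df = remove_weak_classes(df, 'ensemble_domaine_enccre', 300)
df = resample_classes(df, 'ensemble_domaine_enccre', 1500)

# TF-IDF features over the full article text
extractor = feature_extractor(df, 'content', 'ensemble_domaine_enccre')
features = extractor.tf_idf(max_df=1.0, min_df=1, numberOfFeatures=None)

# Encode labels and hold out a stratified test set, as in experimentsClassicClassifiers.py
encoder = preprocessing.LabelEncoder()
y = encoder.fit_transform(df['ensemble_domaine_enccre'])
train_x, test_x, train_y, test_y = train_test_split(
    features, y, test_size=0.33, random_state=42, stratify=y)

# Fit one classifier from the shared list ('lr' = LogisticRegression) and report accuracy
name, clf = classifiers[4]
clf.fit(train_x, train_y)
print(name, 'test accuracy:', clf.score(test_x, test_y))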