Commit 32f28eea authored by Ludovic Moncla

update

parent 2492f680
.gitignore
@@ -2,3 +2,5 @@
 .DS_Store
 data/EDdA_dataframe_withContent.tsv
 .DS_Store
+*.pyc
+.DS_Store
experimentsClassicClassifiers.py
@@ -31,6 +31,11 @@ columnClass = args.columnClass
 minOfInstancePerClass = args.minOfInstancePerClass
 maxOfInstancePerClass = args.maxOfInstancePerClass
 
+if not os.path.exists('reports'):
+    os.makedirs('reports')
+
+if not os.path.exists(os.path.join('reports', columnClass)):
+    os.makedirs(os.path.join('reports', columnClass))
+
 # create a directory in the reports directory to save the classification results
 dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
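Note on the added directory setup: since Python 3.2 the same effect is available in one call via the exist_ok flag, which also avoids the race between the existence check and the mkdir. A minimal equivalent sketch, reusing the script's columnClass variable:

    import os

    # Creates 'reports/<columnClass>' and any missing parents in one call;
    # does not raise if the directories already exist.
    os.makedirs(os.path.join('reports', columnClass), exist_ok=True)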
@@ -47,13 +52,13 @@ df_original = pd.read_csv(dataPath)
 df = df_original[[columnClass, columnText]].copy()
 
-preprocessor.remove_null_rows(df, columnText)
-preprocessor.remove_null_rows(df, columnClass)
+#preprocessor.remove_null_rows(df, columnText)
+#preprocessor.remove_null_rows(df, columnClass)
 #df = split_class(df, columnClass)
-df = remove_weak_classes(df, columnClass, minOfInstancePerClass )
+df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
-preprocessor.getFirstParagraph(df, columnText, 'paragraphe') # select the first paragraph of each text
+#preprocessor.getFirstParagraph(df, columnText, 'paragraphe') # select the first paragraph of each text
 
 # Read the configuration file to retrieve the feature extractors' parameters
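The helpers remove_weak_classes and resample_classes are imported from elsewhere in the repository, so their bodies are not part of this diff. As a reading aid only, here is a hypothetical pandas sketch of what calls with these signatures usually do (drop classes below a minimum count, cap classes at a maximum count); this is not the project's actual implementation:

    import pandas as pd

    def remove_weak_classes(df, columnClass, minOfInstancePerClass):
        # Hypothetical: keep only classes with enough examples.
        counts = df[columnClass].value_counts()
        kept = counts[counts >= minOfInstancePerClass].index
        return df[df[columnClass].isin(kept)]

    def resample_classes(df, columnClass, maxOfInstancePerClass):
        # Hypothetical: downsample every class to at most maxOfInstancePerClass rows.
        return (df.groupby(columnClass, group_keys=False)
                  .apply(lambda g: g.sample(min(len(g), maxOfInstancePerClass), random_state=42)))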
@@ -70,109 +75,53 @@ doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
 doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
 
-extractor = feature_extractor(df, columnText, columnClass)
-extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)
-
-features_techniques = [
-    ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-
-features_techniques_paragraphe = [
-    ('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-
-#prepare data
-df = df[df[columnClass] != 'unclassified']
-y = df[columnClass]
-
-#case of full text
-for feature_technique_name, features in features_techniques:
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y)
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
-
-    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
-        clf_name, clf = tmp_clf
-        grid_param_name, grid_param = tmp_grid_params
-        print(clf_name, clf, grid_param_name, grid_param)
-        if clf_name == 'bayes':
-            if feature_technique_name == 'doc2vec':
-                continue
-            else:
-                t_begin = time.time()
-                clf.fit(train_x, train_y)
-                t_end = time.time()
-                training_time = t_end - t_begin
-                y_pred = clf.predict(test_x)
-        else:
-            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-            t_begin = time.time()
-            clf.fit(train_x, train_y)
-            t_end = time.time()
-            training_time = t_end - t_begin
-            y_pred = clf.predict(test_x)
-
-        #evaluate model
-        file_name_report = feature_technique_name + '_' + clf_name
-        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
-        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
-            sys.stdout = f # Change the standard output to the file we created.
-            print(report)
-            print('accuracy : {}'.format(accuracy))
-            print('weighted_Precision : {}'.format(weighted_avg['precision']))
-            print('weighted_Recall : {}'.format(weighted_avg['recall']))
-            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
-            print('weighted_Support : {}'.format(weighted_avg['support']))
-            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
-            print('training time : {}'.format(training_time))
-        sys.stdout = sys.__stdout__
-
-for feature_technique_name, features in features_techniques_paragraphe:
-    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y)
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
-
-    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
-        clf_name, clf = tmp_clf
-        grid_param_name, grid_param = tmp_grid_params
-        if clf_name == 'bayes':
-            if feature_technique_name == 'doc2vec':
-                continue
-            else:
-                t_begin = time.time()
-                clf.fit(train_x, train_y)
-                t_end = time.time()
+for columnInput in [columnText, 'firstParagraph']:
+
+    extractor = feature_extractor(df, columnInput, columnClass)
+    #extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)
+
+    features_techniques = [
+        ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+
+    '''
+    features_techniques_paragraphe = [
+        ('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
+        ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
+    '''
+
+    #prepare data
+    df = df[df[columnClass] != 'unclassified']
+    y = df[columnClass]
+
+    for feature_technique_name, features in features_techniques:
+        train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y)
+        encoder = preprocessing.LabelEncoder()
+        train_y = encoder.fit_transform(train_y)
+        valid_y = encoder.fit_transform(test_y)
+
+        for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
+            clf_name, clf = tmp_clf
+            grid_param_name, grid_param = tmp_grid_params
+            print(clf_name, clf, grid_param_name, grid_param)
+            if clf_name == 'bayes':
+                if feature_technique_name == 'doc2vec':
+                    continue
+                else:
+                    t_begin = time.time()
+                    clf.fit(train_x, train_y)
+                    t_end = time.time()
+                    training_time = t_end - t_begin
+                    y_pred = clf.predict(test_x)
+            else:
+                clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
+                t_begin = time.time()
+                clf.fit(train_x, train_y)
+                t_end = time.time()
@@ -180,34 +129,21 @@ for feature_technique_name, features in features_techniques_paragraphe:
-                training_time = t_end - t_begin
-                y_pred = clf.predict(test_x)
-        else:
-            clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
-            t_begin = time.time()
-            clf.fit(train_x, train_y)
-            t_end = time.time()
-            training_time = t_end - t_begin
-            y_pred = clf.predict(test_x)
-
-        #evaluate model
-        file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name
-        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe)+'.pdf')
-        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe+'.txt'), 'w') as f:
-            sys.stdout = f # Change the standard output to the file we created.
-            print(report)
-            print('accuracy : {}'.format(accuracy))
-            print('weighted_Precision : {}'.format(weighted_avg['precision']))
-            print('weighted_Recall : {}'.format(weighted_avg['recall']))
-            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
-            print('weighted_Support : {}'.format(weighted_avg['support']))
-            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
-            print('training time : {}'.format(training_time))
-        #sys.stdout = sys.stdout # Reset the standard output to its original value
-        sys.stdout = sys.__stdout__
+                training_time = t_end - t_begin
+                y_pred = clf.predict(test_x)
+
+            #evaluate model
+            file_name_report = columnInput + '_' + feature_technique_name + '_' + clf_name
+            report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
+            with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
+                sys.stdout = f # Change the standard output to the file we created.
+                print(report)
+                print('accuracy : {}'.format(accuracy))
+                print('weighted_Precision : {}'.format(weighted_avg['precision']))
+                print('weighted_Recall : {}'.format(weighted_avg['recall']))
+                print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
+                print('weighted_Support : {}'.format(weighted_avg['support']))
+                print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
+                print('training time : {}'.format(training_time))
+            sys.stdout = sys.__stdout__
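Both the old and the new report blocks redirect print() output by reassigning sys.stdout and restoring sys.__stdout__ by hand. The standard library's contextlib.redirect_stdout gives the same behavior and restores the stream even if a print raises; a minimal sketch, with report_path standing in for the os.path.join(...) expression above:

    import contextlib

    # print() output goes to the report file inside the block; sys.stdout
    # is restored automatically when the with-block exits, even on error.
    with open(report_path, 'w') as f, contextlib.redirect_stdout(f):
        print(report)
        print('accuracy : {}'.format(accuracy))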
requirements.txt
@@ -6,9 +6,9 @@ Keras==2.4.3
 Keras-Preprocessing==1.1.2
 sentence-transformers==0.4.1.2
 transformers==4.3.2
-torch==1.8.1
+torch==1.7.1
 torchvision==0.8.2
 tokenizers==0.10.1
 regex==2018.1.10
 tensorflow==2.2.0
-gensgensim==3.8.1
+gensim==3.8.1
(shell script that launches the experiments)
@@ -3,13 +3,13 @@ mkdir -p reports/ensemble_domaine_enccre
 mkdir -p reports/normClass_artfl
 pip install -r requirements.txt
 python tmp_preprocess_data.py
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 800
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 100 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 50 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 300 1500
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 2000
-python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 800
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 100 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 50 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 300 1500
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 2000
+python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 500
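Each invocation passes five positional arguments: the data file, the text column (now contentWithoutClass), the class column, and the minimum and maximum instances per class. The parser itself sits above the lines shown in the experimentsClassicClassifiers.py hunks; a sketch of what these commands imply, assuming plain positional argparse arguments:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('dataPath')      # e.g. data/EDdA_dataframe_withContent.tsv
    parser.add_argument('columnText')    # e.g. contentWithoutClass
    parser.add_argument('columnClass')   # e.g. ensemble_domaine_enccre
    parser.add_argument('minOfInstancePerClass', type=int)
    parser.add_argument('maxOfInstancePerClass', type=int)
    args = parser.parse_args()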
tmp_preprocess_data.py
@@ -22,65 +22,58 @@ from ClassPreprocessor import create_dict
-def removeMarkers(full_text, listOfMarkers):
-    if not pd.isna(full_text):
-        for m in listOfMarkers:
-            marker = str(m)
-            marker_with_brcts = '(' + marker + ')'
-            full_text = full_text.replace(marker_with_brcts, "")
-            full_text = full_text.replace(marker, "")
-            #full_text = row[textColumn]
-            i = unidecode(full_text).find(marker_with_brcts)
-            goOn = False
-            if i != -1:
-                goOn = True
-            while goOn:
-                full_text = "".join((full_text[:i], "", full_text[i+len(marker_with_brcts):]))
-                i = unidecode(full_text).find(marker_with_brcts)
-                if i == -1:
-                    goOn = False
-    #row[textColumn] = full_text
-    return full_text
-
-## removeMarkers() is vectorized so that it can be applied efficiently (in terms of computation time) to the dataframe
-vec_removeMarkers = np.vectorize(removeMarkers)
-
+print("Begin preprocess")
+
 # Reading data and preprocessing steps
 preprocessor = Preprocessor()
-df = pd.read_csv('corpus_tei.csv')
-listOfM = df['class'].unique()
-df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
-preprocessor.remove_null_rows(df_original, 'content')
-#df_original = removeMarkers(df_original, 'content', listOfM)
-df_original['content_withoutMarkers'] = vec_removeMarkers(df_original.content, listOfM)
-
-df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
-df_2 = df_original[['domaine_enccre','content']].copy()
-df_3 = df_original[['normClass_artfl','content']].copy()
-
-preprocessor.remove_null_rows(df_1, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
-preprocessor.remove_null_rows(df_2, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_2, 'domaine_enccre')
-preprocessor.remove_null_rows(df_3, 'content_withoutMarkers')
-preprocessor.remove_null_rows(df_3, 'normClass_artfl')
-
-df_1 = split_class(df_1, 'ensemble_domaine_enccre')
-df_2 = split_class(df_2, 'domaine_enccre')
-df_3 = split_class(df_3, 'normClass_artfl')
+#df = pd.read_csv('data/corpus_tei.csv')
+#listOfM = df['class'].unique()
+
+print("load dataset")
+df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
+df = df_original.copy()
+
+print("remove blank rows")
+df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
+df.reset_index(drop=True, inplace=True)
+#preprocessor.remove_null_rows(df_original, 'content')
+
+print("copy")
+df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
+df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
+df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
+
+############ shall we remove articles with fewer than n tokens ####### remove markers
+#preprocessor.remove_null_rows(df_1, 'contentWithoutClass')
+#preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
+#preprocessor.remove_null_rows(df_2, 'contentWithoutClass')
+#preprocessor.remove_null_rows(df_2, 'domaine_enccre')
+#preprocessor.remove_null_rows(df_3, 'contentWithoutClass')
+#preprocessor.remove_null_rows(df_3, 'normClass')
+
+print("split ensemble_domaine_enccre")
+df_1 = split_class(df_1, 'ensemble_domaine_enccre')
+print("save dataframe")
+df_1.to_csv('./data/dataframe_with_ensemble_domaine_enccre.csv')
+
+print("split domaine_enccre")
+df_2 = split_class(df_2, 'domaine_enccre')
+print("save dataframe")
+df_2.to_csv('./data/dataframe_with_domaine_enccre.csv')
+
+print("split normClass")
+df_3 = split_class(df_3, 'normClass')
+print("save dataframe")
+df_3.to_csv('./data/dataframe_with_normClass_artfl.csv')
+
+print("some stats")
 d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
 tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=['Count'])
 tosave.to_excel("ensemble_domaine_enccre.xlsx")
@@ -93,10 +86,4 @@ d_3 = create_dict(df_3, 'normClass_artfl')
 tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=['Count'])
 tosave.to_excel("normClass_artfl.xlsx")
 
-df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv')
-df_2.to_csv('dataframe_with_domaine_enccre.csv')
-df_3.to_csv('dataframe_with_normClass_artfl.csv')
-
 print(df_original.shape)
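A side note on the statistics block this hunk leaves in place: the create_dict / DataFrame.from_dict / to_excel chain computes per-class counts, which pandas exposes directly; a one-line equivalent sketch:

    # Per-class instance counts written straight to Excel, matching the
    # 'Count' column produced by the create_dict chain above.
    df_1['ensemble_domaine_enccre'].value_counts().rename('Count').to_excel('ensemble_domaine_enccre.xlsx')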