Commit 32f28eea authored by Ludovic Moncla

update

parent 2492f680
@@ -2,3 +2,5 @@
.DS_Store
data/EDdA_dataframe_withContent.tsv
.DS_Store
*.pyc
.DS_Store
@@ -31,6 +31,11 @@ columnClass = args.columnClass
minOfInstancePerClass = args.minOfInstancePerClass
maxOfInstancePerClass = args.maxOfInstancePerClass
if not os.path.exists('reports'):
os.makedirs('reports')
if not os.path.exists(os.path.join('reports', columnClass)):
os.makedirs(os.path.join('reports', columnClass))
# create a directory inside reports/ to save the classification results
dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
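The two existence checks above can be collapsed into one call; a minimal sketch of the equivalent idiom (columnClass as parsed from the arguments earlier in the script):

import os

# exist_ok=True makes the call a no-op when the directory already exists,
# so no preliminary os.path.exists check is needed
os.makedirs(os.path.join('reports', columnClass), exist_ok=True)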
@@ -47,13 +52,13 @@ df_original = pd.read_csv(dataPath)
df = df_original[[columnClass,columnText]].copy()
preprocessor.remove_null_rows(df, columnText)
preprocessor.remove_null_rows(df, columnClass)
#preprocessor.remove_null_rows(df, columnText)
#preprocessor.remove_null_rows(df, columnClass)
#df = split_class(df, columnClass)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass )
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
preprocessor.getFirstParagraph(df, columnText, 'paragraphe') # select the first paragraph of each text
#preprocessor.getFirstParagraph(df, columnText, 'paragraphe') # select the first paragraph of each text
# Read the configuration file to retrieve the feature extractors' parameters
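remove_weak_classes and resample_classes are repository helpers whose implementations are not shown in this diff. A plausible sketch of their behaviour, assuming the names mean what they say (drop under-represented classes, then cap each class):

import pandas as pd

def remove_weak_classes(df, col, min_count):
    # keep only rows whose class occurs at least min_count times
    counts = df[col].value_counts()
    return df[df[col].isin(counts[counts >= min_count].index)]

def resample_classes(df, col, max_count):
    # downsample every class to at most max_count rows
    return df.groupby(col, group_keys=False).apply(
        lambda g: g.sample(min(len(g), max_count), random_state=42))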
@@ -70,109 +75,53 @@ doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
extractor = feature_extractor(df,columnText, columnClass)
extractor_paragraphe = feature_extractor(df,'paragraphe', columnClass)
for columnInput in [columnText, 'firstParagraph']:
extractor = feature_extractor(df, columnInput, columnClass) # use the loop variable, not columnText, so the second pass really extracts from 'firstParagraph'
#extractor_paragraphe = feature_extractor(df,'paragraphe', columnClass)
features_techniques = [
('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
features_techniques_paragraphe = [
('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
features_techniques = [
('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
'''
features_techniques_paragraphe = [
('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
'''
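For orientation: count_vect and tf_idf presumably wrap the standard scikit-learn vectorizers with the max_df / min_df / max-features knobs read via configparser above (doc2vec would come from gensim). A self-contained sketch with illustrative values:

from sklearn.feature_extraction.text import TfidfVectorizer

# illustrative knob values; the script reads them from its configuration file
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=5000)
features = vectorizer.fit_transform([
    "first article text",
    "second article text",
    "first and second together",
])
print(features.shape)  # (3, vocabulary size after df filtering)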
#prepare data
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
#case of full text
for feature_technique_name, features in features_techniques:
train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y) # reuse the mapping fitted on train_y
#prepare data
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
clf_name, clf = tmp_clf
grid_param_name, grid_param = tmp_grid_params
print(clf_name, clf, grid_param_name, grid_param)
if clf_name == 'bayes':
if feature_technique_name == 'doc2vec':
continue
else:
t_begin = time.time()
clf.fit(train_x, train_y)
t_end = time.time()
training_time = t_end - t_begin
y_pred = clf.predict(test_x)
#case of full text
for feature_technique_name, features in features_techniques:
train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y) # reuse the mapping fitted on train_y
for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
clf_name, clf = tmp_clf
grid_param_name, grid_param = tmp_grid_params
print(clf_name, clf, grid_param_name, grid_param)
if clf_name == 'bayes':
if feature_technique_name == 'doc2vec':
continue
else:
t_begin = time.time()
clf.fit(train_x, train_y)
t_end = time.time()
training_time = t_end - t_begin
y_pred = clf.predict(test_x)
else:
clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
t_begin = time.time()
clf.fit(train_x, train_y)
t_end = time.time()
training_time = t_end - t_begin
y_pred = clf.predict(test_x)
#evaluate model
file_name_report = feature_technique_name + '_' + clf_name
report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
sys.stdout = f # Change the standard output to the file we created.
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
print('training time : {}'.format(training_time))
#sys.stdout = sys.stdout # Reset the standard output to its original value
sys.stdout = sys.__stdout__
for feature_technique_name, features in features_techniques_paragraphe:
train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y) # reuse the mapping fitted on train_y
for tmp_clf, clf_grid_params in zip(classifiers, grid_params):
clf_name, clf = tmp_clf
grid_param_name, grid_param = tmp_grid_params
if clf_name == 'bayes':
if feature_technique_name == 'doc2vec':
continue
else:
else:
clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
t_begin = time.time()
clf.fit(train_x, train_y)
t_end = time.time()
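For reference, the GridSearchCV branch used for every non-Bayes classifier, in isolation; the toy data and grid below are assumptions, since the real classifiers / grid_params pairs are defined elsewhere in the script:

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, train_test_split

# toy features standing in for the count / tf-idf / doc2vec matrices above
X, y = make_classification(n_samples=200, n_features=20, random_state=42)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=42)

param_grid = {'alpha': [1e-4, 1e-3]}  # assumed, illustrative grid
clf = GridSearchCV(SGDClassifier(), param_grid, refit=True, verbose=3)
clf.fit(train_x, train_y)   # refit=True retrains the best estimator on all training data
y_pred = clf.predict(test_x)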
@@ -180,34 +129,21 @@ for feature_technique_name, features in features_techniques_paragraphe:
y_pred = clf.predict(test_x)
else:
clf = GridSearchCV(clf, grid_param, refit = True, verbose = 3)
t_begin = time.time()
clf.fit(train_x, train_y)
t_end = time.time()
training_time = t_end - t_begin
y_pred = clf.predict(test_x)
#evaluate model
file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name
report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe)+'.pdf')
with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe+'.txt'), 'w') as f:
sys.stdout = f # Change the standard output to the file we created.
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
print('training time : {}'.format(training_time))
sys.stdout = sys.stdout # Reset the standard output to its original value
sys.stdout = sys.__stdout__
#evaluate model
file_name_report = columnInput + '_' +feature_technique_name + '_' + clf_name
report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
with open(os.path.join('reports', columnClass, dir_name_report, file_name_report+'.txt'), 'w') as f:
sys.stdout = f # Change the standard output to the file we created.
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
print('training time : {}'.format(training_time))
#sys.stdout = sys.stdout # Reset the standard output to its original value
sys.stdout = sys.__stdout__
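Reassigning sys.stdout by hand, as above, leaves output redirected if an exception fires before the reset; contextlib.redirect_stdout restores it automatically. A sketch, where report, accuracy and report_path stand in for the variables used above:

import contextlib

with open(report_path, 'w') as f, contextlib.redirect_stdout(f):
    # sys.stdout is restored as soon as the with-block exits, even on error
    print(report)
    print('accuracy : {}'.format(accuracy))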
@@ -6,9 +6,9 @@ Keras==2.4.3
Keras-Preprocessing==1.1.2
sentence-transformers==0.4.1.2
transformers==4.3.2
torch==1.8.1
torch==1.7.1
torchvision==0.8.2
tokenizers==0.10.1
regex==2018.1.10
tensorflow==2.2.0
gensgensim==3.8.1
gensim==3.8.1
@@ -3,13 +3,13 @@ mkdir -p reports/ensemble_domaine_enccre
mkdir -p reports/normClass_artfl
pip install -r requirements.txt
python tmp_preprocess_data.py
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 800
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 100 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 2000
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 50 800
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass ensemble_domaine_enccre 100 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass domaine_enccre 300 500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 2000
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv contentWithoutClass normClass_artfl 50 500
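Each invocation passes five positional arguments; from the args.* names at the top of the diff, the parser presumably looks something like this sketch (assumed, not quoted from the script):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('dataPath')                          # e.g. data/EDdA_dataframe_withContent.tsv
parser.add_argument('columnText')                        # e.g. content or contentWithoutClass
parser.add_argument('columnClass')                       # e.g. ensemble_domaine_enccre
parser.add_argument('minOfInstancePerClass', type=int)   # e.g. 50
parser.add_argument('maxOfInstancePerClass', type=int)   # e.g. 1500
args = parser.parse_args()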
@@ -22,65 +22,58 @@ from ClassPreprocessor import create_dict
def removeMarkers(full_text, listOfMarkers):
if not pd.isna(full_text):
for m in listOfMarkers:
marker = str(m)
marker_with_brcts = '(' + marker + ')'
full_text = full_text.replace(marker_with_brcts, "")
full_text = full_text.replace(marker, "")
#full_text = row[textColumn]
i = unidecode(full_text).find(marker_with_brcts)
goOn = False
if i != -1:
goOn = True
while goOn:
full_text = "".join((full_text[:i],"",full_text[i+len(marker_with_brcts):]))
i = unidecode(full_text).find(marker_with_brcts)
if i == -1:
goOn = False
#row[textColumn] = full_text
return full_text
## Vectorize removeMarkers() so it can be applied efficiently (in terms of computation time) to the dataframe
vec_removeMarkers = np.vectorize(removeMarkers)
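np.vectorize is a convenience wrapper (a Python-level loop, not a compiled ufunc), and it broadcasts every positional argument, so a shared marker list has to be named in excluded to be passed whole to each call. A self-contained illustration with a hypothetical strip_markers helper:

import numpy as np
import pandas as pd

def strip_markers(text, markers):
    for m in markers:
        text = text.replace('(' + m + ')', '')
    return text

# excluded={1} passes the whole list to every call instead of broadcasting it
vec_strip = np.vectorize(strip_markers, excluded={1})
s = pd.Series(['PASTEL, (Géog.) a town', 'AMBLE, (Maréchalerie) a gait'])
print(vec_strip(s, ['Géog.', 'Maréchalerie']))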
print("Begin preprocess")
# Reading data and preprocessings steps
preprocessor = Preprocessor()
df = pd.read_csv('corpus_tei.csv')
listOfM = df['class'].unique()
#df = pd.read_csv('data/corpus_tei.csv')
#listOfM = df['class'].unique()
df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
preprocessor.remove_null_rows(df_original, 'content')
#df_original = removeMarkers(df_original, 'content', listOfM)
df_original['content_withoutMarkers'] = vec_removeMarkers(df_original.content, listOfM)
print("load dataset")
df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
df_2 = df_original[['domaine_enccre','content']].copy()
df_3 = df_original[['normClass_artfl','content']].copy()
df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df = df_original.copy()
############ shall we remove articles with fewer than n tokens? ####### remove markers
preprocessor.remove_null_rows(df_1, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
preprocessor.remove_null_rows(df_2, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_2, 'domaine_enccre')
preprocessor.remove_null_rows(df_3, 'content_withoutMarkers')
preprocessor.remove_null_rows(df_3, 'normClass_artfl')
print("remove blank rows")
df.dropna(subset = ['content', 'contentWithoutClass', 'firstParagraph', 'ensemble_domaine_enccre', 'domaine_enccre', 'normClass'], inplace = True)
df.reset_index(drop=True, inplace=True)
#preprocessor.remove_null_rows(df_original, 'content')
print("copy")
df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
############ shall we remove articles with fewer than n tokens? ####### remove markers
#preprocessor.remove_null_rows(df_1, 'contentWithoutClass')
#preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
#preprocessor.remove_null_rows(df_2, 'contentWithoutClass')
#preprocessor.remove_null_rows(df_2, 'domaine_enccre')
#preprocessor.remove_null_rows(df_3, 'contentWithoutClass')
#preprocessor.remove_null_rows(df_3, 'normClass')
print("split ensemble domaine enccre")
df_1 = split_class(df_1, 'ensemble_domaine_enccre')
print("save dataframe")
df_1.to_csv('./data/dataframe_with_ensemble_domaine_enccre.csv')
print("split ensemble domaine enccre")
df_2 = split_class(df_2, 'domaine_enccre')
df_3 = split_class(df_3, 'normClass_artfl')
print("save dataframe")
df_2.to_csv('./data/dataframe_with_domaine_enccre.csv')
print("split ensemble domaine enccre")
df_3 = split_class(df_3, 'normClass')
print("save dataframe")
df_3.to_csv('./data/dataframe_with_normClass_artfl.csv')
print("some stats")
d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=['Count'])
tosave.to_excel("ensemble_domaine_enccre.xlsx")
@@ -93,10 +86,4 @@ d_3 = create_dict(df_3, 'normClass_artfl')
tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=['Count'])
tosave.to_excel("normClass_artfl.xlsx")
df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv')
df_2.to_csv('dataframe_with_domaine_enccre.csv')
df_3.to_csv('dataframe_with_normClass_artfl.csv')
print(df_original.shape)