Skip to content
Snippets Groups Projects
Commit 9e575a9d authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

update

parent 6827bb95
No related branches found
No related tags found
No related merge requests found
......@@ -17,8 +17,6 @@ import nltk
# Fetch the NLTK resources used by the preprocessing/tokenization steps.
nltk.download('stopwords')
nltk.download('punkt')
# Command-line interface: dataset path, text/class column names, and the
# per-class instance bounds used later for class filtering and resampling.
# NOTE(review): diff fragment — an argument for columnClass is partially
# visible in the hunk header below; the full definition is not shown here.
parser = argparse.ArgumentParser()
parser.add_argument("dataPath", help="Path of the dataframe")
parser.add_argument("columnText", help="the column name of the text that should preproceed", default = 'content')
......@@ -26,9 +24,6 @@ parser.add_argument("columnClass", help="ColumnClass the column name of the clas
parser.add_argument("minOfInstancePerClass", help="minOfInstancePerClass the minimum of instance required for each class", type=int)
parser.add_argument("maxOfInstancePerClass", help="maxOfInstancePerClass the maximum of instance required resamling classes", type=int)
args = parser.parse_args()
# Unpack parsed CLI arguments into module-level names used below.
dataPath = args.dataPath
columnText = args.columnText
......@@ -47,31 +42,20 @@ dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
# Create the per-configuration report directory (reports/<class>/<min>_<max>)
# if it does not already exist.
if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
os.makedirs(os.path.join('reports', columnClass, dir_name_report))
# Reading data and preprocessings steps
preprocessor = Preprocessor()
df_original = pd.read_csv(dataPath)
# Keep only the class and text columns for the experiment.
df = df_original[[columnClass,columnText]].copy()
#preprocessor.remove_null_rows(df, columnText)
#preprocessor.remove_null_rows(df, columnClass)
#df = split_class(df, columnClass)
# Drop under-represented classes, then cap over-represented ones at the
# requested maximum (helpers defined elsewhere in the project).
df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
df = resample_classes(df, columnClass, maxOfInstancePerClass)
#preprocessor.getFirstParagraph(df, columnText, 'paragraphe' ) # select first sentence of each text
#Read configuration file for retreiving parameters of features extractors
config = configparser.ConfigParser()
config.read('settings.conf')
# max_df / min_df from settings.conf may be an int (absolute document count)
# or a float (proportion): parse as int when the raw string is all digits,
# otherwise as float. numberOfFeatures is optional and falls back to None.
vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
......@@ -79,33 +63,22 @@ doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
# Doc2Vec hyperparameters read from settings.conf.
doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
# Run the experiment once on the full text column and once on the first
# paragraph. NOTE(review): the loop body below uses columnText, not
# columnInput — presumably a leftover; verify against the full file.
for columnInput in [columnText, 'firstParagraph']:
print('Process: ' + columnInput)
extractor = feature_extractor(df,columnText, columnClass)
#extractor_paragraphe = feature_extractor(df,'paragraphe', columnClass)
# Three vectorization strategies compared in the experiments:
# raw counts, TF-IDF weights, and Doc2Vec embeddings.
features_techniques = [
('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
'''
features_techniques_paragraphe = [
('counter', extractor_paragraphe.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
('tf_idf', extractor_paragraphe.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
'''
#prepare data
# Discard rows without a usable label before building the target vector.
df = df[df[columnClass] != 'unclassified']
y = df[columnClass]
#case of full text
# Stratified 67/33 train/test split per feature technique; fixed
# random_state for reproducibility. (Loop body continues past this
# fragment — the diff is truncated here.)
for feature_technique_name, features in features_techniques:
train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
......
......@@ -14,4 +14,4 @@ torchvision==0.8.2
tokenizers==0.10.1
regex==2018.1.10
tensorflow==2.2.0
gensim==3.8.1
gensim==3.8.1
\ No newline at end of file
......@@ -15,23 +15,16 @@ from sklearn.model_selection import GridSearchCV
# Fragment of a second script in this commit: dataset loading and
# per-classification-scheme preprocessing.
import configparser
from re import search
import math
from unidecode import unidecode
import re
import nltk
from ClassPreprocessor import create_dict
print("Begin preprocess")
# Reading data and preprocessings steps
preprocessor = Preprocessor()
#df = pd.read_csv('data/corpus_tei.csv')
#listOfM = df['class'].unique()
print("load dataset")
......@@ -48,14 +41,6 @@ df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParag
# One working copy per classification scheme, each keeping the same three
# text variants (content, contentWithoutClass, firstParagraph).
df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
############ shall we remove articles with less n tokens ####### remove markers
#preprocessor.remove_null_rows(df_1, 'contentWithoutClass')
#preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
#preprocessor.remove_null_rows(df_2, 'contentWithoutClass')
#preprocessor.remove_null_rows(df_2, 'domaine_enccre')
#preprocessor.remove_null_rows(df_3, 'contentWithoutClass')
#preprocessor.remove_null_rows(df_3, 'normClass')
print("split ensemble domaine enccre")
# split_class is a project helper (defined elsewhere); expands multi-label
# rows for the 'ensemble_domaine_enccre' scheme.
df_1 = split_class(df_1, 'ensemble_domaine_enccre')
print("save dataframe")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment