Commit 9e575a9d authored by Ludovic Moncla

update

parent 6827bb95
@@ -17,8 +17,6 @@ import nltk
 nltk.download('stopwords')
 nltk.download('punkt')
 parser = argparse.ArgumentParser()
 parser.add_argument("dataPath", help="Path of the dataframe")
 parser.add_argument("columnText", help="the column name of the text that should be preprocessed", default='content')
@@ -26,9 +24,6 @@ parser.add_argument("columnClass", help="ColumnClass the column name of the clas
 parser.add_argument("minOfInstancePerClass", help="the minimum number of instances required for each class", type=int)
 parser.add_argument("maxOfInstancePerClass", help="the maximum number of instances per class when resampling", type=int)
 args = parser.parse_args()
 dataPath = args.dataPath
 columnText = args.columnText
@@ -47,31 +42,20 @@ dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
 if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
     os.makedirs(os.path.join('reports', columnClass, dir_name_report))
 # Reading data and preprocessing steps
 preprocessor = Preprocessor()
 df_original = pd.read_csv(dataPath)
 df = df_original[[columnClass, columnText]].copy()
-#preprocessor.remove_null_rows(df, columnText)
-#preprocessor.remove_null_rows(df, columnClass)
-#df = split_class(df, columnClass)
 df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
-#preprocessor.getFirstParagraph(df, columnText, 'paragraphe')  # select first sentence of each text
 # Read configuration file for retrieving feature extractor parameters
 config = configparser.ConfigParser()
 config.read('settings.conf')
 vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
 vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
 vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
@@ -79,33 +63,22 @@ doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
 doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
 doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
 for columnInput in [columnText, 'firstParagraph']:
     print('Process: ' + columnInput)
     extractor = feature_extractor(df, columnText, columnClass)
-    #extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)
     features_techniques = [
         ('counter', extractor.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
         ('tf_idf', extractor.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
         ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-    '''
-    features_techniques_paragraphe = [
-        ('counter', extractor_paragraphe.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
-        ('tf_idf', extractor_paragraphe.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
-        ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
-    '''
     # prepare data
     df = df[df[columnClass] != 'unclassified']
     y = df[columnClass]
     # case of full text
     for feature_technique_name, features in features_techniques:
         train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
         encoder = preprocessing.LabelEncoder()
...
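A note on the three vectorization_* assignments kept above: scikit-learn's CountVectorizer and TfidfVectorizer accept max_df and min_df either as an int (an absolute document count) or as a float between 0.0 and 1.0 (a proportion of documents), which is why the script tests isdigit() before choosing the cast. A minimal sketch of the same pattern factored into a helper; the helper name is illustrative, not from the repository:

import configparser

def count_or_proportion(raw: str):
    # scikit-learn reads an int max_df/min_df as an absolute document count
    # and a float as a proportion of the corpus, so the cast matters.
    return int(raw) if raw.isdigit() else float(raw)

config = configparser.ConfigParser()
config.read('settings.conf')  # same configuration file as the script above
vectorization_max_df = count_or_proportion(config.get('vectorizers', 'vectorization_max_df'))
vectorization_min_df = count_or_proportion(config.get('vectorizers', 'vectorization_min_df'))

One consequence of the isdigit() test: a max_df written as '1' becomes the int 1 (terms may appear in at most one document), while '1.0' becomes the float 1.0 (terms may appear in every document), so the two spellings configure very different vectorizers.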
@@ -14,4 +14,4 @@ torchvision==0.8.2
 tokenizers==0.10.1
 regex==2018.1.10
 tensorflow==2.2.0
-gensim==3.8.1
\ No newline at end of file
+gensim==3.8.1
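The only substantive change in this hunk is the file's final byte: the "\ No newline at end of file" marker shows the old requirements file ended gensim==3.8.1 without a trailing newline, and the commit rewrites the line with one. The gensim pin is worth noting for the doc2vec extractor in the first script, since gensim 4.x introduced breaking changes to the Doc2Vec API (assuming the extractor is built on gensim, which the pin suggests). A quick sanity check that an installed environment matches these pins:

# Fail fast if installed versions drift from the pins above.
import gensim
import tensorflow

assert gensim.__version__ == "3.8.1", f"expected gensim 3.8.1, got {gensim.__version__}"
assert tensorflow.__version__ == "2.2.0", f"expected tensorflow 2.2.0, got {tensorflow.__version__}"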
@@ -15,23 +15,16 @@ from sklearn.model_selection import GridSearchCV
 import configparser
 from re import search
 import math
-from unidecode import unidecode
 import re
 import nltk
 from ClassPreprocessor import create_dict
 print("Begin preprocess")
 # Reading data and preprocessing steps
 preprocessor = Preprocessor()
-#df = pd.read_csv('data/corpus_tei.csv')
-#listOfM = df['class'].unique()
 print("load dataset")
@@ -48,14 +41,6 @@ df_1 = df[['ensemble_domaine_enccre','content','contentWithoutClass','firstParag
 df_2 = df[['domaine_enccre','content','contentWithoutClass','firstParagraph']].copy()
 df_3 = df[['normClass','content','contentWithoutClass','firstParagraph']].copy()
-############ shall we remove articles with fewer than n tokens? ####### remove markers
-#preprocessor.remove_null_rows(df_1, 'contentWithoutClass')
-#preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
-#preprocessor.remove_null_rows(df_2, 'contentWithoutClass')
-#preprocessor.remove_null_rows(df_2, 'domaine_enccre')
-#preprocessor.remove_null_rows(df_3, 'contentWithoutClass')
-#preprocessor.remove_null_rows(df_3, 'normClass')
 print("split ensemble domaine enccre")
 df_1 = split_class(df_1, 'ensemble_domaine_enccre')
 print("save dataframe")
...
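remove_weak_classes, resample_classes, and split_class are imported from the project's own modules, so their bodies are not part of this diff. A plausible sketch of the first two, with semantics inferred only from the call sites (a minimum and a maximum number of instances per class); treat it as an assumption rather than the repository's actual implementation:

import pandas as pd

def remove_weak_classes(df: pd.DataFrame, column_class: str, min_count: int) -> pd.DataFrame:
    # Assumed semantics: drop every class that has fewer than min_count rows.
    counts = df[column_class].value_counts()
    return df[df[column_class].isin(counts[counts >= min_count].index)]

def resample_classes(df: pd.DataFrame, column_class: str, max_count: int) -> pd.DataFrame:
    # Assumed semantics: downsample each class to at most max_count rows.
    return (df.groupby(column_class, group_keys=False)
              .apply(lambda g: g.sample(n=min(len(g), max_count), random_state=42)))

Capping class size before the stratified train_test_split in the first script keeps dominant ENCCRE domains from swamping rarer ones, which is presumably why both a minimum and a maximum are exposed as command-line arguments.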