Commit 21bf19a9 authored by Ludovic Moncla

Update experimentsClassicClassifiers.py

parent c6f787f7
@@ -52,7 +52,7 @@ if not os.path.exists('models'):
 # Reading data and preprocessings steps
 preprocessor = Preprocessor()
-df = pd.read_csv(dataPath)
+df = pd.read_csv(dataPath, sep="\t")
 df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
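Note: remove_weak_classes and resample_classes are project helpers whose implementation is not part of this commit. Judging only from their names and arguments, a plausible sketch of their behaviour (hypothetical, for orientation; the real code lives elsewhere in the repository):

import pandas as pd

def remove_weak_classes(df, column_class, min_instances):
    # Hypothetical sketch: drop rows whose class occurs fewer
    # than min_instances times in the dataset.
    counts = df[column_class].value_counts()
    keep = counts[counts >= min_instances].index
    return df[df[column_class].isin(keep)]

def resample_classes(df, column_class, max_instances):
    # Hypothetical sketch: cap every class at max_instances rows
    # so that very frequent classes do not dominate training.
    return df.groupby(column_class, group_keys=False).apply(
        lambda g: g.sample(min(len(g), max_instances), random_state=42))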
@@ -72,32 +72,40 @@ doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
 doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
 doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))

+print("size after resampling, ", len(df))

-for columnInput in [columnText]:
-    print('Process: ' + columnInput)
-
-    #prepare data
-    df = df[df[columnClass] != 'unclassified']
-    y = df[columnClass]
-    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
+#prepare data
+df = df[df[columnClass] != 'unclassified']
+y = df[columnClass]
+print(df.head())
+train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+encoder = preprocessing.LabelEncoder()
+train_y = encoder.fit_transform(train_y)
+valid_y = encoder.fit_transform(test_y)
+
+print("size training set, ", len(train_x))
+print("size validation set, ", len(test_x))
+
+for columnInput in [columnText, 'firstParagraph']:
+    print('Process: ' + columnInput)

     extractor = feature_extractor(train_x, test_x, columnInput, columnClass)

     features_techniques = [
     ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
     ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-    ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers))]
+    ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))]

     #case of full text
     for feature_technique_name, features in features_techniques:
+        print("**** Classifier :", feature_technique_name)
         # features has the train_x and the test_x after vectorization
         train_x, test_x = features
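Two remarks on the hunk above. First, valid_y is built with encoder.fit_transform(test_y); because the split is stratified the train and test label sets coincide, but encoder.transform(test_y) would be the safer call, since refitting on test labels could otherwise silently change the class-to-integer mapping. Second, the doc2vec_dm comment mirrors gensim's documentation; a minimal standalone illustration of the two training modes (plain gensim, not the project's feature_extractor):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(words=text.split(), tags=[i])
        for i, text in enumerate(["first toy document", "second toy document"])]

# dm=1 -> PV-DM (distributed memory); dm=0 -> PV-DBOW (distributed bag of words)
model = Doc2Vec(docs, vector_size=50, min_count=1, dm=1, workers=4, epochs=10)
vector = model.infer_vector("an unseen document".split())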
@@ -115,10 +123,11 @@ for columnInput in [columnText]:
         t_begin = time.time()

         if os.path.isfile(os.path.join('./models', model_file_name)):
-            with open(os.path.join('./models', model_file_name), 'rb') as file:
+            report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
+            with open(os.path.join('./models', columnClass, model_file_name), 'rb') as file:
                 clf = pickle.load(file)
         else:
-            with open(os.path.join('./models', model_file_name), 'wb') as file:
+            with open(os.path.join('./models', columnClass, model_file_name), 'wb') as file:
                 clf.fit(train_x, train_y)
                 pickle.dump(clf, file)
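The if/else above is a simple on-disk model cache: unpickle a previously trained classifier when its file exists, otherwise fit it and persist it for the next run. Note that the new paths add a columnClass subdirectory under ./models, which must exist before open(..., 'wb') succeeds. A generic sketch of the pattern (hypothetical helper, not the repository's code), creating the directory explicitly:

import os
import pickle

def load_or_fit(clf, train_x, train_y, path):
    # Reuse a previously trained model if one was saved,
    # otherwise train it now and cache it on disk.
    if os.path.isfile(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    clf.fit(train_x, train_y)
    with open(path, 'wb') as f:
        pickle.dump(clf, f)
    return clf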
@@ -143,5 +152,10 @@ for columnInput in [columnText]:
         print('weighted_Support : {}'.format(weighted_avg['support']))
         print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
         print('training time : {}'.format(training_time))
+        try:
+            print('best parameters : {}'.format(clf.best_params_))
+        except AttributeError:
+            pass

         #sys.stdout = sys.stdout # Reset the standard output to its original value
         sys.stdout = sys.__stdout__
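The new try/except is meaningful because best_params_ exists only on fitted hyper-parameter search objects such as scikit-learn's GridSearchCV; a plain estimator raises AttributeError, which the commit silently skips. A minimal illustration of that guard (assuming scikit-learn; whether the project's classifiers are grid searches is not shown in this diff):

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), {'C': [0.1, 1.0, 10.0]}, cv=3)
search.fit(X, y)
print(search.best_params_)     # e.g. {'C': 1.0}

plain = SVC().fit(X, y)
try:
    print(plain.best_params_)  # plain estimators have no such attribute
except AttributeError:
    pass                       # same guard as in the commit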