Commit 21bf19a9 authored by Ludovic Moncla

Update experimentsClassicClassifiers.py

parent c6f787f7
@@ -52,7 +52,7 @@ if not os.path.exists('models'):
 # Reading data and preprocessings steps
 preprocessor = Preprocessor()
-df = pd.read_csv(dataPath)
+df = pd.read_csv(dataPath, sep="\t")
 df = remove_weak_classes(df, columnClass, minOfInstancePerClass)
 df = resample_classes(df, columnClass, maxOfInstancePerClass)
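The one-line change in this hunk matters because the corpus file is presumably tab-separated: with pandas' default sep=",", each row tends to collapse into a single column, and later lookups such as df[columnClass] fail. A minimal sketch (hypothetical file name, not the project's dataPath):

import pandas as pd

# Hypothetical TSV file, for illustration only.
path = "corpus.tsv"

df_wrong = pd.read_csv(path)             # default sep="," -> often one column holding the whole line
df_right = pd.read_csv(path, sep="\t")   # one column per tab-separated field
print(df_wrong.shape, df_right.shape)    # e.g. (n, 1) vs (n, k)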
@@ -72,32 +72,40 @@ doc2vec_min_count = int(config.get('vectorizers','doc2vec_min_count'))
 doc2vec_dm = int(config.get('vectorizers','doc2vec_dm')) # If dm=1, ‘distributed memory’ (PV-DM) is used. Otherwise, distributed bag of words (PV-DBOW) is employed.
 doc2vec_workers = int(config.get('vectorizers','doc2vec_workers'))

-for columnInput in [columnText]:
-    print('Process: ' + columnInput)
-    #prepare data
-    df = df[df[columnClass] != 'unclassified']
-    y = df[columnClass]
-    train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
-    encoder = preprocessing.LabelEncoder()
-    train_y = encoder.fit_transform(train_y)
-    valid_y = encoder.fit_transform(test_y)
+print("size after resampling, ",len(df))
+#prepare data
+df = df[df[columnClass] != 'unclassified']
+y = df[columnClass]
+print(df.head())
+train_x, test_x, train_y, test_y = train_test_split(df, y, test_size=0.33, random_state=42, stratify = y )
+encoder = preprocessing.LabelEncoder()
+train_y = encoder.fit_transform(train_y)
+valid_y = encoder.fit_transform(test_y)
+print("size training set, ",len(train_x))
+print("size validation set, ",len(test_x))
+for columnInput in [columnText, 'firstParagraph']:
+    print('Process: ' + columnInput)

     extractor = feature_extractor(train_x, test_x, columnInput, columnClass)
     features_techniques = [
         ('counter', extractor.count_vect(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures )),
         ('tf_idf', extractor.tf_idf(max_df = vectorization_max_df, min_df = vectorization_min_df, numberOfFeatures = vectorization_numberOfFeatures)),
-        ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm, doc2vec_workers))]
+        ('doc2vec', extractor.doc2vec(max_epochs, doc2vec_vec_size, doc2vec_min_count , doc2vec_dm))]

     #case of full text
     for feature_technique_name, features in features_techniques:
         print("**** Classifier :", feature_technique_name)
         # features has the train_x and the test_x after vectorization
         train_x, test_x = features
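For reference, the usual scikit-learn pattern behind the encoder lines in this hunk is to fit the LabelEncoder on the training labels only and reuse the fitted mapping on the held-out labels; calling fit_transform on both sides, as the code above does, only yields consistent integers because the stratified split keeps the same label set on each side. A minimal sketch with toy labels (not the project's data):

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

labels = ['geographie', 'histoire'] * 4          # toy labels for illustration
train_y, test_y = train_test_split(labels, test_size=0.25, random_state=42, stratify=labels)

encoder = preprocessing.LabelEncoder()
train_enc = encoder.fit_transform(train_y)       # learns the label -> int mapping
test_enc = encoder.transform(test_y)             # reuses it instead of refitting
print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))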
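This is also the hunk where the workers argument is dropped from the extractor's doc2vec call. The project's feature_extractor class is not shown here, but the gensim parameters the config refers to (including the PV-DM/PV-DBOW switch described in the comment above) look roughly like this sketch on a toy corpus:

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

texts = ["la ville de paris", "une carte de la france"]   # toy corpus
corpus = [TaggedDocument(words=t.split(), tags=[i]) for i, t in enumerate(texts)]

# dm=1 selects PV-DM ('distributed memory'); dm=0 selects PV-DBOW.
model = Doc2Vec(corpus, vector_size=50, min_count=1, dm=1, workers=4, epochs=20)
vec = model.infer_vector("la ville".split())               # one dense vector per document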
@@ -115,10 +123,11 @@ for columnInput in [columnText]:
         t_begin = time.time()

         if os.path.isfile(os.path.join('./models', model_file_name)):
-            with open(os.path.join('./models', model_file_name), 'rb') as file:
+            with open(os.path.join('./models',columnClass, model_file_name), 'rb') as file:
                 clf = pickle.load(file)
         else:
-            with open(os.path.join('./models', model_file_name), 'wb') as file:
+            with open(os.path.join('./models',columnClass, model_file_name), 'wb') as file:
                 clf.fit(train_x, train_y)
                 pickle.dump(clf, file)

+        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report)+'.pdf')
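Two things are worth noting about the caching logic this hunk touches: the os.path.isfile test appears to still check the old flat ./models path while both open() calls now use the per-class subdirectory, and open(..., 'wb') raises FileNotFoundError if that subdirectory does not exist yet. A generic load-or-train sketch of the pattern the change is moving toward (path names and the train_model helper are hypothetical):

import os
import pickle

model_dir = os.path.join('./models', 'domain')        # e.g. ./models/<columnClass>
os.makedirs(model_dir, exist_ok=True)                 # create the directory before writing
model_path = os.path.join(model_dir, 'svm_tf_idf.pkl')

if os.path.isfile(model_path):                        # check the same path that is opened
    with open(model_path, 'rb') as f:
        clf = pickle.load(f)
else:
    clf = train_model()                               # hypothetical helper: fit and return a classifier
    with open(model_path, 'wb') as f:
        pickle.dump(clf, f)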
@@ -143,5 +152,10 @@ for columnInput in [columnText]:
         print('weighted_Support : {}'.format(weighted_avg['support']))
         print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
         print('training time : {}'.format(training_time))
+        try:
+            print('best parameters : {}'.format(clf.best_params_))
+        except AttributeError:
+            pass

 #sys.stdout = sys.stdout # Reset the standard output to its original value
+sys.stdout = sys.__stdout__
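The new try/except works because best_params_ only exists on fitted search objects such as GridSearchCV; a plain estimator raises AttributeError, so the same reporting code serves both tuned and untuned classifiers. Likewise, sys.__stdout__ is the interpreter's original output stream, so assigning it back restores printing regardless of where sys.stdout had been redirected. A sketch of an equivalent, slightly more explicit form (toy parameter grid, not the project's configuration):

import sys
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

clf = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=3)
# ... clf.fit(train_x, train_y) would set clf.best_params_ ...

params = getattr(clf, 'best_params_', None)   # None for estimators without the attribute
if params is not None:
    print('best parameters : {}'.format(params))

sys.stdout = sys.__stdout__                   # undo any earlier redirection of sys.stdout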