Skip to content
Snippets Groups Projects
Commit bc79aeb7 authored by Khalleud's avatar Khalleud
Browse files

Remove unused files

parent a9ec82f7
No related branches found
No related tags found
1 merge request: !1 "Branch v1"
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.naive_bayes import MultinomialNB
# Train and evaluate a Multinomial Naive Bayes classifier that predicts the
# 'normClass_artfl' label of an article from count-vectorized 'content'.

# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_normClass_artfl = df[['normClass_artfl','content']].copy()

# Remove null values of class column and text column.
# NOTE(review): the return value is ignored, so remove_null_rows is presumably
# in-place -- confirm in data_preprocessing.
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_normClass_artfl, 'content')
preprocessor.remove_null_rows(df_normClass_artfl, 'normClass_artfl')
df_normClass_artfl = split_class(df_normClass_artfl, 'normClass_artfl')

minOfInstancePerClass = 200
maxOfInstancePerClass = 1500

# Remove weak classes and resample classes
df_normClass_artfl = remove_weak_classes(df_normClass_artfl, 'normClass_artfl', minOfInstancePerClass )
df_normClass_artfl = resample_classes(df_normClass_artfl, 'normClass_artfl', maxOfInstancePerClass)
preprocessor.saveDataFrametoCSV(df_normClass_artfl,'df_normClass_artfl.csv')

# Drop 'unclassified' rows BEFORE building the feature matrix. Filtering after
# extraction (as the script did previously) leaves X with more rows than y
# whenever a row is dropped, and train_test_split then rejects the
# inconsistent lengths.
df_normClass_artfl = df_normClass_artfl[df_normClass_artfl['normClass_artfl'] != 'unclassified']

# Features extraction step
#df_normClass_artfl = pd.read_csv('df_normClass_artfl.csv')
extractor = feature_extractor(df_normClass_artfl,'content', 'normClass_artfl')
X_count_vect = extractor.count_vect()
# NOTE(review): X_tf is computed but not used below; kept in case tf_idf()
# has side effects on the extractor -- confirm and remove if not.
X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()

# Preparing the train and test data (stratified so both parts share the same
# class distribution).
y = df_normClass_artfl['normClass_artfl']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )

# Fit the label mapping on the training labels only and reuse it for the test
# labels; refitting on the test labels could assign different integer ids.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y)

# Fit the model
m = MultinomialNB()
m.fit(train_x, train_y)
y_pred = m.predict(test_x)

# Evaluate model: pass the encoded label ids (as strings) and the original
# class names alongside the predictions and the encoded test labels.
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.naive_bayes import MultinomialNB
# Train and evaluate a Multinomial Naive Bayes classifier that predicts the
# '_domaine_enccre' label of an article from count-vectorized 'content'.
#
# NOTE(review): the script previously read the label column as
# '_domaine_enccre' but filtered and built y from 'domaine_enccre', which
# raises KeyError. The spelling used when reading the data is now used
# everywhere; confirm it against the TSV header.

# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_domaine_enccre = df[['_domaine_enccre','content']].copy()

# Remove null values of class column and text column.
# NOTE(review): the return value is ignored, so remove_null_rows is presumably
# in-place -- confirm in data_preprocessing.
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_domaine_enccre, 'content')
preprocessor.remove_null_rows(df_domaine_enccre, '_domaine_enccre')
df_domaine_enccre = split_class(df_domaine_enccre, '_domaine_enccre')

minOfInstancePerClass = 200
maxOfInstancePerClass = 1500

# Remove weak classes and resample classes
df_domaine_enccre = remove_weak_classes(df_domaine_enccre, '_domaine_enccre', minOfInstancePerClass )
df_domaine_enccre = resample_classes(df_domaine_enccre, '_domaine_enccre', maxOfInstancePerClass)
preprocessor.saveDataFrametoCSV(df_domaine_enccre,'df_domaine_enccre.csv')

# Drop 'unclassified' rows BEFORE building the feature matrix. Filtering after
# extraction (as the script did previously) leaves X with more rows than y
# whenever a row is dropped, and train_test_split then rejects the
# inconsistent lengths.
df_domaine_enccre = df_domaine_enccre[df_domaine_enccre['_domaine_enccre'] != 'unclassified']

# Features extraction step
#df_domaine_enccre = pd.read_csv('df_domaine_enccre.csv')
extractor = feature_extractor(df_domaine_enccre,'content', '_domaine_enccre')
X_count_vect = extractor.count_vect()
# NOTE(review): X_tf is computed but not used below; kept in case tf_idf()
# has side effects on the extractor -- confirm and remove if not.
X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()

# Preparing the train and test data (stratified so both parts share the same
# class distribution).
y = df_domaine_enccre['_domaine_enccre']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )

# Fit the label mapping on the training labels only and reuse it for the test
# labels; refitting on the test labels could assign different integer ids.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y)

# Fit the model
m = MultinomialNB()
m.fit(train_x, train_y)
y_pred = m.predict(test_x)

# Evaluate model: pass the encoded label ids (as strings) and the original
# class names alongside the predictions and the encoded test labels.
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Train and evaluate a Logistic Regression classifier (regularisation strength
# C chosen by 5-fold grid search) that predicts the 'ensemble_domaine_enccre'
# label of an article from count-vectorized 'content'.

# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_ensemble_domaine_enccre = df[['ensemble_domaine_enccre','content']].copy()

# Remove null values of class column and text column.
# NOTE(review): the return value is ignored, so remove_null_rows is presumably
# in-place -- confirm in data_preprocessing.
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'content')
preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')
#df_ensemble_domaine_enccre = split_class(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')

minOfInstancePerClass = 200
maxOfInstancePerClass = 1500

# Remove weak classes and resample classes; the create_dict output is printed
# before and after so the effect of the two steps can be compared.
print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre'))
df_ensemble_domaine_enccre = remove_weak_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', minOfInstancePerClass )
df_ensemble_domaine_enccre = resample_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', maxOfInstancePerClass)
print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre'))
#preprocessor.saveDataFrametoCSV(df_ensemble_domaine_enccre,'df_ensemble_domaine_enccre.csv')

# Drop 'unclassified' rows BEFORE building the feature matrix. Filtering after
# extraction (as the script did previously) leaves X with more rows than y
# whenever a row is dropped, and train_test_split then rejects the
# inconsistent lengths.
df_ensemble_domaine_enccre = df_ensemble_domaine_enccre[df_ensemble_domaine_enccre['ensemble_domaine_enccre'] != 'unclassified']

# Features extraction step
#df_ensemble_domaine_enccre = pd.read_csv('df_ensemble_domaine_enccre.csv')
extractor = feature_extractor(df_ensemble_domaine_enccre,'content', 'ensemble_domaine_enccre')
X_count_vect = extractor.count_vect()
#X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()

# Preparing the train and test data (stratified so both parts share the same
# class distribution).
y = df_ensemble_domaine_enccre['ensemble_domaine_enccre']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )

# Fit the label mapping on the training labels only and reuse it for the test
# labels; refitting on the test labels could assign different integer ids.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(test_y)

# Fit the model: search C over 10^-3 .. 10^3 (7 log-spaced values) with
# 5-fold cross-validation, using all available cores.
m = LogisticRegression()
param_grid_lr = {"C":np.logspace(-3,3,7)}
clf = GridSearchCV(m, param_grid = param_grid_lr, cv = 5, verbose=True, n_jobs=-1)
clf.fit(train_x, train_y)
# predict() on the fitted search delegates to the best estimator found.
y_pred = clf.predict(test_x)

# Evaluate model: pass the encoded label ids (as strings) and the original
# class names alongside the predictions and the encoded test labels.
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment