Commit db8395ca authored by Khalleud

[ADD] add scripts

parent 1c9ce02c
import pandas as pd
import numpy as np
import statistics


def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())

def remove_weak_classes(df, classColumnName, threshold):
    # Keep only the rows whose class has at least `threshold` instances.
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    return df[df[classColumnName].isin(keys)]

def split_class(df, columnProcessed):
    # An article can carry several ';'-separated classes: duplicate the row,
    # one class per copy.
    i = 0
    new_df = pd.DataFrame(columns=df.columns)
    for index, row in df.iterrows():
        cls = list(filter(None, row[columnProcessed].split(';')))
        r = row.copy()
        for categ in cls:
            r[columnProcessed] = categ
            new_df.loc[i] = r
            i = i + 1
    return new_df

def get_median_dict(d):
    return statistics.median(d.values())

def resample_classes(df, classColumnName, numberOfInstances):
    # Draw at most numberOfInstances random rows per class (without replacement).
    replace = False
    fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
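
A minimal usage sketch for the helpers above (imported elsewhere in this commit as ClassPreprocessor). The toy dataframe and the threshold values are illustrative assumptions, not part of the original scripts:

import pandas as pd

toy = pd.DataFrame({
    'class': ['Geographie', 'Geographie', 'Droit', 'Geographie;Droit'],
    'content': ['texte a', 'texte b', 'texte c', 'texte d'],
})
toy = split_class(toy, 'class')              # one class per row
print(create_dict(toy, 'class'))             # e.g. {'Geographie': 3, 'Droit': 2}
toy = remove_weak_classes(toy, 'class', 2)   # drop classes with fewer than 2 instances
toy = resample_classes(toy, 'class', 2)      # cap every class at 2 rows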
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

classifiers = [
    ('bayes', MultinomialNB()),
    ('svm', SVC()),
    ('decisionTree', DecisionTreeClassifier()),
    ('rfc', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('sgd', SGDClassifier()),
    ('knn', KNeighborsClassifier())
]
param_grid_svm = {'C':[1,10,100,1000],'gamma':[1,0.1,0.001,0.0001], 'kernel':['linear','rbf']}
param_grid_decisionTree = { 'criterion' : ['gini', 'entropy'], 'max_depth': range(1,10), 'min_samples_split': range(2,10), 'min_samples_leaf': range(1,5) }  # min_samples_split must be >= 2
param_grid_rfc = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4,5,6,7,8], 'criterion' :['gini', 'entropy'] }
param_grid_lr = {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}
param_grid_sgd = { "loss" : ["hinge", "log", "squared_hinge", "modified_huber"], "alpha" : [0.0001, 0.001, 0.01, 0.1], "penalty" : ["l2", "l1", "none"], "max_iter" : [1000]}  # grid values must be lists
param_grid_knn = {'n_neighbors' : list(range(1,20)), 'weights' : ['uniform', 'distance'], 'metric' : ['euclidean', 'manhattan'] }

grid_params = [
    ('bayes', None),
    ('svm', param_grid_svm),
    ('decisionTree', param_grid_decisionTree),
    ('rfc', param_grid_rfc),
    ('lr', param_grid_lr),
    ('sgd', param_grid_sgd),
    ('knn', param_grid_knn),
]
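
How these two lists are meant to be consumed is not shown in this file; a plausible sketch is below, where train_x and train_y are assumed to come from the feature-extraction step elsewhere in this commit:

from sklearn.model_selection import GridSearchCV

# classifiers and grid_params are defined above, in the same order
for (name, model), (_, grid) in zip(classifiers, grid_params):
    if grid is None:                      # e.g. the naive Bayes entry has no grid
        model.fit(train_x, train_y)
        print(name, model.score(train_x, train_y))
    else:
        search = GridSearchCV(model, param_grid=grid, cv=5, n_jobs=-1)
        search.fit(train_x, train_y)
        print(name, search.best_params_, search.best_score_)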
from os.path import basename, splitext
import pandas as pd
import os
from data_process.TEIFile import TEIFile


def basename_without_ext(path):
    base_name = basename(path)
    stem, ext = splitext(base_name)
    if stem.endswith('.tei'):
        # Drop the trailing '.tei' left over after splitting off the extension
        return stem[0:-4]
    else:
        return stem

def tei_to_csv_entry(tei_file, txt_file):
    print(f"Going on {tei_file}")
    tei = TEIFile(tei_file, txt_file)
    print(f"Handled {tei_file}")
    base_name = basename_without_ext(tei_file)
    return base_name, tei._text, tei._Head, tei._author, tei._Objecttype, tei._Class, tei._normclass, tei._generatedclass, tei._englishclass, tei._attribution

input_path = r'./data/EDdA/'
output_name = "corpus_tei.csv"
column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"]

df = pd.DataFrame(columns=column_names)
marge = 0
for tome in os.listdir(input_path):
    volume = tome[1:]
    for index, article in enumerate(os.listdir(input_path + tome + "/")):
        filepath = os.path.join(input_path, tome, article)
        base_name = basename_without_ext(filepath)
        df.loc[index + marge] = tei_to_csv_entry(filepath, ' ')
        # use .loc[row, column] rather than chained indexing so the assignment sticks
        df.loc[index + marge, 'articleName'] = volume + '_' + base_name
    marge += index + 1
df.to_csv(output_name, index=False)
import pandas as pd
import numpy as np
from re import search
import math
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

class Preprocessor():

    def __init__(self):
        pass

    def remove_null_rows(self, df, columnName):
        # Drop rows where columnName is missing, in place, and reindex.
        df.dropna(subset=[columnName], inplace=True)
        df.reset_index(drop=True, inplace=True)
        return

    def removeMarkers(self, df, textColumn, markerColumn='class'):
        # Remove the class marker, with and without brackets, from the article text.
        for index, row in df.iterrows():
            if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):
                marker = row[markerColumn]
                marker_with_brcts = '(' + marker + ')'
                full_text = row[textColumn].replace(marker_with_brcts, "").replace(marker, "")
                # also remove occurrences that only match once accents are stripped
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
                # iterrows() yields copies, so write the cleaned text back to the dataframe
                df.at[index, textColumn] = full_text
        return df

    def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
        # Keep only tokens whose document frequency lies between the two bounds.
        stop_words = set(stopwords.words('french'))
        analyzer = CountVectorizer().build_analyzer()
        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)
        # stop words are handled by the custom analyzer ('french' is not a valid
        # stop_words value for scikit-learn vectorizers)
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr, max_df=max_word_occurence, min_df=min_word_occurence, max_features=None)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        for index, tokens in enumerate(tokens_per_docs):
            # join the kept tokens to recreate the text with the filtered vocabulary
            df.loc[index, textColumn] = ' '.join(tokens)
        return

    def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
        # Drop articles whose token count (stop words excluded) is too small.
        stop_words = set(stopwords.words('french'))
        analyzer = CountVectorizer().build_analyzer()
        def token_fr(doc):
            return (w for w in analyzer(doc) if w not in stop_words)
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        concerned_article_index = []
        for index, tokens in enumerate(tokens_per_docs):
            if len(tokens) <= min_word_per_article:
                concerned_article_index.append(index)
        # drop(inplace=True) returns None, so do not assign its result back to df
        df.drop(index=concerned_article_index, inplace=True)
        return

    def getFirstParagraph(self, df, textColumn, columnName):
        new_column = []
        for index, row in df.iterrows():
            paragraphs = row[textColumn].split('\n \n')
            new_column.append(paragraphs[0])
        df[columnName] = new_column
        return

    def getFirstSentence(self, df, textColumn, columnName):
        sent = []
        for index, row in df.iterrows():
            sentences = nltk.sent_tokenize(row[textColumn])
            sent.append(sentences[0])
        df[columnName] = sent
        return

    def saveDataFrametoCSV(self, df, pathName):
        df.to_csv(pathName)
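
A short usage sketch of the class above. The input path mirrors the one used by the training scripts further down; the token threshold and the output filename are illustrative assumptions:

import nltk
import pandas as pd

nltk.download('punkt')        # sentence tokenizer used by getFirstSentence
nltk.download('stopwords')    # French stop words used by the frequency filters

df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep='\t')
prep = Preprocessor()
prep.remove_null_rows(df, 'content')
prep.getFirstParagraph(df, 'content', 'firstParagraph')
prep.getFirstSentence(df, 'content', 'firstSentence')
prep.removeArticlesByTokensNumbers(df, 'content', 15)   # threshold chosen arbitrarily
prep.saveDataFrametoCSV(df, 'EDdA_preprocessed.csv')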
from data_process.data_functions import read_tei

class TEIFile(object):
    def __init__(self, filename, textfilename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._Head = ''
        self._Objecttype = ''
        self._attribution = ''
        self._Class = ''
        self._normclass = ''
        self._englishclass = ''
        self._generatedclass = ''
        self._author = ''
        # Each metadata field is read from an <index type="..."> element of the TEI document.
        if self.soup.find('index', type='head'):
            self._Head = self.soup.find('index', type='head')['value']
        if self.soup.find('index', type='objecttype'):
            self._Objecttype = self.soup.find('index', type='objecttype')['value']
        if self.soup.find('index', type='attribution'):
            self._attribution = self.soup.find('index', type='attribution')['value']
        if self.soup.find('index', type='class') and self.soup.find('index', type='class').has_attr('value'):
            self._Class = self.soup.find('index', type='class')['value']
        if self.soup.find('index', type='normclass'):
            self._normclass = self.soup.find('index', type='normclass')['value']
        if self.soup.find('index', type='englishclass'):
            self._englishclass = self.soup.find('index', type='englishclass')['value']
        if self.soup.find('index', type='generatedclass'):
            self._generatedclass = self.soup.find('index', type='generatedclass')['value']
        if self.soup.find('index', type='author'):
            self._author = self.soup.find('index', type='author')['value']
        # The article body: every <p> element except the first one.
        ps = self.soup.find_all('p')
        Texts = []
        for p in ps[1:]:
            Texts.append(p.getText())
        self._text = ' '.join(Texts)
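
A minimal sketch of how the class is instantiated; the file path is hypothetical, and the second argument mirrors the ' ' placeholder that the corpus-building script above passes:

tei = TEIFile('./data/EDdA/T1/article1.tei.xml', ' ')   # hypothetical path under data/EDdA/
print(tei._Head, tei._Class)      # head word and class marker, empty string if absent
print(tei._text[:200])            # start of the reconstructed article body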

from bs4 import BeautifulSoup


def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')


def elem_to_text(elem, default=''):
    if elem:
        return elem.getText(separator=' ', strip=True)
    else:
        return default

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


def evaluate_model(y_pred, valid_y, classes, classesName):
    precision = []
    recall = []
    f1 = []
    support = []
    weighted_avg = None
    accuracy = None
    df = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
    # classification_report expects (y_true, y_pred) in that order
    report = classification_report(valid_y, y_pred, output_dict=True)
    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])
    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']
    cnf_matrix = confusion_matrix(valid_y, y_pred)
    # Per-class false positives, false negatives, true positives and true negatives
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
    df['className'] = classesName
    df['precision'] = precision
    df['recall'] = recall
    df['f1-score'] = f1
    df['support'] = support
    df['FP'] = FP
    df['FN'] = FN
    df['TP'] = TP
    df['TN'] = TN
    return df, accuracy, weighted_avg
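
A small self-contained check of the function above on toy labels; the arrays and class names are made up for illustration:

import numpy as np

valid_y = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0])
classes = ['0', '1', '2']                      # keys as they appear in the report dict
names = ['Geographie', 'Droit', 'Histoire']    # human-readable labels, illustrative

report_df, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, classes, names)
print(report_df)
print(accuracy, weighted_avg['f1-score'])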
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
#y_true = [2, 0, 2, 2, 0, 1]
#y_pred = [0, 0, 2, 2, 0, 2]
#cf_matrix = confusion_matrix(y_true, y_pred)
#sns.heatmap(cf_matrix, annot=True)
#import matplotlib.pyplot as plt
#plt.show()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

class feature_extractor:

    def __init__(self, data, column, target):
        self.column = column
        self.data = data
        self.X = data[column]
        self.y = data[target]
        self.docs = []
        for index, row in data.iterrows():
            self.docs.append(row[column])

    def count_vect(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        # Bag-of-words counts over stemmed French tokens, stop words removed.
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()
        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)
        # stop words are handled by the custom analyzer ('french' is not a valid
        # stop_words value for scikit-learn vectorizers)
        stem_vectorizer_fr = CountVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        stem_vectorizer_fr.fit(self.docs)
        return stem_vectorizer_fr.transform(self.docs)

    def tf_idf(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        # tf-idf weights over stemmed French tokens, stop words removed.
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = TfidfVectorizer().build_analyzer()
        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if w not in stop_words)
        tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        tfidf_vectorizer.fit(self.docs)
        return tfidf_vectorizer.transform(self.docs)

    def doc2vec(self, max_epochs, vec_size, alpha, dm=1):
        # Train a gensim Doc2Vec model (gensim < 4.0 API) and return one vector per document.
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=dm)
        model.build_vocab(tagged_data)
        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        set_tags = list(model.docvecs.doctags)
        nb_docs_small = len(set_tags)
        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
        i = 0
        for t in set_tags:
            doc_vec_doc2vec[i] = model.docvecs[t]
            i += 1
        return doc_vec_doc2vec

    def text_based_features(self):
        # Classical surface measures computed directly from the raw text.
        df = pd.DataFrame(columns=['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count'])
        df['char_count'] = self.data[self.column].apply(len)
        df['word_count'] = self.data[self.column].apply(lambda x: len(x.split()))
        df['word_density'] = df['char_count'] / (df['word_count'] + 1)
        df['punctuation_count'] = self.data[self.column].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
        df['title_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
        df['upper_case_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
        return df
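
A short sketch of the class above on a toy corpus; the two rows are purely illustrative:

import pandas as pd

# assumes the nltk French stop-word list has been downloaded (nltk.download('stopwords'))
toy = pd.DataFrame({
    'content': ["Ville de France sur la Seine.", "Terme de droit, se dit d'un contrat."],
    'class': ['Geographie', 'Droit'],
})
extractor = feature_extractor(toy, 'content', 'class')
X_counts = extractor.count_vect()             # sparse bag-of-words matrix
X_tfidf = extractor.tf_idf()                  # sparse tf-idf matrix
X_surface = extractor.text_based_features()   # dense dataframe of surface statistics
print(X_counts.shape, X_tfidf.shape, X_surface.shape)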
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.naive_bayes import MultinomialNB
# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_normClass_artfl = df[['normClass_artfl','content']].copy()
#remove null values of class column and text column
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_normClass_artfl, 'content')
preprocessor.remove_null_rows(df_normClass_artfl, 'normClass_artfl')
df_normClass_artfl = split_class(df_normClass_artfl, 'normClass_artfl')
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500
#remove weak classes and resample classes
df_normClass_artfl = remove_weak_classes(df_normClass_artfl, 'normClass_artfl', minOfInstancePerClass )
df_normClass_artfl = resample_classes(df_normClass_artfl, 'normClass_artfl', maxOfInstancePerClass)
preprocessor.saveDataFrametoCSV(df_normClass_artfl,'df_normClass_artfl.csv')
#features extraction step
#df_normClass_artfl = pd.read_csv('df_normClass_artfl.csv')
extractor = feature_extractor(df_normClass_artfl,'content', 'normClass_artfl')
X_count_vect = extractor.count_vect()
X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()
# preparing the train and test data
df_normClass_artfl = df_normClass_artfl[df_normClass_artfl['normClass_artfl'] != 'unclassified']
y = df_normClass_artfl['normClass_artfl']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# fit the encoder on the training labels only, then reuse the same mapping for the test labels
valid_y = encoder.transform(test_y)
# fit the model
m = MultinomialNB()
m.fit(train_x, train_y)
y_pred = m.predict(test_x)
#evaluate model
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.naive_bayes import MultinomialNB
# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_domaine_enccre = df[['_domaine_enccre','content']].copy()
#remove null values of class column and text column
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_domaine_enccre, 'content')
preprocessor.remove_null_rows(df_domaine_enccre, '_domaine_enccre')
df_domaine_enccre = split_class(df_domaine_enccre, '_domaine_enccre')
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500
#remove weak classes and resample classes
df_domaine_enccre = remove_weak_classes(df_domaine_enccre, '_domaine_enccre', minOfInstancePerClass )
df_domaine_enccre = resample_classes(df_domaine_enccre, '_domaine_enccre', maxOfInstancePerClass)
preprocessor.saveDataFrametoCSV(df_domaine_enccre,'df_domaine_enccre.csv')
#features extraction step
#df_domaine_enccre = pd.read_csv('df_domaine_enccre.csv')
extractor = feature_extractor(df_domaine_enccre,'content', '_domaine_enccre')
X_count_vect = extractor.count_vect()
X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()
# preparing the train and test data
df_domaine_enccre = df_domaine_enccre[df_domaine_enccre['_domaine_enccre'] != 'unclassified']
y = df_domaine_enccre['_domaine_enccre']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# fit the encoder on the training labels only, then reuse the same mapping for the test labels
valid_y = encoder.transform(test_y)
# fit the model
m = MultinomialNB()
m.fit(train_x, train_y)
y_pred = m.predict(test_x)
#evaluate model
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# Reading data
df = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
df_ensemble_domaine_enccre = df[['ensemble_domaine_enccre','content']].copy()
#remove null values of class column and text column
preprocessor = Preprocessor()
preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'content')
preprocessor.remove_null_rows(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')
#df_ensemble_domaine_enccre = split_class(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre')
minOfInstancePerClass = 200
maxOfInstancePerClass = 1500
#remove weak classes and resample classes
print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre'))
df_ensemble_domaine_enccre = remove_weak_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', minOfInstancePerClass )
df_ensemble_domaine_enccre = resample_classes(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre', maxOfInstancePerClass)
print(create_dict(df_ensemble_domaine_enccre, 'ensemble_domaine_enccre'))
#preprocessor.saveDataFrametoCSV(df_ensemble_domaine_enccre,'df_ensemble_domaine_enccre.csv')
#features extraction step
#df_ensemble_domaine_enccre = pd.read_csv('df_ensemble_domaine_enccre.csv')
extractor = feature_extractor(df_ensemble_domaine_enccre,'content', 'ensemble_domaine_enccre')
X_count_vect = extractor.count_vect()
#X_tf = extractor.tf_idf()
#X_doc2vec = extractor.doc2vec(10, 20, 0.025)
#X_text_feature = extractor.text_based_features()
# preparing the train and test data
df_ensemble_domaine_enccre = df_ensemble_domaine_enccre[df_ensemble_domaine_enccre['ensemble_domaine_enccre'] != 'unclassified']
y = df_ensemble_domaine_enccre['ensemble_domaine_enccre']
train_x, test_x, train_y, test_y = train_test_split(X_count_vect, y, test_size=0.33, random_state=42, stratify = y )
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
# fit the encoder on the training labels only, then reuse the same mapping for the test labels
valid_y = encoder.transform(test_y)
# fit the model
m = LogisticRegression() #MultinomialNB()
#m.fit(train_x, train_y)
param_grid_lr = {"C":np.logspace(-3,3,7)}
clf = GridSearchCV(m, param_grid = param_grid_lr, cv = 5, verbose=True, n_jobs=-1)
# Fit on data
best_clf = clf.fit(train_x, train_y)
y_pred = clf.predict(test_x)
#evaluate model
report, accuracy, weighted_avg = evaluate_model(y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_)
print(report)
print('accuracy : {}'.format(accuracy))
print('weighted_Precision : {}'.format(weighted_avg['precision']))
print('weighted_Recall : {}'.format(weighted_avg['recall']))
print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
print('weighted_Support : {}'.format(weighted_avg['support']))
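
The script prints evaluation metrics but never inspects the grid search itself; if that is wanted, the fitted search object exposes the selected hyper-parameter and cross-validation score (a small illustrative addition, not in the original):

print('best C : {}'.format(best_clf.best_params_['C']))
print('best CV score : {}'.format(best_clf.best_score_))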
beautifulsoup4
lxml
Unidecode==1.2.0
# other packages imported by the scripts in this commit
pandas
numpy
scikit-learn
nltk
gensim
matplotlib
seaborn