Commit edd2e616 authored by Ludovic Moncla

Merge branch 'branch_v1' into 'master'

Branch v1

See merge request !1
parents 1c9ce02c 39ca3e0a
Showing 871 additions and 0 deletions
import pandas as pd
import numpy as np
import statistics
def create_dict(df, classColumnName):
    return dict(df[classColumnName].value_counts())

def remove_weak_classes(df, classColumnName, threshold):
    # Keep only the classes that have at least `threshold` instances
    dictOfClassInstances = create_dict(df, classColumnName)
    dictionary = {k: v for k, v in dictOfClassInstances.items() if v >= threshold}
    keys = [*dictionary]
    df = df[df[classColumnName].isin(keys)].reset_index(drop=True)
    return df

def split_class(df, columnProcessed):
    # Duplicate each row carrying several ';'-separated classes into one row per class
    i = 0
    new_df = pd.DataFrame(columns=df.columns)
    for index, row in df.iterrows():
        cls = list(filter(None, row[columnProcessed].split(';')))
        r = row.copy()
        for categ in cls:
            r[columnProcessed] = categ
            new_df.loc[i] = r
            i = i + 1
    return new_df

def get_median_dict(d):
    return statistics.median(d.values())

def resample_classes(df, classColumnName, numberOfInstances):
    # Randomly sample at most `numberOfInstances` rows per class, without replacement
    replace = False
    fn = lambda obj: obj.loc[np.random.choice(obj.index, min(len(obj), numberOfInstances), replace), :]
    return df.groupby(classColumnName, as_index=False).apply(fn)
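
# Illustrative usage sketch (not part of the original module): 'corpus.csv',
# the column name 'class' and the thresholds below are placeholder values.
if __name__ == '__main__':
    sample = pd.read_csv('corpus.csv')
    # drop classes with fewer than 50 articles, then cap every class at 500 rows
    sample = remove_weak_classes(sample, 'class', threshold=50)
    sample = resample_classes(sample, 'class', numberOfInstances=500)
    print(create_dict(sample, 'class'))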
from data_process.data_functions import read_tei, elem_to_text, basename_without_ext, tei_to_csv_entry
from data_process.TEIFile import TEIFile
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
classifiers = [
    ('bayes', MultinomialNB()),
    ('svm', SVC()),
    ('decisionTree', DecisionTreeClassifier()),
    ('rfc', RandomForestClassifier()),
    ('lr', LogisticRegression()),
    ('sgd', SGDClassifier()),
    ('knn', KNeighborsClassifier())
]

param_grid_svm = {'C': [1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf']}
param_grid_decisionTree = {'criterion': ['gini', 'entropy'], 'max_depth': range(5, 10), 'min_samples_split': range(5, 10), 'min_samples_leaf': range(1, 5)}
param_grid_rfc = {'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 5, 6, 7, 8], 'criterion': ['gini', 'entropy']}
param_grid_lr = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
param_grid_sgd = {"loss": ["hinge", "log", "squared_hinge", "modified_huber"], "alpha": [0.0001, 0.001, 0.01, 0.1], "penalty": ["l2", "l1", "none"], "max_iter": [500]}
param_grid_knn = {'n_neighbors': list(range(1, 20)), 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']}

grid_params = [
    ('bayes', None),
    ('svm', param_grid_svm),
    ('decisionTree', param_grid_decisionTree),
    ('rfc', param_grid_rfc),
    ('lr', param_grid_lr),
    ('sgd', param_grid_sgd),
    ('knn', param_grid_knn),
]
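
# Illustrative sketch (not part of the original module) of how the experiment script
# pairs `classifiers` with `grid_params`; the synthetic data below is a placeholder.
if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.model_selection import GridSearchCV

    X_demo, y_demo = make_classification(n_samples=120, n_features=10, n_classes=3, n_informative=5, random_state=0)
    for (clf_name, clf), (grid_name, grid) in zip(classifiers, grid_params):
        assert clf_name == grid_name  # the two lists are kept in the same order
        if clf_name != 'svm':
            continue  # fit just one model here to keep the demo short
        search = GridSearchCV(clf, grid, refit=True, cv=3)
        search.fit(X_demo, y_demo)
        print(clf_name, search.best_params_)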
from os import path
from os.path import basename, splitext
import pandas as pd
import os
from data_process.TEIFile import TEIFile

def basename_without_ext(file_path):
    base_name = basename(file_path)
    stem, ext = splitext(base_name)
    if stem.endswith('.tei'):
        # drop the '.tei' part of a '.tei.xml' file name
        return stem[0:-4]
    else:
        return stem

def tei_to_csv_entry(tei_file, txt_file):
    print(f"Going on {tei_file}")
    tei = TEIFile(tei_file, txt_file)
    print(f"Handled {tei_file}")
    base_name = basename_without_ext(tei_file)
    return base_name, tei._text, tei._Head, tei._author, tei._Objecttype, tei._Class, tei._normclass, tei._generatedclass, tei._englishclass, tei._attribution

input_path = r'./data/EDdA/'
output_name = "corpus_tei.csv"
column_names = ["articleName", "text", "head", "author", "objecttype", "class", "normclass", "generatedclass", "englishclass", "attribution"]

df = pd.DataFrame(columns=column_names)
marge = 0
for tome in os.listdir(input_path):
    volume = tome[1:]
    for index, article in enumerate(os.listdir(input_path + tome + "/")):
        filepath = os.path.join(input_path, tome, article)
        base_name = basename_without_ext(filepath)
        df.loc[index + marge] = tei_to_csv_entry(filepath, ' ')
        # use .loc with both labels so the assignment is not done on a copy
        df.loc[index + marge, 'articleName'] = volume + '_' + base_name
    marge += index + 1

df.to_csv(output_name, index=False)
import pandas as pd
import numpy as np
from re import search
import math
from unidecode import unidecode
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import nltk

class Preprocessor():

    def __init__(self):
        pass

    def remove_null_rows(self, df, columnName):
        df.dropna(subset=[columnName], inplace=True)
        df.reset_index(drop=True, inplace=True)
        return

    def removeMarkers(self, df, textColumn, markerColumn='class'):
        # Remove the class marker (with and without brackets) from each article text
        for index, row in df.iterrows():
            if not pd.isna(row[markerColumn]) and not pd.isna(row[textColumn]):
                marker = row[markerColumn]
                marker_with_brcts = '(' + marker + ')'
                full_text = row[textColumn].replace(marker_with_brcts, "").replace(marker, "")
                # also remove accented variants of the marker (compare on unidecoded text)
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], "", full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
                # write back through the dataframe: mutating `row` would only change a copy
                df.at[index, textColumn] = full_text
        return df

    def removeWordsByFrequency(self, df, textColumn, min_word_occurence, max_word_occurence):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if not w in stop_words)

        # stop words are filtered inside the custom analyzer
        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr, max_df=max_word_occurence, min_df=min_word_occurence, max_features=None)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        for index, tokens in enumerate(tokens_per_docs):
            # join tokens to recreate the text with the remaining vocabulary
            new_text = ' '.join(tokens)
            df.loc[index, textColumn] = new_text
        return

    def removeArticlesByTokensNumbers(self, df, textColumn, min_word_per_article):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def token_fr(doc):
            return (w for w in analyzer(doc) if not w in stop_words)

        stem_vectorizer_fr = CountVectorizer(analyzer=token_fr)
        docs = []
        for index, row in df.iterrows():
            docs.append(row[textColumn])
        stem_vectorizer_fr.fit(docs)
        featured_docs = stem_vectorizer_fr.transform(docs)
        tokens_per_docs = stem_vectorizer_fr.inverse_transform(featured_docs)
        concerned_article_index = []
        for index, tokens in enumerate(tokens_per_docs):
            if len(tokens) <= min_word_per_article:
                concerned_article_index.append(index)
        df.drop(index=concerned_article_index, inplace=True)
        return

    def getFirstParagraph(self, df, textColumn, columnName):
        new_column = []
        for index, row in df.iterrows():
            paragraphs = row[textColumn].split('\n \n')
            new_column.append(paragraphs[0])
        df[columnName] = new_column
        return

    def getFirstSentence(self, df, textColumn, columnName):
        sent = []
        for index, row in df.iterrows():
            sentences = nltk.sent_tokenize(row[textColumn])
            sent.append(sentences[0])
        df[columnName] = sent
        return

    def saveDataFrametoCSV(self, df, pathName):
        df.to_csv(pathName)
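
# Illustrative usage sketch (not part of the original module): the file names and
# column names below mirror those used elsewhere in this commit but are placeholders.
if __name__ == '__main__':
    p = Preprocessor()
    articles = pd.read_csv('corpus_tei.csv')
    p.remove_null_rows(articles, 'text')
    p.remove_null_rows(articles, 'class')
    articles = p.removeMarkers(articles, 'text', markerColumn='class')
    p.getFirstParagraph(articles, 'text', 'paragraphe')
    p.saveDataFrametoCSV(articles, 'corpus_tei_preprocessed.csv')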
from data_process.data_functions import read_tei

class TEIFile(object):

    def __init__(self, filename, textfilename):
        self.filename = filename
        self.soup = read_tei(filename)
        self._text = None
        self._Head = ''
        self._Objecttype = ''
        self._attribution = ''
        self._Class = ''
        self._normclass = ''
        self._englishclass = ''
        self._generatedclass = ''
        self._author = ''
        if self.soup.find('index', type='head'):
            self._Head = self.soup.find('index', type='head')['value']
        if self.soup.find('index', type='objecttype'):
            self._Objecttype = self.soup.find('index', type='objecttype')['value']
        if self.soup.find('index', type='attribution'):
            self._attribution = self.soup.find('index', type='attribution')['value']
        if self.soup.find('index', type='class') and self.soup.find('index', type='class').has_attr('value'):
            self._Class = self.soup.find('index', type='class')['value']
        if self.soup.find('index', type='normclass'):
            self._normclass = self.soup.find('index', type='normclass')['value']
        if self.soup.find('index', type='englishclass'):
            self._englishclass = self.soup.find('index', type='englishclass')['value']
        if self.soup.find('index', type='generatedclass'):
            self._generatedclass = self.soup.find('index', type='generatedclass')['value']
        if self.soup.find('index', type='author'):
            self._author = self.soup.find('index', type='author')['value']
        ps = self.soup.find_all('p')
        Texts = []
        for p in ps[1:]:
            Texts.append(p.getText())
        self._text = ' '.join(Texts)
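
# Illustrative usage sketch (not part of the original module): the file path below is
# a placeholder; in this commit the TEI-to-CSV conversion script builds its rows from
# instances of this class.
if __name__ == '__main__':
    tei = TEIFile('data/EDdA/T1/article0001.tei', ' ')
    print(tei._Head, tei._author, tei._Class)
    print(tei._text[:200])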
from bs4 import BeautifulSoup

def read_tei(tei_file):
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'lxml')
    if soup is not None:
        return soup
    raise RuntimeError('Cannot generate a soup from the input')

def elem_to_text(elem, default=''):
    if elem:
        return elem.getText(separator=' ', strip=True)
    else:
        return default
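
# Illustrative usage sketch (not part of the original module): the file path is a
# placeholder; it shows how read_tei() and elem_to_text() are meant to be combined.
if __name__ == '__main__':
    soup = read_tei('data/EDdA/T1/article0001.tei')
    first_p = soup.find('p')
    print(elem_to_text(first_p, default='<no paragraph found>'))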
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pandas as pd
import seaborn as sns

def evaluate_model(clf, X_test, y_test, y_pred, valid_y, classes, classesName, pathSave):
    precision = []
    recall = []
    f1 = []
    support = []
    weighted_avg = None
    accuracy = None
    df = pd.DataFrame(columns=['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
    # classification_report expects (y_true, y_pred)
    report = classification_report(valid_y, y_pred, output_dict=True)
    for c in classes:
        precision.append(report[c]['precision'])
        recall.append(report[c]['recall'])
        f1.append(report[c]['f1-score'])
        support.append(report[c]['support'])
    accuracy = report['accuracy']
    weighted_avg = report['weighted avg']
    cnf_matrix = confusion_matrix(valid_y, y_pred)
    # per-class counts derived from the confusion matrix
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)
    df['className'] = classesName
    df['precision'] = precision
    df['recall'] = recall
    df['f1-score'] = f1
    df['support'] = support
    df['FP'] = FP
    df['FN'] = FN
    df['TP'] = TP
    df['TN'] = TN
    plt.rcParams["font.size"] = 3
    plot_confusion_matrix(clf, X_test, y_test)
    plt.savefig(pathSave)
    return df, accuracy, weighted_avg
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
#y_true = [2, 0, 2, 2, 0, 1]
#y_pred = [0, 0, 2, 2, 0, 2]
#cf_matrix = confusion_matrix(y_true, y_pred)
#sns.heatmap(cf_matrix, annot=True)
#import matplotlib.pyplot as plt
#plt.show()
import sys
import os
import time
import argparse
import pandas as pd
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.model_selection import GridSearchCV
import configparser
parser = argparse.ArgumentParser()
parser.add_argument("dataPath", help="path of the dataframe")
parser.add_argument("columnText", help="name of the column containing the text to preprocess", default='content')
parser.add_argument("columnClass", help="name of the column containing the classes")
parser.add_argument("minOfInstancePerClass", help="minimum number of instances required for a class to be kept", type=int)
parser.add_argument("maxOfInstancePerClass", help="maximum number of instances kept per class when resampling", type=int)
args = parser.parse_args()
dataPath = args.dataPath
columnText = args.columnText
columnClass = args.columnClass
minOfInstancePerClass = args.minOfInstancePerClass
maxOfInstancePerClass = args.maxOfInstancePerClass
# create a directory under reports/ to store the classification results
dir_name_report = str(minOfInstancePerClass) + '_' + str(maxOfInstancePerClass)
if not os.path.exists(os.path.join('reports', columnClass, dir_name_report)):
    os.makedirs(os.path.join('reports', columnClass, dir_name_report))
# Reading data and preprocessings steps
preprocessor = Preprocessor()
df_original = pd.read_csv(dataPath)
df = df_original[[columnClass,columnText]].copy()
preprocessor.remove_null_rows(df, columnText)
preprocessor.remove_null_rows(df, columnClass)
#df = split_class(df, columnClass)
df = remove_weak_classes(df, columnClass, minOfInstancePerClass )
df = resample_classes(df, columnClass, maxOfInstancePerClass)
preprocessor.getFirstParagraph(df, columnText, 'paragraphe')  # extract the first paragraph of each text
# Read the configuration file to retrieve the feature extractor parameters
config = configparser.ConfigParser()
config.read('settings.conf')
vectorization_max_df = int(config.get('vectorizers','vectorization_max_df')) if config.get('vectorizers','vectorization_max_df').isdigit() else float(config.get('vectorizers','vectorization_max_df'))
vectorization_min_df = int(config.get('vectorizers','vectorization_min_df')) if config.get('vectorizers','vectorization_min_df').isdigit() else float(config.get('vectorizers','vectorization_min_df'))
vectorization_numberOfFeatures = int(config.get('vectorizers','vectorization_numberOfFeatures')) if config.get('vectorizers','vectorization_numberOfFeatures').isdigit() else None
doc2vec_vec_size = int(config.get('vectorizers','doc2vec_vec_size'))
doc2vec_epochs = int(config.get('vectorizers','doc2vec_epochs'))
doc2vec_lr = float(config.get('vectorizers','doc2vec_lr'))
# prepare data: drop unclassified articles before feature extraction so that the
# feature matrices and the target vector stay aligned
df = df[df[columnClass] != 'unclassified'].reset_index(drop=True)
y = df[columnClass]

extractor = feature_extractor(df, columnText, columnClass)
extractor_paragraphe = feature_extractor(df, 'paragraphe', columnClass)

features_techniques = [
    ('counter', extractor.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('tf_idf', extractor.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('doc2vec', extractor.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]

features_techniques_paragraphe = [
    ('counter', extractor_paragraphe.count_vect(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('tf_idf', extractor_paragraphe.tf_idf(max_df=vectorization_max_df, min_df=vectorization_min_df, numberOfFeatures=vectorization_numberOfFeatures)),
    ('doc2vec', extractor_paragraphe.doc2vec(doc2vec_epochs, doc2vec_vec_size, doc2vec_lr))]
#case of full text
for feature_technique_name, features in features_techniques:
    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    # use transform (not fit_transform) so the test labels share the training encoding
    valid_y = encoder.transform(test_y)

    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
        clf_name, clf = tmp_clf
        grid_param_name, grid_param = tmp_grid_params
        print(clf_name, clf, grid_param_name, grid_param)
        if clf_name == 'bayes':
            if feature_technique_name == 'doc2vec':
                # MultinomialNB cannot handle the negative values produced by doc2vec
                continue
            else:
                t_begin = time.time()
                clf.fit(train_x, train_y)
                t_end = time.time()
                training_time = t_end - t_begin
                y_pred = clf.predict(test_x)
        else:
            clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()
            training_time = t_end - t_begin
            y_pred = clf.predict(test_x)

        # evaluate the model
        file_name_report = feature_technique_name + '_' + clf_name
        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report) + '.pdf')

        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report + '.txt'), 'w') as f:
            sys.stdout = f  # redirect standard output to the report file
            print(report)
            print('accuracy : {}'.format(accuracy))
            print('weighted_Precision : {}'.format(weighted_avg['precision']))
            print('weighted_Recall : {}'.format(weighted_avg['recall']))
            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
            print('weighted_Support : {}'.format(weighted_avg['support']))
            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
            print('training time : {}'.format(training_time))
            sys.stdout = sys.__stdout__  # reset the standard output to its original value
# case of the first paragraph only
for feature_technique_name, features in features_techniques_paragraphe:
    train_x, test_x, train_y, test_y = train_test_split(features, y, test_size=0.33, random_state=42, stratify=y)
    encoder = preprocessing.LabelEncoder()
    train_y = encoder.fit_transform(train_y)
    valid_y = encoder.transform(test_y)

    for tmp_clf, tmp_grid_params in zip(classifiers, grid_params):
        clf_name, clf = tmp_clf
        grid_param_name, grid_param = tmp_grid_params
        if clf_name == 'bayes':
            if feature_technique_name == 'doc2vec':
                continue
            else:
                t_begin = time.time()
                clf.fit(train_x, train_y)
                t_end = time.time()
                training_time = t_end - t_begin
                y_pred = clf.predict(test_x)
        else:
            clf = GridSearchCV(clf, grid_param, refit=True, verbose=3)
            t_begin = time.time()
            clf.fit(train_x, train_y)
            t_end = time.time()
            training_time = t_end - t_begin
            y_pred = clf.predict(test_x)

        # evaluate the model
        file_name_report_paragraphe = feature_technique_name + '_paragraphe_' + clf_name
        report, accuracy, weighted_avg = evaluate_model(clf, test_x, valid_y, y_pred, valid_y, [str(e) for e in encoder.transform(encoder.classes_)], encoder.classes_, os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe) + '.pdf')

        with open(os.path.join('reports', columnClass, dir_name_report, file_name_report_paragraphe + '.txt'), 'w') as f:
            sys.stdout = f  # redirect standard output to the report file
            print(report)
            print('accuracy : {}'.format(accuracy))
            print('weighted_Precision : {}'.format(weighted_avg['precision']))
            print('weighted_Recall : {}'.format(weighted_avg['recall']))
            print('weighted_F-score : {}'.format(weighted_avg['f1-score']))
            print('weighted_Support : {}'.format(weighted_avg['support']))
            print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
            print('training time : {}'.format(training_time))
            sys.stdout = sys.__stdout__  # reset the standard output to its original value
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import pandas as pd
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

class feature_extractor:

    def __init__(self, data, column, target):
        self.column = column
        self.data = data
        self.X = data[column]
        self.y = data[target]
        self.docs = []
        for index, row in data.iterrows():
            self.docs.append(row[column])

    def count_vect(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = CountVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)

        # stop words are filtered inside the custom analyzer
        stem_vectorizer_fr = CountVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        stem_vectorizer_fr.fit(self.docs)
        return stem_vectorizer_fr.transform(self.docs)

    def tf_idf(self, max_df=1.0, min_df=1, numberOfFeatures=None):
        stop_words = set(stopwords.words('french'))
        stemmer_fr = SnowballStemmer("french")
        analyzer = TfidfVectorizer().build_analyzer()

        def stemmed_words_fr(doc):
            return (stemmer_fr.stem(w) for w in analyzer(doc) if not w in stop_words)

        tfidf_vectorizer = TfidfVectorizer(analyzer=stemmed_words_fr, max_df=max_df, min_df=min_df, max_features=numberOfFeatures)
        tfidf_vectorizer.fit(self.docs)
        return tfidf_vectorizer.transform(self.docs)

    def doc2vec(self, max_epochs, vec_size, alpha=0.025, dm=1):
        tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(self.docs)]
        model = Doc2Vec(vector_size=vec_size, alpha=alpha, min_alpha=0.00025, min_count=1, dm=dm)
        model.build_vocab(tagged_data)
        for epoch in range(max_epochs):
            print('iteration {0}'.format(epoch))
            model.train(tagged_data, total_examples=model.corpus_count, epochs=model.iter)
            # decrease the learning rate
            model.alpha -= 0.0002
            # fix the learning rate, no decay
            model.min_alpha = model.alpha
        set_tags = list(model.docvecs.doctags)
        nb_docs_small = len(set_tags)
        doc_vec_doc2vec = np.zeros(shape=(nb_docs_small, vec_size))
        i = 0
        for t in set_tags:
            doc_vec_doc2vec[i] = model.docvecs[t]
            i += 1
        return doc_vec_doc2vec

    def text_based_features(self):
        # Classical surface measures computed on the raw text
        df = pd.DataFrame(columns=['char_count', 'word_count', 'word_density', 'punctuation_count', 'title_word_count', 'upper_case_word_count'])
        df['char_count'] = self.data[self.column].apply(len)
        df['word_count'] = self.data[self.column].apply(lambda x: len(x.split()))
        df['word_density'] = df['char_count'] / (df['word_count'] + 1)
        df['punctuation_count'] = self.data[self.column].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
        df['title_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
        df['upper_case_word_count'] = self.data[self.column].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))
        return df
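
# Illustrative usage sketch (not part of the original module): 'corpus.csv', 'content'
# and 'class' are placeholder names for a dataframe with a text column and a class column.
if __name__ == '__main__':
    corpus = pd.read_csv('corpus.csv')
    extractor = feature_extractor(corpus, 'content', 'class')
    X_counts = extractor.count_vect(max_df=1.0, min_df=1)    # sparse bag-of-words matrix
    X_tfidf = extractor.tf_idf(max_df=1.0, min_df=1)         # sparse tf-idf matrix
    X_d2v = extractor.doc2vec(max_epochs=10, vec_size=300)   # dense doc2vec matrix
    print(X_counts.shape, X_tfidf.shape, X_d2v.shape)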
beautifulsoup4
lxml
Unidecode
Unidecode==1.2.0
Keras==2.4.3
Keras-Preprocessing==1.1.2
sentence-transformers==0.4.1.2
transformers==4.3.2
torch==1.8.1
torchvision==0.8.2
tokenizers==0.10.1
regex==2018.1.10
tensorflow==2.2.0
gensim==3.8.1
mkdir -p reports/domaine_enccre
mkdir -p reports/ensemble_domaine_enccre
mkdir -p reports/normClass_artfl
pip install -r requirements.txt
python tmp_preprocess_data.py
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 50 800
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content ensemble_domaine_enccre 100 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 50 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content domaine_enccre 300 500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 300 1500
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 2000
python experimentsClassicClassifiers.py data/EDdA_dataframe_withContent.tsv content normClass_artfl 50 500
[vectorizers]
vectorization_max_df= 1.0
vectorization_min_df= 1
vectorization_numberOfFeatures= None
doc2vec_vec_size = 300
doc2vec_epochs = 10
doc2vec_lr = 0.025
min_word_per_article = 4
import sys
import os
import time
import argparse
import pandas as pd
import numpy as np
from data_preprocessing import Preprocessor
from features_extractor import feature_extractor
from ClassPreprocessor import remove_weak_classes, resample_classes, create_dict, split_class
from classifiers import classifiers, grid_params
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from evaluate_model import evaluate_model
from sklearn.model_selection import GridSearchCV
import configparser
from re import search
import math
from unidecode import unidecode
import re
import nltk
def removeMarkers(df, textColumn, listOfMarkers):
    # Remove every known class marker (with and without brackets) from the article texts
    tmp = 0
    for index, row in df.iterrows():
        tmp += 1
        print(tmp)
        if not pd.isna(row[textColumn]):
            full_text = row[textColumn]
            for m in listOfMarkers:
                if pd.isna(m):
                    continue
                marker = str(m)
                marker_with_brcts = '(' + marker + ')'
                full_text = full_text.replace(marker_with_brcts, "").replace(marker, "")
                # also remove accented variants of the marker (compare on unidecoded text)
                i = unidecode(full_text).find(marker_with_brcts)
                while i != -1:
                    full_text = "".join((full_text[:i], "", full_text[i + len(marker_with_brcts):]))
                    i = unidecode(full_text).find(marker_with_brcts)
            # write back through the dataframe: mutating `row` would only change a copy
            df.at[index, textColumn] = full_text
    return df
# Reading data and preprocessings steps
preprocessor = Preprocessor()
df = pd.read_csv('corpus_tei.csv')
listOfM = df['class'].unique()
df_original = pd.read_csv('data/EDdA_dataframe_withContent.tsv', sep="\t")
preprocessor.remove_null_rows(df_original, 'content')
df_original = removeMarkers(df_original, 'content', listOfM)
df_1 = df_original[['ensemble_domaine_enccre','content']].copy()
df_2 = df_original[['domaine_enccre','content']].copy()
df_3 = df_original[['normClass_artfl','content']].copy()
# TODO: should articles with fewer than n tokens be removed? (markers are removed above)
preprocessor.remove_null_rows(df_1, 'content')
preprocessor.remove_null_rows(df_1, 'ensemble_domaine_enccre')
preprocessor.remove_null_rows(df_2, 'content')
preprocessor.remove_null_rows(df_2, 'domaine_enccre')
preprocessor.remove_null_rows(df_3, 'content')
preprocessor.remove_null_rows(df_3, 'normClass_artfl')
df_1 = split_class(df_1, 'ensemble_domaine_enccre')
df_2 = split_class(df_2, 'domaine_enccre')
df_3 = split_class(df_3, 'normClass_artfl')
d_1 = create_dict(df_1, 'ensemble_domaine_enccre')
tosave = pd.DataFrame.from_dict(d_1, orient='index', columns=[ 'Count'])
tosave.to_excel("ensemble_domaine_enccre.xlsx")
d_2 = create_dict(df_2, 'domaine_enccre')
tosave = pd.DataFrame.from_dict(d_2, orient='index', columns=[ 'Count'])
tosave.to_excel("domaine_enccre.xlsx")
d_3 = create_dict(df_3, 'normClass_artfl')
tosave = pd.DataFrame.from_dict(d_3, orient='index', columns=[ 'Count'])
tosave.to_excel("normClass_artfl.xlsx")
df_1.to_csv('dataframe_with_ensemble_domaine_enccre.csv')
df_2.to_csv('dataframe_with_domaine_enccre.csv')
df_3.to_csv('dataframe_with_normClass_artfl.csv')
print(df_original.shape)