# Deep learning for EDdA classification

## Setup colab environment

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Install packages

In [None]:
#!pip install zeugma
#!pip install plot_model

### Import librairies

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os

from tqdm import tqdm
import requests, zipfile, io
import codecs

from sklearn import preprocessing # LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

from keras.layers import BatchNormalization, Input, Reshape, Conv2D, MaxPool2D, Concatenate
from keras.layers import Embedding, Dropout, Flatten, Dense
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint


### Utils functions

In [3]:

def resample_classes(df, classColumnName, numberOfInstances):
 #random numberOfInstances elements
 replace = False # with replacement
 fn = lambda obj: obj.loc[np.random.choice(obj.index, numberOfInstances if len(obj) > numberOfInstances else len(obj), replace),:]
 return df.groupby(classColumnName, as_index=False).apply(fn)
 


## Load Data

In [4]:
!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv
!wget https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv

--2022-02-18 07:27:50-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/training_set.tsv
Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28
Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 175634219 (167M) [text/tab-separated-values]
Saving to: ‘training_set.tsv’


2022-02-18 07:27:58 (21.8 MB/s) - ‘training_set.tsv’ saved [175634219/175634219]

--2022-02-18 07:27:58-- https://projet.liris.cnrs.fr/geode/EDdA-Classification/datasets/test_set.tsv
Resolving projet.liris.cnrs.fr (projet.liris.cnrs.fr)... 134.214.142.28
Connecting to projet.liris.cnrs.fr (projet.liris.cnrs.fr)|134.214.142.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42730598 (41M) [text/tab-separated-values]
Saving to: ‘test_set.tsv’


2022-02-18 07:28:01 (17.1 MB/s) - ‘test_set.tsv’ saved [42730598/42730598]



### Loading dataset

In [5]:
train_path = 'training_set.tsv'
test_path = 'test_set.tsv'

In [6]:
df_train = pd.read_csv(train_path, sep="\t")


In [7]:
df_train.sample(5)

Unnamed: 0,volume,numero,head,normClass,classEDdA,author,id_enccre,domaine_enccre,ensemble_domaine_enccre,content,contentWithoutClass,firstParagraph,nb_words
5965,9,712,KNAWEL,Botanique,Botan.,Jaucourt,v9-452-0,botanique,Histoire naturelle,"KNAWEL, (Botan.) genre de plante ainsi nommée ...",knawel genre plante nommée \n gérard ray par...,knawel genre plante nommée \n gérard ray par...,169
21406,4,3605,DECRETE,Jurisprudence,Jurispr.,Boucher d'Argis,v4-1826-0,jurisprudence,Droit - Jurisprudence,"DECRETE, adj. (Jurispr.) se dit communément\nd...",decrete adj communément \n contre a ordonné ...,decrete adj communément \n contre a ordonné ...,80
46481,12,2389,Piece nette,Artillerie,Artillerie.,Jaucourt,v12-1440-16,artillerie,Militaire (Art) - Guerre - Arme,"Piece nette, (Artillerie.) on appelle pieces n...",piece nette appelle pieces nestes pieces art...,piece nette appelle pieces nestes pieces art...,68
32540,7,1375,Gale,Manège | Maréchallerie,Manége & Maréchallerie.,Bourgelat,v7-622-1,manège,Maréchage - Manège,"Gale, (Manége & Maréchallerie.) maladie prurig...",gale maladie prurigineuse \n cutanée manifes...,gale maladie prurigineuse \n cutanée manifes...,3052
27748,13,4039,Récit historique,Histoire,Histoire.,unsigned,v13-2396-2,histoire,Histoire,"Récit historique, (Histoire.) le récit histori...",récit historique récit historique \n exposé ...,récit historique récit historique \n exposé ...,122


## Configuration


In [8]:
columnText = 'contentWithoutClass'
columnClass = 'ensemble_domaine_enccre'

maxOfInstancePerClass = 1500

batch_size = 64
validation_split = 0.20
max_nb_words = 20000 # taille du vocabulaire
max_sequence_length = 512 # taille max du 'document' 
epochs = 10

#embedding_name = "fasttext" 
#embedding_dim = 300 

embedding_name = "glove.6B.100d"
embedding_dim = 100 

path = "drive/MyDrive/Classification-EDdA/"

encoder_filename = "label_encoder.pkl"
tokenizer_filename = "tokenizer_keras.pkl"

## Preprocessing


In [9]:
if maxOfInstancePerClass != 10000:
 df_train = resample_classes(df_train, columnClass, maxOfInstancePerClass)

In [10]:
labels = df_train[columnClass]
numberOfClasses = labels.nunique()

if os.path.isfile(path+encoder_filename): 
 # load existing encoder 
 with open(path+encoder_filename, 'rb') as file:
 encoder = pickle.load(file)

else:
 encoder = preprocessing.LabelEncoder()
 encoder.fit(labels)

 with open(path+encoder_filename, 'wb') as file:
 pickle.dump(encoder, file)


labels = encoder.transform(labels)

In [11]:
encoder.classes_

array(['Agriculture - Economie rustique', 'Anatomie', 'Antiquité',
 'Architecture', 'Arts et métiers', 'Beaux-arts',
 'Belles-lettres - Poésie', 'Blason', 'Caractères', 'Chasse',
 'Chimie', 'Commerce', 'Droit - Jurisprudence',
 'Economie domestique', 'Grammaire', 'Géographie', 'Histoire',
 'Histoire naturelle', 'Jeu', 'Marine', 'Maréchage - Manège',
 'Mathématiques', 'Mesure', 'Militaire (Art) - Guerre - Arme',
 'Minéralogie', 'Monnaie', 'Musique', 'Médailles',
 'Médecine - Chirurgie', 'Métiers', 'Pharmacie', 'Philosophie',
 'Physique - [Sciences physico-mathématiques]', 'Politique',
 'Pêche', 'Religion', 'Spectacle', 'Superstition'], dtype=object)

In [12]:
labels_index = dict(zip(list(encoder.classes_), encoder.transform(list(encoder.classes_))))

In [13]:
labels_index

{'Agriculture - Economie rustique': 0,
 'Anatomie': 1,
 'Antiquité': 2,
 'Architecture': 3,
 'Arts et métiers': 4,
 'Beaux-arts': 5,
 'Belles-lettres - Poésie': 6,
 'Blason': 7,
 'Caractères': 8,
 'Chasse': 9,
 'Chimie': 10,
 'Commerce': 11,
 'Droit - Jurisprudence': 12,
 'Economie domestique': 13,
 'Grammaire': 14,
 'Géographie': 15,
 'Histoire': 16,
 'Histoire naturelle': 17,
 'Jeu': 18,
 'Marine': 19,
 'Maréchage - Manège': 20,
 'Mathématiques': 21,
 'Mesure': 22,
 'Militaire (Art) - Guerre - Arme': 23,
 'Minéralogie': 24,
 'Monnaie': 25,
 'Musique': 26,
 'Médailles': 27,
 'Médecine - Chirurgie': 28,
 'Métiers': 29,
 'Pharmacie': 30,
 'Philosophie': 31,
 'Physique - [Sciences physico-mathématiques]': 32,
 'Politique': 33,
 'Pêche': 34,
 'Religion': 35,
 'Spectacle': 36,
 'Superstition': 37}

### Loading pre-trained embeddings

#### FastText

In [None]:
# download FastText (prend trop de place pour le laisser sur le drive)
zip_file_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip"
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

In [None]:
print('loading word embeddings FastText...')

embeddings_index = {}
f = codecs.open('crawl-300d-2M.vec', encoding='utf-8')

for line in tqdm(f):
 values = line.rstrip().rsplit(' ')
 word = values[0]
 coefs = np.asarray(values[1:], dtype='float32')
 embeddings_index[word] = coefs
f.close()

print('found %s word vectors' % len(embeddings_index))

#### GLOVE

In [None]:
# download Glove
#zip_file_url = "https://nlp.stanford.edu/data/glove.6B.zip"
#r = requests.get(zip_file_url)
#z = zipfile.ZipFile(io.BytesIO(r.content))
#z.extractall()

In [15]:
print('loading word embeddings GLOVE...')

embeddings_index = {}
f = open(path+"embeddings/"+embedding_name+".txt", encoding='utf-8')
for line in tqdm(f):
 values = line.split()
 word = values[0]
 coefs = np.asarray(values[1:], dtype='float32')
 embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

loading word embeddings GLOVE...


400000it [00:13, 30217.64it/s]

Found 400000 word vectors.





## Training models

In [16]:

raw_docs_train = df_train[columnText].tolist()


print("pre-processing train data...")

if os.path.isfile(path+tokenizer_filename):
 with open(path+tokenizer_filename, 'rb') as file:
 tokenizer = pickle.load(file)
else:
 tokenizer = Tokenizer(num_words = max_nb_words)
 tokenizer.fit_on_texts(raw_docs_train) 

 with open(path+tokenizer_filename, 'wb') as file:
 pickle.dump(tokenizer, file)

sequences = tokenizer.texts_to_sequences(raw_docs_train)

word_index = tokenizer.word_index
print("dictionary size: ", len(word_index))

#pad sequences
data = sequence.pad_sequences(sequences, maxlen=max_sequence_length)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print(labels)

pre-processing train data...
dictionary size: 190508
Shape of data tensor: (27381, 512)
Shape of label tensor: (27381,)
[ 0 0 0 ... 37 37 37]


In [17]:
# split the data into a training set and a validation set

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

nb_validation_samples = int(validation_split * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]


In [18]:
#embedding matrix

print('preparing embedding matrix...')

embedding_matrix = np.zeros((len(word_index)+1, embedding_dim))

for word, i in word_index.items():
 embedding_vector = embeddings_index.get(word)
 if embedding_vector is not None : 
 embedding_matrix[i] = embedding_vector


preparing embedding matrix...


In [19]:

filter_sizes = [2, 3, 5]
drop = 0.5

embedding_layer = Embedding(len(word_index)+1, embedding_dim, input_length = max_sequence_length,
 weights=[embedding_matrix], trainable=False)
inputs = Input(shape=(max_sequence_length), dtype='int32')
embedding = embedding_layer(inputs)

print(embedding.shape)
reshape = Reshape((max_sequence_length, embedding_dim, 1))(embedding)
print(reshape.shape)

# https://github.com/elvinaqa/Text-Classification-GloVe-CNN

conv_0 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(max_sequence_length, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(max_sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(len(labels_index), activation='softmax')(dropout)

# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
#adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])
model.summary()

(None, 512, 100)
(None, 512, 100, 1)
Model: "model"
__________________________________________________________________________________________________
 Layer (type) Output Shape Param # Connected to 
 input_1 (InputLayer) [(None, 512)] 0 [] 
 
 embedding (Embedding) (None, 512, 100) 19050900 ['input_1[0][0]'] 
 
 reshape (Reshape) (None, 512, 100, 1) 0 ['embedding[0][0]'] 
 
 conv2d (Conv2D) (None, 511, 1, 512) 102912 ['reshape[0][0]'] 
 
 conv2d_1 (Conv2D) (None, 510, 1, 512) 154112 ['reshape[0][0]'] 
 
 conv2d_2 (Conv2D) (None, 508, 1, 512) 256512 ['reshape[0][0]'] 
 
 max_pooling2d (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d[0][0]'] 
 
 max_pooling2d_1 (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d_1[0][0]'] 
 
 max_pooling2d_2 (MaxPooling2D) (None, 1, 1, 512) 0 ['conv2d_2[0][0]'] 
 
 concatenate (Concatenate) (None, 3, 1, 512) 0 ['max_pooling2d[0][0]', 
 'max_pooling2d_1[0][0]', 
 'max_pooling2d_2[0][0]'] 
 
 flatten (Flatten) (None, 1536) 0 ['concatenate[0][0]'] 
 
 dropout (Dropout)

In [None]:
history = model.fit(x_train, y_train, 
 batch_size=batch_size, 
 epochs=epochs, 
 verbose=1,
 callbacks=[checkpoint],
 validation_data=(x_val, y_val))

Epoch 1/10
Epoch 1: val_acc improved from -inf to 0.54164, saving model to weights_cnn_sentece.hdf5
Epoch 2/10
Epoch 2: val_acc improved from 0.54164 to 0.61158, saving model to weights_cnn_sentece.hdf5
Epoch 3/10

In [None]:
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

## Saving models

In [None]:
name = "cnn_conv2D_"+embedding_name+"_s"+str(maxOfInstancePerClass)

In [None]:
model.save(path+name+".h5")


In [None]:
# save embeddings

# saving embeddings index 


## Loading models

In [None]:
model = load_model(path+name+".h5")

with open(path+tokenizer_filename, 'rb') as file:
 tokenizer = pickle.load(file)

with open(path+encoder_filename, 'rb') as file:
 encoder = pickle.load(file)


## Evaluation

In [None]:
df_test = pd.read_csv(test_path, sep="\t")


In [None]:
test_texts = df_test[columnText].tolist()
test_labels = df_test[columnClass].tolist()

test_sequences = tokenizer.texts_to_sequences(test_texts)
test_input = sequence.pad_sequences(test_sequences, maxlen=max_sequence_length)

# Get predictions
test_predictions_probas = model.predict(test_input)
test_predictions = test_predictions_probas.argmax(axis=-1)

In [None]:

test_intent_predictions = encoder.inverse_transform(test_predictions)
#test_intent_original = encoder.inverse_transform(test_labels)

print('accuracy: ', sum(test_intent_predictions == test_labels) / len(test_labels))
print("Precision, Recall and F1-Score:\n\n", classification_report(test_labels, test_intent_predictions))



In [None]:

report = classification_report(test_labels, test_intent_predictions, output_dict = True)

precision = []
recall = []
f1 = []
support = []
dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
for c in encoder.classes_:
 precision.append(report[c]['precision'])
 recall.append(report[c]['recall'])
 f1.append(report[c]['f1-score'])
 support.append(report[c]['support'])

accuracy = report['accuracy']
weighted_avg = report['weighted avg']


cnf_matrix = confusion_matrix(test_labels, test_intent_predictions)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

dff['className'] = encoder.classes_
dff['precision'] = precision
dff['recall'] = recall
dff['f1-score'] = f1
dff['support'] = support
dff['FP'] = FP
dff['FN'] = FN
dff['TP'] = TP
dff['TN'] = TN


 
content = name + "\n"
print(name)
content += str(weighted_avg) + "\n"
print(weighted_avg)
print(accuracy)
print(dff)

dff.to_csv(path+"/reports/report_"+name+".csv", index=False)

# enregistrer les predictions
pd.DataFrame({'labels': pd.Series(df_test[columnClass]), 'predictions': pd.Series(test_intent_predictions)}).to_csv(path+"/predictions/predictions_"+name+".csv")

with open(path+"/reports/report_"+name+".txt", 'w') as f:
 f.write(content)


In [None]:
predictions = model.predict(word_seq_validation)
predictions = np.argmax(predictions,axis=1)

In [None]:
report = classification_report(predictions, y_validation, output_dict = True)

accuracy = report['accuracy']
weighted_avg = report['weighted avg']

print(accuracy, weighted_avg)

0.5726683109527725 {'precision': 0.6118028288513718, 'recall': 0.5726683109527725, 'f1-score': 0.5870482221489528, 'support': 10947}


 _warn_prf(average, modifier, msg_start, len(result))
 _warn_prf(average, modifier, msg_start, len(result))
 _warn_prf(average, modifier, msg_start, len(result))


In [None]:
df_test = pd.read_csv(test_path, sep="\t")

encoder = preprocessing.LabelEncoder()
y_test = encoder.fit_transform(df_test[columnClass])


In [None]:
raw_docs_test = df_test[columnText].tolist()

print("pre-processing test data...")

stop_words = set(stopwords.words('french'))

processed_docs_test = []
for doc in tqdm(raw_docs_test):
 tokens = word_tokenize(doc, language='french')
 filtered = [word for word in tokens if word not in stop_words]
 processed_docs_test.append(" ".join(filtered))
#end for

print("tokenizing input data...")
#tokenizer = Tokenizer(num_words=max_len, lower=True, char_level=False)
#tokenizer.fit_on_texts(processed_docs_train + processed_docs_validation) #leaky
word_seq_test = tokenizer.texts_to_sequences(processed_docs_test)

#pad sequences
word_seq_test = sequence.pad_sequences(word_seq_test, maxlen=max_len)

pre-processing test data...


100%|██████████| 13137/13137 [00:09<00:00, 1331.48it/s]


tokenizing input data...


In [None]:
predictions = model.predict(word_seq_test)
predictions = np.argmax(predictions,axis=1)

In [None]:
report = classification_report(predictions, y_test, output_dict = True)

accuracy = report['accuracy']
weighted_avg = report['weighted avg']

print(accuracy, weighted_avg)

0.5698409073608891 {'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}


 _warn_prf(average, modifier, msg_start, len(result))
 _warn_prf(average, modifier, msg_start, len(result))
 _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import confusion_matrix

classesName = encoder.classes_
classes = [str(e) for e in encoder.transform(encoder.classes_)]

precision = []
recall = []
f1 = []
support = []
dff = pd.DataFrame(columns= ['className', 'precision', 'recall', 'f1-score', 'support', 'FP', 'FN', 'TP', 'TN'])
for c in classes:
 precision.append(report[c]['precision'])
 recall.append(report[c]['recall'])
 f1.append(report[c]['f1-score'])
 support.append(report[c]['support'])

accuracy = report['accuracy']
weighted_avg = report['weighted avg']


cnf_matrix = confusion_matrix(y_test, predictions)
FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)
FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
TP = np.diag(cnf_matrix)
TN = cnf_matrix.sum() - (FP + FN + TP)

dff['className'] = classesName
dff['precision'] = precision
dff['recall'] = recall
dff['f1-score'] = f1
dff['support'] = support
dff['FP'] = FP
dff['FN'] = FN
dff['TP'] = TP
dff['TN'] = TN

print("test_cnn_s"+str(maxOfInstancePerClass))

print(weighted_avg)
print(accuracy)
print(dff)

dff.to_csv("drive/MyDrive/Classification-EDdA/report_test_cnn_s"+str(maxOfInstancePerClass)+".csv", index=False)

test_cnn_s10000
{'precision': 0.6081680700148677, 'recall': 0.5698409073608891, 'f1-score': 0.5847417616022411, 'support': 13137}
0.5698409073608891
 className precision ... TP TN
0 Agriculture - Economie rustique 0.216535 ... 55 12636
1 Anatomie 0.459821 ... 103 12768
2 Antiquité 0.287975 ... 91 12710
3 Architecture 0.339623 ... 108 12722
4 Arts et métiers 0.015504 ... 2 12995
5 Beaux-arts 0.060000 ... 6 13018
6 Belles-lettres - Poésie 0.127660 ... 30 12761
7 Blason 0.228571 ... 24 12993
8 Caractères 0.037037 ... 1 13110
9 Chasse 0.221311 ... 27 12962
10 Chimie 0.160714 ... 18 12991
11 Commerce 0.443418 ... 192 12490
12 Droit - Jurisprudence 0.762879 ... 1081 11263
13 Economie domestique 0.000000 ... 0 13102
14 Grammaire 0.408929 ... 229 12254
15 Géographie 0.917312 ... 2607 9910
16 Histoire 0.405063 ... 288 11777
17 Histoire naturelle 0.743292 ... 831 11661
18 Jeu 0.061538 ... 4 13067
19 Marine 0.590805 ... 257 12549
20 Maréchage - Manège 0.620690 ... 72 13001
21 Mathématiques 0.5496

In [None]:
def predict(data, max_len):
 
 pad_sequ_test, _ = prepare_sequence(data, max_len)
 pred_labels_ = model.predict(pad_sequ_test)

 return np.argmax(pred_labels_,axis=1)


def eval(data, labels, max_len):
 
 pred_labels_ = predict(data, max_len)
 report = classification_report(pred_labels_, labels, output_dict = True)

 accuracy = report['accuracy']
 weighted_avg = report['weighted avg']
 
 print(accuracy, weighted_avg)

In [None]:
# evaluation sur le jeu de validation
eval(df_validation[columnText], y_validation, max_len)

 return np.array(self.texts_to_sequences(texts))


0.06925290207361841 {'precision': 0.09108131158125257, 'recall': 0.06925290207361841, 'f1-score': 0.06099084715237025, 'support': 10079}


In [None]:
# evaluation sur le jeu de test
df_test = pd.read_csv(test_path, sep="\t")
#df_test = resample_classes(df_test, columnClass, maxOfInstancePerClass)

y_test = df_test[columnClass]
encoder = preprocessing.LabelEncoder()
y_test = encoder.fit_transform(y_test)

eval(df_test[columnText], y_test, max_len)


 return np.array(self.texts_to_sequences(texts))


0.07231483595950369 {'precision': 0.081194635559303, 'recall': 0.07231483595950369, 'f1-score': 0.06322383877903374, 'support': 13137}
