Skip to content
Snippets Groups Projects
Commit dfede5ac authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

Update Predict_LGE.ipynb

parent 6b612e5e
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# BERT Predict classification # BERT Predict classification
## 1. Setup the environment ## 1. Setup the environment
### 1.1 Setup colab environment ### 1.1 Setup colab environment
#### 1.1.1 Install packages #### 1.1.1 Install packages
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
!pip install transformers==4.10.3 !pip install transformers==4.10.3
!pip install sentencepiece !pip install sentencepiece
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
#### 1.1.2 Use more RAM #### 1.1.2 Use more RAM
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from psutil import virtual_memory from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9 ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb)) print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
if ram_gb < 20: if ram_gb < 20:
print('Not using a high-RAM runtime') print('Not using a high-RAM runtime')
else: else:
print('You are using a high-RAM runtime!') print('You are using a high-RAM runtime!')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
#### 1.1.3 Mount GoogleDrive #### 1.1.3 Mount GoogleDrive
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from google.colab import drive from google.colab import drive
drive.mount('/content/drive') drive.mount('/content/drive')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 1.2 Setup GPU ### 1.2 Import libraries
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import os
import pandas as pd
import numpy as np
import pickle
import torch import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
```
%% Cell type:markdown id: tags:
### 1.3 Setup GPU
%% Cell type:code id: tags:
# If there's a GPU available... ``` python
# Pick the compute device once for the whole notebook: CUDA GPU first, then
# the macOS MPS backend, then CPU. `device` is the torch.device object and
# `gpu_name` is the same choice as a plain string.

# `torch.backends.mps` is missing from older PyTorch builds; look it up
# defensively so machines without CUDA fall through to the CPU branch
# instead of raising AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)

# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    gpu_name = "cuda"
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# for MacOS
elif _mps_backend is not None and _mps_backend.is_available() and _mps_backend.is_built():
    device = torch.device("mps")
    gpu_name = "mps"
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    gpu_name = "cpu"
    print('No GPU available, using the CPU instead.')
``` ```
%% Output %% Output
We will use the GPU We will use the GPU
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 1.3 Import libraries ## 2. Utils
%% Cell type:code id: tags:
``` python
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
```
%% Cell type:markdown id: tags:
## 2. Load Data
%% Cell type:code id: tags:
``` python
#path = "drive/MyDrive/Classification-EDdA/"
path = "../"
```
%% Cell type:code id: tags:
``` python
!wget https://projet.liris.cnrs.fr/geode/files/datasets/EDdA/Classification/LGE_withContent.tsv
```
%% Cell type:code id: tags:
``` python
df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t")
data_LGE = df_LGE["content"].values
```
%% Cell type:code id: tags:
``` python
df_LGE.head()
```
%% Output
id tome rank domain remark \
0 abrabeses-0 1 623 geography NaN
1 accius-0 1 1076 biography NaN
2 achenbach-2 1 1357 biography NaN
3 acireale-0 1 1513 geography NaN
4 actée-0 1 1731 botany NaN
content
0 ABRABESES. Village d’Espagne de la prov. de Za...
1 ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...
2 ACHENBACH(Henri), administrateur prussien, né ...
3 ACIREALE. Yille de Sicile, de la province et d...
4 ACTÉE(Actœa L.). Genre de plantes de la famill...
%% Cell type:code id: tags:
``` python
df_LGE.shape
```
%% Output
(310, 6)
%% Cell type:markdown id: tags:
## 3. Load model and predict
### 3.1 BERT / CamemBERT
%% Cell type:code id: tags:
``` python
model_name = "bert-base-multilingual-cased"
#model_name = "camembert-base"
model_path = path + "models/model_" + model_name + "_s10000.pt"
```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def generate_dataloader(tokenizer, sentences, batch_size = 8, max_len = 512):
    """Encode texts and wrap them in a sequential ``DataLoader`` for inference.

    Every text is encoded with ``tokenizer.encode`` (special tokens added),
    then cut or right-padded with 0 to exactly ``max_len`` ids. The attention
    mask is 1.0 wherever the id is greater than 0 and 0.0 elsewhere.

    Args:
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``
            that returns a list of integer token ids.
        sentences: iterable of texts to encode.
        batch_size: number of rows per batch.
        max_len: fixed length of every encoded row.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask)`` batches in
        the same order as ``sentences``. Empty input gives an empty loader.
    """
    # Tokenize all of the sentences and map the tokens to their word IDs.
    # Truncation and padding are done by hand below rather than through the
    # tokenizer, so every row lands in one fixed-size matrix.
    encoded = [
        tokenizer.encode(sent, add_special_tokens=True)
        for sent in sentences
    ]

    # The matrix starts filled with 0, which doubles as the padding id.
    # NOTE(review): rows longer than max_len simply lose their tail (the
    # closing special token included), and 0 is presumably not the pad id of
    # every tokenizer. Both are kept unchanged so the model receives the same
    # inputs as before -- confirm against the training pipeline.
    input_ids = np.zeros((len(encoded), max_len), dtype=np.int64)
    for row, ids in zip(input_ids, encoded):
        kept = ids[:max_len]
        row[:len(kept)] = kept

    # Mask of 1s for each token followed by 0s for padding.
    attention_masks = (input_ids > 0).astype(np.float32)

    # Convert to tensors and create the DataLoader.
    inputs = torch.from_numpy(input_ids)
    masks = torch.from_numpy(attention_masks)
    data = TensorDataset(inputs, masks)
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)
def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return one class index per row.

    Args:
        model: object with an ``eval()`` method, callable as
            ``model(input_ids, token_type_ids=None, attention_mask=mask)``,
            whose first output holds the logits.
        dataloader: iterable of ``(input_ids, attention_mask)`` tensor pairs.
        device: torch device the batch tensors are moved to before the
            forward pass.

    Returns:
        Flat list with the index of the highest logit for every input row,
        in dataloader order (empty list for an empty dataloader).
    """
    # Put model in evaluation mode
    model.eval()

    pred_labels_ = []
    for batch in dataloader:
        # Move the batch to the target device and unpack it.
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # No gradients are needed for prediction; skipping them saves memory
        # and speeds up the forward pass.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Bring the logits back to the CPU as a numpy array.
        logits = outputs[0].detach().cpu().numpy()

        # Reduce each batch to its labels right away instead of keeping all
        # logits around until the end.
        pred_labels_.extend(np.argmax(logits, axis=1).flatten())

    return pred_labels_
def text_folder_to_dataframe(path):
    """Load a ``<path>/<tome>/<article>.txt`` folder tree into a DataFrame.

    Args:
        path: root folder; each sub-folder is treated as one tome.

    Returns:
        DataFrame with columns ``id`` (tome name + file name without the
        ``.txt`` suffix), ``tome``, ``filename``, ``content`` and ``nb_words``
        (number of space-separated chunks in ``content``). Tomes and articles
        are visited in sorted order.
    """
    data = []
    for tome in sorted(os.listdir(path)):
        tome_dir = os.path.join(path, tome)
        # Plain files sitting next to the tome folders are skipped.
        if not os.path.isdir(tome_dir):
            continue
        for article in tqdm(sorted(os.listdir(tome_dir))):
            if not article.endswith(".txt"):
                continue
            filename = article[:-4]
            article_id = tome + filename
            # Explicit encoding so accented text does not depend on the
            # platform default. NOTE(review): assumes the corpus files are
            # UTF-8 -- confirm.
            with open(os.path.join(tome_dir, article), encoding="utf-8") as f:
                content = f.read()
            data.append([article_id, tome, filename, content, len(content.split(' '))])
    return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words'])
``` ```
%% Cell type:markdown id: tags:
## 3. Load Data
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
if model_name == 'bert-base-multilingual-cased' : !wget https://api.nakala.fr/data/10.34847/nkl.74eb1xfd/e522413b58b04ab7c283f8fa68642e9cb69ab5c5
print('Loading Bert Tokenizer...') ```
tokenizer = BertTokenizer.from_pretrained(model_name)
elif model_name == 'camembert-base': %% Cell type:code id: tags:
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(model_name) ``` python
!unzip e522413b58b04ab7c283f8fa68642e9cb69ab5c5
```
%% Cell type:code id: tags:
``` python
#input_path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text"
input_path = "./Text"
```
%% Cell type:code id: tags:
``` python
df_LGE = text_folder_to_dataframe(input_path)
#df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t")
data_LGE = df_LGE["content"].values
```
%% Cell type:code id: tags:
``` python
df_LGE.head()
``` ```
%% Output %% Output
Loading Bert Tokenizer... id tome rank domain remark \
0 abrabeses-0 1 623 geography NaN
1 accius-0 1 1076 biography NaN
2 achenbach-2 1 1357 biography NaN
3 acireale-0 1 1513 geography NaN
4 actée-0 1 1731 botany NaN
content
0 ABRABESES. Village d’Espagne de la prov. de Za...
1 ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...
2 ACHENBACH(Henri), administrateur prussien, né ...
3 ACIREALE. Yille de Sicile, de la province et d...
4 ACTÉE(Actœa L.). Genre de plantes de la famill...
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
data_loader = generate_dataloader(tokenizer, data_LGE) df_LGE.shape
``` ```
%% Output %% Output
Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors (310, 6)
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 4. Load model and predict
https://discuss.huggingface.co/t/an-efficient-way-of-loading-a-model-that-was-saved-with-torch-save/9814 ### 4.1 BERT / CamemBERT
https://github.com/huggingface/transformers/issues/2094
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#model = torch.load(model_path, map_location=torch.device('mps')) #path = "drive/MyDrive/Classification-EDdA/"
#model.load_state_dict(torch.load(model_path, map_location=torch.device('mps'))) path = "../"
model_name = "bert-base-multilingual-cased"
model = BertForSequenceClassification.from_pretrained(model_path).to("mps") #.to("cuda") #model_name = "camembert-base"
model_path = path + "models/model_" + model_name + "_s10000.pt"
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
pred = predict(model, data_loader, device) if model_name == 'bert-base-multilingual-cased' :
print('Loading Bert Tokenizer...')
tokenizer = BertTokenizer.from_pretrained(model_name)
elif model_name == 'camembert-base':
print('Loading Camembert Tokenizer...')
tokenizer = CamembertTokenizer.from_pretrained(model_name)
``` ```
%% Output
Loading Bert Tokenizer...
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
pred data_loader = generate_dataloader(tokenizer, data_LGE)
``` ```
%% Output %% Output
[15, Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors
6,
16, %% Cell type:code id: tags:
15,
17, ``` python
10, model = BertForSequenceClassification.from_pretrained(model_path).to(gpu_name) #.to("cuda")
17, ```
16,
19, %% Cell type:code id: tags:
35,
15, ``` python
26, pred = predict(model, data_loader, device)
15, ```
15,
15,
15,
2,
2,
17,
6,
32,
17,
30,
16,
32,
15,
35,
15,
23,
15,
15,
15,
17,
15,
16,
3,
17,
17,
16,
4,
15,
17,
19,
16,
35,
3,
17,
5,
15,
16,
16,
15,
16,
6,
16,
5,
16,
15,
28,
16,
17,
10,
15,
15,
32,
15,
17,
15,
15,
15,
12,
15,
18,
15,
35,
26,
16,
16,
15,
5,
15,
15,
5,
17,
15,
17,
35,
15,
16,
16,
17,
2,
17,
15,
16,
23,
16,
15,
15,
15,
16,
6,
15,
35,
15,
32,
16,
6,
16,
23,
36,
5,
35,
3,
3,
3,
16,
17,
2,
15,
5,
17,
16,
15,
17,
6,
15,
16,
10,
16,
15,
35,
17,
15,
15,
6,
28,
16,
15,
15,
15,
16,
5,
15,
21,
5,
1,
7,
16,
15,
17,
23,
15,
5,
0,
10,
16,
16,
15,
16,
15,
15,
3,
3,
17,
36,
16,
15,
12,
6,
15,
4,
16,
16,
26,
15,
15,
32,
15,
10,
15,
5,
26,
5,
15,
15,
26,
15,
35,
15,
16,
16,
15,
6,
16,
12,
16,
28,
16,
15,
15,
16,
6,
10,
16,
15,
15,
16,
16,
15,
15,
15,
15,
5,
16,
16,
17,
15,
16,
35,
16,
16,
15,
6,
29,
16,
15,
5,
5,
15,
15,
15,
16,
16,
15,
15,
31,
16,
15,
16,
15,
6,
16,
3,
15,
2,
15,
15,
28,
17,
15,
15,
16,
15,
15,
10,
15,
5,
16,
15,
15,
17,
15,
5,
15,
3,
15,
2,
15,
15,
6,
15,
28,
15,
6,
15,
32,
16,
15,
2,
15,
15,
15,
15,
15,
16,
17,
15,
15,
15,
15,
15,
16,
15,
15,
15,
35,
15,
15,
35,
16,
28,
15,
15,
15,
5,
15,
15,
19,
15]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import pickle
encoder_filename = "models/label_encoder.pkl" encoder_filename = "models/label_encoder.pkl"
with open(path+encoder_filename, 'rb') as file: with open(path + encoder_filename, 'rb') as file:
encoder = pickle.load(file) encoder = pickle.load(file)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
p2 = list(encoder.inverse_transform(pred)) p2 = list(encoder.inverse_transform(pred))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_LGE['class_bert'] = p2 df_LGE['domain'] = p2
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_LGE.head(50) df_LGE.head(50)
``` ```
%% Output %% Output
id tome rank domain remark \ id tome rank domain remark \
0 abrabeses-0 1 623 geography NaN 0 abrabeses-0 1 623 geography NaN
1 accius-0 1 1076 biography NaN 1 accius-0 1 1076 biography NaN
2 achenbach-2 1 1357 biography NaN 2 achenbach-2 1 1357 biography NaN
3 acireale-0 1 1513 geography NaN 3 acireale-0 1 1513 geography NaN
4 actée-0 1 1731 botany NaN 4 actée-0 1 1731 botany NaN
5 adulteration-0 1 2197 NaN cross reference 5 adulteration-0 1 2197 NaN cross reference
6 aérides-0 1 2334 botany NaN 6 aérides-0 1 2334 botany NaN
7 ager-0 1 2710 biography NaN 7 ager-0 1 2710 biography NaN
8 aigu-1 1 3160 NaN cross reference 8 aigu-1 1 3160 NaN cross reference
9 alavika-0 1 3664 theology NaN 9 alavika-0 1 3664 theology NaN
10 allassac-0 2 755 geography NaN 10 allassac-0 2 755 geography NaN
11 allegretto-0 2 786 NaN cross reference 11 allegretto-0 2 786 NaN cross reference
12 alleuze-0 2 908 geography NaN 12 alleuze-0 2 908 geography NaN
13 alliat-0 2 933 geography NaN 13 alliat-0 2 933 geography NaN
14 amanty-0 2 1651 geography NaN 14 amanty-0 2 1651 geography NaN
15 âmasserah-0 2 1701 geography explicit domain 15 âmasserah-0 2 1701 geography explicit domain
16 a-118 2 2971 history NaN 16 a-118 2 2971 history NaN
17 androclès-0 2 3261 mythology explicit domain 17 androclès-0 2 3261 mythology explicit domain
18 anfouson-0 2 3394 zoology NaN 18 anfouson-0 2 3394 zoology NaN
19 anicet-bourgeois-0 2 3717 biography NaN 19 anicet-bourgeois-0 2 3717 biography NaN
20 anomalistique-0 3 238 astronomy explicit domain 20 anomalistique-0 3 238 astronomy explicit domain
21 anostostome-0 3 298 zoology NaN 21 anostostome-0 3 298 zoology NaN
22 anthoxanthème-0 3 571 chemistry NaN 22 anthoxanthème-0 3 571 chemistry NaN
23 aod-0 3 1024 theology NaN 23 aod-0 3 1024 theology NaN
24 aphellan-0 3 1177 astronomy NaN 24 aphellan-0 3 1177 astronomy NaN
25 appelle-0 3 1494 geography NaN 25 appelle-0 3 1494 geography NaN
26 aragona-1 3 1841 biography NaN 26 aragona-1 3 1841 biography NaN
27 araujuzon-0 3 1940 geography NaN 27 araujuzon-0 3 1940 geography NaN
28 ardant-0 3 2421 biography NaN 28 ardant-0 3 2421 biography NaN
29 ariano-0 3 2839 geography NaN 29 ariano-0 3 2839 geography NaN
30 athabaska-0 4 1118 anthropology NaN 30 athabaska-0 4 1118 anthropology NaN
31 aslonnes-0 4 446 geography NaN 31 aslonnes-0 4 446 geography NaN
32 astr0rh1za-0 4 992 zoology explicit domain 32 astr0rh1za-0 4 992 zoology explicit domain
33 atthidographes-0 4 1397 NaN cross reference 33 atthidographes-0 4 1397 NaN cross reference
34 aubery-2 4 1577 biography NaN 34 aubery-2 4 1577 biography NaN
35 aula-0 4 1992 history NaN 35 aula-0 4 1992 history NaN
36 au-113 4 2112 botany explicit domain 36 au-113 4 2112 botany explicit domain
37 auriol-4 4 2224 NaN cross reference 37 auriol-4 4 2224 NaN cross reference
38 ave-lalleniant-0 4 2739 biography NaN 38 ave-lalleniant-0 4 2739 biography NaN
39 badin-2 4 3857 biography NaN 39 badin-2 4 3857 biography NaN
40 baizieux-0 5 133 geography NaN 40 baizieux-0 5 133 geography NaN
41 balsam1te-0 5 677 botany explicit domain 41 balsam1te-0 5 677 botany explicit domain
42 balze-0 5 757 navy explicit domain 42 balze-0 5 757 navy explicit domain
43 bande-2 5 880 history NaN 43 bande-2 5 880 history NaN
44 barbosa-5 5 1580 biography NaN 44 barbosa-5 5 1580 biography NaN
45 bati-0 5 2955 architecture NaN 45 bati-0 5 2955 architecture NaN
46 baveuse-0 5 3457 zoology explicit domain 46 baveuse-0 5 3457 zoology explicit domain
47 beard-2 5 3728 biography NaN 47 beard-2 5 3728 biography NaN
48 beaufort-4 5 3838 geography NaN 48 beaufort-4 5 3838 geography NaN
49 beaumont-26 5 4018 biography NaN 49 beaumont-26 5 4018 biography NaN
content \ content \
0 ABRABESES. Village d’Espagne de la prov. de Za... 0 ABRABESES. Village d’Espagne de la prov. de Za...
1 ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po... 1 ACCIUS, L. ou L. ATTIUS (170-94 av. J.-C.), po...
2 ACHENBACH(Henri), administrateur prussien, né ... 2 ACHENBACH(Henri), administrateur prussien, né ...
3 ACIREALE. Yille de Sicile, de la province et d... 3 ACIREALE. Yille de Sicile, de la province et d...
4 ACTÉE(Actœa L.). Genre de plantes de la famill... 4 ACTÉE(Actœa L.). Genre de plantes de la famill...
5 ADULTERATION. Altération d’un médicament, d’un... 5 ADULTERATION. Altération d’un médicament, d’un...
6 AÉRIDES{Aérides Lour.). Genres de plantes de l... 6 AÉRIDES{Aérides Lour.). Genres de plantes de l...
7 AGERouAGERIUS (Nicolaus), médecin alsacien, né... 7 AGERouAGERIUS (Nicolaus), médecin alsacien, né...
8 AIGU1 LH E (V. Raimond d’).\n 8 AIGU1 LH E (V. Raimond d’).\n
9 ALAVIKA« qui est d'Alava »(V. ce mot) : Bhikch... 9 ALAVIKA« qui est d'Alava »(V. ce mot) : Bhikch...
10 ALLASSAC. Com. du dép. de la Corrèze, arr. de ... 10 ALLASSAC. Com. du dép. de la Corrèze, arr. de ...
11 ALLEGRETTO(V. Allegro).\n 11 ALLEGRETTO(V. Allegro).\n
12 ALLEUZE. Com. du dép. du Cantal, arr. et cant.... 12 ALLEUZE. Com. du dép. du Cantal, arr. et cant....
13 ALLIAT. Com. du dép. de l’Ariège, arr. de Foix... 13 ALLIAT. Com. du dép. de l’Ariège, arr. de Foix...
14 AMANTY. Corn, du dép. de la Meuse, arr. de Com... 14 AMANTY. Corn, du dép. de la Meuse, arr. de Com...
15 ÂMASSERAH, AMASR1 ou AMASRAH (Géogr.). Ville d... 15 ÂMASSERAH, AMASR1 ou AMASRAH (Géogr.). Ville d...
16 AN Cl LIA. Boucliers sacrés des Romains, au no... 16 AN Cl LIA. Boucliers sacrés des Romains, au no...
17 ANDROCLÈS(Myth.), un fils d’Eole qui régna sur... 17 ANDROCLÈS(Myth.), un fils d’Eole qui régna sur...
18 ANFOUSON. Nom donné à Nice au Néron brun\n(V. ... 18 ANFOUSON. Nom donné à Nice au Néron brun\n(V. ...
19 ANICET-BOURGEOIS(Auguste Anicet, connu sous le... 19 ANICET-BOURGEOIS(Auguste Anicet, connu sous le...
20 ANOMALISTIQUE(Astron.). On appelle révolution\... 20 ANOMALISTIQUE(Astron.). On appelle révolution\...
21 ANOSTOSTOME(Anostostoma Gray). Genre d’insecte... 21 ANOSTOSTOME(Anostostoma Gray). Genre d’insecte...
22 ANTHOXANTHÈME. L’un des deux principes coloran... 22 ANTHOXANTHÈME. L’un des deux principes coloran...
23 AOD, plus exactement Ehoud. personnage des com... 23 AOD, plus exactement Ehoud. personnage des com...
24 APHELLAN(Astron.). Un des noms de l’étoile a2 ... 24 APHELLAN(Astron.). Un des noms de l’étoile a2 ...
25 APPELLE. Com. du dép. du Tarn, arr. de Lavaux,... 25 APPELLE. Com. du dép. du Tarn, arr. de Lavaux,...
26 ARAGONA, cardinal d’origine sicilienne, né en ... 26 ARAGONA, cardinal d’origine sicilienne, né en ...
27 ARAUJUZON. Com. du dép. des Basses-Pyrénées, a... 27 ARAUJUZON. Com. du dép. des Basses-Pyrénées, a...
28 ARDANT(Paul-Joseph), général français, né en 1... 28 ARDANT(Paul-Joseph), général français, né en 1...
29 ARIANOdi Puglia. Ville de la prov. de principa... 29 ARIANOdi Puglia. Ville de la prov. de principa...
30 ATHABASKA. Col, rivière, lac, territoire et fa... 30 ATHABASKA. Col, rivière, lac, territoire et fa...
31 ASLONNES, corn, du dép. de la Vienne, arr. de ... 31 ASLONNES, corn, du dép. de la Vienne, arr. de ...
32 ASTR0RH1ZA(Zool.).Genre deForaminifèresimperfo... 32 ASTR0RH1ZA(Zool.).Genre deForaminifèresimperfo...
33 ATTHIDOGRAPHES(V. Atthide).\n 33 ATTHIDOGRAPHES(V. Atthide).\n
34 AUBERY(Antoine;, historien français, né le .18... 34 AUBERY(Antoine;, historien français, né le .18...
35 AULA. Mot latin signifiant cour, lieu découver... 35 AULA. Mot latin signifiant cour, lieu découver...
36 AUNÉE (bot.). L'Aunée, Grande Année, Année off... 36 AUNÉE (bot.). L'Aunée, Grande Année, Année off...
37 AURIOL. Nom donné à Marseille au Maquereau (V.... 37 AURIOL. Nom donné à Marseille au Maquereau (V....
38 AVE-LALLENIANT(Robert-Christian-Barthold), méd... 38 AVE-LALLENIANT(Robert-Christian-Barthold), méd...
39 BADIN(Pierre-Adolphe), peintre français, né à ... 39 BADIN(Pierre-Adolphe), peintre français, né à ...
40 BAIZIEUX(Bacium, Basium). Com. du dép. de la\n... 40 BAIZIEUX(Bacium, Basium). Com. du dép. de la\n...
41 BALSAM1TE(Bot.) (Balsamita Desf.). Genre de Co... 41 BALSAM1TE(Bot.) (Balsamita Desf.). Genre de Co...
42 BALZE(Mar.). Radeau delà côte occidentale de l... 42 BALZE(Mar.). Radeau delà côte occidentale de l...
43 BANDE(Ordre delà) ou de l’ECHARPE.Ordre milita... 43 BANDE(Ordre delà) ou de l’ECHARPE.Ordre milita...
44 BARBOSA(Antonio), jésuite et orientaliste port... 44 BARBOSA(Antonio), jésuite et orientaliste port...
45 BATIÈRE. Toit en forme de bât se terminant à c... 45 BATIÈRE. Toit en forme de bât se terminant à c...
46 BAVEUSE(Zool.). Nom vulgaire par lequel les\np... 46 BAVEUSE(Zool.). Nom vulgaire par lequel les\np...
47 BEARD(James-Henry), peintre américain contempo... 47 BEARD(James-Henry), peintre américain contempo...
48 BEAUFORT. Com. du dép. de la Meuse, arr. de Mo... 48 BEAUFORT. Com. du dép. de la Meuse, arr. de Mo...
49 BEAUMONT(J.-G. Leprevôt de), secrétaire du cle... 49 BEAUMONT(J.-G. Leprevôt de), secrétaire du cle...
class_bert class_bert
0 Géographie 0 Géographie
1 Belles-lettres - Poésie 1 Belles-lettres - Poésie
2 Histoire 2 Histoire
3 Géographie 3 Géographie
4 Histoire naturelle 4 Histoire naturelle
5 Chimie 5 Chimie
6 Histoire naturelle 6 Histoire naturelle
7 Histoire 7 Histoire
8 Marine 8 Marine
9 Religion 9 Religion
10 Géographie 10 Géographie
11 Musique 11 Musique
12 Géographie 12 Géographie
13 Géographie 13 Géographie
14 Géographie 14 Géographie
15 Géographie 15 Géographie
16 Antiquité 16 Antiquité
17 Antiquité 17 Antiquité
18 Histoire naturelle 18 Histoire naturelle
19 Belles-lettres - Poésie 19 Belles-lettres - Poésie
20 Physique - [Sciences physico-mathématiques] 20 Physique - [Sciences physico-mathématiques]
21 Histoire naturelle 21 Histoire naturelle
22 Pharmacie 22 Pharmacie
23 Histoire 23 Histoire
24 Physique - [Sciences physico-mathématiques] 24 Physique - [Sciences physico-mathématiques]
25 Géographie 25 Géographie
26 Religion 26 Religion
27 Géographie 27 Géographie
28 Militaire (Art) - Guerre - Arme 28 Militaire (Art) - Guerre - Arme
29 Géographie 29 Géographie
30 Géographie 30 Géographie
31 Géographie 31 Géographie
32 Histoire naturelle 32 Histoire naturelle
33 Géographie 33 Géographie
34 Histoire 34 Histoire
35 Architecture 35 Architecture
36 Histoire naturelle 36 Histoire naturelle
37 Histoire naturelle 37 Histoire naturelle
38 Histoire 38 Histoire
39 Arts et métiers 39 Arts et métiers
40 Géographie 40 Géographie
41 Histoire naturelle 41 Histoire naturelle
42 Marine 42 Marine
43 Histoire 43 Histoire
44 Religion 44 Religion
45 Architecture 45 Architecture
46 Histoire naturelle 46 Histoire naturelle
47 Beaux-arts 47 Beaux-arts
48 Géographie 48 Géographie
49 Histoire 49 Histoire
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_LGE.to_csv(path + "reports/classification_LGE.tsv", sep="\t") filepath = path + "results_LGE/LGE-metadata-withContent.csv"
# The separator must be a single character: "\," is a backslash followed by
# a comma, which to_csv rejects.
df_LGE.to_csv(filepath, sep=",")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Write a second file without the article text (metadata only).
df_LGE.drop(columns=['content'], inplace=True)
filepath = path + "results_LGE/LGE-metadata.csv"
# The separator must be a single character: "\," is a backslash followed by
# a comma, which to_csv rejects.
df_LGE.to_csv(filepath, sep=",")
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment