Skip to content
Snippets Groups Projects
Commit 074aed88 authored by Ludovic Moncla's avatar Ludovic Moncla
Browse files

Update Predict_XAI.ipynb

parent 6fc7b4e7
No related branches found
No related tags found
No related merge requests found
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# BERT Predict classification # BERT Predict classification
## 1. Setup the environment ## 1. Setup the environment
### 1.1 Setup colab environment ### 1.1 Setup colab environment
#### 1.1.1 Install packages #### 1.1.1 Install packages
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
!pip install transformers==4.10.3 !pip install transformers==4.10.3
!pip install sentencepiece !pip install sentencepiece
!pip install transformers_interpret !pip install transformers_interpret
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
#### 1.1.2 Use more RAM #### 1.1.2 Use more RAM
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from psutil import virtual_memory from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9 ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb)) print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))
if ram_gb < 20: if ram_gb < 20:
print('Not using a high-RAM runtime') print('Not using a high-RAM runtime')
else: else:
print('You are using a high-RAM runtime!') print('You are using a high-RAM runtime!')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
#### 1.1.3 Mount GoogleDrive #### 1.1.3 Mount GoogleDrive
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
from google.colab import drive from google.colab import drive
drive.mount('/content/drive') drive.mount('/content/drive')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 1.2 Import libraries ### 1.2 Import libraries
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import pickle import pickle
import torch import torch
from transformers import BertTokenizer, BertForSequenceClassification from transformers import BertTokenizer, BertForSequenceClassification
from transformers_interpret import SequenceClassificationExplainer from transformers_interpret import SequenceClassificationExplainer
from tqdm import tqdm from tqdm import tqdm
import numpy as np import numpy as np
import torch import torch
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
import pandas as pd import pandas as pd
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 1.3 Setup GPU ### 1.3 Setup GPU
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Pick the compute device: CUDA first, then Apple-silicon MPS, then CPU.
# `gpu_name` mirrors the device type as a plain string; it is what the model
# is later moved to with `.to(gpu_name)`.

# `torch.backends.mps` only exists in PyTorch builds that ship the MPS
# backend; look it up defensively so that older builds fall through to the
# CPU branch instead of raising AttributeError.
mps_backend = getattr(torch.backends, "mps", None)
mps_ready = (
    mps_backend is not None
    and mps_backend.is_available()
    and mps_backend.is_built()
)

if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    gpu_name = "cuda"
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
elif mps_ready:
    # for MacOS
    device = torch.device("mps")
    gpu_name = "mps"
    print('We will use the GPU')
else:
    device = torch.device("cpu")
    gpu_name = "cpu"
    print('No GPU available, using the CPU instead.')
``` ```
%% Output %% Output
We will use the GPU We will use the GPU
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
device = torch.device("cpu") device = torch.device("cpu")
gpu_name = "cpu" gpu_name = "cpu"
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 2. Utils ## 2. Utils
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
    """Tokenize *sentences* and wrap them in a sequential ``DataLoader``.

    Each sentence is encoded with ``tokenizer.encode(..., add_special_tokens=True)``,
    cut to at most ``max_len`` ids, and right-padded to exactly ``max_len``.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            that returns a list of token ids. Its ``pad_token_id`` attribute
            is used for padding when present; otherwise 0 is used.
        sentences: Iterable of texts to encode.
        batch_size: Number of rows per batch.
        max_len: Fixed length of every row after truncation/padding.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of two tensors, both of
        shape ``(len(sentences), max_len)``: the int64 token ids and a
        float32 attention mask (1.0 on real tokens, 0.0 on padding). Rows are
        yielded in input order.
    """
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # Truncation is done by slicing after encoding, so an over-long sequence
    # loses its trailing special token.
    # NOTE(review): kept on purpose so inputs stay identical to the previous
    # preprocessing — confirm this matches how the model was trained.
    encoded = [
        tokenizer.encode(sent, add_special_tokens=True)[:max_len]
        for sent in sentences
    ]

    n_rows = len(encoded)
    inputs = torch.full((n_rows, max_len), pad_id, dtype=torch.long)
    masks = torch.zeros((n_rows, max_len), dtype=torch.float)

    for row, ids in enumerate(encoded):
        length = len(ids)
        if length == 0:
            continue
        inputs[row, :length] = torch.tensor(ids, dtype=torch.long)
        # The mask comes from the sequence length, not from the id values,
        # so a genuine token whose id equals the pad id is still attended to.
        masks[row, :length] = 1.0

    data = TensorDataset(inputs, masks)
    # Sequential sampling keeps predictions aligned with the input order.
    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)
def predict(model, dataloader, device):
    """Run *model* over *dataloader* and return the predicted class indices.

    Args:
        model: Callable invoked as
            ``model(input_ids, token_type_ids=None, attention_mask=mask)``;
            the first element of its output is taken as the logits. It must
            also expose ``eval()``.
        dataloader: Iterable of ``(input_ids, attention_mask)`` tensor pairs.
        device: Device each batch is moved to before the forward pass.

    Returns:
        A flat list with one predicted class index per input row (the argmax
        of the logits along axis 1), in dataloader order.
    """
    # Put model in evaluation mode
    model.eval()

    pred_labels_ = []
    for batch in dataloader:
        # Move both tensors of the batch to the target device.
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # No gradients are needed for inference; skipping them saves memory
        # and speeds up prediction.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Bring the logits back to host memory as a numpy array.
        logits = outputs[0].detach().cpu().numpy()

        # One column per class: keep the index of the highest logit per row.
        pred_labels_.extend(np.argmax(logits, axis=1).flatten())

    return pred_labels_
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 3. Load Data ## 3. Load Data
!! A modifier: charger le corpus parallele : EDdA et LGE !! A modifier: charger le corpus parallele : EDdA et LGE
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 3.1 LGE (Nakala) ### 3.1 LGE (Nakala)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
lge_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_dataset_articles.tsv" lge_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_dataset_articles.tsv"
df_LGE = pd.read_csv(lge_path, sep="\t") df_LGE = pd.read_csv(lge_path, sep="\t")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_LGE.head() df_LGE.head()
``` ```
%% Output %% Output
id tome filename \ id tome filename \
0 T1article_1 T1 article_1 0 T1article_1 T1 article_1
1 T1article_10 T1 article_10 1 T1article_10 T1 article_10
2 T1article_100 T1 article_100 2 T1article_100 T1 article_100
3 T1article_1000 T1 article_1000 3 T1article_1000 T1 article_1000
4 T1article_1001 T1 article_1001 4 T1article_1001 T1 article_1001
content nb_words content nb_words
0 F.-Camille DREYFUS, député de la Seine.\n 6 0 F.-Camille DREYFUS, député de la Seine.\n 6
1 quimarque un mouvement en avant de l’esprit hu... 212 1 quimarque un mouvement en avant de l’esprit hu... 212
2 ABACUS. L’abacus ou abaque était un instrument... 1345 2 ABACUS. L’abacus ou abaque était un instrument... 1345
3 H6SS6)\n1780-1793 Choiseul-Goufficr\n1780-1793... 218 3 H6SS6)\n1780-1793 Choiseul-Goufficr\n1780-1793... 218
4 1803Le Brun.\n 2 4 1803Le Brun.\n 2
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_LGE.shape df_LGE.shape
``` ```
%% Output %% Output
(229475, 5) (229475, 5)
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 3.2 LGE Parallel ### 3.2 LGE Parallel
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
lge_par_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_parallel_dataset_articles.tsv" lge_par_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/LGE/LGE_parallel_dataset_articles.tsv"
df_LGE_par = pd.read_csv(lge_par_path, sep="\t") df_LGE_par = pd.read_csv(lge_par_path, sep="\t")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_LGE_par.head() df_LGE_par.head()
``` ```
%% Output %% Output
id tome filename \ id tome filename \
0 T1aam-0 T1 aam-0 0 T1aam-0 T1 aam-0
1 T1abaco-0 T1 abaco-0 1 T1abaco-0 T1 abaco-0
2 T1abacot-0 T1 abacot-0 2 T1abacot-0 T1 abacot-0
3 T1abaddon-0 T1 abaddon-0 3 T1abaddon-0 T1 abaddon-0
4 T1abandonnement-0 T1 abandonnement-0 4 T1abandonnement-0 T1 abandonnement-0
content nb_words content nb_words
0 AAM. Mesure de capacité pour les liquides en u... 38 0 AAM. Mesure de capacité pour les liquides en u... 38
1 ABACO, architecte italien du xvi siècle (V. La... 8 1 ABACO, architecte italien du xvi siècle (V. La... 8
2 ABACOT. Double couronne que portaient autrefoi... 33 2 ABACOT. Double couronne que portaient autrefoi... 33
3 ABADDONou APOLYON le Destructeur. « Elles\nava... 109 3 ABADDONou APOLYON le Destructeur. « Elles\nava... 109
4 ABANDONNEMENT. I. Droit civil. — Ce mot est un... 76 4 ABANDONNEMENT. I. Droit civil. — Ce mot est un... 76
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 3.3 EDdA (ARTFL) ### 3.3 EDdA (ARTFL)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
edda_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles.tsv" edda_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_dataset_articles.tsv"
df_EDdA = pd.read_csv(edda_path, sep="\t") df_EDdA = pd.read_csv(edda_path, sep="\t")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_EDdA.head() df_EDdA.head()
``` ```
%% Output %% Output
volume numero head author \ volume numero head author \
0 1 1 Title Page unsigned 0 1 1 Title Page unsigned
1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert 1 1 2 A MONSEIGNEUR LE COMTE D'ARGENSON Diderot & d'Alembert
2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert 2 1 3 DISCOURS PRÉLIMINAIRE DES EDITEURS d'Alembert
3 1 5 A, a & a Dumarsais5 3 1 5 A, a & a Dumarsais5
4 1 6 A Dumarsais5 4 1 6 A Dumarsais5
edda_class enccre_id enccre_class \ edda_class enccre_id enccre_class \
0 unclassified NaN NaN 0 unclassified NaN NaN
1 unclassified NaN NaN 1 unclassified NaN NaN
2 unclassified NaN NaN 2 unclassified NaN NaN
3 Grammaire v1-1-0 Grammaire 3 Grammaire v1-1-0 Grammaire
4 unclassified v1-1-1 Grammaire 4 unclassified v1-1-1 Grammaire
content \ content \
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ... 0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS... 1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n... 2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc... 3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
4 \nA, mot, est 1. la troisieme personne du prés... 4 \nA, mot, est 1. la troisieme personne du prés...
content_without_designant \ content_without_designant \
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ... 0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ...
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS... 1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS...
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n... 2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n\n...
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc... 3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc...
4 \nA, mot, est 1. la troisieme personne du prés... 4 \nA, mot, est 1. la troisieme personne du prés...
first_paragraph nb_words first_paragraph nb_words
0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ... 151 0 \n\nENCYCLOPÉDIE,\nDICTIONNAIRE RAISONNÉ\nDES ... 151
1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS... 208 1 \n\nA MONSEIGNEUR\nLE COMTE D'ARGENSON,\nMINIS... 208
2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n 44669 2 \n\nDISCOURS PRÉLIMINAIRE\nDES EDITEURS.\n\n 44669
3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc... 711 3 \nA, a & a s.m. (ordre Encyclopéd.\nEntend. Sc... 711
4 \nA, mot, est 1. la troisieme personne du prés... 238 4 \nA, mot, est 1. la troisieme personne du prés... 238
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 3.4 EDdA Parallel ### 3.4 EDdA Parallel
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Load the EDdA *parallel* dataset. The path must point at the `_parallel_`
# TSV: `EDdA_dataset_articles.tsv` is the file already loaded into `df_EDdA`
# in section 3.3, so using it here would just reload the same corpus.
edda_par_path = "/Users/lmoncla/Nextcloud-LIRIS/GEODE/GEODE - Partage consortium/Corpus/EDdA/EDdA_parallel_dataset_articles.tsv"
df_EDdA_par = pd.read_csv(edda_par_path, sep="\t")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_EDdA_par.head() df_EDdA_par.head()
``` ```
%% Output
id tome filename \
0 T1article100 T1 article100
1 T1article1016 T1 article1016
2 T1article1036 T1 article1036
3 T1article1038 T1 article1038
4 T1article1039 T1 article1039
content nb_words
0 ABATAGE, s. m. On dit dans un chantier & sur\n... 64
1 AFFILIATION, s. f. (Jurispr.) s'est dit par le... 69
2 AFFLUENT, adj. terme de rivieres, se dit d'une... 50
3 AFFORAGE, s. terme de Droit, qui se prend dans... 83
4 AFFOUAGE, s. terme de Coûtumes, qui signifie l... 27
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 4. Load model and predict ## 4. Load model and predict
### 4.1 Load BERT model ### 4.1 Load BERT model
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
#path = "drive/MyDrive/Classification-EDdA/" #path = "drive/MyDrive/Classification-EDdA/"
path = "../" path = "../"
model_name = "bert-base-multilingual-cased" model_name = "bert-base-multilingual-cased"
model_path = path + "models/model_" + model_name + "_s10000.pt" model_path = path + "models/model_" + model_name + "_s10000.pt"
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
encoder_filename = "models/label_encoder.pkl" encoder_filename = "models/label_encoder.pkl"
with open(path + encoder_filename, 'rb') as file: with open(path + encoder_filename, 'rb') as file:
encoder = pickle.load(file) encoder = pickle.load(file)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
tokenizer = BertTokenizer.from_pretrained(model_name) tokenizer = BertTokenizer.from_pretrained(model_name)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
model = BertForSequenceClassification.from_pretrained(model_path).to(gpu_name) #.to("cuda") model = BertForSequenceClassification.from_pretrained(model_path).to(gpu_name) #.to("cuda")
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 4.2 Prepare datasets ### 4.2 Prepare datasets
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# LGE # LGE
data_loader_LGE = generate_dataloader(tokenizer, df_LGE.content.values) data_loader_LGE = generate_dataloader(tokenizer, df_LGE.content.values)
``` ```
%% Output %% Output
Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors Token indices sequence length is longer than the specified maximum sequence length for this model (1204 > 512). Running this sequence through the model will result in indexing errors
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# LGE parallel # LGE parallel
data_loader_LGE_par = generate_dataloader(tokenizer, df_LGE_par.content.values) data_loader_LGE_par = generate_dataloader(tokenizer, df_LGE_par.content.values)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# EDdA # EDdA
data_loader_EDdA = generate_dataloader(tokenizer, df_EDdA.content.values) data_loader_EDdA = generate_dataloader(tokenizer, df_EDdA.content.values)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# EDdA parallel # EDdA parallel
data_loader_EDdA_par = generate_dataloader(tokenizer, df_EDdA_par.content.values) data_loader_EDdA_par = generate_dataloader(tokenizer, df_EDdA_par.content.values)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 4.3 Predict ### 4.3 Predict
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
pred_LGE = predict(model, data_loader_LGE, device) pred_LGE = predict(model, data_loader_LGE, device)
df_LGE['class_pred'] = list(encoder.inverse_transform(pred_LGE)) df_LGE['class_pred'] = list(encoder.inverse_transform(pred_LGE))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
pred_LGE_par = predict(model, data_loader_LGE_par, device) pred_LGE_par = predict(model, data_loader_LGE_par, device)
df_LGE_par['class_pred'] = list(encoder.inverse_transform(pred_LGE_par)) df_LGE_par['class_pred'] = list(encoder.inverse_transform(pred_LGE_par))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
pred_EDdA = predict(model, data_loader_EDdA, device) pred_EDdA = predict(model, data_loader_EDdA, device)
df_EDdA['class_pred'] = list(encoder.inverse_transform(pred_EDdA)) df_EDdA['class_pred'] = list(encoder.inverse_transform(pred_EDdA))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
pred_EDdA_par = predict(model, data_loader_EDdA_par, device) pred_EDdA_par = predict(model, data_loader_EDdA_par, device)
df_EDdA_par['class_pred'] = list(encoder.inverse_transform(pred_EDdA_par)) df_EDdA_par['class_pred'] = list(encoder.inverse_transform(pred_EDdA_par))
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
df_LGE.head() df_LGE.head()
``` ```
%% Output %% Output
id tome filename \ id tome filename \
0 T1article_1 T1 article_1 0 T1article_1 T1 article_1
1 T1article_10 T1 article_10 1 T1article_10 T1 article_10
2 T1article_100 T1 article_100 2 T1article_100 T1 article_100
3 T1article_1000 T1 article_1000 3 T1article_1000 T1 article_1000
4 T1article_1001 T1 article_1001 4 T1article_1001 T1 article_1001
content nb_words content nb_words
0 F.-Camille DREYFUS, député de la Seine.\n 6 0 F.-Camille DREYFUS, député de la Seine.\n 6
1 quimarque un mouvement en avant de l’esprit hu... 212 1 quimarque un mouvement en avant de l’esprit hu... 212
2 ABACUS. L’abacus ou abaque était un instrument... 1345 2 ABACUS. L’abacus ou abaque était un instrument... 1345
3 H6SS6)\n1780-1793 Choiseul-Goufficr\n1780-1793... 218 3 H6SS6)\n1780-1793 Choiseul-Goufficr\n1780-1793... 218
4 1803Le Brun.\n 2 4 1803Le Brun.\n 2
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
### 4.4 Save ### 4.4 Save
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Export the LGE frame, including the article text, as CSV.
# The separator must be a single character: the previous "\," was a
# two-character string (backslash + comma) and an invalid escape sequence.
filepath = path + "results_LGE/LGE-metadata-withContent.csv"
df_LGE.to_csv(filepath, sep=",")
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Drop the article text in place, then export the remaining columns as CSV.
df_LGE.drop(columns=['content'], inplace=True)
filepath = path + "results_LGE/LGE-metadata.csv"
# The separator must be a single character: the previous "\," was a
# two-character string (backslash + comma) and an invalid escape sequence.
df_LGE.to_csv(filepath, sep=",")
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
## 5. BERT XAI ## 5. BERT XAI
https://www.kaggle.com/code/rizwanhaidar/deep-learning-xai-models-loading-and-predictions https://www.kaggle.com/code/rizwanhaidar/deep-learning-xai-models-loading-and-predictions
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
cls_explainer = SequenceClassificationExplainer( cls_explainer = SequenceClassificationExplainer(
model, model,
tokenizer, tokenizer,
custom_labels=encoder.classes_.tolist() custom_labels=encoder.classes_.tolist()
) )
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
content = df_EDdA.loc[df_EDdA['head']=="LYON"].reset_index().content[0][:512] content = df_EDdA.loc[df_EDdA['head']=="LYON"].reset_index().content[0][:512]
content content
``` ```
%% Output %% Output
"\nLYON, (Géogr.) grande, riche, belle, ancienne\n& celebre ville de France, la plus considérable du\nroyaume après Paris, & la capitale du Lyonnois.\nElle se nomme en latin Lugdunum, Lugudunum, Lugdumum Segusianorum, Lugdumum Celtarum, &c.\nVoyez Lugdunum.\n\nLyon fut fondée l'an de Rome 712, quarante-un\nans avant l'ere chrétienne, par Lucius Munatius\nPlancus, qui étoit consul avec AEmilius Lepidus. Il\nla bâtit sur la Sône, au lieu où cette riviere se jette\ndans le Rhône, & il la peupla des citoyens romains \nqui a" "\nLYON, (Géogr.) grande, riche, belle, ancienne\n& celebre ville de France, la plus considérable du\nroyaume après Paris, & la capitale du Lyonnois.\nElle se nomme en latin Lugdunum, Lugudunum, Lugdumum Segusianorum, Lugdumum Celtarum, &c.\nVoyez Lugdunum.\n\nLyon fut fondée l'an de Rome 712, quarante-un\nans avant l'ere chrétienne, par Lucius Munatius\nPlancus, qui étoit consul avec AEmilius Lepidus. Il\nla bâtit sur la Sône, au lieu où cette riviere se jette\ndans le Rhône, & il la peupla des citoyens romains \nqui a"
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
word_attributions = cls_explainer(content if len(content) < 512 else content[:512]) word_attributions = cls_explainer(content[:512])
word_attributions word_attributions
``` ```
%% Output %% Output
[('[CLS]', 0.0), [('[CLS]', 0.0),
('L', 0.007399733805079844), ('L', 0.007399733805079844),
('##Y', 0.1456759996705617), ('##Y', 0.1456759996705617),
('##ON', 0.14307146561933012), ('##ON', 0.14307146561933012),
(',', 0.19909154256915337), (',', 0.19909154256915337),
('(', 0.09932002907423143), ('(', 0.09932002907423143),
('G', 0.24402357535335403), ('G', 0.24402357535335403),
('##éo', 0.23393328870446992), ('##éo', 0.23393328870446992),
('##gr', 0.1695800465119405), ('##gr', 0.1695800465119405),
('.', 0.14162802579543046), ('.', 0.14162802579543046),
(')', -0.13544847084394057), (')', -0.13544847084394057),
('grande', 0.21832893139528123), ('grande', 0.21832893139528123),
(',', 0.11257940886969105), (',', 0.11257940886969105),
('riche', 0.07913704700022943), ('riche', 0.07913704700022943),
(',', 0.05662853362544685), (',', 0.05662853362544685),
('belle', -0.029909244412604778), ('belle', -0.029909244412604778),
(',', 0.06873738399629244), (',', 0.06873738399629244),
('ancienne', 0.12958979621300132), ('ancienne', 0.12958979621300132),
('&', -0.08418116246612357), ('&', -0.08418116246612357),
('celebre', 0.12947489123965564), ('celebre', 0.12947489123965564),
('ville', 0.478071716663547), ('ville', 0.478071716663547),
('de', 0.03403811335226887), ('de', 0.03403811335226887),
('France', 0.13688799086603975), ('France', 0.13688799086603975),
(',', -0.0010714894154601323), (',', -0.0010714894154601323),
('la', -0.006879341345145134), ('la', -0.006879341345145134),
('plus', 0.05840061099213507), ('plus', 0.05840061099213507),
('con', 0.026165582559808873), ('con', 0.026165582559808873),
('##sid', 0.03255043778254519), ('##sid', 0.03255043778254519),
('##érable', 0.05297839086419718), ('##érable', 0.05297839086419718),
('du', 0.018572791985543135), ('du', 0.018572791985543135),
('royaume', 0.24665610131446675), ('royaume', 0.24665610131446675),
('après', 0.01785470962170739), ('après', 0.01785470962170739),
('Paris', 0.03310146903416289), ('Paris', 0.03310146903416289),
(',', -0.006856821180122214), (',', -0.006856821180122214),
('&', -0.006321268573570221), ('&', -0.006321268573570221),
('la', 0.08253583803987206), ('la', 0.08253583803987206),
('capitale', 0.2983988672217172), ('capitale', 0.2983988672217172),
('du', 0.07376998774114908), ('du', 0.07376998774114908),
('Lyon', 0.04007542253467923), ('Lyon', 0.04007542253467923),
('##noi', 0.02909189419875202), ('##noi', 0.02909189419875202),
('##s', 0.02625525527522554), ('##s', 0.02625525527522554),
('.', 0.0760972913677917), ('.', 0.0760972913677917),
('Elle', 0.0693630173969722), ('Elle', 0.0693630173969722),
('se', 0.04164162356829115), ('se', 0.04164162356829115),
('nomme', 0.10000471924693329), ('nomme', 0.10000471924693329),
('en', 0.03010674205624715), ('en', 0.03010674205624715),
('latin', 0.13303588704102381), ('latin', 0.13303588704102381),
('Lu', 0.0309274199183622), ('Lu', 0.0309274199183622),
('##gd', 0.00518317960511743), ('##gd', 0.00518317960511743),
('##unum', 0.029477331874186236), ('##unum', 0.029477331874186236),
(',', -0.007011581545450849), (',', -0.007011581545450849),
('Lu', 0.008645628419735481), ('Lu', 0.008645628419735481),
('##gu', 0.023084632572130535), ('##gu', 0.023084632572130535),
('##dun', 0.025699022336446258), ('##dun', 0.025699022336446258),
('##um', 0.03484266276127894), ('##um', 0.03484266276127894),
(',', 0.0030423079199119554), (',', 0.0030423079199119554),
('Lu', 0.02366442497712222), ('Lu', 0.02366442497712222),
('##gd', 0.006578965732858923), ('##gd', 0.006578965732858923),
('##umu', 0.03456580806237662), ('##umu', 0.03456580806237662),
('##m', 0.026521004509341334), ('##m', 0.026521004509341334),
('Se', 0.037503453809376), ('Se', 0.037503453809376),
('##gus', 0.03294045015997047), ('##gus', 0.03294045015997047),
('##iano', 0.017089445343453365), ('##iano', 0.017089445343453365),
('##rum', 0.03820084664850618), ('##rum', 0.03820084664850618),
(',', -0.011642202072501788), (',', -0.011642202072501788),
('Lu', 0.02694302543021504), ('Lu', 0.02694302543021504),
('##gd', 0.005721331572683938), ('##gd', 0.005721331572683938),
('##umu', 0.03658546160187376), ('##umu', 0.03658546160187376),
('##m', 0.03770363967219936), ('##m', 0.03770363967219936),
('Cel', 0.014027086848242715), ('Cel', 0.014027086848242715),
('##tar', 0.025188870477124894), ('##tar', 0.025188870477124894),
('##um', 0.039922520378568), ('##um', 0.039922520378568),
(',', 0.022664305461904344), (',', 0.022664305461904344),
('&', -0.0494132018474461), ('&', -0.0494132018474461),
('c', 0.0403850871592572), ('c', 0.0403850871592572),
('.', 0.04544699824023643), ('.', 0.04544699824023643),
('Vo', 0.02935262786796574), ('Vo', 0.02935262786796574),
('##ye', 0.03940461731845493), ('##ye', 0.03940461731845493),
('##z', 0.029871874749211054), ('##z', 0.029871874749211054),
('Lu', 0.026337930390794705), ('Lu', 0.026337930390794705),
('##gd', -0.00442376201350928), ('##gd', -0.00442376201350928),
('##unum', 0.025898349689579492), ('##unum', 0.025898349689579492),
('.', 0.06146632041097513), ('.', 0.06146632041097513),
('Lyon', 0.022692171217471906), ('Lyon', 0.022692171217471906),
('fut', 0.0438798787047486), ('fut', 0.0438798787047486),
('fondée', 0.04704211890403151), ('fondée', 0.04704211890403151),
('l', 0.0451974674122074), ('l', 0.0451974674122074),
("'", 0.06774875716439344), ("'", 0.06774875716439344),
('an', 0.023585319400848195), ('an', 0.023585319400848195),
('de', 0.021013220187771894), ('de', 0.021013220187771894),
('Rome', 0.03688032185991681), ('Rome', 0.03688032185991681),
('712', 0.054693452829347115), ('712', 0.054693452829347115),
(',', 0.007825484996566502), (',', 0.007825484996566502),
('quarante', 0.028231791558966633), ('quarante', 0.028231791558966633),
('-', -0.0003846539976056082), ('-', -0.0003846539976056082),
('un', 0.03117251985912735), ('un', 0.03117251985912735),
('ans', 0.014472180695321534), ('ans', 0.014472180695321534),
('avant', 0.017626577836139475), ('avant', 0.017626577836139475),
('l', 0.04620483463390136), ('l', 0.04620483463390136),
("'", 0.05723696778164145), ("'", 0.05723696778164145),
('ere', 0.007590037219544403), ('ere', 0.007590037219544403),
('chrétienne', 0.1311835388990743), ('chrétienne', 0.1311835388990743),
(',', 0.013178253982938232), (',', 0.013178253982938232),
('par', 0.007011176299182855), ('par', 0.007011176299182855),
('Lucius', 0.0044462351021057325), ('Lucius', 0.0044462351021057325),
('Mu', 0.006340399133187405), ('Mu', 0.006340399133187405),
('##nati', 0.01752347206998558), ('##nati', 0.01752347206998558),
('##us', 0.015200983089939281), ('##us', 0.015200983089939281),
('Plan', 0.030820184404097863), ('Plan', 0.030820184404097863),
('##cus', 0.01828726599412002), ('##cus', 0.01828726599412002),
(',', -0.008296981653008715), (',', -0.008296981653008715),
('qui', 0.03778469886529954), ('qui', 0.03778469886529954),
('é', 0.024089543382319098), ('é', 0.024089543382319098),
('##toi', 0.01483008688193065), ('##toi', 0.01483008688193065),
('##t', 0.0036622619849812073), ('##t', 0.0036622619849812073),
('consul', 0.006312700914285012), ('consul', 0.006312700914285012),
('avec', 0.019039309232488966), ('avec', 0.019039309232488966),
('AE', -0.010990138859793724), ('AE', -0.010990138859793724),
('##mil', 0.02963200210194755), ('##mil', 0.02963200210194755),
('##ius', -0.00048531039895657175), ('##ius', -0.00048531039895657175),
('Le', 0.016166723086828174), ('Le', 0.016166723086828174),
('##pid', 0.01787476167297771), ('##pid', 0.01787476167297771),
('##us', 0.020667475964218647), ('##us', 0.020667475964218647),
('.', 0.029180628239546275), ('.', 0.029180628239546275),
('Il', 0.021736540370470812), ('Il', 0.021736540370470812),
('la', 0.03714972247323993), ('la', 0.03714972247323993),
('b', 0.042870227388534604), ('b', 0.042870227388534604),
('##ât', 0.011332787999157318), ('##ât', 0.011332787999157318),
('##it', 0.009763016011555254), ('##it', 0.009763016011555254),
('sur', 0.04260004363332922), ('sur', 0.04260004363332922),
('la', 0.04410484491168233), ('la', 0.04410484491168233),
('S', 0.06575101714951456), ('S', 0.06575101714951456),
('##ôn', 0.011241165099203603), ('##ôn', 0.011241165099203603),
('##e', 0.010537012868472688), ('##e', 0.010537012868472688),
(',', -0.0062768408260973066), (',', -0.0062768408260973066),
('au', -0.0018304190363696647), ('au', -0.0018304190363696647),
('lieu', 0.13020947076813982), ('lieu', 0.13020947076813982),
('où', 0.013157964330803138), ('où', 0.013157964330803138),
('cette', 0.03737564027887762), ('cette', 0.03737564027887762),
('rivier', 0.11169096058453537), ('rivier', 0.11169096058453537),
('##e', 0.04378867745175019), ('##e', 0.04378867745175019),
('se', 0.0058271154715995995), ('se', 0.0058271154715995995),
('jet', 0.031575857152632385), ('jet', 0.031575857152632385),
('##te', 0.020236291895152022), ('##te', 0.020236291895152022),
('dans', 0.01852231748257226), ('dans', 0.01852231748257226),
('le', 0.03105610182850656), ('le', 0.03105610182850656),
('Rhône', 0.07226400802922804), ('Rhône', 0.07226400802922804),
(',', 0.02919256859997905), (',', 0.02919256859997905),
('&', -0.038176803729996794), ('&', -0.038176803729996794),
('il', 0.00609352197030786), ('il', 0.00609352197030786),
('la', 0.04198219592000479), ('la', 0.04198219592000479),
('peu', 0.040981027718879084), ('peu', 0.040981027718879084),
('##pla', 0.005560350755837545), ('##pla', 0.005560350755837545),
('des', 0.028996120278423045), ('des', 0.028996120278423045),
('citoyens', -0.03278504989463669), ('citoyens', -0.03278504989463669),
('romain', -0.008083189911088765), ('romain', -0.008083189911088765),
('##s', 0.01484737615013025), ('##s', 0.01484737615013025),
('qui', 0.05804189959646576), ('qui', 0.05804189959646576),
('a', -0.022083265525204197), ('a', -0.022083265525204197),
('[SEP]', 0.0)] ('[SEP]', 0.0)]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Display the class name the explainer reports for the text it last explained
# (the cell output for this run is 'Géographie').
cls_explainer.predicted_class_name
``` ```
%% Output %% Output
'Géographie' 'Géographie'
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Render the explainer's visualisation; in the notebook this yields an
# IPython HTML object (see the cell output).
cls_explainer.visualize()
``` ```
%% Output %% Output
<IPython.core.display.HTML object> <IPython.core.display.HTML object>
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
* récupérer les mots positifs par domaine (EDdA et LGE) * récupérer les mots positifs par domaine (EDdA et LGE)
* faire des nuages de mots et comparer les plus fréquents entre EDdA et LGE (corpus parallèle) * faire des nuages de mots et comparer les plus fréquents entre EDdA et LGE (corpus parallèle)
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Pour chaque domaine :
pour chaque token : agréger les scores par la moyenne (ou la somme ?) ; ne conserver que les scores au-dessus de 0.1 ?
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Sort the (token, score) pairs in place, highest score first.
# NOTE(review): this reorders the very list object held in `word_attributions`;
# if the explainer keeps a reference to it, later calls may see the sorted
# order -- confirm.
word_attributions.sort(key=lambda a: a[1], reverse = True)
# Display the sorted list as the cell output.
word_attributions
``` ```
%% Output %% Output
[('ville', 0.478071716663547), [('ville', 0.478071716663547),
('capitale', 0.2983988672217172), ('capitale', 0.2983988672217172),
('royaume', 0.24665610131446675), ('royaume', 0.24665610131446675),
('G', 0.24402357535335403), ('G', 0.24402357535335403),
('##éo', 0.23393328870446992), ('##éo', 0.23393328870446992),
('grande', 0.21832893139528123), ('grande', 0.21832893139528123),
(',', 0.19909154256915337), (',', 0.19909154256915337),
('##gr', 0.1695800465119405), ('##gr', 0.1695800465119405),
('##Y', 0.1456759996705617), ('##Y', 0.1456759996705617),
('##ON', 0.14307146561933012), ('##ON', 0.14307146561933012),
('.', 0.14162802579543046), ('.', 0.14162802579543046),
('France', 0.13688799086603975), ('France', 0.13688799086603975),
('latin', 0.13303588704102381), ('latin', 0.13303588704102381),
('chrétienne', 0.1311835388990743), ('chrétienne', 0.1311835388990743),
('lieu', 0.13020947076813982), ('lieu', 0.13020947076813982),
('ancienne', 0.12958979621300132), ('ancienne', 0.12958979621300132),
('celebre', 0.12947489123965564), ('celebre', 0.12947489123965564),
(',', 0.11257940886969105), (',', 0.11257940886969105),
('rivier', 0.11169096058453537), ('rivier', 0.11169096058453537),
('nomme', 0.10000471924693329), ('nomme', 0.10000471924693329),
('(', 0.09932002907423143), ('(', 0.09932002907423143),
('la', 0.08253583803987206), ('la', 0.08253583803987206),
('riche', 0.07913704700022943), ('riche', 0.07913704700022943),
('.', 0.0760972913677917), ('.', 0.0760972913677917),
('du', 0.07376998774114908), ('du', 0.07376998774114908),
('Rhône', 0.07226400802922804), ('Rhône', 0.07226400802922804),
('Elle', 0.0693630173969722), ('Elle', 0.0693630173969722),
(',', 0.06873738399629244), (',', 0.06873738399629244),
("'", 0.06774875716439344), ("'", 0.06774875716439344),
('S', 0.06575101714951456), ('S', 0.06575101714951456),
('.', 0.06146632041097513), ('.', 0.06146632041097513),
('plus', 0.05840061099213507), ('plus', 0.05840061099213507),
('qui', 0.05804189959646576), ('qui', 0.05804189959646576),
("'", 0.05723696778164145), ("'", 0.05723696778164145),
(',', 0.05662853362544685), (',', 0.05662853362544685),
('712', 0.054693452829347115), ('712', 0.054693452829347115),
('##érable', 0.05297839086419718), ('##érable', 0.05297839086419718),
('fondée', 0.04704211890403151), ('fondée', 0.04704211890403151),
('l', 0.04620483463390136), ('l', 0.04620483463390136),
('.', 0.04544699824023643), ('.', 0.04544699824023643),
('l', 0.0451974674122074), ('l', 0.0451974674122074),
('la', 0.04410484491168233), ('la', 0.04410484491168233),
('fut', 0.0438798787047486), ('fut', 0.0438798787047486),
('##e', 0.04378867745175019), ('##e', 0.04378867745175019),
('b', 0.042870227388534604), ('b', 0.042870227388534604),
('sur', 0.04260004363332922), ('sur', 0.04260004363332922),
('la', 0.04198219592000479), ('la', 0.04198219592000479),
('se', 0.04164162356829115), ('se', 0.04164162356829115),
('peu', 0.040981027718879084), ('peu', 0.040981027718879084),
('c', 0.0403850871592572), ('c', 0.0403850871592572),
('Lyon', 0.04007542253467923), ('Lyon', 0.04007542253467923),
('##um', 0.039922520378568), ('##um', 0.039922520378568),
('##ye', 0.03940461731845493), ('##ye', 0.03940461731845493),
('##rum', 0.03820084664850618), ('##rum', 0.03820084664850618),
('qui', 0.03778469886529954), ('qui', 0.03778469886529954),
('##m', 0.03770363967219936), ('##m', 0.03770363967219936),
('Se', 0.037503453809376), ('Se', 0.037503453809376),
('cette', 0.03737564027887762), ('cette', 0.03737564027887762),
('la', 0.03714972247323993), ('la', 0.03714972247323993),
('Rome', 0.03688032185991681), ('Rome', 0.03688032185991681),
('##umu', 0.03658546160187376), ('##umu', 0.03658546160187376),
('##um', 0.03484266276127894), ('##um', 0.03484266276127894),
('##umu', 0.03456580806237662), ('##umu', 0.03456580806237662),
('de', 0.03403811335226887), ('de', 0.03403811335226887),
('Paris', 0.03310146903416289), ('Paris', 0.03310146903416289),
('##gus', 0.03294045015997047), ('##gus', 0.03294045015997047),
('##sid', 0.03255043778254519), ('##sid', 0.03255043778254519),
('jet', 0.031575857152632385), ('jet', 0.031575857152632385),
('un', 0.03117251985912735), ('un', 0.03117251985912735),
('le', 0.03105610182850656), ('le', 0.03105610182850656),
('Lu', 0.0309274199183622), ('Lu', 0.0309274199183622),
('Plan', 0.030820184404097863), ('Plan', 0.030820184404097863),
('en', 0.03010674205624715), ('en', 0.03010674205624715),
('##z', 0.029871874749211054), ('##z', 0.029871874749211054),
('##mil', 0.02963200210194755), ('##mil', 0.02963200210194755),
('##unum', 0.029477331874186236), ('##unum', 0.029477331874186236),
('Vo', 0.02935262786796574), ('Vo', 0.02935262786796574),
(',', 0.02919256859997905), (',', 0.02919256859997905),
('.', 0.029180628239546275), ('.', 0.029180628239546275),
('##noi', 0.02909189419875202), ('##noi', 0.02909189419875202),
('des', 0.028996120278423045), ('des', 0.028996120278423045),
('quarante', 0.028231791558966633), ('quarante', 0.028231791558966633),
('Lu', 0.02694302543021504), ('Lu', 0.02694302543021504),
('##m', 0.026521004509341334), ('##m', 0.026521004509341334),
('Lu', 0.026337930390794705), ('Lu', 0.026337930390794705),
('##s', 0.02625525527522554), ('##s', 0.02625525527522554),
('con', 0.026165582559808873), ('con', 0.026165582559808873),
('##unum', 0.025898349689579492), ('##unum', 0.025898349689579492),
('##dun', 0.025699022336446258), ('##dun', 0.025699022336446258),
('##tar', 0.025188870477124894), ('##tar', 0.025188870477124894),
('é', 0.024089543382319098), ('é', 0.024089543382319098),
('Lu', 0.02366442497712222), ('Lu', 0.02366442497712222),
('an', 0.023585319400848195), ('an', 0.023585319400848195),
('##gu', 0.023084632572130535), ('##gu', 0.023084632572130535),
('Lyon', 0.022692171217471906), ('Lyon', 0.022692171217471906),
(',', 0.022664305461904344), (',', 0.022664305461904344),
('Il', 0.021736540370470812), ('Il', 0.021736540370470812),
('de', 0.021013220187771894), ('de', 0.021013220187771894),
('##us', 0.020667475964218647), ('##us', 0.020667475964218647),
('##te', 0.020236291895152022), ('##te', 0.020236291895152022),
('avec', 0.019039309232488966), ('avec', 0.019039309232488966),
('du', 0.018572791985543135), ('du', 0.018572791985543135),
('dans', 0.01852231748257226), ('dans', 0.01852231748257226),
('##cus', 0.01828726599412002), ('##cus', 0.01828726599412002),
('##pid', 0.01787476167297771), ('##pid', 0.01787476167297771),
('après', 0.01785470962170739), ('après', 0.01785470962170739),
('avant', 0.017626577836139475), ('avant', 0.017626577836139475),
('##nati', 0.01752347206998558), ('##nati', 0.01752347206998558),
('##iano', 0.017089445343453365), ('##iano', 0.017089445343453365),
('Le', 0.016166723086828174), ('Le', 0.016166723086828174),
('##us', 0.015200983089939281), ('##us', 0.015200983089939281),
('##s', 0.01484737615013025), ('##s', 0.01484737615013025),
('##toi', 0.01483008688193065), ('##toi', 0.01483008688193065),
('ans', 0.014472180695321534), ('ans', 0.014472180695321534),
('Cel', 0.014027086848242715), ('Cel', 0.014027086848242715),
(',', 0.013178253982938232), (',', 0.013178253982938232),
('où', 0.013157964330803138), ('où', 0.013157964330803138),
('##ât', 0.011332787999157318), ('##ât', 0.011332787999157318),
('##ôn', 0.011241165099203603), ('##ôn', 0.011241165099203603),
('##e', 0.010537012868472688), ('##e', 0.010537012868472688),
('##it', 0.009763016011555254), ('##it', 0.009763016011555254),
('Lu', 0.008645628419735481), ('Lu', 0.008645628419735481),
(',', 0.007825484996566502), (',', 0.007825484996566502),
('ere', 0.007590037219544403), ('ere', 0.007590037219544403),
('L', 0.007399733805079844), ('L', 0.007399733805079844),
('par', 0.007011176299182855), ('par', 0.007011176299182855),
('##gd', 0.006578965732858923), ('##gd', 0.006578965732858923),
('Mu', 0.006340399133187405), ('Mu', 0.006340399133187405),
('consul', 0.006312700914285012), ('consul', 0.006312700914285012),
('il', 0.00609352197030786), ('il', 0.00609352197030786),
('se', 0.0058271154715995995), ('se', 0.0058271154715995995),
('##gd', 0.005721331572683938), ('##gd', 0.005721331572683938),
('##pla', 0.005560350755837545), ('##pla', 0.005560350755837545),
('##gd', 0.00518317960511743), ('##gd', 0.00518317960511743),
('Lucius', 0.0044462351021057325), ('Lucius', 0.0044462351021057325),
('##t', 0.0036622619849812073), ('##t', 0.0036622619849812073),
(',', 0.0030423079199119554), (',', 0.0030423079199119554),
('[CLS]', 0.0), ('[CLS]', 0.0),
('[SEP]', 0.0), ('[SEP]', 0.0),
('-', -0.0003846539976056082), ('-', -0.0003846539976056082),
('##ius', -0.00048531039895657175), ('##ius', -0.00048531039895657175),
(',', -0.0010714894154601323), (',', -0.0010714894154601323),
('au', -0.0018304190363696647), ('au', -0.0018304190363696647),
('##gd', -0.00442376201350928), ('##gd', -0.00442376201350928),
(',', -0.0062768408260973066), (',', -0.0062768408260973066),
('&', -0.006321268573570221), ('&', -0.006321268573570221),
(',', -0.006856821180122214), (',', -0.006856821180122214),
('la', -0.006879341345145134), ('la', -0.006879341345145134),
(',', -0.007011581545450849), (',', -0.007011581545450849),
('romain', -0.008083189911088765), ('romain', -0.008083189911088765),
(',', -0.008296981653008715), (',', -0.008296981653008715),
('AE', -0.010990138859793724), ('AE', -0.010990138859793724),
(',', -0.011642202072501788), (',', -0.011642202072501788),
('a', -0.022083265525204197), ('a', -0.022083265525204197),
('belle', -0.029909244412604778), ('belle', -0.029909244412604778),
('citoyens', -0.03278504989463669), ('citoyens', -0.03278504989463669),
('&', -0.038176803729996794), ('&', -0.038176803729996794),
('&', -0.0494132018474461), ('&', -0.0494132018474461),
('&', -0.08418116246612357), ('&', -0.08418116246612357),
(')', -0.13544847084394057)] (')', -0.13544847084394057)]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Empty dict; it is re-created from scratch by the per-domain aggregation
# cell further down.
d = {}
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Turn the (token, score) pairs into a dict keyed by token.
# A token that occurs several times collapses to a single key: it keeps the
# position of its first occurrence and the value of the last pair seen.
# Because the list was sorted by descending score just before, the value kept
# is the token's lowest score (e.g. ',' ends up at -0.0116 in the output).
d_tmp = dict(word_attributions)
# Display the resulting dict as the cell output.
d_tmp
``` ```
%% Output %% Output
{'ville': 0.478071716663547, {'ville': 0.478071716663547,
'capitale': 0.2983988672217172, 'capitale': 0.2983988672217172,
'royaume': 0.24665610131446675, 'royaume': 0.24665610131446675,
'G': 0.24402357535335403, 'G': 0.24402357535335403,
'##éo': 0.23393328870446992, '##éo': 0.23393328870446992,
'grande': 0.21832893139528123, 'grande': 0.21832893139528123,
',': -0.011642202072501788, ',': -0.011642202072501788,
'##gr': 0.1695800465119405, '##gr': 0.1695800465119405,
'##Y': 0.1456759996705617, '##Y': 0.1456759996705617,
'##ON': 0.14307146561933012, '##ON': 0.14307146561933012,
'.': 0.029180628239546275, '.': 0.029180628239546275,
'France': 0.13688799086603975, 'France': 0.13688799086603975,
'latin': 0.13303588704102381, 'latin': 0.13303588704102381,
'chrétienne': 0.1311835388990743, 'chrétienne': 0.1311835388990743,
'lieu': 0.13020947076813982, 'lieu': 0.13020947076813982,
'ancienne': 0.12958979621300132, 'ancienne': 0.12958979621300132,
'celebre': 0.12947489123965564, 'celebre': 0.12947489123965564,
'rivier': 0.11169096058453537, 'rivier': 0.11169096058453537,
'nomme': 0.10000471924693329, 'nomme': 0.10000471924693329,
'(': 0.09932002907423143, '(': 0.09932002907423143,
'la': -0.006879341345145134, 'la': -0.006879341345145134,
'riche': 0.07913704700022943, 'riche': 0.07913704700022943,
'du': 0.018572791985543135, 'du': 0.018572791985543135,
'Rhône': 0.07226400802922804, 'Rhône': 0.07226400802922804,
'Elle': 0.0693630173969722, 'Elle': 0.0693630173969722,
"'": 0.05723696778164145, "'": 0.05723696778164145,
'S': 0.06575101714951456, 'S': 0.06575101714951456,
'plus': 0.05840061099213507, 'plus': 0.05840061099213507,
'qui': 0.03778469886529954, 'qui': 0.03778469886529954,
'712': 0.054693452829347115, '712': 0.054693452829347115,
'##érable': 0.05297839086419718, '##érable': 0.05297839086419718,
'fondée': 0.04704211890403151, 'fondée': 0.04704211890403151,
'l': 0.0451974674122074, 'l': 0.0451974674122074,
'fut': 0.0438798787047486, 'fut': 0.0438798787047486,
'##e': 0.010537012868472688, '##e': 0.010537012868472688,
'b': 0.042870227388534604, 'b': 0.042870227388534604,
'sur': 0.04260004363332922, 'sur': 0.04260004363332922,
'se': 0.0058271154715995995, 'se': 0.0058271154715995995,
'peu': 0.040981027718879084, 'peu': 0.040981027718879084,
'c': 0.0403850871592572, 'c': 0.0403850871592572,
'Lyon': 0.022692171217471906, 'Lyon': 0.022692171217471906,
'##um': 0.03484266276127894, '##um': 0.03484266276127894,
'##ye': 0.03940461731845493, '##ye': 0.03940461731845493,
'##rum': 0.03820084664850618, '##rum': 0.03820084664850618,
'##m': 0.026521004509341334, '##m': 0.026521004509341334,
'Se': 0.037503453809376, 'Se': 0.037503453809376,
'cette': 0.03737564027887762, 'cette': 0.03737564027887762,
'Rome': 0.03688032185991681, 'Rome': 0.03688032185991681,
'##umu': 0.03456580806237662, '##umu': 0.03456580806237662,
'de': 0.021013220187771894, 'de': 0.021013220187771894,
'Paris': 0.03310146903416289, 'Paris': 0.03310146903416289,
'##gus': 0.03294045015997047, '##gus': 0.03294045015997047,
'##sid': 0.03255043778254519, '##sid': 0.03255043778254519,
'jet': 0.031575857152632385, 'jet': 0.031575857152632385,
'un': 0.03117251985912735, 'un': 0.03117251985912735,
'le': 0.03105610182850656, 'le': 0.03105610182850656,
'Lu': 0.008645628419735481, 'Lu': 0.008645628419735481,
'Plan': 0.030820184404097863, 'Plan': 0.030820184404097863,
'en': 0.03010674205624715, 'en': 0.03010674205624715,
'##z': 0.029871874749211054, '##z': 0.029871874749211054,
'##mil': 0.02963200210194755, '##mil': 0.02963200210194755,
'##unum': 0.025898349689579492, '##unum': 0.025898349689579492,
'Vo': 0.02935262786796574, 'Vo': 0.02935262786796574,
'##noi': 0.02909189419875202, '##noi': 0.02909189419875202,
'des': 0.028996120278423045, 'des': 0.028996120278423045,
'quarante': 0.028231791558966633, 'quarante': 0.028231791558966633,
'##s': 0.01484737615013025, '##s': 0.01484737615013025,
'con': 0.026165582559808873, 'con': 0.026165582559808873,
'##dun': 0.025699022336446258, '##dun': 0.025699022336446258,
'##tar': 0.025188870477124894, '##tar': 0.025188870477124894,
'é': 0.024089543382319098, 'é': 0.024089543382319098,
'an': 0.023585319400848195, 'an': 0.023585319400848195,
'##gu': 0.023084632572130535, '##gu': 0.023084632572130535,
'Il': 0.021736540370470812, 'Il': 0.021736540370470812,
'##us': 0.015200983089939281, '##us': 0.015200983089939281,
'##te': 0.020236291895152022, '##te': 0.020236291895152022,
'avec': 0.019039309232488966, 'avec': 0.019039309232488966,
'dans': 0.01852231748257226, 'dans': 0.01852231748257226,
'##cus': 0.01828726599412002, '##cus': 0.01828726599412002,
'##pid': 0.01787476167297771, '##pid': 0.01787476167297771,
'après': 0.01785470962170739, 'après': 0.01785470962170739,
'avant': 0.017626577836139475, 'avant': 0.017626577836139475,
'##nati': 0.01752347206998558, '##nati': 0.01752347206998558,
'##iano': 0.017089445343453365, '##iano': 0.017089445343453365,
'Le': 0.016166723086828174, 'Le': 0.016166723086828174,
'##toi': 0.01483008688193065, '##toi': 0.01483008688193065,
'ans': 0.014472180695321534, 'ans': 0.014472180695321534,
'Cel': 0.014027086848242715, 'Cel': 0.014027086848242715,
'où': 0.013157964330803138, 'où': 0.013157964330803138,
'##ât': 0.011332787999157318, '##ât': 0.011332787999157318,
'##ôn': 0.011241165099203603, '##ôn': 0.011241165099203603,
'##it': 0.009763016011555254, '##it': 0.009763016011555254,
'ere': 0.007590037219544403, 'ere': 0.007590037219544403,
'L': 0.007399733805079844, 'L': 0.007399733805079844,
'par': 0.007011176299182855, 'par': 0.007011176299182855,
'##gd': -0.00442376201350928, '##gd': -0.00442376201350928,
'Mu': 0.006340399133187405, 'Mu': 0.006340399133187405,
'consul': 0.006312700914285012, 'consul': 0.006312700914285012,
'il': 0.00609352197030786, 'il': 0.00609352197030786,
'##pla': 0.005560350755837545, '##pla': 0.005560350755837545,
'Lucius': 0.0044462351021057325, 'Lucius': 0.0044462351021057325,
'##t': 0.0036622619849812073, '##t': 0.0036622619849812073,
'[CLS]': 0.0, '[CLS]': 0.0,
'[SEP]': 0.0, '[SEP]': 0.0,
'-': -0.0003846539976056082, '-': -0.0003846539976056082,
'##ius': -0.00048531039895657175, '##ius': -0.00048531039895657175,
'au': -0.0018304190363696647, 'au': -0.0018304190363696647,
'&': -0.08418116246612357, '&': -0.08418116246612357,
'romain': -0.008083189911088765, 'romain': -0.008083189911088765,
'AE': -0.010990138859793724, 'AE': -0.010990138859793724,
'a': -0.022083265525204197, 'a': -0.022083265525204197,
'belle': -0.029909244412604778, 'belle': -0.029909244412604778,
'citoyens': -0.03278504989463669, 'citoyens': -0.03278504989463669,
')': -0.13544847084394057} ')': -0.13544847084394057}
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# List the distinct values of the `enccre_class` column; the output shows the
# domain labels plus NaN.
df_EDdA.enccre_class.unique()
``` ```
%% Output %% Output
array([nan, 'Grammaire', 'Médailles', 'Histoire', 'Caractères', array([nan, 'Grammaire', 'Médailles', 'Histoire', 'Caractères',
'Philosophie', 'Chimie', 'Médecine - Chirurgie', 'Commerce', 'Philosophie', 'Chimie', 'Médecine - Chirurgie', 'Commerce',
'Géographie', 'Mathématiques', 'Histoire naturelle', 'Géographie', 'Mathématiques', 'Histoire naturelle',
'Architecture', 'Blason', 'Agriculture - Economie rustique', 'Architecture', 'Blason', 'Agriculture - Economie rustique',
'Chasse', 'Métiers', 'Anatomie', 'Droit - Jurisprudence', 'Chasse', 'Métiers', 'Anatomie', 'Droit - Jurisprudence',
'Antiquité', 'Marine', 'Militaire (Art) - Guerre - Arme', 'Antiquité', 'Marine', 'Militaire (Art) - Guerre - Arme',
'Economie domestique', 'Maréchage - Manège', 'Jeu', 'Monnaie', 'Economie domestique', 'Maréchage - Manège', 'Jeu', 'Monnaie',
'Physique - [Sciences physico-mathématiques]', 'Religion', 'Pêche', 'Physique - [Sciences physico-mathématiques]', 'Religion', 'Pêche',
'Pharmacie', 'Musique', 'Beaux-arts', 'Spectacle', 'Politique', 'Pharmacie', 'Musique', 'Beaux-arts', 'Spectacle', 'Politique',
'Mesure', 'Belles-lettres - Poésie', 'Superstition', 'Mesure', 'Belles-lettres - Poésie', 'Superstition',
'Arts et métiers', 'Minéralogie'], dtype=object) 'Arts et métiers', 'Minéralogie'], dtype=object)
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def get_dict_xai(word_attributions):
    """Return a token -> score dict built from (token, score) pairs.

    The pairs are ordered by descending score before the dict is built, so
    the dict's keys follow that order (by first occurrence of each token).

    When a token appears more than once, only one entry is kept: the key
    stays at the position of the token's highest score, but the stored value
    is the one from the last pair in sorted order, i.e. the token's lowest
    score.

    Args:
        word_attributions: iterable of (token, score) pairs.

    Returns:
        dict mapping each distinct token to a single score.
    """
    # sorted() builds a new list, so the caller's list is left untouched
    # (the previous in-place list.sort() reordered the argument itself).
    ranked = sorted(word_attributions, key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Work on a copy of the EDdA dataframe; the following cells rebind `df`.
df = df_EDdA.copy()
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Keep a random subset of 100 rows (presumably to keep the explanation run
# short -- confirm).
# NOTE(review): no random_state is passed, so the subset, and every score
# aggregated from it, differs from one run to the next.
df = df.sample(100)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Build d[domain][token] = mean attribution score of the token over the
# articles of that domain.
d = {}
# Loop over the domains (one group per `enccre_class` value).
for group_name, df_group in df.groupby('enccre_class'):
    # Per-token sum and number of articles contributing to it. Accumulating
    # both gives a true arithmetic mean; the former `(old + v) / 2` update
    # halved the weight of all earlier articles at every step, so the result
    # depended on article order and was dominated by the last articles.
    totals = {}
    counts = {}
    # Loop over the articles of the domain.
    for index, row in tqdm(df_group.iterrows()):
        # Slicing already returns the whole text when it is shorter than 512.
        # NOTE(review): this limits the number of characters, not of tokens --
        # confirm this is the intended way to stay within the model's input
        # size.
        word_attributions = cls_explainer(row['content'][:512])
        # One score per distinct token for this article (see get_dict_xai).
        d_tmp = get_dict_xai(word_attributions)
        for k, v in d_tmp.items():
            totals[k] = totals.get(k, 0.0) + v
            counts[k] = counts.get(k, 0) + 1
    # Token order follows first appearance across the domain's articles.
    d[group_name] = {k: totals[k] / counts[k] for k in totals}
``` ```
%% Output %% Output
2it [00:05, 2.71s/it] 2it [00:05, 2.71s/it]
3it [00:13, 4.63s/it] 3it [00:13, 4.63s/it]
3it [00:10, 3.66s/it] 3it [00:10, 3.66s/it]
2it [00:05, 2.57s/it] 2it [00:05, 2.57s/it]
1it [00:01, 1.19s/it] 1it [00:01, 1.19s/it]
1it [00:02, 2.46s/it] 1it [00:02, 2.46s/it]
1it [00:01, 1.73s/it] 1it [00:01, 1.73s/it]
1it [00:03, 3.13s/it] 1it [00:03, 3.13s/it]
2it [00:09, 4.66s/it] 2it [00:09, 4.66s/it]
11it [00:21, 1.99s/it] 11it [00:21, 1.99s/it]
1it [00:01, 1.41s/it] 1it [00:01, 1.41s/it]
16it [00:47, 2.97s/it] 16it [00:47, 2.97s/it]
2it [00:06, 3.33s/it] 2it [00:06, 3.33s/it]
10it [00:40, 4.09s/it] 10it [00:40, 4.09s/it]
3it [00:06, 2.25s/it] 3it [00:06, 2.25s/it]
1it [00:01, 1.14s/it] 1it [00:01, 1.14s/it]
1it [00:01, 1.60s/it] 1it [00:01, 1.60s/it]
2it [00:06, 3.00s/it] 2it [00:06, 3.00s/it]
1it [00:01, 1.60s/it] 1it [00:01, 1.60s/it]
1it [00:02, 2.46s/it] 1it [00:02, 2.46s/it]
1it [00:00, 1.30it/s] 1it [00:00, 1.30it/s]
4it [00:15, 3.97s/it] 4it [00:15, 3.97s/it]
4it [00:13, 3.41s/it] 4it [00:13, 3.41s/it]
2it [00:07, 3.59s/it] 2it [00:07, 3.59s/it]
1it [00:04, 4.69s/it] 1it [00:04, 4.69s/it]
1it [00:00, 1.24it/s] 1it [00:00, 1.24it/s]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
d['Géographie'] d['Géographie']
``` ```
%% Output %% Output
{'ville': 0.468205314323041, {'ville': 0.468205314323041,
'fleuve': 0.2423772411435995, 'fleuve': 0.2423772411435995,
'G': 0.23135720218825717, 'G': 0.23135720218825717,
',': 0.0858046931906998, ',': 0.0858046931906998,
'nommée': 0.19227297918896974, 'nommée': 0.19227297918896974,
'##éo': 0.18973738733606565, '##éo': 0.18973738733606565,
'##GE': 0.16634948535987606, '##GE': 0.16634948535987606,
'##A': 0.15877744098706795, '##A': 0.15877744098706795,
'##gr': 0.16202194372565118, '##gr': 0.16202194372565118,
'##TI': 0.14030883450630402, '##TI': 0.14030883450630402,
'.': 0.054787505866608105, '.': 0.054787505866608105,
'##c': 0.0948633585054219, '##c': 0.0948633585054219,
'(': 0.0413040601926874, '(': 0.0413040601926874,
'depuis': 0.1051878315897289, 'depuis': 0.1051878315897289,
'nom': 0.18136152655341967, 'nom': 0.18136152655341967,
'se': 0.08786898558747075, 'se': 0.08786898558747075,
'##ce': 0.09317071188401634, '##ce': 0.09317071188401634,
'##ia': 0.031040617643215574, '##ia': 0.031040617643215574,
'Allemands': 0.0787047715309721, 'Allemands': 0.0787047715309721,
'Ce': 0.06555545762824336, 'Ce': 0.06555545762824336,
'est': 0.074775644174404, 'est': 0.074775644174404,
'romaine': 0.07262679757314543, 'romaine': 0.07262679757314543,
'roi': 0.0716659059875927, 'roi': 0.0716659059875927,
'Da': 0.0714045742494807, 'Da': 0.0714045742494807,
'##é': 0.02500576025356923, '##é': 0.02500576025356923,
'la': 0.12298598711273313, 'la': 0.12298598711273313,
'mais': 0.05591848646995751, 'mais': 0.05591848646995751,
'T': 0.0541465558097154, 'T': 0.0541465558097154,
'Sar': 0.01724425985017986, 'Sar': 0.01724425985017986,
'La': 0.051446794552354624, 'La': 0.051446794552354624,
'ensuite': 0.051200901865576555, 'ensuite': 0.051200901865576555,
'jet': 0.0829284321696537, 'jet': 0.0829284321696537,
'J': 0.07546336834323314, 'J': 0.07546336834323314,
'république': 0.04943046233577988, 'république': 0.04943046233577988,
'ce': 0.057519719498181784, 'ce': 0.057519719498181784,
'ou': 0.08969930275580645, 'ou': 0.08969930275580645,
'an': 0.041146473695852476, 'an': 0.041146473695852476,
'moderne': 0.04177711856360317, 'moderne': 0.04177711856360317,
'##ché': 0.040774048966399426, '##ché': 0.040774048966399426,
'##toi': 0.03342140264875253, '##toi': 0.03342140264875253,
'le': 0.05642199866799782, 'le': 0.05642199866799782,
'##ès': 0.11698138135400395, '##ès': 0.11698138135400395,
'##jana': 0.03818439831225535, '##jana': 0.03818439831225535,
'les': 0.0757883643580965, 'les': 0.0757883643580965,
'Le': 0.03715890519752807, 'Le': 0.03715890519752807,
'##on': 0.025900328149956048, '##on': 0.025900328149956048,
'##a': 0.04319994977377055, '##a': 0.04319994977377055,
'Sir': 0.03642712798689307, 'Sir': 0.03642712798689307,
'Tra': 0.030808153025433653, 'Tra': 0.030808153025433653,
'avo': 0.032212276900181, 'avo': 0.032212276900181,
'de': 0.06770303542758496, 'de': 0.06770303542758496,
'##z': 0.03365499782170345, '##z': 0.03365499782170345,
"'": 0.061565814719079584, "'": 0.061565814719079584,
'Ar': 0.032402384389004744, 'Ar': 0.032402384389004744,
'un': 0.03322588462984368, 'un': 0.03322588462984368,
'sa': 0.032102001051091615, 'sa': 0.032102001051091615,
'Hong': 0.03178504105089397, 'Hong': 0.03178504105089397,
'##us': 0.03713196635047464, '##us': 0.03713196635047464,
'##bal': 0.03142378925271683, '##bal': 0.03142378925271683,
'##thus': 0.029706139074928077, '##thus': 0.029706139074928077,
'##hab': 0.02854877671553812, '##hab': 0.02854877671553812,
'##rig': 0.028440723473813656, '##rig': 0.028440723473813656,
'##lp': 0.028348485933541584, '##lp': 0.028348485933541584,
'D': 0.021142184892655142, 'D': 0.021142184892655142,
'##ssent': 0.026560785640090002, '##ssent': 0.026560785640090002,
'ar': 0.02655491286531302, 'ar': 0.02655491286531302,
'##s': 0.047794961185982546, '##s': 0.047794961185982546,
'dont': 0.02121929633019263, 'dont': 0.02121929633019263,
'dit': 0.018481328341026063, 'dit': 0.018481328341026063,
'##cus': 0.024522117453619832, '##cus': 0.024522117453619832,
'##ziu': 0.023636246682512362, '##ziu': 0.023636246682512362,
'celui': 0.023612546696296227, 'celui': 0.023612546696296227,
'Sam': 0.023408934631155117, 'Sam': 0.023408934631155117,
'd': 0.05567218618641231, 'd': 0.05567218618641231,
'dans': 0.10022254602904804, 'dans': 0.10022254602904804,
'##gent': 0.01748654204829846, '##gent': 0.01748654204829846,
'##noi': 0.02239494074557525, '##noi': 0.02239494074557525,
'##ros': 0.021349833073345013, '##ros': 0.021349833073345013,
'Ist': 0.020901809554305005, 'Ist': 0.020901809554305005,
'con': 0.025748442770215568, 'con': 0.025748442770215568,
'##x': 0.02002047701965849, '##x': 0.02002047701965849,
'R': 0.030949942486234276, 'R': 0.030949942486234276,
'app': 0.018555258608997136, 'app': 0.018555258608997136,
'##jano': 0.01808523851267019, '##jano': 0.01808523851267019,
'Dion': 0.015754758531027112, 'Dion': 0.015754758531027112,
'##zog': 0.014949231310082899, '##zog': 0.014949231310082899,
'U': 0.014654267300384672, 'U': 0.014654267300384672,
'à': 0.08298650083974059, 'à': 0.08298650083974059,
'##oe': 0.013082684704750751, '##oe': 0.013082684704750751,
'sous': -0.0002069181948599748, 'sous': -0.0002069181948599748,
'selon': -0.020082683114963043, 'selon': -0.020082683114963043,
'##mi': 0.008906287279368003, '##mi': 0.008906287279368003,
'Cassius': 0.008545442448074718, 'Cassius': 0.008545442448074718,
'par': 0.061077180857349866, 'par': 0.061077180857349866,
'-': 0.06475762097649433, '-': 0.06475762097649433,
'##t': 0.0048467472813908796, '##t': 0.0048467472813908796,
'##zet': 0.0063275438230979185, '##zet': 0.0063275438230979185,
'##uy': 0.006086328817223191, '##uy': 0.006086328817223191,
'##el': 0.0060696145131198575, '##el': 0.0060696145131198575,
'ca': 0.0056714294799304005, 'ca': 0.0056714294799304005,
'##roi': 0.005468227545142956, '##roi': 0.005468227545142956,
'trés': 0.002824228104975018, 'trés': 0.002824228104975018,
'##oit': 0.001711933310961672, '##oit': 0.001711933310961672,
'##bu': 0.0016690113836018768, '##bu': 0.0016690113836018768,
'[CLS]': 0.0, '[CLS]': 0.0,
'[SEP]': 0.0, '[SEP]': 0.0,
'ses': 0.049645402358164066, 'ses': 0.049645402358164066,
'##ors': -0.0017502978171956062, '##ors': -0.0017502978171956062,
'in': -0.003095317243436783, 'in': -0.003095317243436783,
'que': 0.009635328008189985, 'que': 0.009635328008189985,
'creu': -0.007475521658293897, 'creu': -0.007475521658293897,
'&': -0.08000801041182604, '&': -0.08000801041182604,
'##it': 0.0015963173159964771, '##it': 0.0015963173159964771,
';': 0.007984840521573309, ';': 0.007984840521573309,
')': -0.06872271549404119, ')': -0.06872271549404119,
'sentiment': -0.048171146031505886, 'sentiment': -0.048171146031505886,
'SAR': -0.07143494474675065, 'SAR': -0.07143494474675065,
'latin': 0.3960803982086443, 'latin': 0.3960803982086443,
'vallée': 0.24144223688497182, 'vallée': 0.24144223688497182,
'grec': 0.24154434016104503, 'grec': 0.24154434016104503,
'distingue': 0.20225033977383755, 'distingue': 0.20225033977383755,
'en': 0.026650124208107327, 'en': 0.026650124208107327,
'##g': 0.11139228310919369, '##g': 0.11139228310919369,
'une': 0.0705267038169397, 'une': 0.0705267038169397,
'exemple': 0.02878307063738609, 'exemple': 0.02878307063738609,
'torre': 0.010422364080421317, 'torre': 0.010422364080421317,
'fonte': 0.07176335280650341, 'fonte': 0.07176335280650341,
'mod': 0.05471099375321398, 'mod': 0.05471099375321398,
'neige': 0.06716279133677414, 'neige': 0.06716279133677414,
'##nt': -0.010146726047092878, '##nt': -0.010146726047092878,
'des': 0.10395928916105898, 'des': 0.10395928916105898,
'##al': 0.021472918740770543, '##al': 0.021472918740770543,
'l': 0.07719865579756106, 'l': 0.07719865579756106,
'toujours': 0.05654656086498567, 'toujours': 0.05654656086498567,
'grandes': 0.0500708177588014, 'grandes': 0.0500708177588014,
'##ns': 0.010472519252392662, '##ns': 0.010472519252392662,
'h': -0.007529062495164878, 'h': -0.007529062495164878,
'aussi': 0.04182030961442956, 'aussi': 0.04182030961442956,
'du': 0.02917787047349929, 'du': 0.02917787047349929,
'##rro': 0.041311241708329303, '##rro': 0.041311241708329303,
'##u': 0.018374918227908328, '##u': 0.018374918227908328,
'L': 0.03595023525894895, 'L': 0.03595023525894895,
'bien': 0.0283913554226912, 'bien': 0.0283913554226912,
'autre': 0.057531150082004046, 'autre': 0.057531150082004046,
'plu': 0.03005047616189665, 'plu': 0.03005047616189665,
'##ule': 0.0166859230453766, '##ule': 0.0166859230453766,
'On': 0.008215765462606122, 'On': 0.008215765462606122,
'##éra': 0.009599163212087692, '##éra': 0.009599163212087692,
'qu': 0.021413621005676176, 'qu': 0.021413621005676176,
'##bre': 0.01200562189665623, '##bre': 0.01200562189665623,
'terme': 0.02610804943109468, 'terme': 0.02610804943109468,
'##ima': 0.03813331812513627, '##ima': 0.03813331812513627,
'E': 0.02353268737063088, 'E': 0.02353268737063088,
'é': 0.031069540958438446, 'é': 0.031069540958438446,
'n': 0.022605076586881327, 'n': 0.022605076586881327,
'puisque': 0.022114351087644566, 'puisque': 0.022114351087644566,
'cela': 0.021728661127459404, 'cela': 0.021728661127459404,
'on': 0.031637879100526636, 'on': 0.031637879100526636,
'ne': 0.023481641713517833, 'ne': 0.023481641713517833,
'##euse': 0.019031264633545753, '##euse': 0.019031264633545753,
'souvent': 0.01412861492800689, 'souvent': 0.01412861492800689,
'fort': 0.04442544031371985, 'fort': 0.04442544031371985,
'danger': 0.013811198066105809, 'danger': 0.013811198066105809,
'signifie': 0.013575249323296856, 'signifie': 0.013575249323296856,
'après': 0.01941062923428665, 'après': 0.01941062923428665,
'met': 0.011327151448637373, 'met': 0.011327151448637373,
'##ies': 0.01091580579338499, '##ies': 0.01091580579338499,
'##que': 0.04353082158018988, '##que': 0.04353082158018988,
'##qui': 0.009204343050603915, '##qui': 0.009204343050603915,
'che': 0.005705843958191305, 'che': 0.005705843958191305,
'pour': 0.0021363265380875763, 'pour': 0.0021363265380875763,
'##crit': 0.004866795460923155, '##crit': 0.004866795460923155,
'co': 0.006828431290704029, 'co': 0.006828431290704029,
'pas': 0.006987930951319245, 'pas': 0.006987930951319245,
'##vo': -0.007773218105616921, '##vo': -0.007773218105616921,
'##re': 0.015364885611094212, '##re': 0.015364885611094212,
'##ure': -0.029633249072781927, '##ure': -0.029633249072781927,
'tem': -0.03507853370176394, 'tem': -0.03507853370176394,
'nach': -0.059647272844616725, 'nach': -0.059647272844616725,
'to': -0.043587086241707595, 'to': -0.043587086241707595,
'Torre': -0.07370789174796667, 'Torre': -0.07370789174796667,
'Comme': -0.08102775997694327, 'Comme': -0.08102775997694327,
'lac': 0.2627689961654854, 'lac': 0.2627689961654854,
'duché': 0.15878063729951344, 'duché': 0.15878063729951344,
'rivier': 0.1835301313877702, 'rivier': 0.1835301313877702,
'Novo': 0.12531683029473817, 'Novo': 0.12531683029473817,
'##US': 0.0822458506910208, '##US': 0.0822458506910208,
'empire': 0.12043364298838244, 'empire': 0.12043364298838244,
'endroit': 0.12779042183921233, 'endroit': 0.12779042183921233,
'Russie': 0.10463858590067449, 'Russie': 0.10463858590067449,
'sur': 0.04915812925999788, 'sur': 0.04915812925999788,
'##IA': 0.08076525751909835, '##IA': 0.08076525751909835,
'##SA': 0.02594775540828238, '##SA': 0.02594775540828238,
'##gor': 0.08311355621144252, '##gor': 0.08311355621144252,
'##O': 0.12780809591110026, '##O': 0.12780809591110026,
'STAR': 0.037760347726659545, 'STAR': 0.037760347726659545,
'où': 0.05080575319876235, 'où': 0.05080575319876235,
'Il': 0.0284042208786913, 'Il': 0.0284042208786913,
'##e': 0.03364232916886872, '##e': 0.03364232916886872,
'##vat': 0.05917044284419409, '##vat': 0.05917044284419409,
'##n': 0.04476191426913992, '##n': 0.04476191426913992,
'Lo': 0.05322826492644412, 'Lo': 0.05322826492644412,
'##te': 0.05775689209126553, '##te': 0.05775689209126553,
'##men': 0.025864880430748294, '##men': 0.025864880430748294,
'##od': 0.022641329488828195, '##od': 0.022641329488828195,
'province': 0.3138687127885894, 'province': 0.3138687127885894,
'lieu': 0.288944483446071, 'lieu': 0.288944483446071,
'septentrional': 0.21736956560941825, 'septentrional': 0.21736956560941825,
'bourg': 0.205192644118189, 'bourg': 0.205192644118189,
'##er': 0.20470878331452866, '##er': 0.20470878331452866,
'Amérique': 0.2577046079084956, 'Amérique': 0.2577046079084956,
'Frontera': 0.19152877305938218, 'Frontera': 0.19152877305938218,
'deux': 0.07898332743727765, 'deux': 0.07898332743727765,
'Espagne': 0.09686731766445605, 'Espagne': 0.09686731766445605,
'Gal': 0.09024113726333705, 'Gal': 0.09024113726333705,
'nouvelle': 0.0828520285184145, 'nouvelle': 0.0828520285184145,
'Guadalajara': 0.0857800140375107, 'Guadalajara': 0.0857800140375107,
'##ice': 0.06692411701657223, '##ice': 0.06692411701657223,
'##uat': 0.0508369206389474, '##uat': 0.0508369206389474,
'X': 0.04064410422404537, 'X': 0.04064410422404537,
'##la': 0.028711752575946197, '##la': 0.028711752575946197,
'##es': 0.022001482046772214, '##es': 0.022001482046772214,
'30': 0.05213470124947088, '30': 0.05213470124947088,
'##ades': -0.01685895398059224, '##ades': -0.01685895398059224,
'audience': -0.045768989771362824, 'audience': -0.045768989771362824,
'petite': 0.21919648886046295, 'petite': 0.21919648886046295,
'royaume': 0.2573068212004148, 'royaume': 0.2573068212004148,
'Naples': 0.23150278005781894, 'Naples': 0.23150278005781894,
'##Z': 0.1830676183440645, '##Z': 0.1830676183440645,
'Italie': 0.1575315512120412, 'Italie': 0.1575315512120412,
'##J': 0.1573522788542329, '##J': 0.1573522788542329,
'terre': 0.08085795705733743, 'terre': 0.08085795705733743,
'32': 0.12352770141613803, '32': 0.12352770141613803,
'10': 0.07431163055453546, '10': 0.07431163055453546,
'au': 0.034992091061273066, 'au': 0.034992091061273066,
'Long': 0.06868800126003936, 'Long': 0.06868800126003936,
'41': 0.08953747205002806, '41': 0.08953747205002806,
'Labour': 0.07756492814166974, 'Labour': 0.07756492814166974,
'CA': 0.05046508890359498, 'CA': 0.05046508890359498,
'lat': 0.05319105448357667, 'lat': 0.05319105448357667,
'Macédoine': 0.3205630652048948, 'Macédoine': 0.3205630652048948,
'##RI': 0.16460252115303084, '##RI': 0.16460252115303084,
'partie': 0.15845359313084062, 'partie': 0.15845359313084062,
'lieux': 0.15247473369694478, 'lieux': 0.15247473369694478,
'habita': 0.13771036963333821, 'habita': 0.13771036963333821,
'##I': 0.141653889530405, '##I': 0.141653889530405,
'nord': 0.242739838786995, 'nord': 0.242739838786995,
'##E': 0.12333498427020301, '##E': 0.12333498427020301,
'orientale': 0.12299072447402201, 'orientale': 0.12299072447402201,
'golf': 0.11792822736721131, 'golf': 0.11792822736721131,
'A': 0.06479200165315122, 'A': 0.06479200165315122,
'commun': 0.08994387122081324, 'commun': 0.08994387122081324,
'voir': 0.08847236372959857, 'voir': 0.08847236372959857,
'##rma': 0.07365816308437048, '##rma': 0.07365816308437048,
'##É': 0.07159170704085929, '##É': 0.07159170704085929,
'##p': 0.07104594769838767, '##p': 0.07104594769838767,
'P': 0.03272049148432894, 'P': 0.03272049148432894,
'comme': 0.06549376453090695, 'comme': 0.06549376453090695,
'##mée': 0.0646449194613384, '##mée': 0.0646449194613384,
'##ïque': 0.061645981622394784, '##ïque': 0.061645981622394784,
'##én': 0.057367966434507346, '##én': 0.057367966434507346,
'Ali': 0.055659756416669474, 'Ali': 0.055659756416669474,
'C': 0.04306076808452276, 'C': 0.04306076808452276,
'nomme': 0.05401793892863615, 'nomme': 0.05401793892863615,
'Lu': 0.05137836438835051, 'Lu': 0.05137836438835051,
'##ii': 0.049587268182288424, '##ii': 0.049587268182288424,
'##olo': 0.04880284359551007, '##olo': 0.04880284359551007,
'midi': 0.07113877841272949, 'midi': 0.07113877841272949,
'il': 0.0027654637201536968, 'il': 0.0027654637201536968,
'va': 0.04337036149864036, 'va': 0.04337036149864036,
'Pier': 0.022936152887815472, 'Pier': 0.022936152887815472,
'Pt': 0.04241563924208806, 'Pt': 0.04241563924208806,
'##rab': 0.0423110927679243, '##rab': 0.0423110927679243,
'##rée': 0.042257144545369996, '##rée': 0.042257144545369996,
'##iot': 0.04203676192154972, '##iot': 0.04203676192154972,
'The': 0.04191806043439661, 'The': 0.04191806043439661,
'St': 0.04123715089841821, 'St': 0.04123715089841821,
'Per': 0.03985223606785319, 'Per': 0.03985223606785319,
'##j': 0.03926027199963966, '##j': 0.03926027199963966,
'##ée': 0.039227260949307896, '##ée': 0.039227260949307896,
'côté': 0.023297246836922872, 'côté': 0.023297246836922872,
'cont': 0.03657382982964496, 'cont': 0.03657382982964496,
'##ae': 0.03185063483930072, '##ae': 0.03185063483930072,
'##ac': 0.031224938425030026, '##ac': 0.031224938425030026,
'ex': 0.020417873274345934, 'ex': 0.020417873274345934,
'différentes': 0.030718941715107615, 'différentes': 0.030718941715107615,
'2°': 0.030522230876155416, '2°': 0.030522230876155416,
'VII': 0.02928677630182494, 'VII': 0.02928677630182494,
'##dia': 0.026233515028085855, '##dia': 0.026233515028085855,
'##cer': 0.023491982765047097, '##cer': 0.023491982765047097,
'##xiu': 0.022805612841692573, '##xiu': 0.022805612841692573,
'a': -0.0012715805160076403, 'a': -0.0012715805160076403,
'commence': 0.014379318563949204, 'commence': 0.014379318563949204,
'##pt': 0.013195773759531383, '##pt': 0.013195773759531383,
'##ie': 0.017838742894118582, '##ie': 0.017838742894118582,
'cha': 0.0067258655884391136, 'cha': 0.0067258655884391136,
'fine': 0.006245003686740857, 'fine': 0.006245003686740857,
'##mon': 0.002094276853030689, '##mon': 0.002094276853030689,
'##éir': 0.0059480290294971795, '##éir': 0.0059480290294971795,
'III': 0.00283531432530362, 'III': 0.00283531432530362,
'termine': 0.0013832185393301071, 'termine': 0.0013832185393301071,
'x': 0.0008492312750708209, 'x': 0.0008492312750708209,
'donne': -0.0033814500703101104, 'donne': -0.0033814500703101104,
'liv': -0.01661384524933283, 'liv': -0.01661384524933283,
'born': -0.041898895862637786, 'born': -0.041898895862637786,
'1°': -0.026381150057445416, '1°': -0.026381150057445416,
'Asie': 0.25951725852047475, 'Asie': 0.25951725852047475,
'Orient': 0.26844923609680227, 'Orient': 0.26844923609680227,
'Inde': 0.2133903020411604, 'Inde': 0.2133903020411604,
'mer': 0.26868989143550776, 'mer': 0.26868989143550776,
'desert': 0.15074697961858496, 'desert': 0.15074697961858496,
'petit': 0.1376690268440502, 'petit': 0.1376690268440502,
'Cam': 0.12958328223340024, 'Cam': 0.12958328223340024,
'##MP': 0.10974714271316263, '##MP': 0.10974714271316263,
'##chin': 0.0894472877928192, '##chin': 0.0894472877928192,
'Co': 0.07783219273122367, 'Co': 0.07783219273122367,
'##chine': 0.07614065102466291, '##chine': 0.07614065102466291,
'##bo': 0.06026845472373662, '##bo': 0.06026845472373662,
'ori': 0.05756818718647703, 'ori': 0.05756818718647703,
'##cci': 0.05157091008708276, '##cci': 0.05157091008708276,
'##ge': 0.04400356682753303, '##ge': 0.04400356682753303,
'o': 0.04070060977171756, 'o': 0.04070060977171756,
'##ent': 0.023156820087233715, '##ent': 0.023156820087233715,
'CIA': 0.020303354874778645, 'CIA': 0.020303354874778645,
'##dent': 0.00517282162953671, '##dent': 0.00517282162953671,
'capitale': 0.36838594655129425, 'capitale': 0.36838594655129425,
'comté': 0.29220668870626426, 'comté': 0.29220668870626426,
'milles': 0.18218276737467964, 'milles': 0.18218276737467964,
'Irlande': 0.17055004990277814, 'Irlande': 0.17055004990277814,
'Dublin': 0.14585461747071327, 'Dublin': 0.14585461747071327,
'forte': 0.09134790101615797, 'forte': 0.09134790101615797,
'M': 0.04357113158723378, 'M': 0.04357113158723378,
'28': 0.11327727914116813, '28': 0.11327727914116813,
'##AR': 0.0700853626124515, '##AR': 0.0700853626124515,
'40': 0.10679936658542148, '40': 0.10679936658542148,
'West': 0.08993142493114488, 'West': 0.08993142493114488,
'##UL': 0.08819267818570828, '##UL': 0.08819267818570828,
'12': 0.08319295152975612, '12': 0.08319295152975612,
'##L': 0.13304534043947008, '##L': 0.13304534043947008,
'53': 0.08036439319058314, '53': 0.08036439319058314,
'Bat': 0.0781627203933898, 'Bat': 0.0781627203933898,
'13': 0.07480367622434814, '13': 0.07480367622434814,
'##ING': 0.05812388913159779, '##ING': 0.05812388913159779,
'O': 0.06583622417433492, 'O': 0.06583622417433492,
'##éa': 0.05230588344828021, '##éa': 0.05230588344828021,
'##sh': 0.03433270961563577, '##sh': 0.03433270961563577,
'MO': 0.030683540302786836, 'MO': 0.030683540302786836,
'##imo': 0.017082182198960103, '##imo': 0.017082182198960103,
'let': -0.0461111572568852, 'let': -0.0461111572568852,
'remarquable': 0.21263713433629308, 'remarquable': 0.21263713433629308,
'qui': 0.050173365888367016, 'qui': 0.050173365888367016,
'bataille': 0.13091439215231107, 'bataille': 0.13091439215231107,
'Allemagne': 0.11618493617005074, 'Allemagne': 0.11618493617005074,
'Fu': 0.10846701242333791, 'Fu': 0.10846701242333791,
'entre': 0.06369520587259789, 'entre': 0.06369520587259789,
'##EN': 0.09069418279503459, '##EN': 0.09069418279503459,
'fut': 0.08918546414477314, 'fut': 0.08918546414477314,
'##ER': 0.08840867622071771, '##ER': 0.08840867622071771,
'mar': 0.08131159281878217, 'mar': 0.08131159281878217,
'##SH': 0.07880885570732322, '##SH': 0.07880885570732322,
'Brand': 0.07534765223550378, 'Brand': 0.07534765223550378,
's': 0.07214195178952942, 's': 0.07214195178952942,
'victoire': 0.07404695440084724, 'victoire': 0.07404695440084724,
'Albert': 0.06782766064004295, 'Albert': 0.06782766064004295,
'##êché': 0.06760324837539838, '##êché': 0.06760324837539838,
'##se': 0.06694410055617671, '##se': 0.06694410055617671,
'##we': 0.06692906138119581, '##we': 0.06692906138119581,
'##illet': 0.06506627545649421, '##illet': 0.06506627545649421,
'##grave': 0.06490679816106117, '##grave': 0.06490679816106117,
'##eb': 0.061676421437111995, '##eb': 0.061676421437111995,
'Saxe': 0.056318485823556044, 'Saxe': 0.056318485823556044,
'blessure': 0.054096736108623616, 'blessure': 0.054096736108623616,
'##fait': 0.05343946030378872, '##fait': 0.05343946030378872,
'Hildesheim': 0.05099271687488835, 'Hildesheim': 0.05099271687488835,
'##fin': 0.04989368895071508, '##fin': 0.04989368895071508,
'Maurice': 0.04751196968720714, 'Maurice': 0.04751196968720714,
'##ourg': 0.04583679368750848, '##ourg': 0.04583679368750848,
'##lante': 0.03983621993726603, '##lante': 0.03983621993726603,
'7': 0.03584389050476482, '7': 0.03584389050476482,
'aux': 0.035433950894822305, 'aux': 0.035433950894822305,
'1553': 0.034349353274837756, '1553': 0.034349353274837756,
'dé': 0.03434818892830247, 'dé': 0.03434818892830247,
'mourut': 0.03360980244865721, 'mourut': 0.03360980244865721,
'év': 0.029168399068651336, 'év': 0.029168399068651336,
'##bourg': 0.02789169534010975, '##bourg': 0.02789169534010975,
'plusieurs': 0.02669902130425908, 'plusieurs': 0.02669902130425908,
'donna': 0.025856769861851345, 'donna': 0.025856769861851345,
'##ade': 0.023830238910252367, '##ade': 0.023830238910252367,
'SW': 0.02082288328341326, 'SW': 0.02082288328341326,
'##ect': 0.020450927253329462, '##ect': 0.020450927253329462,
'Lune': 0.02024761291388681, 'Lune': 0.02024761291388681,
'él': 0.01741279676584186, 'él': 0.01741279676584186,
'jours': 0.016570340432660837, 'jours': 0.016570340432660837,
'sang': 0.015613060865029399, 'sang': 0.015613060865029399,
'peu': 0.03996102752601043, 'peu': 0.03996102752601043,
'##eta': 0.01069390934382476, '##eta': 0.01069390934382476,
'ach': 0.010388292909415044, 'ach': 0.010388292909415044,
'Ju': 0.0073498549683621225, 'Ju': 0.0073498549683621225,
'##eur': 0.0019017102256399137, '##eur': 0.0019017102256399137,
'y': -0.044396462419225395, 'y': -0.044396462419225395,
'Ty': 0.24148741758807102, 'Ty': 0.24148741758807102,
'eaux': 0.21684370603760367, 'eaux': 0.21684370603760367,
'situation': 0.10329824944457552, 'situation': 0.10329824944457552,
'cité': 0.09052769883978086, 'cité': 0.09052769883978086,
'Or': 0.019460845999311357, 'Or': 0.019460845999311357,
'##esti': 0.03883953302868402, '##esti': 0.03883953302868402,
'Port': -0.1040613277987558, 'Port': -0.1040613277987558,
'm': 0.05587762491014585, 'm': 0.05587762491014585,
'Cal': 0.07697134262504549, 'Cal': 0.07697134262504549,
'appelle': 0.07462975062628824, 'appelle': 0.07462975062628824,
'car': 0.07461062484523039, 'car': 0.07461062484523039,
'port': 0.06465359698878792, 'port': 0.06465359698878792,
'dire': 0.07059837227461703, 'dire': 0.07059837227461703,
'cette': 0.08802705291918328, 'cette': 0.08802705291918328,
'##rr': 0.06022085803296414, '##rr': 0.06022085803296414,
'sept': 0.057519390301443825, 'sept': 0.057519390301443825,
'##nien': 0.05355078021839007, '##nien': 0.05355078021839007,
'fondée': 0.0509239663456544, 'fondée': 0.0509239663456544,
'pur': 0.05064960239352996, 'pur': 0.05064960239352996,
'assez': 0.05042410659440811, 'assez': 0.05042410659440811,
'Bari': 0.05030515494264842, 'Bari': 0.05030515494264842,
'##vog': 0.04906152561188761, '##vog': 0.04906152561188761,
'pré': 0.04549836271489032, 'pré': 0.04549836271489032,
'##éta': 0.04507297230896154, '##éta': 0.04507297230896154,
'aujourd': 0.04447144001423532, 'aujourd': 0.04447144001423532,
'##ne': 0.0347530163976298, '##ne': 0.0347530163976298,
'##hé': 0.03961438526125526, '##hé': 0.03961438526125526,
'ensemble': 0.037956283252757204, 'ensemble': 0.037956283252757204,
'hui': 0.03734807021915644, 'hui': 0.03734807021915644,
'leurs': 0.03687680836802711, 'leurs': 0.03687680836802711,
'être': 0.0343762027708024, 'être': 0.0343762027708024,
'##ab': 0.032563279819096465, '##ab': 0.032563279819096465,
'pou': 0.03124793207062872, 'pou': 0.03124793207062872,
'##oi': 0.02989229975500687, '##oi': 0.02989229975500687,
'c': -0.006530315247605613, 'c': -0.006530315247605613,
'##uri': 0.026965622143799262, '##uri': 0.026965622143799262,
'Ra': 0.026882539673090518, 'Ra': 0.026882539673090518,
'##voi': 0.01431455878734083, '##voi': 0.01431455878734083,
'suivant': 0.02558394558733645, 'suivant': 0.02558394558733645,
'##o': 0.023331497432489353, '##o': 0.023331497432489353,
'##i': 0.020948732860884275, '##i': 0.020948732860884275,
'soit': 0.020530716687414827, 'soit': 0.020530716687414827,
'##tend': 0.020495566508082305, '##tend': 0.020495566508082305,
'loin': 0.019550798182729083, 'loin': 0.019550798182729083,
'##rie': 0.017169840154727146, '##rie': 0.017169840154727146,
'##ject': 0.017138782093343834, '##ject': 0.017138782093343834,
'ora': 0.017038607009944622, 'ora': 0.017038607009944622,
'##ît': 0.014602155227601584, '##ît': 0.014602155227601584,
'##êl': 0.014337833594615367, '##êl': 0.014337833594615367,
'##oso': 0.010042907154746029, '##oso': 0.010042907154746029,
'##cle': 0.008070864174746364, '##cle': 0.008070864174746364,
'##este': -0.004142491388013976, '##este': -0.004142491388013976,
'Quo': -0.011600711502630034, 'Quo': -0.011600711502630034,
'##ifier': -0.024313910709926605, '##ifier': -0.024313910709926605,
'seul': -0.0305767799476541, 'seul': -0.0305767799476541,
'accord': -0.0385442098622452, 'accord': -0.0385442098622452,
'Amazon': 0.268750975733531, 'Amazon': 0.268750975733531,
'##Y': 0.19620519591858654, '##Y': 0.19620519591858654,
'elle': 0.16664352411683878, 'elle': 0.16664352411683878,
'##UR': 0.1644776678073895, '##UR': 0.1644776678073895,
'##AT': 0.15286658491534266, '##AT': 0.15286658491534266,
'##dion': 0.14541624779420775, '##dion': 0.14541624779420775,
'Qui': 0.13771435252173025, 'Qui': 0.13771435252173025,
'##éri': 0.06699488014992556, '##éri': 0.06699488014992556,
'##xos': 0.11031678713869973, '##xos': 0.11031678713869973,
'##ale': 0.09162023651074921, '##ale': 0.09162023651074921,
':': -0.04107125437938059, ':': -0.04107125437938059,
'grande': 0.2745048084212555, 'grande': 0.2745048084212555,
'##G': 0.1593574100830241, '##G': 0.1593574100830241,
'37': 0.15551673881995645, '37': 0.15551673881995645,
'##NI': 0.13384155651705198, '##NI': 0.13384155651705198,
'56': 0.11739328894256833, '56': 0.11739328894256833,
'51': 0.10404197401916239, '51': 0.10404197401916239,
'Turquie': 0.0990410787816931, 'Turquie': 0.0990410787816931,
'Cara': 0.09295436490234411, 'Cara': 0.09295436490234411,
'##manie': 0.08529201860860249, '##manie': 0.08529201860860249,
'CO': 0.07632048130911342, 'CO': 0.07632048130911342,
'Portugal': 0.2042976668202922, 'Portugal': 0.2042976668202922,
'ruines': 0.20314744837396317, 'ruines': 0.20314744837396317,
'plaine': 0.16178744164448852, 'plaine': 0.16178744164448852,
'située': 0.14874643050344483, 'située': 0.14874643050344483,
'sud': 0.11940264716050225, 'sud': 0.11940264716050225,
'embouchure': 0.11627573925225723, 'embouchure': 0.11627573925225723,
'fertile': 0.10563735882960068, 'fertile': 0.10563735882960068,
'ancienne': 0.09576345241389979, 'ancienne': 0.09576345241389979,
'été': 0.09014077147035093, 'été': 0.09014077147035093,
'##T': 0.07841500408788923, '##T': 0.07841500408788923,
'##ob': 0.07632579567509333, '##ob': 0.07632579567509333,
'##AL': 0.07609846437262502, '##AL': 0.07609846437262502,
'longueur': 0.061323686830543296, 'longueur': 0.061323686830543296,
'laquelle': 0.06110922821761415, 'laquelle': 0.06110922821761415,
'##UB': 0.05967416217649997, '##UB': 0.05967416217649997,
'b': 0.057586434212371265, 'b': 0.057586434212371265,
'mura': 0.054736177441230875, 'mura': 0.054736177441230875,
'Cet': 0.05395581578984272, 'Cet': 0.05395581578984272,
'##riga': 0.051846567210072046, '##riga': 0.051846567210072046,
'Za': 0.04995526964949323, 'Za': 0.04995526964949323,
'plus': 0.04112507132732335, 'plus': 0.04112507132732335,
'##dao': 0.04753728678912817, '##dao': 0.04753728678912817,
'S': -0.04602159510840381, 'S': -0.04602159510840381,
'##uba': 0.04550942840492348, '##uba': 0.04550942840492348,
'temple': 0.044018212177904886, 'temple': 0.044018212177904886,
'pr': 0.04163503255558388, 'pr': 0.04163503255558388,
'fruits': 0.04113575313188026, 'fruits': 0.04113575313188026,
'##ét': 0.038606207695351155, '##ét': 0.038606207695351155,
'##ât': 0.03806250021892484, '##ât': 0.03806250021892484,
'Tage': 0.03765080030853683, 'Tage': 0.03765080030853683,
'##l': 0.036545777864497975, '##l': 0.036545777864497975,
'ferme': 0.03580818214260307, 'ferme': 0.03580818214260307,
'soi': 0.040668276790300634, 'soi': 0.040668276790300634,
'forti': 0.03549622233435284, 'forti': 0.03549622233435284,
'Lisbon': 0.034290688839724325, 'Lisbon': 0.034290688839724325,
'Est': 0.034150407829088125, 'Est': 0.034150407829088125,
'grain': 0.03124989952518535, 'grain': 0.03124989952518535,
'##r': 0.013680800966903325, '##r': 0.013680800966903325,
'vin': 0.030361970705734747, 'vin': 0.030361970705734747,
'##dou': 0.030107317328042935, '##dou': 0.030107317328042935,
'fait': 0.024187752198256366, 'fait': 0.024187752198256366,
'##tr': 0.023949993283441592, '##tr': 0.023949993283441592,
'Elle': 0.022927855199012262, 'Elle': 0.022927855199012262,
'##ucha': 0.009319170948366045, '##ucha': 0.009319170948366045,
'bout': 0.020885765063577755, 'bout': 0.020885765063577755,
'vers': 0.020157214084907837, 'vers': 0.020157214084907837,
'eu': 0.01603845134764128, 'eu': 0.01603845134764128,
'avant': 0.014469418561726512, 'avant': 0.014469418561726512,
'Jupiter': 0.01427103838187756, 'Jupiter': 0.01427103838187756,
'Au': 0.013620171062788134, 'Au': 0.013620171062788134,
'##fier': 0.006096813392549551, '##fier': 0.006096813392549551,
'Am': 0.005564187960915995, 'Am': 0.005564187960915995,
'##ème': 0.004959174323142408, '##ème': 0.004959174323142408,
'##rama': -0.004348877527747782, '##rama': -0.004348877527747782,
'##ment': -0.0064917887675220295, '##ment': -0.0064917887675220295,
'##illes': -0.007961146859700573, '##illes': -0.007961146859700573,
'ouest': 0.286768593993972, 'ouest': 0.286768593993972,
'villages': 0.20582618265616082, 'villages': 0.20582618265616082,
'canton': 0.1572237472100742, 'canton': 0.1572237472100742,
'##UE': 0.13830905321433584, '##UE': 0.13830905321433584,
'##xi': 0.12866311074641193, '##xi': 0.12866311074641193,
'quelques': 0.12300477044276094, 'quelques': 0.12300477044276094,
'México': 0.11818705671271108, 'México': 0.11818705671271108,
'##EP': 0.08843851847857245, '##EP': 0.08843851847857245,
'##aca': 0.08293512707914791, '##aca': 0.08293512707914791,
'##Q': 0.07529631420873374, '##Q': 0.07529631420873374,
'Pan': 0.06891223037247476, 'Pan': 0.06891223037247476,
'##ho': 0.06749528567182805, '##ho': 0.06749528567182805,
'##éc': 0.06615958296532198, '##éc': 0.06615958296532198,
'##fer': 0.06354695452212748, '##fer': 0.06354695452212748,
'##uco': 0.06345670762758723, '##uco': 0.06345670762758723,
'ren': 0.05244102948978422, 'ren': 0.05244102948978422,
'##me': 0.04554978807764583, '##me': 0.04554978807764583,
'##OT': 0.03221971618476101, '##OT': 0.03221971618476101,
'XI': -0.011949950511950972, 'XI': -0.011949950511950972,
'île': 0.15309594891081557, 'île': 0.15309594891081557,
'Java': 0.16105351665895745, 'Java': 0.16105351665895745,
'##BA': 0.07463413630204671, '##BA': 0.07463413630204671,
'côte': 0.15127253738509425, 'côte': 0.15127253738509425,
'TU': 0.05645581765532423, 'TU': 0.05645581765532423,
'place': 0.11962756867654223, 'place': 0.11962756867654223,
'##N': 0.11801784167947217, '##N': 0.11801784167947217,
'Ban': 0.08482843899835783, 'Ban': 0.08482843899835783,
'came': 0.08336073380554908, 'came': 0.08336073380554908,
'ceinture': 0.06844534842035621, 'ceinture': 0.06844534842035621,
'près': 0.07520853958224895, 'près': 0.07520853958224895,
'##ON': 0.07452683419741084, '##ON': 0.07452683419741084,
'haut': 0.04823238398842199, 'haut': 0.04823238398842199,
'130': 0.04487521666602899, '130': 0.04487521666602899,
'ils': 0.044544039157585216, 'ils': 0.044544039157585216,
'##tam': 0.04319673648672328, '##tam': 0.04319673648672328,
'nu': 0.03745403392222305, 'nu': 0.03745403392222305,
'vont': 0.03470655996970495, 'vont': 0.03470655996970495,
'toile': 0.03155735621178085, 'toile': 0.03155735621178085,
'belle': 0.03147662648632047, 'belle': 0.03147662648632047,
'5': 0.0313537019882558, '5': 0.0313537019882558,
'poi': 0.0294521548866359, 'poi': 0.0294521548866359,
'##ton': 0.027713782580636243, '##ton': 0.027713782580636243,
'Ses': 0.024714865988903954, 'Ses': 0.024714865988903954,
'##gnar': 0.018810819002694525, '##gnar': 0.018810819002694525,
'leur': 0.012756004109524549, 'leur': 0.012756004109524549,
'toute': 0.011917089830431075, 'toute': 0.011917089830431075,
'##ds': 0.00923735338058978, '##ds': 0.00923735338058978,
'##d': -0.007455893077793125, '##d': -0.007455893077793125,
'##lot': 0.0014408876850427973, '##lot': 0.0014408876850427973,
'tout': -0.003079531645644194, 'tout': -0.003079531645644194,
'traf': -0.005273322147678258, 'traf': -0.005273322147678258,
'lati': -0.006057162419608609, 'lati': -0.006057162419608609,
'portent': -0.010660914154466471, 'portent': -0.010660914154466471,
'##ique': -0.014304967428463296, '##ique': -0.014304967428463296,
'Sue': 0.18807178158587753, 'Sue': 0.18807178158587753,
'grand': 0.18657103884394427, 'grand': 0.18657103884394427,
'sont': 0.09666358953523904, 'sont': 0.09666358953523904,
'##de': 0.08031207024907842, '##de': 0.08031207024907842,
'portée': 0.07505629073993414, 'portée': 0.07505629073993414,
'##ar': 0.05511323593526249, '##ar': 0.05511323593526249,
'##lie': 0.05204283271035915, '##lie': 0.05204283271035915,
'Dal': 0.018382978798721323, 'Dal': 0.018382978798721323,
'SI': -0.0025126387312130413} 'SI': -0.0025126387312130413}
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Collect the (key, value) pairs of d['Géographie'] and order them by value,
# largest first. sorted() is stable, so pairs with equal values keep the
# dict's iteration order, exactly as an in-place list.sort() would.
l = sorted(
    d['Géographie'].items(),
    key=lambda pair: pair[1],
    reverse=True,
)
l  # bare expression so the notebook displays the ranked list
``` ```
%% Output %% Output
[('ville', 0.468205314323041), [('ville', 0.468205314323041),
('latin', 0.3960803982086443), ('latin', 0.3960803982086443),
('capitale', 0.36838594655129425), ('capitale', 0.36838594655129425),
('Macédoine', 0.3205630652048948), ('Macédoine', 0.3205630652048948),
('province', 0.3138687127885894), ('province', 0.3138687127885894),
('comté', 0.29220668870626426), ('comté', 0.29220668870626426),
('lieu', 0.288944483446071), ('lieu', 0.288944483446071),
('ouest', 0.286768593993972), ('ouest', 0.286768593993972),
('grande', 0.2745048084212555), ('grande', 0.2745048084212555),
('Amazon', 0.268750975733531), ('Amazon', 0.268750975733531),
('mer', 0.26868989143550776), ('mer', 0.26868989143550776),
('Orient', 0.26844923609680227), ('Orient', 0.26844923609680227),
('lac', 0.2627689961654854), ('lac', 0.2627689961654854),
('Asie', 0.25951725852047475), ('Asie', 0.25951725852047475),
('Amérique', 0.2577046079084956), ('Amérique', 0.2577046079084956),
('royaume', 0.2573068212004148), ('royaume', 0.2573068212004148),
('nord', 0.242739838786995), ('nord', 0.242739838786995),
('fleuve', 0.2423772411435995), ('fleuve', 0.2423772411435995),
('grec', 0.24154434016104503), ('grec', 0.24154434016104503),
('Ty', 0.24148741758807102), ('Ty', 0.24148741758807102),
('vallée', 0.24144223688497182), ('vallée', 0.24144223688497182),
('Naples', 0.23150278005781894), ('Naples', 0.23150278005781894),
('G', 0.23135720218825717), ('G', 0.23135720218825717),
('petite', 0.21919648886046295), ('petite', 0.21919648886046295),
('septentrional', 0.21736956560941825), ('septentrional', 0.21736956560941825),
('eaux', 0.21684370603760367), ('eaux', 0.21684370603760367),
('Inde', 0.2133903020411604), ('Inde', 0.2133903020411604),
('remarquable', 0.21263713433629308), ('remarquable', 0.21263713433629308),
('villages', 0.20582618265616082), ('villages', 0.20582618265616082),
('bourg', 0.205192644118189), ('bourg', 0.205192644118189),
('##er', 0.20470878331452866), ('##er', 0.20470878331452866),
('Portugal', 0.2042976668202922), ('Portugal', 0.2042976668202922),
('ruines', 0.20314744837396317), ('ruines', 0.20314744837396317),
('distingue', 0.20225033977383755), ('distingue', 0.20225033977383755),
('##Y', 0.19620519591858654), ('##Y', 0.19620519591858654),
('nommée', 0.19227297918896974), ('nommée', 0.19227297918896974),
('Frontera', 0.19152877305938218), ('Frontera', 0.19152877305938218),
('##éo', 0.18973738733606565), ('##éo', 0.18973738733606565),
('Sue', 0.18807178158587753), ('Sue', 0.18807178158587753),
('grand', 0.18657103884394427), ('grand', 0.18657103884394427),
('rivier', 0.1835301313877702), ('rivier', 0.1835301313877702),
('##Z', 0.1830676183440645), ('##Z', 0.1830676183440645),
('milles', 0.18218276737467964), ('milles', 0.18218276737467964),
('nom', 0.18136152655341967), ('nom', 0.18136152655341967),
('Irlande', 0.17055004990277814), ('Irlande', 0.17055004990277814),
('elle', 0.16664352411683878), ('elle', 0.16664352411683878),
('##GE', 0.16634948535987606), ('##GE', 0.16634948535987606),
('##RI', 0.16460252115303084), ('##RI', 0.16460252115303084),
('##UR', 0.1644776678073895), ('##UR', 0.1644776678073895),
('##gr', 0.16202194372565118), ('##gr', 0.16202194372565118),
('plaine', 0.16178744164448852), ('plaine', 0.16178744164448852),
('Java', 0.16105351665895745), ('Java', 0.16105351665895745),
('##G', 0.1593574100830241), ('##G', 0.1593574100830241),
('duché', 0.15878063729951344), ('duché', 0.15878063729951344),
('##A', 0.15877744098706795), ('##A', 0.15877744098706795),
('partie', 0.15845359313084062), ('partie', 0.15845359313084062),
('Italie', 0.1575315512120412), ('Italie', 0.1575315512120412),
('##J', 0.1573522788542329), ('##J', 0.1573522788542329),
('canton', 0.1572237472100742), ('canton', 0.1572237472100742),
('37', 0.15551673881995645), ('37', 0.15551673881995645),
('île', 0.15309594891081557), ('île', 0.15309594891081557),
('##AT', 0.15286658491534266), ('##AT', 0.15286658491534266),
('lieux', 0.15247473369694478), ('lieux', 0.15247473369694478),
('côte', 0.15127253738509425), ('côte', 0.15127253738509425),
('desert', 0.15074697961858496), ('desert', 0.15074697961858496),
('située', 0.14874643050344483), ('située', 0.14874643050344483),
('Dublin', 0.14585461747071327), ('Dublin', 0.14585461747071327),
('##dion', 0.14541624779420775), ('##dion', 0.14541624779420775),
('##I', 0.141653889530405), ('##I', 0.141653889530405),
('##TI', 0.14030883450630402), ('##TI', 0.14030883450630402),
('##UE', 0.13830905321433584), ('##UE', 0.13830905321433584),
('Qui', 0.13771435252173025), ('Qui', 0.13771435252173025),
('habita', 0.13771036963333821), ('habita', 0.13771036963333821),
('petit', 0.1376690268440502), ('petit', 0.1376690268440502),
('##NI', 0.13384155651705198), ('##NI', 0.13384155651705198),
('##L', 0.13304534043947008), ('##L', 0.13304534043947008),
('bataille', 0.13091439215231107), ('bataille', 0.13091439215231107),
('Cam', 0.12958328223340024), ('Cam', 0.12958328223340024),
('##xi', 0.12866311074641193), ('##xi', 0.12866311074641193),
('##O', 0.12780809591110026), ('##O', 0.12780809591110026),
('endroit', 0.12779042183921233), ('endroit', 0.12779042183921233),
('Novo', 0.12531683029473817), ('Novo', 0.12531683029473817),
('32', 0.12352770141613803), ('32', 0.12352770141613803),
('##E', 0.12333498427020301), ('##E', 0.12333498427020301),
('quelques', 0.12300477044276094), ('quelques', 0.12300477044276094),
('orientale', 0.12299072447402201), ('orientale', 0.12299072447402201),
('la', 0.12298598711273313), ('la', 0.12298598711273313),
('empire', 0.12043364298838244), ('empire', 0.12043364298838244),
('place', 0.11962756867654223), ('place', 0.11962756867654223),
('sud', 0.11940264716050225), ('sud', 0.11940264716050225),
('México', 0.11818705671271108), ('México', 0.11818705671271108),
('##N', 0.11801784167947217), ('##N', 0.11801784167947217),
('golf', 0.11792822736721131), ('golf', 0.11792822736721131),
('56', 0.11739328894256833), ('56', 0.11739328894256833),
('##ès', 0.11698138135400395), ('##ès', 0.11698138135400395),
('embouchure', 0.11627573925225723), ('embouchure', 0.11627573925225723),
('Allemagne', 0.11618493617005074), ('Allemagne', 0.11618493617005074),
('28', 0.11327727914116813), ('28', 0.11327727914116813),
('##g', 0.11139228310919369), ('##g', 0.11139228310919369),
('##xos', 0.11031678713869973), ('##xos', 0.11031678713869973),
('##MP', 0.10974714271316263), ('##MP', 0.10974714271316263),
('Fu', 0.10846701242333791), ('Fu', 0.10846701242333791),
('40', 0.10679936658542148), ('40', 0.10679936658542148),
('fertile', 0.10563735882960068), ('fertile', 0.10563735882960068),
('depuis', 0.1051878315897289), ('depuis', 0.1051878315897289),
('Russie', 0.10463858590067449), ('Russie', 0.10463858590067449),
('51', 0.10404197401916239), ('51', 0.10404197401916239),
('des', 0.10395928916105898), ('des', 0.10395928916105898),
('situation', 0.10329824944457552), ('situation', 0.10329824944457552),
('dans', 0.10022254602904804), ('dans', 0.10022254602904804),
('Turquie', 0.0990410787816931), ('Turquie', 0.0990410787816931),
('Espagne', 0.09686731766445605), ('Espagne', 0.09686731766445605),
('sont', 0.09666358953523904), ('sont', 0.09666358953523904),
('ancienne', 0.09576345241389979), ('ancienne', 0.09576345241389979),
('##c', 0.0948633585054219), ('##c', 0.0948633585054219),
('##ce', 0.09317071188401634), ('##ce', 0.09317071188401634),
('Cara', 0.09295436490234411), ('Cara', 0.09295436490234411),
('##ale', 0.09162023651074921), ('##ale', 0.09162023651074921),
('forte', 0.09134790101615797), ('forte', 0.09134790101615797),
('##EN', 0.09069418279503459), ('##EN', 0.09069418279503459),
('cité', 0.09052769883978086), ('cité', 0.09052769883978086),
('Gal', 0.09024113726333705), ('Gal', 0.09024113726333705),
('été', 0.09014077147035093), ('été', 0.09014077147035093),
('commun', 0.08994387122081324), ('commun', 0.08994387122081324),
('West', 0.08993142493114488), ('West', 0.08993142493114488),
('ou', 0.08969930275580645), ('ou', 0.08969930275580645),
('41', 0.08953747205002806), ('41', 0.08953747205002806),
('##chin', 0.0894472877928192), ('##chin', 0.0894472877928192),
('fut', 0.08918546414477314), ('fut', 0.08918546414477314),
('voir', 0.08847236372959857), ('voir', 0.08847236372959857),
('##EP', 0.08843851847857245), ('##EP', 0.08843851847857245),
('##ER', 0.08840867622071771), ('##ER', 0.08840867622071771),
('##UL', 0.08819267818570828), ('##UL', 0.08819267818570828),
('cette', 0.08802705291918328), ('cette', 0.08802705291918328),
('se', 0.08786898558747075), ('se', 0.08786898558747075),
(',', 0.0858046931906998), (',', 0.0858046931906998),
('Guadalajara', 0.0857800140375107), ('Guadalajara', 0.0857800140375107),
('##manie', 0.08529201860860249), ('##manie', 0.08529201860860249),
('Ban', 0.08482843899835783), ('Ban', 0.08482843899835783),
('came', 0.08336073380554908), ('came', 0.08336073380554908),
('12', 0.08319295152975612), ('12', 0.08319295152975612),
('##gor', 0.08311355621144252), ('##gor', 0.08311355621144252),
('à', 0.08298650083974059), ('à', 0.08298650083974059),
('##aca', 0.08293512707914791), ('##aca', 0.08293512707914791),
('jet', 0.0829284321696537), ('jet', 0.0829284321696537),
('nouvelle', 0.0828520285184145), ('nouvelle', 0.0828520285184145),
('##US', 0.0822458506910208), ('##US', 0.0822458506910208),
('mar', 0.08131159281878217), ('mar', 0.08131159281878217),
('terre', 0.08085795705733743), ('terre', 0.08085795705733743),
('##IA', 0.08076525751909835), ('##IA', 0.08076525751909835),
('53', 0.08036439319058314), ('53', 0.08036439319058314),
('##de', 0.08031207024907842), ('##de', 0.08031207024907842),
('deux', 0.07898332743727765), ('deux', 0.07898332743727765),
('##SH', 0.07880885570732322), ('##SH', 0.07880885570732322),
('Allemands', 0.0787047715309721), ('Allemands', 0.0787047715309721),
('##T', 0.07841500408788923), ('##T', 0.07841500408788923),
('Bat', 0.0781627203933898), ('Bat', 0.0781627203933898),
('Co', 0.07783219273122367), ('Co', 0.07783219273122367),
('Labour', 0.07756492814166974), ('Labour', 0.07756492814166974),
('l', 0.07719865579756106), ('l', 0.07719865579756106),
('Cal', 0.07697134262504549), ('Cal', 0.07697134262504549),
('##ob', 0.07632579567509333), ('##ob', 0.07632579567509333),
('CO', 0.07632048130911342), ('CO', 0.07632048130911342),
('##chine', 0.07614065102466291), ('##chine', 0.07614065102466291),
('##AL', 0.07609846437262502), ('##AL', 0.07609846437262502),
('les', 0.0757883643580965), ('les', 0.0757883643580965),
('J', 0.07546336834323314), ('J', 0.07546336834323314),
('Brand', 0.07534765223550378), ('Brand', 0.07534765223550378),
('##Q', 0.07529631420873374), ('##Q', 0.07529631420873374),
('près', 0.07520853958224895), ('près', 0.07520853958224895),
('portée', 0.07505629073993414), ('portée', 0.07505629073993414),
('13', 0.07480367622434814), ('13', 0.07480367622434814),
('est', 0.074775644174404), ('est', 0.074775644174404),
('##BA', 0.07463413630204671), ('##BA', 0.07463413630204671),
('appelle', 0.07462975062628824), ('appelle', 0.07462975062628824),
('car', 0.07461062484523039), ('car', 0.07461062484523039),
('##ON', 0.07452683419741084), ('##ON', 0.07452683419741084),
('10', 0.07431163055453546), ('10', 0.07431163055453546),
('victoire', 0.07404695440084724), ('victoire', 0.07404695440084724),
('##rma', 0.07365816308437048), ('##rma', 0.07365816308437048),
('romaine', 0.07262679757314543), ('romaine', 0.07262679757314543),
('s', 0.07214195178952942), ('s', 0.07214195178952942),
('fonte', 0.07176335280650341), ('fonte', 0.07176335280650341),
('roi', 0.0716659059875927), ('roi', 0.0716659059875927),
('##É', 0.07159170704085929), ('##É', 0.07159170704085929),
('Da', 0.0714045742494807), ('Da', 0.0714045742494807),
('midi', 0.07113877841272949), ('midi', 0.07113877841272949),
('##p', 0.07104594769838767), ('##p', 0.07104594769838767),
('dire', 0.07059837227461703), ('dire', 0.07059837227461703),
('une', 0.0705267038169397), ('une', 0.0705267038169397),
('##AR', 0.0700853626124515), ('##AR', 0.0700853626124515),
('Pan', 0.06891223037247476), ('Pan', 0.06891223037247476),
('Long', 0.06868800126003936), ('Long', 0.06868800126003936),
('ceinture', 0.06844534842035621), ('ceinture', 0.06844534842035621),
('Albert', 0.06782766064004295), ('Albert', 0.06782766064004295),
('de', 0.06770303542758496), ('de', 0.06770303542758496),
('##êché', 0.06760324837539838), ('##êché', 0.06760324837539838),
('##ho', 0.06749528567182805), ('##ho', 0.06749528567182805),
('neige', 0.06716279133677414), ('neige', 0.06716279133677414),
('##éri', 0.06699488014992556), ('##éri', 0.06699488014992556),
('##se', 0.06694410055617671), ('##se', 0.06694410055617671),
('##we', 0.06692906138119581), ('##we', 0.06692906138119581),
('##ice', 0.06692411701657223), ('##ice', 0.06692411701657223),
('##éc', 0.06615958296532198), ('##éc', 0.06615958296532198),
('O', 0.06583622417433492), ('O', 0.06583622417433492),
('Ce', 0.06555545762824336), ('Ce', 0.06555545762824336),
('comme', 0.06549376453090695), ('comme', 0.06549376453090695),
('##illet', 0.06506627545649421), ('##illet', 0.06506627545649421),
('##grave', 0.06490679816106117), ('##grave', 0.06490679816106117),
('A', 0.06479200165315122), ('A', 0.06479200165315122),
('-', 0.06475762097649433), ('-', 0.06475762097649433),
('port', 0.06465359698878792), ('port', 0.06465359698878792),
('##mée', 0.0646449194613384), ('##mée', 0.0646449194613384),
('entre', 0.06369520587259789), ('entre', 0.06369520587259789),
('##fer', 0.06354695452212748), ('##fer', 0.06354695452212748),
('##uco', 0.06345670762758723), ('##uco', 0.06345670762758723),
('##eb', 0.061676421437111995), ('##eb', 0.061676421437111995),
('##ïque', 0.061645981622394784), ('##ïque', 0.061645981622394784),
("'", 0.061565814719079584), ("'", 0.061565814719079584),
('longueur', 0.061323686830543296), ('longueur', 0.061323686830543296),
('laquelle', 0.06110922821761415), ('laquelle', 0.06110922821761415),
('par', 0.061077180857349866), ('par', 0.061077180857349866),
('##bo', 0.06026845472373662), ('##bo', 0.06026845472373662),
('##rr', 0.06022085803296414), ('##rr', 0.06022085803296414),
('##UB', 0.05967416217649997), ('##UB', 0.05967416217649997),
('##vat', 0.05917044284419409), ('##vat', 0.05917044284419409),
('##ING', 0.05812388913159779), ('##ING', 0.05812388913159779),
('##te', 0.05775689209126553), ('##te', 0.05775689209126553),
('b', 0.057586434212371265), ('b', 0.057586434212371265),
('ori', 0.05756818718647703), ('ori', 0.05756818718647703),
('autre', 0.057531150082004046), ('autre', 0.057531150082004046),
('ce', 0.057519719498181784), ('ce', 0.057519719498181784),
('sept', 0.057519390301443825), ('sept', 0.057519390301443825),
('##én', 0.057367966434507346), ('##én', 0.057367966434507346),
('toujours', 0.05654656086498567), ('toujours', 0.05654656086498567),
('TU', 0.05645581765532423), ('TU', 0.05645581765532423),
('le', 0.05642199866799782), ('le', 0.05642199866799782),
('Saxe', 0.056318485823556044), ('Saxe', 0.056318485823556044),
('mais', 0.05591848646995751), ('mais', 0.05591848646995751),
('m', 0.05587762491014585), ('m', 0.05587762491014585),
('d', 0.05567218618641231), ('d', 0.05567218618641231),
('Ali', 0.055659756416669474), ('Ali', 0.055659756416669474),
('##ar', 0.05511323593526249), ('##ar', 0.05511323593526249),
('.', 0.054787505866608105), ('.', 0.054787505866608105),
('mura', 0.054736177441230875), ('mura', 0.054736177441230875),
('mod', 0.05471099375321398), ('mod', 0.05471099375321398),
('T', 0.0541465558097154), ('T', 0.0541465558097154),
('blessure', 0.054096736108623616), ('blessure', 0.054096736108623616),
('nomme', 0.05401793892863615), ('nomme', 0.05401793892863615),
('Cet', 0.05395581578984272), ('Cet', 0.05395581578984272),
('##nien', 0.05355078021839007), ('##nien', 0.05355078021839007),
('##fait', 0.05343946030378872), ('##fait', 0.05343946030378872),
('Lo', 0.05322826492644412), ('Lo', 0.05322826492644412),
('lat', 0.05319105448357667), ('lat', 0.05319105448357667),
('ren', 0.05244102948978422), ('ren', 0.05244102948978422),
('##éa', 0.05230588344828021), ('##éa', 0.05230588344828021),
('30', 0.05213470124947088), ('30', 0.05213470124947088),
('##lie', 0.05204283271035915), ('##lie', 0.05204283271035915),
('##riga', 0.051846567210072046), ('##riga', 0.051846567210072046),
('##cci', 0.05157091008708276), ('##cci', 0.05157091008708276),
('La', 0.051446794552354624), ('La', 0.051446794552354624),
('Lu', 0.05137836438835051), ('Lu', 0.05137836438835051),
('ensuite', 0.051200901865576555), ('ensuite', 0.051200901865576555),
('Hildesheim', 0.05099271687488835), ('Hildesheim', 0.05099271687488835),
('fondée', 0.0509239663456544), ('fondée', 0.0509239663456544),
('##uat', 0.0508369206389474), ('##uat', 0.0508369206389474),
('où', 0.05080575319876235), ('où', 0.05080575319876235),
('pur', 0.05064960239352996), ('pur', 0.05064960239352996),
('CA', 0.05046508890359498), ('CA', 0.05046508890359498),
('assez', 0.05042410659440811), ('assez', 0.05042410659440811),
('Bari', 0.05030515494264842), ('Bari', 0.05030515494264842),
('qui', 0.050173365888367016), ('qui', 0.050173365888367016),
('grandes', 0.0500708177588014), ('grandes', 0.0500708177588014),
('Za', 0.04995526964949323), ('Za', 0.04995526964949323),
('##fin', 0.04989368895071508), ('##fin', 0.04989368895071508),
('ses', 0.049645402358164066), ('ses', 0.049645402358164066),
('##ii', 0.049587268182288424), ('##ii', 0.049587268182288424),
('république', 0.04943046233577988), ('république', 0.04943046233577988),
('sur', 0.04915812925999788), ('sur', 0.04915812925999788),
('##vog', 0.04906152561188761), ('##vog', 0.04906152561188761),
('##olo', 0.04880284359551007), ('##olo', 0.04880284359551007),
('haut', 0.04823238398842199), ('haut', 0.04823238398842199),
('##s', 0.047794961185982546), ('##s', 0.047794961185982546),
('##dao', 0.04753728678912817), ('##dao', 0.04753728678912817),
('Maurice', 0.04751196968720714), ('Maurice', 0.04751196968720714),
('##ourg', 0.04583679368750848), ('##ourg', 0.04583679368750848),
('##me', 0.04554978807764583), ('##me', 0.04554978807764583),
('##uba', 0.04550942840492348), ('##uba', 0.04550942840492348),
('pré', 0.04549836271489032), ('pré', 0.04549836271489032),
('##éta', 0.04507297230896154), ('##éta', 0.04507297230896154),
('130', 0.04487521666602899), ('130', 0.04487521666602899),
('##n', 0.04476191426913992), ('##n', 0.04476191426913992),
('ils', 0.044544039157585216), ('ils', 0.044544039157585216),
('aujourd', 0.04447144001423532), ('aujourd', 0.04447144001423532),
('fort', 0.04442544031371985), ('fort', 0.04442544031371985),
('temple', 0.044018212177904886), ('temple', 0.044018212177904886),
('##ge', 0.04400356682753303), ('##ge', 0.04400356682753303),
('M', 0.04357113158723378), ('M', 0.04357113158723378),
('##que', 0.04353082158018988), ('##que', 0.04353082158018988),
('va', 0.04337036149864036), ('va', 0.04337036149864036),
('##a', 0.04319994977377055), ('##a', 0.04319994977377055),
('##tam', 0.04319673648672328), ('##tam', 0.04319673648672328),
('C', 0.04306076808452276), ('C', 0.04306076808452276),
('Pt', 0.04241563924208806), ('Pt', 0.04241563924208806),
('##rab', 0.0423110927679243), ('##rab', 0.0423110927679243),
('##rée', 0.042257144545369996), ('##rée', 0.042257144545369996),
('##iot', 0.04203676192154972), ('##iot', 0.04203676192154972),
('The', 0.04191806043439661), ('The', 0.04191806043439661),
('aussi', 0.04182030961442956), ('aussi', 0.04182030961442956),
('moderne', 0.04177711856360317), ('moderne', 0.04177711856360317),
('pr', 0.04163503255558388), ('pr', 0.04163503255558388),
('##rro', 0.041311241708329303), ('##rro', 0.041311241708329303),
('(', 0.0413040601926874), ('(', 0.0413040601926874),
('St', 0.04123715089841821), ('St', 0.04123715089841821),
('an', 0.041146473695852476), ('an', 0.041146473695852476),
('fruits', 0.04113575313188026), ('fruits', 0.04113575313188026),
('plus', 0.04112507132732335), ('plus', 0.04112507132732335),
('##ché', 0.040774048966399426), ('##ché', 0.040774048966399426),
('o', 0.04070060977171756), ('o', 0.04070060977171756),
('soi', 0.040668276790300634), ('soi', 0.040668276790300634),
('X', 0.04064410422404537), ('X', 0.04064410422404537),
('peu', 0.03996102752601043), ('peu', 0.03996102752601043),
('Per', 0.03985223606785319), ('Per', 0.03985223606785319),
('##lante', 0.03983621993726603), ('##lante', 0.03983621993726603),
('##hé', 0.03961438526125526), ('##hé', 0.03961438526125526),
('##j', 0.03926027199963966), ('##j', 0.03926027199963966),
('##ée', 0.039227260949307896), ('##ée', 0.039227260949307896),
('##esti', 0.03883953302868402), ('##esti', 0.03883953302868402),
('##ét', 0.038606207695351155), ('##ét', 0.038606207695351155),
('##jana', 0.03818439831225535), ('##jana', 0.03818439831225535),
('##ima', 0.03813331812513627), ('##ima', 0.03813331812513627),
('##ât', 0.03806250021892484), ('##ât', 0.03806250021892484),
('ensemble', 0.037956283252757204), ('ensemble', 0.037956283252757204),
('STAR', 0.037760347726659545), ('STAR', 0.037760347726659545),
('Tage', 0.03765080030853683), ('Tage', 0.03765080030853683),
('nu', 0.03745403392222305), ('nu', 0.03745403392222305),
('hui', 0.03734807021915644), ('hui', 0.03734807021915644),
('Le', 0.03715890519752807), ('Le', 0.03715890519752807),
('##us', 0.03713196635047464), ('##us', 0.03713196635047464),
('leurs', 0.03687680836802711), ('leurs', 0.03687680836802711),
('cont', 0.03657382982964496), ('cont', 0.03657382982964496),
('##l', 0.036545777864497975), ('##l', 0.036545777864497975),
('Sir', 0.03642712798689307), ('Sir', 0.03642712798689307),
('L', 0.03595023525894895), ('L', 0.03595023525894895),
('7', 0.03584389050476482), ('7', 0.03584389050476482),
('ferme', 0.03580818214260307), ('ferme', 0.03580818214260307),
('forti', 0.03549622233435284), ('forti', 0.03549622233435284),
('aux', 0.035433950894822305), ('aux', 0.035433950894822305),
('au', 0.034992091061273066), ('au', 0.034992091061273066),
('##ne', 0.0347530163976298), ('##ne', 0.0347530163976298),
('vont', 0.03470655996970495), ('vont', 0.03470655996970495),
('être', 0.0343762027708024), ('être', 0.0343762027708024),
('1553', 0.034349353274837756), ('1553', 0.034349353274837756),
('dé', 0.03434818892830247), ('dé', 0.03434818892830247),
('##sh', 0.03433270961563577), ('##sh', 0.03433270961563577),
('Lisbon', 0.034290688839724325), ('Lisbon', 0.034290688839724325),
('Est', 0.034150407829088125), ('Est', 0.034150407829088125),
('##z', 0.03365499782170345), ('##z', 0.03365499782170345),
('##e', 0.03364232916886872), ('##e', 0.03364232916886872),
('mourut', 0.03360980244865721), ('mourut', 0.03360980244865721),
('##toi', 0.03342140264875253), ('##toi', 0.03342140264875253),
('un', 0.03322588462984368), ('un', 0.03322588462984368),
('P', 0.03272049148432894), ('P', 0.03272049148432894),
('##ab', 0.032563279819096465), ('##ab', 0.032563279819096465),
('Ar', 0.032402384389004744), ('Ar', 0.032402384389004744),
('##OT', 0.03221971618476101), ('##OT', 0.03221971618476101),
('avo', 0.032212276900181), ('avo', 0.032212276900181),
('sa', 0.032102001051091615), ('sa', 0.032102001051091615),
('##ae', 0.03185063483930072), ('##ae', 0.03185063483930072),
('Hong', 0.03178504105089397), ('Hong', 0.03178504105089397),
('on', 0.031637879100526636), ('on', 0.031637879100526636),
('toile', 0.03155735621178085), ('toile', 0.03155735621178085),
('belle', 0.03147662648632047), ('belle', 0.03147662648632047),
('##bal', 0.03142378925271683), ('##bal', 0.03142378925271683),
('5', 0.0313537019882558), ('5', 0.0313537019882558),
('grain', 0.03124989952518535), ('grain', 0.03124989952518535),
('pou', 0.03124793207062872), ('pou', 0.03124793207062872),
('##ac', 0.031224938425030026), ('##ac', 0.031224938425030026),
('é', 0.031069540958438446), ('é', 0.031069540958438446),
('##ia', 0.031040617643215574), ('##ia', 0.031040617643215574),
('R', 0.030949942486234276), ('R', 0.030949942486234276),
('Tra', 0.030808153025433653), ('Tra', 0.030808153025433653),
('différentes', 0.030718941715107615), ('différentes', 0.030718941715107615),
('MO', 0.030683540302786836), ('MO', 0.030683540302786836),
('2°', 0.030522230876155416), ('2°', 0.030522230876155416),
('vin', 0.030361970705734747), ('vin', 0.030361970705734747),
('##dou', 0.030107317328042935), ('##dou', 0.030107317328042935),
('plu', 0.03005047616189665), ('plu', 0.03005047616189665),
('##oi', 0.02989229975500687), ('##oi', 0.02989229975500687),
('##thus', 0.029706139074928077), ('##thus', 0.029706139074928077),
('poi', 0.0294521548866359), ('poi', 0.0294521548866359),
('VII', 0.02928677630182494), ('VII', 0.02928677630182494),
('du', 0.02917787047349929), ('du', 0.02917787047349929),
('év', 0.029168399068651336), ('év', 0.029168399068651336),
('exemple', 0.02878307063738609), ('exemple', 0.02878307063738609),
('##la', 0.028711752575946197), ('##la', 0.028711752575946197),
('##hab', 0.02854877671553812), ('##hab', 0.02854877671553812),
('##rig', 0.028440723473813656), ('##rig', 0.028440723473813656),
('Il', 0.0284042208786913), ('Il', 0.0284042208786913),
('bien', 0.0283913554226912), ('bien', 0.0283913554226912),
('##lp', 0.028348485933541584), ('##lp', 0.028348485933541584),
('##bourg', 0.02789169534010975), ('##bourg', 0.02789169534010975),
('##ton', 0.027713782580636243), ('##ton', 0.027713782580636243),
('##uri', 0.026965622143799262), ('##uri', 0.026965622143799262),
('Ra', 0.026882539673090518), ('Ra', 0.026882539673090518),
('plusieurs', 0.02669902130425908), ('plusieurs', 0.02669902130425908),
('en', 0.026650124208107327), ('en', 0.026650124208107327),
('##ssent', 0.026560785640090002), ('##ssent', 0.026560785640090002),
('ar', 0.02655491286531302), ('ar', 0.02655491286531302),
('##dia', 0.026233515028085855), ('##dia', 0.026233515028085855),
('terme', 0.02610804943109468), ('terme', 0.02610804943109468),
('##SA', 0.02594775540828238), ('##SA', 0.02594775540828238),
('##on', 0.025900328149956048), ('##on', 0.025900328149956048),
('##men', 0.025864880430748294), ('##men', 0.025864880430748294),
('donna', 0.025856769861851345), ('donna', 0.025856769861851345),
('con', 0.025748442770215568), ('con', 0.025748442770215568),
('suivant', 0.02558394558733645), ('suivant', 0.02558394558733645),
('##é', 0.02500576025356923), ('##é', 0.02500576025356923),
('Ses', 0.024714865988903954), ('Ses', 0.024714865988903954),
('##cus', 0.024522117453619832), ('##cus', 0.024522117453619832),
('fait', 0.024187752198256366), ('fait', 0.024187752198256366),
('##tr', 0.023949993283441592), ('##tr', 0.023949993283441592),
('##ade', 0.023830238910252367), ('##ade', 0.023830238910252367),
('##ziu', 0.023636246682512362), ('##ziu', 0.023636246682512362),
('celui', 0.023612546696296227), ('celui', 0.023612546696296227),
('E', 0.02353268737063088), ('E', 0.02353268737063088),
('##cer', 0.023491982765047097), ('##cer', 0.023491982765047097),
('ne', 0.023481641713517833), ('ne', 0.023481641713517833),
('Sam', 0.023408934631155117), ('Sam', 0.023408934631155117),
('##o', 0.023331497432489353), ('##o', 0.023331497432489353),
('côté', 0.023297246836922872), ('côté', 0.023297246836922872),
('##ent', 0.023156820087233715), ('##ent', 0.023156820087233715),
('Pier', 0.022936152887815472), ('Pier', 0.022936152887815472),
('Elle', 0.022927855199012262), ('Elle', 0.022927855199012262),
('##xiu', 0.022805612841692573), ('##xiu', 0.022805612841692573),
('##od', 0.022641329488828195), ('##od', 0.022641329488828195),
('n', 0.022605076586881327), ('n', 0.022605076586881327),
('##noi', 0.02239494074557525), ('##noi', 0.02239494074557525),
('puisque', 0.022114351087644566), ('puisque', 0.022114351087644566),
('##es', 0.022001482046772214), ('##es', 0.022001482046772214),
('cela', 0.021728661127459404), ('cela', 0.021728661127459404),
('##al', 0.021472918740770543), ('##al', 0.021472918740770543),
('qu', 0.021413621005676176), ('qu', 0.021413621005676176),
('##ros', 0.021349833073345013), ('##ros', 0.021349833073345013),
('dont', 0.02121929633019263), ('dont', 0.02121929633019263),
('D', 0.021142184892655142), ('D', 0.021142184892655142),
('##i', 0.020948732860884275), ('##i', 0.020948732860884275),
('Ist', 0.020901809554305005), ('Ist', 0.020901809554305005),
('bout', 0.020885765063577755), ('bout', 0.020885765063577755),
('SW', 0.02082288328341326), ('SW', 0.02082288328341326),
('soit', 0.020530716687414827), ('soit', 0.020530716687414827),
('##tend', 0.020495566508082305), ('##tend', 0.020495566508082305),
('##ect', 0.020450927253329462), ('##ect', 0.020450927253329462),
('ex', 0.020417873274345934), ('ex', 0.020417873274345934),
('CIA', 0.020303354874778645), ('CIA', 0.020303354874778645),
('Lune', 0.02024761291388681), ('Lune', 0.02024761291388681),
('vers', 0.020157214084907837), ('vers', 0.020157214084907837),
('##x', 0.02002047701965849), ('##x', 0.02002047701965849),
('loin', 0.019550798182729083), ('loin', 0.019550798182729083),
('Or', 0.019460845999311357), ('Or', 0.019460845999311357),
('après', 0.01941062923428665), ('après', 0.01941062923428665),
('##euse', 0.019031264633545753), ('##euse', 0.019031264633545753),
('##gnar', 0.018810819002694525), ('##gnar', 0.018810819002694525),
('app', 0.018555258608997136), ('app', 0.018555258608997136),
('dit', 0.018481328341026063), ('dit', 0.018481328341026063),
('Dal', 0.018382978798721323), ('Dal', 0.018382978798721323),
('##u', 0.018374918227908328), ('##u', 0.018374918227908328),
('##jano', 0.01808523851267019), ('##jano', 0.01808523851267019),
('##ie', 0.017838742894118582), ('##ie', 0.017838742894118582),
('##gent', 0.01748654204829846), ('##gent', 0.01748654204829846),
('él', 0.01741279676584186), ('él', 0.01741279676584186),
('Sar', 0.01724425985017986), ('Sar', 0.01724425985017986),
('##rie', 0.017169840154727146), ('##rie', 0.017169840154727146),
('##ject', 0.017138782093343834), ('##ject', 0.017138782093343834),
('##imo', 0.017082182198960103), ('##imo', 0.017082182198960103),
('ora', 0.017038607009944622), ('ora', 0.017038607009944622),
('##ule', 0.0166859230453766), ('##ule', 0.0166859230453766),
('jours', 0.016570340432660837), ('jours', 0.016570340432660837),
('eu', 0.01603845134764128), ('eu', 0.01603845134764128),
('Dion', 0.015754758531027112), ('Dion', 0.015754758531027112),
('sang', 0.015613060865029399), ('sang', 0.015613060865029399),
('##re', 0.015364885611094212), ('##re', 0.015364885611094212),
('##zog', 0.014949231310082899), ('##zog', 0.014949231310082899),
('U', 0.014654267300384672), ('U', 0.014654267300384672),
('##ît', 0.014602155227601584), ('##ît', 0.014602155227601584),
('avant', 0.014469418561726512), ('avant', 0.014469418561726512),
('commence', 0.014379318563949204), ('commence', 0.014379318563949204),
('##êl', 0.014337833594615367), ('##êl', 0.014337833594615367),
('##voi', 0.01431455878734083), ('##voi', 0.01431455878734083),
('Jupiter', 0.01427103838187756), ('Jupiter', 0.01427103838187756),
('souvent', 0.01412861492800689), ('souvent', 0.01412861492800689),
('danger', 0.013811198066105809), ('danger', 0.013811198066105809),
('##r', 0.013680800966903325), ('##r', 0.013680800966903325),
('Au', 0.013620171062788134), ('Au', 0.013620171062788134),
('signifie', 0.013575249323296856), ('signifie', 0.013575249323296856),
('##pt', 0.013195773759531383), ('##pt', 0.013195773759531383),
('##oe', 0.013082684704750751), ('##oe', 0.013082684704750751),
('leur', 0.012756004109524549), ('leur', 0.012756004109524549),
('##bre', 0.01200562189665623), ('##bre', 0.01200562189665623),
('toute', 0.011917089830431075), ('toute', 0.011917089830431075),
('met', 0.011327151448637373), ('met', 0.011327151448637373),
('##ies', 0.01091580579338499), ('##ies', 0.01091580579338499),
('##eta', 0.01069390934382476), ('##eta', 0.01069390934382476),
('##ns', 0.010472519252392662), ('##ns', 0.010472519252392662),
('torre', 0.010422364080421317), ('torre', 0.010422364080421317),
('ach', 0.010388292909415044), ('ach', 0.010388292909415044),
('##oso', 0.010042907154746029), ('##oso', 0.010042907154746029),
('que', 0.009635328008189985), ('que', 0.009635328008189985),
('##éra', 0.009599163212087692), ('##éra', 0.009599163212087692),
('##ucha', 0.009319170948366045), ('##ucha', 0.009319170948366045),
('##ds', 0.00923735338058978), ('##ds', 0.00923735338058978),
('##qui', 0.009204343050603915), ('##qui', 0.009204343050603915),
('##mi', 0.008906287279368003), ('##mi', 0.008906287279368003),
('Cassius', 0.008545442448074718), ('Cassius', 0.008545442448074718),
('On', 0.008215765462606122), ('On', 0.008215765462606122),
('##cle', 0.008070864174746364), ('##cle', 0.008070864174746364),
(';', 0.007984840521573309), (';', 0.007984840521573309),
('Ju', 0.0073498549683621225), ('Ju', 0.0073498549683621225),
('pas', 0.006987930951319245), ('pas', 0.006987930951319245),
('co', 0.006828431290704029), ('co', 0.006828431290704029),
('cha', 0.0067258655884391136), ('cha', 0.0067258655884391136),
('##zet', 0.0063275438230979185), ('##zet', 0.0063275438230979185),
('fine', 0.006245003686740857), ('fine', 0.006245003686740857),
('##fier', 0.006096813392549551), ('##fier', 0.006096813392549551),
('##uy', 0.006086328817223191), ('##uy', 0.006086328817223191),
('##el', 0.0060696145131198575), ('##el', 0.0060696145131198575),
('##éir', 0.0059480290294971795), ('##éir', 0.0059480290294971795),
('che', 0.005705843958191305), ('che', 0.005705843958191305),
('ca', 0.0056714294799304005), ('ca', 0.0056714294799304005),
('Am', 0.005564187960915995), ('Am', 0.005564187960915995),
('##roi', 0.005468227545142956), ('##roi', 0.005468227545142956),
('##dent', 0.00517282162953671), ('##dent', 0.00517282162953671),
('##ème', 0.004959174323142408), ('##ème', 0.004959174323142408),
('##crit', 0.004866795460923155), ('##crit', 0.004866795460923155),
('##t', 0.0048467472813908796), ('##t', 0.0048467472813908796),
('III', 0.00283531432530362), ('III', 0.00283531432530362),
('trés', 0.002824228104975018), ('trés', 0.002824228104975018),
('il', 0.0027654637201536968), ('il', 0.0027654637201536968),
('pour', 0.0021363265380875763), ('pour', 0.0021363265380875763),
('##mon', 0.002094276853030689), ('##mon', 0.002094276853030689),
('##eur', 0.0019017102256399137), ('##eur', 0.0019017102256399137),
('##oit', 0.001711933310961672), ('##oit', 0.001711933310961672),
('##bu', 0.0016690113836018768), ('##bu', 0.0016690113836018768),
('##it', 0.0015963173159964771), ('##it', 0.0015963173159964771),
('##lot', 0.0014408876850427973), ('##lot', 0.0014408876850427973),
('termine', 0.0013832185393301071), ('termine', 0.0013832185393301071),
('x', 0.0008492312750708209), ('x', 0.0008492312750708209),
('[CLS]', 0.0), ('[CLS]', 0.0),
('[SEP]', 0.0), ('[SEP]', 0.0),
('sous', -0.0002069181948599748), ('sous', -0.0002069181948599748),
('a', -0.0012715805160076403), ('a', -0.0012715805160076403),
('##ors', -0.0017502978171956062), ('##ors', -0.0017502978171956062),
('SI', -0.0025126387312130413), ('SI', -0.0025126387312130413),
('tout', -0.003079531645644194), ('tout', -0.003079531645644194),
('in', -0.003095317243436783), ('in', -0.003095317243436783),
('donne', -0.0033814500703101104), ('donne', -0.0033814500703101104),
('##este', -0.004142491388013976), ('##este', -0.004142491388013976),
('##rama', -0.004348877527747782), ('##rama', -0.004348877527747782),
('traf', -0.005273322147678258), ('traf', -0.005273322147678258),
('lati', -0.006057162419608609), ('lati', -0.006057162419608609),
('##ment', -0.0064917887675220295), ('##ment', -0.0064917887675220295),
('c', -0.006530315247605613), ('c', -0.006530315247605613),
('##d', -0.007455893077793125), ('##d', -0.007455893077793125),
('creu', -0.007475521658293897), ('creu', -0.007475521658293897),
('h', -0.007529062495164878), ('h', -0.007529062495164878),
('##vo', -0.007773218105616921), ('##vo', -0.007773218105616921),
('##illes', -0.007961146859700573), ('##illes', -0.007961146859700573),
('##nt', -0.010146726047092878), ('##nt', -0.010146726047092878),
('portent', -0.010660914154466471), ('portent', -0.010660914154466471),
('Quo', -0.011600711502630034), ('Quo', -0.011600711502630034),
('XI', -0.011949950511950972), ('XI', -0.011949950511950972),
('##ique', -0.014304967428463296), ('##ique', -0.014304967428463296),
('liv', -0.01661384524933283), ('liv', -0.01661384524933283),
('##ades', -0.01685895398059224), ('##ades', -0.01685895398059224),
('selon', -0.020082683114963043), ('selon', -0.020082683114963043),
('##ifier', -0.024313910709926605), ('##ifier', -0.024313910709926605),
('1°', -0.026381150057445416), ('1°', -0.026381150057445416),
('##ure', -0.029633249072781927), ('##ure', -0.029633249072781927),
('seul', -0.0305767799476541), ('seul', -0.0305767799476541),
('tem', -0.03507853370176394), ('tem', -0.03507853370176394),
('accord', -0.0385442098622452), ('accord', -0.0385442098622452),
(':', -0.04107125437938059), (':', -0.04107125437938059),
('born', -0.041898895862637786), ('born', -0.041898895862637786),
('to', -0.043587086241707595), ('to', -0.043587086241707595),
('y', -0.044396462419225395), ('y', -0.044396462419225395),
('audience', -0.045768989771362824), ('audience', -0.045768989771362824),
('S', -0.04602159510840381), ('S', -0.04602159510840381),
('let', -0.0461111572568852), ('let', -0.0461111572568852),
('sentiment', -0.048171146031505886), ('sentiment', -0.048171146031505886),
('nach', -0.059647272844616725), ('nach', -0.059647272844616725),
(')', -0.06872271549404119), (')', -0.06872271549404119),
('SAR', -0.07143494474675065), ('SAR', -0.07143494474675065),
('Torre', -0.07370789174796667), ('Torre', -0.07370789174796667),
('&', -0.08000801041182604), ('&', -0.08000801041182604),
('Comme', -0.08102775997694327), ('Comme', -0.08102775997694327),
('Port', -0.1040613277987558)] ('Port', -0.1040613277987558)]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Rank the (key, value) pairs stored under d['Histoire'] by value, largest
# first. sorted() is stable, so pairs with equal values keep the dict's
# iteration order -- the same result as copying the items and calling
# list.sort(reverse=True) on the copy.
# NOTE(review): the name `l` is kept as-is in case other cells read it.
l = sorted(d['Histoire'].items(), key=lambda pair: pair[1], reverse=True)
# Bare expression so the notebook renders the sorted list as the cell output.
l
``` ```
%% Output %% Output
[('m', 0.40633566091131035), [('m', 0.40633566091131035),
('nom', 0.3458214847350235), ('nom', 0.3458214847350235),
('historiens', 0.30862924390515073), ('historiens', 0.30862924390515073),
('s', 0.3077777094666005), ('s', 0.3077777094666005),
('Angleterre', 0.2337920536519143), ('Angleterre', 0.2337920536519143),
('##ient', 0.19580712386703672), ('##ient', 0.19580712386703672),
('comté', 0.14833858844766837), ('comté', 0.14833858844766837),
('second', 0.14820616290018154), ('second', 0.14820616290018154),
('pl', 0.1463281902011242), ('pl', 0.1463281902011242),
('poste', 0.14323795796283648), ('poste', 0.14323795796283648),
('Hist', 0.14218800778526514), ('Hist', 0.14218800778526514),
('##ito', 0.13969328688970806), ('##ito', 0.13969328688970806),
('##sita', 0.1280014345411447), ('##sita', 0.1280014345411447),
('hab', 0.11652014547079982), ('hab', 0.11652014547079982),
('comme', 0.11188170814742139), ('comme', 0.11188170814742139),
('anciens', 0.11027232859611118), ('anciens', 0.11027232859611118),
('##usi', 0.10947085362788891), ('##usi', 0.10947085362788891),
('nos', 0.10762903275291207), ('nos', 0.10762903275291207),
('##X', 0.1005805525354637), ('##X', 0.1005805525354637),
('##c', 0.10015186438256996), ('##c', 0.10015186438256996),
('an', 0.09648203036131225), ('an', 0.09648203036131225),
('affect', 0.09530701819855351), ('affect', 0.09530701819855351),
('vient', 0.09334766704470135), ('vient', 0.09334766704470135),
('##E', 0.09274172459154978), ('##E', 0.09274172459154978),
('occupa', 0.08942963262827937), ('occupa', 0.08942963262827937),
('And', 0.08824648677383519), ('And', 0.08824648677383519),
('l', 0.0831444108231522), ('l', 0.0831444108231522),
('créé', 0.08306820589587903), ('créé', 0.08306820589587903),
('Il', 0.0802525747513547), ('Il', 0.0802525747513547),
('##alo', 0.07944165678487682), ('##alo', 0.07944165678487682),
('fils', 0.07878572633572721), ('fils', 0.07878572633572721),
('##crive', 0.07730113556902), ('##crive', 0.07730113556902),
('f', 0.07687161513015221), ('f', 0.07687161513015221),
('duché', 0.07152755021320736), ('duché', 0.07152755021320736),
('##UL', 0.068920897640158), ('##UL', 0.068920897640158),
('##U', 0.06701168109866709), ('##U', 0.06701168109866709),
('##TA', 0.06379073714776834), ('##TA', 0.06379073714776834),
('partie', 0.06336604967850502), ('partie', 0.06336604967850502),
('une', 0.06293001400537444), ('une', 0.06293001400537444),
('la', 0.0626827034395569), ('la', 0.0626827034395569),
('##NI', 0.06060879691257725), ('##NI', 0.06060879691257725),
('duc', 0.060449964135058896), ('duc', 0.060449964135058896),
('Portugal', 0.05976240345910968), ('Portugal', 0.05976240345910968),
('##S', 0.057922495862996685), ('##S', 0.057922495862996685),
('armes', 0.05763547000153224), ('armes', 0.05763547000153224),
('##É', 0.057597313355056134), ('##É', 0.057597313355056134),
('premier', 0.05650131136433099), ('premier', 0.05650131136433099),
('roi', 0.05595339444458686), ('roi', 0.05595339444458686),
("'", 0.054559329877048986), ("'", 0.054559329877048986),
('Romains', 0.05104433937584386), ('Romains', 0.05104433937584386),
('##CI', 0.04991247745965172), ('##CI', 0.04991247745965172),
('dans', 0.04683541514897137), ('dans', 0.04683541514897137),
('posse', 0.04601685231132299), ('posse', 0.04601685231132299),
('##étique', 0.04578876328373981), ('##étique', 0.04578876328373981),
('fut', 0.04558935519068275), ('fut', 0.04558935519068275),
('tem', 0.04553165486762242), ('tem', 0.04553165486762242),
('mod', 0.04336368647999369), ('mod', 0.04336368647999369),
('sa', 0.04312663582043505), ('sa', 0.04312663582043505),
('##ond', 0.040984157917012765), ('##ond', 0.040984157917012765),
('avo', 0.04051199330086981), ('avo', 0.04051199330086981),
('femme', 0.03757626958787695), ('femme', 0.03757626958787695),
('Clare', 0.036022498192356825), ('Clare', 0.036022498192356825),
(')', 0.035690660988266225), (')', 0.035690660988266225),
('c', 0.035580215549090806), ('c', 0.035580215549090806),
('trois', 0.03534531063748135), ('trois', 0.03534531063748135),
('##e', 0.0350972190161163), ('##e', 0.0350972190161163),
('terre', 0.03289370991419495), ('terre', 0.03289370991419495),
('é', 0.03272807035735632), ('é', 0.03272807035735632),
('François', 0.03239359465926122), ('François', 0.03239359465926122),
('Espagne', 0.032368113353630164), ('Espagne', 0.032368113353630164),
('devenu', 0.03229911266839503), ('devenu', 0.03229911266839503),
('Vo', 0.03188328560089204), ('Vo', 0.03188328560089204),
('##EN', 0.031208214635954697), ('##EN', 0.031208214635954697),
('III', 0.030991483761359837), ('III', 0.030991483761359837),
('h', 0.030069242575632493), ('h', 0.030069242575632493),
('Roi', 0.030003469570913023), ('Roi', 0.030003469570913023),
('##ye', 0.02903003695461631), ('##ye', 0.02903003695461631),
('à', 0.028589994974646694), ('à', 0.028589994974646694),
('Clarence', 0.028489464477414392), ('Clarence', 0.028489464477414392),
('IV', 0.028143981425416645), ('IV', 0.028143981425416645),
('##nie', 0.02768313810342139), ('##nie', 0.02768313810342139),
('étant', 0.026340927727638207), ('étant', 0.026340927727638207),
('##sse', 0.026016168412132286), ('##sse', 0.026016168412132286),
('##it', 0.022331074154075873), ('##it', 0.022331074154075873),
('de', 0.021088133952241003), ('de', 0.021088133952241003),
('Edouard', 0.020242216027167314), ('Edouard', 0.020242216027167314),
('(', 0.019469240875565032), ('(', 0.019469240875565032),
('un', 0.01930394072485968), ('un', 0.01930394072485968),
('##IE', 0.018411360404024364), ('##IE', 0.018411360404024364),
('Thom', 0.017349739492096148), ('Thom', 0.017349739492096148),
('ce', 0.01719950485751714), ('ce', 0.01719950485751714),
('##ur', 0.016483046513021214), ('##ur', 0.016483046513021214),
('##nt', 0.016016739337704203), ('##nt', 0.016016739337704203),
('##chu', 0.015601898501551158), ('##chu', 0.015601898501551158),
('lui', 0.014232570873439493), ('lui', 0.014232570873439493),
('d', 0.012780386271698287), ('d', 0.012780386271698287),
('app', 0.012536523456252611), ('app', 0.012536523456252611),
('##s', 0.011660356091300485), ('##s', 0.011660356091300485),
('Ce', 0.010793171581974913), ('Ce', 0.010793171581974913),
('le', 0.010778818997477383), ('le', 0.010778818997477383),
('##ée', 0.009759282079499483), ('##ée', 0.009759282079499483),
('##oit', 0.008672358830146106), ('##oit', 0.008672358830146106),
('##arten', 0.008646284725542878), ('##arten', 0.008646284725542878),
('il', 0.005702086021636536), ('il', 0.005702086021636536),
('##NS', 0.005693935541883908), ('##NS', 0.005693935541883908),
('en', 0.0043551674016225304), ('en', 0.0043551674016225304),
('##éra', 0.004037004014395021), ('##éra', 0.004037004014395021),
('##ort', 0.003902753055284708), ('##ort', 0.003902753055284708),
('##CE', 0.0003593292110211764), ('##CE', 0.0003593292110211764),
('[CLS]', 0.0), ('[CLS]', 0.0),
('[SEP]', 0.0), ('[SEP]', 0.0),
('##ut', -9.915939616175563e-05), ('##ut', -9.915939616175563e-05),
('##réa', -0.0010687617835396714), ('##réa', -0.0010687617835396714),
('##ieme', -0.007155774598079843), ('##ieme', -0.007155774598079843),
('##z', -0.008830385943257218), ('##z', -0.008830385943257218),
('au', -0.010348566917927805), ('au', -0.010348566917927805),
('Lionel', -0.013749864105647325), ('Lionel', -0.013749864105647325),
('##é', -0.015446660519507572), ('##é', -0.015446660519507572),
('Lu', -0.020325085739928896), ('Lu', -0.020325085739928896),
('que', -0.020383708539975235), ('que', -0.020383708539975235),
('TU', -0.021388873125663347), ('TU', -0.021388873125663347),
('-', -0.025802582476853374), ('-', -0.025802582476853374),
('des', -0.027998968502838124), ('des', -0.027998968502838124),
('##AR', -0.029229992016931022), ('##AR', -0.029229992016931022),
('B', -0.029730676972465106), ('B', -0.029730676972465106),
('##RD', -0.030606004670801854), ('##RD', -0.030606004670801854),
('du', -0.04385291573716607), ('du', -0.04385291573716607),
('.', -0.04457615506556049), ('.', -0.04457615506556049),
('no', -0.047490509302559034), ('no', -0.047490509302559034),
('ou', -0.04832152919030737), ('ou', -0.04832152919030737),
('qui', -0.05547125005789886), ('qui', -0.05547125005789886),
('CL', -0.07023955246241759), ('CL', -0.07023955246241759),
(',', -0.07295463363450089), (',', -0.07295463363450089),
('peuples', -0.15972786319846213), ('peuples', -0.15972786319846213),
('mariage', -0.2230970846131238), ('mariage', -0.2230970846131238),
('&', -0.2716457925014497)] ('&', -0.2716457925014497)]
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment