diff --git a/scripts/Predict_LGE.py b/scripts/Predict_LGE.py
new file mode 100644
index 0000000000000000000000000000000000000000..a20544d9b24ae18cb41a11dcb4edea5cd4509102
--- /dev/null
+++ b/scripts/Predict_LGE.py
@@ -0,0 +1,200 @@
+import os
+import pandas as pd
+import numpy as np
+import pickle
+import torch
+import tqdm
+
+from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
+from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
+
+
+def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
+
+    # Tokenize all of the sentences and map the tokens to their word IDs.
+    input_ids_test = []
+    # For every sentence...
+    for sent in sentences:
+        # `encode` will:
+        #   (1) Tokenize the sentence.
+        #   (2) Prepend the `[CLS]` token to the start.
+        #   (3) Append the `[SEP]` token to the end.
+        #   (4) Map tokens to their IDs.
+        encoded_sent = tokenizer.encode(
+            sent,                       # Sentence to encode.
+            add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
+            # This function also supports truncation and conversion
+            # to pytorch tensors, but we need to do the padding manually,
+            # so we can't use these features here.
+            #max_length = max_len,      # Truncate all sentences.
+            #return_tensors = 'pt',     # Return pytorch tensors.
+        )
+        input_ids_test.append(encoded_sent)
+
+    # Truncate and pad our input tokens to max_len.
+    padded_test = []
+    for i in input_ids_test:
+        if len(i) > max_len:
+            padded_test.append(i[:max_len])
+        else:
+            padded_test.append(i + [0] * (max_len - len(i)))
+    input_ids_test = np.array(padded_test)
+
+    # Create attention masks:
+    attention_masks = []
+
+    # a mask of 1s for each real token followed by 0s for padding.
+    for seq in input_ids_test:
+        seq_mask = [float(i > 0) for i in seq]
+        attention_masks.append(seq_mask)
+
+    # Convert to tensors.
+    inputs = torch.tensor(input_ids_test)
+    masks = torch.tensor(attention_masks)
+
+    # Create the DataLoader (sequential sampling, fixed batch size).
+    data = TensorDataset(inputs, masks)
+    prediction_sampler = SequentialSampler(data)
+
+    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)
+
+
+def predict(model, dataloader, device):
+
+    # Put the model in evaluation mode.
+    model.eval()
+
+    # Tracking variables.
+    predictions_test = []
+    pred_labels_ = []
+
+    # Predict
+    for batch in dataloader:
+        # Add the batch to the GPU.
+        batch = tuple(t.to(device) for t in batch)
+
+        # Unpack the inputs from the dataloader.
+        b_input_ids, b_input_mask = batch
+
+        # Telling the model not to compute or store gradients saves memory and
+        # speeds up prediction.
+        with torch.no_grad():
+            # Forward pass, calculate logit predictions.
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+            logits = outputs[0]
+
+        # Move the logits to the CPU.
+        logits = logits.detach().cpu().numpy()
+
+        # Store the predictions.
+        predictions_test.append(logits)
+
+    pred_labels = []
+
+    for i in range(len(predictions_test)):
+        # The predictions for this batch are an ndarray with one column per class.
+        # Pick the label with the highest score and turn this into a list of
+        # predicted class ids.
+        pred_labels_i = np.argmax(predictions_test[i], axis=1).flatten()
+        pred_labels.append(pred_labels_i)
+
+    pred_labels_ += [item for sublist in pred_labels for item in sublist]
+    return pred_labels_
+
+
+def text_folder_to_dataframe(path):
+
+    data = []
+    # Columns: id, tome, filename, content, nb_words ('domain' is added after prediction).
+
+    for tome in sorted(os.listdir(path)):
+        for article in sorted(os.listdir(path + "/" + tome)):
+            filename = article[:-4]
+            id = tome + filename
+
+            if article[-4:] == ".txt":
+                with open(path + "/" + tome + "/" + article) as f:
+                    content = f.read()
+
+                data.append([id, tome, filename, content, len(content.split(' '))])
+
+    return pd.DataFrame(data, columns=['id', 'tome', 'filename', 'content', 'nb_words'])
+
+
+if __name__ == '__main__':
+
+    # If there's a GPU available...
+    if torch.cuda.is_available():
+        # Tell PyTorch to use the GPU.
+        device = torch.device("cuda")
+        gpu_name = "cuda"
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+    # For macOS (Apple Silicon, via MPS).
+    elif torch.backends.mps.is_available() and torch.backends.mps.is_built():
+        device = torch.device("mps")
+        gpu_name = "mps"
+        print('We will use the GPU (MPS)')
+    else:
+        device = torch.device("cpu")
+        gpu_name = "cpu"
+        print('No GPU available, using the CPU instead.')
+
+    #############
+    ## Load data
+    print("* Load data")
+
+    path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text/"
+
+    df_LGE = text_folder_to_dataframe(path)
+    #df_LGE = pd.read_csv(path + "data/LGE_withContent.tsv", sep="\t")
+    data_LGE = df_LGE["content"].values
+
+    #df_LGE.head()
+    #df_LGE.shape
+
+    #############
+    ## Load model
+    print("* Load model")
+
+    model_name = "bert-base-multilingual-cased"
+    #model_name = "camembert-base"
+    model_path = path + "models/model_" + model_name + "_s10000.pt"
+
+    if model_name == 'bert-base-multilingual-cased':
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(model_name)
+    elif model_name == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(model_name)
+
+    data_loader = generate_dataloader(tokenizer, data_LGE)
+
+    model = BertForSequenceClassification.from_pretrained(model_path).to(gpu_name)
+
+    #############
+    ## Predict
+    print("* Predict")
+
+    pred = predict(model, data_loader, device)
+
+    encoder_filename = "models/label_encoder.pkl"
+    with open(path + encoder_filename, 'rb') as file:
+        encoder = pickle.load(file)
+
+    p2 = list(encoder.inverse_transform(pred))
+
+    df_LGE['domain'] = p2
+
+    #############
+    ## Save results
+    filepath = path + "results_LGE/metadata-withContent.csv"
+    print("* Save results: ", filepath)
+    df_LGE.to_csv(filepath, sep=",")
+
+    df_LGE.drop(columns=['content'], inplace=True)
+    filepath = path + "results_LGE/metadata.csv"
+    print("* Save results: ", filepath)
+    df_LGE.to_csv(filepath, sep=",")
+
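
For reference, here is a minimal sketch (not part of the patch) of how the two helpers added here, `generate_dataloader` and `predict`, could be exercised on a handful of sentences outside the `__main__` block. The import path, the checkpoint path, and the example sentences are placeholders rather than values taken from the patch; it assumes the fine-tuned checkpoint can be loaded with `from_pretrained`, as the script itself does. Mapping the predicted ids back to domain names would still go through the pickled `label_encoder.pkl`, as in the script.

```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Hypothetical import path; adjust to wherever scripts/Predict_LGE.py is importable from.
from Predict_LGE import generate_dataloader, predict

# Hypothetical checkpoint path and placeholder sentences, for illustration only.
model_path = "models/model_bert-base-multilingual-cased_s10000.pt"
sentences = [
    "Première notice de test.",
    "Seconde notice de test.",
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
model = BertForSequenceClassification.from_pretrained(model_path).to(device)

dataloader = generate_dataloader(tokenizer, sentences, batch_size=2, max_len=512)
pred = predict(model, dataloader, device)  # list of integer class ids, one per sentence
print(pred)
```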