Create Predict_LGE.py

01434873 · Ludovic Moncla · cafa9021 · 01434873
Commit 01434873 authored 2 years ago by Ludovic Moncla
--- a/scripts/Predict_LGE.py
+++ b/scripts/Predict_LGE.py
+import os
+import pandas as pd 
+import numpy as np
+import pickle 
+import torch
+import tqdm
+
+from transformers import BertTokenizer, BertForSequenceClassification, CamembertTokenizer, CamembertForSequenceClassification
+from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
+
+
+def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
+    """Encode sentences into fixed-length ID rows and wrap them in a DataLoader.
+
+    Each sentence is encoded with ``tokenizer.encode`` (special tokens
+    included), then truncated or right-padded to exactly ``max_len`` IDs.
+
+    Args:
+        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
+            that returns a list of token IDs. Its ``pad_token_id`` attribute
+            is used as the padding value when present; otherwise 0 is used.
+        sentences: Iterable of strings to encode.
+        batch_size: Number of sequences per batch.
+        max_len: Length every sequence is truncated or padded to.
+
+    Returns:
+        A ``DataLoader`` yielding ``(input_ids, attention_mask)`` batches in
+        the same order as ``sentences``.
+    """
+    # Pad with the tokenizer's own padding ID rather than assuming it is 0.
+    pad_id = getattr(tokenizer, "pad_token_id", None)
+    if pad_id is None:
+        pad_id = 0
+
+    # `encode` tokenizes, adds the special tokens and maps tokens to IDs.
+    # Truncation and padding are done by hand below.
+    # NOTE(review): truncation is a plain slice, so an over-long sequence
+    # loses its closing special token. Kept as-is so the inputs presumably
+    # match what the model saw during fine-tuning -- TODO confirm.
+    encoded = [
+        tokenizer.encode(sent, add_special_tokens=True)[:max_len]
+        for sent in sentences
+    ]
+
+    # The mask is built from the real sequence lengths, not from the ID
+    # values, so a genuine token whose ID is 0 is never masked out and
+    # padding is always masked whatever the padding ID is.
+    input_ids = np.full((len(encoded), max_len), pad_id, dtype=np.int64)
+    attention_masks = np.zeros((len(encoded), max_len), dtype=np.float32)
+    for row, ids in enumerate(encoded):
+        input_ids[row, :len(ids)] = ids
+        attention_masks[row, :len(ids)] = 1.0
+
+    # Convert to tensors.
+    inputs = torch.tensor(input_ids)
+    masks = torch.tensor(attention_masks)
+
+    # Sequential sampling keeps predictions aligned with the input order.
+    data = TensorDataset(inputs, masks)
+    return DataLoader(data, sampler=SequentialSampler(data), batch_size=batch_size)
+
+
+def predict(model, dataloader, device):
+    """Run the classifier over every batch and return the predicted class indices.
+
+    Args:
+        model: Callable model exposing ``eval()``; it is called as
+            ``model(input_ids, token_type_ids=None, attention_mask=mask)``
+            and the first element of its output is taken as the logits.
+        dataloader: Iterable of ``(input_ids, attention_mask)`` tensor pairs.
+        device: Device the batch tensors are moved to before the forward pass.
+
+    Returns:
+        A flat list with one predicted class index per input sequence, in
+        dataloader order. Empty when the dataloader yields no batch.
+    """
+    # Put model in evaluation mode
+    model.eval()
+
+    pred_labels = []
+    for batch in dataloader:
+        # Move the batch to the target device and unpack it.
+        b_input_ids, b_input_mask = (t.to(device) for t in batch)
+
+        # No gradients are needed for inference; skipping them saves memory
+        # and speeds up prediction.
+        with torch.no_grad():
+            outputs = model(b_input_ids, token_type_ids=None,
+                            attention_mask=b_input_mask)
+
+        # Bring the logits back to the CPU as a NumPy array.
+        logits = outputs[0].detach().cpu().numpy()
+
+        # Each row holds one score per class: keep the index of the best one.
+        # Labels are collected batch by batch, so earlier batches are never
+        # reprocessed and their logits are not kept in memory.
+        pred_labels.extend(np.argmax(logits, axis=1).flatten())
+
+    return pred_labels
+
+
+def text_folder_to_dataframe(path):
+    """Load a ``<path>/<tome>/<article>.txt`` tree into a DataFrame.
+
+    Every sub-directory of ``path`` is treated as a tome and every ``.txt``
+    file inside it as an article. Tomes and articles are visited in sorted
+    order. Plain files directly under ``path`` and non-``.txt`` files inside
+    a tome are skipped.
+
+    Args:
+        path: Root directory containing one sub-directory per tome.
+
+    Returns:
+        A DataFrame with columns ``id`` (tome name + file name without
+        extension), ``tome``, ``filename`` (without extension), ``content``
+        and ``nb_words``.
+    """
+    columns = ['id', 'tome', 'filename', 'content', 'nb_words']
+    rows = []
+
+    for tome in sorted(os.listdir(path)):
+        tome_dir = os.path.join(path, tome)
+        # Stray files at the root (e.g. ".DS_Store") are not tomes; listing
+        # them would raise NotADirectoryError.
+        if not os.path.isdir(tome_dir):
+            continue
+
+        for article in sorted(os.listdir(tome_dir)):
+            if not article.endswith(".txt"):
+                continue
+
+            filename = article[:-4]
+            # Explicit encoding so the result does not depend on the platform
+            # default. Assumes the corpus is UTF-8 -- TODO confirm.
+            with open(os.path.join(tome_dir, article), encoding="utf-8") as f:
+                content = f.read()
+
+            # nb_words counts the pieces separated by a single space character.
+            rows.append([tome + filename, tome, filename, content,
+                         len(content.split(' '))])
+
+    return pd.DataFrame(rows, columns=columns)
+
+
+if __name__ == '__main__':
+    # Script entry point: read the LGE text corpus, classify every article
+    # with a fine-tuned sequence-classification model, and write the
+    # predicted domains to CSV files.
+
+    # Pick the device: CUDA GPU first, then Apple MPS (macOS), then CPU.
+    # `torch.backends.mps` is looked up defensively in case the installed
+    # torch build does not provide it.
+    mps_backend = getattr(torch.backends, "mps", None)
+    if torch.cuda.is_available():
+        device = torch.device("cuda")
+        print('There are %d GPU(s) available.' % torch.cuda.device_count())
+        print('We will use the GPU:', torch.cuda.get_device_name(0))
+    elif mps_backend is not None and mps_backend.is_available() and mps_backend.is_built():
+        device = torch.device("mps")
+        print('We will use the GPU')
+    else:
+        device = torch.device("cpu")
+        print('No GPU available, using the CPU instead.')
+
+    #############
+    ## Load data
+    print("* Load data")
+
+    path = "/Users/lmoncla/Documents/Data/Corpus/LGE/Text/"
+
+    df_LGE = text_folder_to_dataframe(path)
+    data_LGE = df_LGE["content"].values
+
+    #############
+    ## Load model
+    print("* Load model")
+
+    model_name = "bert-base-multilingual-cased"
+    #model_name = "camembert-base"
+    model_path = path + "models/model_" + model_name + "_s10000.pt"
+
+    # Tokenizer and model class must belong to the same architecture.
+    if model_name == 'bert-base-multilingual-cased':
+        print('Loading Bert Tokenizer...')
+        tokenizer = BertTokenizer.from_pretrained(model_name)
+        model_class = BertForSequenceClassification
+    elif model_name == 'camembert-base':
+        print('Loading Camembert Tokenizer...')
+        tokenizer = CamembertTokenizer.from_pretrained(model_name)
+        model_class = CamembertForSequenceClassification
+    else:
+        # Fail early with a clear message instead of a NameError later on.
+        raise ValueError("Unsupported model name: " + model_name)
+
+    data_loader = generate_dataloader(tokenizer, data_LGE)
+
+    model = model_class.from_pretrained(model_path).to(device)
+
+    #############
+    ## Predict
+    print("* Predict")
+
+    pred = predict(model, data_loader, device)
+
+    # NOTE: pickle.load can execute arbitrary code contained in the file;
+    # only load a label encoder that comes from a trusted source.
+    encoder_filename = "models/label_encoder.pkl"
+    with open(path + encoder_filename, 'rb') as file:
+        encoder = pickle.load(file)
+
+    # Map the predicted class indices back to their domain labels.
+    df_LGE['domain'] = list(encoder.inverse_transform(pred))
+
+    #############
+    ## Save results
+    results_dir = path + "results_LGE/"
+    # Create the output directory up front so a long run is not lost at the
+    # very last step.
+    os.makedirs(results_dir, exist_ok=True)
+
+    # The separator must be a single character: "\," is an invalid escape
+    # that yields the two-character string backslash + comma.
+    filepath = results_dir + "metadata-withContent.csv"
+    print("* Save results: ", filepath)
+    df_LGE.to_csv(filepath, sep=",")
+
+    # Second export without the full article text.
+    df_LGE.drop(columns=['content'], inplace=True)
+    filepath = results_dir + "metadata.csv"
+    print("* Save results: ", filepath)
+    df_LGE.to_csv(filepath, sep=",")
+
+