Create utils.py

85b9486f · Ludovic Moncla · b8b339ab · 85b9486f
Commit 85b9486f authored 2 years ago by Ludovic Moncla
--- a/scripts/utils.py
+++ b/scripts/utils.py
+import numpy as np
+import torch
+from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
+from tqdm import tqdm
+import os
+import pandas as pd 
+
+
def generate_dataloader(tokenizer, sentences, batch_size=8, max_len=512):
    """Build a sequential ``DataLoader`` of padded token ids and attention masks.

    Each sentence is encoded with ``tokenizer.encode(sent,
    add_special_tokens=True)``, then cut or right-padded with ``0`` so that
    every row has exactly ``max_len`` ids.

    Args:
        tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            and returning a list of integer token ids.
        sentences: Iterable of texts to encode. May be empty.
        batch_size: Number of rows per batch.
        max_len: Fixed row length after truncation/padding.

    Returns:
        A ``DataLoader`` over a ``TensorDataset(inputs, masks)`` iterated in
        the original sentence order. ``inputs`` is an int64 tensor of shape
        ``(len(sentences), max_len)``; ``masks`` is a float32 tensor of the
        same shape.
    """
    # Truncation and padding are done by hand (rather than by the tokenizer)
    # so that every row ends up with the same fixed length.
    # NOTE(review): truncation simply drops the tail, so a sentence longer
    # than ``max_len`` loses whatever ``encode`` appended at the end.
    rows = []
    for sent in sentences:
        ids = list(tokenizer.encode(sent, add_special_tokens=True)[:max_len])
        ids.extend([0] * (max_len - len(ids)))
        rows.append(ids)

    if rows:
        input_ids = np.array(rows, dtype=np.int64)
    else:
        # Keep a well-formed 2-D integer array even when there is no input.
        input_ids = np.empty((0, max_len), dtype=np.int64)

    # 1.0 for every id greater than 0, 0.0 elsewhere.
    # NOTE(review): this treats id 0 as padding; confirm that holds for the
    # tokenizer in use.
    attention_masks = (input_ids > 0).astype(np.float32)

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    data = TensorDataset(inputs, masks)
    # Sequential sampling keeps predictions aligned with ``sentences``.
    prediction_sampler = SequentialSampler(data)

    return DataLoader(data, sampler=prediction_sampler, batch_size=batch_size)
+
+
def predict(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return the predicted class indices.

    Args:
        model: Callable model exposing ``eval()``; it is called as
            ``model(input_ids, token_type_ids=None, attention_mask=mask)``
            and its first output (``outputs[0]``) is taken as the logits.
        dataloader: Iterable of ``(input_ids, attention_mask)`` tensor pairs.
        device: Device the tensors of each batch are moved to before the
            forward pass.

    Returns:
        A flat list with one predicted class index per input row, in
        dataloader order. The list is empty when the dataloader yields no
        batch.
    """
    # Put the model in evaluation mode.
    model.eval()

    pred_labels = []
    for batch in dataloader:
        # Move the batch to the target device and unpack it.
        b_input_ids, b_input_mask = (t.to(device) for t in batch)

        # No gradients are needed for inference; skipping them saves memory
        # and time.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Bring the logits back to the CPU as a NumPy array.
        logits = outputs[0].detach().cpu().numpy()

        # One row per example, one column per class: keep the index of the
        # highest-scoring class. Each batch is reduced exactly once.
        pred_labels.extend(np.argmax(logits, axis=1).flatten())

    return pred_labels
+
+
def text_folder_to_dataframe(path):
    """Load every ``.txt`` file found one level below ``path`` into a DataFrame.

    ``path`` is expected to contain one sub-directory per tome, each holding
    the article files. Entries of ``path`` that are not directories are
    skipped, as are files whose name does not end in ``.txt``. Tomes and
    articles are visited in sorted name order, with a ``tqdm`` progress bar
    per tome.

    Args:
        path: Directory containing the tome sub-directories.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id`` (tome name followed by
        the file name without extension), ``tome``, ``filename`` (without
        extension), ``content`` and ``nb_words``.
    """
    rows = []

    for tome in sorted(os.listdir(path)):
        tome_dir = os.path.join(path, tome)
        # Only the directory listing can raise NotADirectoryError, so the
        # ``try`` is limited to it.
        try:
            articles = sorted(os.listdir(tome_dir))
        except NotADirectoryError:
            continue

        for article in tqdm(articles):
            if article[-4:] != ".txt":
                continue

            filename = article[:-4]
            # Read as UTF-8 explicitly so the result does not depend on the
            # machine's locale.
            with open(os.path.join(tome_dir, article), encoding="utf-8") as f:
                content = f.read()

            # nb_words counts the pieces obtained by splitting on single
            # spaces only (newlines and tabs do not separate words here).
            rows.append([tome + filename, tome, filename, content,
                         len(content.split(' '))])

    return pd.DataFrame(rows, columns=['id', 'tome', 'filename', 'content', 'nb_words'])