predict.py

#!/usr/bin/env python3
import numpy
import pandas
import pickle
import sklearn
from sys import argv
import torch
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer, TextClassificationPipeline

class Classifier:
    """
    A class wrapping all the different models and classes used throughout a
    classification task:

        - tokenizer
        - classifier
        - pipeline
        - label encoder

    Once created, it behaves as a function which you apply to a generator
    containing the texts to classify
    """
    def __init__(self, root_path):
        """
        Positional arguments
        :param root_path: the path to a folder containing the fine-tuned model
        (loaded with `from_pretrained`) and a `label_encoder.pkl` file
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Order matters: the pipeline needs both the tokenizer and the model.
        self._init_tokenizer()
        self._init_model(root_path)
        self._init_pipe()
        self._init_encoder(f"{root_path}/label_encoder.pkl")
        self.log()

    def _init_model(self, path):
        """Load the sequence classification model from `path` onto the device"""
        bert = BertForSequenceClassification.from_pretrained(path)
        self.model = bert.to(self.device.type)

    def _init_tokenizer(self):
        """Load the tokenizer (the model name is fixed, not read from disk)"""
        model_name = 'bert-base-multilingual-cased'
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def _init_pipe(self):
        """Assemble the model and the tokenizer into a classification pipeline"""
        # return_all_scores=True: __call__ expects, for each text, one
        # {'label', 'score'} dict per class.
        # NOTE(review): recent transformers releases appear to deprecate this
        # flag in favour of `top_k=None` — confirm before upgrading.
        self.pipe = TextClassificationPipeline(
            model=self.model,
            tokenizer=self.tokenizer,
            return_all_scores=True,
            device=self.device)

    def _init_encoder(self, path):
        """Load the pickled label encoder stored at `path`"""
        # SECURITY: unpickling can execute arbitrary code; only use model
        # folders coming from a trusted source.
        with open(path, 'rb') as pickled:
            self.encoder = pickle.load(pickled)

    def log(self):
        """Print which device (CPU or GPU) is going to be used"""
        if self.device.type == 'cpu':
            print('No GPU available, using the CPU instead.')
        else:
            print('We will use the GPU:', torch.cuda.get_device_name(0))

    @staticmethod
    def _top_label_id(scores):
        """
        Extract the numeric id of the best-scoring class for one text

        Positional arguments
        :param scores: a non-empty list of dicts with a 'label' and a 'score'
        key, one per class

        :return: the integer id of the class with the highest score (the first
        one listed in case of a tie)
        """
        best = max(scores, key=lambda d: d['score'])
        # Drop the first 6 characters of the label to keep only the number —
        # presumably labels look like 'LABEL_<n>'; verify against the model
        # configuration.
        return int(best['label'][6:])

    def __call__(self, text_generator):
        """
        Predict the class of each text

        Positional arguments
        :param text_generator: an iterable yielding the texts to classify

        :return: the list of predicted classes (decoded by the label encoder),
        one per input text, in the same order; an empty list if there were no
        texts to classify
        """
        tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
        outputs = tqdm(self.pipe(text_generator, **tokenizer_kwargs))
        label_ids = [self._top_label_id(output) for output in outputs]
        if not label_ids:
            # Nothing to decode: avoid handing an empty array to the encoder.
            return []
        return list(self.encoder.inverse_transform(numpy.array(label_ids, dtype=int)))

class Source:
    """
    A class to handle the normalised path used in the project and loading the
    actual text input as a generator from records when they are needed
    """
    def __init__(self, root_path):
        """
        Positional arguments
        :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus on which to predict the classes
        """
        self.root_path = root_path

    def path_to(self, record):
        """
        Compute the path to the text file of a record

        Positional arguments
        :param record: a mapping (a dict or a row of a pandas dataframe) with
        the keys 'work', 'volume' and 'article', and optionally 'paragraph'

        :return: the path to the paragraph file if the record has a
        'paragraph' key, the path to the article file otherwise
        """
        article_relative_path = "{work}/T{volume}/{article}".format(**record)
        prefix = f"{self.root_path}/{article_relative_path}"
        if 'paragraph' in record:
            # Key lookup (not attribute access) so that plain dicts work as
            # well as pandas rows, like the rest of this method.
            return f"{prefix}/{record['paragraph']}.txt"
        return f"{prefix}.txt"

    def load_text(self, record):
        """
        Read the whole text of a record

        Positional arguments
        :param record: a record as accepted by `path_to`

        :return: the content of the corresponding file as a string
        """
        # NOTE(review): the file is decoded with the platform default
        # encoding — confirm this matches the encoding of the corpus.
        with open(self.path_to(record), 'r') as file:
            return file.read()

    def iterate(self, records):
        """
        Lazily load the texts of several records

        Positional arguments
        :param records: an object with an `iterrows` method (such as a pandas
        dataframe) yielding (index, record) pairs

        :return: a generator yielding the text of each record, in order
        """
        for _, record in records.iterrows():
            yield self.load_text(record)

def label(classify, source, tsv_path, name='label'):
    """
    Predict a class for every document listed in a TSV file

    Positional arguments
    :param classify: a callable (typically a Classifier instance) which takes
    an iterable of texts and returns one prediction per text
    :param source: an object (typically a Source instance) whose `iterate`
    method yields the text of each record
    :param tsv_path: the path to a tab-separated file listing the article or
    paragraph records to classify

    Keyword arguments
    :param name: defaults to 'label' — the name of the column receiving the
    predictions, i.e. the name of the category predicted by the model (for a
    model answering "Red", "Green" or "Blue", `name='color'` would be a
    sensible choice).

    :return: a pandas dataframe with the content of the input TSV file plus
    the column of predictions
    """
    table = pandas.read_csv(tsv_path, sep='\t')
    # The texts are only read from disk as the classifier consumes them.
    texts = source.iterate(table)
    table[name] = classify(texts)
    return table

if __name__ == '__main__':
    # Command line: MODEL_DIR CORPUS_DIR INPUT_TSV OUTPUT_TSV
    #   MODEL_DIR   folder with the trained model and its label encoder
    #   CORPUS_DIR  root folder of the text version of the corpus
    #   INPUT_TSV   records to classify
    #   OUTPUT_TSV  where to write the records with their predicted label
    if len(argv) < 5:
        # Fail with a readable message instead of an IndexError.
        raise SystemExit(
            f"Usage: {argv[0]} MODEL_DIR CORPUS_DIR INPUT_TSV OUTPUT_TSV")
    classify = Classifier(argv[1])
    source = Source(argv[2])
    label(classify, source, argv[3]).to_csv(argv[4], sep='\t', index=False)