From 7799b94cc843a88045840b6c797e6df99e43a46a Mon Sep 17 00:00:00 2001 From: Alice BRENON <alice.brenon@ens-lyon.fr> Date: Mon, 20 Nov 2023 18:38:52 +0100 Subject: [PATCH] Add predictor for MultiBERT model, improve the trainer for the same model and expose the predicted score in the model's output --- scripts/ML/BERT/Classifier.py | 12 ++++------ scripts/ML/Corpus.py | 21 ++++++----------- scripts/ML/predictMulti.py | 26 +++++++++++++++++++++ scripts/ML/{predict.py => predictSimple.py} | 2 +- scripts/ML/prodigy-multi-jsonl-to-tsv.py | 2 +- scripts/ML/trainMultiBERT.py | 20 ++++++---------- 6 files changed, 47 insertions(+), 36 deletions(-) create mode 100755 scripts/ML/predictMulti.py rename scripts/ML/{predict.py => predictSimple.py} (92%) diff --git a/scripts/ML/BERT/Classifier.py b/scripts/ML/BERT/Classifier.py index 04bcffa..d5c1b50 100644 --- a/scripts/ML/BERT/Classifier.py +++ b/scripts/ML/BERT/Classifier.py @@ -24,16 +24,14 @@ class Classifier(BERT): self.pipe = TextClassificationPipeline( model=self.model, tokenizer=self.tokenizer, - return_all_scores=True, + top_k=1, device=self.device) def __call__(self, text_generator): tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512} - predictions = [] + labels, scores = [], [] for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)): byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True) - predictions.append([int(byScoreDesc[0]['label'][6:]), - byScoreDesc[0]['score'], - int(byScoreDesc[1]['label'][6:])]) - return self.encoder.inverse_transform( - numpy.array(predictions)[:,0].astype(int)) + labels.append(int(byScoreDesc[0]['label'][6:])) + scores.append(byScoreDesc[0]['score']) + return self.encoder.inverse_transform(labels), numpy.array(scores) diff --git a/scripts/ML/Corpus.py b/scripts/ML/Corpus.py index 159092b..00d5d9f 100644 --- a/scripts/ML/Corpus.py +++ b/scripts/ML/Corpus.py @@ -1,3 +1,4 @@ +from GEODE import fromKey, relativePath import pandas from os import makedirs from os.path import dirname, isdir @@ -103,34 +104,26 @@ class Directory(TSVIndexed): A class to handle the normalised path used in the project and loading the actual text input as a generator from records when they are needed """ - def __init__(self, root_path, column_name='content'): + def __init__(self, root_path, tsv_filename="files", column_name='content'): """ Positional arguments :param root_path: the path to a GÉODE-style folder containing the text version of the corpus on which to predict the classes """ self.text_path = f"{root_path}/Text" - TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name) + TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name) def path_to(self, primary_key): record = self.dict_primary_key(primary_key) - article_relative_path = "{work}/T{volume}/{article}".format(**record) - prefix = f"{self.text_path}/{article_relative_path}" - if 'paragraph' in record: - return f"{prefix}/{record['paragraph']}.txt" - else: - return f"{prefix}.txt" + return f"{self.text_path}/{relativePath(record, 'txt')}" def dict_primary_key(self, primary_key): if type(primary_key) == pandas.core.series.Series: return dict(primary_key) - elif type(primary_key) != dict: - keys = self.default_keys.copy() - if len(primary_key) == 4: - keys.append('paragraph') - return dict(zip(keys, primary_key)) - else: + elif type(primary_key) == dict: return primary_key + else: + return fromKey(primary_key) def get_text(self, primary_key): with open(self.path_to(primary_key), 'r') as file: diff --git a/scripts/ML/predictMulti.py b/scripts/ML/predictMulti.py new file mode 100755 index 0000000..80d7eaa --- /dev/null +++ b/scripts/ML/predictMulti.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 +from BERT import Classifier +from Corpus import corpus +import GEODE.discursive as discursive +import pandas +from sys import argv + +def rateClass(name, answer, score): + return (1, score) if answer == 'accept' else (0, -score) + +def combine(row): + classes = [(name, row[name], row[name + 'Score']) + for name in discursive.functions] + return max(classes, key=lambda c: rateClass(*c))[0] + +def label(modelsRoot, source): + records = pandas.DataFrame(source.get_all('key')) + for name in discursive.functions: + classify = Classifier(f"{modelsRoot}/{name}") + content = source.get_all('content') + records[name], records[name + 'Score'] = classify(content) + records['label'] = records.apply(combine, axis=1) + return records + +if __name__ == '__main__': + label(argv[1], corpus(argv[2])).to_csv(argv[3], sep='\t', index=False) diff --git a/scripts/ML/predict.py b/scripts/ML/predictSimple.py similarity index 92% rename from scripts/ML/predict.py rename to scripts/ML/predictSimple.py index 2dfa3e8..5ba28ad 100755 --- a/scripts/ML/predict.py +++ b/scripts/ML/predictSimple.py @@ -22,7 +22,7 @@ def label(classify, source, name='label'): an additional column """ records = pandas.DataFrame(source.get_all('key')) - records[name] = classify(source.get_all('content')) + records[name], records['score'] = classify(source.get_all('content')) return records if __name__ == '__main__': diff --git a/scripts/ML/prodigy-multi-jsonl-to-tsv.py b/scripts/ML/prodigy-multi-jsonl-to-tsv.py index c4b3c17..d555e74 100755 --- a/scripts/ML/prodigy-multi-jsonl-to-tsv.py +++ b/scripts/ML/prodigy-multi-jsonl-to-tsv.py @@ -22,7 +22,7 @@ def initialiseTexts(texts, key, annotation): {'accept': None, 'reject': set(), 'row': dict(**annotation['meta'], - content=annotation['text'])}) + content=annotation['text'].strip()+'\n')}) def byLabel(annotations): labels = {} diff --git a/scripts/ML/trainMultiBERT.py b/scripts/ML/trainMultiBERT.py index 637ad29..43cc10b 100755 --- a/scripts/ML/trainMultiBERT.py +++ b/scripts/ML/trainMultiBERT.py @@ -6,24 +6,18 @@ from LabeledData import LabeledData import os import sys -def split(columnName): - return {} - -def load(rootPath): - classes = {} - for f in os.listdir(rootPath): - if f[-4:] == '.tsv': - classes[f[:-4]] = f"{rootPath}/{f}" - return classes - def trainSubClassifier(trainRoot, modelRoot, className): trainData = Directory(trainRoot, tsv_filename=className) labeled_data = LabeledData(trainData, "answer") subModelPath = f"{modelRoot}/{className}" os.makedirs(subModelPath, exist_ok=True) - os.symlink(f"../{BERT.encoder_file}", f"{subModelPath}/{BERT.encoder_file}") - trainer = Trainer(subModelPath, labeled_data) - trainer() + encoderLink = f"{subModelPath}/{BERT.encoder_file}" + if not os.path.islink(encoderLink): + os.symlink(f"../{BERT.encoder_file}", encoderLink) + if not os.path.exists(f"{subModelPath}/pytorch_model.bin"): + trainer = Trainer(subModelPath, labeled_data) + print(f"Training {className} to {subModelPath}") + trainer() if __name__ == '__main__': for className in discursive.functions: -- GitLab