Skip to content
Snippets Groups Projects
Commit 7799b94c authored by Alice Brenon's avatar Alice Brenon
Browse files

Add predictor for MultiBERT model, improve the trainer for the same model and...

Add predictor for MultiBERT model, improve the trainer for the same model and expose the predicted score in the model's output
parent 669b56e6
Branches main
No related tags found
No related merge requests found
...@@ -24,16 +24,14 @@ class Classifier(BERT): ...@@ -24,16 +24,14 @@ class Classifier(BERT):
self.pipe = TextClassificationPipeline( self.pipe = TextClassificationPipeline(
model=self.model, model=self.model,
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
return_all_scores=True, top_k=1,
device=self.device) device=self.device)
def __call__(self, text_generator): def __call__(self, text_generator):
tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512} tokenizer_kwargs = {'padding':True, 'truncation':True, 'max_length':512}
predictions = [] labels, scores = [], []
for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)): for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True) byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
predictions.append([int(byScoreDesc[0]['label'][6:]), labels.append(int(byScoreDesc[0]['label'][6:]))
byScoreDesc[0]['score'], scores.append(byScoreDesc[0]['score'])
int(byScoreDesc[1]['label'][6:])]) return self.encoder.inverse_transform(labels), numpy.array(scores)
return self.encoder.inverse_transform(
numpy.array(predictions)[:,0].astype(int))
from GEODE import fromKey, relativePath
import pandas import pandas
from os import makedirs from os import makedirs
from os.path import dirname, isdir from os.path import dirname, isdir
...@@ -103,34 +104,26 @@ class Directory(TSVIndexed): ...@@ -103,34 +104,26 @@ class Directory(TSVIndexed):
A class to handle the normalised path used in the project and loading the A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed actual text input as a generator from records when they are needed
""" """
def __init__(self, root_path, column_name='content'): def __init__(self, root_path, tsv_filename="files", column_name='content'):
""" """
Positional arguments Positional arguments
:param root_path: the path to a GÉODE-style folder containing the text :param root_path: the path to a GÉODE-style folder containing the text
version of the corpus on which to predict the classes version of the corpus on which to predict the classes
""" """
self.text_path = f"{root_path}/Text" self.text_path = f"{root_path}/Text"
TSVIndexed.__init__(self, f"{root_path}/files.tsv", column_name) TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name)
def path_to(self, primary_key): def path_to(self, primary_key):
record = self.dict_primary_key(primary_key) record = self.dict_primary_key(primary_key)
article_relative_path = "{work}/T{volume}/{article}".format(**record) return f"{self.text_path}/{relativePath(record, 'txt')}"
prefix = f"{self.text_path}/{article_relative_path}"
if 'paragraph' in record:
return f"{prefix}/{record['paragraph']}.txt"
else:
return f"{prefix}.txt"
def dict_primary_key(self, primary_key): def dict_primary_key(self, primary_key):
if type(primary_key) == pandas.core.series.Series: if type(primary_key) == pandas.core.series.Series:
return dict(primary_key) return dict(primary_key)
elif type(primary_key) != dict: elif type(primary_key) == dict:
keys = self.default_keys.copy()
if len(primary_key) == 4:
keys.append('paragraph')
return dict(zip(keys, primary_key))
else:
return primary_key return primary_key
else:
return fromKey(primary_key)
def get_text(self, primary_key): def get_text(self, primary_key):
with open(self.path_to(primary_key), 'r') as file: with open(self.path_to(primary_key), 'r') as file:
......
#!/usr/bin/env python3
from BERT import Classifier
from Corpus import corpus
import GEODE.discursive as discursive
import pandas
from sys import argv
def rateClass(name, answer, score):
    """Return a sortable rank for one candidate class.

    Accepted classes always outrank rejected ones; among accepted classes a
    higher score wins, among rejected ones the least-confident rejection
    (score closest to zero) wins.  The *name* argument is ignored here but
    kept so the tuple produced in combine() can be splatted directly.
    """
    if answer == 'accept':
        return (1, score)
    return (0, -score)
def combine(row):
    """Pick a single label for *row* from the per-class predictions.

    Each discursive function contributes an answer column (row[name]) and a
    confidence column (row[name + 'Score']); the winner under the
    rateClass() ordering — the highest-scored acceptance, or failing any
    acceptance, the weakest rejection — gives the final label.
    """
    return max(discursive.functions,
               key=lambda name: rateClass(name, row[name], row[name + 'Score']))
def label(modelsRoot, source):
    """Predict every discursive function over *source* and combine them.

    :param modelsRoot: directory containing one trained sub-model per
        discursive function (loaded as f"{modelsRoot}/{name}")
    :param source: corpus object exposing get_all('key') and
        get_all('content')
    :return: a DataFrame with the corpus keys, one answer column and one
        score column per class, plus a final combined 'label' column
    """
    table = pandas.DataFrame(source.get_all('key'))
    for className in discursive.functions:
        subClassifier = Classifier(f"{modelsRoot}/{className}")
        # a fresh content generator is needed for each pass over the corpus
        answers, confidences = subClassifier(source.get_all('content'))
        table[className] = answers
        table[className + 'Score'] = confidences
    table['label'] = table.apply(combine, axis=1)
    return table
if __name__ == '__main__':
    # CLI entry point: argv[1] = models root directory, argv[2] = corpus
    # location (presumably a path — passed straight to corpus(); verify
    # against Corpus module), argv[3] = output TSV file.
    label(argv[1], corpus(argv[2])).to_csv(argv[3], sep='\t', index=False)
...@@ -22,7 +22,7 @@ def label(classify, source, name='label'): ...@@ -22,7 +22,7 @@ def label(classify, source, name='label'):
an additional column an additional column
""" """
records = pandas.DataFrame(source.get_all('key')) records = pandas.DataFrame(source.get_all('key'))
records[name] = classify(source.get_all('content')) records[name], records['score'] = classify(source.get_all('content'))
return records return records
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -22,7 +22,7 @@ def initialiseTexts(texts, key, annotation): ...@@ -22,7 +22,7 @@ def initialiseTexts(texts, key, annotation):
{'accept': None, {'accept': None,
'reject': set(), 'reject': set(),
'row': dict(**annotation['meta'], 'row': dict(**annotation['meta'],
content=annotation['text'])}) content=annotation['text'].strip()+'\n')})
def byLabel(annotations): def byLabel(annotations):
labels = {} labels = {}
......
...@@ -6,24 +6,18 @@ from LabeledData import LabeledData ...@@ -6,24 +6,18 @@ from LabeledData import LabeledData
import os import os
import sys import sys
def split(columnName):
return {}
def load(rootPath):
classes = {}
for f in os.listdir(rootPath):
if f[-4:] == '.tsv':
classes[f[:-4]] = f"{rootPath}/{f}"
return classes
def trainSubClassifier(trainRoot, modelRoot, className): def trainSubClassifier(trainRoot, modelRoot, className):
trainData = Directory(trainRoot, tsv_filename=className) trainData = Directory(trainRoot, tsv_filename=className)
labeled_data = LabeledData(trainData, "answer") labeled_data = LabeledData(trainData, "answer")
subModelPath = f"{modelRoot}/{className}" subModelPath = f"{modelRoot}/{className}"
os.makedirs(subModelPath, exist_ok=True) os.makedirs(subModelPath, exist_ok=True)
os.symlink(f"../{BERT.encoder_file}", f"{subModelPath}/{BERT.encoder_file}") encoderLink = f"{subModelPath}/{BERT.encoder_file}"
trainer = Trainer(subModelPath, labeled_data) if not os.path.islink(encoderLink):
trainer() os.symlink(f"../{BERT.encoder_file}", encoderLink)
if not os.path.exists(f"{subModelPath}/pytorch_model.bin"):
trainer = Trainer(subModelPath, labeled_data)
print(f"Training {className} to {subModelPath}")
trainer()
if __name__ == '__main__': if __name__ == '__main__':
for className in discursive.functions: for className in discursive.functions:
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment