Skip to content
Snippets Groups Projects
Commit 7799b94c authored by Alice Brenon's avatar Alice Brenon
Browse files

Add predictor for MultiBERT model, improve the trainer for the same model and...

Add predictor for MultiBERT model, improve the trainer for the same model and expose the predicted score in the model's output
parent 669b56e6
No related branches found
No related tags found
No related merge requests found
......@@ -24,16 +24,14 @@ class Classifier(BERT):
self.pipe = TextClassificationPipeline(
model=self.model,
tokenizer=self.tokenizer,
return_all_scores=True,
top_k=1,
device=self.device)
def __call__(self, text_generator):
    """Classify a stream of texts with the underlying HuggingFace pipeline.

    :param text_generator: iterable of input strings fed to self.pipe
    :return: a pair (labels, scores) — labels is the encoder's
        inverse-transform of the predicted class indices, scores is a
        numpy array with the confidence of each prediction
    """
    # Cap inputs at 512 tokens — the usual BERT sequence-length limit.
    tokenizer_kwargs = {'padding': True, 'truncation': True, 'max_length': 512}
    labels, scores = [], []
    for output in tqdm(self.pipe(text_generator, **tokenizer_kwargs)):
        # Keep only the best-scored candidate for each input.
        byScoreDesc = sorted(output, key=lambda d: d['score'], reverse=True)
        # Pipeline labels look like 'LABEL_<n>'; strip the 6-char prefix
        # to recover the integer class index.
        labels.append(int(byScoreDesc[0]['label'][6:]))
        scores.append(byScoreDesc[0]['score'])
    return self.encoder.inverse_transform(labels), numpy.array(scores)
from GEODE import fromKey, relativePath
import pandas
from os import makedirs
from os.path import dirname, isdir
......@@ -103,34 +104,26 @@ class Directory(TSVIndexed):
A class to handle the normalised path used in the project and loading the
actual text input as a generator from records when they are needed
"""
def __init__(self, root_path, tsv_filename="files", column_name='content'):
    """
    Positional arguments
    :param root_path: the path to a GÉODE-style folder containing the text
        version of the corpus on which to predict the classes

    Keyword arguments
    :param tsv_filename: basename (without the .tsv extension) of the index
        file inside root_path (defaults to "files")
    :param column_name: name of the column holding the text content
    """
    self.text_path = f"{root_path}/Text"
    TSVIndexed.__init__(self, f"{root_path}/{tsv_filename}.tsv", column_name)
def path_to(self, primary_key):
    """Return the path of the text file identified by primary_key.

    The key is first normalised to a dict record, then mapped to a
    relative path by GEODE.relativePath with the 'txt' extension.
    """
    record = self.dict_primary_key(primary_key)
    return f"{self.text_path}/{relativePath(record, 'txt')}"
def dict_primary_key(self, primary_key):
    """Normalise a primary key to a plain dict record.

    Accepts a pandas Series (converted with dict()), an already-built
    dict (returned as-is), or any other key form, which is delegated to
    GEODE.fromKey.
    """
    if type(primary_key) == pandas.core.series.Series:
        return dict(primary_key)
    elif type(primary_key) == dict:
        return primary_key
    else:
        return fromKey(primary_key)
def get_text(self, primary_key):
with open(self.path_to(primary_key), 'r') as file:
......
#!/usr/bin/env python3
from BERT import Classifier
from Corpus import corpus
import GEODE.discursive as discursive
import pandas
from sys import argv
def rateClass(name, answer, score):
    """Rank a (class, answer, score) triple for comparison: accepted
    classes sort above rejected ones, and among rejections a lower
    confidence ranks higher (hence the negated score)."""
    if answer == 'accept':
        return (1, score)
    return (0, -score)
def combine(row):
    """Pick the best label for a row that holds, for every discursive
    function, an answer column <name> and a confidence column <name>Score.
    Ties keep the first function in discursive.functions order."""
    bestName, bestRank = None, None
    for name in discursive.functions:
        rank = rateClass(name, row[name], row[name + 'Score'])
        if bestRank is None or rank > bestRank:
            bestName, bestRank = name, rank
    return bestName
def label(modelsRoot, source):
    """Run every per-class classifier over the corpus and combine the
    answers into a single 'label' column.

    :param modelsRoot: directory containing one trained model per
        discursive function
    :param source: corpus object exposing get_all(column) generators
    :return: a pandas DataFrame with the key columns, one answer and one
        score column per function, and the combined 'label' column
    """
    records = pandas.DataFrame(source.get_all('key'))
    for name in discursive.functions:
        classifier = Classifier(f"{modelsRoot}/{name}")
        answers, scores = classifier(source.get_all('content'))
        records[name] = answers
        records[name + 'Score'] = scores
    records['label'] = records.apply(combine, axis=1)
    return records
if __name__ == '__main__':
    # Usage: <modelsRoot> <corpusPath> <outputTSV>
    records = label(argv[1], corpus(argv[2]))
    records.to_csv(argv[3], sep='\t', index=False)
......@@ -22,7 +22,7 @@ def label(classify, source, name='label'):
an additional column
"""
records = pandas.DataFrame(source.get_all('key'))
records[name] = classify(source.get_all('content'))
records[name], records['score'] = classify(source.get_all('content'))
return records
if __name__ == '__main__':
......
......@@ -22,7 +22,7 @@ def initialiseTexts(texts, key, annotation):
{'accept': None,
'reject': set(),
'row': dict(**annotation['meta'],
content=annotation['text'])})
content=annotation['text'].strip()+'\n')})
def byLabel(annotations):
labels = {}
......
......@@ -6,24 +6,18 @@ from LabeledData import LabeledData
import os
import sys
def split(columnName):
    """Placeholder: no per-column split configuration is defined yet, so
    every column gets an empty mapping."""
    return dict()
def load(rootPath):
    """Index the TSV files found directly under rootPath.

    :param rootPath: directory to scan (non-recursively)
    :return: dict mapping each file's basename (without the .tsv
        extension) to its full path
    """
    return {
        entry[:-4]: f"{rootPath}/{entry}"
        for entry in os.listdir(rootPath)
        if entry[-4:] == '.tsv'
    }
def trainSubClassifier(trainRoot, modelRoot, className):
    """Train (if needed) the binary sub-classifier for one discursive class.

    :param trainRoot: root of the GÉODE-style training corpus; the TSV
        index named after className selects the training rows
    :param modelRoot: directory under which each sub-model is stored
    :param className: name of the discursive function to train
    """
    trainData = Directory(trainRoot, tsv_filename=className)
    labeled_data = LabeledData(trainData, "answer")
    subModelPath = f"{modelRoot}/{className}"
    os.makedirs(subModelPath, exist_ok=True)
    # Share the label encoder across sub-models through a relative symlink,
    # created only once so that re-runs don't fail on an existing link.
    encoderLink = f"{subModelPath}/{BERT.encoder_file}"
    if not os.path.islink(encoderLink):
        os.symlink(f"../{BERT.encoder_file}", encoderLink)
    # Skip training when a trained model already exists (idempotent runs).
    if not os.path.exists(f"{subModelPath}/pytorch_model.bin"):
        trainer = Trainer(subModelPath, labeled_data)
        print(f"Training {className} to {subModelPath}")
        trainer()
if __name__ == '__main__':
for className in discursive.functions:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.