import os
import pickle

import torch
from sklearn import preprocessing
from transformers import BertForSequenceClassification, BertTokenizer
def get_device():
    """Return the CUDA device if one is available, otherwise the CPU."""
    if torch.cuda.is_available():
        print('We will use the GPU:', torch.cuda.get_device_name(0))
        return torch.device("cuda")
    else:
        print('No GPU available, using the CPU instead.')
        return torch.device("cpu")
def loader(f):
    """Decorator that prints a pending marker while `f` runs, then overwrites
    it with a check mark once the step has finished."""
    def wrapped(*args, **kwargs):
        name = f.__name__.replace('_init_', '')
        print(f' - {name}', end='')
        f(*args, **kwargs)
        print(f'\r✔️ {name}')
    return wrapped
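The carriage return in the final print is what turns this into a one-line progress indicator: the pending line is rewritten in place rather than followed by a new one. A minimal sketch of the effect, using a hypothetical slow step:

import time

@loader
def _init_weights():
    time.sleep(1)  # stand-in for a slow loading step

_init_weights()
# While the call runs, the console shows " - weights";
# when it finishes, the line is rewritten as "✔️ weights".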
class BERT:
    model_name = 'bert-base-multilingual-cased'

    def __init__(self, root_path, train_on=None):
        """`train_on` is the list of distinct class labels when fine-tuning;
        leave it as None to load an already fine-tuned model from `root_path`."""
        self.device = get_device()
        print('Loading BERT tools')
        self._init_tokenizer()
        self.root_path = root_path
        self._init_classifier(train_on)
        self._init_encoder(train_on)

    @loader
    def _init_tokenizer(self):
        self.tokenizer = BertTokenizer.from_pretrained(BERT.model_name)

    @loader
    def _init_classifier(self, train_on):
        if train_on is not None:
            # Start from the pretrained multilingual cased checkpoint with a
            # fresh classification head sized to the label set.
            bert = BertForSequenceClassification.from_pretrained(
                BERT.model_name,
                num_labels=len(train_on),
                output_attentions=False,
                output_hidden_states=False,
            )
        else:
            # No labels given: load a previously fine-tuned model from disk.
            bert = BertForSequenceClassification.from_pretrained(self.root_path)
        self.model = bert.to(self.device)

    @loader
    def _init_encoder(self, train_on):
        path = f"{self.root_path}/label_encoder.pkl"
        if os.path.isfile(path):
            with open(path, 'rb') as pickled:
                self.encoder = pickle.load(pickled)
        elif train_on is not None:
            self.encoder = preprocessing.LabelEncoder()
            self.encoder.fit(train_on)
            # Make sure the output directory exists before pickling.
            os.makedirs(self.root_path, exist_ok=True)
            with open(path, 'wb') as file:
                pickle.dump(self.encoder, file)
        else:
            raise FileNotFoundError(path)

    def import_data(self, data):
        # Move every tensor in the batch to the model's device.
        return map(lambda d: d.to(self.device), data)

    def save(self):
        self.model.save_pretrained(self.root_path)
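Putting it together, here is a minimal usage sketch. The paths and label set are hypothetical, chosen only to show the two modes the constructor supports: fresh fine-tuning versus reloading a saved model.

# Hypothetical paths and labels — illustrative, not part of the class.
classes = ['positive', 'negative', 'neutral']

# Fresh fine-tuning setup: a new classification head sized to `classes`,
# plus a LabelEncoder fitted on them and pickled under root_path.
bert = BERT('./model', train_on=classes)

# ... training loop elided ...
# Batches of tensors can be moved to the model's device first:
# input_ids, mask, labels = bert.import_data(batch)

bert.save()  # persist the fine-tuned weights to root_path

# Later: reload the model weights and the pickled label encoder from disk.
bert = BERT('./model')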