import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
# Token vocabulary for the 20 standard residues plus two modifications
# ('CaC' and 'OxM', presumably carbamidomethylated Cys and oxidised Met);
# '_' is the padding token.
ALPHABET_UNMOD = {
    "_": 0,
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "CaC": 21,
    "OxM": 22,
}
# IUPAC vocabulary with special tokens (<mask>, <cls>, <sep>, <unk>) and the
# ambiguous/rare residue codes B, O, U, X and Z. Defined here but not currently
# applied by alphabetical_to_numerical below, which always uses ALPHABET_UNMOD.
IUPAC_VOCAB = {
    "_": 0,
    "<mask>": 1,
    "<cls>": 2,
    "<sep>": 3,
    "<unk>": 4,
    "A": 5,
    "B": 6,
    "C": 7,
    "D": 8,
    "E": 9,
    "F": 10,
    "G": 11,
    "H": 12,
    "I": 13,
    "K": 14,
    "L": 15,
    "M": 16,
    "N": 17,
    "O": 18,
    "P": 19,
    "Q": 20,
    "R": 21,
    "S": 22,
    "T": 23,
    "U": 24,
    "V": 25,
    "W": 26,
    "X": 27,
    "Y": 28,
    "Z": 29,
}
ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}
def padding(dataframe, column, length):
    """Pad sequences in `column` with '_' up to `length` residues, in place.

    Each modification tag '-XXX-' adds 4 characters beyond the residue it
    encodes, hence the 2 * count('-') correction to the raw string length.
    Sequences whose effective length exceeds `length` are dropped.
    """
    def effective_len(x):
        return len(x) - 2 * x.count('-')

    def pad(x):
        return x + (length - effective_len(x)) * '_'

    # DataFrame.drop returns a copy unless inplace=True, so the original
    # row-by-row drop loops had no effect; filter once instead.
    too_long = dataframe[column].map(effective_len) > length
    dataframe.drop(dataframe.index[too_long], inplace=True)
    dataframe.reset_index(drop=True, inplace=True)
    dataframe[column] = dataframe[column].map(pad)
def alphabetical_to_numerical(seq, vocab='unmod'):
    """Convert a (possibly modified) sequence string into integer tokens.

    Modified residues are written as '-XXX-' tags (e.g. 'A-CaC-DEF'); each
    tag spans 4 extra characters, which is why the loop length is corrected
    by 2 * seq.count('-') and the offset advances by 4 per tag consumed.
    """
    # NOTE: `vocab` is accepted for API compatibility, but only ALPHABET_UNMOD
    # is applied; IUPAC_VOCAB defines no tokens for the supported modifications.
    num = []
    offset = 0  # extra characters already consumed by modification tags
    for i in range(len(seq) - 2 * seq.count('-')):
        char = seq[i + offset]
        if char != '-':
            num.append(ALPHABET_UNMOD[char])
        else:
            mod = seq[i + offset + 1:i + offset + 4]
            if mod in ('CaC', 'OxM'):
                num.append(ALPHABET_UNMOD[mod])
            else:
                raise ValueError('Modification not supported: ' + mod)
            offset += 4  # skip past 'XXX-' to the next residue
    return np.array(num)
def numerical_to_alphabetical(arr):
    """Decode integer tokens back to a sequence string. Modification tokens
    decode to their bare names (21 -> 'CaC'), so this is not a strict inverse
    of alphabetical_to_numerical for modified sequences."""
    return ''.join(ALPHABET_UNMOD_REV[token] for token in arr)
def zero_to_minus(arr):
    """Replace (near-)zero intensity values with -1, in place."""
    arr[arr <= 0.00001] = -1.
    return arr
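# Minimal round-trip sketch (illustrative values only, not from any real
# dataset): encodes a padded, modified sequence and decodes it back. Note that
# decoding flattens the '-CaC-' tag to 'CaC', so the strings differ in form.
def _encoding_roundtrip_example():
    seq = 'A-CaC-DEF___'  # effective length 8: five residues plus padding
    num = alphabetical_to_numerical(seq, 'unmod')
    assert num.tolist() == [1, 21, 3, 4, 5, 0, 0, 0]
    assert numerical_to_alphabetical(num) == 'ACaCDEF___'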
class Common_Dataset(Dataset):
    """Dataset over a DataFrame with 'Sequence', 'Retention time', 'Spectra'
    and 'Charge' columns (plus a 'file' column when file=True)."""

    def __init__(self, dataframe, length, pad=True, convert=True, vocab='unmod', file=False):
        print('Data loader initialisation')
        self.data = dataframe.reset_index(drop=True)
        self.file_mode = file  # must be stored; __getitem__ reads it below
        if pad:
            print('Padding')
            padding(self.data, 'Sequence', length)
        if convert:
            print('Converting')
            self.data['Sequence'] = self.data['Sequence'].map(lambda x: alphabetical_to_numerical(x, vocab))
            self.data['Spectra'] = self.data['Spectra'].map(zero_to_minus)

    def __getitem__(self, index: int):
        seq = self.data['Sequence'][index]
        rt = self.data['Retention time'][index]
        intensity = self.data['Spectra'][index]
        charge = self.data['Charge'][index]
        if self.file_mode:
            # Only read the 'file' column when it is actually present.
            file = self.data['file'][index]
            return (torch.tensor(seq), torch.tensor(charge),
                    torch.tensor(rt).float(), torch.tensor(intensity),
                    torch.tensor(file))
        return (torch.tensor(seq), torch.tensor(charge),
                torch.tensor(rt).float(), torch.tensor(intensity))

    def __len__(self) -> int:
        return self.data.shape[0]
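# Illustrative construction from an in-memory DataFrame. All values below are
# made up for this sketch (the spectra length of 6 is a placeholder); column
# names match the ones read in __getitem__ above.
def _dataset_example():
    df = pd.DataFrame({
        'Sequence': ['PEPTIDE', 'A-CaC-DEF'],
        'Retention time': [12.3, 45.6],
        'Spectra': [np.zeros(6), np.full(6, 0.5)],
        'Charge': [2, 3],
    })
    ds = Common_Dataset(df, length=12, pad=True, convert=True)
    return ds[0]  # (sequence, charge, retention time, intensity) tensors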
def load_data(path_train, path_val, path_test, batch_size, length, pad=False, convert=False, vocab='unmod'):
    print('Loading data')
    data_train = pd.read_pickle(path_train)
    data_val = pd.read_pickle(path_val)
    data_test = pd.read_pickle(path_test)
    train = Common_Dataset(data_train, length, pad, convert, vocab)
    val = Common_Dataset(data_val, length, pad, convert, vocab)
    test = Common_Dataset(data_test, length, pad, convert, vocab)
    train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)
    # Only the training loader needs shuffling.
    val_loader = DataLoader(val, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader, test_loader
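# Hedged usage sketch: the pickle paths and the batch_size/length values below
# are hypothetical placeholders, not taken from any real configuration.
if __name__ == '__main__':
    train_loader, val_loader, test_loader = load_data(
        'train.pkl', 'val.pkl', 'test.pkl',
        batch_size=64, length=30, pad=True, convert=True, vocab='unmod',
    )
    for seq, charge, rt, intensity in train_loader:
        print(seq.shape, charge.shape, rt.shape, intensity.shape)
        break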