import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats  # ensures sp.stats is available on older SciPy versions
import matplotlib.pyplot as plt
from loess.loess_1d import loess_1d
from sklearn.metrics import r2_score
"A": 1,
"C": 2,
"D": 3,
"E": 4,
"F": 5,
"G": 6,
"H": 7,
"I": 8,
"K": 9,
"L": 10,
"M": 11,
"N": 12,
"P": 13,
"Q": 14,
"R": 15,
"S": 16,
"T": 17,
"V": 18,
"W": 19,
"Y": 20,
"CaC": 21,
"OxM": 22
}
ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}
def numerical_to_alphabetical(arr):
    """Decode a sequence of integer codes into an amino-acid string."""
    seq = ''
    for code in arr:
        seq += ALPHABET_UNMOD_REV[code]
    return seq
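# A minimal sanity check for the decoder above, assuming the codes in
# ALPHABET_UNMOD (1 -> 'A', 2 -> 'C', 3 -> 'D'):
# assert numerical_to_alphabetical([1, 2, 3]) == 'ACD'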
def numerical_to_alphabetical_str(s):
    """Decode a string such as '[1, 2, 3]' into an amino-acid string."""
    seq = ''
    s = s.replace('[', '').replace(']', '')
    arr = list(map(int, s.split(',')))
    for code in arr:
        seq += ALPHABET_UNMOD_REV[code]
    return seq
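# Same check for the string variant, which parses a '[1, 2, 3]'-style dump:
# assert numerical_to_alphabetical_str('[1, 2, 3]') == 'ACD'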
def align(dataset, reference):
    """Map 'Retention time' in dataset onto the reference run via a LOESS
    fit on the peptide sequences the two runs share."""
    seq_ref = reference['Sequence']
    seq_common = dataset['Sequence']
    seq_ref = seq_ref.tolist()
    seq_common = seq_common.tolist()
    seq_ref = [tuple(l) for l in seq_ref]
    seq_common = [tuple(l) for l in seq_common]
    # Positions of the sequences present in both runs.
    ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref))
    inter = set(ind_dict_ref).intersection(seq_common)
    ind_dict_ref = [ind_dict_ref[x] for x in inter]
    indices_common = dict((k, i) for i, k in enumerate(seq_common))
    indices_common = [indices_common[x] for x in inter]
    rt_ref = reference['Retention time'][ind_dict_ref].reset_index()
    rt_data = dataset['Retention time'][indices_common].reset_index()
    # LOESS fit of reference RT as a function of dataset RT on the shared
    # peptides, evaluated on every retention time in the dataset.
    xout, yout, wout = loess_1d(np.array(rt_data['Retention time'].tolist()),
                                np.array(rt_ref['Retention time'].tolist()),
                                xnew=np.array(dataset['Retention time']),
                                degree=1, frac=0.5,
                                npoints=None, rotate=False, sigy=None)
    dataset['Retention time'] = yout
    return dataset
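# Minimal usage sketch for align(), assuming two tables that both carry
# 'Sequence' and 'Retention time' columns (these file names are hypothetical):
# ref = pd.read_pickle('database/run_reference.pkl')
# run = pd.read_pickle('database/run_new.pkl')
# run_aligned = align(run, ref)  # RTs of `run` mapped onto the reference scale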
def filter_cysteine(df, col):
    """Drop rows whose sequence in column `col` contains a cysteine."""
    def map_cys(seq):
        return 'C' not in seq
    df['cys'] = df[col].map(map_cys)
    data = df[df['cys']].reset_index(drop=True)
    return data
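# Sketch: keep only cysteine-free peptides (the column name is an assumption):
# df_no_cys = filter_cysteine(df, 'Sequence')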
def compare_include_df(df, sub_df, save=True, path='temp.png'):
    """Scatter sub_df retention times against the matching rows of df and
    overlay a linear fit."""
    df_value_list = []
    df_sub_value_list = []
    for _, row in sub_df.iterrows():
        try:
            df_value_list.append(
                df[df['Sequence'] == row['Sequence']]['Retention time'].reset_index(drop=True)[0])
            df_sub_value_list.append(row['Retention time'])
        except (KeyError, IndexError):
            # Sequence absent from df: skip this row.
            pass
    fig, ax = plt.subplots()
    ax.scatter(df_sub_value_list, df_value_list)
    # Regress df values on sub_df values so the fitted line matches the
    # scatter axes (sub_df on x, df on y).
    x = np.array([min(df_sub_value_list), max(df_sub_value_list)])
    linreg = sp.stats.linregress(df_sub_value_list, df_value_list)
    ax.annotate("r-squared = {:.3f}".format(r2_score(df_value_list, df_sub_value_list)),
                (0.05, 0.95), xycoords='axes fraction')
    ax.plot(x, linreg.intercept + linreg.slope * x, 'r')
    if save:
        plt.savefig(path)
    plt.clf()
    return df_value_list, df_sub_value_list
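# Sketch of a comparison call, reusing the frames built in the commented
# blocks below (the output path is hypothetical):
# vals, sub_vals = compare_include_df(df_diann_aligned, df_ISA, save=True,
#                                     path='diann_vs_isa.png')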
# data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True)
# # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
#
# data_train = load_data('msms/msms16_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_16_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms17_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_17_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms20_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_20_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms23_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_23_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms24_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_24_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms30_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True)
# # data_align = align(data_train, data_ori)
# data_train.to_pickle('database/data_DIA_30_01_aligned30_01.pkl')
# plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1)
# plt.savefig('test_align_2.png')
#
# dataset_ref = pd.read_pickle('database/data_01_16_DIA_ISA_55.pkl')
# data_ref = Common_Dataset(dataset_ref, 25).data
# dataset_2 = pd.read_pickle('database/data_01_20_DIA_ISA_55.pkl')
# data_2 = Common_Dataset(dataset_2, 25).data
# dataset_3 = pd.read_pickle('database/data_01_17_DIA_ISA_55.pkl')
# data_3 = Common_Dataset(dataset_3, 25).data
# dataset_4 = pd.read_pickle('database/data_01_23_DIA_ISA_55.pkl')
# data_4 = Common_Dataset(dataset_4, 25).data
# data_align_3 = align(data_3, data_ref)
# data_align_4 = align(data_4, data_ref)
#
# data = pd.concat([data_ref, data_2, data_align_3, data_align_4], ignore_index=True)
# data = data.drop(columns='index')
# data['Sequence'] = data['Sequence'].map(numerical_to_alphabetical)
# num_data = data.shape[0]
# train_num = np.floor(num_data*0.8)
# train_size=0
# list_train=[]
# list_test=[]
# groups = data.groupby('Sequence')
# for seq, gr in groups:
#
# train_size+= gr.shape[0]
#
# if train_size>train_num:
# list_test.append(gr)
# else:
# list_train.append(gr)
#
#
# dataset_train = pd.concat(list_train, ignore_index=True)
# dataset_test = pd.concat(list_test, ignore_index=True)
# dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
# dataset_test.to_pickle('database/data_DIA_ISA_55_test.pkl')
# data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True)
# data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True)
# data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True)
# data_ori['Sequence']=data_ori['sequence']
# data_ori['Retention time']=data_ori['irt']
# data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True)
# data_align = align(data_train, data_ori)
#
# data_align.to_pickle('database/data_ISA_dual_align.pkl')
# Compare DIA-NN predictions to DIA measurements
# df_ori = pd.read_csv('database/data_train.csv')
# df_ori['Sequence']=df_ori['sequence']
# df_ori['Retention time']=df_ori['irt']
# df_diann = pd.read_csv('database/CIT_BASE_UP000584719_546.csv')
#
# df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
#
#
#
# df_diann_aligned = align(df_diann, df_ori)
#
# df_value_list, df_sub_value_list = compare_include_df(df_diann_aligned, df_ISA, True)
# Create augmented dataset from ISA data + column-invariant Prosit peptides
df_base = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl')
df_base = df_base[['Sequence','Retention time']]
df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl')
df_1['Sequence'] = df_1['Sequence'].map(numerical_to_alphabetical_str)
df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl')
df_2['Sequence'] = df_2['Sequence'].map(numerical_to_alphabetical_str)
df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl')
df_3['Sequence'] = df_3['Sequence'].map(numerical_to_alphabetical_str)

df_augmented_1 = pd.concat([df_1, df_base], axis=0).reset_index(drop=True)
df_augmented_1.columns = ['sequence', 'irt']
df_augmented_1['state'] = 'train'
df_augmented_1.to_csv('database/data_ISA_augmented_1_30_01.csv')

df_augmented_2 = pd.concat([df_2, df_base], axis=0).reset_index(drop=True)
df_augmented_2.columns = ['sequence', 'irt']
df_augmented_2['state'] = 'train'
df_augmented_2.to_csv('database/data_ISA_augmented_2_30_01.csv')

df_augmented_3 = pd.concat([df_3, df_base], axis=0).reset_index(drop=True)
df_augmented_3.columns = ['sequence', 'irt']
df_augmented_3['state'] = 'train'
df_augmented_3.to_csv('database/data_ISA_augmented_3_30_01.csv')
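# Quick sanity check on one augmented file; a sketch, assuming to_csv wrote
# the default integer index as column 0:
# df_check = pd.read_csv('database/data_ISA_augmented_1_30_01.csv', index_col=0)
# assert list(df_check.columns) == ['sequence', 'irt', 'state']
# assert len(df_check) == len(df_1) + len(df_base)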
# Testing the intersection between the test set and the augmented dataset
df_sup = pd.read_pickle('database/data_prosit_threshold_3.pkl')
df_test = pd.read_pickle('database/data_DIA_ISA_55_test.pkl')
inter = []
df_sup['Sequence'] = df_sup['Sequence'].map(numerical_to_alphabetical_str)
groups_sup = df_sup.groupby('Sequence')
groups_test = df_test.groupby('Sequence')
for seq, _ in groups_sup:
    for seq2, _ in groups_test:
        if seq2 == seq:
            inter.append(seq)
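# Equivalent and much faster via set intersection (a sketch; the ordering of
# `inter` may differ from the nested loop above):
# inter = sorted(set(df_sup['Sequence']) & set(df_test['Sequence']))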