def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref):
    """Map ``dataset[column_dataset]`` onto the scale of ``reference[column_ref]``.

    The rows of ``dataset`` whose ``state`` equals ``'train'`` and all rows of
    ``reference`` are averaged per sequence. The sequences present in both
    tables give paired (dataset value, reference value) points which are
    handed to ``loess_1d``; the fit is evaluated at every value of
    ``dataset[column_dataset]`` and the result overwrites that column.

    Args:
        dataset: DataFrame holding a ``state`` column plus ``seq_data`` and
            ``column_dataset``. It is modified in place.
        reference: DataFrame holding ``seq_ref`` and ``column_ref``.
        column_dataset: name of the column of ``dataset`` to realign.
        column_ref: name of the reference column giving the target scale.
        seq_data: name of the sequence column of ``dataset``.
        seq_ref: name of the sequence column of ``reference``.

    Returns:
        The same ``dataset`` object, with ``column_dataset`` replaced.

    Raises:
        ValueError: if the two tables share no sequence, since there would
            be no point to fit on.

    Side effects:
        Prints the number of unique training sequences and the number of
        shared sequences, and writes ``test.png`` and ``test_2.png`` to the
        current working directory.
    """
    train_rows = dataset[dataset['state'] == 'train']
    dataset_unique = train_rows[[seq_data, column_dataset]].groupby(seq_data).mean()
    print('unique', len(dataset_unique))
    reference_unique = reference[[seq_ref, column_ref]].groupby(seq_ref).mean()

    # Sequences are turned into tuples so that they can be used as dict/set
    # keys whatever their original container type is.
    ref_keys = [tuple(key) for key in reference_unique.index.tolist()]
    data_keys = [tuple(key) for key in dataset_unique.index.tolist()]

    ref_position = {key: pos for pos, key in enumerate(ref_keys)}
    data_position = {key: pos for pos, key in enumerate(data_keys)}

    inter = set(ref_position).intersection(data_keys)
    print(len(inter))
    if not inter:
        raise ValueError(
            'align: dataset and reference have no sequence in common'
        )

    # Freeze one iteration order so both position lists stay paired.
    shared = list(inter)
    ref_rows = [ref_position[key] for key in shared]
    data_rows = [data_position[key] for key in shared]

    # The lists hold row positions, not index labels, so .iloc is required;
    # plain Series[...] with integers on a label index is deprecated.
    rt_ref = reference_unique[column_ref].iloc[ref_rows].to_numpy()
    rt_data = dataset_unique[column_dataset].iloc[data_rows].to_numpy()

    plt.scatter(rt_data.tolist(), rt_ref.tolist(), s=0.1)
    plt.savefig('test.png')

    xout, yout, _ = loess_1d(rt_data, rt_ref,
                             xnew=dataset[column_dataset],
                             degree=1, frac=0.25,
                             npoints=None, rotate=False, sigy=None)

    # NOTE: the figure is not cleared, so this scatter is drawn on top of
    # the previous one before being saved.
    plt.scatter(xout, yout, s=0.1)
    plt.savefig('test_2.png')

    dataset[column_dataset] = yout
    return dataset
+ "L": 10, + "M": 11, + "N": 12, + "P": 13, + "Q": 14, + "R": 15, + "S": 16, + "T": 17, + "V": 18, + "W": 19, + "Y": 20, + "M(UniMod:35)": 21, + "CaC": 22 +} + +ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()} + +def numerical_to_alphabetical_str(s): + seq = '' + s = s.replace('[','') + s = s.replace(']', '') + arr = s.split(',') + arr = list(map(int, arr)) + for i in range(len(arr)): + seq+=ALPHABET_UNMOD_REV[arr[i]] + return seq + +def load_lib(path): + table = pq.read_table(path) + table = table.to_pandas() + + return table + +def extract_sequence(data_frame): + + seq = data_frame['Modified.Sequence'] + + df_pred = pd.DataFrame(seq) + df_pred.columns = ['sequence'] + df_pred['sequence']=df_pred['sequence'].map(lambda x:x.replace('M(UniMod:35)','-OxM-')) + df_pred['remove']=df_pred['sequence'].map((lambda x : 'U' in x)) + df_pred = df_pred[df_pred['remove']==False] + df_pred = df_pred[['sequence']] + df_pred['irt_scaled']=0 + df_pred['state'] = 'holdout' + + df_pred = df_pred.drop_duplicates() + + return df_pred + +def predict(data_pred, model, output_path): + data_frame = pd.DataFrame() + model.eval() + for param in model.parameters(): + param.requires_grad = False + + pred_rt, seqs, true_rt = [], [], [] + for seq, rt in data_pred: + rt = rt.float() + if torch.cuda.is_available(): + seq, rt = seq.cuda(), rt.cuda() + pr_rt = model.forward(seq) + pred_rt.extend(pr_rt.data.cpu().tolist()) + seqs.extend(seq.data.cpu().tolist()) + true_rt.extend(rt.data.cpu().tolist()) + + data_frame['rt pred'] = pred_rt + data_frame['seq'] = seqs + data_frame['true rt'] = true_rt + data_frame.to_csv(output_path) + + +if __name__ =='__main__': + df = load_lib('spectral_lib/first_lib.parquet') + + plt.hist(df['RT']) + plt.savefig('test.png') + + df_2 = pd.read_csv('data_prosit/data.csv') + + plt.clf() + plt.hist(df_2['irt']) + plt.savefig('test2.png') + + # df_2 = extract_sequence(df).reset_index(drop=True) + # + # pred = pd.read_csv('../output/out_uniprot_base.csv') + # 
def read_msp_sequences(path):
    """Collect the sequence of every ``Name:`` record line of an MSP file.

    For each line starting with ``Name:``, the text after the first colon
    and before the first ``/`` is kept, with surrounding whitespace removed.
    Splitting on the first colon only keeps names that themselves contain a
    colon (e.g. ``M(UniMod:35)``) intact.

    Args:
        path: location of the MSP text file.

    Returns:
        The sequences as a list of strings, in file order.
    """
    sequences = []
    # The context manager closes the file even if parsing fails midway.
    with open(path, "r") as handle:
        for line in handle:
            if not line.startswith('Name:'):
                continue
            value = line.split(':', 1)[1]
            sequences.append(value.split('/')[0].strip())
    return sequences


if __name__ == '__main__':
    seq = read_msp_sequences("spectral_lib/predicted_library.msp")

    # Same table layout as the other prediction inputs: a placeholder
    # target of 0 and every row flagged as 'holdout'.
    df = pd.DataFrame(seq, columns=['sequence'])
    df['irt_scaled'] = 0
    df['state'] = 'holdout'
    df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv', index=False)
a/data/spectral_lib/df_predicted_library_oktoberfest.csv b/data/spectral_lib/df_predicted_library_oktoberfest.csv new file mode 100644 index 0000000000000000000000000000000000000000..1387ae724c7d4a2bb6de0e881f852f4500e156c3 Binary files /dev/null and b/data/spectral_lib/df_predicted_library_oktoberfest.csv differ diff --git a/diann_lib_processing.py b/diann_lib_processing.py deleted file mode 100644 index 51ba849b5896c1defa87284e239807a95c1d6335..0000000000000000000000000000000000000000 --- a/diann_lib_processing.py +++ /dev/null @@ -1,79 +0,0 @@ -import numpy as np -import pandas as pb -import pandas as pd -import pyarrow.parquet as pq -import torch -from model.model import ModelTransformer -from config import load_args -from data.dataset import load_data - - -def load_lib(path): - table = pq.read_table('data/spectral_lib/first_lib.parquet') - table = table.to_pandas() - - return table - -def extract_sequence(data_frame): - - seq = data_frame['Modified.Sequence'] - - df_pred = pd.DataFrame(seq) - df_pred.columns = ['sequence'] - df_pred['sequence']=df_pred['sequence'].map(lambda x:x.replace('M(UniMod:35)','-OxM-')) - df_pred['remove']=df_pred['sequence'].map((lambda x : 'U' in x)) - df_pred = df_pred[df_pred['remove']==False] - df_pred = df_pred[['sequence']] - df_pred['irt_scaled']=0 - df_pred['state'] = 'holdout' - - df_pred = df_pred.drop_duplicates() - - return df_pred - -def predict(data_pred, model, output_path): - data_frame = pd.DataFrame() - model.eval() - for param in model.parameters(): - param.requires_grad = False - - pred_rt, seqs, true_rt = [], [], [] - for seq, rt in data_pred: - rt = rt.float() - if torch.cuda.is_available(): - seq, rt = seq.cuda(), rt.cuda() - pr_rt = model.forward(seq) - pred_rt.extend(pr_rt.data.cpu().tolist()) - seqs.extend(seq.data.cpu().tolist()) - true_rt.extend(rt.data.cpu().tolist()) - - data_frame['rt pred'] = pred_rt - data_frame['seq'] = seqs - data_frame['true rt'] = true_rt - data_frame.to_csv(output_path) - - -if 
__name__ =='__main__': - # df = load_lib('data/spectral_lib/first_lib.parquet') - # df_2 = extract_sequence(df).reset_index(drop=True) - # df_2.to_csv('data/spectral_lib/data_uniprot_base.csv', index=False) - - - - args = load_args() - - model = ModelTransformer(encoder_ff=args.encoder_ff, decoder_rt_ff=args.decoder_rt_ff, - n_head=args.n_head, encoder_num_layer=args.encoder_num_layer, - decoder_rt_num_layer=args.decoder_rt_num_layer, drop_rate=args.drop_rate, - embedding_dim=args.embedding_dim, acti=args.activation, norm=args.norm_first, seq_length=30) - - if torch.cuda.is_available(): - model = model.cuda() - - model.load_state_dict(torch.load(args.model_weigh, weights_only=True)) - - print(args.dataset_test) - data_test = load_data(data_source='data/spectral_lib/data_uniprot_base.csv', batch_size=args.batch_size, length=30, mode=args.split_test, - seq_col=args.seq_test) - - predict(data_test, model, args.output) \ No newline at end of file