diff --git a/data/data_processing.py b/data/data_processing.py index 9fd62ec6892602bc7bfcb1f037f93b7a2140ef8f..bb012ebdbab7894f5c0c8a561afe131338a816d2 100644 --- a/data/data_processing.py +++ b/data/data_processing.py @@ -1,7 +1,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -# from loess.loess_1d import loess_1d +from loess.loess_1d import loess_1d import time ALPHABET_UNMOD = { diff --git a/data/msp_file_extraction.py b/data/msp_file_extraction.py index 44cced6ede8d779d33abba10aaa59baadc42c609..e198c562a33b7681c25dd61258bda4e39f2423d0 100644 --- a/data/msp_file_extraction.py +++ b/data/msp_file_extraction.py @@ -71,10 +71,31 @@ if __name__ == '__main__': # df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv',index=False) # # - # #write new .msp with new RT - # - # - df= pd.read_csv('spectral_lib/df_predicted_library_oktoberfest.csv') + #write new .msp with new RT + seq=[] + + file = open("spectral_lib/predicted_library.msp", "r") + content=file.readlines() + file.close() + remove = False + predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv') + pred_rt=predicted_lib['rt pred'] + + for i in range(len(content)) : + if remove: + if 'Name:' in content[i]: + remove = False + else : + pass + + if 'Name:'in content[i]: + s=content[i].split(': ')[1].split('/')[0] + if 'C' in s or len(s)>30: + remove=True + else : + seq.append(s) + + df = pd.DataFrame(seq,columns=['sequence']) predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv') diff --git a/diann_lib_processing.py b/diann_lib_processing.py index 2c0ff1f32540dd624a9c2cc4db9c764a8fdf5f53..770538071d83fc92147ff230d335260e3ef7b51b 100644 --- a/diann_lib_processing.py +++ b/diann_lib_processing.py @@ -5,6 +5,8 @@ import pyarrow.parquet as pq import pyarrow as pa import torch import matplotlib.pyplot as plt +from loess.loess_1d import loess_1d + from model.model import ModelTransformer from config import load_args from data.dataset import load_data @@ -93,34 +95,81 @@ def predict(data_pred, model, output_path): if __name__ =='__main__': - # df = load_lib('data/spectral_lib/first_lib.parquet') - # + df = load_lib('spectral_lib/first_lib.parquet') + # plt.hist(df['RT']) # plt.savefig('test.png') # - # df_2 = pd.read_csv('data/data_prosit/data.csv') + # df_2 = pd.read_csv('data_prosit/data.csv') # # plt.clf() # plt.hist(df_2['irt']) # plt.savefig('test2.png') - + # # df_2 = extract_sequence(df).reset_index(drop=True) # # pred = pd.read_csv('../output/out_uniprot_base.csv') - # + # pred['seq']=pred['seq'].map(numerical_to_alphabetical_str) # # pred['Modified.Sequence']=pred['seq'] # # result = pd.merge(df,pred[['Modified.Sequence','rt pred']],on='Modified.Sequence',how='left') # - # result['RT']=result['rt pred'] + # + # + # #alignement + # + # ref = pd.read_csv('data_prosit/data_noc.csv') + # df_ISA = pd.read_csv('data_ISA/data_aligned_isa_noc.csv') + # + # dataset, reference, column_dataset, column_ref, seq_data, seq_ref = df_ISA, ref, 'irt_scaled', 'irt', 'sequence','sequence', + # + # dataset_ref=dataset[dataset['state']=='train'] + # dataset_unique = dataset_ref[[seq_data,column_dataset]].groupby(seq_data).mean() + # print('unique',len(dataset_unique)) + # reference_unique = reference[[seq_ref,column_ref]].groupby(seq_ref).mean() + # seq_ref = reference_unique.index + # seq_common = dataset_unique.index + # seq_ref = seq_ref.tolist() + # seq_common = seq_common.tolist() + # + # seq_ref = [tuple(l) for l in seq_ref] + # seq_common = [tuple(l) for l in seq_common] + # + # ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref)) + # inter = set(ind_dict_ref).intersection(seq_common) + # print(len(inter)) + # + # ind_dict_ref = [ind_dict_ref[x] for x in inter] + # + # indices_common = dict((k, i) for i, k in enumerate(seq_common)) + # indices_common = [indices_common[x] for x in inter] + # + # + # rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index() + # rt_data = dataset_unique[column_dataset][indices_common].reset_index() + # + # plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1) + # plt.savefig('test.png') + # + # #présence de NAN qui casse le réalignement (solution temporaire : remplacer par 0. + # result['rt pred']=result['rt pred'].fillna(value=0) + # xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()), + # xnew=result['rt pred'], + # degree=1, + # npoints=None, rotate=False, sigy=None) + # + # + # #writing results + # + # result['RT'] = yout # # result = result.drop('rt pred', axis=1) # # table = pa.Table.from_pandas(result) # - # pq.write_table(table, 'spectral_lib/custom_first_lib.parquet') + # pq.write_table(table, 'spectral_lib/custom_first_lib_prosit_aligned.parquet')