diff --git a/data/msp_file_extraction.py b/data/msp_file_extraction.py index 3dca185dd8673f9de1930992b2a0da9ec2e327bc..44cced6ede8d779d33abba10aaa59baadc42c609 100644 --- a/data/msp_file_extraction.py +++ b/data/msp_file_extraction.py @@ -40,39 +40,42 @@ def numerical_to_alphabetical_str(s): return seq if __name__ == '__main__': - seq=[] - file = open("spectral_lib/predicted_library.msp", "r") - content=file.readlines() - file.close() - remove = False - index_to_remove=[] - updated_content=[] - ind=0 - predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv') - pred_rt=predicted_lib['rt pred'] - for i in range(len(content)) : - if remove: - if 'Name:' in content[i]: - remove = False - else : - pass - - if 'Name:'in content[i]: - s=content[i].split(': ')[1].split('/')[0] - if 'C' in s or len(s)>30: - remove=True - else : - seq.append(s) - - df = pd.DataFrame(seq,columns=['sequence']) - df = df.drop_duplicates() - df['irt_scaled']=0 - df['state'] = 'holdout' - df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv',index=False) + #extract seq from .msp + # seq=[] + # file = open("spectral_lib/predicted_library.msp", "r") + # content=file.readlines() + # file.close() + # remove = False + # predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv') + # pred_rt=predicted_lib['rt pred'] + # + # for i in range(len(content)) : + # if remove: + # if 'Name:' in content[i]: + # remove = False + # else : + # pass + # + # if 'Name:'in content[i]: + # s=content[i].split(': ')[1].split('/')[0] + # if 'C' in s or len(s)>30: + # remove=True + # else : + # seq.append(s) + # + # df = pd.DataFrame(seq,columns=['sequence']) + # df = df.drop_duplicates() + # df['irt_scaled']=0 + # df['state'] = 'holdout' + # df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv',index=False) + # + # + # #write new .msp with new RT # - updated_content=[] - ind=0 + # + df= pd.read_csv('spectral_lib/df_predicted_library_oktoberfest.csv') + predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv') predicted_lib['seq'] = predicted_lib['seq'].map(numerical_to_alphabetical_str) @@ -82,6 +85,42 @@ if __name__ == '__main__': pred_rt=predicted_lib['rt pred'] df_joined = pd.merge(df,predicted_lib[['rt pred','sequence']],on='sequence',how='left') + # + + file = open("spectral_lib/predicted_library.msp", "r") + content=file.readlines() + file.close() + + remove = False + + with open('spectral_lib/new_lib.msp', 'w') as f: + i=0 + j=0 + k=-1 + for i in range(len(content)) : + k-=1 + if remove: + if 'Name:' in content[i]: + remove = False + else : + pass + + elif 'Name:'in content[i]: + s=content[i].split(': ')[1].split('/')[0] + k=2 + if 'C' in s or len(s)>30: + remove=True + else : + f.write(f"{content[i]}") + i += 1 + elif k==0: + rt = content[i].split('iRT=')[1] + f.write(f"{content[i].replace(rt, str(df_joined['rt pred'][j]))}\n") + i+=1 + j+=1 + else: + f.write(f"{content[i]}") + i+=1 #1787661 avec C , 15104040 sans \ No newline at end of file