# msp_file_extraction.py (3.59 KiB)
# Schneider Leo authored 10195944
import pandas as pd
from sympy import false
# Token -> integer code table. Codes follow the position of each token in the
# tuple below: 0 is the empty string, 1-20 are single-letter residues, and
# 21/22 are the multi-character tokens "M(UniMod:35)" and "CaC".
ALPHABET_UNMOD = {
    token: code
    for code, token in enumerate(
        (
            "",
            "A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
            "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y",
            "M(UniMod:35)",
            "CaC",
        )
    )
}

# Inverse table: integer code -> token.
ALPHABET_UNMOD_REV = dict(zip(ALPHABET_UNMOD.values(), ALPHABET_UNMOD.keys()))
def numerical_to_alphabetical_str(s):
    """Decode a stringified list of integer codes into a token string.

    Args:
        s: Text such as ``"[1, 5, 21, 0]"``: comma-separated integer codes,
            optionally wrapped in square brackets.

    Returns:
        The ``ALPHABET_UNMOD_REV`` tokens of the codes, concatenated in
        order. An input without any code (``"[]"`` or ``""``) gives ``""``.

    Raises:
        ValueError: If a non-blank item is not an integer literal.
        KeyError: If a code has no entry in ``ALPHABET_UNMOD_REV``.
    """
    items = s.replace('[', '').replace(']', '').split(',')
    # Blank items (from "[]" or a trailing comma) are skipped rather than
    # handed to int(), which would raise on an empty string.
    return ''.join(
        ALPHABET_UNMOD_REV[int(item)] for item in items if item.strip()
    )
# Number of lines between a record's "Name:" line and its "iRT=" line.
IRT_LINE_OFFSET = 2

# Records whose sequence is longer than this are left out of the new library.
MAX_SEQUENCE_LENGTH = 30


def sequence_from_name_line(line):
    """Return the sequence part of an .msp ``Name:`` line.

    The sequence is the text after the first ``": "`` and before the first
    ``"/"``, e.g. ``"AAK"`` for ``"Name: AAK/2"``.

    Raises:
        IndexError: If the line contains no ``": "`` separator.
    """
    return line.split(': ')[1].split('/')[0]


def is_excluded_sequence(sequence):
    """Tell whether a record with this sequence is dropped from the output.

    A record is dropped when the sequence contains the character ``C`` or is
    longer than ``MAX_SEQUENCE_LENGTH`` characters.
    """
    return 'C' in sequence or len(sequence) > MAX_SEQUENCE_LENGTH


def rewrite_msp_lines(lines, rt_by_sequence):
    """Yield the lines of an .msp library with filtered records and new iRT.

    Every line containing ``Name:`` starts a record. Excluded records (see
    ``is_excluded_sequence``) are skipped entirely. For kept records the line
    ``IRT_LINE_OFFSET`` lines after the ``Name:`` line has everything after
    ``iRT=`` replaced by the predicted retention time; all other lines are
    passed through unchanged.

    The retention time is looked up by sequence rather than by running
    position, and every ``Name:`` line is evaluated, including the one that
    directly follows a dropped record. This keeps each record paired with its
    own prediction.

    Args:
        lines: Iterable of text lines (an open file works and is streamed).
        rt_by_sequence: Mapping from sequence to predicted retention time.
            Sequences without an entry are written with ``nan``.

    Yields:
        The output lines, each ending as in the input (the rewritten iRT line
        always ends with a newline).

    Raises:
        ValueError: If the expected iRT line does not contain ``iRT=``.
    """
    skipping = False
    lines_until_irt = -1
    current_rt = None
    for line in lines:
        lines_until_irt -= 1
        if 'Name:' in line:
            sequence = sequence_from_name_line(line)
            skipping = is_excluded_sequence(sequence)
            if skipping:
                continue
            lines_until_irt = IRT_LINE_OFFSET
            current_rt = rt_by_sequence.get(sequence, float('nan'))
            yield line
        elif skipping:
            continue
        elif lines_until_irt == 0:
            head, marker, _ = line.partition('iRT=')
            if not marker:
                raise ValueError(f"expected 'iRT=' in line: {line!r}")
            yield f"{head}{marker}{current_rt}\n"
        else:
            yield line


if __name__ == '__main__':
    # Step 1 (disabled): extract seq from .msp into a CSV of sequences.
    # seq=[]
    # file = open("spectral_lib/predicted_library.msp", "r")
    # content=file.readlines()
    # file.close()
    # remove = False
    # predicted_lib=pd.read_csv('../output/out_lib_oktoberfest.csv')
    # pred_rt=predicted_lib['rt pred']
    #
    # for i in range(len(content)) :
    #     if remove:
    #         if 'Name:' in content[i]:
    #             remove = False
    #         else :
    #             pass
    #
    #     if 'Name:'in content[i]:
    #         s=content[i].split(': ')[1].split('/')[0]
    #         if 'C' in s or len(s)>30:
    #             remove=True
    #         else :
    #             seq.append(s)
    #
    # df = pd.DataFrame(seq,columns=['sequence'])
    # df = df.drop_duplicates()
    # df['irt_scaled']=0
    # df['state'] = 'holdout'
    # df.to_csv('spectral_lib/df_predicted_library_oktoberfest.csv',index=False)

    # Step 2: write new .msp with new RT.
    predicted_lib = pd.read_csv('../output/out_lib_oktoberfest.csv')
    predicted_lib['sequence'] = predicted_lib['seq'].map(
        numerical_to_alphabetical_str
    )
    # One retention time per sequence; if the CSV repeats a sequence, the
    # first prediction is used.
    rt_by_sequence = (
        predicted_lib.drop_duplicates(subset='sequence')
        .set_index('sequence')['rt pred']
        .to_dict()
    )

    # Stream input to output so the library is never held fully in memory.
    with open("spectral_lib/predicted_library.msp", "r") as source, \
            open('spectral_lib/new_lib.msp', 'w') as target:
        target.writelines(rewrite_msp_lines(source, rt_by_sequence))

    # Counts noted by the original author: 1787661 with C, 15104040 without.