Skip to content
Snippets Groups Projects
Commit c83febd6 authored by Schneider Leo's avatar Schneider Leo
Browse files

fix: optimal peptide set extraction

parent dc91f876
No related branches found
No related tags found
No related merge requests found
...@@ -7,6 +7,14 @@ from analyse_diann_digestion import load_lib ...@@ -7,6 +7,14 @@ from analyse_diann_digestion import load_lib
import matplotlib.image as mpimg import matplotlib.image as mpimg
import re import re
# Enterobacterales species covered by the reference-peptide database.
# Iterated by the __main__ block below, which writes one peptide FASTA
# per species under 'fasta/optimal peptide set/'.
SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter hormaechei', 'Klebsiella aerogenes',
           'Klebsiella variicola', 'Raoultella ornithinolytica', 'Citrobacter farmeri', 'Enterobacter asburiae',
           'Enterobacter kobei', 'Klebsiella michiganensis', 'Morganella morganii', 'Salmonella enterica',
           'Citrobacter amalonaticus', 'Citrobacter freundii', 'Enterobacter chengduensis', 'Escherichia coli',
           'Klebsiella oxytoca', 'Proteus mirabilis', 'Serratia marcescens', 'Citrobacter braakii',
           'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae',
           'Providencia stuartii']
ALPHABET_UNMOD = { ALPHABET_UNMOD = {
"A": 1, "A": 1,
"C": 2, "C": 2,
...@@ -30,7 +38,6 @@ ALPHABET_UNMOD = { ...@@ -30,7 +38,6 @@ ALPHABET_UNMOD = {
"Y": 20, "Y": 20,
} }
MASSES_MONO = { MASSES_MONO = {
"A": 71.03711, "A": 71.03711,
"C": 103.00919, "C": 103.00919,
...@@ -54,7 +61,6 @@ MASSES_MONO = { ...@@ -54,7 +61,6 @@ MASSES_MONO = {
"Y": 163.06333, "Y": 163.06333,
} }
MASSES_AVG = { MASSES_AVG = {
"A": 71.0788, "A": 71.0788,
"C": 103.1388, "C": 103.1388,
...@@ -78,6 +84,7 @@ MASSES_AVG = { ...@@ -78,6 +84,7 @@ MASSES_AVG = {
"Y": 163.1760, "Y": 163.1760,
} }
# trypsin cut after K or R (if not followed by P) # trypsin cut after K or R (if not followed by P)
def cut(seq): def cut(seq):
...@@ -102,12 +109,14 @@ def cut_with_ind(seq, ind_list): ...@@ -102,12 +109,14 @@ def cut_with_ind(seq, ind_list):
return l return l
def digest(seq):
    """In-silico digestion of a protein sequence into peptides.

    Computes the cleavage indices with ``cut`` (trypsin-style per the module
    comment: after K/R not followed by P) and slices the sequence at those
    positions with ``cut_with_ind``.
    """
    return cut_with_ind(seq, cut(seq))
def fasta_similarity(path_fasta_1, path_fasta_2): def fasta_similarity(path_fasta_1, path_fasta_2):
list_seq_1=[] list_seq_1 = []
list_seq_2 = [] list_seq_2 = []
for record in fastapy.parse(path_fasta_1): for record in fastapy.parse(path_fasta_1):
list_seq_1.extend(digest(record.seq)) list_seq_1.extend(digest(record.seq))
...@@ -121,18 +130,20 @@ def fasta_similarity(path_fasta_1, path_fasta_2): ...@@ -121,18 +130,20 @@ def fasta_similarity(path_fasta_1, path_fasta_2):
plt.show() plt.show()
plt.savefig('fasta_similarity.png') plt.savefig('fasta_similarity.png')
def split_string(input_string):
    """Split *input_string* at each underscore that precedes an uppercase
    letter, or at a literal ``'c'`` that precedes an uppercase letter.

    The matched separator character is consumed (not kept in the parts).
    """
    # NOTE(review): the 'c(?=[A-Z])' branch also fires inside ordinary words
    # where a lowercase 'c' is followed by a capital — presumably deliberate
    # for this label format; confirm against the input files.
    separator = r'_(?=[A-Z])|c(?=[A-Z])'
    return re.split(separator, input_string)
def build_database_ref_peptide(): def build_database_ref_peptide():
l=[] l = []
with open('../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt', 'r') as f: with open('../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt', 'r') as f:
for line in f: for line in f:
if line != '\n': if line != '\n':
if '>' in line: if '>' in line:
#typo ?? # typo ??
line = line.replace('no_family','No_family') line = line.replace('no_family', 'No_family')
line = line.replace('no_order', 'No_order') line = line.replace('no_order', 'No_order')
split_line = line.split('_') split_line = line.split('_')
...@@ -140,16 +151,17 @@ def build_database_ref_peptide(): ...@@ -140,16 +151,17 @@ def build_database_ref_peptide():
err = split_line[1] err = split_line[1]
prev = split_line[2] prev = split_line[2]
split_line = split_string(line.split(' ')[1]) split_line = split_string(line.split(' ')[1])
spe = split_line[0].replace('_',' ') spe = split_line[0].replace('_', ' ')
gen = split_line[1].replace('_',' ') gen = split_line[1].replace('_', ' ')
fam = split_line[2].replace('_',' ') fam = split_line[2].replace('_', ' ')
o = split_line[3].replace('_',' ') o = split_line[3].replace('_', ' ')
else : else:
seq = line.split(' ')[1] seq = line.split(' ')[1]
l.append({'Sequence' : seq,'Protein code' :prot , 'Error treshold':err , 'Prevalance': prev, l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev,
'Specie':spe ,'Genus':gen ,'Family':fam ,'Order':o }) 'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o})
return pd.DataFrame(l) return pd.DataFrame(l)
def compute_mass(seq, isotop): def compute_mass(seq, isotop):
m = 0 m = 0
if isotop == 'mono': if isotop == 'mono':
...@@ -160,55 +172,53 @@ def compute_mass(seq, isotop): ...@@ -160,55 +172,53 @@ def compute_mass(seq, isotop):
m += MASSES_AVG[char] * seq.count(char) m += MASSES_AVG[char] * seq.count(char)
return m return m
def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
    """Build a binary MS1 reference image (retention-time cycle x m/z bin).

    Digests every protein in *path_fasta*, computes average-mass m/z ratios
    for each peptide at every charge in *possible_charge*, and marks cells of
    a ``max_cycle x n_bin_ms1`` matrix.  *rt_pred* supplies the predicted
    retention times used to normalise the RT axis.

    Returns a numpy array of shape ``(max_cycle, int((ms1_end_mz - ms1_start_mz) // bin_mz))``.
    """
    # build peptide list
    protein_seqs = []
    for record in fastapy.parse(path_fasta):
        protein_seqs.append(record.seq)
    list_peptides = []
    for prot in protein_seqs:
        list_peptides.extend(digest(prot))
    # compute m/z ratios per peptide
    mz_ratio = {}
    i = 0
    list_peptides = list(set(list_peptides))
    for seq in list_peptides:
        # FIX: key by the actual peptide sequence; the original used the
        # literal key 'seq', so every peptide overwrote a single entry.
        mz_ratio[seq] = []
        for charge in possible_charge:
            ratio = compute_mass(seq, 'avg') / charge
            if ms1_end_mz > ratio > ms1_start_mz:
                mz_ratio[seq].append(ratio)
                i += 1
    print(i)
    # TODO: associate predicted RT with each peptide; `data` is expected to
    # hold (rt, mz) pairs but is never populated yet (work in progress).
    data = []
    # predict detectability (optional)
    # build image
    total_ms1_mz = ms1_end_mz - ms1_start_mz
    n_bin_ms1 = int(total_ms1_mz // bin_mz)
    im = np.zeros([max_cycle, n_bin_ms1])
    max_rt = np.max(rt_pred)
    min_rt = np.min(rt_pred)
    # epsilon avoids division by zero when all RTs are equal, matching the
    # convention used by build_ref_image_from_diann.
    total_rt = max_rt - min_rt + 1e-3
    for (rt, mz) in data:
        # FIX: the original computed rt - (min_rt / total_rt) — missing
        # parentheses around the RT normalisation (sibling function has the
        # correct (rt - min_rt) / total_rt form).  Indices are clamped so the
        # maximal rt / mz cannot step one past the last row/column.
        row = min(int((rt - min_rt) / total_rt * max_cycle), max_cycle - 1)
        col = min(int((mz - ms1_start_mz) / total_ms1_mz * n_bin_ms1), n_bin_ms1 - 1)
        im[row, col] = 1
    return im
def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None): def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
df = load_lib(path_parqet) df = load_lib(path_parqet)
df=df[['Stripped.Sequence','Precursor.Charge','RT','Precursor.Mz']] df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
df_unique = df.drop_duplicates() df_unique = df.drop_duplicates()
#build image # build image
total_ms1_mz = ms1_end_mz - ms1_start_mz total_ms1_mz = ms1_end_mz - ms1_start_mz
n_bin_ms1 = int(total_ms1_mz // bin_mz) n_bin_ms1 = int(total_ms1_mz // bin_mz)
im = np.zeros([max_cycle, n_bin_ms1]) im = np.zeros([max_cycle, n_bin_ms1])
...@@ -216,30 +226,38 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma ...@@ -216,30 +226,38 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
max_rt = np.max(df_unique['RT']) max_rt = np.max(df_unique['RT'])
if min_rt is None: if min_rt is None:
min_rt = np.min(df_unique['RT']) min_rt = np.min(df_unique['RT'])
total_rt = max_rt - min_rt +1e-3 total_rt = max_rt - min_rt + 1e-3
for row in df_unique.iterrows() : for row in df_unique.iterrows():
if 900 > int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1) >= 0: if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0:
im[int((row[1]['RT']-min_rt)/total_rt*max_cycle),int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1)]=1 im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int(
((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
return im return im
if __name__ == '__main__':
    # Export the reference peptides: one '<species>.fasta' file per entry of
    # SPECIES, written under 'fasta/optimal peptide set/'.  A species with no
    # peptides is reported on stdout but still yields an (empty) file.
    df = build_database_ref_peptide()
    for species_name in SPECIES:
        peptides = df[df['Specie']==species_name]['Sequence'].to_list()
        with open('fasta/optimal peptide set/'+species_name+'.fasta',"w") as out:
            if not peptides:
                print('Empty : ',species_name)
            out.write(''.join(peptides))
    # NOTE(review): the disabled per-species reference-image generation
    # (load_lib on the full proteome + build_ref_image_from_diann, saving
    # 'img_ref/<species>.png' and '<species>.npy') previously lived here;
    # recover it from version control if needed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment