Skip to content
Snippets Groups Projects
Commit c83febd6 authored by Schneider Leo's avatar Schneider Leo
Browse files

fix: optimal peptide set extraction

parent dc91f876
No related branches found
No related tags found
No related merge requests found
...@@ -7,6 +7,14 @@ from analyse_diann_digestion import load_lib ...@@ -7,6 +7,14 @@ from analyse_diann_digestion import load_lib
import matplotlib.image as mpimg import matplotlib.image as mpimg
import re import re
# Enterobacterales species covered by the reference-peptide database.
# Iterated by the __main__ block below, which writes one peptide FASTA
# per species under 'fasta/optimal peptide set/'.
SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter hormaechei', 'Klebsiella aerogenes',
           'Klebsiella variicola', 'Raoultella ornithinolytica', 'Citrobacter farmeri', 'Enterobacter asburiae',
           'Enterobacter kobei', 'Klebsiella michiganensis', 'Morganella morganii', 'Salmonella enterica',
           'Citrobacter amalonaticus', 'Citrobacter freundii', 'Enterobacter chengduensis', 'Escherichia coli',
           'Klebsiella oxytoca', 'Proteus mirabilis', 'Serratia marcescens', 'Citrobacter braakii',
           'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae',
           'Providencia stuartii']
ALPHABET_UNMOD = { ALPHABET_UNMOD = {
"A": 1, "A": 1,
"C": 2, "C": 2,
...@@ -30,7 +38,6 @@ ALPHABET_UNMOD = { ...@@ -30,7 +38,6 @@ ALPHABET_UNMOD = {
"Y": 20, "Y": 20,
} }
MASSES_MONO = { MASSES_MONO = {
"A": 71.03711, "A": 71.03711,
"C": 103.00919, "C": 103.00919,
...@@ -54,7 +61,6 @@ MASSES_MONO = { ...@@ -54,7 +61,6 @@ MASSES_MONO = {
"Y": 163.06333, "Y": 163.06333,
} }
MASSES_AVG = { MASSES_AVG = {
"A": 71.0788, "A": 71.0788,
"C": 103.1388, "C": 103.1388,
...@@ -78,6 +84,7 @@ MASSES_AVG = { ...@@ -78,6 +84,7 @@ MASSES_AVG = {
"Y": 163.1760, "Y": 163.1760,
} }
# trypsin cut after K or R (if not followed by P) # trypsin cut after K or R (if not followed by P)
def cut(seq): def cut(seq):
...@@ -102,12 +109,14 @@ def cut_with_ind(seq, ind_list): ...@@ -102,12 +109,14 @@ def cut_with_ind(seq, ind_list):
return l return l
def digest(seq):
    """In-silico digestion of a protein sequence into peptides.

    Computes the cleavage indices with ``cut`` (trypsin-style per the module
    comment: after K/R not followed by P) and slices the sequence at those
    positions with ``cut_with_ind``.
    """
    return cut_with_ind(seq, cut(seq))
def fasta_similarity(path_fasta_1, path_fasta_2): def fasta_similarity(path_fasta_1, path_fasta_2):
list_seq_1=[] list_seq_1 = []
list_seq_2 = [] list_seq_2 = []
for record in fastapy.parse(path_fasta_1): for record in fastapy.parse(path_fasta_1):
list_seq_1.extend(digest(record.seq)) list_seq_1.extend(digest(record.seq))
...@@ -121,18 +130,20 @@ def fasta_similarity(path_fasta_1, path_fasta_2): ...@@ -121,18 +130,20 @@ def fasta_similarity(path_fasta_1, path_fasta_2):
plt.show() plt.show()
plt.savefig('fasta_similarity.png') plt.savefig('fasta_similarity.png')
def split_string(input_string):
    """Split *input_string* at each underscore that precedes an uppercase
    letter, or at a literal ``'c'`` that precedes an uppercase letter.

    The matched separator character is consumed (not kept in the parts).
    """
    # NOTE(review): the 'c(?=[A-Z])' branch also fires inside ordinary words
    # where a lowercase 'c' is followed by a capital — presumably deliberate
    # for this label format; confirm against the input files.
    separator = r'_(?=[A-Z])|c(?=[A-Z])'
    return re.split(separator, input_string)
def build_database_ref_peptide(): def build_database_ref_peptide():
l=[] l = []
with open('../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt', 'r') as f: with open('../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt', 'r') as f:
for line in f: for line in f:
if line != '\n': if line != '\n':
if '>' in line: if '>' in line:
#typo ?? # typo ??
line = line.replace('no_family','No_family') line = line.replace('no_family', 'No_family')
line = line.replace('no_order', 'No_order') line = line.replace('no_order', 'No_order')
split_line = line.split('_') split_line = line.split('_')
...@@ -140,16 +151,17 @@ def build_database_ref_peptide(): ...@@ -140,16 +151,17 @@ def build_database_ref_peptide():
err = split_line[1] err = split_line[1]
prev = split_line[2] prev = split_line[2]
split_line = split_string(line.split(' ')[1]) split_line = split_string(line.split(' ')[1])
spe = split_line[0].replace('_',' ') spe = split_line[0].replace('_', ' ')
gen = split_line[1].replace('_',' ') gen = split_line[1].replace('_', ' ')
fam = split_line[2].replace('_',' ') fam = split_line[2].replace('_', ' ')
o = split_line[3].replace('_',' ') o = split_line[3].replace('_', ' ')
else : else:
seq = line.split(' ')[1] seq = line.split(' ')[1]
l.append({'Sequence' : seq,'Protein code' :prot , 'Error treshold':err , 'Prevalance': prev, l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev,
'Specie':spe ,'Genus':gen ,'Family':fam ,'Order':o }) 'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o})
return pd.DataFrame(l) return pd.DataFrame(l)
def compute_mass(seq, isotop): def compute_mass(seq, isotop):
m = 0 m = 0
if isotop == 'mono': if isotop == 'mono':
...@@ -160,55 +172,53 @@ def compute_mass(seq, isotop): ...@@ -160,55 +172,53 @@ def compute_mass(seq, isotop):
m += MASSES_AVG[char] * seq.count(char) m += MASSES_AVG[char] * seq.count(char)
return m return m
def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
    """Build a binary MS1 reference image (retention-time cycle x m/z bin).

    Digests every protein in *path_fasta*, computes average-mass m/z ratios
    for each peptide at every charge in *possible_charge*, and marks cells of
    a ``max_cycle x n_bin_ms1`` matrix.  *rt_pred* supplies the predicted
    retention times used to normalise the RT axis.

    Returns a numpy array of shape ``(max_cycle, int((ms1_end_mz - ms1_start_mz) // bin_mz))``.
    """
    # build peptide list
    protein_seqs = []
    for record in fastapy.parse(path_fasta):
        protein_seqs.append(record.seq)
    list_peptides = []
    for prot in protein_seqs:
        list_peptides.extend(digest(prot))
    # compute m/z ratios per peptide
    mz_ratio = {}
    i = 0
    list_peptides = list(set(list_peptides))
    for seq in list_peptides:
        # FIX: key by the actual peptide sequence; the original used the
        # literal key 'seq', so every peptide overwrote a single entry.
        mz_ratio[seq] = []
        for charge in possible_charge:
            ratio = compute_mass(seq, 'avg') / charge
            if ms1_end_mz > ratio > ms1_start_mz:
                mz_ratio[seq].append(ratio)
                i += 1
    print(i)
    # TODO: associate predicted RT with each peptide; `data` is expected to
    # hold (rt, mz) pairs but is never populated yet (work in progress).
    data = []
    # predict detectability (optional)
    # build image
    total_ms1_mz = ms1_end_mz - ms1_start_mz
    n_bin_ms1 = int(total_ms1_mz // bin_mz)
    im = np.zeros([max_cycle, n_bin_ms1])
    max_rt = np.max(rt_pred)
    min_rt = np.min(rt_pred)
    # epsilon avoids division by zero when all RTs are equal, matching the
    # convention used by build_ref_image_from_diann.
    total_rt = max_rt - min_rt + 1e-3
    for (rt, mz) in data:
        # FIX: the original computed rt - (min_rt / total_rt) — missing
        # parentheses around the RT normalisation (sibling function has the
        # correct (rt - min_rt) / total_rt form).  Indices are clamped so the
        # maximal rt / mz cannot step one past the last row/column.
        row = min(int((rt - min_rt) / total_rt * max_cycle), max_cycle - 1)
        col = min(int((mz - ms1_start_mz) / total_ms1_mz * n_bin_ms1), n_bin_ms1 - 1)
        im[row, col] = 1
    return im
def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None): def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
df = load_lib(path_parqet) df = load_lib(path_parqet)
df=df[['Stripped.Sequence','Precursor.Charge','RT','Precursor.Mz']] df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
df_unique = df.drop_duplicates() df_unique = df.drop_duplicates()
#build image # build image
total_ms1_mz = ms1_end_mz - ms1_start_mz total_ms1_mz = ms1_end_mz - ms1_start_mz
n_bin_ms1 = int(total_ms1_mz // bin_mz) n_bin_ms1 = int(total_ms1_mz // bin_mz)
im = np.zeros([max_cycle, n_bin_ms1]) im = np.zeros([max_cycle, n_bin_ms1])
...@@ -216,30 +226,38 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma ...@@ -216,30 +226,38 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
max_rt = np.max(df_unique['RT']) max_rt = np.max(df_unique['RT'])
if min_rt is None: if min_rt is None:
min_rt = np.min(df_unique['RT']) min_rt = np.min(df_unique['RT'])
total_rt = max_rt - min_rt +1e-3 total_rt = max_rt - min_rt + 1e-3
for row in df_unique.iterrows() : for row in df_unique.iterrows():
if 900 > int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1) >= 0: if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0:
im[int((row[1]['RT']-min_rt)/total_rt*max_cycle),int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1)]=1 im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int(
((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
return im return im
if __name__ == '__main__':
    # Export the reference peptides: one '<species>.fasta' file per entry of
    # SPECIES, written under 'fasta/optimal peptide set/'.  A species with no
    # peptides is reported on stdout but still yields an (empty) file.
    df = build_database_ref_peptide()
    for species_name in SPECIES:
        peptides = df[df['Specie']==species_name]['Sequence'].to_list()
        with open('fasta/optimal peptide set/'+species_name+'.fasta',"w") as out:
            if not peptides:
                print('Empty : ',species_name)
            out.write(''.join(peptides))
    # NOTE(review): the disabled per-species reference-image generation
    # (load_lib on the full proteome + build_ref_image_from_diann, saving
    # 'img_ref/<species>.png' and '<species>.npy') previously lived here;
    # recover it from version control if needed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment