clean : utils.py

498b38c4 · Schneider Leo · 10edb8ba · 498b38c4
Commit 498b38c4 authored 3 months ago by Schneider Leo
--- a/image_ref/utils.py
+++ b/image_ref/utils.py
@@ -15,76 +15,6 @@ SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter horm
                'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae',
                'Providencia stuartii']
-ALPHABET_UNMOD = {
-    "A": 1,
-    "C": 2,
-    "D": 3,
-    "E": 4,
-    "F": 5,
-    "G": 6,
-    "H": 7,
-    "I": 8,
-    "K": 9,
-    "L": 10,
-    "M": 11,
-    "N": 12,
-    "P": 13,
-    "Q": 14,
-    "R": 15,
-    "S": 16,
-    "T": 17,
-    "V": 18,
-    "W": 19,
-    "Y": 20,
-}
-MASSES_MONO = {
-    "A": 71.03711,
-    "C": 103.00919,
-    "D": 115.02694,
-    "E": 129.04259,
-    "F": 147.06841,
-    "G": 57.02146,
-    "H": 137.05891,
-    "I": 113.08406,
-    "K": 128.09496,
-    "L": 113.08406,
-    "M": 131.04049,
-    "N": 114.04293,
-    "P": 97.05276,
-    "Q": 128.05858,
-    "R": 156.1875,
-    "S": 87.03203,
-    "T": 101.04768,
-    "V": 99.06841,
-    "W": 186.07931,
-    "Y": 163.06333,
-}
-MASSES_AVG = {
-    "A": 71.0788,
-    "C": 103.1388,
-    "D": 115.0886,
-    "E": 129.1155,
-    "F": 147.1766,
-    "G": 57.0519,
-    "H": 137.1411,
-    "I": 113.1594,
-    "K": 128.1741,
-    "L": 113.1594,
-    "M": 131.1926,
-    "N": 114.1038,
-    "P": 97.1167,
-    "Q": 128.1307,
-    "R": 156.1875,
-    "S": 87.0782,
-    "T": 101.1051,
-    "V": 99.1326,
-    "W": 186.2132,
-    "Y": 163.1760,
-}
 # trypsin cut after K or R (if not followed by P)
 def cut(seq):
@@ -142,6 +72,9 @@ def build_database_ref_peptide():
        for line in f:
            if line != '\n':
                if '>' in line:
+                    #remove new line
+                    line = line.replace('\n', '')
                    # typo ??
                    line = line.replace('no_family', 'No_family')
                    line = line.replace('no_order', 'No_order')
@@ -157,66 +90,46 @@ def build_database_ref_peptide():
                    o = split_line[3].replace('_', ' ')
                else:
                    seq = line.split(' ')[1]
+                    #remove new lines
+                    seq = seq.replace('\n','')
                    l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev,
                              'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o})
-    return pd.DataFrame(l)
+    df = pd.DataFrame(l)
+    return df.drop_duplicates()
-def compute_mass(seq, isotop):
-    m = 0
-    if isotop == 'mono':
-        for char in MASSES_MONO.keys():
-            m += MASSES_MONO[char] * seq.count(char)
-    if isotop == 'avg':
-        for char in MASSES_AVG.keys():
-            m += MASSES_AVG[char] * seq.count(char)
-    return m
-def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
+def compute_common_peptide(df_path,species):
-    # build peptide list
+    df = pd.read_csv(df_path)
-    list_seq_1 = []
+    df_filter = df[df['Specie'].isin(species)]
-    for record in fastapy.parse(path_fasta):
+    df_filter = df_filter[['Sequence','Specie']].drop_duplicates()
-        list_seq_1.append(record.seq)
+    df_filter['Count'] = df_filter.groupby('Sequence')['Sequence'].transform('count')
+    df_count = df_filter[['Sequence','Count']].drop_duplicates()
-    list_peptides = []
+    return df_count
-    for prot in list_seq_1:
-        list_peptides.extend(digest(prot))
-    # compute m/z ration
-    mz_ratio = {}
-    i = 0
-    list_peptides = list(set(list_peptides))
-    for seq in list_peptides:
-        mz_ratio['seq'] = []
-        for charge in possible_charge:
-            ratio = compute_mass(seq, 'avg') / charge
-            if ms1_end_mz > ratio > ms1_start_mz:
-                mz_ratio['seq'].append(ratio)
-                i += 1
-    print(i)
-    # assocy predict rt
-    data = []
-    # predict detectability (optional)
+def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
+    df = load_lib(path_parqet)
+    df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
+    df_unique = df.drop_duplicates()
    # build image
    total_ms1_mz = ms1_end_mz - ms1_start_mz
    n_bin_ms1 = int(total_ms1_mz // bin_mz)
    im = np.zeros([max_cycle, n_bin_ms1])
-    max_rt = np.max(rt_pred)
+    if max_rt is None:
-    min_rt = np.min(rt_pred)
+        max_rt = np.max(df_unique['RT'])
-    total_rt = max_rt - min_rt
+    if min_rt is None:
-    for (rt, mz_ratio) in data:
+        min_rt = np.min(df_unique['RT'])
-        im[int((rt - min_rt / total_rt) * max_cycle), int(((mz_ratio - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
+    total_rt = max_rt - min_rt + 1e-3
+    for row in df_unique.iterrows():
+        if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0:
+            im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int(
+                ((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
    return im
-'Klebsiella michiganensis',
+def build_ref_image_from_diann_global(path_parqet, target_seq, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
-def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
    df = load_lib(path_parqet)
    df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
+    df = df[df['Stripped.Sequence'].isin(target_seq)]
    df_unique = df.drop_duplicates()
    # build image
    total_ms1_mz = ms1_end_mz - ms1_start_mz
@@ -236,29 +149,31 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
 if __name__ == '__main__':
-    df = build_database_ref_peptide()
+    # df = build_database_ref_peptide()
-    #Write fasta file
+    # df.to_csv("dataset_species_ref_peptides.csv", index=False)
-    # for spe in SPECIES:
+    # #Write fasta file
-    #
+    # with open('fasta/optimal peptide set/gobal_peptide_set.fasta', "w") as f:
-    #     df_spe = df[df['Specie']==spe]
+    #     for spe in SPECIES:
-    #     spe_list = df_spe['Sequence'].to_list()
+    #         df_spe = df[df['Specie']==spe]
-    #     with open('fasta/optimal peptide set/'+spe+'.fasta',"w") as f:
+    #         spe_list = df_spe['Sequence'].drop_duplicate().to_list()
    #         if not spe_list:
    #             print('Empty : ',spe)
    #         for pep in spe_list :
    #             f.write(pep)
+    df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES)
    #
    #Create ref img
-    df_full = load_lib(
+    # df_full = load_lib(
-        'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
+    #     'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
-    min_rt = df_full['RT'].min()
+    # min_rt = df_full['RT'].min()
-    max_rt = df_full['RT'].max()
+    # max_rt = df_full['RT'].max()
+    #
-    for spe in SPECIES:
+    # for spe in SPECIES:
-        im = build_ref_image_from_diann(
+    #     im = build_ref_image_from_diann(
-            'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
+    #         'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
-            ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
+    #         ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
-        plt.clf()
+    #     plt.clf()
-        mpimg.imsave('img_ref/' + spe + '.png', im)
+    #     mpimg.imsave('img_ref/' + spe + '.png', im)
-        np.save(spe + '.npy', im)
+    #     np.save(spe + '.npy', im)