add : filter data based on charge for image creation

3c8721d0 · Schneider Leo · 97d68603 · 3c8721d0 · 3c8721d0
Commit 3c8721d0 authored 2 months ago by Schneider Leo
--- a/image_processing/build_image.py
+++ b/image_processing/build_image.py
@@ -9,7 +9,8 @@ def build_image_ms1_wiff(path, bin_mz):
    #load raw data
    rawFile = WiffFileReader(path)
    max_cycle=0
-    for scanNumber in range (rawFile.GetLastSpectrumNumber()):
+    total_scan_number = rawFile.GetLastSpectrumNumber()
+    for scanNumber in range (total_scan_number):
        if rawFile.GetMSOrderForScanNum(scanNumber) == 1 :
            ms1_start_mz = rawFile.source.ScanInfos[scanNumber].LowMz
            ms1_end_mz = rawFile.source.ScanInfos[scanNumber].HighMz
@@ -23,7 +24,7 @@ def build_image_ms1_wiff(path, bin_mz):
    im = np.zeros([max_cycle, n_bin_ms1])

    cycle = 0
-    for scanNumber in range(rawFile.GetLastSpectrumNumber()):
+    for scanNumber in range(total_scan_number):
        if rawFile.GetMSOrderForScanNum(scanNumber) == 1:
            masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber)
            line = np.zeros(n_bin_ms1)
@@ -35,6 +36,84 @@ def build_image_ms1_wiff(path, bin_mz):

    return im

+def build_image_ms1_wiff_charge_filtered(path, bin_mz=1, tolerance=0.01):
+    #load raw data
+    rawFile = WiffFileReader(path)
+    max_cycle=0
+    total_scan_number = rawFile.GetLastSpectrumNumber()
+    for scanNumber in range (total_scan_number):
+        if rawFile.GetMSOrderForScanNum(scanNumber) == 1 :
+            ms1_start_mz = rawFile.source.ScanInfos[scanNumber].LowMz
+            ms1_end_mz = rawFile.source.ScanInfos[scanNumber].HighMz
+            max_cycle+=1
+
+    print('start', ms1_start_mz, 'end', ms1_end_mz)
+    total_ms1_mz = ms1_end_mz - ms1_start_mz
+
+    n_bin_ms1 = int(total_ms1_mz // bin_mz)
+    size_bin_ms1 = total_ms1_mz / n_bin_ms1
+    im = np.zeros([max_cycle, n_bin_ms1])
+
+    cycle = 0
+    for scanNumber in range(total_scan_number):
+        if rawFile.GetMSOrderForScanNum(scanNumber) == 1:
+            masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber)
+            is_monocharged = [False for i in range(len(masses))]
+            is_bicharged = [False for i in range(len(masses))]
+            is_tricharged = [False for i in range(len(masses))]
+            for i in range(len(masses)):
+                m = masses[i]
+                found_s = True
+                found_b = True
+                found_t = True
+                s = 1
+                b=1
+                t=1
+                if not is_monocharged[i]:
+                    dec = 1
+                    while found_s :
+                        found_s=False
+                        while (i + dec) < len(masses) and masses[i + dec] - m < (s + tolerance):
+                            if masses[i + dec] - m > (1 - tolerance):
+                                is_monocharged[i] = True
+                                is_monocharged[i + dec] = True
+                                found_s=True
+                            dec += 1
+                        s+=1
+
+
+                if not is_bicharged[i]:
+                    dec = 1
+                    while found_b :
+                        found_b=False
+                        while (i + dec) < len(masses) and masses[i + dec] - m < (b / 2 + tolerance):
+                            if masses[i + dec] - m > (1 / 2 - tolerance):
+                                is_bicharged[i] = True
+                                is_bicharged[i + dec] = True
+                                found_b = True
+                            dec += 1
+                        b += 1
+
+                if not is_tricharged[i]:
+                    dec = 1
+                    while found_t :
+                        found_t=False
+                        while (i + dec) < len(masses) and masses[i + dec] - m < (1 / 3 + tolerance):
+                            if masses[i + dec] - m > (t / 3 - tolerance):
+                                is_tricharged[i] = True
+                                is_tricharged[i + dec] = True
+                                found_t = True
+                            dec += 1
+                        t += 1
+            line = np.zeros(n_bin_ms1)
+            if len(masses) > 0:
+                for k in range(len(masses)):
+                    if is_bicharged[k] or is_tricharged[k] :
+                        line[int((masses[k] - ms1_start_mz) // size_bin_ms1)] += intensities[k]
+            im[cycle, :] = line
+            cycle += 1
+    rawFile.Close()
+    return im

 def build_image_ms1(path, bin_mz):
    #load raw data
@@ -79,3 +158,21 @@ def build_image_ms1(path, bin_mz):
        im[c, :] = line

    return im
+
+
+if __name__ == '__main__':
+    path = '../data/raw_data/SERMAR-10-AER-d200.wiff'
+    im_filtered = build_image_ms1_wiff_charge_filtered(path,tolerance=0.0001)
+    # im_base = build_image_ms1_wiff(path,bin_mz=1)
+    im_filtered=np.log(im_filtered+1)
+    # im_base=np.log(im_base+1)
+
+
+
+    plt.imshow(im_filtered, interpolation='nearest')
+    plt.savefig('filtered_image_tol_00001.png')
+    plt.clf()
+
+    # plt.imshow(im_base, interpolation='nearest')
+    # plt.savefig('base_image.png')
+    # plt.clf()
\ No newline at end of file
--- a/image_ref/utils.py
+++ b/image_ref/utils.py
@@ -162,6 +162,7 @@ def build_ref_image_from_diann_global(path_parqet, target_seq, ms1_end_mz, ms1_s


 if __name__ == '__main__':
+    df = load_lib('fasta/global_peptide_list.parquet')
    # df = build_database_ref_peptide()
    # df.to_csv("dataset_species_ref_peptides.csv", index=False)
    # Write fasta file
@@ -173,24 +174,24 @@ if __name__ == '__main__':
    #         print(pep)
    #         f.write(pep+'\n')
    #
-    df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES)
-
-    #
-    # Create ref img
-    df_full = load_lib(
-        'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
-    min_rt = df_full['RT'].min()
-    max_rt = df_full['RT'].max()
-    #
-    df = pd.read_csv("dataset_species_ref_peptides.csv")
+    # df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES)
    #
-    for spe in SPECIES:
-        print(spe)
-        df_spe = df[df['Specie'] == spe]
-        df_spec_no_common = df_spe[df_spe['Sequence'].isin(df_count[df_count['Count']<5]['Sequence'])]
-        im = build_ref_image_from_diann_global(
-            'fasta/global_peptide_list.parquet', target_seq=df_spec_no_common['Sequence'].to_list(), ms1_end_mz=1250,
-            ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
-        plt.clf()
-        mpimg.imsave('img_ref_count_th_5/' + spe + '.png', im)
-        np.save('img_ref_count_th_5/' + spe + '.npy', im)
+    # #
+    # # Create ref img
+    # df_full = load_lib(
+    #     'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
+    # min_rt = df_full['RT'].min()
+    # max_rt = df_full['RT'].max()
+    # #
+    # df = pd.read_csv("dataset_species_ref_peptides.csv")
+    # #
+    # for spe in SPECIES:
+    #     print(spe)
+    #     df_spe = df[df['Specie'] == spe]
+    #     df_spec_no_common = df_spe[df_spe['Sequence'].isin(df_count[df_count['Count']<5]['Sequence'])]
+    #     im = build_ref_image_from_diann_global(
+    #         'fasta/global_peptide_list.parquet', target_seq=df_spec_no_common['Sequence'].to_list(), ms1_end_mz=1250,
+    #         ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
+    #     plt.clf()
+    #     mpimg.imsave('img_ref_count_th_5/' + spe + '.png', im)
+    #     np.save('img_ref_count_th_5/' + spe + '.npy', im)