diff --git a/image_processing/build_image.py b/image_processing/build_image.py index 97e11ed6f5dcbc50a5f0f392193bde1f399eabfe..dda6aa3ca7e73295568aca9ffc8d0283f5f34102 100644 --- a/image_processing/build_image.py +++ b/image_processing/build_image.py @@ -9,7 +9,8 @@ def build_image_ms1_wiff(path, bin_mz): #load raw data rawFile = WiffFileReader(path) max_cycle=0 - for scanNumber in range (rawFile.GetLastSpectrumNumber()): + total_scan_number = rawFile.GetLastSpectrumNumber() + for scanNumber in range (total_scan_number): if rawFile.GetMSOrderForScanNum(scanNumber) == 1 : ms1_start_mz = rawFile.source.ScanInfos[scanNumber].LowMz ms1_end_mz = rawFile.source.ScanInfos[scanNumber].HighMz @@ -23,7 +24,7 @@ def build_image_ms1_wiff(path, bin_mz): im = np.zeros([max_cycle, n_bin_ms1]) cycle = 0 - for scanNumber in range(rawFile.GetLastSpectrumNumber()): + for scanNumber in range(total_scan_number): if rawFile.GetMSOrderForScanNum(scanNumber) == 1: masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber) line = np.zeros(n_bin_ms1) @@ -35,6 +36,84 @@ def build_image_ms1_wiff(path, bin_mz): return im +def build_image_ms1_wiff_charge_filtered(path, bin_mz=1, tolerance=0.01): + #load raw data + rawFile = WiffFileReader(path) + max_cycle=0 + total_scan_number = rawFile.GetLastSpectrumNumber() + for scanNumber in range (total_scan_number): + if rawFile.GetMSOrderForScanNum(scanNumber) == 1 : + ms1_start_mz = rawFile.source.ScanInfos[scanNumber].LowMz + ms1_end_mz = rawFile.source.ScanInfos[scanNumber].HighMz + max_cycle+=1 + + print('start', ms1_start_mz, 'end', ms1_end_mz) + total_ms1_mz = ms1_end_mz - ms1_start_mz + + n_bin_ms1 = int(total_ms1_mz // bin_mz) + size_bin_ms1 = total_ms1_mz / n_bin_ms1 + im = np.zeros([max_cycle, n_bin_ms1]) + + cycle = 0 + for scanNumber in range(total_scan_number): + if rawFile.GetMSOrderForScanNum(scanNumber) == 1: + masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber) + is_monocharged = [False for i in range(len(masses))] + is_bicharged = [False for i in range(len(masses))] + is_tricharged = [False for i in range(len(masses))] + for i in range(len(masses)): + m = masses[i] + found_s = True + found_b = True + found_t = True + s = 1 + b=1 + t=1 + if not is_monocharged[i]: + dec = 1 + while found_s : + found_s=False + while (i + dec) < len(masses) and masses[i + dec] - m < (s + tolerance): + if masses[i + dec] - m > (1 - tolerance): + is_monocharged[i] = True + is_monocharged[i + dec] = True + found_s=True + dec += 1 + s+=1 + + + if not is_bicharged[i]: + dec = 1 + while found_b : + found_b=False + while (i + dec) < len(masses) and masses[i + dec] - m < (b / 2 + tolerance): + if masses[i + dec] - m > (1 / 2 - tolerance): + is_bicharged[i] = True + is_bicharged[i + dec] = True + found_b = True + dec += 1 + b += 1 + + if not is_tricharged[i]: + dec = 1 + while found_t : + found_t=False + while (i + dec) < len(masses) and masses[i + dec] - m < (1 / 3 + tolerance): + if masses[i + dec] - m > (t / 3 - tolerance): + is_tricharged[i] = True + is_tricharged[i + dec] = True + found_t = True + dec += 1 + t += 1 + line = np.zeros(n_bin_ms1) + if len(masses) > 0: + for k in range(len(masses)): + if is_bicharged[k] or is_tricharged[k] : + line[int((masses[k] - ms1_start_mz) // size_bin_ms1)] += intensities[k] + im[cycle, :] = line + cycle += 1 + rawFile.Close() + return im def build_image_ms1(path, bin_mz): #load raw data @@ -79,3 +158,21 @@ def build_image_ms1(path, bin_mz): im[c, :] = line return im + + +if __name__ == '__main__': + path = '../data/raw_data/SERMAR-10-AER-d200.wiff' + im_filtered = build_image_ms1_wiff_charge_filtered(path,tolerance=0.0001) + # im_base = build_image_ms1_wiff(path,bin_mz=1) + im_filtered=np.log(im_filtered+1) + # im_base=np.log(im_base+1) + + + + plt.imshow(im_filtered, interpolation='nearest') + plt.savefig('filtered_image_tol_00001.png') + plt.clf() + + # plt.imshow(im_base, interpolation='nearest') + # plt.savefig('base_image.png') + # plt.clf() \ No newline at end of file diff --git a/image_ref/utils.py b/image_ref/utils.py index 63584d2c5733b2111787289352536820f4bb0dbd..097d8961fce1da755a59019f8cf0e024e996eeac 100644 --- a/image_ref/utils.py +++ b/image_ref/utils.py @@ -162,6 +162,7 @@ def build_ref_image_from_diann_global(path_parqet, target_seq, ms1_end_mz, ms1_s if __name__ == '__main__': + df = load_lib('fasta/global_peptide_list.parquet') # df = build_database_ref_peptide() # df.to_csv("dataset_species_ref_peptides.csv", index=False) # Write fasta file @@ -173,24 +174,24 @@ if __name__ == '__main__': # print(pep) # f.write(pep+'\n') # - df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES) - - # - # Create ref img - df_full = load_lib( - 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet') - min_rt = df_full['RT'].min() - max_rt = df_full['RT'].max() - # - df = pd.read_csv("dataset_species_ref_peptides.csv") + # df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES) # - for spe in SPECIES: - print(spe) - df_spe = df[df['Specie'] == spe] - df_spec_no_common = df_spe[df_spe['Sequence'].isin(df_count[df_count['Count']<5]['Sequence'])] - im = build_ref_image_from_diann_global( - 'fasta/global_peptide_list.parquet', target_seq=df_spec_no_common['Sequence'].to_list(), ms1_end_mz=1250, - ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt) - plt.clf() - mpimg.imsave('img_ref_count_th_5/' + spe + '.png', im) - np.save('img_ref_count_th_5/' + spe + '.npy', im) + # # + # # Create ref img + # df_full = load_lib( + # 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet') + # min_rt = df_full['RT'].min() + # max_rt = df_full['RT'].max() + # # + # df = pd.read_csv("dataset_species_ref_peptides.csv") + # # + # for spe in SPECIES: + # print(spe) + # df_spe = df[df['Specie'] == spe] + # df_spec_no_common = df_spe[df_spe['Sequence'].isin(df_count[df_count['Count']<5]['Sequence'])] + # im = build_ref_image_from_diann_global( + # 'fasta/global_peptide_list.parquet', target_seq=df_spec_no_common['Sequence'].to_list(), ms1_end_mz=1250, + # ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt) + # plt.clf() + # mpimg.imsave('img_ref_count_th_5/' + spe + '.png', im) + # np.save('img_ref_count_th_5/' + spe + '.npy', im)