Skip to content
Snippets Groups Projects
Commit 3c8721d0 authored by Schneider Leo's avatar Schneider Leo
Browse files

add : filter data based on charge for image creation

parent 97d68603
No related branches found
No related tags found
No related merge requests found
......@@ -9,7 +9,8 @@ def build_image_ms1_wiff(path, bin_mz):
#load raw data
rawFile = WiffFileReader(path)
max_cycle=0
for scanNumber in range (rawFile.GetLastSpectrumNumber()):
total_scan_number = rawFile.GetLastSpectrumNumber()
for scanNumber in range (total_scan_number):
if rawFile.GetMSOrderForScanNum(scanNumber) == 1 :
ms1_start_mz = rawFile.source.ScanInfos[scanNumber].LowMz
ms1_end_mz = rawFile.source.ScanInfos[scanNumber].HighMz
......@@ -23,7 +24,7 @@ def build_image_ms1_wiff(path, bin_mz):
im = np.zeros([max_cycle, n_bin_ms1])
cycle = 0
for scanNumber in range(rawFile.GetLastSpectrumNumber()):
for scanNumber in range(total_scan_number):
if rawFile.GetMSOrderForScanNum(scanNumber) == 1:
masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber)
line = np.zeros(n_bin_ms1)
......@@ -35,6 +36,84 @@ def build_image_ms1_wiff(path, bin_mz):
return im
def build_image_ms1_wiff_charge_filtered(path, bin_mz=1, tolerance=0.01):
#load raw data
rawFile = WiffFileReader(path)
max_cycle=0
total_scan_number = rawFile.GetLastSpectrumNumber()
for scanNumber in range (total_scan_number):
if rawFile.GetMSOrderForScanNum(scanNumber) == 1 :
ms1_start_mz = rawFile.source.ScanInfos[scanNumber].LowMz
ms1_end_mz = rawFile.source.ScanInfos[scanNumber].HighMz
max_cycle+=1
print('start', ms1_start_mz, 'end', ms1_end_mz)
total_ms1_mz = ms1_end_mz - ms1_start_mz
n_bin_ms1 = int(total_ms1_mz // bin_mz)
size_bin_ms1 = total_ms1_mz / n_bin_ms1
im = np.zeros([max_cycle, n_bin_ms1])
cycle = 0
for scanNumber in range(total_scan_number):
if rawFile.GetMSOrderForScanNum(scanNumber) == 1:
masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber)
is_monocharged = [False for i in range(len(masses))]
is_bicharged = [False for i in range(len(masses))]
is_tricharged = [False for i in range(len(masses))]
for i in range(len(masses)):
m = masses[i]
found_s = True
found_b = True
found_t = True
s = 1
b=1
t=1
if not is_monocharged[i]:
dec = 1
while found_s :
found_s=False
while (i + dec) < len(masses) and masses[i + dec] - m < (s + tolerance):
if masses[i + dec] - m > (1 - tolerance):
is_monocharged[i] = True
is_monocharged[i + dec] = True
found_s=True
dec += 1
s+=1
if not is_bicharged[i]:
dec = 1
while found_b :
found_b=False
while (i + dec) < len(masses) and masses[i + dec] - m < (b / 2 + tolerance):
if masses[i + dec] - m > (1 / 2 - tolerance):
is_bicharged[i] = True
is_bicharged[i + dec] = True
found_b = True
dec += 1
b += 1
if not is_tricharged[i]:
dec = 1
while found_t :
found_t=False
while (i + dec) < len(masses) and masses[i + dec] - m < (1 / 3 + tolerance):
if masses[i + dec] - m > (t / 3 - tolerance):
is_tricharged[i] = True
is_tricharged[i + dec] = True
found_t = True
dec += 1
t += 1
line = np.zeros(n_bin_ms1)
if len(masses) > 0:
for k in range(len(masses)):
if is_bicharged[k] or is_tricharged[k] :
line[int((masses[k] - ms1_start_mz) // size_bin_ms1)] += intensities[k]
im[cycle, :] = line
cycle += 1
rawFile.Close()
return im
def build_image_ms1(path, bin_mz):
#load raw data
......@@ -79,3 +158,21 @@ def build_image_ms1(path, bin_mz):
im[c, :] = line
return im
if __name__ == '__main__':
path = '../data/raw_data/SERMAR-10-AER-d200.wiff'
im_filtered = build_image_ms1_wiff_charge_filtered(path,tolerance=0.0001)
# im_base = build_image_ms1_wiff(path,bin_mz=1)
im_filtered=np.log(im_filtered+1)
# im_base=np.log(im_base+1)
plt.imshow(im_filtered, interpolation='nearest')
plt.savefig('filtered_image_tol_00001.png')
plt.clf()
# plt.imshow(im_base, interpolation='nearest')
# plt.savefig('base_image.png')
# plt.clf()
\ No newline at end of file
......@@ -162,6 +162,7 @@ def build_ref_image_from_diann_global(path_parqet, target_seq, ms1_end_mz, ms1_s
if __name__ == '__main__':
df = load_lib('fasta/global_peptide_list.parquet')
# df = build_database_ref_peptide()
# df.to_csv("dataset_species_ref_peptides.csv", index=False)
# Write fasta file
......@@ -173,24 +174,24 @@ if __name__ == '__main__':
# print(pep)
# f.write(pep+'\n')
#
df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES)
#
# Create ref img
df_full = load_lib(
'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
min_rt = df_full['RT'].min()
max_rt = df_full['RT'].max()
#
df = pd.read_csv("dataset_species_ref_peptides.csv")
# df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES)
#
for spe in SPECIES:
print(spe)
df_spe = df[df['Specie'] == spe]
df_spec_no_common = df_spe[df_spe['Sequence'].isin(df_count[df_count['Count']<5]['Sequence'])]
im = build_ref_image_from_diann_global(
'fasta/global_peptide_list.parquet', target_seq=df_spec_no_common['Sequence'].to_list(), ms1_end_mz=1250,
ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
plt.clf()
mpimg.imsave('img_ref_count_th_5/' + spe + '.png', im)
np.save('img_ref_count_th_5/' + spe + '.npy', im)
# #
# # Create ref img
# df_full = load_lib(
# 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
# min_rt = df_full['RT'].min()
# max_rt = df_full['RT'].max()
# #
# df = pd.read_csv("dataset_species_ref_peptides.csv")
# #
# for spe in SPECIES:
# print(spe)
# df_spe = df[df['Specie'] == spe]
# df_spec_no_common = df_spe[df_spe['Sequence'].isin(df_count[df_count['Count']<5]['Sequence'])]
# im = build_ref_image_from_diann_global(
# 'fasta/global_peptide_list.parquet', target_seq=df_spec_no_common['Sequence'].to_list(), ms1_end_mz=1250,
# ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
# plt.clf()
# mpimg.imsave('img_ref_count_th_5/' + spe + '.png', im)
# np.save('img_ref_count_th_5/' + spe + '.npy', im)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment