diff --git a/barlow_twin_like/dataset_barlow.py b/barlow_twin_like/dataset_barlow.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/barlow_twin_like/main.py b/barlow_twin_like/main.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/barlow_twin_like/model.py b/barlow_twin_like/model.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/image_processing/build_dataset.py b/image_processing/build_dataset.py index 1eaa264550de17c7cb8e5d0fbf8765b5d955e9e1..3e6155f841b4419831db3aff898885ea878ac76f 100644 --- a/image_processing/build_dataset.py +++ b/image_processing/build_dataset.py @@ -5,8 +5,8 @@ import re import numpy as np import matplotlib.image as mpimg -from build_image import build_image_ms1 -from image_processing.build_image import build_image_ms1_wiff + +from image_processing.build_image import build_image_ms1_wiff, build_image_ms1_wiff_charge_filtered, build_image_ms1_wiff_charge_filtered_apex_only """ find . -name '*.mzML' -exec cp -prv '{}' '/home/leo/PycharmProjects/pseudo_image/data/raw_data' ';' @@ -173,15 +173,15 @@ def create_dataset(): name = label[label['path_aer'] == path.split("/")[-1]]['sample_name'].values[0] analyse = 'AER' if species is not None: #save image in species specific dir - directory_path_png = '../data/processed_data_wiff/png_image/{}'.format(species) - directory_path_npy = '../data/processed_data_wiff/npy_image/{}'.format(species) + directory_path_png = '../data/processed_data_wiff_clean_005_10000_apex/png_image/{}'.format(species) + directory_path_npy = '../data/processed_data_wiff_clean_005_10000_apex/npy_image/{}'.format(species) if not os.path.isdir(directory_path_png): os.makedirs(directory_path_png) if not os.path.isdir(directory_path_npy): os.makedirs(directory_path_npy) if not os.path.isfile(directory_path_png + "/" + name + '_' + analyse + '.png'): try : - mat = build_image_ms1_wiff(path, 1) + mat = build_image_ms1_wiff_charge_filtered_apex_only(path, bin_mz=1,tolerance=0.005,noise=10000) mpimg.imsave(directory_path_png + "/" + name + '_' + analyse + '.png', mat) np.save(directory_path_npy + "/" + name + '_' + analyse + '.npy', mat) print('image create') @@ -202,15 +202,15 @@ def create_dataset(): name = label[label['path_aer'] == path.split("/")[-1]]['sample_name'].values[0] analyse = 'AER' if species is not None: - directory_path_png = '../data/processed_data_wiff/png_image/{}'.format(species) - directory_path_npy = '../data/processed_data_wiff/npy_image/{}'.format(species) + directory_path_png = '../data/processed_data_wiff_clean_005_10000_apex/png_image/{}'.format(species) + directory_path_npy = '../data/processed_data_wiff_clean_005_10000_apex/npy_image/{}'.format(species) if not os.path.isdir(directory_path_png): os.makedirs(directory_path_png) if not os.path.isdir(directory_path_npy): os.makedirs(directory_path_npy) if not os.path.isfile(directory_path_png + "/" + name + '_' + analyse + '.png'): try : - mat = build_image_ms1_wiff(path, 1) + mat = build_image_ms1_wiff_charge_filtered_apex_only(path, bin_mz=1,tolerance=0.005,noise=10000) mpimg.imsave(directory_path_png + "/" + name + '_' + analyse + '.png', mat) np.save(directory_path_npy + "/" + name + '_' + analyse + '.npy', mat) print('image create') diff --git a/image_processing/build_image.py b/image_processing/build_image.py index dda6aa3ca7e73295568aca9ffc8d0283f5f34102..7d7d67d52bf8066091a8ba56d512ef5d434e48eb 100644 --- a/image_processing/build_image.py +++ b/image_processing/build_image.py @@ -2,6 +2,9 @@ import numpy as np import matplotlib.pyplot as plt import matplotlib.colors as colors import pyopenms as oms +from matplotlib import image as mpimg +from torch.onnx.symbolic_opset12 import argmax + from pyRawMSDataReader.pyRawMSDataReader.WiffFileReader_py import WiffFileReader @@ -36,7 +39,7 @@ def build_image_ms1_wiff(path, bin_mz): return im -def build_image_ms1_wiff_charge_filtered(path, bin_mz=1, tolerance=0.01): +def build_image_ms1_wiff_charge_filtered(path, bin_mz=1, tolerance=0.01, noise=10000): #load raw data rawFile = WiffFileReader(path) max_cycle=0 @@ -58,36 +61,20 @@ def build_image_ms1_wiff_charge_filtered(path, bin_mz=1, tolerance=0.01): for scanNumber in range(total_scan_number): if rawFile.GetMSOrderForScanNum(scanNumber) == 1: masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber) - is_monocharged = [False for i in range(len(masses))] is_bicharged = [False for i in range(len(masses))] is_tricharged = [False for i in range(len(masses))] for i in range(len(masses)): m = masses[i] - found_s = True found_b = True found_t = True - s = 1 b=1 t=1 - if not is_monocharged[i]: - dec = 1 - while found_s : - found_s=False - while (i + dec) < len(masses) and masses[i + dec] - m < (s + tolerance): - if masses[i + dec] - m > (1 - tolerance): - is_monocharged[i] = True - is_monocharged[i + dec] = True - found_s=True - dec += 1 - s+=1 - - if not is_bicharged[i]: dec = 1 while found_b : found_b=False while (i + dec) < len(masses) and masses[i + dec] - m < (b / 2 + tolerance): - if masses[i + dec] - m > (1 / 2 - tolerance): + if masses[i + dec] - m > (b / 2 - tolerance): is_bicharged[i] = True is_bicharged[i + dec] = True found_b = True @@ -108,13 +95,102 @@ def build_image_ms1_wiff_charge_filtered(path, bin_mz=1, tolerance=0.01): line = np.zeros(n_bin_ms1) if len(masses) > 0: for k in range(len(masses)): - if is_bicharged[k] or is_tricharged[k] : + if (is_bicharged[k] or is_tricharged[k]) and intensities[k]>=noise: line[int((masses[k] - ms1_start_mz) // size_bin_ms1)] += intensities[k] im[cycle, :] = line cycle += 1 rawFile.Close() return im +#TODO calculer les apex au fil de l'eau, trop long sinon +def build_image_ms1_wiff_charge_filtered_apex_only(path, bin_mz=1, tolerance=0.01, noise=10000): + #load raw data + rawFile = WiffFileReader(path) + max_cycle=0 + total_scan_number = rawFile.GetLastSpectrumNumber() + for scanNumber in range (total_scan_number): + if rawFile.GetMSOrderForScanNum(scanNumber) == 1 : + ms1_start_mz = rawFile.source.ScanInfos[scanNumber].LowMz + ms1_end_mz = rawFile.source.ScanInfos[scanNumber].HighMz + max_cycle+=1 + + print('start', ms1_start_mz, 'end', ms1_end_mz) + total_ms1_mz = ms1_end_mz - ms1_start_mz + + n_bin_ms1 = int(total_ms1_mz // bin_mz) + size_bin_ms1 = total_ms1_mz / n_bin_ms1 + + + mass_list_ms1=[] + intensity_list_ms1 = [] + for scanNumber in range(total_scan_number): + if rawFile.GetMSOrderForScanNum(scanNumber) == 1: + masses, intensities = rawFile.GetCentroidMassListFromScanNum(scanNumber) + mass_list_ms1.append(masses) + intensity_list_ms1.append(intensities) + rawFile.Close() + + cluster_list=[] + cluster_ref_2 = [] + cluster_ref_3 = [] + + for cycle in range(len(mass_list_ms1)): + for i in range(len(mass_list_ms1[cycle])): + is_in_cluster = False + mass, intensity = mass_list_ms1[cycle][i], intensity_list_ms1[cycle][i] + if intensity>=noise: + #check id peak belongs to a previous cluster + for cycle_ref,mass_ref,cluster_id in cluster_ref_2 : + if cycle_ref==cycle: + if mass_ref - tolerance < mass_list_ms1[cycle][i] < mass_ref + tolerance : + if mass - cluster_list[cluster_id][0][1] > 1/(3 - 2 * tolerance) : + cluster_list[cluster_id] = [max([cluster_list[cluster_id][0],(cycle,mass,intensity)],key=lambda x : x[2]),cluster_list[cluster_id][1]+ 1] + else : + cluster_list[cluster_id] = [max([cluster_list[cluster_id][0], (cycle, mass, intensity)], key=lambda x: x[2]),cluster_list[cluster_id][1] + 1] + if not (cycle + 1,mass_ref, cluster_id) in cluster_ref_2: + cluster_ref_2.append((cycle + 1,mass_ref, cluster_id)) + is_in_cluster = True + if not (cycle,mass_ref+1/2,cluster_id) in cluster_ref_2 : + cluster_ref_2.append((cycle,mass_ref+1/2,cluster_id)) + + for cycle_ref,mass_ref,cluster_id in cluster_ref_3 : + if cycle_ref==cycle: + if mass_ref - tolerance < mass_list_ms1[cycle][i] < mass_ref + tolerance : + cluster_list[cluster_id].append((cycle,mass,intensity)) + if not ( cycle + 1,mass_ref, cluster_id) in cluster_ref_3: + cluster_ref_3.append(( cycle + 1,mass_ref, cluster_id)) + is_in_cluster = True + if not (cycle,mass_ref+1/3,cluster_id) in cluster_ref_3 : + cluster_ref_3.append((cycle,mass_ref+1/3,cluster_id)) + #else create a new cluster based on this peak + if not is_in_cluster : + cluster_ref_2.append((cycle,mass+1/2,len(cluster_list))) + cluster_ref_2.append((cycle+1, mass, len(cluster_list))) + cluster_list.append([(0,0,0),0]) #(cycle mass intensity),number of heavy isotop + cluster_ref_3.append((cycle, mass + 1 / 3, len(cluster_list))) + cluster_ref_3.append((cycle + 1, mass, len(cluster_list))) + cluster_list.append([(0,0,0),0]) #(cycle mass intensity),number of heavy isotop + + #remove old cluster ref to improve perf (optional) + for cycle_ref,mass_ref,cluster_id in cluster_ref_2 : + if cycle_ref == cycle : + cluster_ref_2= [x for x in cluster_ref_2 if x != (cycle_ref,mass_ref,cluster_id)] + for cycle_ref,mass_ref, cluster_id in cluster_ref_3 : + if cycle_ref == cycle : + cluster_ref_3 = [x for x in cluster_ref_3 if x != (cycle_ref,mass_ref,cluster_id)] + + apex_list = [] + for cluster in cluster_list : # cluster = (cycle,mass,intensity) + if cluster[1]>=1: #at least a heavy isotop detected (valid istope group) + apex_list.append(cluster[0]) + + #create image based on detected valid apexes + im = np.zeros([max_cycle, n_bin_ms1]) + for cycle, mass, _ in apex_list: + im[int(cycle), int((mass - ms1_start_mz) // size_bin_ms1)] = 1 + return im + + def build_image_ms1(path, bin_mz): #load raw data e = oms.MSExperiment() @@ -162,17 +238,5 @@ def build_image_ms1(path, bin_mz): if __name__ == '__main__': path = '../data/raw_data/SERMAR-10-AER-d200.wiff' - im_filtered = build_image_ms1_wiff_charge_filtered(path,tolerance=0.0001) - # im_base = build_image_ms1_wiff(path,bin_mz=1) - im_filtered=np.log(im_filtered+1) - # im_base=np.log(im_base+1) - - - - plt.imshow(im_filtered, interpolation='nearest') - plt.savefig('filtered_image_tol_00001.png') - plt.clf() - - # plt.imshow(im_base, interpolation='nearest') - # plt.savefig('base_image.png') - # plt.clf() \ No newline at end of file + mat = build_image_ms1_wiff_charge_filtered_apex_only(path) + mpimg.imsave('test.png', mat) \ No newline at end of file