diff --git a/dataset/dataset.py b/dataset/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0d19939d869aa1d0aad126671689fa552a3f119
--- /dev/null
+++ b/dataset/dataset.py
@@ -0,0 +1,29 @@
+import torch
+import torchvision
+
+from torch.utils.data import DataLoader
+import torchvision.transforms.functional as TF
+import random
+
+root = '../data/processed_data'
+dataset = torchvision.datasets.ImageFolder(root, transform=None)
+data_loader = DataLoader(
+    dataset,
+    batch_size=1,
+    shuffle=False,
+    num_workers=0,
+    collate_fn=None,
+    pin_memory=False,
+)
+
+class Threshold_noise:
+    """Zero out every value of the input tensor that lies below the threshold."""
+
+    def __init__(self, threshold):
+        self.threshold = threshold
+
+    def __call__(self, x):
+        # Mask instead of torch.max(x, 0), which reduces along dim 0 and returns (values, indices).
+        return torch.where(x < self.threshold, torch.zeros_like(x), x)
+
+rotation_transform = Threshold_noise(threshold=100)
\ No newline at end of file
diff --git a/image_processing/build_dataset.py b/image_processing/build_dataset.py
index 27fda9745622ffae3a4ad27c272474a97d0b265e..2a127e0841ff7fef09c60301134ff6cb753da677 100644
--- a/image_processing/build_dataset.py
+++ b/image_processing/build_dataset.py
@@ -1,8 +1,24 @@
+import glob
+import os
 import pandas as pd
 import re
+import numpy as np
+from PIL import Image
+import matplotlib.image as mpimg
 
+from build_image import build_image_ms1
 
-def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterobacterales.xlsx'):
+"""
+find . -name '*.mzML' -exec cp -prv '{}' '/home/leo/PycharmProjects/pseudo_image/data/raw_data' ';'
+copies the .mzML files from the drive
+"""
+
+def create_antibio_dataset(path='../data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx'):
+    """
+    Extract and organise labels from raw excel file
+    :param path: excel path
+    :return: dataframe
+    """
     df = pd.read_excel(path, header=1)
     df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)',
              'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)',
@@ -22,7 +38,7 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob
         l = split_before_number(s)
         species = l[0]
         nb = l[1]
-        return '{}-{}-{}_100vW_100SPD.wiff'.format(species,nb,analyse)
+        return '{}-{}-{}_100vW_100SPD.mzML'.format(species,nb,analyse)
 
     df['path_ana'] = df['sample_name'].map(lambda x: create_fname(x,analyse='ANA'))
     df['path_aer'] = df['sample_name'].map(lambda x: create_fname(x, analyse='AER'))
@@ -30,11 +46,36 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob
 
     return df
 
-def cut_fname(s):
-    return s.split('/')[-1]
+def create_dataset():
+    """
+    Create images from raw .mzML files and sort them into their corresponding class directory.
+    Each raw file whose name matches a 'path_ana' or 'path_aer' label entry is converted with
+    build_image_ms1 and saved as .png and .npy under ../data/processed_data/<species>/;
+    files without a label entry are skipped.
+    :return: None
+    """
+    label = create_antibio_dataset()
+    for path in glob.glob("../data/raw_data/**.mzML"):
+        print(path)
+        # basename() instead of split("/") so the lookup also works with Windows separators
+        fname = os.path.basename(path)
+        for column, analyse in (('path_ana', 'ANA'), ('path_aer', 'AER')):
+            match = label[label[column] == fname]
+            if not match.empty:
+                break
+        else:
+            # no label row references this acquisition
+            continue
+        species = match['species'].values[0]
+        name = match['sample_name'].values[0]
+        directory_path = '../data/processed_data/{}'.format(species)
+        # exist_ok avoids the check-then-create race of isdir() + makedirs()
+        os.makedirs(directory_path, exist_ok=True)
+        mat = build_image_ms1(path, 1)
+        mpimg.imsave(directory_path + "/" + name + '_' + analyse + '.png', mat)
+        np.save(directory_path + "/" + name + '_' + analyse + '.npy', mat)
+
 
-def is_file_present(path, df):
-    return path in df['path_ana'].values or path in df['path_aer'].values
 
-def is_fname_present(path, df):
-    return path in df['fname'].values
\ No newline at end of file
+if __name__ =='__main__' :
+    label = create_antibio_dataset()
\ No newline at end of file
diff --git a/image_processing/build_image.py b/image_processing/build_image.py
index b2f03475636a9fac2e12d1235f63478c75703442..251af24fcae03dc9b1164102a2ec4982c1d52f62 100644
--- a/image_processing/build_image.py
+++ b/image_processing/build_image.py
@@ -1,6 +1,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.colors as colors
+import pyopenms as oms
 
 def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'):
     exp.updateRanges()
@@ -26,7 +27,9 @@ def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'):
     plt.savefig(out_path) # slow for larger data sets
 
 
-def build_image_ms1(e, bin_mz):
+def build_image_ms1(path, bin_mz):
+    e = oms.MSExperiment()
+    oms.MzMLFile().load(path, e)
     e.updateRanges()
     id = e.getSpectra()[-1].getNativeID()
 
diff --git a/requirements.txt b/requirements.txt
index de43a1ebe42a7d41074db486ee3bb8cc65368e1b..6780dc37ff73f4e4abb0c84a415145c2a67d5ac0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,8 @@
 numpy~=2.2.3
 matplotlib~=3.10.0
 pandas~=2.2.3
-pyopenms~=3.3.0
\ No newline at end of file
+pyopenms~=3.3.0
+openpyxl
+torch~=2.6.0
+torchvision~=0.21.0
+pillow~=11.1.0
\ No newline at end of file