dataset construction and dataloader

51aaa98b · Schneider Leo · 4cf3b6cc · 51aaa98b · 51aaa98b · 51aaa98b
Commit 51aaa98b authored 3 months ago by Schneider Leo
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
+import torch
+import torchvision
+from torch.utils.data import DataLoader
+import torchvision.transforms.functional as TF
+import random
+# Root of the image tree. The path is relative, so it is resolved against the
+# current working directory of the importing process.
+root = '../data/processed_data'
+# Built at import time, so the directory above must exist when this module is imported.
+# NOTE(review): with transform=None the samples presumably stay PIL images, which
+# the default DataLoader collate cannot batch - confirm a tensor transform is planned.
+dataset = torchvision.datasets.ImageFolder(root, transform=None)
+# Sequential, one-sample-per-batch loading in the main process
+# (no shuffling, no worker processes, no pinned memory).
+data_loader = DataLoader(
+    dataset,
+    batch_size=1,
+    shuffle=False,
+    num_workers=0,
+    collate_fn=None,
+    pin_memory=False,
+ )
+class Threshold_noise:
+    """Zero out the values of a tensor that lie below a threshold.
+
+    Callable transform: every element of the input strictly below ``threshold``
+    is replaced by 0, all other elements are returned unchanged.
+
+    :param threshold: cut-off under which a value is treated as noise
+    """
+    def __init__(self, threshold):
+        self.threshold = threshold
+    def __call__(self, x):
+        """
+        Apply the threshold to a tensor.
+        :param x: torch.Tensor of intensities
+        :return: tensor with the same shape and dtype as ``x``
+        """
+        # NOTE(review): strict ``<`` is assumed here; switch to ``<=`` if values
+        # equal to the threshold must be removed as well.
+        return torch.where(x < self.threshold, torch.zeros_like(x), x)
+# The variable name is kept so existing imports keep working; the object
+# applies a threshold, it does not rotate anything.
+rotation_transform = Threshold_noise(threshold=100)
\ No newline at end of file
--- a/image_processing/build_dataset.py
+++ b/image_processing/build_dataset.py
+import glob
+import os
 import pandas as pd
 import re
+import numpy as np
+from PIL import Image
+import matplotlib.image as mpimg
+from build_image import build_image_ms1
-def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterobacterales.xlsx'):
+"""
+find . -name '*.mzML' -exec cp -prv '{}' '/home/leo/PycharmProjects/pseudo_image/data/raw_data' ';'
+copy the mzML files from the drive
+"""
+def create_antibio_dataset(path='../data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx'):
+    """
+    Extract and organise labels from raw excel file
+    :param path: excel path
+    :return: dataframe
+    """
    df = pd.read_excel(path, header=1)
    df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)',
    'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)',
@@ -22,7 +38,7 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob
        l = split_before_number(s)
        species = l[0]
        nb = l[1]
-        return '{}-{}-{}_100vW_100SPD.wiff'.format(species,nb,analyse)
+        return '{}-{}-{}_100vW_100SPD.mzML'.format(species,nb,analyse)
    df['path_ana'] = df['sample_name'].map(lambda x: create_fname(x,analyse='ANA'))
    df['path_aer'] = df['sample_name'].map(lambda x: create_fname(x, analyse='AER'))
@@ -30,11 +46,32 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob
    return df
-def cut_fname(s):
+def create_dataset():
-    return s.split('/')[-1]
+    """
+    Create images from raw .mzML files and sort them in their corresponding class directory.
+
+    Every file matched by ``../data/raw_data/**.mzML`` whose file name is listed in the
+    ``path_ana`` or ``path_aer`` column of the label table is converted with
+    ``build_image_ms1`` and written as ``<sample_name>_<ANA|AER>.png`` and ``.npy``
+    into ``../data/processed_data/<species>``. Files with no matching label are skipped.
+    :return: None
+    """
+    label = create_antibio_dataset()
+    for path in glob.glob("../data/raw_data/**.mzML"):
+        print(path)
+        # basename follows the platform's path separator, unlike split("/")
+        fname = os.path.basename(path)
+        # The ANA column is checked first, the AER column only as a fallback.
+        rows, analyse = label[label['path_ana'] == fname], 'ANA'
+        if rows.empty:
+            rows, analyse = label[label['path_aer'] == fname], 'AER'
+        if rows.empty:
+            # No label row references this file.
+            continue
+        species = rows['species'].values[0]
+        if species is None:
+            continue
+        name = rows['sample_name'].values[0]
+        directory_path = '../data/processed_data/{}'.format(species)
+        # exist_ok replaces the separate isdir check and tolerates an existing directory
+        os.makedirs(directory_path, exist_ok=True)
+        mat = build_image_ms1(path, 1)
+        # The same matrix is stored twice: as a viewable image and as raw values.
+        mpimg.imsave(directory_path + "/" + name + '_' + analyse + '.png', mat)
+        np.save(directory_path + "/" + name + '_' + analyse + '.npy', mat)
-def is_file_present(path, df):
-    return path in df['path_ana'].values or path in df['path_aer'].values
-def is_fname_present(path, df):
+if __name__ =='__main__' :
-    return path in df['fname'].values
+    label = create_antibio_dataset()
\ No newline at end of file
--- a/image_processing/build_image.py
+++ b/image_processing/build_image.py
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.colors as colors
+import pyopenms as oms
 def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'):
    exp.updateRanges()
@@ -26,7 +27,9 @@ def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'):
    plt.savefig(out_path)  # slow for larger data sets
-def build_image_ms1(e, bin_mz):
+def build_image_ms1(path, bin_mz):
+    e = oms.MSExperiment()
+    oms.MzMLFile().load(path, e)
    e.updateRanges()
    id = e.getSpectra()[-1].getNativeID()

--- a/requirements.txt
+++ b/requirements.txt
 numpy~=2.2.3
 matplotlib~=3.10.0
 pandas~=2.2.3
 pyopenms~=3.3.0
\ No newline at end of file
+openpyxl
+torch~=2.6.0
+torchvision~=0.21.0
+pillow~=11.1.0
\ No newline at end of file