Skip to content
Snippets Groups Projects
Commit 51aaa98b authored by Schneider Leo's avatar Schneider Leo
Browse files

dataset construction and dataloader

parent 4cf3b6cc
No related branches found
No related tags found
No related merge requests found
import torch
import torchvision
from torch.utils.data import DataLoader
import torchvision.transforms.functional as TF
import random
# Directory that holds the processed images, one sub-folder per class.
root = '../data/processed_data'

# Image dataset built from the folder tree under ``root``; no transform is
# applied to the loaded samples.
# NOTE(review): with transform=None the samples are presumably PIL images,
# which the default collate function may not be able to batch - confirm a
# tensor transform is supplied before iterating over the loader.
dataset = torchvision.datasets.ImageFolder(root=root, transform=None)

# Loader yielding one sample at a time, in dataset order, without worker
# processes, custom collation or pinned memory.
data_loader = DataLoader(
    dataset=dataset,
    batch_size=1,
    shuffle=False,
    num_workers=0,
    collate_fn=None,
    pin_memory=False,
)
class Threshold_noise:
    """
    Remove low-intensity noise by zeroing every value at or below a threshold
    :param threshold: values lower than or equal to this are replaced by zero
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def __call__(self, x):
        """
        Apply the threshold to a tensor
        :param x: input tensor
        :return: tensor shaped like x where each element <= threshold is zero
                 and every other element is left unchanged
        """
        # Element-wise selection keeps the input shape; values strictly above
        # the threshold pass through untouched.
        return torch.where(x <= self.threshold, torch.zeros_like(x), x)
# Module-level Threshold_noise instance configured with a threshold of 100.
# NOTE(review): the variable is called ``rotation_transform`` although it holds
# a Threshold_noise object - consider renaming once callers have been checked.
rotation_transform = Threshold_noise(threshold=100)
\ No newline at end of file
import glob
import os
import pandas as pd import pandas as pd
import re import re
import numpy as np
from PIL import Image
import matplotlib.image as mpimg
from build_image import build_image_ms1
def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterobacterales.xlsx'): """
find . -name '*.mzML' -exec cp -prv '{}' '/home/leo/PycharmProjects/pseudo_image/data/raw_data' ';'
copies the .mzML files from the drive
"""
def create_antibio_dataset(path='../data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx'):
"""
Extract and organise labels from raw excel file
:param path: excel path
:return: dataframe
"""
df = pd.read_excel(path, header=1) df = pd.read_excel(path, header=1)
df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)', df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)',
'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)', 'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)',
...@@ -22,7 +38,7 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob ...@@ -22,7 +38,7 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob
l = split_before_number(s) l = split_before_number(s)
species = l[0] species = l[0]
nb = l[1] nb = l[1]
return '{}-{}-{}_100vW_100SPD.wiff'.format(species,nb,analyse) return '{}-{}-{}_100vW_100SPD.mzML'.format(species,nb,analyse)
df['path_ana'] = df['sample_name'].map(lambda x: create_fname(x,analyse='ANA')) df['path_ana'] = df['sample_name'].map(lambda x: create_fname(x,analyse='ANA'))
df['path_aer'] = df['sample_name'].map(lambda x: create_fname(x, analyse='AER')) df['path_aer'] = df['sample_name'].map(lambda x: create_fname(x, analyse='AER'))
...@@ -30,11 +46,32 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob ...@@ -30,11 +46,32 @@ def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterob
return df return df
def cut_fname(s):
    """
    Return the last '/'-separated component of a path string
    :param s: path string
    :return: text after the last '/', or the whole string when it has no '/'
    """
    return s.split('/')[-1]


def _find_sample(label, fname):
    """
    Look up the sample a raw file name belongs to
    :param label: dataframe with 'path_ana', 'path_aer', 'species' and 'sample_name' columns
    :param fname: file name to search for in the two path columns
    :return: (species, sample_name, analyse) of the first matching row, or None without match
    """
    # 'path_ana' is checked before 'path_aer' so an ANA match wins.
    for column, analyse in (('path_ana', 'ANA'), ('path_aer', 'AER')):
        rows = label[label[column] == fname]
        if not rows.empty:
            return rows['species'].values[0], rows['sample_name'].values[0], analyse
    return None


def create_dataset(label=None, raw_dir='../data/raw_data', out_dir='../data/processed_data'):
    """
    Create images from raw .mzML files and sort them into their corresponding class directory
    :param label: label dataframe; built with create_antibio_dataset() when None
    :param raw_dir: directory scanned for .mzML files
    :param out_dir: directory receiving one sub-directory per species
    :return: None
    """
    if label is None:
        label = create_antibio_dataset()
    for path in glob.glob(os.path.join(raw_dir, '**.mzML')):
        print(path)
        # Resolve the file name once instead of re-splitting and re-filtering
        # the dataframe for every field.
        match = _find_sample(label, cut_fname(path))
        if match is None:
            continue
        species, name, analyse = match
        if species is None:
            continue
        directory_path = os.path.join(out_dir, str(species))
        # exist_ok avoids the race between checking for and creating the directory.
        os.makedirs(directory_path, exist_ok=True)
        mat = build_image_ms1(path, 1)
        target = os.path.join(directory_path, '{}_{}'.format(name, analyse))
        # The matrix is stored both as a viewable image and as a raw array.
        mpimg.imsave(target + '.png', mat)
        np.save(target + '.npy', mat)
def is_file_present(path, df):
    """
    Tell whether a file name is listed in one of the two path columns
    :param path: value to look for
    :param df: dataframe with 'path_ana' and 'path_aer' columns
    :return: True when path equals an entry of 'path_ana' or 'path_aer', else False
    """
    # Columns are probed lazily, so 'path_aer' is only read when 'path_ana' has no match.
    return any(path in df[column].values for column in ('path_ana', 'path_aer'))
def is_fname_present(path, df):
    """
    Tell whether a file name is listed in the 'fname' column
    :param path: value to look for
    :param df: dataframe with a 'fname' column
    :return: True when path equals at least one entry of df['fname'], else False
    """
    known_names = df['fname'].values
    return path in known_names


if __name__ == '__main__':
    # Build the label dataframe when the module is executed as a script.
    label = create_antibio_dataset()
\ No newline at end of file \ No newline at end of file
import numpy as np import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import matplotlib.colors as colors import matplotlib.colors as colors
import pyopenms as oms
def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'): def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'):
exp.updateRanges() exp.updateRanges()
...@@ -26,7 +27,9 @@ def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'): ...@@ -26,7 +27,9 @@ def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'):
plt.savefig(out_path) # slow for larger data sets plt.savefig(out_path) # slow for larger data sets
def build_image_ms1(e, bin_mz): def build_image_ms1(path, bin_mz):
e = oms.MSExperiment()
oms.MzMLFile().load(path, e)
e.updateRanges() e.updateRanges()
id = e.getSpectra()[-1].getNativeID() id = e.getSpectra()[-1].getNativeID()
......
numpy~=2.2.3 numpy~=2.2.3
matplotlib~=3.10.0 matplotlib~=3.10.0
pandas~=2.2.3 pandas~=2.2.3
pyopenms~=3.3.0 pyopenms~=3.3.0
\ No newline at end of file openpyxl
torch~=2.6.0
torchvision~=0.21.0
pillow~=11.1.0
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment