diff --git a/data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx b/data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx
new file mode 100644
index 0000000000000000000000000000000000000000..f092ddf15aa3040458d480d46aaeba3d385f7d20
Binary files /dev/null and b/data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx differ
diff --git a/image_processing/build_dataset.py b/image_processing/build_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..27fda9745622ffae3a4ad27c272474a97d0b265e
--- /dev/null
+++ b/image_processing/build_dataset.py
@@ -0,0 +1,87 @@
+"""Build the antibiogram label table and the .wiff file names derived from it."""
+import re
+
+import pandas as pd
+
+# Columns kept from the antibiogram workbook: the two identifier columns, then
+# one column per "<antibiotic code> (<test method>)" pair.
+_ANTIBIOGRAM_COLUMNS = [
+    'sample_name', 'species',
+    'AMC (disk)', 'AMK (disk)', 'AMK (mic)', 'AMK (vitek)', 'AMP (vitek)',
+    'AMX (disk)', 'AMX (vitek)', 'ATM (disk)', 'ATM (vitek)', 'CAZ (disk)',
+    'CAZ (mic)', 'CAZ (vitek)', 'CHL (vitek)', 'CIP (disk)', 'CIP (vitek)',
+    'COL (disk)', 'COL (mic)', 'CRO (mic)', 'CRO (vitek)', 'CTX (disk)',
+    'CTX (mic)', 'CTX (vitek)', 'CXM (vitek)', 'CZA (disk)', 'CZA (vitek)',
+    'CZT (disk)', 'CZT (vitek)', 'ETP (disk)', 'ETP (mic)', 'ETP (vitek)',
+    'FEP (disk)', 'FEP (mic)', 'FEP (vitek)', 'FOS (disk)', 'FOX (disk)',
+    'FOX (vitek)', 'GEN (disk)', 'GEN (mic)', 'GEN (vitek)', 'IPM (disk)',
+    'IPM (mic)', 'IPM (vitek)', 'LTM (disk)', 'LVX (disk)', 'LVX (vitek)',
+    'MEC (disk)', 'MEM (disk)', 'MEM (mic)', 'MEM (vitek)', 'NAL (vitek)',
+    'NET (disk)', 'OFX (vitek)', 'PIP (vitek)', 'PRL (disk)', 'SXT (disk)',
+    'SXT (vitek)', 'TCC (disk)', 'TCC (vitek)', 'TEM (disk)', 'TEM (vitek)',
+    'TGC (disk)', 'TGC (vitek)', 'TIC (disk)', 'TIC (vitek)', 'TOB (disk)',
+    'TOB (vitek)', 'TZP (disk)', 'TZP (mic)', 'TZP (vitek)',
+]
+
+# Leading non-digit prefix followed by the first run of digits.
+_SAMPLE_NAME_RE = re.compile(r'(\D*)(\d+)')
+
+
+def _create_fname(sample_name, analyse):
+    """Return the .wiff file name built from *sample_name* and *analyse*.
+
+    The name is split at its first run of digits: the text before it is used
+    as the species prefix and the digits as the sample number.
+
+    Raises:
+        IndexError: if *sample_name* contains no digit. The exception type is
+            the one the previous split-and-index code raised, so existing
+            callers see the same failure, now with an explicit message.
+    """
+    match = _SAMPLE_NAME_RE.match(sample_name)
+    if match is None:
+        raise IndexError(
+            f'sample name {sample_name!r} contains no digits; '
+            'cannot derive a .wiff file name'
+        )
+    species, nb = match.groups()
+    return f'{species}-{nb}-{analyse}_100vW_100SPD.wiff'
+
+
+def create_antibio_dataset(path='data/230804_strain_peptides_antibiogram_Enterobacterales.xlsx'):
+    """Load the antibiogram workbook and add the expected .wiff file names.
+
+    Args:
+        path: Excel workbook to read; the sheet's second row is used as the
+            header row.
+
+    Returns:
+        A DataFrame restricted to the antibiogram columns, plus ``path_ana``
+        and ``path_aer`` holding the file names derived from ``sample_name``
+        for the 'ANA' and 'AER' analyses.
+
+    NOTE(review): this commit adds the workbook under ``data/label_raw/``
+    while the default *path* points at ``data/`` -- confirm which location is
+    intended.
+    """
+    df = pd.read_excel(path, header=1)
+    # Work on an explicit copy of the selection: columns are added below.
+    df = df[_ANTIBIOGRAM_COLUMNS].copy()
+    df['path_ana'] = df['sample_name'].map(lambda name: _create_fname(name, analyse='ANA'))
+    df['path_aer'] = df['sample_name'].map(lambda name: _create_fname(name, analyse='AER'))
+    return df
+
+
+def cut_fname(s):
+    """Return the last '/'-separated component of *s*."""
+    return s.split('/')[-1]
+
+
+def is_file_present(path, df):
+    """Return True if *path* equals a value of ``path_ana`` or ``path_aer`` in *df*."""
+    return path in df['path_ana'].values or path in df['path_aer'].values
+
+
+def is_fname_present(path, df):
+    """Return True if *path* equals a value of the ``fname`` column of *df*."""
+    return path in df['fname'].values
diff --git a/image_processing/build_image.py b/image_processing/build_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2f03475636a9fac2e12d1235f63478c75703442
--- /dev/null
+++ b/image_processing/build_image.py
@@ -0,0 +1,132 @@
+"""Plot spectra and build binned MS1 intensity images from an MS experiment."""
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.colors as colors
+
+
+def plot_spectra_2d(exp, ms_level=1, marker_size=5, out_path='temp.png'):
+    """Save a retention time vs m/z scatter plot of the spectra at *ms_level*.
+
+    Peaks are coloured by intensity on a logarithmic scale and drawn from the
+    weakest to the strongest so that the most intense peaks end up on top.
+    The plot gets its own figure, closed once saved, so repeated calls neither
+    pile up points and colour bars on a shared pyplot figure nor leave figures
+    open.
+
+    Args:
+        exp: experiment to plot (presumably a pyopenms ``MSExperiment`` --
+            confirm with callers).
+        ms_level: only spectra whose MS level equals this value are drawn.
+        marker_size: scatter marker size.
+        out_path: where the figure is saved.
+    """
+    exp.updateRanges()
+    rt_chunks, mz_chunks, intensity_chunks = [], [], []
+    for spec in exp:
+        if spec.getMSLevel() != ms_level:
+            continue
+        mz, intensity = spec.get_peaks()
+        rt_chunks.append(np.full(mz.shape[0], spec.getRT(), float))
+        mz_chunks.append(mz)
+        intensity_chunks.append(intensity)
+
+    fig, ax = plt.subplots()
+    try:
+        if rt_chunks:
+            rt = np.concatenate(rt_chunks)
+            mz = np.concatenate(mz_chunks)
+            intensity = np.concatenate(intensity_chunks)
+            order = intensity.argsort()  # ascending: strongest peaks drawn last
+            # One scatter call for all spectra instead of one collection each.
+            points = ax.scatter(
+                rt[order],
+                mz[order],
+                c=intensity[order],
+                cmap="afmhot_r",
+                s=marker_size,
+                norm=colors.LogNorm(
+                    exp.getMinIntensity() + 1, exp.getMaxIntensity()
+                ),
+            )
+            fig.colorbar(points, ax=ax)
+        ax.set_xlabel("time (s)")
+        ax.set_ylabel("m/z")
+        fig.savefig(out_path)  # slow for larger data sets
+    finally:
+        plt.close(fig)
+
+
+def _parse_native_id(native_id):
+    """Split a whitespace-separated ``key=value`` native ID into a dict."""
+    return dict(field.split('=', 1) for field in native_id.split())
+
+
+def build_image_ms1(e, bin_mz):
+    """Bin the MS1 intensities of *e* into a (cycle, m/z bin) image.
+
+    Peaks outside the first MS1 scan window are ignored and a peak lying on
+    the window's upper bound goes to the last bin. The previous per-peak
+    indexing could wrap peaks below the window around to the highest bins
+    and raise IndexError at the upper bound.
+
+    Args:
+        e: experiment to read (presumably a pyopenms ``MSExperiment`` --
+            confirm with callers). Native IDs must carry a ``cycle=<n>``
+            field, numbered from 1; the last spectrum's cycle is used as the
+            number of cycles.
+        bin_mz: requested m/z bin width. The scan window is cut into
+            ``floor(window / bin_mz)`` equal bins, so the effective width
+            grows slightly when *bin_mz* does not divide the window.
+
+    Returns:
+        Array of shape ``(n_cycles, n_bins)``; each cell is the summed
+        intensity of the MS1 peaks of that cycle falling into that bin.
+
+    Raises:
+        ValueError: if *bin_mz* is not positive or wider than the scan
+            window, or if *e* contains no MS1 spectrum.
+        IndexError: if an MS1 spectrum's cycle is outside ``1..n_cycles``.
+    """
+    e.updateRanges()
+    last_id = _parse_native_id(e.getSpectra()[-1].getNativeID())
+    max_cycle = int(last_id['cycle'])
+
+    # The scan window of the first MS1 spectrum defines the m/z axis.
+    first_ms1 = next((spec for spec in e if spec.getMSLevel() == 1), None)
+    if first_ms1 is None:
+        raise ValueError('experiment contains no MS1 spectrum')
+    scan_window = first_ms1.getInstrumentSettings().getScanWindows()[0]
+    ms1_start_mz = scan_window.begin
+    ms1_end_mz = scan_window.end
+
+    if bin_mz <= 0:
+        raise ValueError(f'bin_mz must be positive, got {bin_mz!r}')
+    total_ms1_mz = ms1_end_mz - ms1_start_mz
+    n_bin_ms1 = int(total_ms1_mz // bin_mz)
+    if n_bin_ms1 < 1:
+        raise ValueError(
+            f'bin_mz={bin_mz!r} leaves no bin in a scan window of width '
+            f'{total_ms1_mz!r}'
+        )
+    size_bin_ms1 = total_ms1_mz / n_bin_ms1
+
+    im = np.zeros([max_cycle, n_bin_ms1])
+    for spec in e:
+        if spec.getMSLevel() != 1:
+            continue
+        cycle = int(_parse_native_id(spec.getNativeID())['cycle'])
+        if not 1 <= cycle <= max_cycle:
+            raise IndexError(
+                f'cycle {cycle} is outside the expected range 1..{max_cycle}'
+            )
+        mz, intensity = spec.get_peaks()
+        in_window = (mz >= ms1_start_mz) & (mz <= ms1_end_mz)
+        bins = ((mz[in_window] - ms1_start_mz) // size_bin_ms1).astype(np.intp)
+        # Floor division sends the upper bound to n_bin_ms1; fold it into
+        # the last bin.
+        np.minimum(bins, n_bin_ms1 - 1, out=bins)
+        im[cycle - 1] += np.bincount(
+            bins, weights=intensity[in_window], minlength=n_bin_ms1
+        )
+
+    return im
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..de43a1ebe42a7d41074db486ee3bb8cc65368e1b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+numpy~=2.2.3
+matplotlib~=3.10.0
+pandas~=2.2.3
+pyopenms~=3.3.0
+openpyxl~=3.1.5