import glob import os import pandas as pd import re import numpy as np from PIL import Image import matplotlib.image as mpimg from build_image import build_image_ms1 """ find . -name '*.mzML' -exec cp -prv '{}' '/home/leo/PycharmProjects/pseudo_image/data/raw_data' ';' copy des mzml depuis lecteur """ antibiotic_tests = ['AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)', 'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)', 'CIP (vitek)','COL (mic)','CRO (mic)','CRO (vitek)','CTX (disk)','CTX (mic)','CTX (vitek)', 'CXM (vitek)','CZA (disk)','CZA (vitek)','CZT (disk)','CZT (vitek)','ETP (disk)','ETP (mic)','ETP (vitek)', 'FEP (disk)','FEP (mic)','FEP (vitek)','FOS (disk)','FOX (disk)','FOX (vitek)','GEN (disk)','GEN (mic)', 'GEN (vitek)','IPM (disk)','IPM (mic)','IPM (vitek)','LVX (disk)','LVX (vitek)','MEC (disk)', 'MEM (disk)','MEM (mic)','MEM (vitek)','NAL (vitek)','NET (disk)','OFX (vitek)','PIP (vitek)','PRL (disk)', 'SXT (disk)','SXT (vitek)','TCC (disk)','TCC (vitek)','TGC (disk)','TGC (vitek)', 'TIC (disk)','TIC (vitek)','TOB (disk)','TOB (vitek)','TZP (disk)','TZP (mic)','TZP (vitek)'] antibiotic_enterrobacter_breakpoints = { 'AMC (disk)': {"S":14, "R":14 }, 'AMK (disk)': {"S":18, "R":18 }, 'AMK (mic)': {"S":8, "R":8 }, 'AMK (vitek)': {"S":8, "R":8 }, 'AMP (vitek)': {"S":8, "R":8 }, 'AMX (disk)': {"S":14, "R":14 }, 'AMX (vitek)': {"S":8, "R":8 }, 'ATM (disk)': {"S":26, "R":21 }, 'ATM (vitek)': {"S":1, "R":4 }, 'CAZ (disk)': {"S":22, "R":22 }, 'CAZ (mic)': {"S":1, "R":4 }, 'CAZ (vitek)': {"S":1, "R":4 }, 'CHL (vitek)': {"S":16, "R":16 }, 'CIP (disk)': {"S":25, "R":22 }, 'CIP (vitek)': {"S":0.25, "R":0.5 }, 'COL (disk)': {"S":None, "R":None }, # : https://academic-oup-com.docelec.univ-lyon1.fr/cid/article/71/9/e523/5735218?login=true&token=eyJhbGciOiJub25lIn0.eyJleHAiOjE3NDU2NjA0NTgsImp0aSI6IjcxYzJmOWI1LTlhMWYtNGRiMy1iYmE0LTA0MGRlMTU3NjdmZSJ9. #deleted since method is not accurate (DO NOT USE IT) 'COL (mic)': {"S":2, "R":2 }, 'CRO (mic)': {"S":1, "R":2 }, 'CRO (vitek)': {"S":1, "R":2 }, 'CTX (disk)': {"S":20, "R":17 }, 'CTX (mic)': {"S":1, "R":2 }, 'CTX (vitek)': {"S":1, "R":2 }, 'CXM (vitek)': {"S":0.001, "R":8 }, 'CZA (disk)': {"S":13, "R":13 }, 'CZA (vitek)': {"S":8, "R":8 }, 'CZT (disk)': {"S":22, "R":22 }, 'CZT (vitek)': {"S":2, "R":2 }, 'ETP (disk)': {"S":23, "R":23 }, 'ETP (mic)': {"S":0.5, "R":0.5 }, 'ETP (vitek)': {"S":0.5, "R":0.5 }, 'FEP (disk)': {"S":27, "R":24 }, 'FEP (mic)': {"S":1, "R":4 }, 'FEP (vitek)': {"S":1, "R":4 }, 'FOS (disk)': {"S":24, "R":24 },#pas clair ? 'FOX (disk)': {"S":19, "R":19 },#screen only ? 'FOX (vitek)': {"S":8, "R":8 },#screen only ? high sensitivity but poor specificity for identification of AmpC-producing Enterobacterales 'GEN (disk)': {"S":17, "R":17 }, 'GEN (mic)': {"S":2, "R":2 }, #entre parenthèse 'GEN (vitek)': {"S":2, "R":2 }, #entre parenthèse cf https://www.eucast.org/eucastguidancedocuments/ ? 'IPM (disk)': {"S":22, "R":19 }, 'IPM (mic)': {"S":2, "R":4 }, 'IPM (vitek)': {"S":2, "R":4 }, 'LTM (disk)': {"S":None, "R":None }, # Lactimidomycin ? 'LVX (disk)': {"S":23, "R":19 }, 'LVX (vitek)': {"S":0.5, "R":1 }, 'MEC (disk)': {"S":15, "R":15 }, 'MEM (disk)': {"S":22, "R":16 }, 'MEM (mic)': {"S":2, "R":8 }, 'MEM (vitek)': {"S":2, "R":8 }, 'NAL (vitek)': {"S":2, "R":8 }, #pas présent dans EUCAST, trouvé dans CLSI M100 (for uninary tract only) 'NET (disk)': {"S":15, "R":12 }, #insuffisant evidencence for EUCAST, found in CLSI M100 'OFX (vitek)': {"S":0.25, "R":0.5 }, 'PIP (vitek)': {"S":8, "R":8 }, 'PRL (disk)': {"S":20, "R":20 }, 'SXT (disk)': {"S":14, "R":11 }, 'SXT (vitek)': {"S":2, "R":4 }, 'TCC (disk)': {"S":8, "R":16 }, 'TCC (vitek)': {"S":23, "R":20 }, 'TEM (disk)': {"S":None, "R":None },#Abréviation non standard 'TEM (vitek)': {"S":None, "R":None },#Abréviation non standard 'TGC (disk)': {"S":18, "R":18 }, #pour E.coli et C.koseri seulement 'TGC (vitek)': {"S":0.5, "R":0.5 }, 'TIC (disk)': {"S":13, "R":20 }, 'TIC (vitek)': {"S":8, "R":16 }, 'TOB (disk)': {"S":16, "R":16 }, #entre parenthèse cf https://www.eucast.org/eucastguidancedocuments/ ? 'TOB (vitek)': {"S":2, "R":2 }, #entre parenthèse cf https://www.eucast.org/eucastguidancedocuments/ ? 'TZP (disk)': {"S":20, "R":20 }, 'TZP (mic)': {"S":8, "R":8 }, 'TZP (vitek)': {"S":8, "R":8 }, } def create_antibio_dataset(path='../data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx',suffix='-d200'): """ Extract and build file name corresponding to each sample :param path: excel path :return: dataframe """ df = pd.read_excel(path, header=1) df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)', 'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)', 'CIP (vitek)','COL (mic)','CRO (mic)','CRO (vitek)','CTX (disk)','CTX (mic)','CTX (vitek)', 'CXM (vitek)','CZA (disk)','CZA (vitek)','CZT (disk)','CZT (vitek)','ETP (disk)','ETP (mic)','ETP (vitek)', 'FEP (disk)','FEP (mic)','FEP (vitek)','FOS (disk)','FOX (disk)','FOX (vitek)','GEN (disk)','GEN (mic)', 'GEN (vitek)','IPM (disk)','IPM (mic)','IPM (vitek)','LVX (disk)','LVX (vitek)','MEC (disk)', 'MEM (disk)','MEM (mic)','MEM (vitek)','NAL (vitek)','NET (disk)','OFX (vitek)','PIP (vitek)','PRL (disk)', 'SXT (disk)','SXT (vitek)','TCC (disk)','TCC (vitek)','TGC (disk)','TGC (vitek)', 'TIC (disk)','TIC (vitek)','TOB (disk)','TOB (vitek)','TZP (disk)','TZP (mic)','TZP (vitek)']] for test in antibiotic_tests :# S - Susceptible R - Resistant U- Uncertain #convert to string and transform >8 to 8 df[test] = df[test].map(lambda x :float(str(x).replace('>','').replace('<',''))) df[test+' cat']= 'NA' if 'mic' in test or 'vitek' in test : try : df.loc[df[test] <= antibiotic_enterrobacter_breakpoints[test]['S'], test+ ' cat'] = 'S' df.loc[df[test] >= antibiotic_enterrobacter_breakpoints[test]['R'], test + ' cat'] = 'R' df.loc[(antibiotic_enterrobacter_breakpoints[test]['S'] < df[test]) & (df[test] < antibiotic_enterrobacter_breakpoints[test]['R']), test + ' cat'] = 'U' except: pass elif 'disk' in test: try : df.loc[df[test] >= antibiotic_enterrobacter_breakpoints[test]['S'], test + ' cat'] = 'S' df.loc[df[test] <= antibiotic_enterrobacter_breakpoints[test]['R'], test + ' cat'] = 'R' df.loc[ (antibiotic_enterrobacter_breakpoints[test]['S'] > df[test]) & (df[test] > antibiotic_enterrobacter_breakpoints[test][ 'R']), test + ' cat'] = 'U' except: pass def split_before_number(s): return re.split(r'(\d+)', s) def create_fname(s, analyse): l = split_before_number(s) species = l[0] nb = l[1] return '{}-{}-{}{}.mzML'.format(species,nb,analyse,suffix) df['path_ana'] = df['sample_name'].map(lambda x: create_fname(x,analyse='ANA')) df['path_aer'] = df['sample_name'].map(lambda x: create_fname(x, analyse='AER')) return df def create_dataset(): """ Create images from raw .mzML files and sort it in their corresponding class directory :return: None """ label = create_antibio_dataset(suffix='-d200') for path in glob.glob("../data/raw_data/**.mzML"): print(path) species = None if path.split("/")[-1] in label['path_ana'].values: species = label[label['path_ana'] == path.split("/")[-1]]['species'].values[0] name = label[label['path_ana'] == path.split("/")[-1]]['sample_name'].values[0] analyse = 'ANA' elif path.split("/")[-1] in label['path_aer'].values: species = label[label['path_aer'] == path.split("/")[-1]]['species'].values[0] name = label[label['path_aer'] == path.split("/")[-1]]['sample_name'].values[0] analyse = 'AER' if species is not None: directory_path_png = '../data/processed_data/png_image/{}'.format(species) directory_path_npy = '../data/processed_data/npy_image/{}'.format(species) if not os.path.isdir(directory_path_png): os.makedirs(directory_path_png) if not os.path.isdir(directory_path_npy): os.makedirs(directory_path_npy) mat = build_image_ms1(path, 1) mpimg.imsave(directory_path_png + "/" + name + '_' + analyse + '.png', mat) np.save(directory_path_npy + "/" + name + '_' + analyse + '.npy', mat) label = create_antibio_dataset(suffix='_100vW_100SPD') for path in glob.glob("../data/raw_data/**.mzML"): print(path) species = None if path.split("/")[-1] in label['path_ana'].values: species = label[label['path_ana'] == path.split("/")[-1]]['species'].values[0] name = label[label['path_ana'] == path.split("/")[-1]]['sample_name'].values[0] analyse = 'ANA' elif path.split("/")[-1] in label['path_aer'].values: species = label[label['path_aer'] == path.split("/")[-1]]['species'].values[0] name = label[label['path_aer'] == path.split("/")[-1]]['sample_name'].values[0] analyse = 'AER' if species is not None: directory_path_png = '../data/processed_data/png_image/{}'.format(species) directory_path_npy = '../data/processed_data/npy_image/{}'.format(species) if not os.path.isdir(directory_path_png): os.makedirs(directory_path_png) if not os.path.isdir(directory_path_npy): os.makedirs(directory_path_npy) mat = build_image_ms1(path, 1) mpimg.imsave(directory_path_png + "/" + name + '_' + analyse + '.png', mat) np.save(directory_path_npy + "/" + name + '_' + analyse + '.npy', mat) def extract_antio_res_labels(): """ Extract and organise labels from raw excel file :param path: excel path :return: dataframe """ path = '../data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx' df = pd.read_excel(path, header=1) df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)', 'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)', 'CIP (vitek)','COL (disk)','COL (mic)','CRO (mic)','CRO (vitek)','CTX (disk)','CTX (mic)','CTX (vitek)', 'CXM (vitek)','CZA (disk)','CZA (vitek)','CZT (disk)','CZT (vitek)','ETP (disk)','ETP (mic)','ETP (vitek)', 'FEP (disk)','FEP (mic)','FEP (vitek)','FOS (disk)','FOX (disk)','FOX (vitek)','GEN (disk)','GEN (mic)', 'GEN (vitek)','IPM (disk)','IPM (mic)','IPM (vitek)','LTM (disk)','LVX (disk)','LVX (vitek)','MEC (disk)', 'MEM (disk)','MEM (mic)','MEM (vitek)','NAL (vitek)','NET (disk)','OFX (vitek)','PIP (vitek)','PRL (disk)', 'SXT (disk)','SXT (vitek)','TCC (disk)','TCC (vitek)','TEM (disk)','TEM (vitek)','TGC (disk)','TGC (vitek)', 'TIC (disk)','TIC (vitek)','TOB (disk)','TOB (vitek)','TZP (disk)','TZP (mic)','TZP (vitek)']] if __name__ =='__main__' : df = create_antibio_dataset()