diff --git a/data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx b/data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx index f092ddf15aa3040458d480d46aaeba3d385f7d20..ed86b8afffd856bdcba65bda4fd2ecdcbca22f4b 100644 Binary files a/data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx and b/data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx differ diff --git a/dataset/dataset.py b/dataset/dataset.py index 1365db861c8ff4e5b1fc2fdce701af878c13da59..054b697d0b7e993f0849d2e0f2e164670bca0500 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -1,5 +1,3 @@ -import random - import numpy as np import torch import torchvision @@ -57,11 +55,9 @@ def load_data(base_dir, batch_size, shuffle=True, noise_threshold=0): train_dataset = torchvision.datasets.ImageFolder(root=base_dir, transform=train_transform) val_dataset = torchvision.datasets.ImageFolder(root=base_dir, transform=val_transform) - #Same seed to avoid overlap while having different transforms - seed = random.randint(0,1000) - train_dataset, _ = train_test_split(train_dataset, test_size=None, train_size=None, random_state=seed, shuffle=True, + train_dataset, _ = train_test_split(train_dataset, test_size=None, train_size=None, random_state=42, shuffle=True, stratify=True) - _, val_dataset = train_test_split(val_dataset, test_size=None, train_size=None, random_state=seed, shuffle=True, + _, val_dataset = train_test_split(val_dataset, test_size=None, train_size=None, random_state=42, shuffle=True, stratify=True) data_loader_train = data.DataLoader( diff --git a/image_processing/build_dataset.py b/image_processing/build_dataset.py index 49a5153a819c0c9fd75f1787c5aa8f2db636d464..21e181d1f23193b269cc3682e8d8c07d0953e54b 100644 --- a/image_processing/build_dataset.py +++ b/image_processing/build_dataset.py @@ -12,24 +12,129 @@ from build_image import build_image_ms1 find . -name '*.mzML' -exec cp -prv '{}' '/home/leo/PycharmProjects/pseudo_image/data/raw_data' ';' copy des mzml depuis lecteur """ +antibiotic_tests = ['AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)', + 'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)', + 'CIP (vitek)','COL (mic)','CRO (mic)','CRO (vitek)','CTX (disk)','CTX (mic)','CTX (vitek)', + 'CXM (vitek)','CZA (disk)','CZA (vitek)','CZT (disk)','CZT (vitek)','ETP (disk)','ETP (mic)','ETP (vitek)', + 'FEP (disk)','FEP (mic)','FEP (vitek)','FOS (disk)','FOX (disk)','FOX (vitek)','GEN (disk)','GEN (mic)', + 'GEN (vitek)','IPM (disk)','IPM (mic)','IPM (vitek)','LVX (disk)','LVX (vitek)','MEC (disk)', + 'MEM (disk)','MEM (mic)','MEM (vitek)','NAL (vitek)','NET (disk)','OFX (vitek)','PIP (vitek)','PRL (disk)', + 'SXT (disk)','SXT (vitek)','TCC (disk)','TCC (vitek)','TGC (disk)','TGC (vitek)', + 'TIC (disk)','TIC (vitek)','TOB (disk)','TOB (vitek)','TZP (disk)','TZP (mic)','TZP (vitek)'] + +antibiotic_enterrobacter_breakpoints = { + 'AMC (disk)': {"S":14, "R":14 }, + 'AMK (disk)': {"S":18, "R":18 }, + 'AMK (mic)': {"S":8, "R":8 }, + 'AMK (vitek)': {"S":8, "R":8 }, + 'AMP (vitek)': {"S":8, "R":8 }, + 'AMX (disk)': {"S":14, "R":14 }, + 'AMX (vitek)': {"S":8, "R":8 }, + 'ATM (disk)': {"S":26, "R":21 }, + 'ATM (vitek)': {"S":1, "R":4 }, + 'CAZ (disk)': {"S":22, "R":22 }, + 'CAZ (mic)': {"S":1, "R":4 }, + 'CAZ (vitek)': {"S":1, "R":4 }, + 'CHL (vitek)': {"S":16, "R":16 }, + 'CIP (disk)': {"S":25, "R":22 }, + 'CIP (vitek)': {"S":0.25, "R":0.5 }, + 'COL (disk)': {"S":None, "R":None }, # : https://academic-oup-com.docelec.univ-lyon1.fr/cid/article/71/9/e523/5735218?login=true&token=eyJhbGciOiJub25lIn0.eyJleHAiOjE3NDU2NjA0NTgsImp0aSI6IjcxYzJmOWI1LTlhMWYtNGRiMy1iYmE0LTA0MGRlMTU3NjdmZSJ9. + #deleted since method is not accurate (DO NOT USE IT) + 'COL (mic)': {"S":2, "R":2 }, + 'CRO (mic)': {"S":1, "R":2 }, + 'CRO (vitek)': {"S":1, "R":2 }, + 'CTX (disk)': {"S":20, "R":17 }, + 'CTX (mic)': {"S":1, "R":2 }, + 'CTX (vitek)': {"S":1, "R":2 }, + 'CXM (vitek)': {"S":0.001, "R":8 }, + 'CZA (disk)': {"S":13, "R":13 }, + 'CZA (vitek)': {"S":8, "R":8 }, + 'CZT (disk)': {"S":22, "R":22 }, + 'CZT (vitek)': {"S":2, "R":2 }, + 'ETP (disk)': {"S":23, "R":23 }, + 'ETP (mic)': {"S":0.5, "R":0.5 }, + 'ETP (vitek)': {"S":0.5, "R":0.5 }, + 'FEP (disk)': {"S":27, "R":24 }, + 'FEP (mic)': {"S":1, "R":4 }, + 'FEP (vitek)': {"S":1, "R":4 }, + 'FOS (disk)': {"S":24, "R":24 },#pas clair ? + 'FOX (disk)': {"S":19, "R":19 },#screen only ? + 'FOX (vitek)': {"S":8, "R":8 },#screen only ? high sensitivity but poor specificity for identification of AmpC-producing Enterobacterales + 'GEN (disk)': {"S":17, "R":17 }, + 'GEN (mic)': {"S":2, "R":2 }, #entre parenthèse + 'GEN (vitek)': {"S":2, "R":2 }, #entre parenthèse cf https://www.eucast.org/eucastguidancedocuments/ ? + 'IPM (disk)': {"S":22, "R":19 }, + 'IPM (mic)': {"S":2, "R":4 }, + 'IPM (vitek)': {"S":2, "R":4 }, + 'LTM (disk)': {"S":None, "R":None }, # Lactimidomycin ? + 'LVX (disk)': {"S":23, "R":19 }, + 'LVX (vitek)': {"S":0.5, "R":1 }, + 'MEC (disk)': {"S":15, "R":15 }, + 'MEM (disk)': {"S":22, "R":16 }, + 'MEM (mic)': {"S":2, "R":8 }, + 'MEM (vitek)': {"S":2, "R":8 }, + 'NAL (vitek)': {"S":2, "R":8 }, #pas présent dans EUCAST, trouvé dans CLSI M100 (for uninary tract only) + 'NET (disk)': {"S":15, "R":12 }, #insuffisant evidencence for EUCAST, found in CLSI M100 + 'OFX (vitek)': {"S":0.25, "R":0.5 }, + 'PIP (vitek)': {"S":8, "R":8 }, + 'PRL (disk)': {"S":20, "R":20 }, + 'SXT (disk)': {"S":14, "R":11 }, + 'SXT (vitek)': {"S":2, "R":4 }, + 'TCC (disk)': {"S":8, "R":16 }, + 'TCC (vitek)': {"S":23, "R":20 }, + 'TEM (disk)': {"S":None, "R":None },#Abréviation non standard + 'TEM (vitek)': {"S":None, "R":None },#Abréviation non standard + 'TGC (disk)': {"S":18, "R":18 }, #pour E.coli et C.koseri seulement + 'TGC (vitek)': {"S":0.5, "R":0.5 }, + 'TIC (disk)': {"S":13, "R":20 }, + 'TIC (vitek)': {"S":8, "R":16 }, + 'TOB (disk)': {"S":16, "R":16 }, #entre parenthèse cf https://www.eucast.org/eucastguidancedocuments/ ? + 'TOB (vitek)': {"S":2, "R":2 }, #entre parenthèse cf https://www.eucast.org/eucastguidancedocuments/ ? + 'TZP (disk)': {"S":20, "R":20 }, + 'TZP (mic)': {"S":8, "R":8 }, + 'TZP (vitek)': {"S":8, "R":8 }, +} + def create_antibio_dataset(path='../data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx',suffix='-d200'): """ - Extract and organise labels from raw excel file + Extract and build file name corresponding to each sample :param path: excel path :return: dataframe """ df = pd.read_excel(path, header=1) df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)', 'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)', - 'CIP (vitek)','COL (disk)','COL (mic)','CRO (mic)','CRO (vitek)','CTX (disk)','CTX (mic)','CTX (vitek)', + 'CIP (vitek)','COL (mic)','CRO (mic)','CRO (vitek)','CTX (disk)','CTX (mic)','CTX (vitek)', 'CXM (vitek)','CZA (disk)','CZA (vitek)','CZT (disk)','CZT (vitek)','ETP (disk)','ETP (mic)','ETP (vitek)', 'FEP (disk)','FEP (mic)','FEP (vitek)','FOS (disk)','FOX (disk)','FOX (vitek)','GEN (disk)','GEN (mic)', - 'GEN (vitek)','IPM (disk)','IPM (mic)','IPM (vitek)','LTM (disk)','LVX (disk)','LVX (vitek)','MEC (disk)', + 'GEN (vitek)','IPM (disk)','IPM (mic)','IPM (vitek)','LVX (disk)','LVX (vitek)','MEC (disk)', 'MEM (disk)','MEM (mic)','MEM (vitek)','NAL (vitek)','NET (disk)','OFX (vitek)','PIP (vitek)','PRL (disk)', - 'SXT (disk)','SXT (vitek)','TCC (disk)','TCC (vitek)','TEM (disk)','TEM (vitek)','TGC (disk)','TGC (vitek)', + 'SXT (disk)','SXT (vitek)','TCC (disk)','TCC (vitek)','TGC (disk)','TGC (vitek)', 'TIC (disk)','TIC (vitek)','TOB (disk)','TOB (vitek)','TZP (disk)','TZP (mic)','TZP (vitek)']] + for test in antibiotic_tests :# S - Susceptible R - Resistant U- Uncertain + #convert to string and transform >8 to 8 + df[test] = df[test].map(lambda x :float(str(x).replace('>','').replace('<',''))) + df[test+' cat']= 'NA' + if 'mic' in test or 'vitek' in test : + try : + df.loc[df[test] <= antibiotic_enterrobacter_breakpoints[test]['S'], test+ ' cat'] = 'S' + df.loc[df[test] >= antibiotic_enterrobacter_breakpoints[test]['R'], test + ' cat'] = 'R' + df.loc[(antibiotic_enterrobacter_breakpoints[test]['S'] < df[test]) & (df[test] < antibiotic_enterrobacter_breakpoints[test]['R']), test + ' cat'] = 'U' + except: + pass + elif 'disk' in test: + try : + df.loc[df[test] >= antibiotic_enterrobacter_breakpoints[test]['S'], test + ' cat'] = 'S' + df.loc[df[test] <= antibiotic_enterrobacter_breakpoints[test]['R'], test + ' cat'] = 'R' + df.loc[ + (antibiotic_enterrobacter_breakpoints[test]['S'] > df[test]) & (df[test] > antibiotic_enterrobacter_breakpoints[test][ + 'R']), test + ' cat'] = 'U' + except: + pass + + def split_before_number(s): return re.split(r'(\d+)', s) @@ -98,5 +203,26 @@ def create_dataset(): np.save(directory_path_npy + "/" + name + '_' + analyse + '.npy', mat) +def extract_antio_res_labels(): + """ + Extract and organise labels from raw excel file + :param + path: excel + path + :return: dataframe + """ + path = '../data/label_raw/230804_strain_peptides_antibiogram_Enterobacterales.xlsx' + df = pd.read_excel(path, header=1) + df = df[['sample_name','species','AMC (disk)','AMK (disk)','AMK (mic)','AMK (vitek)','AMP (vitek)','AMX (disk)', + 'AMX (vitek)','ATM (disk)','ATM (vitek)','CAZ (disk)','CAZ (mic)','CAZ (vitek)','CHL (vitek)','CIP (disk)', + 'CIP (vitek)','COL (disk)','COL (mic)','CRO (mic)','CRO (vitek)','CTX (disk)','CTX (mic)','CTX (vitek)', + 'CXM (vitek)','CZA (disk)','CZA (vitek)','CZT (disk)','CZT (vitek)','ETP (disk)','ETP (mic)','ETP (vitek)', + 'FEP (disk)','FEP (mic)','FEP (vitek)','FOS (disk)','FOX (disk)','FOX (vitek)','GEN (disk)','GEN (mic)', + 'GEN (vitek)','IPM (disk)','IPM (mic)','IPM (vitek)','LTM (disk)','LVX (disk)','LVX (vitek)','MEC (disk)', + 'MEM (disk)','MEM (mic)','MEM (vitek)','NAL (vitek)','NET (disk)','OFX (vitek)','PIP (vitek)','PRL (disk)', + 'SXT (disk)','SXT (vitek)','TCC (disk)','TCC (vitek)','TEM (disk)','TEM (vitek)','TGC (disk)','TGC (vitek)', + 'TIC (disk)','TIC (vitek)','TOB (disk)','TOB (vitek)','TZP (disk)','TZP (mic)','TZP (vitek)']] + + if __name__ =='__main__' : - create_dataset() \ No newline at end of file + df = create_antibio_dataset() \ No newline at end of file diff --git a/models/model.py b/models/model.py index 8b03f7eea1f1edbde7df6d59025363b9137457ea..b59076c929d4cafb5f64f593c369c82a2defaac8 100644 --- a/models/model.py +++ b/models/model.py @@ -296,7 +296,7 @@ class Classification_model_duo(nn.Module): self.predictor = nn.Linear(in_features=self.n_class*2,out_features=self.n_class) - def forward(self, input_aer, input_ana): + def forward(self, input_aer, input_ana, input_ref): out_aer = self.im_encoder(input_aer) out_ana = self.im_encoder(input_ana) out = torch.concat([out_aer,out_ana],dim=1)