diff --git a/image_ref/utils.py b/image_ref/utils.py index 02eb8fd11461ca9f9aecf90c8cbde9b194f359c0..bebd1b433e7de6401080cb93e415d95bd89b7fbf 100644 --- a/image_ref/utils.py +++ b/image_ref/utils.py @@ -15,76 +15,6 @@ SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter horm 'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae', 'Providencia stuartii'] -ALPHABET_UNMOD = { - "A": 1, - "C": 2, - "D": 3, - "E": 4, - "F": 5, - "G": 6, - "H": 7, - "I": 8, - "K": 9, - "L": 10, - "M": 11, - "N": 12, - "P": 13, - "Q": 14, - "R": 15, - "S": 16, - "T": 17, - "V": 18, - "W": 19, - "Y": 20, -} - -MASSES_MONO = { - "A": 71.03711, - "C": 103.00919, - "D": 115.02694, - "E": 129.04259, - "F": 147.06841, - "G": 57.02146, - "H": 137.05891, - "I": 113.08406, - "K": 128.09496, - "L": 113.08406, - "M": 131.04049, - "N": 114.04293, - "P": 97.05276, - "Q": 128.05858, - "R": 156.1875, - "S": 87.03203, - "T": 101.04768, - "V": 99.06841, - "W": 186.07931, - "Y": 163.06333, -} - -MASSES_AVG = { - "A": 71.0788, - "C": 103.1388, - "D": 115.0886, - "E": 129.1155, - "F": 147.1766, - "G": 57.0519, - "H": 137.1411, - "I": 113.1594, - "K": 128.1741, - "L": 113.1594, - "M": 131.1926, - "N": 114.1038, - "P": 97.1167, - "Q": 128.1307, - "R": 156.1875, - "S": 87.0782, - "T": 101.1051, - "V": 99.1326, - "W": 186.2132, - "Y": 163.1760, -} - - # trypsin cut after K or R (if not followed by P) def cut(seq): @@ -142,6 +72,9 @@ def build_database_ref_peptide(): for line in f: if line != '\n': if '>' in line: + #remove new line + line = line.replace('\n', '') + # typo ?? line = line.replace('no_family', 'No_family') line = line.replace('no_order', 'No_order') @@ -157,66 +90,46 @@ def build_database_ref_peptide(): o = split_line[3].replace('_', ' ') else: seq = line.split(' ')[1] + #remove new lines + seq = seq.replace('\n','') l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev, 'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o}) - return pd.DataFrame(l) - - -def compute_mass(seq, isotop): - m = 0 - if isotop == 'mono': - for char in MASSES_MONO.keys(): - m += MASSES_MONO[char] * seq.count(char) - if isotop == 'avg': - for char in MASSES_AVG.keys(): - m += MASSES_AVG[char] * seq.count(char) - return m - + df = pd.DataFrame(l) + return df.drop_duplicates() -def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred): - # build peptide list - list_seq_1 = [] - for record in fastapy.parse(path_fasta): - list_seq_1.append(record.seq) - - list_peptides = [] - for prot in list_seq_1: - list_peptides.extend(digest(prot)) - - # compute m/z ration - mz_ratio = {} - i = 0 - list_peptides = list(set(list_peptides)) - for seq in list_peptides: - mz_ratio['seq'] = [] - for charge in possible_charge: - ratio = compute_mass(seq, 'avg') / charge - if ms1_end_mz > ratio > ms1_start_mz: - mz_ratio['seq'].append(ratio) - i += 1 - print(i) - - # assocy predict rt - data = [] +def compute_common_peptide(df_path,species): + df = pd.read_csv(df_path) + df_filter = df[df['Specie'].isin(species)] + df_filter = df_filter[['Sequence','Specie']].drop_duplicates() + df_filter['Count'] = df_filter.groupby('Sequence')['Sequence'].transform('count') + df_count = df_filter[['Sequence','Count']].drop_duplicates() + return df_count - # predict detectability (optional) +def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None): + df = load_lib(path_parqet) + df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']] + df_unique = df.drop_duplicates() # build image total_ms1_mz = ms1_end_mz - ms1_start_mz n_bin_ms1 = int(total_ms1_mz // bin_mz) im = np.zeros([max_cycle, n_bin_ms1]) - max_rt = np.max(rt_pred) - min_rt = np.min(rt_pred) - total_rt = max_rt - min_rt - for (rt, mz_ratio) in data: - im[int((rt - min_rt / total_rt) * max_cycle), int(((mz_ratio - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1 + if max_rt is None: + max_rt = np.max(df_unique['RT']) + if min_rt is None: + min_rt = np.min(df_unique['RT']) + total_rt = max_rt - min_rt + 1e-3 + for row in df_unique.iterrows(): + if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0: + im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int( + ((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1 return im -'Klebsiella michiganensis', -def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None): +def build_ref_image_from_diann_global(path_parqet, target_seq, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None): df = load_lib(path_parqet) df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']] + df = df[df['Stripped.Sequence'].isin(target_seq)] df_unique = df.drop_duplicates() # build image total_ms1_mz = ms1_end_mz - ms1_start_mz @@ -236,29 +149,31 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma if __name__ == '__main__': - df = build_database_ref_peptide() - #Write fasta file - # for spe in SPECIES: - # - # df_spe = df[df['Specie']==spe] - # spe_list = df_spe['Sequence'].to_list() - # with open('fasta/optimal peptide set/'+spe+'.fasta',"w") as f: + # df = build_database_ref_peptide() + # df.to_csv("dataset_species_ref_peptides.csv", index=False) + # #Write fasta file + # with open('fasta/optimal peptide set/gobal_peptide_set.fasta', "w") as f: + # for spe in SPECIES: + # df_spe = df[df['Specie']==spe] + # spe_list = df_spe['Sequence'].drop_duplicate().to_list() # if not spe_list: # print('Empty : ',spe) # for pep in spe_list : # f.write(pep) + df_count = compute_common_peptide("dataset_species_ref_peptides.csv", SPECIES) + # #Create ref img - df_full = load_lib( - 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet') - min_rt = df_full['RT'].min() - max_rt = df_full['RT'].max() - - for spe in SPECIES: - im = build_ref_image_from_diann( - 'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250, - ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt) - plt.clf() - mpimg.imsave('img_ref/' + spe + '.png', im) - np.save(spe + '.npy', im) + # df_full = load_lib( + # 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet') + # min_rt = df_full['RT'].min() + # max_rt = df_full['RT'].max() + # + # for spe in SPECIES: + # im = build_ref_image_from_diann( + # 'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250, + # ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt) + # plt.clf() + # mpimg.imsave('img_ref/' + spe + '.png', im) + # np.save(spe + '.npy', im)