diff --git a/image_ref/utils.py b/image_ref/utils.py index 8182a39c5cc60b2e3fffee93932926f5654b76ac..093ac7c5328f7550c99308ad6d5cdc97128cfdde 100644 --- a/image_ref/utils.py +++ b/image_ref/utils.py @@ -7,6 +7,14 @@ from analyse_diann_digestion import load_lib import matplotlib.image as mpimg import re +SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter hormaechei', 'Klebsiella aerogenes', + 'Klebsiella variicola', 'Raoultella ornithinolytica', 'Citrobacter farmeri', 'Enterobacter asburiae', + 'Enterobacter kobei', 'Klebsiella michiganensis', 'Morganella morganii', 'Salmonella enterica', + 'Citrobacter amalonaticus', 'Citrobacter freundii', 'Enterobacter chengduensis', 'Escherichia coli', + 'Klebsiella oxytoca', 'Proteus mirabilis', 'Serratia marcescens', 'Citrobacter braakii', + 'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae', + 'Providencia stuartii'] + ALPHABET_UNMOD = { "A": 1, "C": 2, @@ -30,7 +38,6 @@ ALPHABET_UNMOD = { "Y": 20, } - MASSES_MONO = { "A": 71.03711, "C": 103.00919, @@ -54,7 +61,6 @@ MASSES_MONO = { "Y": 163.06333, } - MASSES_AVG = { "A": 71.0788, "C": 103.1388, @@ -78,6 +84,7 @@ MASSES_AVG = { "Y": 163.1760, } + # trypsin cut after K or R (if not followed by P) def cut(seq): @@ -102,12 +109,14 @@ def cut_with_ind(seq, ind_list): return l + def digest(seq): ind = cut(seq) return cut_with_ind(seq, ind) + def fasta_similarity(path_fasta_1, path_fasta_2): - list_seq_1=[] + list_seq_1 = [] list_seq_2 = [] for record in fastapy.parse(path_fasta_1): list_seq_1.extend(digest(record.seq)) @@ -121,18 +130,20 @@ def fasta_similarity(path_fasta_1, path_fasta_2): plt.show() plt.savefig('fasta_similarity.png') + def split_string(input_string): # Use regular expression to split the string at underscore followed by uppercase letter - return re.split(r'_(?=[A-Zc])', input_string) + return re.split(r'_(?=[A-Z])|c(?=[A-Z])', input_string) + def build_database_ref_peptide(): - l=[] + l = [] with open('../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt', 'r') as f: for line in f: if line != '\n': if '>' in line: - #typo ?? - line = line.replace('no_family','No_family') + # typo ?? + line = line.replace('no_family', 'No_family') line = line.replace('no_order', 'No_order') split_line = line.split('_') @@ -140,16 +151,17 @@ def build_database_ref_peptide(): err = split_line[1] prev = split_line[2] split_line = split_string(line.split(' ')[1]) - spe = split_line[0].replace('_',' ') - gen = split_line[1].replace('_',' ') - fam = split_line[2].replace('_',' ') - o = split_line[3].replace('_',' ') - else : + spe = split_line[0].replace('_', ' ') + gen = split_line[1].replace('_', ' ') + fam = split_line[2].replace('_', ' ') + o = split_line[3].replace('_', ' ') + else: seq = line.split(' ')[1] - l.append({'Sequence' : seq,'Protein code' :prot , 'Error treshold':err , 'Prevalance': prev, - 'Specie':spe ,'Genus':gen ,'Family':fam ,'Order':o }) + l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev, + 'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o}) return pd.DataFrame(l) + def compute_mass(seq, isotop): m = 0 if isotop == 'mono': @@ -160,55 +172,53 @@ def compute_mass(seq, isotop): m += MASSES_AVG[char] * seq.count(char) return m -def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred): - #build peptide list +def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred): + # build peptide list list_seq_1 = [] for record in fastapy.parse(path_fasta): list_seq_1.append(record.seq) list_peptides = [] - for prot in list_seq_1 : + for prot in list_seq_1: list_peptides.extend(digest(prot)) - #compute m/z ration - mz_ratio={} - i=0 + # compute m/z ration + mz_ratio = {} + i = 0 list_peptides = list(set(list_peptides)) for seq in list_peptides: - mz_ratio['seq']=[] + mz_ratio['seq'] = [] for charge in possible_charge: - ratio = compute_mass(seq,'avg')/charge + ratio = compute_mass(seq, 'avg') / charge if ms1_end_mz > ratio > ms1_start_mz: mz_ratio['seq'].append(ratio) - i+=1 + i += 1 print(i) - #assocy predict rt - data=[] + # assocy predict rt + data = [] - #predict detectability (optional) + # predict detectability (optional) - #build image + # build image total_ms1_mz = ms1_end_mz - ms1_start_mz n_bin_ms1 = int(total_ms1_mz // bin_mz) im = np.zeros([max_cycle, n_bin_ms1]) max_rt = np.max(rt_pred) min_rt = np.min(rt_pred) total_rt = max_rt - min_rt - for (rt,mz_ratio) in data : - im[int((rt-min_rt/total_rt)*max_cycle),int(((mz_ratio-ms1_start_mz)/total_ms1_mz)*n_bin_ms1)]=1 + for (rt, mz_ratio) in data: + im[int((rt - min_rt / total_rt) * max_cycle), int(((mz_ratio - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1 return im def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None): - - df = load_lib(path_parqet) - df=df[['Stripped.Sequence','Precursor.Charge','RT','Precursor.Mz']] + df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']] df_unique = df.drop_duplicates() - #build image + # build image total_ms1_mz = ms1_end_mz - ms1_start_mz n_bin_ms1 = int(total_ms1_mz // bin_mz) im = np.zeros([max_cycle, n_bin_ms1]) @@ -216,30 +226,38 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma max_rt = np.max(df_unique['RT']) if min_rt is None: min_rt = np.min(df_unique['RT']) - total_rt = max_rt - min_rt +1e-3 - for row in df_unique.iterrows() : - if 900 > int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1) >= 0: - im[int((row[1]['RT']-min_rt)/total_rt*max_cycle),int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1)]=1 + total_rt = max_rt - min_rt + 1e-3 + for row in df_unique.iterrows(): + if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0: + im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int( + ((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1 return im - if __name__ == '__main__': - # fasta_similarity('fasta/uniparc_proteome_UP000033376_2025_03_14.fasta','fasta/uniparc_proteome_UP000033499_2025_03_14.fasta') - # im = build_ref_image_from_diann('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet', ms1_end_mz=1250, ms1_start_mz=350, bin_mz=1, max_cycle=663, rt_pred=[]) - # plt.clf() - # mpimg.imsave('test_img.png', im) - df = build_database_ref_peptide() - df_full = load_lib('fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet') - min_rt = df_full['RT'].min() - max_rt = df_full['RT'].max() - for spe in ['Proteus mirabilis','Klebsiella pneumoniae','Klebsiella oxytoca','Enterobacter hormaechei','Citrobacter freundii']: - im = build_ref_image_from_diann( - 'fasta/optimal peptide set/'+spe+'.parquet', ms1_end_mz=1250, - ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt) - plt.clf() - mpimg.imsave(spe+'.png', im) - np.save(spe+'.npy', im) - + for spe in SPECIES: + + df_spe = df[df['Specie']==spe] + spe_list = df_spe['Sequence'].to_list() + with open('fasta/optimal peptide set/'+spe+'.fasta',"w") as f: + if not spe_list: + print('Empty : ',spe) + for pep in spe_list : + f.write(pep) + + # + # + # df_full = load_lib( + # 'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet') + # min_rt = df_full['RT'].min() + # max_rt = df_full['RT'].max() + # + # for spe in SPECIES: + # im = build_ref_image_from_diann( + # 'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250, + # ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt) + # plt.clf() + # mpimg.imsave('img_ref/' + spe + '.png', im) + # np.save(spe + '.npy', im)