import re

import pandas as pd

# Column order of the frame returned by build_database_ref_peptide().
# The spellings are kept exactly as they are because callers select columns
# by these keys (the script below filters on 'Specie').
_REF_PEPTIDE_COLUMNS = [
    'Sequence', 'Protein code', 'Error treshold', 'Prevalance',
    'Specie', 'Genus', 'Family', 'Order',
]


def split_string(input_string):
    """Split ``input_string`` at each underscore followed by ``A``-``Z`` or ``c``.

    The matched underscore is removed; underscores followed by any other
    character are left inside the returned pieces.

    Args:
        input_string: Text to split.

    Returns:
        list[str]: The pieces, in order. A string with no matching
        underscore comes back as a single-element list.
    """
    # NOTE(review): the lowercase 'c' in the lookahead also splits before any
    # token starting with 'c' (e.g. 'Genus_cloacae') -- confirm that this is
    # intended for the taxonomy strings being parsed.
    return re.split(r'_(?=[A-Zc])', input_string)


def build_database_ref_peptide(
        path='../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt'):
    """Load a peptide list file into a DataFrame with one row per peptide.

    Lines containing ``'>'`` are treated as headers. From a header, the first
    three ``'_'``-separated fields give the protein code (minus its first
    character), the error threshold and the prevalence; the second
    space-separated token is passed to ``split_string`` to obtain species,
    genus, family and order, with remaining underscores turned into spaces.
    Every other non-blank line is a peptide line whose second space-separated
    token is the sequence; it inherits the values of the latest header.

    Args:
        path: File to read. Defaults to the location that used to be
            hard-coded, so existing zero-argument calls are unaffected.

    Returns:
        pandas.DataFrame: Columns ``'Sequence'``, ``'Protein code'``,
        ``'Error treshold'``, ``'Prevalance'``, ``'Specie'``, ``'Genus'``,
        ``'Family'``, ``'Order'``. The columns are present even when the
        file contains no peptide line.

    Raises:
        ValueError: If a peptide line appears before any header line.
        IndexError: If a line has fewer fields than described above.
    """
    records = []
    header = None
    with open(path, 'r') as f:
        for raw_line in f:
            # Drop the line terminator first so it cannot leak into the last
            # parsed field ('Order' on headers, 'Sequence' on peptide lines).
            line = raw_line.rstrip('\n')
            if not line.strip():
                continue
            if '>' in line:
                # Lowercase placeholders would not be separated by
                # split_string, which only splits before A-Z or 'c'.
                line = line.replace('no_family', 'No_family')
                line = line.replace('no_order', 'No_order')

                fields = line.split('_')
                taxonomy = split_string(line.split(' ')[1])
                header = {
                    'Protein code': fields[0][1:],
                    'Error treshold': fields[1],
                    'Prevalance': fields[2],
                    'Specie': taxonomy[0].replace('_', ' '),
                    'Genus': taxonomy[1].replace('_', ' '),
                    'Family': taxonomy[2].replace('_', ' '),
                    'Order': taxonomy[3].replace('_', ' '),
                }
            else:
                if header is None:
                    raise ValueError(
                        'peptide line found before any header line: %r' % line)
                records.append({'Sequence': line.split(' ')[1], **header})
    # Fixing the columns keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=_REF_PEPTIDE_COLUMNS)


if __name__ == '__main__':
    # fasta_similarity('fasta/uniparc_proteome_UP000033376_2025_03_14.fasta','fasta/uniparc_proteome_UP000033499_2025_03_14.fasta')
    # im = build_ref_image_from_diann('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet', ms1_end_mz=1250, ms1_start_mz=350, bin_mz=1, max_cycle=663, rt_pred=[])
    # plt.clf()
    # mpimg.imsave('test_img.png', im)

    df = build_database_ref_peptide()
    for spe in ['Proteus mirabilis', 'Klebsiella pneumoniae', 'Klebsiella oxytoca',
                'Enterobacter hormaechei', 'Citrobacter freundii']:
        df_spe = df[df['Specie'] == spe]
        with open(spe + '.fasta', 'w') as f:
            # Sequences no longer carry a trailing newline, so the record
            # separator is written explicitly: one sequence per line.
            f.writelines(seq + '\n' for seq in df_spe['Sequence'])