diff --git a/image_ref/utils.py b/image_ref/utils.py
index 8182a39c5cc60b2e3fffee93932926f5654b76ac..093ac7c5328f7550c99308ad6d5cdc97128cfdde 100644
--- a/image_ref/utils.py
+++ b/image_ref/utils.py
@@ -7,6 +7,14 @@ from analyse_diann_digestion import load_lib
 import matplotlib.image as mpimg
 import re
 
+SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter hormaechei', 'Klebsiella aerogenes',
+                'Klebsiella variicola', 'Raoultella ornithinolytica', 'Citrobacter farmeri', 'Enterobacter asburiae',
+                'Enterobacter kobei', 'Klebsiella michiganensis', 'Morganella morganii', 'Salmonella enterica',
+                'Citrobacter amalonaticus', 'Citrobacter freundii', 'Enterobacter chengduensis', 'Escherichia coli',
+                'Klebsiella oxytoca', 'Proteus mirabilis', 'Serratia marcescens', 'Citrobacter braakii',
+                'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae',
+                'Providencia stuartii']
+
 ALPHABET_UNMOD = {
     "A": 1,
     "C": 2,
@@ -30,7 +38,6 @@ ALPHABET_UNMOD = {
     "Y": 20,
 }
 
-
 MASSES_MONO = {
     "A": 71.03711,
     "C": 103.00919,
@@ -54,7 +61,6 @@ MASSES_MONO = {
     "Y": 163.06333,
 }
 
-
 MASSES_AVG = {
     "A": 71.0788,
     "C": 103.1388,
@@ -78,6 +84,7 @@ MASSES_AVG = {
     "Y": 163.1760,
 }
 
+
 # trypsin cut after K or R (if not followed by P)
 
 def cut(seq):
@@ -102,12 +109,14 @@ def cut_with_ind(seq, ind_list):
 
     return l
 
+
 def digest(seq):
     ind = cut(seq)
     return cut_with_ind(seq, ind)
 
+
 def fasta_similarity(path_fasta_1, path_fasta_2):
-    list_seq_1=[]
+    list_seq_1 = []
     list_seq_2 = []
     for record in fastapy.parse(path_fasta_1):
         list_seq_1.extend(digest(record.seq))
@@ -121,18 +130,20 @@ def fasta_similarity(path_fasta_1, path_fasta_2):
     plt.show()
     plt.savefig('fasta_similarity.png')
 
+
 def split_string(input_string):
     # Use regular expression to split the string at underscore followed by uppercase letter
-    return re.split(r'_(?=[A-Zc])', input_string)
+    return re.split(r'_(?=[A-Z])|c(?=[A-Z])', input_string)
+
 
 def build_database_ref_peptide():
-    l=[]
+    l = []
     with open('../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt', 'r') as f:
         for line in f:
             if line != '\n':
                 if '>' in line:
-                    #typo ??
-                    line = line.replace('no_family','No_family')
+                    # typo ??
+                    line = line.replace('no_family', 'No_family')
                     line = line.replace('no_order', 'No_order')
 
                     split_line = line.split('_')
@@ -140,16 +151,17 @@ def build_database_ref_peptide():
                     err = split_line[1]
                     prev = split_line[2]
                     split_line = split_string(line.split(' ')[1])
-                    spe = split_line[0].replace('_',' ')
-                    gen = split_line[1].replace('_',' ')
-                    fam = split_line[2].replace('_',' ')
-                    o = split_line[3].replace('_',' ')
-                else :
+                    spe = split_line[0].replace('_', ' ')
+                    gen = split_line[1].replace('_', ' ')
+                    fam = split_line[2].replace('_', ' ')
+                    o = split_line[3].replace('_', ' ')
+                else:
                     seq = line.split(' ')[1]
-                    l.append({'Sequence' : seq,'Protein code' :prot , 'Error treshold':err , 'Prevalance': prev,
-                              'Specie':spe ,'Genus':gen ,'Family':fam ,'Order':o })
+                    l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev,
+                              'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o})
     return pd.DataFrame(l)
 
+
 def compute_mass(seq, isotop):
     m = 0
     if isotop == 'mono':
@@ -160,55 +172,53 @@ def compute_mass(seq, isotop):
             m += MASSES_AVG[char] * seq.count(char)
     return m
 
-def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
 
-    #build peptide list
+def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
+    # build peptide list
     list_seq_1 = []
     for record in fastapy.parse(path_fasta):
         list_seq_1.append(record.seq)
 
     list_peptides = []
-    for prot in list_seq_1 :
+    for prot in list_seq_1:
         list_peptides.extend(digest(prot))
 
-    #compute m/z ration
-    mz_ratio={}
-    i=0
+    # compute m/z ration
+    mz_ratio = {}
+    i = 0
     list_peptides = list(set(list_peptides))
     for seq in list_peptides:
-        mz_ratio['seq']=[]
+        mz_ratio['seq'] = []
         for charge in possible_charge:
-            ratio = compute_mass(seq,'avg')/charge
+            ratio = compute_mass(seq, 'avg') / charge
             if ms1_end_mz > ratio > ms1_start_mz:
                 mz_ratio['seq'].append(ratio)
-                i+=1
+                i += 1
     print(i)
 
-    #assocy predict rt
-    data=[]
+    # assocy predict rt
+    data = []
 
-    #predict detectability (optional)
+    # predict detectability (optional)
 
-    #build image
+    # build image
     total_ms1_mz = ms1_end_mz - ms1_start_mz
     n_bin_ms1 = int(total_ms1_mz // bin_mz)
     im = np.zeros([max_cycle, n_bin_ms1])
     max_rt = np.max(rt_pred)
     min_rt = np.min(rt_pred)
     total_rt = max_rt - min_rt
-    for (rt,mz_ratio) in data :
-        im[int((rt-min_rt/total_rt)*max_cycle),int(((mz_ratio-ms1_start_mz)/total_ms1_mz)*n_bin_ms1)]=1
+    for (rt, mz_ratio) in data:
+        im[int((rt - min_rt / total_rt) * max_cycle), int(((mz_ratio - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
 
     return im
 
 
 def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
-
-
     df = load_lib(path_parqet)
-    df=df[['Stripped.Sequence','Precursor.Charge','RT','Precursor.Mz']]
+    df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
     df_unique = df.drop_duplicates()
-    #build image
+    # build image
     total_ms1_mz = ms1_end_mz - ms1_start_mz
     n_bin_ms1 = int(total_ms1_mz // bin_mz)
     im = np.zeros([max_cycle, n_bin_ms1])
@@ -216,30 +226,38 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
         max_rt = np.max(df_unique['RT'])
     if min_rt is None:
         min_rt = np.min(df_unique['RT'])
-    total_rt = max_rt - min_rt +1e-3
-    for row in df_unique.iterrows() :
-        if 900 > int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1) >= 0:
-            im[int((row[1]['RT']-min_rt)/total_rt*max_cycle),int(((row[1]['Precursor.Mz']-ms1_start_mz)/total_ms1_mz)*n_bin_ms1)]=1
+    total_rt = max_rt - min_rt + 1e-3
+    for row in df_unique.iterrows():
+        if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0:
+            im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int(
+                ((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
 
     return im
 
 
-
 if __name__ == '__main__':
-    # fasta_similarity('fasta/uniparc_proteome_UP000033376_2025_03_14.fasta','fasta/uniparc_proteome_UP000033499_2025_03_14.fasta')
-    # im = build_ref_image_from_diann('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet', ms1_end_mz=1250, ms1_start_mz=350, bin_mz=1, max_cycle=663, rt_pred=[])
-    # plt.clf()
-    # mpimg.imsave('test_img.png', im)
-
     df = build_database_ref_peptide()
-    df_full = load_lib('fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
-    min_rt = df_full['RT'].min()
-    max_rt = df_full['RT'].max()
-    for spe in ['Proteus mirabilis','Klebsiella pneumoniae','Klebsiella oxytoca','Enterobacter hormaechei','Citrobacter freundii']:
-        im = build_ref_image_from_diann(
-            'fasta/optimal peptide set/'+spe+'.parquet', ms1_end_mz=1250,
-            ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
-        plt.clf()
-        mpimg.imsave(spe+'.png', im)
-        np.save(spe+'.npy', im)
-
+    for spe in SPECIES:
+
+        df_spe = df[df['Specie']==spe]
+        spe_list = df_spe['Sequence'].to_list()
+        with open('fasta/optimal peptide set/'+spe+'.fasta',"w") as f:
+            if not spe_list:
+                print('Empty : ',spe)
+            for pep in spe_list :
+                f.write(pep)
+
+    #
+    #
+    # df_full = load_lib(
+    #     'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
+    # min_rt = df_full['RT'].min()
+    # max_rt = df_full['RT'].max()
+    #
+    # for spe in SPECIES:
+    #     im = build_ref_image_from_diann(
+    #         'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
+    #         ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
+    #     plt.clf()
+    #     mpimg.imsave('img_ref/' + spe + '.png', im)
+    #     np.save(spe + '.npy', im)