diff --git a/constant.py b/constant.py
index 485b7345933f08c3e5f3dbaa5becd92b99f63a5d..7d32614f7c0dce8dc4bb0bce3480dc615813c12f 100644
--- a/constant.py
+++ b/constant.py
@@ -20,8 +20,8 @@ ALPHABET_UNMOD = {
     "V": 18,
     "W": 19,
     "Y": 20,
-    "CaC": 22,
-    "OxM": 21
+    "OxM": 21,
+    "CaC": 22
 }
 
 ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}
\ No newline at end of file
diff --git a/data/data_PXD006109/data_aligned_plasma.csv b/data/data_PXD006109/data_aligned_plasma.csv
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/data/data_exploration.py b/data/data_exploration.py
index 3974273b127ad534af6861735b5af5d876100a1e..808983e9935aff445ed8135efbe81c0ab408dbde 100644
--- a/data/data_exploration.py
+++ b/data/data_exploration.py
@@ -1,8 +1,10 @@
 import numpy as np
 import matplotlib.pyplot as plt
 import pandas as pd
+from numpy.ma.core import shape
+
+from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
 
-from constant import ALPHABET_UNMOD
 
 def length_distribution(data, plot=False, save=False, f_name='fig/data_exploration/length_distribution.png'):
     maximum = 31
@@ -36,6 +38,9 @@ def aa_distribution(data, plot=False, save=False, f_name='fig/data_exploration/a
 
     freq = 100 * freq / freq.sum()
 
+    for i in range(len(freq)) :
+        print(freq[i],ALPHABET_UNMOD_REV[i])
+
     dict_freq = ALPHABET_UNMOD.copy()
     for aa in list(ALPHABET_UNMOD.keys()):
         dict_freq[aa] = freq[ALPHABET_UNMOD[aa]]
@@ -62,25 +67,25 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e
 
 def main():
     #data prosit
-    df = pd.read_csv('data_prosit/data.csv')
-    _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png')
-    _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png')
-    retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png')
-    df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
-    _ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png')
-    _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png')
-    retention_time_distribution(df_unique['irt_scaled'], False, True,
-                                '../fig/data_exploration/retention_time_distribution_prosit_unique.png')
+    # df = pd.read_csv('data_prosit/data.csv')
+    # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png')
+    # _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png')
+    # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png')
+    # df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
+    # _ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png')
+    # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png')
+    # retention_time_distribution(df_unique['irt_scaled'], False, True,
+    #                             '../fig/data_exploration/retention_time_distribution_prosit_unique.png')
 
     #prosit no cysteine
-    df = pd.read_csv('data_prosit/data_noc.csv')
-    _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png')
-    _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png')
-    retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png')
-    df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
-    _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png')
-    _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png')
-    retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png')
+    # df = pd.read_csv('data_prosit/data_noc.csv')
+    # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png')
+    # _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png')
+    # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png')
+    # df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean()
+    # _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png')
+    # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png')
+    # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png')
 
     #isa
     # df = pd.read_csv('data_ISA/data_aligned_isa.csv')
@@ -102,6 +107,16 @@ def main():
     # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_unique.png')
     # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_unique.png')
 
+    #isa mox
+    df = pd.read_csv('data_ISA_mox/data_aligned_isa_noc.csv')
+    _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox.png')
+    _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox.png')
+    retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox.png')
+    df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean()
+    _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox_unique.png')
+    _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox_unique.png')
+    retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox_unique.png')
+
 
 if __name__ == '__main__':
     main()
diff --git a/data/data_processing.py b/data/data_processing.py
index fec3d58bf13cc69e28b148e1dd334ac7681314aa..b7495bdff889e133431d56dabadb54d5077e9036 100644
--- a/data/data_processing.py
+++ b/data/data_processing.py
@@ -18,24 +18,26 @@ def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref):
 
     ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref))
     inter = set(ind_dict_ref).intersection(seq_common)
+    print(len(inter))
 
     ind_dict_ref = [ind_dict_ref[x] for x in inter]
 
     indices_common = dict((k, i) for i, k in enumerate(seq_common))
     indices_common = [indices_common[x] for x in inter]
 
+
     rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index()
     rt_data = dataset_unique[column_dataset][indices_common].reset_index()
 
-    plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist())
+    plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1)
     plt.savefig('test.png')
 
     xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()),
                                 xnew=dataset[column_dataset],
-                                degree=1, frac=0.5,
+                                degree=1, frac=0.25,
                                 npoints=None, rotate=False, sigy=None)
 
-    plt.scatter(xout, yout)
+    plt.scatter(xout, yout, s=0.1)
     plt.savefig('test_2.png')
 
     dataset[column_dataset] = yout
@@ -115,46 +117,41 @@ def numerical_to_alphabetical_str(s):
 
 def main():
     ref = pd.read_csv('data_prosit/data.csv')
-    df_ISA = pd.read_csv('data_ISA_mox/data_isa.csv')
-    df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled','sequence', 'mod_sequence')
-    df_ISA_aligned.to_csv('data_ISA_mox/data_aligned_isa.csv', index=False)
-
-    ref = pd.read_csv('data_prosit/data_noc.csv')
-    df_ISA = pd.read_csv('data_ISA_mox/data_isa_noc.csv')
+    df_ISA = pd.read_csv('data_PXD006109/data_plasma.csv')
     df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled','sequence', 'mod_sequence')
-    df_ISA_aligned.to_csv('data_ISA_mox/data_aligned_isa_noc.csv', index=False)
+    df_ISA_aligned.to_csv('data_PXD006109/data_aligned_plasma.csv', index=False)
 
 
 
 
 if __name__ == '__main__':
-    # main()
-
-    df_base = pd.read_csv('data_ISA/data_aligned_isa_noc.csv')
-    df_base = df_base[['sequence', 'irt_scaled','state']]
-    t = [0.7,1,10]
-    #reste 07 1 et all
-    name = ['07','1','all']
-    for i in range(len(name)):
-
-
-        #creating augmented datasets
-        print('thresold {} en cours'.format(name[i]))
-        #
-        df_0 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_0.csv')
-        df_1 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_1.csv')
-        df_2 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_2.csv')
-        df_3 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_3.csv')
-        df_4 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_4.csv')
-
-        list_df = [df_0,df_1,df_2,df_3,df_4]
-        df = select_best_data(list_df, t[i])
-        df.to_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i]))
-        df = pd.read_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i]))
-        df['state'] = 'train'
-        df['sequence'] = df['sequence'].map(numerical_to_alphabetical_str)
-        df_augmented_1 = pd.concat([df, df_base], axis=0).reset_index(drop=True)
-        df_augmented_1.columns = ['sequence', 'irt_scaled','state']
-
-        df_augmented_1.to_csv('data_ISA_mox/isa_data_augmented_{}.csv'.format(name[i]), index=False)
-        print(df_augmented_1.shape)
\ No newline at end of file
+    main()
+
+    # df_base = pd.read_csv('data_ISA/data_aligned_isa_noc.csv')
+    # df_base = df_base[['sequence', 'irt_scaled','state']]
+    # t = [0.7,1,10]
+    # #reste 07 1 et all
+    # name = ['07','1','all']
+    # for i in range(len(name)):
+    #
+    #
+    #     #creating augmented datasets
+    #     print('threshold {} en cours'.format(name[i]))
+    #     #
+    #     df_0 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_0.csv')
+    #     df_1 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_1.csv')
+    #     df_2 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_2.csv')
+    #     df_3 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_3.csv')
+    #     df_4 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_4.csv')
+    #
+    #     list_df = [df_0,df_1,df_2,df_3,df_4]
+    #     df = select_best_data(list_df, t[i])
+    #     df.to_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i]))
+    #     df = pd.read_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i]))
+    #     df['state'] = 'train'
+    #     df['sequence'] = df['sequence'].map(numerical_to_alphabetical_str)
+    #     df_augmented_1 = pd.concat([df, df_base], axis=0).reset_index(drop=True)
+    #     df_augmented_1.columns = ['sequence', 'irt_scaled','state']
+    #
+    #     df_augmented_1.to_csv('data_ISA_mox/isa_data_augmented_{}.csv'.format(name[i]), index=False)
+    #     print(df_augmented_1.shape)
\ No newline at end of file
diff --git a/data/data_viz.py b/data/data_viz.py
index 9e0898330c1480fb5132775b85dbb2fd3d6cc88e..dfcb91c2247db9ededaa5d8c761f46f027b236a4 100644
--- a/data/data_viz.py
+++ b/data/data_viz.py
@@ -166,9 +166,10 @@ def plot_res():
 def calc_and_plot_res():
     all_data=[]
     base = 'out_early_stop_'
-    for name in ['ISA_noc_ISA_noc','ISA_aug_005_ISA_noc','ISA_aug_01_ISA_noc','ISA_aug_02_ISA_noc','ISA_aug_03_ISA_noc',
-                 'ISA_aug_04_ISA_noc','ISA_aug_05_ISA_noc','ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc',
-                 'ISA_aug_all_ISA_noc','prosit_ISA_noc']:
+    for name in ['ISA_noc_mox_ISA_noc_mox','ISA_noc_aug_005_mox_ISA_noc_mox','ISA_noc_aug_01_mox_ISA_noc_mox',
+                 'ISA_noc_aug_02_mox_ISA_noc_mox','ISA_noc_aug_03_mox_ISA_noc_mox','ISA_noc_aug_04_mox_ISA_noc_mox',
+                 'ISA_noc_aug_05_mox_ISA_noc_mox','ISA_noc_aug_07_mox_ISA_noc_mox','ISA_noc_aug_1_mox_ISA_noc_mox',
+                 'ISA_noc_aug_all_mox_ISA_noc_mox','prosit_mod_ISA_noc_mox']:
         print(name)
         r2_list=[]
         for index in range(10):
@@ -184,7 +185,7 @@ def calc_and_plot_res():
     axs.set_xticks([y + 1 for y in range(len(all_data))],
                    labels=[ 'ISA_noc', 'Augm 0.05', 'Augm 0.1', 'Augm 0.2', 'Augm 0.3', 'Augm 0.4', 'Augm 0.5', 'Augm 0.7',
                            'Augm 1', 'Augm all', 'Prosit', ])
-    plt.savefig('../fig/model perf/summary_early_stop.png')
+    plt.savefig('../fig/model perf/summary_early_stop_mox.png')
 
 def error_by_methionine(dataframe):
     def fonc(a):
@@ -206,6 +207,7 @@ def error_by_methionine(dataframe):
 
 
 if __name__ == '__main__' :
+    calc_and_plot_res()
     # base = ['ISA_noc_ISA_noc','prosit_ISA_noc', 'ISA_noc_prosit', 'prosit_prosit']
     # augmented = ['ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc','ISA_aug_all_ISA_noc']
     # for f_suffix_name in augmented:
@@ -218,6 +220,6 @@ if __name__ == '__main__' :
             # histo_length_by_error(df, bins=10, display=False, save=True, path='../fig/model perf/histo_length_{}_{}.png'.format(f_suffix_name,number))
     # calc_and_plot_res()
 
-    for number in range(10):
-        df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number)))
-        error_by_methionine(df)
+    # for number in range(10):
+    #     df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number)))
+    #     error_by_methionine(df)
diff --git a/data/msms_processing.py b/data/msms_processing.py
index 9cebfb6387b81a07dfdcf1f8ecb780bfd54b55c8..4ea3e2cadee8604122c4623746afe3e520705805 100644
--- a/data/msms_processing.py
+++ b/data/msms_processing.py
@@ -3,7 +3,7 @@ import numpy as np
 import random
 from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
 
-def load_data(msms_filet_path='data/msms.txt', score_treshold=70):
+def load_data(msms_filet_path='data/msms_plasma.txt', score_treshold=70):
     data = pd.read_csv(msms_filet_path, sep='\t')
     data_compact = data[['Modified sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']]
     data_filtered = data_compact[data_compact['Score'] > score_treshold]
@@ -23,7 +23,7 @@ def convert(l):
         num_str)]
 
 def convert_mod_to_prosit(s):
-    return s.replace('(Oxidation (M))','-OxM-').replace('(Acetyl (Protein N-term))','') #acetyl reliquat d'une mauvaise analyse, a enlever a terme
+    return s.replace('M(ox)','-OxM-').replace('(ac)','@')
 
 def numerical_to_alphabetical(arr):
     seq = ''
@@ -31,6 +31,14 @@ def numerical_to_alphabetical(arr):
         seq+=ALPHABET_UNMOD_REV[arr[i]]
     return seq
 
+def filter_acetyl(df):
+    df2 = df.copy()
+    df2['acetyl']=df['sequence'].map(lambda x: '@' in x)
+    df2 = df2[df2['acetyl']==False]
+    df_final = df2.drop('acetyl', axis=1)
+    df_final.reset_index(drop=True, inplace=True)
+    return df_final
+
 def filter_cysteine(df):
     df2 = df.copy()
     df2['cysteine']=df['sequence'].map(lambda x: 'C' in x)
@@ -79,19 +87,12 @@ def add_split_column(data, split=(0.7,0.15,0.15)):
     return data_split
 
 def main():
-    df_03_02 = load_data('data_ISA_mox/msms_03_02.txt', 70)
-    df_16_01 = load_data('data_ISA_mox/msms_16_01.txt', 70)
-    df_20_01 = load_data('data_ISA_mox/msms_20_01.txt', 70)
-    df_30_01 = load_data('data_ISA_mox/msms_30_01.txt', 70)
-    merged_df = pd.concat([df_20_01, df_30_01, df_16_01, df_03_02], ignore_index=True)
+    df_plasma = load_data('data_PXD006109/msms_plasma.txt', 70)
+    merged_df = pd.concat([df_plasma], ignore_index=True)
     final_df = add_split_column(merged_df)
-    final_df.to_csv('data_ISA_mox/data_isa.csv', index=False)
-    df2 = filter_cysteine(final_df)
-    df2.to_csv('data_ISA_mox/data_isa_noc.csv', index=False)
+    final_df = filter_acetyl(final_df)
+    final_df.to_csv('data_PXD006109/data_plasma.csv', index=False)
 
-    # final_df= pd.read_csv('data_prosit/data.csv')
-    # df2 = filter_cysteine(final_df)
-    # df2.to_csv('data_prosit/data_noc.csv', index=False)
 
 if __name__ == '__main__':
     main()
\ No newline at end of file