diff --git a/alignement.py b/alignement.py
index 87a97b87700518a0b23ef66a569e41b30052cd3f..58c5969325cab3a60f86e4793584a4ba14f2ed37 100644
--- a/alignement.py
+++ b/alignement.py
@@ -7,7 +7,7 @@ from common_dataset import Common_Dataset
 import matplotlib.pyplot as plt
 
 ALPHABET_UNMOD = {
-    "_": 0,
+    "": 0,
     "A": 1,
     "C": 2,
     "D": 3,
@@ -67,14 +67,27 @@ def align(dataset, reference):
     return dataset
 
 
-data_ori = RT_Dataset(None, 'database/data.csv', 'train', 25).data
-data_ori ['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
-# data_ori = pd.read_pickle('database/data_01_16_DIA_ISA_55.pkl')
-# data_ori = Common_Dataset(data_ori, 30).data
-data_train = pd.read_pickle('database/data_DIA_ISA_55_train.pkl')
-# data_train = Common_Dataset(data_train, 30).data
-#
+data_ori = RT_Dataset(None, 'database/data_train.csv', 'train', 25).data
+data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
+
+data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_16_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_17_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_20_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_23_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_24_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True)
 data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_30_01_aligned.pkl')
 #
 plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1)
 plt.savefig('test_align_2.png')
diff --git a/database/data_DIA_ISA_55_test.pkl b/database/data_DIA_ISA_55_test.pkl
index 96dc65f3e991f18cdca1a7e5214afa0adf3f68a3..b7186f96f9507a7ce52296e19e8c4b7b6da6a15a 100644
Binary files a/database/data_DIA_ISA_55_test.pkl and b/database/data_DIA_ISA_55_test.pkl differ
diff --git a/database/data_DIA_ISA_55_train.pkl b/database/data_DIA_ISA_55_train.pkl
index 96dc65f3e991f18cdca1a7e5214afa0adf3f68a3..435b980676ffb348a88160fa92b574236aad8f75 100644
Binary files a/database/data_DIA_ISA_55_train.pkl and b/database/data_DIA_ISA_55_train.pkl differ
diff --git a/msms_processing.py b/msms_processing.py
index 6c3bbb8dd3d06f0f8da5204ce487387e3822e88e..bde575fa0ed43c3f5a374af6d4cbb63c7ddc21ee 100644
--- a/msms_processing.py
+++ b/msms_processing.py
@@ -3,6 +3,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 import json
+import random
 
 
 def load_data(msms_filet_path='data/msms.txt', score_treshold=70):
@@ -76,13 +77,42 @@ def mscatter(x,y, ax=None, m=None, **kw):
             paths.append(path)
         sc.set_paths(paths)
     return sc
-
+# Datasets acquired with gradient 10:
+#   16/01, 20/01, 30/01
+# Datasets acquired with gradient 3:
+#   17/01, 23/01, 24/01
 if __name__ == '__main__':
-    pass
-    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', i)
-    # data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', i)
-    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', i)
-    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)
+    data_1 = pd.read_pickle('database/data_DIA_16_01_aligned.pkl')
+    data_2 = pd.read_pickle('database/data_DIA_17_01_aligned.pkl')
+    data_3 = pd.read_pickle('database/data_DIA_20_01_aligned.pkl')
+    data_4 = pd.read_pickle('database/data_DIA_23_01_aligned.pkl')
+    data_5 = pd.read_pickle('database/data_DIA_24_01_aligned.pkl')
+    data_6 = pd.read_pickle('database/data_DIA_30_01_aligned.pkl')
+    data = pd.concat([data_1, data_2, data_3, data_4, data_5, data_6], ignore_index=True)
+
+    num_total = len(data)
+    train_size = np.floor(0.8*num_total)
+    list_gr=[]
+    train_set = []
+    test_set=[]
+    s = 0
+    groups = data.groupby('Sequence')
+    for seq, gr in groups:
+        list_gr.append(gr)
+    random.shuffle(list_gr)
+    for gr in list_gr :
+        if s < train_size :
+            train_set.append(gr)
+            s+= len(gr)
+        else :
+            test_set.append(gr)
+
+    dataset_train = pd.concat(train_set).reset_index(drop=True)
+    dataset_test = pd.concat(test_set).reset_index(drop=True)
+
+
+
+
     # err_rt = []
     # err_spec = []
     # nb_data = []