diff --git a/alignement.py b/alignement.py
index 744746df1eaab9754a9b29e6573aafb3723a0461..f1d6f581238acfa0aa66a2079748fbe8650538c7 100644
--- a/alignement.py
+++ b/alignement.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 from loess.loess_1d import loess_1d
+from sympy.abc import alpha
 
 import dataloader
 from dataloader import RT_Dataset
@@ -75,6 +76,29 @@ def filter_cysteine(df, col):
     data = df[df['cys']].reset_index(drop=True)
     return data
 
+def compare_include_df(df, sub_df, save = True, path = 'temp.png'):
+    """Pair up retention times of the sequences shared by `sub_df` and `df`.
+
+    For each row of `sub_df` whose 'Sequence' also occurs in `df`, the first
+    matching 'Retention time' of `df` and the row's own value are collected;
+    unmatched rows are skipped. The pairs are scatter-plotted, and the figure
+    is saved to `path` when `save` is true. Returns (df_value_list, df_sub_value_list).
+    """
+    # One lookup table built up front instead of re-filtering `df` for every row.
+    known = df.dropna(subset=['Sequence']).drop_duplicates('Sequence')
+    first_rt = dict(zip(known['Sequence'], known['Retention time']))
+    df_value_list, df_sub_value_list = [], []
+    for seq, rt in zip(sub_df['Sequence'], sub_df['Retention time']):
+        if seq in first_rt:  # only a missing sequence is skipped; other errors now surface
+            df_value_list.append(first_rt[seq])
+            df_sub_value_list.append(rt)
+    fig, ax = plt.subplots()
+    ax.scatter(df_sub_value_list, df_value_list)
+    if save:
+        fig.savefig(path)
+    plt.close(fig)  # close rather than clear, so repeated calls do not pile up figures
+    return df_value_list, df_sub_value_list
+
 # data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True)
 # # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
 #
@@ -146,14 +170,38 @@ def filter_cysteine(df, col):
 # dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
 # dataset_train.to_pickle('database/data_DIA_ISA_55_test.pkl')
 
-data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True)
-data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True)
-data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True)
-data_ori['Sequence']=data_ori['sequence']
-data_ori['Retention time']=data_ori['irt']
-data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True)
-data_align = align(data_train, data_ori)
+# data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True)
+# data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True)
+# data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True)
+# data_ori['Sequence']=data_ori['sequence']
+# data_ori['Retention time']=data_ori['irt']
+# data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True)
+# data_align = align(data_train, data_ori)
+#
+# data_align.to_pickle('database/data_ISA_dual_align.pkl')
+
+
+
+df_ori = pd.read_csv('database/data_train.csv')
+df_ori['Sequence']=df_ori['sequence']  # expose the columns under the names compare_include_df reads
+df_ori['Retention time']=df_ori['irt']
+df_diann = pd.read_csv('database/CIT_BASE_UP000584719_546.csv')  # same path diann_processing.py writes
+# Same path that the commented-out data_align.to_pickle(...) call above wrote to.
+df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
+
+# align() is defined outside this hunk.
+# NOTE(review): presumably it maps df_diann retention times onto df_ori's scale -- confirm.
+df_diann_aligned = align(df_diann, df_ori)
 
-data_align.to_pickle('database/data_ISA_dual_align.pkl')
+df_value_list, df_sub_value_list = compare_include_df(df_diann_aligned, df_ISA, True)
 
-# df = filter_cysteine(data_train_1,'sequence')
+import scipy as sp
+from sklearn.metrics import r2_score
+fig, ax = plt.subplots()
+ax.scatter(df_sub_value_list, df_value_list, s=0.1,alpha=0.1)
+x = np.array([min(df_sub_value_list), max(df_sub_value_list)])
+linreg = sp.stats.linregress(df_sub_value_list, df_value_list)  # regress y on x: same axes as the scatter
+ax.annotate("r-squared = {:.3f}".format(r2_score(df_value_list, df_sub_value_list)), (0, 1))
+plt.plot(x, linreg.intercept + linreg.slope * x, 'r')
+plt.savefig('scatter_DIANN-ISA_aligned_on_prosit.png')
+plt.clf()
diff --git a/diann_processing.py b/diann_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..94e3bb3830d930d6461b51e8c7e725143704befb
--- /dev/null
+++ b/diann_processing.py
@@ -0,0 +1,8 @@
+import pandas as pd
+
+# Collapse the library table to one mean 'RT' per 'Stripped.Sequence'.
+spec_lib = pd.read_parquet('database/CIT_BASE_UP000584719_546.parquet')
+df = spec_lib.groupby('Stripped.Sequence')['RT'].mean().reset_index()
+df = df.rename(columns={'Stripped.Sequence': 'Sequence', 'RT': 'Retention time'})
+# index=False: the RangeIndex carries no information and would be read back as 'Unnamed: 0'.
+df.to_csv('database/CIT_BASE_UP000584719_546.csv', index=False)