diff --git a/alignement.py b/alignement.py index 969e06e77a679316e6df0762894e93c4807dc5dd..4c68763de793adaa3a60b20d049272cf08119300 100644 --- a/alignement.py +++ b/alignement.py @@ -182,15 +182,15 @@ def compare_include_df(df, sub_df, save = True, path = 'temp.png'): # dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl') # dataset_train.to_pickle('database/data_DIA_ISA_55_test.pkl') -# data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True) -# data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True) -# data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True) -# data_ori['Sequence']=data_ori['sequence'] -# data_ori['Retention time']=data_ori['irt'] +data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True) +data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True) +data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True) +data_ori['Sequence']=data_ori['sequence'] +data_ori['Retention time']=data_ori['irt'] # data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True) -# data_align = align(data_train_2, data_ori) -# -# data_align.to_pickle('database/data_ISA_dual_align_train.pkl') +data_align = align(data_train_1, data_ori) + +data_align.to_pickle('database/data_ISA_dual_align_test.pkl') #compare DIANN pred to DIA mesures @@ -211,32 +211,32 @@ def compare_include_df(df, sub_df, save = True, path = 'temp.png'): # create augmented dataset from ISA data + column invariant prosit peptides -df_base = pd.read_pickle('database/data_DIA_ISA_55_train.pkl') +df_base = pd.read_pickle('database/data_ISA_dual_align_train.pkl') df_base = df_base[['Sequence','Retention time']] -df_1 = pd.read_pickle('database/data_prosit_threshold_5.pkl') +df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl') df_1['Sequence']= df_1['Sequence'].map(numerical_to_alphabetical_str) -df_2 = pd.read_pickle('database/data_prosit_threshold_7.pkl') +df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl') df_2['Sequence']= df_2['Sequence'].map(numerical_to_alphabetical_str) -df_3 = pd.read_pickle('database/data_prosit_threshold_10.pkl') +df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl') df_3['Sequence']= df_3['Sequence'].map(numerical_to_alphabetical_str) df_augmented_1 = pd.concat([df_1,df_base],axis=0).reset_index(drop=True) df_augmented_1.columns=['sequence','irt'] df_augmented_1['state']='train' -df_augmented_1.to_csv('database/data_ISA_augmented_5.csv') +df_augmented_1.to_csv('database/data_ISA_augmented_1_30_01.csv') df_augmented_2 = pd.concat([df_2,df_base],axis=0).reset_index(drop=True) df_augmented_2.columns=['sequence','irt'] df_augmented_2['state']='train' -df_augmented_2.to_csv('database/data_ISA_augmented_7.csv') +df_augmented_2.to_csv('database/data_ISA_augmented_2_30_01.csv') df_augmented_3 = pd.concat([df_3,df_base],axis=0).reset_index(drop=True) df_augmented_3.columns=['sequence','irt'] df_augmented_3['state']='train' -df_augmented_3.to_csv('database/data_ISA_augmented_10.csv') +df_augmented_3.to_csv('database/data_ISA_augmented_3_30_01.csv') # testing intersection between test and augmented dataset diff --git a/database/data_ISA_dual_align_test.pkl b/database/data_ISA_dual_align_test.pkl new file mode 100644 index 0000000000000000000000000000000000000000..aa39bf8eb3916eb3dee3f5d43dfc1bf891d13c16 Binary files /dev/null and b/database/data_ISA_dual_align_test.pkl differ