diff --git a/alignement.py b/alignement.py
index 744746df1eaab9754a9b29e6573aafb3723a0461..f1d6f581238acfa0aa66a2079748fbe8650538c7 100644
--- a/alignement.py
+++ b/alignement.py
@@ -1,6 +1,11 @@
 import numpy as np
 import pandas as pd
 from loess.loess_1d import loess_1d
+from scipy.stats import linregress
+from sklearn.metrics import r2_score
+# NOTE(review): `alpha` is not referenced by this change (`alpha=0.1` below is
+# a keyword argument); this looks like an IDE auto-import - confirm and drop.
+from sympy.abc import alpha
 
 import dataloader
 from dataloader import RT_Dataset
@@ -75,6 +80,54 @@ def filter_cysteine(df, col):
     data = df[df['cys']].reset_index(drop=True)
     return data
 
+
+def compare_include_df(df, sub_df, save=True, path='temp.png'):
+    """Pair the retention times of the sequences found in both frames.
+
+    Every row of ``sub_df`` whose 'Sequence' also occurs in ``df`` yields one
+    pair: the 'Retention time' of the first matching row of ``df`` and the
+    row's own 'Retention time'. Rows of ``sub_df`` without a match are
+    skipped. The pairs are drawn as a scatter plot (``sub_df`` on x, ``df``
+    on y).
+
+    Args:
+        df: DataFrame with 'Sequence' and 'Retention time' columns.
+        sub_df: DataFrame with the same two columns.
+        save: If true, write the plot to ``path`` and close the figure;
+            otherwise the figure is left open for the caller.
+        path: Image file written when ``save`` is true.
+
+    Returns:
+        ``(df_value_list, df_sub_value_list)``: the retention times taken
+        from ``df`` and from ``sub_df``, in ``sub_df`` row order.
+
+    Raises:
+        KeyError: If either frame lacks one of the two columns.
+    """
+    # Build the lookup once instead of scanning df for every row of sub_df.
+    # For a sequence that occurs several times in df, the first row is used.
+    reference_rt = (
+        df.dropna(subset=['Sequence'])
+        .drop_duplicates(subset='Sequence')
+        .set_index('Sequence')['Retention time']
+    )
+    shared = sub_df[sub_df['Sequence'].isin(reference_rt.index)]
+
+    df_value_list = shared['Sequence'].map(reference_rt).tolist()
+    df_sub_value_list = shared['Retention time'].tolist()
+
+    fig, ax = plt.subplots()
+    ax.scatter(df_sub_value_list, df_value_list)
+
+    if save:
+        fig.savefig(path)
+        # Close instead of clearing: a cleared figure stays registered with
+        # pyplot, so repeated calls would accumulate open figures.
+        plt.close(fig)
+
+    return df_value_list, df_sub_value_list
+
+
 # data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True)
 # # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
 #
@@ -146,14 +199,37 @@ def filter_cysteine(df, col):
 # dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
 # dataset_train.to_pickle('database/data_DIA_ISA_55_test.pkl')
 
-data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True)
-data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True)
-data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True)
-data_ori['Sequence']=data_ori['sequence']
-data_ori['Retention time']=data_ori['irt']
-data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True)
-data_align = align(data_train, data_ori)
+# data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True)
+# data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True)
+# data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True)
+# data_ori['Sequence']=data_ori['sequence']
+# data_ori['Retention time']=data_ori['irt']
+# data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True)
+# data_align = align(data_train, data_ori)
+#
+# data_align.to_pickle('database/data_ISA_dual_align.pkl')
+
+
+df_ori = pd.read_csv('database/data_train.csv')
+df_ori['Sequence'] = df_ori['sequence']
+df_ori['Retention time'] = df_ori['irt']
+df_diann = pd.read_csv('database/CIT_BASE_UP000584719_546.csv')
+
+df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
+
+df_diann_aligned = align(df_diann, df_ori)
 
-data_align.to_pickle('database/data_ISA_dual_align.pkl')
+df_value_list, df_sub_value_list = compare_include_df(df_diann_aligned, df_ISA, True)
 
-# df = filter_cysteine(data_train_1,'sequence')
+# Scatter ISA (x) against DIA-NN (y) retention times and overlay the
+# least-squares line fitted in that same orientation.
+fig, ax = plt.subplots()
+ax.scatter(df_sub_value_list, df_value_list, s=0.1, alpha=0.1)
+linreg = linregress(df_sub_value_list, df_value_list)
+x = np.array([min(df_sub_value_list), max(df_sub_value_list)])
+ax.plot(x, linreg.intercept + linreg.slope * x, 'r')
+# r2_score(y_true, y_pred): scores the ISA values as predictions of the
+# DIA-NN values, i.e. agreement with y = x rather than with the fitted line.
+ax.annotate("r-squared = {:.3f}".format(r2_score(df_value_list, df_sub_value_list)), (0, 1))
+fig.savefig('scatter_DIANN-ISA_aligned_on_prosit.png')
+plt.close(fig)
diff --git a/diann_processing.py b/diann_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..94e3bb3830d930d6461b51e8c7e725143704befb
--- /dev/null
+++ b/diann_processing.py
@@ -0,0 +1,19 @@
+import pandas as pd
+
+# Load only the two columns the retention-time table is built from.
+spec_lib = pd.read_parquet(
+    'database/CIT_BASE_UP000584719_546.parquet',
+    columns=['Stripped.Sequence', 'RT'],
+)
+
+# One row per stripped sequence holding the mean RT of its entries, renamed to
+# the 'Sequence' / 'Retention time' columns that alignement.py reads.
+df = (
+    spec_lib.groupby('Stripped.Sequence', as_index=False)['RT']
+    .mean()
+    .rename(columns={'Stripped.Sequence': 'Sequence', 'RT': 'Retention time'})
+)
+
+# index=False keeps the row index out of the file; otherwise it comes back as
+# an extra 'Unnamed: 0' column when the CSV is loaded with pd.read_csv.
+df.to_csv('database/CIT_BASE_UP000584719_546.csv', index=False)