Skip to content
Snippets Groups Projects
Commit dc85d9ea authored by Schneider Leo's avatar Schneider Leo
Browse files

DIANN data align

parent 75fe5a29
No related branches found
No related tags found
No related merge requests found
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from loess.loess_1d import loess_1d from loess.loess_1d import loess_1d
from sympy.abc import alpha
import dataloader import dataloader
from dataloader import RT_Dataset from dataloader import RT_Dataset
...@@ -75,6 +76,29 @@ def filter_cysteine(df, col): ...@@ -75,6 +76,29 @@ def filter_cysteine(df, col):
data = df[df['cys']].reset_index(drop=True) data = df[df['cys']].reset_index(drop=True)
return data return data
def compare_include_df(df, sub_df, save=True, path='temp.png'):
    """Pair the retention times of the sequences shared by ``sub_df`` and ``df``.

    For every row of ``sub_df`` (in row order) whose ``'Sequence'`` also occurs
    in ``df``, the ``'Retention time'`` of the first matching row of ``df`` is
    paired with the ``'Retention time'`` of the ``sub_df`` row. Rows of
    ``sub_df`` whose sequence is absent from ``df`` are skipped.

    Args:
        df: DataFrame with ``'Sequence'`` and ``'Retention time'`` columns,
            used as the lookup table.
        sub_df: DataFrame with the same two columns; its rows drive the
            comparison.
        save: When true, a scatter plot (``sub_df`` values on x, ``df`` values
            on y) is written to ``path``. When false no figure is created.
        path: Destination of the scatter plot.

    Returns:
        ``(df_value_list, df_sub_value_list)``: two lists of equal length, the
        matched retention times from ``df`` and from ``sub_df`` respectively.

    Raises:
        KeyError: If either frame lacks the ``'Sequence'`` or
            ``'Retention time'`` column. (Previously a bare ``except`` hid this
            and two empty lists were returned.)
    """
    # Build the lookup once instead of scanning ``df`` for every row of
    # ``sub_df``. ``keep='first'`` preserves the original "first matching row"
    # choice; missing sequences are dropped because they can never match.
    reference = df.dropna(subset=['Sequence']).drop_duplicates(
        subset='Sequence', keep='first'
    )
    rt_by_sequence = dict(zip(reference['Sequence'], reference['Retention time']))

    df_value_list = []
    df_sub_value_list = []
    for sequence, sub_rt in zip(sub_df['Sequence'], sub_df['Retention time']):
        if sequence not in rt_by_sequence:
            continue
        df_value_list.append(rt_by_sequence[sequence])
        df_sub_value_list.append(sub_rt)

    if save:
        fig, ax = plt.subplots()
        ax.scatter(df_sub_value_list, df_value_list)
        fig.savefig(path)
        # Close (not just clear) the figure so repeated calls do not pile up
        # open figures.
        plt.close(fig)
    return df_value_list, df_sub_value_list
# data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True) # data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True)
# # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical) # # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
# #
...@@ -146,14 +170,38 @@ def filter_cysteine(df, col): ...@@ -146,14 +170,38 @@ def filter_cysteine(df, col):
# dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl') # dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
# dataset_train.to_pickle('database/data_DIA_ISA_55_test.pkl') # dataset_train.to_pickle('database/data_DIA_ISA_55_test.pkl')
data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True) # data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True)
data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True) # data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True)
data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True) # data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True)
data_ori['Sequence']=data_ori['sequence'] # data_ori['Sequence']=data_ori['sequence']
data_ori['Retention time']=data_ori['irt'] # data_ori['Retention time']=data_ori['irt']
data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True) # data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True)
data_align = align(data_train, data_ori) # data_align = align(data_train, data_ori)
#
# data_align.to_pickle('database/data_ISA_dual_align.pkl')
# Compare the DIA-NN retention times, after align() against the reference
# table, with the ISA retention times, and plot one against the other.
import scipy as sp
# Import the submodule explicitly: a bare ``import scipy`` does not guarantee
# that ``scipy.stats`` is loaded.
from scipy import stats
from sklearn.metrics import r2_score

df_ori = pd.read_csv('database/data_train.csv')
df_ori['Sequence'] = df_ori['sequence']
df_ori['Retention time'] = df_ori['irt']
# NOTE(review): this CSV is the one written by the spectral-library export at
# the bottom of this script, so that export must already have been run.
df_diann = pd.read_csv('database/CIT_BASE_UP000584719_546.csv')
df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
df_diann_aligned = align(df_diann, df_ori)
# df_value_list: values from df_diann_aligned; df_sub_value_list: values from
# df_ISA, restricted to the sequences present in both.
df_value_list, df_sub_value_list = compare_include_df(df_diann_aligned, df_ISA, True)

fig, ax = plt.subplots()
ax.scatter(df_sub_value_list, df_value_list, s=0.1, alpha=0.1)
# Fit and draw the regression in the same orientation as the scatter
# (x = ISA values, y = DIA-NN values); the previous fit was computed with the
# two axes swapped, so the red line did not correspond to the plotted points.
x = np.array([min(df_sub_value_list), max(df_sub_value_list)])
linreg = stats.linregress(df_sub_value_list, df_value_list)
# r2_score is the coefficient of determination between the two lists taken
# as-is (first argument as reference), not the squared correlation of the fit.
ax.annotate("r-squared = {:.3f}".format(r2_score(df_value_list, df_sub_value_list)), (0, 1))
ax.plot(x, linreg.intercept + linreg.slope * x, 'r')
fig.savefig('scatter_DIANN-ISA_aligned_on_prosit.png')
plt.close(fig)
import pandas as pd

# Collapse the spectral library to one mean RT per stripped peptide sequence
# and export it with the 'Sequence' / 'Retention time' column names.
spec_lib = pd.read_parquet('database/CIT_BASE_UP000584719_546.parquet')
lib_rt = spec_lib[['Stripped.Sequence', 'RT']]
df = lib_rt.groupby('Stripped.Sequence', as_index=False)['RT'].mean()
df = df.rename(columns={'Stripped.Sequence': 'Sequence', 'RT': 'Retention time'})
df.to_csv('database/CIT_BASE_UP000584719_546.csv')
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment