Skip to content
Snippets Groups Projects
Commit 168de16c authored by Schneider Leo's avatar Schneider Leo
Browse files

selection of column-invariant Prosit data

parent 7f5c53ca
No related branches found
No related tags found
No related merge requests found
......@@ -45,6 +45,16 @@ def numerical_to_alphabetical(arr):
seq+=ALPHABET_UNMOD_REV[arr[i]]
return seq
def numerical_to_alphabetical_str(s):
    """Decode a stringified list of integer codes into a sequence string.

    ``s`` is expected to look like ``'[1, 5, 12]'``: square brackets are
    stripped, the remainder is split on commas, each item is parsed as an
    int, and every code is looked up in ``ALPHABET_UNMOD_REV``. The looked-up
    symbols are concatenated in order.

    Blank items are ignored, so ``'[]'``, ``''`` and a trailing comma decode
    cleanly instead of raising ``ValueError`` from ``int('')``.

    Args:
        s: String representation of a list of integer codes.

    Returns:
        The concatenation of ``ALPHABET_UNMOD_REV[code]`` for every code.

    Raises:
        ValueError: If a non-blank item is not a valid integer literal.
        KeyError / IndexError: If a code is missing from
            ``ALPHABET_UNMOD_REV`` (which one depends on its container type).
    """
    tokens = s.replace('[', '').replace(']', '').split(',')
    # Skip blank tokens: ''.split(',') returns [''], which int() rejects.
    codes = [int(tok) for tok in tokens if tok.strip()]
    return ''.join(ALPHABET_UNMOD_REV[code] for code in codes)
def align(dataset, reference):
seq_ref = reference['Sequence']
seq_common = dataset['Sequence']
......@@ -87,7 +97,7 @@ def compare_include_df(df, sub_df, save = True, path = 'temp.png'):
df_sub_value_list.append(r[1]['Retention time'])
except:
pass
plt.clf()
fig, ax = plt.subplots()
ax.scatter(df_sub_value_list, df_value_list)
x = np.array([min(df_value_list), max(df_value_list)])
......@@ -183,16 +193,44 @@ def compare_include_df(df, sub_df, save = True, path = 'temp.png'):
# data_align.to_pickle('database/data_ISA_dual_align.pkl')
#compare DIANN predictions to DIA measurements
# df_ori = pd.read_csv('database/data_train.csv')
# df_ori['Sequence']=df_ori['sequence']
# df_ori['Retention time']=df_ori['irt']
# df_diann = pd.read_csv('database/CIT_BASE_UP000584719_546.csv')
#
# df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
#
#
#
# df_diann_aligned = align(df_diann, df_ori)
#
# df_value_list, df_sub_value_list = compare_include_df(df_diann_aligned, df_ISA, True)
# Create augmented datasets: each thresholded Prosit selection (df_1..df_3)
# is concatenated with the ISA base table and written out as a CSV.
# NOTE(review): the df_ori / df_diann / df_ISA / align / compare_include_df
# statements below duplicate the commented-out DIANN comparison earlier in the
# file and look like removed diff lines interleaved with the new ones --
# confirm which of them should actually remain.
df_base = pd.read_pickle('database/data_ISA_aligned_prosit.pkl')
# Keep only the two columns that end up in the augmented CSVs.
df_base = df_base[['Sequence','Retention time']]
df_ori = pd.read_csv('database/data_train.csv')
# Duplicate the columns under the names that align() reads ('Sequence').
df_ori['Sequence']=df_ori['sequence']
df_ori['Retention time']=df_ori['irt']
df_diann = pd.read_csv('database/CIT_BASE_UP000584719_546.csv')
df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl')
# Sequences are stored as stringified integer lists; decode them to letters.
df_1['Sequence']= df_1['Sequence'].map(numerical_to_alphabetical_str)
df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl')
df_2['Sequence']= df_2['Sequence'].map(numerical_to_alphabetical_str)
df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl')
df_3['Sequence']= df_3['Sequence'].map(numerical_to_alphabetical_str)
df_augmented_1 = pd.concat([df_1,df_base],axis=0).reset_index(drop=True)
# Positional rename: assumes the concatenated frame has exactly two columns,
# in the order (sequence, retention time) -- TODO confirm the df_1 layout.
df_augmented_1.columns=['sequence','retention_time']
# to_csv keeps its default index=True, so the row index is written as well.
df_augmented_1.to_csv('database/data_ISA_augmented_1.csv')
df_diann_aligned = align(df_diann, df_ori)
df_augmented_2 = pd.concat([df_2,df_base],axis=0).reset_index(drop=True)
df_augmented_2.columns=['sequence','retention_time']
df_augmented_2.to_csv('database/data_ISA_augmented_2.csv')
df_value_list, df_sub_value_list = compare_include_df(df_diann_aligned, df_ISA, True)
df_augmented_3 = pd.concat([df_3,df_base],axis=0).reset_index(drop=True)
df_augmented_3.columns=['sequence','retention_time']
df_augmented_3.to_csv('database/data_ISA_augmented_3.csv')
\ No newline at end of file
......@@ -287,11 +287,31 @@ def RT_distrib(Y, f_name):
#
#ISA DATA
df = pd.read_pickle('database/data_ISA_aligned_prosit.pkl')
seq = df['Sequence'].unique()
# df = pd.read_pickle('database/data_ISA_aligned_prosit.pkl')
# seq = df['Sequence'].unique()
# rt = df['Retention time']
df_mean = df.groupby(['Sequence'])['Retention time'].mean()
# df_mean = df.groupby(['Sequence'])['Retention time'].mean()
# feq_aa(seq, plot=False, save=True, f_name='fig/histo_aa_ISA_unique.png')
# dist_long(seq, plot=False, save=True, f_name='fig/histo_length_ISA_unique.png')
RT_distrib(df_mean, 'fig/histo_RT_ISA_unique.png')
# RT_distrib(df_mean, 'fig/histo_RT_ISA_unique.png')
# Rank frequently observed ISA sequences by the spread of their retention
# times, from the most to the least variable.
df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
seq_list = []
var_list = []  # NOTE: holds standard deviations (np.std), despite the name
num_list = []
len_list = []
# Group on the scalar key rather than a one-element list: with a list key,
# pandas >= 2.0 yields 1-tuples as group names, so len(seq) would always be 1
# and seq_list would collect tuples instead of the sequences themselves.
for seq, gr in df_ISA.groupby('Sequence'):
    # Only sequences with more than 10 rows are kept.
    if len(gr) > 10:
        seq_list.append(seq)
        var_list.append(np.std(gr['Retention time']))
        num_list.append(len(gr))
        len_list.append(len(seq))

# idx sorts ascending; each parallel array is then reversed so that the
# largest spread comes first.
idx = np.argsort(var_list)
seq_list = np.array(seq_list)[idx][::-1]
var_list = np.array(var_list)[idx][::-1]
num_list = np.array(num_list)[idx][::-1]
len_list = np.array(len_list)[idx][::-1]
......@@ -252,11 +252,9 @@ def select_best_data(df1,df2,threshold):
df = pd.concat([df_group_1, df_group_2], axis=1)
df['mean']=(df['abs err 1']+df['abs err 2'])/2
df_res = df[df['mean']<threshold]
print(df_res.size)
df_res = df_res['seq']
df_res.columns = ['seq','temp']
df_res = df_res['seq']
good_seq=[]
good_rt=[]
for r in df1.iterrows() :
......@@ -312,7 +310,4 @@ df_2 = pd.read_csv('output/out_common_ISA_prosit_eval_2.csv')
df = select_best_data(df_1, df_2, 3)
df.to_pickle('database/data_prosit_threshold_3.pkl')
# compare_error(df_1,df_2,save=True,path='fig/custom model res/ISA_prosit_error_variation.png')
# compare_error(df_1,df_2,save=True,path='fig/custom model res/ISA_prosit_error_variation.png')
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -57,4 +57,4 @@ def digest(seq, format):
ind = cut(seq, format)
return cut_with_ind(seq, ind)
res = digest('MNPLLILTFVAAALAAPFDDDDKIVGGYNCEENSVPYQVSLNSGYHFCGGSLINEQWVVSAGHCYKSRIQVRLGEHNIEVLEGNEQFINAAKIIRHPQYDRKTLNNDIMLIKLSSRAVINARVSTISLPTAPPATGTKCLISGWGNTASSGADYPDELQCLDAPVLSQAKCEASYPGKITSNMFCVGFLEGGKDSCQGDSGGPVVCNGQLQGVVSWGDGCAQKNKPGVYTKVYNYVKWIKNTIAANS','alphabetical')
\ No newline at end of file
res = digest('FPTDDDDKIVGGYTCAANSIPYQVSLNSGSHFCGGSLINSQWVVSAAHCYKSRIQVRLGEHNIDVLEGNEQFINAAKIITHPNFNGNTLDNDIMLIKLSSPATLNSRVATVSLPRSCAAAGTECLISGWGNTKSSGSSYPSLLQCLKAPVLSDSSCKSSYPGQITGNMICVGFLEGGKDSCQGDSGGPVVCNGQLQGIVSWGYGCAQKNKPGVYTKVCNYVNWIQQTIAA','alphabetical')
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment