Skip to content
Snippets Groups Projects
Commit 323f1738 authored by Schneider Leo
Browse files

datasets

parent 82794a08
Branches
No related tags found
No related merge requests found
...@@ -211,45 +211,45 @@ def compare_include_df(df, sub_df, save = True, path = 'temp.png'): ...@@ -211,45 +211,45 @@ def compare_include_df(df, sub_df, save = True, path = 'temp.png'):
# create augmented dataset from ISA data + column invariant prosit peptides # create augmented dataset from ISA data + column invariant prosit peptides
# df_base = pd.read_pickle('database/data_DIA_ISA_55_train.pkl') df_base = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl')
# df_base = df_base[['Sequence','Retention time']] df_base = df_base[['Sequence','Retention time']]
#
# df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl') df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl')
# df_1['Sequence']= df_1['Sequence'].map(numerical_to_alphabetical_str) df_1['Sequence']= df_1['Sequence'].map(numerical_to_alphabetical_str)
#
# df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl') df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl')
# df_2['Sequence']= df_2['Sequence'].map(numerical_to_alphabetical_str) df_2['Sequence']= df_2['Sequence'].map(numerical_to_alphabetical_str)
#
# df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl') df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl')
# df_3['Sequence']= df_3['Sequence'].map(numerical_to_alphabetical_str) df_3['Sequence']= df_3['Sequence'].map(numerical_to_alphabetical_str)
#
# df_augmented_1 = pd.concat([df_1,df_base],axis=0).reset_index(drop=True) df_augmented_1 = pd.concat([df_1,df_base],axis=0).reset_index(drop=True)
# df_augmented_1.columns=['sequence','irt'] df_augmented_1.columns=['sequence','irt']
# df_augmented_1['state']='train' df_augmented_1['state']='train'
# df_augmented_1.to_csv('database/data_ISA_augmented_1.csv') df_augmented_1.to_csv('database/data_ISA_augmented_1_30_01.csv')
#
# df_augmented_2 = pd.concat([df_2,df_base],axis=0).reset_index(drop=True) df_augmented_2 = pd.concat([df_2,df_base],axis=0).reset_index(drop=True)
# df_augmented_2.columns=['sequence','irt'] df_augmented_2.columns=['sequence','irt']
# df_augmented_2['state']='train' df_augmented_2['state']='train'
# df_augmented_2.to_csv('database/data_ISA_augmented_2.csv') df_augmented_2.to_csv('database/data_ISA_augmented_2_30_01.csv')
#
# df_augmented_3 = pd.concat([df_3,df_base],axis=0).reset_index(drop=True) df_augmented_3 = pd.concat([df_3,df_base],axis=0).reset_index(drop=True)
# df_augmented_3.columns=['sequence','irt'] df_augmented_3.columns=['sequence','irt']
# df_augmented_3['state']='train' df_augmented_3['state']='train'
# df_augmented_3.to_csv('database/data_ISA_augmented_3.csv') df_augmented_3.to_csv('database/data_ISA_augmented_3_30_01.csv')
# testing intersection between test and augmented dataset # testing intersection between test and augmented dataset
# df_sup = pd.read_pickle('database/data_prosit_threshold_3.pkl') df_sup = pd.read_pickle('database/data_prosit_threshold_3.pkl')
# df_test = pd.read_pickle('database/data_DIA_ISA_55_test.pkl') df_test = pd.read_pickle('database/data_DIA_ISA_55_test.pkl')
#
# inter = [] inter = []
# n = 0 n = 0
# df_sup['Sequence']= df_sup['Sequence'].map(numerical_to_alphabetical_str) df_sup['Sequence']= df_sup['Sequence'].map(numerical_to_alphabetical_str)
# groups_sup = df_sup.groupby('Sequence') groups_sup = df_sup.groupby('Sequence')
#
# groups_test = df_test.groupby('Sequence') groups_test = df_test.groupby('Sequence')
# for seq, _ in groups_sup: for seq, _ in groups_sup:
# for seq2, _ in groups_test: for seq2, _ in groups_test:
# if seq2==seq : if seq2==seq :
# inter.append(seq) inter.append(seq)
\ No newline at end of file \ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -43,7 +43,7 @@ class Model_Common_Transformer(nn.Module): ...@@ -43,7 +43,7 @@ class Model_Common_Transformer(nn.Module):
n_head=1, seq_length=25, n_head=1, seq_length=25,
charge_max=4, charge_frag_max=3, encoder_ff=512, encoder_num_layer=1, decoder_rt_num_layer=1, charge_max=4, charge_frag_max=3, encoder_ff=512, encoder_num_layer=1, decoder_rt_num_layer=1,
decoder_int_num_layer=1, acti='relu', norm=False): decoder_int_num_layer=1, acti='relu', norm=False):
self.charge_max = charge_max #TODO filter charge in train to be in 1-4 0-5 atm self.charge_max = charge_max
self.seq_length = seq_length self.seq_length = seq_length
self.nb_aa = nb_aa self.nb_aa = nb_aa
self.charge_frag_max = charge_frag_max self.charge_frag_max = charge_frag_max
......
...@@ -66,6 +66,7 @@ def alphabetical_to_numerical(seq): ...@@ -66,6 +66,7 @@ def alphabetical_to_numerical(seq):
dec += 4 dec += 4
return np.array(num) return np.array(num)
if __name__ == '__main__' :
# sources = ('data/intensity/sequence_train.npy', # sources = ('data/intensity/sequence_train.npy',
# 'data/intensity/intensity_train.npy', # 'data/intensity/intensity_train.npy',
...@@ -146,17 +147,17 @@ def alphabetical_to_numerical(seq): ...@@ -146,17 +147,17 @@ def alphabetical_to_numerical(seq):
# #
# np.save('data/intensity/irt_holdout.npy',data_int.irt) # np.save('data/intensity/irt_holdout.npy',data_int.irt)
df = pd.read_pickle('database/data_prosit_merged_holdout.pkl') # df = pd.read_pickle('database/data_prosit_merged_holdout.pkl')
print(len(df)) # print(len(df))
possible_charges = [1,2,3,4] # possible_charges = [1,2,3,4]
df = df[df['Charge'].isin(possible_charges)] # df = df[df['Charge'].isin(possible_charges)]
print(len(df)) # print(len(df))
df.to_pickle('database/data_prosit_merged_holdout.pkl') # df.to_pickle('database/data_prosit_merged_holdout.pkl')
df = pd.read_pickle('database/data_prosit_merged_train.pkl') # df = pd.read_pickle('database/data_prosit_merged_train.pkl')
print(len(df)) # print(len(df))
df=df.loc[df['Retention time']!=0] # df=df.loc[df['Retention time']!=0]
df = df[df['Charge'].isin(possible_charges)] # df = df[df['Charge'].isin(possible_charges)]
print(len(df)) # print(len(df))
df.to_pickle('database/data_prosit_merged_train.pkl') # df.to_pickle('database/data_prosit_merged_train.pkl')
pass
0% Loading. Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment