Skip to content
Snippets Groups Projects
Commit 323f1738 authored by Schneider Leo's avatar Schneider Leo
Browse files

datasets

parent 82794a08
No related branches found
No related tags found
No related merge requests found
......@@ -210,46 +210,46 @@ def compare_include_df(df, sub_df, save = True, path = 'temp.png'):
#create augmented dataset from ISA data + column invariant prosit peptides
# df_base = pd.read_pickle('database/data_DIA_ISA_55_train.pkl')
# df_base = df_base[['Sequence','Retention time']]
#
# df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl')
# df_1['Sequence']= df_1['Sequence'].map(numerical_to_alphabetical_str)
#
# df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl')
# df_2['Sequence']= df_2['Sequence'].map(numerical_to_alphabetical_str)
#
# df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl')
# df_3['Sequence']= df_3['Sequence'].map(numerical_to_alphabetical_str)
#
# df_augmented_1 = pd.concat([df_1,df_base],axis=0).reset_index(drop=True)
# df_augmented_1.columns=['sequence','irt']
# df_augmented_1['state']='train'
# df_augmented_1.to_csv('database/data_ISA_augmented_1.csv')
#
# df_augmented_2 = pd.concat([df_2,df_base],axis=0).reset_index(drop=True)
# df_augmented_2.columns=['sequence','irt']
# df_augmented_2['state']='train'
# df_augmented_2.to_csv('database/data_ISA_augmented_2.csv')
#
# df_augmented_3 = pd.concat([df_3,df_base],axis=0).reset_index(drop=True)
# df_augmented_3.columns=['sequence','irt']
# df_augmented_3['state']='train'
# df_augmented_3.to_csv('database/data_ISA_augmented_3.csv')
# Create an augmented dataset from ISA data + column-invariant Prosit peptides.
# Base set: ISA DIA training data, reduced to the two columns used downstream.
df_base = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl')[['Sequence', 'Retention time']]
# Threshold-1 Prosit peptides; sequences are stored numerically encoded and
# must be mapped back to their alphabetical (amino-acid letter) form.
df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl')
df_1['Sequence'] = df_1['Sequence'].map(numerical_to_alphabetical_str)
#testing intersection between test and augmented dataset
# df_sup = pd.read_pickle('database/data_prosit_threshold_3.pkl')
# df_test = pd.read_pickle('database/data_DIA_ISA_55_test.pkl')
#
# inter = []
# n = 0
# df_sup['Sequence']= df_sup['Sequence'].map(numerical_to_alphabetical_str)
# groups_sup = df_sup.groupby('Sequence')
#
# groups_test = df_test.groupby('Sequence')
# for seq, _ in groups_sup:
# for seq2, _ in groups_test:
# if seq2==seq :
# inter.append(seq)
\ No newline at end of file
# Load the remaining Prosit threshold sets (df_1 and df_base are prepared
# above) and decode their numerical sequences to alphabetical form.
df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl')
df_2['Sequence'] = df_2['Sequence'].map(numerical_to_alphabetical_str)
df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl')
df_3['Sequence'] = df_3['Sequence'].map(numerical_to_alphabetical_str)

# Append the ISA base data to each threshold set and export it in the
# (sequence, irt, state) layout expected by the training pipeline.
# The three copy-pasted concat/rename/export triples are folded into one loop.
for threshold, df_sup_set in enumerate((df_1, df_2, df_3), start=1):
    df_augmented = pd.concat([df_sup_set, df_base], axis=0).reset_index(drop=True)
    df_augmented.columns = ['sequence', 'irt']
    df_augmented['state'] = 'train'
    df_augmented.to_csv('database/data_ISA_augmented_%d_30_01.csv' % threshold)

# testing intersection between test and augmented dataset
df_sup = pd.read_pickle('database/data_prosit_threshold_3.pkl')
df_test = pd.read_pickle('database/data_DIA_ISA_55_test.pkl')
df_sup['Sequence'] = df_sup['Sequence'].map(numerical_to_alphabetical_str)
# Set intersection of the unique sequences: O(n + m) instead of the previous
# O(n * m) nested loop over two groupbys. sorted() reproduces the old result
# order, since groupby keys are unique and iterated in sorted order.
inter = sorted(set(df_sup['Sequence']) & set(df_test['Sequence']))
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -43,7 +43,7 @@ class Model_Common_Transformer(nn.Module):
n_head=1, seq_length=25,
charge_max=4, charge_frag_max=3, encoder_ff=512, encoder_num_layer=1, decoder_rt_num_layer=1,
decoder_int_num_layer=1, acti='relu', norm=False):
self.charge_max = charge_max #TODO filter charge in train to be in 1-4 0-5 atm
self.charge_max = charge_max
self.seq_length = seq_length
self.nb_aa = nb_aa
self.charge_frag_max = charge_frag_max
......
......@@ -66,97 +66,98 @@ def alphabetical_to_numerical(seq):
dec += 4
return np.array(num)
# sources = ('data/intensity/sequence_train.npy',
# 'data/intensity/intensity_train.npy',
# 'data/intensity/precursor_charge_train.npy',
# 'data/intensity/precursor_charge_train.npy')
#
#
# data_rt = pd.read_csv('database/data_unique_ptms.csv')
# data_rt['Sequence']=data_rt['mod_sequence']
#
# padding(data_rt, 'Sequence', 30)
# data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical)
#
# data_rt =data_rt.drop(columns='mod_sequence')
#
# data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3])
#
# seq_rt = data_rt.Sequence
# seq_int = data_int.seq
# seq_rt = seq_rt.tolist()
# seq_int = seq_int.tolist()
# seq_rt = [tuple(l) for l in seq_rt]
# seq_int = [tuple(l) for l in seq_int]
#
# ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt))
# inter = set(ind_dict_rt).intersection(seq_int)
# ind_dict_rt = [ind_dict_rt[x] for x in inter]
#
#
# data_int.irt = np.zeros(data_int.energy.shape)
#
# i=0
# for ind in ind_dict_rt :
# print(i,'/',len(ind_dict_rt))
# i+=1
# ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]]
# data_int.irt[ind_int] = data_rt.irt[ind]
#
# np.save('data/intensity/irt_train.npy',data_int.irt)
# sources = ('data/intensity/sequence_holdout.npy',
# 'data/intensity/intensity_holdout.npy',
# 'data/intensity/precursor_charge_holdout.npy',
# 'data/intensity/precursor_charge_holdout.npy')
#
#
# data_rt = pd.read_csv('database/data_unique_ptms.csv')
# data_rt['Sequence']=data_rt['mod_sequence']
#
# padding(data_rt, 'Sequence', 30)
# data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical)
#
# data_rt =data_rt.drop(columns='mod_sequence')
#
# data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3])
#
# seq_rt = data_rt.Sequence
# seq_int = data_int.seq
# seq_rt = seq_rt.tolist()
# seq_int = seq_int.tolist()
# seq_rt = [tuple(l) for l in seq_rt]
# seq_int = [tuple(l) for l in seq_int]
#
# ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt))
# inter = set(ind_dict_rt).intersection(seq_int)
# ind_dict_rt = [ind_dict_rt[x] for x in inter]
#
#
# data_int.irt = np.zeros(data_int.energy.shape)
#
# i=0
# for ind in ind_dict_rt :
# print(i,'/',len(ind_dict_rt))
# i+=1
# ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]]
# data_int.irt[ind_int] = data_rt.irt[ind]
#
# np.save('data/intensity/irt_holdout.npy',data_int.irt)
# One-off cleanup of the merged Prosit pickles, overwriting them in place:
# keep only precursor charges 1-4 and, for the train split only, drop rows
# whose retention time is 0 (presumably missing values — TODO confirm).
# Sizes are printed before and after filtering for a quick sanity check.
possible_charges = [1, 2, 3, 4]
for split, drop_zero_rt in (('holdout', False), ('train', True)):
    path = 'database/data_prosit_merged_' + split + '.pkl'
    df = pd.read_pickle(path)
    print(len(df))  # row count before filtering
    if drop_zero_rt:
        df = df.loc[df['Retention time'] != 0]
    df = df[df['Charge'].isin(possible_charges)]
    print(len(df))  # row count after filtering
    df.to_pickle(path)
if __name__ == '__main__' :
    # NOTE(review): every step below is one-off data preparation that was run
    # once and then commented out, so executing this script directly is
    # currently a no-op. The commented code is kept as a record of how the
    # intensity/irt .npy files and the filtered merged-Prosit pickles were
    # produced. First section: join irt values onto the intensity *train* set
    # by matching padded/numerically-encoded sequences.
    # sources = ('data/intensity/sequence_train.npy',
    # 'data/intensity/intensity_train.npy',
    # 'data/intensity/precursor_charge_train.npy',
    # 'data/intensity/precursor_charge_train.npy')
    #
    #
    # data_rt = pd.read_csv('database/data_unique_ptms.csv')
    # data_rt['Sequence']=data_rt['mod_sequence']
    #
    # padding(data_rt, 'Sequence', 30)
    # data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical)
    #
    # data_rt =data_rt.drop(columns='mod_sequence')
    #
    # data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3])
    #
    # seq_rt = data_rt.Sequence
    # seq_int = data_int.seq
    # seq_rt = seq_rt.tolist()
    # seq_int = seq_int.tolist()
    # seq_rt = [tuple(l) for l in seq_rt]
    # seq_int = [tuple(l) for l in seq_int]
    #
    # ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt))
    # inter = set(ind_dict_rt).intersection(seq_int)
    # ind_dict_rt = [ind_dict_rt[x] for x in inter]
    #
    #
    # data_int.irt = np.zeros(data_int.energy.shape)
    #
    # i=0
    # for ind in ind_dict_rt :
    # print(i,'/',len(ind_dict_rt))
    # i+=1
    # ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]]
    # data_int.irt[ind_int] = data_rt.irt[ind]
    #
    # np.save('data/intensity/irt_train.npy',data_int.irt)
    # Second section: the same irt join, applied to the *holdout* set.
    # sources = ('data/intensity/sequence_holdout.npy',
    # 'data/intensity/intensity_holdout.npy',
    # 'data/intensity/precursor_charge_holdout.npy',
    # 'data/intensity/precursor_charge_holdout.npy')
    #
    #
    # data_rt = pd.read_csv('database/data_unique_ptms.csv')
    # data_rt['Sequence']=data_rt['mod_sequence']
    #
    # padding(data_rt, 'Sequence', 30)
    # data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical)
    #
    # data_rt =data_rt.drop(columns='mod_sequence')
    #
    # data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3])
    #
    # seq_rt = data_rt.Sequence
    # seq_int = data_int.seq
    # seq_rt = seq_rt.tolist()
    # seq_int = seq_int.tolist()
    # seq_rt = [tuple(l) for l in seq_rt]
    # seq_int = [tuple(l) for l in seq_int]
    #
    # ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt))
    # inter = set(ind_dict_rt).intersection(seq_int)
    # ind_dict_rt = [ind_dict_rt[x] for x in inter]
    #
    #
    # data_int.irt = np.zeros(data_int.energy.shape)
    #
    # i=0
    # for ind in ind_dict_rt :
    # print(i,'/',len(ind_dict_rt))
    # i+=1
    # ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]]
    # data_int.irt[ind_int] = data_rt.irt[ind]
    #
    # np.save('data/intensity/irt_holdout.npy',data_int.irt)
    # Third section: filter the merged Prosit pickles (charges 1-4; train
    # additionally drops zero retention times), overwriting them in place.
    # df = pd.read_pickle('database/data_prosit_merged_holdout.pkl')
    # print(len(df))
    # possible_charges = [1,2,3,4]
    # df = df[df['Charge'].isin(possible_charges)]
    # print(len(df))
    # df.to_pickle('database/data_prosit_merged_holdout.pkl')
    # df = pd.read_pickle('database/data_prosit_merged_train.pkl')
    # print(len(df))
    # df=df.loc[df['Retention time']!=0]
    # df = df[df['Charge'].isin(possible_charges)]
    # print(len(df))
    # df.to_pickle('database/data_prosit_merged_train.pkl')
    pass
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment