diff --git a/prosit_data_merge.py b/prosit_data_merge.py index 02c2431318ef70cf27642427be4468fa4a00681c..a0d6b56bf316bdbd1cb0fcfa787db2f7176b1eb8 100644 --- a/prosit_data_merge.py +++ b/prosit_data_merge.py @@ -67,44 +67,44 @@ def alphabetical_to_numerical(seq): return np.array(num) -sources = ('data/intensity/sequence_train.npy', - 'data/intensity/intensity_train.npy', - 'data/intensity/precursor_charge_train.npy', - 'data/intensity/precursor_charge_train.npy') - - -data_rt = pd.read_csv('database/data_unique_ptms.csv') -data_rt['Sequence']=data_rt['mod_sequence'] - -padding(data_rt, 'Sequence', 30) -data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical) - -data_rt =data_rt.drop(columns='mod_sequence') - -data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3]) - -seq_rt = data_rt.Sequence -seq_int = data_int.seq -seq_rt = seq_rt.tolist() -seq_int = seq_int.tolist() -seq_rt = [tuple(l) for l in seq_rt] -seq_int = [tuple(l) for l in seq_int] - -ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt)) -inter = set(ind_dict_rt).intersection(seq_int) -ind_dict_rt = [ind_dict_rt[x] for x in inter] - - -data_int.irt = np.zeros(data_int.energy.shape) - -i=0 -for ind in ind_dict_rt : - print(i,'/',len(ind_dict_rt)) - i+=1 - ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]] - data_int.irt[ind_int] = data_rt.irt[ind] - -np.save('data/intensity/irt_train.npy',data_int.irt) +# sources = ('data/intensity/sequence_train.npy', +# 'data/intensity/intensity_train.npy', +# 'data/intensity/precursor_charge_train.npy', +# 'data/intensity/precursor_charge_train.npy') +# +# +# data_rt = pd.read_csv('database/data_unique_ptms.csv') +# data_rt['Sequence']=data_rt['mod_sequence'] +# +# padding(data_rt, 'Sequence', 30) +# data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical) +# +# data_rt =data_rt.drop(columns='mod_sequence') +# +# data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3]) +# +# seq_rt = data_rt.Sequence +# seq_int = data_int.seq +# seq_rt = seq_rt.tolist() +# seq_int = seq_int.tolist() +# seq_rt = [tuple(l) for l in seq_rt] +# seq_int = [tuple(l) for l in seq_int] +# +# ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt)) +# inter = set(ind_dict_rt).intersection(seq_int) +# ind_dict_rt = [ind_dict_rt[x] for x in inter] +# +# +# data_int.irt = np.zeros(data_int.energy.shape) +# +# i=0 +# for ind in ind_dict_rt : +# print(i,'/',len(ind_dict_rt)) +# i+=1 +# ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]] +# data_int.irt[ind_int] = data_rt.irt[ind] +# +# np.save('data/intensity/irt_train.npy',data_int.irt) sources = ('data/intensity/sequence_holdout.npy',