From 283c24211c57abffb3d57173aef2f620f2a2ea1b Mon Sep 17 00:00:00 2001 From: Schneider Leo <leo.schneider@etu.ec-lyon.fr> Date: Fri, 18 Oct 2024 10:42:07 +0200 Subject: [PATCH] datasets --- prosit_data_merge.py | 76 ++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/prosit_data_merge.py b/prosit_data_merge.py index 02c2431..a0d6b56 100644 --- a/prosit_data_merge.py +++ b/prosit_data_merge.py @@ -67,44 +67,44 @@ def alphabetical_to_numerical(seq): return np.array(num) -sources = ('data/intensity/sequence_train.npy', - 'data/intensity/intensity_train.npy', - 'data/intensity/precursor_charge_train.npy', - 'data/intensity/precursor_charge_train.npy') - - -data_rt = pd.read_csv('database/data_unique_ptms.csv') -data_rt['Sequence']=data_rt['mod_sequence'] - -padding(data_rt, 'Sequence', 30) -data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical) - -data_rt =data_rt.drop(columns='mod_sequence') - -data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3]) - -seq_rt = data_rt.Sequence -seq_int = data_int.seq -seq_rt = seq_rt.tolist() -seq_int = seq_int.tolist() -seq_rt = [tuple(l) for l in seq_rt] -seq_int = [tuple(l) for l in seq_int] - -ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt)) -inter = set(ind_dict_rt).intersection(seq_int) -ind_dict_rt = [ind_dict_rt[x] for x in inter] - - -data_int.irt = np.zeros(data_int.energy.shape) - -i=0 -for ind in ind_dict_rt : - print(i,'/',len(ind_dict_rt)) - i+=1 - ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]] - data_int.irt[ind_int] = data_rt.irt[ind] - -np.save('data/intensity/irt_train.npy',data_int.irt) +# sources = ('data/intensity/sequence_train.npy', +# 'data/intensity/intensity_train.npy', +# 'data/intensity/precursor_charge_train.npy', +# 'data/intensity/precursor_charge_train.npy') +# +# +# data_rt = pd.read_csv('database/data_unique_ptms.csv') +# data_rt['Sequence']=data_rt['mod_sequence'] +# +# padding(data_rt, 'Sequence', 30) +# data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical) +# +# data_rt =data_rt.drop(columns='mod_sequence') +# +# data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3]) +# +# seq_rt = data_rt.Sequence +# seq_int = data_int.seq +# seq_rt = seq_rt.tolist() +# seq_int = seq_int.tolist() +# seq_rt = [tuple(l) for l in seq_rt] +# seq_int = [tuple(l) for l in seq_int] +# +# ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt)) +# inter = set(ind_dict_rt).intersection(seq_int) +# ind_dict_rt = [ind_dict_rt[x] for x in inter] +# +# +# data_int.irt = np.zeros(data_int.energy.shape) +# +# i=0 +# for ind in ind_dict_rt : +# print(i,'/',len(ind_dict_rt)) +# i+=1 +# ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]] +# data_int.irt[ind_int] = data_rt.irt[ind] +# +# np.save('data/intensity/irt_train.npy',data_int.irt) sources = ('data/intensity/sequence_holdout.npy', -- GitLab