Load merged iRT arrays and generate the holdout .npy inputs.

* common_dataset.py: un-comment the module-level np.load calls for the
  train and holdout iRT, sequence, precursor-charge and spectra arrays.
  The iRT arrays are read from irt_train.npy / irt_holdout.npy, i.e. the
  file names prosit_data_merge.py writes in this same patch.
* dataloader.py: un-comment the module-level H5ToStorage.make_npy_file
  calls for database/traintest_hcd.hdf5 and add the same five calls for
  database/holdout_hcd.hdf5.
* prosit_data_merge.py: save the train iRT array as irt_train.npy, drop
  the dead commented-out code, and repeat the sequence-matching merge for
  the holdout files, saving the result as irt_holdout.npy.

NOTE(review): leading indentation and the placement of blank context
lines were reconstructed from the hunk line counts. If a hunk is
rejected, retry with `git apply --ignore-whitespace`.
NOTE(review): the added statements in common_dataset.py and dataloader.py
sit at module level, so they run on import of those modules - confirm
that this is intended.
NOTE(review): the holdout merge loop rescans all of seq_int once per
matched sequence, mirroring the existing train loop.

diff --git a/common_dataset.py b/common_dataset.py
index b2fca949fbdde16e17c82c4ca3b6b9813cfcc37c..dffa493811b218a1d3484e1fd20866fbdbf8db74 100644
--- a/common_dataset.py
+++ b/common_dataset.py
@@ -165,16 +165,16 @@ def load_data(path_train, path_val, path_test, batch_size, length, pad=False, co
     return train_loader, val_loader, test_loader
 
 
-# irt_train = np.load('data/intensity/collision_irt_train.npy')
-# seq_train = np.load('data/intensity/sequence_train.npy')
-# charge_train = np.load('data/intensity/precursor_charge_train.npy')
-# spectra_train = np.load('data/intensity/intensity_train.npy')
-#
-# irt_holdout = np.load('data/intensity/collision_irt_holdout.npy')
-# seq_holdout = np.load('data/intensity/sequence_holdout.npy')
-# charge_holdout = np.load('data/intensity/precursor_charge_holdout.npy')
-# spectra_holdout = np.load('data/intensity/intensity_holdout.npy')
-#
+irt_train = np.load('data/intensity/irt_train.npy')
+seq_train = np.load('data/intensity/sequence_train.npy')
+charge_train = np.load('data/intensity/precursor_charge_train.npy')
+spectra_train = np.load('data/intensity/intensity_train.npy')
+
+irt_holdout = np.load('data/intensity/irt_holdout.npy')
+seq_holdout = np.load('data/intensity/sequence_holdout.npy')
+charge_holdout = np.load('data/intensity/precursor_charge_holdout.npy')
+spectra_holdout = np.load('data/intensity/intensity_holdout.npy')
+
 # dataset_train = pd.DataFrame({'Sequence':list(seq_train), 'Retention time':list(irt_train), 'Charge':list(charge_train), 'Spectra' : list(spectra_train)},index=list(range(6787933)))
 # dataset_train.to_pickle('database/data_prosit_merged_train.pkl')
 # dataset_test = pd.DataFrame({'Sequence':list(seq_holdout), 'Retention time':list(irt_holdout), 'Charge':list(charge_holdout), 'Spectra' : list(spectra_holdout)},index=list(range(754215)))
diff --git a/dataloader.py b/dataloader.py
index 743676ecb9535962056cc4dc5b216b6a18aa0928..b6b3f941c41c7137fb3836913cf7a74f3bd75164 100644
--- a/dataloader.py
+++ b/dataloader.py
@@ -195,9 +195,16 @@ class Intentsity_Dataset(Dataset):
         return torch.tensor(self.seq[idx]), torch.tensor([self.energy[idx]]).float(), torch.tensor(
             self.precursor_charge[idx]), torch.tensor(self.intensity[idx]).float()
 
-# storage = H5ToStorage('database/traintest_hcd.hdf5')
-# storage.make_npy_file('data/intensity/method_train.npy','method')
-# storage.make_npy_file('data/intensity/sequence_train.npy','sequence_integer')
-# storage.make_npy_file('data/intensity/intensity_train.npy', 'intensities_raw')
-# storage.make_npy_file('data/intensity/collision_energy_train.npy', 'collision_energy_aligned_normed')
-# storage.make_npy_file('data/intensity/precursor_charge_train.npy', 'precursor_charge_onehot')
+storage = H5ToStorage('database/traintest_hcd.hdf5')
+storage.make_npy_file('data/intensity/method_train.npy','method')
+storage.make_npy_file('data/intensity/sequence_train.npy','sequence_integer')
+storage.make_npy_file('data/intensity/intensity_train.npy', 'intensities_raw')
+storage.make_npy_file('data/intensity/collision_energy_train.npy', 'collision_energy_aligned_normed')
+storage.make_npy_file('data/intensity/precursor_charge_train.npy', 'precursor_charge_onehot')
+
+storage = H5ToStorage('database/holdout_hcd.hdf5')
+storage.make_npy_file('data/intensity/method_holdout.npy','method')
+storage.make_npy_file('data/intensity/sequence_holdout.npy','sequence_integer')
+storage.make_npy_file('data/intensity/intensity_holdout.npy', 'intensities_raw')
+storage.make_npy_file('data/intensity/collision_energy_holdout.npy', 'collision_energy_aligned_normed')
+storage.make_npy_file('data/intensity/precursor_charge_holdout.npy', 'precursor_charge_onehot')
diff --git a/prosit_data_merge.py b/prosit_data_merge.py
index 3c37066bb189e259f22cea14b48fdfe7befc73bf..8cc8125091127df85a6eded82d254d9124880541 100644
--- a/prosit_data_merge.py
+++ b/prosit_data_merge.py
@@ -105,13 +105,45 @@ for ind in ind_dict_rt :
     ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]]
     data_int.irt[ind_int] = data_rt.irt[ind]
 
-np.save('data/intensity/collision_irt_train.npy',data_int.irt)
+np.save('data/intensity/irt_train.npy',data_int.irt)
 
-# indices_common = dict((k, i) for i, k in enumerate(seq_int))
-# indices_common = [indices_common[x] for x in inter]
-#
-# data_int.irt[indices_common] = data_rt.irt[ind_dict_rt]
-#
+sources = ('data/intensity/sequence_holdout.npy',
+           'data/intensity/intensity_holdout.npy',
+           'data/intensity/collision_energy_holdout.npy',
+           'data/intensity/precursor_charge_holdout.npy')
+data_rt = pd.read_csv('database/data_unique_ptms.csv')
+data_rt['Sequence']=data_rt['mod_sequence']
+
+padding(data_rt, 'Sequence', 30)
+data_rt['Sequence'] = data_rt['Sequence'].map(alphabetical_to_numerical)
+
+data_rt =data_rt.drop(columns='mod_sequence')
+
+data_int = load_intensity_df_from_files(sources[0], sources[1], sources[2], sources[3])
+
+seq_rt = data_rt.Sequence
+seq_int = data_int.seq
+seq_rt = seq_rt.tolist()
+seq_int = seq_int.tolist()
+seq_rt = [tuple(l) for l in seq_rt]
+seq_int = [tuple(l) for l in seq_int]
+
+ind_dict_rt = dict((k, i) for i, k in enumerate(seq_rt))
+inter = set(ind_dict_rt).intersection(seq_int)
+ind_dict_rt = [ind_dict_rt[x] for x in inter]
+
+
+data_int.irt = np.zeros(data_int.energy.shape)
+
+i=0
+for ind in ind_dict_rt :
+    print(i,'/',len(ind_dict_rt))
+    i+=1
+    ind_int = [k for k, x in enumerate(seq_int) if x == seq_rt[ind]]
+    data_int.irt[ind_int] = data_rt.irt[ind]
+
+np.save('data/intensity/irt_holdout.npy',data_int.irt)
+
 
 
 