import random

import matplotlib.markers as mmarkers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Fragment-ion vocabulary: y1..y24 and b1..b24 at charge 1+, 2+ and 3+
# (144 names in total). The order defines the layout of the spectrum vectors.
FRAG_NAMES = [f'{ion}{i}' for ion in ('y', 'b') for i in range(1, 25)]
FRAG_NAMES += [f'{ion}{i}({z}+)' for z in (2, 3) for ion in ('y', 'b') for i in range(1, 25)]


def load_data(msms_file_path='data/msms.txt', score_threshold=70):
    """Load a MaxQuant msms.txt file, keep high-scoring short peptides, and
    turn each (Matches, Intensities) pair into a fixed-length spectrum vector."""
    data = pd.read_csv(msms_file_path, sep='\t')
    data_compact = data[['Sequence', 'Length', 'Charge', 'Retention time',
                         'Score', 'Matches', 'Intensities']]
    data_filtered = data_compact[data_compact['Score'] > score_threshold]
    # .copy() avoids pandas' SettingWithCopyWarning when adding 'Spectra' below
    data_filtered = data_filtered[data_filtered['Length'] < 26].copy()
    data_filtered['Spectra'] = data_filtered.apply(
        lambda x: filter_intensity(x.Matches, x.Intensities), axis=1)
    return data_filtered[['Sequence', 'Length', 'Charge', 'Retention time',
                          'Score', 'Spectra']]


def convert(l):
    """Parse a string of (possibly bracketed) numbers, e.g. '[1.0 2.5e+03]',
    into a list of floats, silently skipping non-numeric tokens."""
    values = []
    for token in l.split():
        stripped = token.replace('[', '').replace(']', '')
        if stripped.replace('.', '').replace('e+', '').isdigit():
            values.append(float(stripped))
    return values


def filter_intensity(matches, int_exp):
    """Map ';'-separated matched fragment names and intensities onto the fixed
    FRAG_NAMES layout; unmatched positions stay at zero. A dict lookup is used
    instead of paired np.isin/np.where calls, which silently misalign values
    when the matches are not ordered consistently with FRAG_NAMES."""
    intensity = np.zeros(len(FRAG_NAMES))
    name_to_pos = {name: i for i, name in enumerate(FRAG_NAMES)}
    for name, value in zip(matches.split(';'), int_exp.split(';')):
        if name in name_to_pos:
            intensity[name_to_pos[name]] = float(value)
    return intensity


def merge_dataset_by_name(list_names):
    """Concatenate the CSV files listed in list_names into one DataFrame."""
    return pd.concat(pd.read_csv(name) for name in list_names)


def mscatter(x, y, ax=None, m=None, **kw):
    """Scatter plot that accepts one marker per point (plain ax.scatter only
    takes a single marker for the whole series)."""
    ax = ax or plt.gca()
    sc = ax.scatter(x, y, **kw)
    if (m is not None) and (len(m) == len(x)):
        paths = []
        for marker in m:
            if isinstance(marker, mmarkers.MarkerStyle):
                marker_obj = marker
            else:
                marker_obj = mmarkers.MarkerStyle(marker)
            paths.append(marker_obj.get_path().transformed(marker_obj.get_transform()))
        sc.set_paths(paths)
    return sc


# data gradient 10: 16/01, 20/01, 30/01
# data gradient 3:  17/01, 23/01, 24/01
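
# Minimal usage sketch of filter_intensity (illustrative values, not real
# data): it returns a 144-long vector laid out as FRAG_NAMES, so a spectrum
# matching only 'y1' and 'b2' fills exactly two positions.
def _demo_filter_intensity():
    vec = filter_intensity('y1;b2', '1000.0;250.0')
    assert vec.shape == (144,)
    assert vec[0] == 1000.0   # 'y1' is FRAG_NAMES[0]
    assert vec[25] == 250.0   # 'b2' comes right after the 24 singly-charged y ions and 'b1'
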
if __name__ == '__main__':
    # Load the six aligned DIA runs and tag each row with its source file.
    data_1 = pd.read_pickle('database/data_DIA_16_01_aligned.pkl')
    data_1['file'] = 1
    data_2 = pd.read_pickle('database/data_DIA_17_01_aligned.pkl')
    data_2['file'] = 2
    data_3 = pd.read_pickle('database/data_DIA_20_01_aligned.pkl')
    data_3['file'] = 3
    data_4 = pd.read_pickle('database/data_DIA_23_01_aligned.pkl')
    data_4['file'] = 4
    data_5 = pd.read_pickle('database/data_DIA_24_01_aligned.pkl')
    data_5['file'] = 5
    data_6 = pd.read_pickle('database/data_DIA_30_01_aligned.pkl')
    data_6['file'] = 6
    data = pd.concat([data_1, data_2, data_3, data_4, data_5, data_6],
                     ignore_index=True)

    # 80/20 train/test split at the sequence level: all occurrences of a
    # peptide go to the same side, so no sequence leaks between the splits.
    num_total = len(data)
    train_size = np.floor(0.8 * num_total)
    list_gr = [gr for _, gr in data.groupby('Sequence')]
    random.shuffle(list_gr)
    train_set = []
    test_set = []
    s = 0
    for gr in list_gr:
        if s < train_size:
            train_set.append(gr)
            s += len(gr)
        else:
            test_set.append(gr)
    dataset_train = pd.concat(train_set).reset_index(drop=True)
    dataset_test = pd.concat(test_set).reset_index(drop=True)
    dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
    dataset_test.to_pickle('database/data_DIA_ISA_55_test.pkl')
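
    # Quick sanity check (illustrative): the group-wise split must leave no
    # sequence shared between train and test.
    leaked = set(dataset_train['Sequence']) & set(dataset_test['Sequence'])
    assert not leaked, f'{len(leaked)} sequences appear in both splits'

    # Hedged alternative (a sketch, not the pipeline above): scikit-learn's
    # GroupShuffleSplit gives an equivalent group-aware split, with the caveat
    # that train_size there is a fraction of groups rather than of rows.
    #
    #   from sklearn.model_selection import GroupShuffleSplit
    #   gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=0)
    #   train_idx, test_idx = next(gss.split(data, groups=data['Sequence']))
    #   dataset_train = data.iloc[train_idx].reset_index(drop=True)
    #   dataset_test = data.iloc[test_idx].reset_index(drop=True)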

    # --- Past experiment: sweep the score threshold and measure, per sweep,
    # --- the mean RT error and mean spectral cosine similarity within each
    # --- sequence group, plus how much data survives the filter.
    # err_rt = []
    # err_spec = []
    # nb_data = []
    # nb_gr = []
    # for i in range(0, 120, 5):
    #     print(i)
    #     data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', i)
    #     data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', i)
    #     data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', i)
    #     data = pd.concat([data_1, data_2, data_3], ignore_index=True)
    #     groups = data.groupby('Sequence')
    #     avg_err = 0
    #     avg_cosim = 0
    #     nb_data.append(len(data))
    #     nb_gr.append(len(groups))
    #     for seq, gr in groups:
    #         mean = gr['Retention time'].mean()
    #         mean_spec = np.mean(gr['Spectra'], axis=0)
    #         cos_sim = gr['Spectra'].apply(
    #             lambda x: np.dot(x, mean_spec) / (np.linalg.norm(x) * np.linalg.norm(mean_spec))).mean()
    #         err = abs(gr['Retention time'] - mean).mean()
    #         avg_err += err * gr.shape[0]
    #         avg_cosim += cos_sim * gr.shape[0]
    #     avg_err = avg_err / data.shape[0]
    #     avg_cosim = avg_cosim / data.shape[0]
    #     err_rt.append(avg_err)
    #     err_spec.append(avg_cosim)
    #
    # fig, axs = plt.subplots(2, 2)
    # axs[0, 0].scatter(range(0, 120, 5), 1 - 2 * np.arccos(err_spec) / np.pi)
    # axs[1, 0].scatter(range(0, 120, 5), err_rt)
    # axs[0, 1].scatter(range(0, 120, 5), nb_gr)
    # axs[1, 1].scatter(range(0, 120, 5), nb_data)
    # axs[0, 0].set_title('spectral angle')
    # axs[1, 0].set_title('avg rt err')
    # axs[0, 1].set_title('nb groups')
    # axs[1, 1].set_title('nb data')
    # plt.savefig('fig_data_full.png')

    # --- Past experiment: build and cache merged datasets at threshold 55.
    # data_1 = load_data('data/Custom_dataset/msms_MsBM_UGTs.txt', 55)
    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', 55)
    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', 55)
    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)

    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', 55)
    # data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', 55)
    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', 55)
    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)
    # data.to_csv('database/data_3_first_55.csv')
    # pd.to_pickle(data, "database/data_all_type.pkl")

    # --- Past experiment: per-(Sequence, Charge) PCA of the spectra, plotted
    # --- and coloured by score and by number of non-trivial components.
    # data = pd.read_pickle("database/data_all_type.pkl")
    # data['number comp'] = data['Spectra'].apply(lambda x: np.sum(x > 0.1))
    # sizes_gr = []
    #
    # groups = data.groupby('Sequence')
    # for seq, gr in groups:
    #     groups_2 = gr.groupby('Charge')
    #     for ch, gr_2 in groups_2:
    #         array = np.stack(gr_2['Spectra'])
    #         sizes_gr.append(array.shape[0])
    #         if array.shape[0] > 10:
    #             standardized_data = (array - array.mean(axis=0)) / array.std(axis=0)
    #             standardized_data = np.nan_to_num(standardized_data)
    #             covariance_matrix = np.cov(standardized_data, ddof=1, rowvar=False, dtype=np.float32)
    #             eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    #
    #             # np.argsort sorts lowest to highest; [::-1] reverses to highest first
    #             order_of_importance = np.argsort(eigenvalues)[::-1]
    #             # sort eigenvalues and eigenvectors (columns) consistently, so the
    #             # explained variances correspond to the sorted eigenvectors
    #             sorted_eigenvalues = eigenvalues[order_of_importance]
    #             sorted_eigenvectors = eigenvectors[:, order_of_importance]
    #             explained_variance = sorted_eigenvalues / np.sum(sorted_eigenvalues)
    #
    #             k = 2  # number of principal components to keep
    #             reduced_data = np.matmul(standardized_data, sorted_eigenvectors[:, :k])
    #             total_explained_variance = sum(explained_variance[:k])
    #             # potential next step: iterate on the number of principal components
    #
    #             cm = plt.get_cmap('RdYlBu')
    #             f, (ax3, ax2) = plt.subplots(1, 2)
    #             # sc = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax=ax1,
    #             #               m=mark[gr['Charge']], s=15, c=gr['Score'],
    #             #               vmin=50, vmax=250, cmap=cm)
    #             # ax1.text(0.25, 0, total_explained_variance,
    #             #          horizontalalignment='center', verticalalignment='center',
    #             #          transform=ax1.transAxes)
    #             # f.colorbar(sc, ax=ax1)
    #             sc2 = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax=ax2, s=5,
    #                            c=gr_2['number comp'], vmin=min(gr['number comp']),
    #                            vmax=max(gr['number comp']), cmap=cm)
    #             f.colorbar(sc2, ax=ax2)
    #             sc3 = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax=ax3, s=5,
    #                            c=gr_2['Score'], vmin=min(gr_2['Score']),
    #                            vmax=max(gr_2['Score']), cmap=cm)
    #             f.colorbar(sc3, ax=ax3)
    #             plt.savefig('fig/pca/pca_' + seq + '_' + str(ch) + '.png')
    #             plt.close()
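
    # Hedged sketch: the hand-rolled PCA above (standardize -> covariance ->
    # eigendecomposition -> projection) matches scikit-learn's PCA on the same
    # standardized matrix, up to the sign of each component; 'array' stands for
    # one stacked (Sequence, Charge) group of spectra as in the loop above.
    #
    #   from sklearn.decomposition import PCA
    #   standardized = np.nan_to_num((array - array.mean(axis=0)) / array.std(axis=0))
    #   pca = PCA(n_components=2)
    #   reduced = pca.fit_transform(standardized)  # ~ reduced_data, up to sign
    #   total_explained_variance = pca.explained_variance_ratio_.sum()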