import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def load_data(msms_file_path='data/msms.txt', score_threshold=70):
    """Load an msms.txt identification table and keep confidently identified short peptides."""
    data = pd.read_csv(msms_file_path, sep='\t')
    data_compact = data[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']]
    data_filtered = data_compact[data_compact['Score'] > score_threshold]
    # copy() so the Spectra assignment below does not hit a SettingWithCopyWarning
    data_filtered = data_filtered[data_filtered['Length'] < 26].copy()
    data_filtered['Spectra'] = data_filtered.apply(lambda x: filter_intensity(x.Matches, x.Intensities), axis=1)
    return data_filtered[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Spectra']]
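
# Usage sketch (default path and threshold shown):
# psms = load_data('data/msms.txt', score_threshold=70)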


def convert(l):
    """Parse a whitespace-separated string of (possibly bracketed) numbers into a list of floats."""
    values = []
    for token in l.split():
        token = token.replace('[', '').replace(']', '')
        # Keep tokens that are plain or scientific-notation numbers.
        if token.replace('.', '').replace('e+', '').isdigit():
            values.append(float(token))
    return values
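
# Example on a made-up token string:
# convert('[1.0 2.5e+03 4]') -> [1.0, 2500.0, 4.0]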


def filter_intensity(matches, int_exp):
    """Map matched fragment intensities onto a fixed-length vector of fragment slots."""
    # y1..y24 and b1..b24 at charge 1+, 2+ and 3+, in the same order as the
    # original hand-written list (144 slots in total).
    frag_name = [f'{ion}{i}{suffix}'
                 for suffix in ('', '(2+)', '(3+)')
                 for ion in ('y', 'b')
                 for i in range(1, 25)]
    intensity = np.zeros(len(frag_name))
    matches = matches.split(";")
    int_exp = int_exp.split(";")

    # Assign each matched fragment to its slot by name. (The previous
    # np.isin-based double indexing paired positions sorted independently in
    # the two lists, which misassigns intensities whenever the match order
    # differs from the frag_name order.)
    slot = {name: i for i, name in enumerate(frag_name)}
    for name, value in zip(matches, int_exp):
        if name in slot:
            intensity[slot[name]] = float(value)
    return intensity
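
# Sketch with made-up values: filter_intensity('y1;b2', '100.0;50.0') returns a
# 144-long vector with 100.0 in the y1 slot (index 0), 50.0 in the b2 slot
# (index 25), and zeros elsewhere.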


def merge_dataset_by_name(list_names):
    """Concatenate the CSV files listed in list_names into one DataFrame."""
    return pd.concat(pd.read_csv(name) for name in list_names)
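
# Usage sketch (file names are hypothetical):
# merged = merge_dataset_by_name(['database/run_a.csv', 'database/run_b.csv'])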


def mscatter(x, y, ax=None, m=None, **kw):
    """Scatter plot that accepts one marker per point (plain ax.scatter does not)."""
    import matplotlib.markers as mmarkers
    ax = ax or plt.gca()
    sc = ax.scatter(x, y, **kw)
    if (m is not None) and (len(m) == len(x)):
        paths = []
        for marker in m:
            if isinstance(marker, mmarkers.MarkerStyle):
                marker_obj = marker
            else:
                marker_obj = mmarkers.MarkerStyle(marker)
            paths.append(marker_obj.get_path().transformed(marker_obj.get_transform()))
        sc.set_paths(paths)
    return sc
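
# Usage sketch (made-up data): one marker per point.
# mscatter([0, 1, 2], [1, 2, 4], m=['o', 's', '^'], c=[0.1, 0.5, 0.9])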

# Data, gradient 10: 16/01, 20/01, 30/01
# Data, gradient 3:  17/01, 23/01, 24/01

if __name__ == '__main__':
    # Load the six aligned DIA runs and tag each row with its source file id.
    run_dates = ['16_01', '17_01', '20_01', '23_01', '24_01', '30_01']
    runs = []
    for i, date in enumerate(run_dates, start=1):
        run = pd.read_pickle(f'database/data_DIA_{date}_aligned.pkl')
        run['file'] = i
        runs.append(run)
    data = pd.concat(runs, ignore_index=True)

    # Group-aware 80/20 split: all PSMs of a given peptide sequence land in
    # the same split, so no sequence is shared between train and test.
    num_total = len(data)
    train_size = int(0.8 * num_total)
    list_gr = [gr for _, gr in data.groupby('Sequence')]
    random.shuffle(list_gr)

    train_set = []
    test_set = []
    s = 0
    for gr in list_gr:
        if s < train_size:
            train_set.append(gr)
            s += len(gr)
        else:
            test_set.append(gr)

    dataset_train = pd.concat(train_set).reset_index(drop=True)
    dataset_test = pd.concat(test_set).reset_index(drop=True)
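
    # Sanity check (follows from the group-aware split): no sequence should
    # appear in both sets.
    assert not set(dataset_train['Sequence']) & set(dataset_test['Sequence'])
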
    dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
    dataset_test.to_pickle('database/data_DIA_ISA_55_test.pkl')
    # err_rt = []
    # err_spec = []
    # nb_data = []
    # nb_gr = []
    # for i in range(0, 120, 5):
    #     print(i)
    #     data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', i)
    #     data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', i)
    #     data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', i)
    #     data = pd.concat([data_1, data_2, data_3], ignore_index=True)
    #     groups = data.groupby('Sequence')
    #     avg_err = 0
    #     avg_cosim = 0
    #     nb_data.append(len(data))
    #     nb_gr.append(len(groups))
    #     for seq, gr in groups:
    #         mean = gr['Retention time'].mean()
    #         mean_spec = np.mean(gr['Spectra'], axis=0)
    #         cos_sim = gr['Spectra'].apply(
    #             lambda x: (np.dot(x, mean_spec) / (np.linalg.norm(x) * np.linalg.norm(mean_spec)))).mean()
    #         err = abs(gr['Retention time'] - mean).mean()
    #         avg_err += err * gr.shape[0]
    #         avg_cosim += cos_sim * gr.shape[0]
    #     avg_err = avg_err / data.shape[0]
    #     avg_cosim = avg_cosim / data.shape[0]
    #     err_rt.append(avg_err)
    #     err_spec.append(avg_cosim)
    #
    # fig, axs = plt.subplots(2, 2)
    # axs[0, 0].scatter(range(0, 120, 5), 1 - 2 * np.arccos(err_spec) / np.pi)
    # axs[1, 0].scatter(range(0, 120, 5), err_rt)
    # axs[0, 1].scatter(range(0, 120, 5), nb_gr)
    # axs[1, 1].scatter(range(0, 120, 5), nb_data)
    # axs[0, 0].set_title('spectral angle')
    # axs[1, 0].set_title('avg rt err')
    # axs[0, 1].set_title('nb groups')
    # axs[1, 1].set_title('nb data')
    # plt.savefig('fig_data_full.png')
    # data_1 = load_data('data/Custom_dataset/msms_MsBM_UGTs.txt', 55)
    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', 55)
    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', 55)
    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)


    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', 55)
    # data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', 55)
    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', 55)
    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)
    # data.to_csv('database/data_3_first_55.csv')

    # pd.to_pickle(data,"database/data_all_type.pkl")
    # data = pd.read_pickle("database/data_all_type.pkl")
    # data['number comp'] = data['Spectra'].apply(lambda x: np.sum(x > 0.1))
    # sizes_gr = []
    #
    # groups = data.groupby('Sequence')
    # for seq, gr in groups:
    #     groups_2 = gr.groupby('Charge')
    #     for ch,gr_2 in groups_2 :
    #         array = np.stack(gr_2['Spectra'])
    #         sizes_gr.append(array.shape[0])
    #         if array.shape[0] > 10:
    #
    #             standardized_data = (array - array.mean(axis=0)) / array.std(axis=0)
    #             standardized_data = np.nan_to_num(standardized_data)
    #             covariance_matrix = np.cov(standardized_data, ddof=1, rowvar=False, dtype=np.float32)
    #
    #             eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    #
    #             # np.argsort can only provide lowest to highest; use [::-1] to reverse the list
    #             order_of_importance = np.argsort(eigenvalues)[::-1]
    #
    #             # utilize the sort order to sort eigenvalues and eigenvectors
    #             sorted_eigenvalues = eigenvalues[order_of_importance]
    #             sorted_eigenvectors = eigenvectors[:, order_of_importance]  # sort the columns
    #             # use sorted_eigenvalues to ensure the explained variances correspond to the eigenvectors
    #             explained_variance = sorted_eigenvalues / np.sum(sorted_eigenvalues)
    #
    #             k = 2  # select the number of principal components
    #             reduced_data = np.matmul(standardized_data, sorted_eigenvectors[:, :k])  # transform the original data
    #
    #             ### Step 7: Determine the Explained Variance
    #             total_explained_variance = sum(explained_variance[:k])
    #
    #             ### Potential Next Steps: Iterate on the Number of Principal Components
    #             cm = plt.cm.get_cmap('RdYlBu')
    #             f, ( ax3, ax2) = plt.subplots(1, 2)
    #             # sc = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax = ax1,  m = mark[gr['Charge']], s=15, c=gr['Score'], vmin=50, vmax=250, cmap=cm)
    #             # ax1.text(0.25, 0, total_explained_variance,
    #             #          horizontalalignment='center',
    #             #          verticalalignment='center',
    #             #          transform=ax1.transAxes)
    #             # f.colorbar(sc, ax=ax1)
    #
    #             sc2 = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax = ax2, s=5, c=gr_2['number comp'],
    #                               vmin=min(gr['number comp']), vmax=max(gr['number comp']), cmap=cm)
    #             f.colorbar(sc2, ax=ax2)
    #
    #             sc3 = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax = ax3, s=5, c=gr_2['Score'], vmin=min(gr_2['Score']), vmax=max(gr_2['Score']), cmap=cm)
    #             f.colorbar(sc3, ax=ax3)
    #
    #             plt.savefig('fig/pca/pca_' + seq + '_' + str(ch) + '.png')
    #             plt.close()