import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


def load_data(msms_file_path='data/msms.txt', score_threshold=70):
    """Load an msms.txt identification table and keep confidently identified short peptides."""
    data = pd.read_csv(msms_file_path, sep='\t')
    data_compact = data[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']]
    data_filtered = data_compact[data_compact['Score'] > score_threshold]
    # copy() so the Spectra assignment below does not hit a SettingWithCopyWarning
    data_filtered = data_filtered[data_filtered['Length'] < 26].copy()
    data_filtered['Spectra'] = data_filtered.apply(lambda x: filter_intensity(x.Matches, x.Intensities), axis=1)
    return data_filtered[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Spectra']]
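
# Usage sketch (default path and threshold shown):
# psms = load_data('data/msms.txt', score_threshold=70)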


def convert(l):
    """Parse a whitespace-separated string of (possibly bracketed) numbers into a list of floats."""
    values = []
    for token in l.split():
        token = token.replace('[', '').replace(']', '')
        # Keep tokens that are plain or scientific-notation numbers.
        if token.replace('.', '').replace('e+', '').isdigit():
            values.append(float(token))
    return values
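
# Example on a made-up token string:
# convert('[1.0 2.5e+03 4]') -> [1.0, 2500.0, 4.0]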


def filter_intensity(matches, int_exp):
    """Map matched fragment intensities onto a fixed-length vector of fragment slots."""
    # y1..y24 and b1..b24 at charge 1+, 2+ and 3+, in the same order as the
    # original hand-written list (144 slots in total).
    frag_name = [f'{ion}{i}{suffix}'
                 for suffix in ('', '(2+)', '(3+)')
                 for ion in ('y', 'b')
                 for i in range(1, 25)]
    intensity = np.zeros(len(frag_name))
    matches = matches.split(";")
    int_exp = int_exp.split(";")

    # Assign each matched fragment to its slot by name. (The previous
    # np.isin-based double indexing paired positions sorted independently in
    # the two lists, which misassigns intensities whenever the match order
    # differs from the frag_name order.)
    slot = {name: i for i, name in enumerate(frag_name)}
    for name, value in zip(matches, int_exp):
        if name in slot:
            intensity[slot[name]] = float(value)
    return intensity
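
# Sketch with made-up values: filter_intensity('y1;b2', '100.0;50.0') returns a
# 144-long vector with 100.0 in the y1 slot (index 0), 50.0 in the b2 slot
# (index 25), and zeros elsewhere.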


def merge_dataset_by_name(list_names):
    """Concatenate the CSV files listed in list_names into one DataFrame."""
    return pd.concat(pd.read_csv(name) for name in list_names)
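
# Usage sketch (file names are hypothetical):
# merged = merge_dataset_by_name(['database/run_a.csv', 'database/run_b.csv'])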


def mscatter(x, y, ax=None, m=None, **kw):
    """Scatter plot that accepts one marker per point (plain ax.scatter does not)."""
    import matplotlib.markers as mmarkers
    ax = ax or plt.gca()
    sc = ax.scatter(x, y, **kw)
    if (m is not None) and (len(m) == len(x)):
        paths = []
        for marker in m:
            if isinstance(marker, mmarkers.MarkerStyle):
                marker_obj = marker
            else:
                marker_obj = mmarkers.MarkerStyle(marker)
            paths.append(marker_obj.get_path().transformed(marker_obj.get_transform()))
        sc.set_paths(paths)
    return sc
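
# Usage sketch (made-up data): one marker per point.
# mscatter([0, 1, 2], [1, 2, 4], m=['o', 's', '^'], c=[0.1, 0.5, 0.9])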

# Data, gradient 10: 16/01, 20/01, 30/01
# Data, gradient 3:  17/01, 23/01, 24/01

if __name__ == '__main__':
    # Load the six aligned DIA runs and tag each row with its source file id.
    run_dates = ['16_01', '17_01', '20_01', '23_01', '24_01', '30_01']
    runs = []
    for i, date in enumerate(run_dates, start=1):
        run = pd.read_pickle(f'database/data_DIA_{date}_aligned.pkl')
        run['file'] = i
        runs.append(run)
    data = pd.concat(runs, ignore_index=True)

    # Group-aware 80/20 split: all PSMs of a given peptide sequence land in
    # the same split, so no sequence is shared between train and test.
    num_total = len(data)
    train_size = int(0.8 * num_total)
    list_gr = [gr for _, gr in data.groupby('Sequence')]
    random.shuffle(list_gr)

    train_set = []
    test_set = []
    s = 0
    for gr in list_gr:
        if s < train_size:
            train_set.append(gr)
            s += len(gr)
        else:
            test_set.append(gr)

    dataset_train = pd.concat(train_set).reset_index(drop=True)
    dataset_test = pd.concat(test_set).reset_index(drop=True)
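
    # Sanity check (follows from the group-aware split): no sequence should
    # appear in both sets.
    assert not set(dataset_train['Sequence']) & set(dataset_test['Sequence'])
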
    dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
    dataset_test.to_pickle('database/data_DIA_ISA_55_test.pkl')
    # err_rt = []
    # err_spec = []
    # nb_data = []
    # nb_gr = []
    # for i in range(0, 120, 5):
    #     print(i)
    #     data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', i)
    #     data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', i)
    #     data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', i)
    #     data = pd.concat([data_1, data_2, data_3], ignore_index=True)
    #     groups = data.groupby('Sequence')
    #     avg_err = 0
    #     avg_cosim = 0
    #     nb_data.append(len(data))
    #     nb_gr.append(len(groups))
    #     for seq, gr in groups:
    #         mean = gr['Retention time'].mean()
    #         mean_spec = np.mean(gr['Spectra'], axis=0)
    #         cos_sim = gr['Spectra'].apply(
    #             lambda x: (np.dot(x, mean_spec) / (np.linalg.norm(x) * np.linalg.norm(mean_spec)))).mean()
    #         err = abs(gr['Retention time'] - mean).mean()
    #         avg_err += err * gr.shape[0]
    #         avg_cosim += cos_sim * gr.shape[0]
    #     avg_err = avg_err / data.shape[0]
    #     avg_cosim = avg_cosim / data.shape[0]
    #     err_rt.append(avg_err)
    #     err_spec.append(avg_cosim)
    #
    # fig, axs = plt.subplots(2, 2)
    # axs[0, 0].scatter(range(0, 120, 5), 1 - 2 * np.arccos(err_spec) / np.pi)
    # axs[1, 0].scatter(range(0, 120, 5), err_rt)
    # axs[0, 1].scatter(range(0, 120, 5), nb_gr)
    # axs[1, 1].scatter(range(0, 120, 5), nb_data)
    # axs[0, 0].set_title('spectral angle')
    # axs[1, 0].set_title('avg rt err')
    # axs[0, 1].set_title('nb groups')
    # axs[1, 1].set_title('nb data')
    # plt.savefig('fig_data_full.png')
    # data_1 = load_data('data/Custom_dataset/msms_MsBM_UGTs.txt', 55)
    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', 55)
    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', 55)
    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)


    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', 55)
    # data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', 55)
    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', 55)
    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)
    # data.to_csv('database/data_3_first_55.csv')

    # pd.to_pickle(data,"database/data_all_type.pkl")
    # data = pd.read_pickle("database/data_all_type.pkl")
    # data['number comp'] = data['Spectra'].apply(lambda x: np.sum(x > 0.1))
    # sizes_gr = []
    #
    # groups = data.groupby('Sequence')
    # for seq, gr in groups:
    #     groups_2 = gr.groupby('Charge')
    #     for ch,gr_2 in groups_2 :
    #         array = np.stack(gr_2['Spectra'])
    #         sizes_gr.append(array.shape[0])
    #         if array.shape[0] > 10:
    #
    #             standardized_data = (array - array.mean(axis=0)) / array.std(axis=0)
    #             standardized_data = np.nan_to_num(standardized_data)
    #             covariance_matrix = np.cov(standardized_data, ddof=1, rowvar=False, dtype=np.float32)
    #
    #             eigenvalues, eigenvectors = np.linalg.eig(covariance_matrix)
    #
    #             # np.argsort can only provide lowest to highest; use [::-1] to reverse the list
    #             order_of_importance = np.argsort(eigenvalues)[::-1]
    #
    #             # utilize the sort order to sort eigenvalues and eigenvectors
    #             sorted_eigenvalues = eigenvalues[order_of_importance]
    #             sorted_eigenvectors = eigenvectors[:, order_of_importance]  # sort the columns
    #             # use sorted_eigenvalues to ensure the explained variances correspond to the eigenvectors
    #             explained_variance = sorted_eigenvalues / np.sum(sorted_eigenvalues)
    #
    #             k = 2  # select the number of principal components
    #             reduced_data = np.matmul(standardized_data, sorted_eigenvectors[:, :k])  # transform the original data
    #
    #             ### Step 7: Determine the Explained Variance
    #             total_explained_variance = sum(explained_variance[:k])
    #
    #             ### Potential Next Steps: Iterate on the Number of Principal Components
    #             cm = plt.cm.get_cmap('RdYlBu')
    #             f, ( ax3, ax2) = plt.subplots(1, 2)
    #             # sc = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax = ax1,  m = mark[gr['Charge']], s=15, c=gr['Score'], vmin=50, vmax=250, cmap=cm)
    #             # ax1.text(0.25, 0, total_explained_variance,
    #             #          horizontalalignment='center',
    #             #          verticalalignment='center',
    #             #          transform=ax1.transAxes)
    #             # f.colorbar(sc, ax=ax1)
    #
    #             sc2 = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax = ax2, s=5, c=gr_2['number comp'],
    #                               vmin=min(gr['number comp']), vmax=max(gr['number comp']), cmap=cm)
    #             f.colorbar(sc2, ax=ax2)
    #
    #             sc3 = mscatter(reduced_data[:, 0], reduced_data[:, 1], ax = ax3, s=5, c=gr_2['Score'], vmin=min(gr_2['Score']), vmax=max(gr_2['Score']), cmap=cm)
    #             f.colorbar(sc3, ax=ax3)
    #
    #             plt.savefig('fig/pca/pca_' + seq + '_' + str(ch) + '.png')
    #             plt.close()