data_viz.py

import scipy as sp
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
import matplotlib.colors as mcolors

from mass_prediction import compute_frag_mz_ration

seq = 'YEEEFLR'
def data(a):
    b=a+a
    return b

# int = np.random.rand(174)

names = ['b1(+)', 'y1(+)', 'b1(2+)', 'y1(2+)', 'b1(3+)', 'y1(3+)','b2(+)', 'y2(+)', 'b2(2+)', 'y2(2+)', 'b2(3+)', 'y2(3+)',
         'b3(+)', 'y3(+)', 'b3(2+)', 'y3(2+)', 'b3(3+)', 'y3(3+)', 'b4(+)', 'y4(+)', 'b4(2+)', 'y4(2+)', 'b4(3+)',
         'y4(3+)','b5(+)', 'y5(+)', 'b5(2+)', 'y5(2+)', 'b5(3+)', 'y5(3+)','b6(+)', 'y6(+)', 'b6(2+)', 'y6(2+)',
         'b6(3+)', 'y6(3+)','b7(+)', 'y7(+)', 'b7(2+)', 'y7(2+)', 'b7(3+)', 'y7(3+)','b8(+)', 'y8(+)', 'b8(2+)',
         'y8(2+)', 'b8(3+)', 'y8(3+)','b9(+)', 'y9(+)', 'b9(2+)', 'y9(2+)', 'b9(3+)', 'y9(3+)','b10(+)', 'y10(+)',
         'b10(2+)', 'y10(2+)', 'b10(3+)', 'y10(3+)','b11(+)', 'y11(+)', 'b11(2+)', 'y11(2+)', 'b11(3+)', 'y11(3+)',
         'b12(+)', 'y12(+)', 'b12(2+)', 'y12(2+)', 'b12(3+)', 'y12(3+)', 'b13(+)', 'y13(+)', 'b13(2+)', 'y13(2+)',
         'b13(3+)', 'y13(3+)','b14(+)', 'y14(+)', 'b14(2+)', 'y14(2+)', 'b14(3+)', 'y14(3+)','b15(+)', 'y15(+)',
         'b15(2+)', 'y15(2+)', 'b15(3+)', 'y15(3+)', 'b16(+)', 'y16(+)', 'b16(2+)', 'y16(2+)', 'b16(3+)', 'y16(3+)',
         'b17(+)', 'y17(+)', 'b17(2+)', 'y17(2+)', 'b17(3+)', 'y17(3+)','b18(+)', 'y18(+)', 'b18(2+)', 'y18(2+)',
         'b18(3+)', 'y18(3+)','b19(+)', 'y19(+)', 'b19(2+)', 'y19(2+)', 'b19(3+)', 'y19(3+)','b20(+)', 'y20(+)',
         'b20(2+)', 'y20(2+)', 'b20(3+)', 'y20(3+)','b21(+)', 'y21(+)', 'b21(2+)', 'y21(2+)', 'b21(3+)', 'y21(3+)',
         'b22(+)', 'y22(+)', 'b22(2+)', 'y22(2+)', 'b22(3+)', 'y22(3+)','b23(+)', 'y23(+)', 'b23(2+)', 'y23(2+)',
         'b23(3+)', 'y23(3+)','b24(+)', 'y24(+)', 'b24(2+)', 'y24(2+)', 'b24(3+)', 'y24(3+)','b25(+)', 'y25(+)',
         'b25(2+)', 'y25(2+)', 'b25(3+)', 'y25(3+)','b26(+)', 'y26(+)', 'b26(2+)', 'y26(2+)', 'b26(3+)', 'y26(3+)',
         'b27(+)', 'y27(+)', 'b27(2+)', 'y27(2+)', 'b27(3+)', 'y27(3+)','b28(+)', 'y28(+)', 'b28(2+)', 'y28(2+)',
         'b28(3+)', 'y28(3+)','b29(+)', 'y29(+)', 'b29(2+)', 'y29(2+)', 'b29(3+)', 'y29(3+)']

names = np.array(names)

def frag_spectra(int, seq):
    masses = compute_frag_mz_ration(seq,'mono')
    msk = [el!=-1. for el in int]
    # Choose some nice levels
    levels = int[msk]
    dates = masses[msk]
    # Create figure and plot a stem plot with the date
    fig, ax = plt.subplots(figsize=(8.8, 4), constrained_layout=True)
    ax.set(title=seq + " fragmentation spectra")

    ax.vlines(dates, 0, levels, color="tab:red")  # The vertical stems.
    ax.plot(dates, np.zeros_like(dates),
            color="k", markerfacecolor="w")  # Baseline and markers on it.

    # annotate lines
    for d, l, r in zip(dates, levels, names):
        ax.annotate(r, xy=(d, l),
                    xytext=(-3, np.sign(l) * 3), textcoords="offset points",
                    horizontalalignment="right",
                    verticalalignment="bottom" if l > 0 else "top")


    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    # remove y axis and spines
    ax.yaxis.set_visible(False)
    ax.spines[["left", "top", "right"]].set_visible(False)

    ax.margins(y=0.1)
    plt.show()

def frag_spectra_comparison(int_1, seq_1, int_2, seq_2=None):
    if seq_2 is None :
        seq_2 = seq_1
    masses_1 = compute_frag_mz_ration(seq_1,'mono')
    msk_1 = [el!=-1 for el in int_1]
    levels_1 = int_1[msk_1]
    dates_1 = masses_1[msk_1]
    names_1 = names[msk_1]
    masses_2 = compute_frag_mz_ration(seq_2, 'mono')
    msk_2 = [el != -1. for el in int_2]
    levels_2 = int_2[msk_2]
    dates_2 = masses_2[msk_2]
    names_2 = names[msk_2]
    # Create figure and plot a stem plot with the date
    fig, ax = plt.subplots(figsize=(8.8, 4), constrained_layout=True)
    ax.set(title=seq_1 + " / " +seq_2 + " fragmentation spectra comparison")

    ax.vlines(dates_1, 0, levels_1, color="tab:red")  # The vertical stems.
    ax.plot(dates_1, np.zeros_like(dates_1),
            color="k", markerfacecolor="w")  # Baseline and markers on it.

    # annotate lines
    for d, l, r in zip(dates_1, levels_1, names_1):
        ax.annotate(r, xy=(d, l),
                    xytext=(-3, np.sign(l) * 3), textcoords="offset points",
                    horizontalalignment="right",
                    verticalalignment="bottom" if l > 0 else "top")

    ax.vlines(dates_2, 0, -levels_2, color="tab:blue")  # The vertical stems.
    ax.plot(dates_2, np.zeros_like(dates_2),
            color="k", markerfacecolor="w")  # Baseline and markers on it.

    # annotate lines
    for d, l, r in zip(dates_2, -levels_2, names_2):
        ax.annotate(r, xy=(d, l),
                    xytext=(-3, np.sign(l) * 3), textcoords="offset points",
                    horizontalalignment="right",
                    verticalalignment="bottom" if l > 0 else "top")


    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    # remove y axis and spines
    ax.yaxis.set_visible(False)
    ax.spines[["left", "top", "right"]].set_visible(False)

    ax.margins(y=0.1)
    plt.show()


def histo_abs_error(dataframe, display=False, save=False, path=None):
    points = dataframe['abs_error']

    ## combine these different collections into a list
    data_to_plot = [points]


    # Create a figure instance
    fig, ax = plt.subplots()

    # Create the boxplot
    ax.set_xlabel('abs error')
    ax.violinplot(data_to_plot, vert=False, side='high', showmedians=True, quantiles=[0.95])
    ax.set_xlim(0,175)
    if display :
        plt.show()

    if save :
        plt.savefig(path)


def random_color_deterministic(df, column):

    def rd10(str):
        color = list(mcolors.CSS4_COLORS)
        random.seed(str)
        return color[random.randint(0,147)]

    df['color']=df[column].map(rd10)

def scatter_rt(dataframe, display=False, save=False, path=None, color = False, col = 'seq'):
    fig, ax = plt.subplots()
    if color :
        random_color_deterministic(dataframe, col)
        ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1, color = dataframe['color'])
    else :
        ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1)
    ax.set_xlabel('true RT')
    ax.set_ylabel('pred RT')
    x = np.array([min(dataframe['true rt']), max(dataframe['true rt'])])
    linreg = sp.stats.linregress(dataframe['true rt'], dataframe['rt pred'])
    ax.annotate("r-squared = {:.3f}".format(r2_score(dataframe['true rt'], dataframe['rt pred'])), (0, 1))
    plt.plot(x, linreg.intercept + linreg.slope * x, 'r')
    if display :
        plt.show()

    if save :
        plt.savefig(path)


def histo_abs_error_by_length(dataframe, display=False, save=False, path=None):
    data_to_plot =[]
    max_length = max(dataframe['length'])
    min_length = min(dataframe['length'])
    for l in range(min_length, max_length):
        data_to_plot.append(dataframe['abs_error'].where(dataframe['length']==l))

    # data_to_plot.append()


    fig, ax = plt.subplots()

    # Create the boxplot
    bp = ax.violinplot(data_to_plot, vert=True, side='low')
    if display:
        plt.show()

    if save:
        plt.savefig(path)

def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

def histo_length_by_error(dataframe, bins, display=False, save=False, path=None):
    data_to_plot = []
    quanti = []
    max_error = max(dataframe['abs_error'])
    inter = np.linspace(0, max_error, num=bins+1)
    inter_m = running_mean(inter, 2)

    inter_labels = list(map(lambda x : round(x,2),inter_m))
    inter_labels.insert(0,0)
    for i in range(bins):
        a = dataframe.loc[(inter[i] < dataframe['abs_error']) & (dataframe['abs_error'] < inter[i+1])]['length']
        if len(a)>0:
            data_to_plot.append(a)
            quanti.append(0.95)
        else :
            data_to_plot.append([0])
            quanti.append(0.95)


    fig, ax = plt.subplots()

    # Create the boxplot
    ax.violinplot(data_to_plot, vert=True, side='high', showmedians=True)
    ax.set_ylabel('length')
    ax.set_xticks(range(len(inter)),inter_labels)
    if display:
        plt.show()

    if save:
        plt.savefig(path)

def compare_error(df1, df2, display=False, save=False, path=None):
    df1['abs err 1'] = df1['rt pred'] - df1['true rt']
    df2['abs err 2'] = df2['rt pred'] - df2['true rt']
    df_group_1 = df1.groupby(['seq'])['abs err 1'].mean().to_frame().reset_index()
    df_group_2 = df2.groupby(['seq'])['abs err 2'].mean().to_frame().reset_index()
    df = pd.concat([df_group_1,df_group_2],axis=1)

    fig, ax = plt.subplots()
    ax.scatter(df['abs err 1'], df['abs err 2'], s=0.1, alpha=0.05)

    plt.savefig('temp.png')


    if display:
        plt.show()

    if save:
        plt.savefig(path)

def select_best_data(df1,df2,threshold):
    df1['abs err 1'] = abs(df1['rt pred'] - df1['true rt'])
    df2['abs err 2'] = abs(df2['rt pred'] - df2['true rt'])
    df_group_1 = df1.groupby(['seq'])['abs err 1'].mean().to_frame().reset_index()
    df_group_2 = df2.groupby(['seq'])['abs err 2'].mean().to_frame().reset_index()
    df = pd.concat([df_group_1, df_group_2], axis=1)
    df['mean']=(df['abs err 1']+df['abs err 2'])/2
    df_res = df[df['mean']<threshold]
    df_res = df_res['seq']
    df_res.columns = ['seq','temp']
    df_res = df_res['seq']
    good_seq=[]
    good_rt=[]
    for r in df1.iterrows() :
        if r[1]['seq'] in df_res.values :
            good_rt.append(r[1]['true rt'])
            good_seq.append(r[1]['seq'])
    return pd.DataFrame({'Sequence' : good_seq, 'Retention time': good_rt})


def add_length(dataframe):
    def fonc(a):
        a = a.replace('[', '')
        a = a.replace(']', '')
        a = a.split(',')
        a = list(map(int, a))
        return np.count_nonzero(np.array(a))
    dataframe['length']=dataframe['seq'].map(fonc)


df = pd.read_csv('output/out_common_ISA_augmented_10_eval.csv')
add_length(df)
df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_SA_augmented_10_eval.png')
scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_SA_augmented_10_eval.png', color=True)
histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_SA_augmented_10_eval.png')
#
# df = pd.read_csv('output/out_common_ISA_augmented_3_eval.csv')
# add_length(df)
# df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_augmented_3_eval.png')
# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_augmented_3_eval.png', color=True)
# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_augmented_3_eval.png')
#
# df = pd.read_csv('output/out_common_prosit_ISA_eval_3.csv')
# add_length(df)
# df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_ISA_eval_3.png')
# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_ISA_eval_3.png', color=True)
# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_ISA_eval_3.png')

# df = pd.read_csv('output/out_common_ISA_prosit_eval.csv')
# add_length(df)
# df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_prosit_eval.png')
# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_prosit_eval.png', color=True, col = 'seq')
# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_prosit_eval.png')

## Compare error variation between run
## Prosit column changes affect some peptides more than others (but consistently)
# df_1 = pd.read_csv('output/out_common_ISA_prosit_eval.csv')
# df_2 = pd.read_csv('output/out_common_ISA_prosit_eval_2.csv')
#
# df = select_best_data(df_1, df_2, 7)
# df.to_pickle('database/data_prosit_threshold_7.pkl')
# compare_error(df_1,df_2,save=True,path='fig/custom model res/ISA_prosit_error_variation.png')