Skip to content
Snippets Groups Projects
data_viz.py 12.1 KiB
Newer Older
Schneider Leo's avatar
Schneider Leo committed
import scipy as sp
from sklearn.metrics import r2_score
Léo Schneider's avatar
Léo Schneider committed
import matplotlib.pyplot as plt
import numpy as np
import random
Léo Schneider's avatar
Léo Schneider committed
import pandas as pd
import matplotlib.colors as mcolors
Léo Schneider's avatar
Léo Schneider committed

Léo Schneider's avatar
Léo Schneider committed
from mass_prediction import compute_frag_mz_ration

seq = 'YEEEFLR'
def data(a):
    b=a+a
    return b

Léo Schneider's avatar
Léo Schneider committed
# int = np.random.rand(174)
Léo Schneider's avatar
Léo Schneider committed

names = ['b1(+)', 'y1(+)', 'b1(2+)', 'y1(2+)', 'b1(3+)', 'y1(3+)','b2(+)', 'y2(+)', 'b2(2+)', 'y2(2+)', 'b2(3+)', 'y2(3+)',
         'b3(+)', 'y3(+)', 'b3(2+)', 'y3(2+)', 'b3(3+)', 'y3(3+)', 'b4(+)', 'y4(+)', 'b4(2+)', 'y4(2+)', 'b4(3+)',
         'y4(3+)','b5(+)', 'y5(+)', 'b5(2+)', 'y5(2+)', 'b5(3+)', 'y5(3+)','b6(+)', 'y6(+)', 'b6(2+)', 'y6(2+)',
         'b6(3+)', 'y6(3+)','b7(+)', 'y7(+)', 'b7(2+)', 'y7(2+)', 'b7(3+)', 'y7(3+)','b8(+)', 'y8(+)', 'b8(2+)',
         'y8(2+)', 'b8(3+)', 'y8(3+)','b9(+)', 'y9(+)', 'b9(2+)', 'y9(2+)', 'b9(3+)', 'y9(3+)','b10(+)', 'y10(+)',
         'b10(2+)', 'y10(2+)', 'b10(3+)', 'y10(3+)','b11(+)', 'y11(+)', 'b11(2+)', 'y11(2+)', 'b11(3+)', 'y11(3+)',
         'b12(+)', 'y12(+)', 'b12(2+)', 'y12(2+)', 'b12(3+)', 'y12(3+)', 'b13(+)', 'y13(+)', 'b13(2+)', 'y13(2+)',
         'b13(3+)', 'y13(3+)','b14(+)', 'y14(+)', 'b14(2+)', 'y14(2+)', 'b14(3+)', 'y14(3+)','b15(+)', 'y15(+)',
         'b15(2+)', 'y15(2+)', 'b15(3+)', 'y15(3+)', 'b16(+)', 'y16(+)', 'b16(2+)', 'y16(2+)', 'b16(3+)', 'y16(3+)',
         'b17(+)', 'y17(+)', 'b17(2+)', 'y17(2+)', 'b17(3+)', 'y17(3+)','b18(+)', 'y18(+)', 'b18(2+)', 'y18(2+)',
         'b18(3+)', 'y18(3+)','b19(+)', 'y19(+)', 'b19(2+)', 'y19(2+)', 'b19(3+)', 'y19(3+)','b20(+)', 'y20(+)',
         'b20(2+)', 'y20(2+)', 'b20(3+)', 'y20(3+)','b21(+)', 'y21(+)', 'b21(2+)', 'y21(2+)', 'b21(3+)', 'y21(3+)',
         'b22(+)', 'y22(+)', 'b22(2+)', 'y22(2+)', 'b22(3+)', 'y22(3+)','b23(+)', 'y23(+)', 'b23(2+)', 'y23(2+)',
         'b23(3+)', 'y23(3+)','b24(+)', 'y24(+)', 'b24(2+)', 'y24(2+)', 'b24(3+)', 'y24(3+)','b25(+)', 'y25(+)',
         'b25(2+)', 'y25(2+)', 'b25(3+)', 'y25(3+)','b26(+)', 'y26(+)', 'b26(2+)', 'y26(2+)', 'b26(3+)', 'y26(3+)',
         'b27(+)', 'y27(+)', 'b27(2+)', 'y27(2+)', 'b27(3+)', 'y27(3+)','b28(+)', 'y28(+)', 'b28(2+)', 'y28(2+)',
         'b28(3+)', 'y28(3+)','b29(+)', 'y29(+)', 'b29(2+)', 'y29(2+)', 'b29(3+)', 'y29(3+)']

names = np.array(names)

def frag_spectra(int, seq):
    masses = compute_frag_mz_ration(seq,'mono')
    msk = [el!=-1. for el in int]
    # Choose some nice levels
    levels = int[msk]
    dates = masses[msk]
    # Create figure and plot a stem plot with the date
    fig, ax = plt.subplots(figsize=(8.8, 4), constrained_layout=True)
    ax.set(title=seq + " fragmentation spectra")

    ax.vlines(dates, 0, levels, color="tab:red")  # The vertical stems.
    ax.plot(dates, np.zeros_like(dates),
            color="k", markerfacecolor="w")  # Baseline and markers on it.

    # annotate lines
    for d, l, r in zip(dates, levels, names):
        ax.annotate(r, xy=(d, l),
                    xytext=(-3, np.sign(l) * 3), textcoords="offset points",
                    horizontalalignment="right",
                    verticalalignment="bottom" if l > 0 else "top")


    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    # remove y axis and spines
    ax.yaxis.set_visible(False)
    ax.spines[["left", "top", "right"]].set_visible(False)

    ax.margins(y=0.1)
    plt.show()

def frag_spectra_comparison(int_1, seq_1, int_2, seq_2=None):
    if seq_2 is None :
        seq_2 = seq_1
    masses_1 = compute_frag_mz_ration(seq_1,'mono')
    msk_1 = [el!=-1 for el in int_1]
    levels_1 = int_1[msk_1]
    dates_1 = masses_1[msk_1]
    names_1 = names[msk_1]
    masses_2 = compute_frag_mz_ration(seq_2, 'mono')
    msk_2 = [el != -1. for el in int_2]
    levels_2 = int_2[msk_2]
    dates_2 = masses_2[msk_2]
    names_2 = names[msk_2]
    # Create figure and plot a stem plot with the date
    fig, ax = plt.subplots(figsize=(8.8, 4), constrained_layout=True)
    ax.set(title=seq_1 + " / " +seq_2 + " fragmentation spectra comparison")

    ax.vlines(dates_1, 0, levels_1, color="tab:red")  # The vertical stems.
    ax.plot(dates_1, np.zeros_like(dates_1),
            color="k", markerfacecolor="w")  # Baseline and markers on it.

    # annotate lines
    for d, l, r in zip(dates_1, levels_1, names_1):
        ax.annotate(r, xy=(d, l),
                    xytext=(-3, np.sign(l) * 3), textcoords="offset points",
                    horizontalalignment="right",
                    verticalalignment="bottom" if l > 0 else "top")

    ax.vlines(dates_2, 0, -levels_2, color="tab:blue")  # The vertical stems.
    ax.plot(dates_2, np.zeros_like(dates_2),
            color="k", markerfacecolor="w")  # Baseline and markers on it.

    # annotate lines
    for d, l, r in zip(dates_2, -levels_2, names_2):
        ax.annotate(r, xy=(d, l),
                    xytext=(-3, np.sign(l) * 3), textcoords="offset points",
                    horizontalalignment="right",
                    verticalalignment="bottom" if l > 0 else "top")




    plt.setp(ax.get_xticklabels(), rotation=30, ha="right")

    # remove y axis and spines
    ax.yaxis.set_visible(False)
    ax.spines[["left", "top", "right"]].set_visible(False)

    ax.margins(y=0.1)
Léo Schneider's avatar
Léo Schneider committed
    plt.show()


def histo_abs_error(dataframe, display=False, save=False, path=None):
    points = dataframe['abs_error']

    ## combine these different collections into a list
    data_to_plot = [points]

Léo Schneider's avatar
Léo Schneider committed
    # Create a figure instance
Léo Schneider's avatar
Léo Schneider committed
    fig, ax = plt.subplots()
Léo Schneider's avatar
Léo Schneider committed

    # Create the boxplot
Léo Schneider's avatar
Léo Schneider committed
    ax.set_xlabel('abs error')
    ax.violinplot(data_to_plot, vert=False, side='high', showmedians=True, quantiles=[0.95])
    ax.set_xlim(0,175)
Léo Schneider's avatar
Léo Schneider committed
    if display :
        plt.show()

    if save :
        plt.savefig(path)

Léo Schneider's avatar
Léo Schneider committed

Schneider Leo's avatar
Schneider Leo committed
def random_color_deterministic(df, column):

    def rd10(str):
        color = list(mcolors.CSS4_COLORS)
        random.seed(str)
        return color[random.randint(0,147)]

Schneider Leo's avatar
Schneider Leo committed
    df['color']=df[column].map(rd10)
Schneider Leo's avatar
Schneider Leo committed
def scatter_rt(dataframe, display=False, save=False, path=None, color = False, col = 'seq'):
    fig, ax = plt.subplots()
    if color :
Schneider Leo's avatar
Schneider Leo committed
        random_color_deterministic(dataframe, col)
        ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1, color = dataframe['color'])
    else :
        ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1)
Léo Schneider's avatar
Léo Schneider committed
    ax.set_xlabel('true RT')
    ax.set_ylabel('pred RT')
Schneider Leo's avatar
Schneider Leo committed
    x = np.array([min(dataframe['true rt']), max(dataframe['true rt'])])
    linreg = sp.stats.linregress(dataframe['true rt'], dataframe['rt pred'])
    ax.annotate("r-squared = {:.3f}".format(r2_score(dataframe['true rt'], dataframe['rt pred'])), (0, 1))
    plt.plot(x, linreg.intercept + linreg.slope * x, 'r')
Léo Schneider's avatar
Léo Schneider committed
    if display :
        plt.show()

    if save :
        plt.savefig(path)
Léo Schneider's avatar
Léo Schneider committed


Léo Schneider's avatar
Léo Schneider committed
def histo_abs_error_by_length(dataframe, display=False, save=False, path=None):
    data_to_plot =[]
    max_length = max(dataframe['length'])
    min_length = min(dataframe['length'])
    for l in range(min_length, max_length):
        data_to_plot.append(dataframe['abs_error'].where(dataframe['length']==l))

    # data_to_plot.append()

Léo Schneider's avatar
Léo Schneider committed

Léo Schneider's avatar
Léo Schneider committed
    fig, ax = plt.subplots()
Léo Schneider's avatar
Léo Schneider committed

    # Create the boxplot
Léo Schneider's avatar
Léo Schneider committed
    bp = ax.violinplot(data_to_plot, vert=True, side='low')
Léo Schneider's avatar
Léo Schneider committed
    if display:
        plt.show()

    if save:
        plt.savefig(path)

Léo Schneider's avatar
Léo Schneider committed
def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

Léo Schneider's avatar
Léo Schneider committed
def histo_length_by_error(dataframe, bins, display=False, save=False, path=None):
    data_to_plot = []
Léo Schneider's avatar
Léo Schneider committed
    quanti = []
Léo Schneider's avatar
Léo Schneider committed
    max_error = max(dataframe['abs_error'])
Léo Schneider's avatar
Léo Schneider committed
    inter = np.linspace(0, max_error, num=bins+1)
    inter_m = running_mean(inter, 2)

    inter_labels = list(map(lambda x : round(x,2),inter_m))
    inter_labels.insert(0,0)
Léo Schneider's avatar
Léo Schneider committed
    for i in range(bins):
Léo Schneider's avatar
Léo Schneider committed
        a = dataframe.loc[(inter[i] < dataframe['abs_error']) & (dataframe['abs_error'] < inter[i+1])]['length']
        if len(a)>0:
            data_to_plot.append(a)
            quanti.append(0.95)
        else :
            data_to_plot.append([0])
            quanti.append(0.95)
Léo Schneider's avatar
Léo Schneider committed


Léo Schneider's avatar
Léo Schneider committed
    fig, ax = plt.subplots()
Léo Schneider's avatar
Léo Schneider committed

    # Create the boxplot
Léo Schneider's avatar
Léo Schneider committed
    ax.violinplot(data_to_plot, vert=True, side='high', showmedians=True)
    ax.set_ylabel('length')
    ax.set_xticks(range(len(inter)),inter_labels)
Léo Schneider's avatar
Léo Schneider committed
    if display:
        plt.show()

    if save:
        plt.savefig(path)

def compare_error(df1, df2, display=False, save=False, path=None):
    df1['abs err 1'] = df1['rt pred'] - df1['true rt']
    df2['abs err 2'] = df2['rt pred'] - df2['true rt']
Schneider Leo's avatar
Schneider Leo committed
    df_group_1 = df1.groupby(['seq'])['abs err 1'].mean().to_frame().reset_index()
    df_group_2 = df2.groupby(['seq'])['abs err 2'].mean().to_frame().reset_index()
    df = pd.concat([df_group_1,df_group_2],axis=1)

    fig, ax = plt.subplots()
    ax.scatter(df['abs err 1'], df['abs err 2'], s=0.1, alpha=0.05)

    plt.savefig('temp.png')
Léo Schneider's avatar
Léo Schneider committed


    if display:
        plt.show()

    if save:
        plt.savefig(path)
Léo Schneider's avatar
Léo Schneider committed

def select_best_data(df1,df2,threshold):
    df1['abs err 1'] = abs(df1['rt pred'] - df1['true rt'])
    df2['abs err 2'] = abs(df2['rt pred'] - df2['true rt'])
    df_group_1 = df1.groupby(['seq'])['abs err 1'].mean().to_frame().reset_index()
    df_group_2 = df2.groupby(['seq'])['abs err 2'].mean().to_frame().reset_index()
    df = pd.concat([df_group_1, df_group_2], axis=1)
    df['mean']=(df['abs err 1']+df['abs err 2'])/2
    df_res = df[df['mean']<threshold]
    df_res = df_res['seq']
    df_res.columns = ['seq','temp']
    df_res = df_res['seq']
    good_seq=[]
    good_rt=[]
    for r in df1.iterrows() :
        if r[1]['seq'] in df_res.values :
            good_rt.append(r[1]['true rt'])
            good_seq.append(r[1]['seq'])
    return pd.DataFrame({'Sequence' : good_seq, 'Retention time': good_rt})


Léo Schneider's avatar
Léo Schneider committed
def add_length(dataframe):
    def fonc(a):
        a = a.replace('[', '')
        a = a.replace(']', '')
        a = a.split(',')
        a = list(map(int, a))
        return np.count_nonzero(np.array(a))
    dataframe['length']=dataframe['seq'].map(fonc)


Schneider Leo's avatar
Schneider Leo committed
df = pd.read_csv('output/out_common_ISA_augmented_10_eval.csv')
Schneider Leo's avatar
Schneider Leo committed
add_length(df)
Schneider Leo's avatar
Schneider Leo committed
df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_SA_augmented_10_eval.png')
scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_SA_augmented_10_eval.png', color=True)
histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_SA_augmented_10_eval.png')
Schneider Leo's avatar
Schneider Leo committed
#
Schneider Leo's avatar
Schneider Leo committed
# df = pd.read_csv('output/out_common_ISA_augmented_3_eval.csv')
Schneider Leo's avatar
Schneider Leo committed
# add_length(df)
# df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
Schneider Leo's avatar
Schneider Leo committed
# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_augmented_3_eval.png')
# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_augmented_3_eval.png', color=True)
# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_augmented_3_eval.png')
Schneider Leo's avatar
Schneider Leo committed
#
# df = pd.read_csv('output/out_common_prosit_ISA_eval_3.csv')
# add_length(df)
# df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_ISA_eval_3.png')
# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_ISA_eval_3.png', color=True)
# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_ISA_eval_3.png')
Schneider Leo's avatar
Schneider Leo committed
# df = pd.read_csv('output/out_common_ISA_prosit_eval.csv')
# add_length(df)
# df['abs_error'] =  np.abs(df['rt pred']-df['true rt'])
Schneider Leo's avatar
Schneider Leo committed
# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_prosit_eval.png')
# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_prosit_eval.png', color=True, col = 'seq')
# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_prosit_eval.png')

## Compare error variation between run
## Prosit column changes affect some peptides more than others (but consistently)
Schneider Leo's avatar
Schneider Leo committed
# df_1 = pd.read_csv('output/out_common_ISA_prosit_eval.csv')
# df_2 = pd.read_csv('output/out_common_ISA_prosit_eval_2.csv')
#
# df = select_best_data(df_1, df_2, 7)
# df.to_pickle('database/data_prosit_threshold_7.pkl')
Schneider Leo's avatar
Schneider Leo committed
# compare_error(df_1,df_2,save=True,path='fig/custom model res/ISA_prosit_error_variation.png')