# alignement.py
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats  # makes sp.stats available for linregress below
import matplotlib.pyplot as plt
from loess.loess_1d import loess_1d
from sklearn.metrics import r2_score

import dataloader
from dataloader import RT_Dataset
from msms_processing import load_data

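# Token-to-integer encoding of peptide sequences; 'CaC' and 'OxM' presumably denote modified
# Cys/Met residues, and 0 (the empty token) is likely used for padding.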
ALPHABET_UNMOD = {
    "": 0,
    "A": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
    "K": 9,
    "L": 10,
    "M": 11,
    "N": 12,
    "P": 13,
    "Q": 14,
    "R": 15,
    "S": 16,
    "T": 17,
    "V": 18,
    "W": 19,
    "Y": 20,
    "CaC": 21,
    "OxM": 22
}

ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()}

def numerical_to_alphabetical(arr):
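    """Convert an iterable of integer codes into an amino-acid string using ALPHABET_UNMOD_REV."""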
    seq = ''
    for i in range(len(arr)):
        seq+=ALPHABET_UNMOD_REV[arr[i]]
    return seq

def numerical_to_alphabetical_str(s):
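    """Decode a string representation such as '[1, 2, 3]' (list of integer codes) into an amino-acid string."""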
    seq = ''
    s = s.replace('[','')
    s = s.replace(']', '')
    arr = s.split(',')
    arr = list(map(int, arr))
    for i in range(len(arr)):
        seq+=ALPHABET_UNMOD_REV[arr[i]]
    return seq
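# Minimal usage sketch for the two decoding helpers (illustrative values):
#   numerical_to_alphabetical([1, 11, 9])        -> 'AMK'
#   numerical_to_alphabetical_str('[1, 11, 9]')  -> 'AMK'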

def align(dataset, reference):
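    """Align the retention times of `dataset` onto `reference` using a LOESS fit computed
    on the peptides shared by both dataframes; returns `dataset` with its 'Retention time'
    column replaced by the aligned values."""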
    seq_ref = reference['Sequence']
    seq_common = dataset['Sequence']
    seq_ref = seq_ref.tolist()
    seq_common = seq_common.tolist()

    # Cast sequences to tuples so they can be used as hashable dictionary keys / set members.
    seq_ref = [tuple(s) for s in seq_ref]
    seq_common = [tuple(s) for s in seq_common]

    # Indices of the peptides present in both dataframes.
    ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref))
    inter = set(ind_dict_ref).intersection(seq_common)
    ind_dict_ref = [ind_dict_ref[x] for x in inter]

    indices_common = dict((k, i) for i, k in enumerate(seq_common))
    indices_common = [indices_common[x] for x in inter]

    rt_ref = reference['Retention time'][ind_dict_ref].reset_index()
    rt_data = dataset['Retention time'][indices_common].reset_index()

    # LOESS fit on the shared peptides, evaluated on every retention time of `dataset`.
    xout, yout, wout = loess_1d(np.array(rt_data['Retention time'].tolist()),
                                np.array(rt_ref['Retention time'].tolist()),
                                xnew=np.array(dataset['Retention time']),
                                degree=1, frac=0.5,
                                npoints=None, rotate=False, sigy=None)
    dataset['Retention time'] = yout
    return dataset

def filter_cysteine(df, col):
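    """Drop peptides whose sequence in column `col` contains a cysteine ('C');
    adds a boolean helper column 'cys' to `df` and returns the filtered frame."""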
    def map_cys(seq):
        return 'C' not in seq
    df['cys'] = df[col].map(map_cys)
    data = df[df['cys']].reset_index(drop=True)
    return data

def compare_include_df(df, sub_df, save = True, path = 'temp.png'):
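    """Collect the retention times of the sequences of `sub_df` that also appear in `df`,
    plot them against each other with a fitted regression line, and optionally save the
    figure to `path`. Returns the two paired retention-time lists (df values, sub_df values)."""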
    df_value_list = []
    df_sub_value_list = []
    for _, row in sub_df.iterrows():
        try:
            df_value_list.append(df[df['Sequence'] == row['Sequence']]['Retention time'].reset_index(drop=True)[0])
            df_sub_value_list.append(row['Retention time'])
        except (KeyError, IndexError):
            # Sequence from sub_df not found in df; skip it.
            pass

    fig, ax = plt.subplots()
    ax.scatter(df_sub_value_list, df_value_list)
    # Fit and draw the regression line on the same axes as the scatter (x: sub_df, y: df).
    linreg = sp.stats.linregress(df_sub_value_list, df_value_list)
    x = np.array([min(df_sub_value_list), max(df_sub_value_list)])
    # Place the r-squared annotation in the top-left corner of the axes.
    ax.annotate("r-squared = {:.3f}".format(r2_score(df_value_list, df_sub_value_list)),
                (0.05, 0.95), xycoords='axes fraction')
    plt.plot(x, linreg.intercept + linreg.slope * x, 'r')

    if save:
        plt.savefig(path)
    plt.clf()

    return df_value_list, df_sub_value_list

# data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True)
# # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
#
# data_train = load_data('msms/msms16_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_16_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms17_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_17_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms20_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_20_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms23_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_23_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms24_01.txt').reset_index(drop=True)
# # data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True)
# data_align = align(data_train, data_ori)
# data_align.to_pickle('database/data_DIA_24_01_aligned30_01.pkl')
#
# data_train = load_data('msms/msms30_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True)
# # data_align = align(data_train, data_ori)
# data_train.to_pickle('database/data_DIA_30_01_aligned30_01.pkl')
#
# plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1)
# plt.savefig('test_align_2.png')
#
# dataset_ref = pd.read_pickle('database/data_01_16_DIA_ISA_55.pkl')
# data_ref = Common_Dataset(dataset_ref, 25).data
# dataset_2 = pd.read_pickle('database/data_01_20_DIA_ISA_55.pkl')
# data_2 = Common_Dataset(dataset_2, 25).data
# dataset_3 = pd.read_pickle('database/data_01_17_DIA_ISA_55.pkl')
# data_3 = Common_Dataset(dataset_3, 25).data
# dataset_4 = pd.read_pickle('database/data_01_23_DIA_ISA_55.pkl')
# data_4 = Common_Dataset(dataset_4, 25).data
# data_align_3 = align(data_3, data_ref)
# data_align_4 = align(data_4, data_ref)
#
# data = pd.concat([data_ref, data_2, data_align_3, data_align_4], ignore_index=True)
# data = data.drop(columns='index')
# data['Sequence'] = data['Sequence'].map(numerical_to_alphabetical)
# num_data = data.shape[0]
# train_num = np.floor(num_data*0.8)
# train_size=0
# list_train=[]
# list_test=[]
# groups = data.groupby('Sequence')
# for seq, gr in groups:
#
#     train_size+= gr.shape[0]
#
#     if train_size>train_num:
#         list_test.append(gr)
#     else:
#         list_train.append(gr)
#
#
# dataset_train = pd.concat(list_train, ignore_index=True)
# dataset_test = pd.concat(list_test, ignore_index=True)
# dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
# dataset_test.to_pickle('database/data_DIA_ISA_55_test.pkl')
# data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_test_30_01.pkl').reset_index(drop=True)
# data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl').reset_index(drop=True)
# data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True)
# data_ori['Sequence']=data_ori['sequence']
# data_ori['Retention time']=data_ori['irt']
# data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True)
# data_align = align(data_train, data_ori)
#
# data_align.to_pickle('database/data_ISA_dual_align.pkl')


# compare DIA-NN predictions to DIA measurements
# df_ori = pd.read_csv('database/data_train.csv')
# df_ori['Sequence']=df_ori['sequence']
# df_ori['Retention time']=df_ori['irt']
# df_diann = pd.read_csv('database/CIT_BASE_UP000584719_546.csv')
#
# df_ISA = pd.read_pickle('database/data_ISA_dual_align.pkl')
#
#
#
# df_diann_aligned = align(df_diann, df_ori)
#
# df_value_list, df_sub_value_list = compare_include_df(df_diann_aligned, df_ISA, True)




# create an augmented dataset from ISA data + column-invariant Prosit peptides
df_base = pd.read_pickle('database/data_DIA_ISA_55_train_30_01.pkl')
df_base = df_base[['Sequence','Retention time']]

df_1 = pd.read_pickle('database/data_prosit_threshold_1.pkl')
df_1['Sequence']= df_1['Sequence'].map(numerical_to_alphabetical_str)

df_2 = pd.read_pickle('database/data_prosit_threshold_2.pkl')
df_2['Sequence']= df_2['Sequence'].map(numerical_to_alphabetical_str)

df_3 = pd.read_pickle('database/data_prosit_threshold_3.pkl')
df_3['Sequence']= df_3['Sequence'].map(numerical_to_alphabetical_str)

df_augmented_1 = pd.concat([df_1,df_base],axis=0).reset_index(drop=True)
df_augmented_1.columns=['sequence','irt']
df_augmented_1['state']='train'
df_augmented_1.to_csv('database/data_ISA_augmented_1_30_01.csv')

df_augmented_2 = pd.concat([df_2,df_base],axis=0).reset_index(drop=True)
df_augmented_2.columns=['sequence','irt']
df_augmented_2['state']='train'
df_augmented_2.to_csv('database/data_ISA_augmented_2_30_01.csv')

df_augmented_3 = pd.concat([df_3,df_base],axis=0).reset_index(drop=True)
df_augmented_3.columns=['sequence','irt']
df_augmented_3['state']='train'
df_augmented_3.to_csv('database/data_ISA_augmented_3_30_01.csv')


# check the sequence overlap between the test set and the Prosit supplement used for augmentation
df_sup = pd.read_pickle('database/data_prosit_threshold_3.pkl')
df_test = pd.read_pickle('database/data_DIA_ISA_55_test.pkl')

inter = []
n = 0
df_sup['Sequence']= df_sup['Sequence'].map(numerical_to_alphabetical_str)
groups_sup = df_sup.groupby('Sequence')

groups_test = df_test.groupby('Sequence')
for seq, _ in groups_sup:
    for seq2, _ in groups_test:
        if seq2==seq :
            inter.append(seq)
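
# A minimal alternative sketch (not part of the original pipeline): the same overlap as the
# nested loop above can be computed with a set intersection over the 'Sequence' columns.
# inter_fast = sorted(set(df_sup['Sequence']).intersection(df_test['Sequence']))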