diff --git a/data/data_exploration.py b/data/data_exploration.py index f3f79b27845837e1b0c50c0532feda6795e9e18c..ca7742db4d7c19e655727aa606112c868c8db0f5 100644 --- a/data/data_exploration.py +++ b/data/data_exploration.py @@ -2,7 +2,7 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd from numpy.ma.core import shape - +from sklearn.metrics import r2_score from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV @@ -75,6 +75,13 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e plt.clf() plt.close() +def optimal_prediction_r2(df,seq_col,rt_col): + df_group_1 = df.groupby([seq_col])[rt_col].mean().to_frame().reset_index() + df_group_1['mean rt']=df_group_1[rt_col] + df_group_1=df_group_1[[seq_col,'mean rt']] + df_merged = df_group_1 .merge(df, how='inner', on=seq_col) + return r2_score(df_merged['mean rt'], df_merged[rt_col]) + def main(): #data prosit # df = pd.read_csv('data_prosit/data.csv') @@ -134,6 +141,24 @@ def main(): _ = aa_distribution(df['seq'], False, True, '../fig/data_exploration/aa_distribution_ISA_prosit_outlier.png') retention_time_distribution(df['true rt'], False, True, '../fig/data_exploration/retention_time_distribution_ISA_prosit_outlier.png') + #compare variance of outliers vs other peptides + seq_out_list = df['seq'].to_list() + df_prosit = pd.read_csv('data_prosit/data_noc.csv') if __name__ == '__main__': - main() + df = pd.read_csv('data_PXD006109/plasma/data_prosit_outlier.csv') + df['seq'] = df['seq'].map(numerical_to_alphabetical_str) + + # compare variance of outliers vs other peptides + seq_out_list = df['seq'].to_list() + df_prosit = pd.read_csv('data_prosit/data.csv') + df_prosit['outlier']=df_prosit['sequence'].map(lambda x : x in seq_out_list) + df_prosit_outlier = df_prosit[df_prosit['outlier']==True] + df_agg_out = df_prosit_outlier.groupby(pd.Grouper(key='mod_sequence'))['irt_scaled'].agg(['mean', 'median', 'var']).reset_index() + df_agg_pro = 
df_prosit.groupby(pd.Grouper(key='mod_sequence'))['irt_scaled'].agg(['mean', 'median', 'var']).reset_index() + # main() + + plt.hist(df_agg_out['var']) + plt.savefig('var_outlier.png') + plt.hist(df_agg_pro['var']) + plt.savefig('var_pro.png') \ No newline at end of file diff --git a/data/data_viz.py b/data/data_viz.py index d4e452ccf5ad04c52f6dbd2d492796392d2545dd..6e08e9a01848e69447cca57625a184fd24054f13 100644 --- a/data/data_viz.py +++ b/data/data_viz.py @@ -316,17 +316,37 @@ if __name__ == '__main__' : # dataframe = pd.read_csv('../archive_output/ISA/out_ISA_noc_prosit_0.csv') # df2 = filter_outlier_rt(dataframe) # df2.to_csv('../data/data_ISA/data_prosit_outlier.csv', index=False) - df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv') - compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num') - - df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv') - compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num') - - df = pd.read_csv('../data/data_ISA/data_isa.csv') - compute_peptide_properties(df,'ISA','sequence') - - df = pd.read_csv('../data/data_prosit/data.csv') - compute_peptide_properties(df,'prosit','sequence') - - df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv') - compute_peptide_properties(df,'plasma','sequence') \ No newline at end of file + # df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv') + # compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num') + ## r2_list = [] + # for index in range(10): + # dataframe = pd.read_csv( '../output/out_tranfert_prosit_isa_mox_'+str(index)+'.csv') + # r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred'])) + # r2_arr = np.array(r2_list) + # print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106 + # df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv') + # compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num') + # + # df = pd.read_csv('../data/data_ISA/data_isa.csv') + # 
compute_peptide_properties(df,'ISA','sequence') + # + # df = pd.read_csv('../data/data_prosit/data.csv') + # compute_peptide_properties(df,'prosit','sequence') + # + # df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv') + # compute_peptide_properties(df,'plasma','sequence') + + r2_list = [] + for index in range(10): + dataframe = pd.read_csv( '../output/out_tranfert_prosit_ISA_'+str(index)+'.csv') + r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred'])) + r2_arr = np.array(r2_list) + print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106 and 0.9775968974701088 +/- 0.0022122290222140123 => worse than the mix + + r2_list = [] + for index in range(10): + dataframe = pd.read_csv( '../output/out_tranfert_prosit_plasma_'+str(index)+'.csv') + r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred'])) + r2_arr = np.array(r2_list) + print(r2_arr.mean(),'+/-',r2_arr.std()) #0.977876349023085 +/- 0.0040982548977069695 and 0.982352390283812 +/- 0.00036 => equivalent to the mix + pass \ No newline at end of file