Skip to content
Snippets Groups Projects
Commit 2dbfe726 authored by Schneider Leo's avatar Schneider Leo
Browse files

data viz

parent 73ce0ee0
No related branches found
No related tags found
No related merge requests found
......@@ -2,7 +2,7 @@ import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy.ma.core import shape
from sklearn.metrics import r2_score
from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
......@@ -75,6 +75,13 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e
plt.clf()
plt.close()
def optimal_prediction_r2(df, seq_col, rt_col):
    """Return the R2 obtained when each row is predicted by its sequence mean.

    Rows of ``df`` are grouped by ``seq_col`` and the mean of ``rt_col`` is
    computed per group. That mean is merged back onto every row sharing the
    same ``seq_col`` value and scored against the observed ``rt_col`` values.

    Args:
        df: DataFrame containing at least ``seq_col`` and ``rt_col``.
        seq_col: Name of the column used as grouping key.
        rt_col: Name of the column holding the observed values.

    Returns:
        The value returned by ``r2_score(observed, per-sequence mean)``.
    """
    mean_rt = (
        df.groupby([seq_col])[rt_col]
        .mean()
        .rename('mean rt')
        .reset_index()
    )
    df_merged = mean_rt.merge(df, how='inner', on=seq_col)
    # r2_score expects (y_true, y_pred) and is not symmetric: the observed
    # values are the ground truth and the per-sequence mean is the prediction.
    return r2_score(df_merged[rt_col], df_merged['mean rt'])
def main():
#data prosit
# df = pd.read_csv('data_prosit/data.csv')
......@@ -134,6 +141,24 @@ def main():
_ = aa_distribution(df['seq'], False, True, '../fig/data_exploration/aa_distribution_ISA_prosit_outlier.png')
retention_time_distribution(df['true rt'], False, True, '../fig/data_exploration/retention_time_distribution_ISA_prosit_outlier.png')
# compare variance of outliers vs other peptides
seq_out_list = df['seq'].to_list()
df_prosit = pd.read_csv('data_prosit/data_noc.csv')
if __name__ == '__main__':
main()
# Load the outlier table and map its 'seq' column through
# numerical_to_alphabetical_str (defined elsewhere in this file).
df = pd.read_csv('data_PXD006109/plasma/data_prosit_outlier.csv')
df['seq'] = df['seq'].map(numerical_to_alphabetical_str)
# Compare the variance of outliers vs other peptides.
# A set gives constant-time membership tests; the previous list lookup
# scanned every outlier sequence for each row of df_prosit.
seq_out_set = set(df['seq'])
df_prosit = pd.read_csv('data_prosit/data.csv')
df_prosit['outlier'] = df_prosit['sequence'].isin(seq_out_set)
# 'outlier' is already a boolean mask, no comparison with True needed.
df_prosit_outlier = df_prosit[df_prosit['outlier']]
df_agg_out = df_prosit_outlier.groupby(pd.Grouper(key='mod_sequence'))['irt_scaled'].agg(['mean', 'median', 'var']).reset_index()
df_agg_pro = df_prosit.groupby(pd.Grouper(key='mod_sequence'))['irt_scaled'].agg(['mean', 'median', 'var']).reset_index()
# main()
plt.hist(df_agg_out['var'])
plt.savefig('var_outlier.png')
# Clear the current figure so the second file only contains the second
# histogram instead of both histograms drawn on top of each other.
plt.clf()
plt.hist(df_agg_pro['var'])
plt.savefig('var_pro.png')
\ No newline at end of file
......@@ -316,17 +316,37 @@ if __name__ == '__main__' :
# dataframe = pd.read_csv('../archive_output/ISA/out_ISA_noc_prosit_0.csv')
# df2 = filter_outlier_rt(dataframe)
# df2.to_csv('../data/data_ISA/data_prosit_outlier.csv', index=False)
df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv')
compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num')
df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv')
compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num')
df = pd.read_csv('../data/data_ISA/data_isa.csv')
compute_peptide_properties(df,'ISA','sequence')
df = pd.read_csv('../data/data_prosit/data.csv')
compute_peptide_properties(df,'prosit','sequence')
df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv')
compute_peptide_properties(df,'plasma','sequence')
\ No newline at end of file
# df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv')
# compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num')
## r2_list = []
# for index in range(10):
# dataframe = pd.read_csv( '../output/out_tranfert_prosit_isa_mox_'+str(index)+'.csv')
# r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
# r2_arr = np.array(r2_list)
# print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106
# df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv')
# compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num')
#
# df = pd.read_csv('../data/data_ISA/data_isa.csv')
# compute_peptide_properties(df,'ISA','sequence')
#
# df = pd.read_csv('../data/data_prosit/data.csv')
# compute_peptide_properties(df,'prosit','sequence')
#
# df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv')
# compute_peptide_properties(df,'plasma','sequence')
# Mean and standard deviation of the R2 score over the ten result files
# '../output/out_tranfert_prosit_ISA_<index>.csv' (index 0..9), comparing the
# 'true rt' and 'rt pred' columns of each file.
# NOTE(review): indentation is not visible in this view; presumably the
# read_csv and append lines form the loop body - confirm.
r2_list = []
for index in range(10):
dataframe = pd.read_csv( '../output/out_tranfert_prosit_ISA_'+str(index)+'.csv')
r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
r2_arr = np.array(r2_list)
print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106 and 0.9775968974701088 +/- 0.0022122290222140123 => worse than the mixture
# Same summary for the ten '../output/out_tranfert_prosit_plasma_<index>.csv'
# result files.
r2_list = []
for index in range(10):
dataframe = pd.read_csv( '../output/out_tranfert_prosit_plasma_'+str(index)+'.csv')
r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
r2_arr = np.array(r2_list)
print(r2_arr.mean(),'+/-',r2_arr.std()) #0.977876349023085 +/- 0.0040982548977069695 and 0.982352390283812 +/- 0.00036 => equivalent to the mixture
pass
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment