Skip to content
Snippets Groups Projects
Commit 2dbfe726 authored by Schneider Leo's avatar Schneider Leo
Browse files

data viz

parent 73ce0ee0
No related branches found
No related tags found
No related merge requests found
...@@ -2,7 +2,7 @@ import numpy as np ...@@ -2,7 +2,7 @@ import numpy as np
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
from numpy.ma.core import shape from numpy.ma.core import shape
from sklearn.metrics import r2_score
from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV
...@@ -75,6 +75,13 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e ...@@ -75,6 +75,13 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e
plt.clf() plt.clf()
plt.close() plt.close()
def optimal_prediction_r2(df, seq_col, rt_col):
    """Return the R2 obtained by predicting each row's RT with its sequence mean.

    Every row of ``df`` is assigned the mean of ``rt_col`` over all rows that
    share its ``seq_col`` value, and that per-sequence mean is scored against
    the observed values with ``r2_score``. Because the group mean minimises
    the squared error within a group, this is the best R2 a predictor that
    only sees ``seq_col`` can reach on ``df``.

    Args:
        df: DataFrame holding at least the columns ``seq_col`` and ``rt_col``.
        seq_col: Name of the column used as grouping key.
        rt_col: Name of the numeric column being predicted.

    Returns:
        The R2 score of the per-sequence mean against the observed values.
    """
    # Rows without a grouping key cannot be assigned a group mean; the
    # previous merge-based implementation dropped them as well.
    known = df[df[seq_col].notna()]
    # transform('mean') broadcasts each group's mean back onto its rows, so
    # observation and prediction stay aligned without a merge and without
    # adding a helper column that could clash with an existing one.
    mean_rt = known.groupby(seq_col)[rt_col].transform('mean')
    # r2_score expects (y_true, y_pred): observed values first, then the
    # prediction. R2 is not symmetric, so the order matters.
    return r2_score(known[rt_col], mean_rt)
def main(): def main():
#data prosit #data prosit
# df = pd.read_csv('data_prosit/data.csv') # df = pd.read_csv('data_prosit/data.csv')
...@@ -134,6 +141,24 @@ def main(): ...@@ -134,6 +141,24 @@ def main():
_ = aa_distribution(df['seq'], False, True, '../fig/data_exploration/aa_distribution_ISA_prosit_outlier.png') _ = aa_distribution(df['seq'], False, True, '../fig/data_exploration/aa_distribution_ISA_prosit_outlier.png')
retention_time_distribution(df['true rt'], False, True, '../fig/data_exploration/retention_time_distribution_ISA_prosit_outlier.png') retention_time_distribution(df['true rt'], False, True, '../fig/data_exploration/retention_time_distribution_ISA_prosit_outlier.png')
# compare variance of outliers vs other peptides
seq_out_list = df['seq'].to_list()
df_prosit = pd.read_csv('data_prosit/data_noc.csv')
if __name__ == '__main__':
    # Script entry point: compare the per-peptide variance of 'irt_scaled'
    # between the sequences listed in the outlier file and the whole prosit
    # dataset, and save one histogram per population.
    # main() is intentionally not called here.
    df = pd.read_csv('data_PXD006109/plasma/data_prosit_outlier.csv')
    df['seq'] = df['seq'].map(numerical_to_alphabetical_str)

    # compare variance of outliers vs other peptides
    seq_out_list = df['seq'].to_list()
    df_prosit = pd.read_csv('data_prosit/data.csv')
    # Vectorised membership test instead of scanning the list once per row.
    df_prosit['outlier'] = df_prosit['sequence'].isin(seq_out_list)
    df_prosit_outlier = df_prosit[df_prosit['outlier']]

    stats = ['mean', 'median', 'var']
    df_agg_out = df_prosit_outlier.groupby('mod_sequence')['irt_scaled'].agg(stats).reset_index()
    df_agg_pro = df_prosit.groupby('mod_sequence')['irt_scaled'].agg(stats).reset_index()

    # The sample variance is NaN for sequences with a single measurement, so
    # those entries are dropped before plotting.
    plt.hist(df_agg_out['var'].dropna())
    plt.savefig('var_outlier.png')
    # Start from an empty figure; otherwise the second histogram is drawn on
    # top of the first and 'var_pro.png' shows both populations.
    plt.clf()
    plt.hist(df_agg_pro['var'].dropna())
    plt.savefig('var_pro.png')
    plt.clf()
    plt.close()
\ No newline at end of file
...@@ -316,17 +316,37 @@ if __name__ == '__main__' : ...@@ -316,17 +316,37 @@ if __name__ == '__main__' :
# dataframe = pd.read_csv('../archive_output/ISA/out_ISA_noc_prosit_0.csv') # dataframe = pd.read_csv('../archive_output/ISA/out_ISA_noc_prosit_0.csv')
# df2 = filter_outlier_rt(dataframe) # df2 = filter_outlier_rt(dataframe)
# df2.to_csv('../data/data_ISA/data_prosit_outlier.csv', index=False) # df2.to_csv('../data/data_ISA/data_prosit_outlier.csv', index=False)
df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv') # df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv')
compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num') # compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num')
## r2_list = []
df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv') # for index in range(10):
compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num') # dataframe = pd.read_csv( '../output/out_tranfert_prosit_isa_mox_'+str(index)+'.csv')
# r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
df = pd.read_csv('../data/data_ISA/data_isa.csv') # r2_arr = np.array(r2_list)
compute_peptide_properties(df,'ISA','sequence') # print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106
# df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv')
df = pd.read_csv('../data/data_prosit/data.csv') # compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num')
compute_peptide_properties(df,'prosit','sequence') #
# df = pd.read_csv('../data/data_ISA/data_isa.csv')
df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv') # compute_peptide_properties(df,'ISA','sequence')
compute_peptide_properties(df,'plasma','sequence') #
\ No newline at end of file # df = pd.read_csv('../data/data_prosit/data.csv')
# compute_peptide_properties(df,'prosit','sequence')
#
# df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv')
# compute_peptide_properties(df,'plasma','sequence')
r2_list = []
for index in range(10):
dataframe = pd.read_csv( '../output/out_tranfert_prosit_ISA_'+str(index)+'.csv')
r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
r2_arr = np.array(r2_list)
print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106 et 0.9775968974701088 +/- 0.0022122290222140123 => moins bon que mélange
r2_list = []
for index in range(10):
dataframe = pd.read_csv( '../output/out_tranfert_prosit_plasma_'+str(index)+'.csv')
r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
r2_arr = np.array(r2_list)
print(r2_arr.mean(),'+/-',r2_arr.std()) #0.977876349023085 +/- 0.0040982548977069695 et 0.982352390283812 +/- 0.00036 => equivalent au mélange
pass
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment