-
Schneider Leo authored2dbfe726
data_viz.py 12.57 KiB
import scipy as sp
from networkx.algorithms.traversal import dfs_tree
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import numpy as np
import random
import pandas as pd
from constant import ALPHABET_UNMOD_REV
import matplotlib.colors as mcolors
import peptides as pep
def histo_abs_error(dataframe, display=False, save=False, path=None):
points = dataframe['abs_error']
## combine these different collections into a list
data_to_plot = [points]
# Create a figure instance
fig, ax = plt.subplots()
# Create the boxplot
ax.set_xlabel('abs error')
ax.violinplot(data_to_plot, vert=False, side='high', showmedians=True, quantiles=[0.95])
ax.set_xlim(0,4)
if display :
plt.show()
if save :
plt.savefig(path)
def random_color_deterministic(df, column):
def rd10(str):
color = list(mcolors.CSS4_COLORS)
random.seed(str)
return color[random.randint(0,147)]
df['color']=df[column].map(rd10)
def scatter_rt(dataframe, display=False, save=False, path=None, color = False, col = 'seq'):
fig, ax = plt.subplots()
if color :
random_color_deterministic(dataframe, col)
ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1, color = dataframe['color'])
else :
ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1)
ax.set_xlabel('true RT')
ax.set_ylabel('pred RT')
x = np.array([min(dataframe['true rt']), max(dataframe['true rt'])])
linreg = sp.stats.linregress(dataframe['true rt'], dataframe['rt pred'])
ax.annotate("r-squared = {:.3f}".format(r2_score(dataframe['true rt'], dataframe['rt pred'])), (0, 1))
plt.plot(x, linreg.intercept + linreg.slope * x, 'r')
if display :
plt.show()
if save :
plt.savefig(path)
def histo_abs_error_by_length(dataframe, display=False, save=False, path=None):
data_to_plot =[]
max_length = max(dataframe['length'])
min_length = min(dataframe['length'])
for l in range(min_length, max_length):
data_to_plot.append(dataframe['abs_error'].where(dataframe['length']==l))
# data_to_plot.append()
fig, ax = plt.subplots()
# Create the boxplot
bp = ax.violinplot(data_to_plot, vert=True, side='low')
if display:
plt.show()
if save:
plt.savefig(path)
def running_mean(x, N):
cumsum = np.cumsum(np.insert(x, 0, 0))
return (cumsum[N:] - cumsum[:-N]) / float(N)
def histo_length_by_error(dataframe, bins, display=False, save=False, path=None):
data_to_plot = []
quanti = []
max_error = max(dataframe['abs_error'])
inter = np.linspace(0, max_error, num=bins+1)
inter_m = running_mean(inter, 2)
inter_labels = list(map(lambda x : round(x,2),inter_m))
inter_labels.insert(0,0)
for i in range(bins):
a = dataframe.loc[(inter[i] < dataframe['abs_error']) & (dataframe['abs_error'] < inter[i+1])]['length']
if len(a)>0:
data_to_plot.append(a)
quanti.append(0.95)
else :
data_to_plot.append([0])
quanti.append(0.95)
fig, ax = plt.subplots()
# Create the boxplot
ax.violinplot(data_to_plot, vert=True, side='high', showmedians=True)
ax.set_ylabel('length')
ax.set_xticks(range(len(inter)),inter_labels)
if display:
plt.show()
if save:
plt.savefig(path)
def add_length(dataframe):
def fonc(a):
a = a.replace('[', '')
a = a.replace(']', '')
a = a.split(',')
a = list(map(int, a))
return np.count_nonzero(np.array(a))
dataframe['length']=dataframe['seq'].map(fonc)
def numerical_to_alphabetical_str(s):
seq = ''
s = s.replace('[','')
s = s.replace(']', '')
arr = s.split(',')
arr = list(map(int, arr))
for i in range(len(arr)):
seq+=ALPHABET_UNMOD_REV[arr[i]]
return seq
def plot_res():
import matplotlib.pyplot as plt
import numpy as np
fig, axs = plt.subplots(figsize=(9, 4))
all_data = [[0.911,0.899,0.9,0.885],
[0.852,0.75,0.857,0.788],
[0.853,0.839,0.862,0.826],
[0.902,0.833,0.808],
[0.9,0.904,0.922,0.907],
[0.915,0.912,0.923,0.911],
[0.945,0.918,0.919,0.933],
[0.91,0.919,0.927,0.906],
[0.881,0.901,0.919,0.902],
[0.893,0.909,0.918,0.896]]
# plot box plot
axs.boxplot(all_data)
axs.set_title('Box plot')
# adding horizontal grid lines
axs.yaxis.grid(True)
axs.set_xticks([y + 1 for y in range(len(all_data))],
labels=['Prosit','ISA_noc','Augm 0.05','Augm 0.1','Augm 0.2','Augm 0.3','Augm 0.4','Augm 05', 'Augm 0.7','Augm 1','Augm all',])
plt.savefig('../fig/model perf/summary.png')
def calc_and_plot_res():
all_data=[]
threshold = ['005','01','02','03','04','05','07','1','all']
base_aug_1 = 'out_early_stop_ISA_aug_'
base_aug_2 = '_ISA_noc'
name_list = ['out_early_stop_ISA_noc_ISA_noc']
for t in threshold :
name_list.append(base_aug_1+t+base_aug_2)
name_list.append('out_early_stop_prosit_ISA_noc')
print(name_list)
for name in name_list:
r2_list=[]
for index in range(10):
dataframe = pd.read_csv('../archive_output/ISA/'+name+'_'+str(index)+'.csv')
r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
all_data.append(r2_list)
fig, axs = plt.subplots(2, 1, figsize=(9, 8))
axs[0].boxplot(all_data)
axs[0].set_title('Box plot')
# adding horizontal grid lines
axs[0].yaxis.grid(True)
axs[1].set_xticks([y for y in range(len(all_data))],
labels=[ 'plasma', 'Augm 0.05', 'Augm 0.1', 'Augm 0.2', 'Augm 0.3', 'Augm 0.4', 'Augm 0.5', 'Augm 0.7',
'Augm 1', 'Augm all', 'Prosit', ], rotation=30)
ref_path, base_path = './data_ISA/data_isa_noc.csv', './data_ISA/isa_data_augmented_'
name = ['005','01','02','03','04','05','07','1','all']
df = pd.read_csv(ref_path)
size = [df.shape[0]]
x = [i for i in range(len(name)+2)]
for i in range(len(name)):
path = base_path+name[i]+'.csv'
df = pd.read_csv(path)
s = df.shape[0]
size.append(s)
df = pd.read_csv('./data_prosit/data.csv')
size.append(df.shape[0])
size = np.array(size)/max(size)
for i in range(len(size)):
axs[1].text(i, size[i], '{:.3f}'.format(size[i]), style='italic')
axs[0].text(i+0.5, np.mean(all_data[i])-0.01, f': {np.mean(all_data[i]):.3f}')
axs[1].plot(x,size)
plt.savefig('../fig/model perf/summary_early_stop_isa.png')
def error_by_methionine(dataframe):
def fonc(a):
a = a.replace('[', '')
a = a.replace(']', '')
a = a.split(',')
a = list(map(int, a))
return a.count(11)
dataframe['meth_count']=dataframe['seq'].map(fonc)
dataframe['abs_error'] = np.abs(dataframe['rt pred'] - dataframe['true rt'])
data_0 = dataframe[dataframe['meth_count'] == 0]
data_1 = dataframe[dataframe['meth_count'] == 1]
data_2 = dataframe[dataframe['meth_count'] == 2]
data_p = dataframe[dataframe['meth_count'] > 2]
print('no meth', len(data_0),data_0['abs_error'].mean(),'1 meth', len(data_1),data_1['abs_error'].mean(),'2 meth', len(data_2),data_2['abs_error'].mean(),
'3+ meth', len(data_p),data_p['abs_error'].mean())
def filter_outlier_rt(dataframe):
dataframe['error'] = dataframe['rt pred'] - dataframe['true rt']
df_out = dataframe[dataframe['error'] < -1.6]
df_in = dataframe[dataframe['error'] >= -1.6]
fig, ax = plt.subplots()
ax.scatter(df_out['true rt'], df_out['rt pred'], s=.1, color='red')
ax.scatter(df_in['true rt'], df_in['rt pred'], s=.1, color='green')
ax.set_xlabel('true RT')
ax.set_ylabel('pred RT')
plt.savefig('../fig/data_exploration/outlier_selection.png')
return df_out
def plot_augmented_dataset_size(ref_path,base_path):
t = [0.05,0.1,0.2,0.3,0.4,0.5,0.7,1,10]
name = ['005','01','02','03','04','05','07','1','all']
df = pd.read_csv(ref_path)
size = [df.shape[0]]
x = [i for i in range(len(name)+1)]
for i in range(len(name)):
path = base_path+name[i]+'.csv'
df = pd.read_csv(path)
size.append(df.shape[0])
size = np.array(size)/max(size)
fig, ax = plt.subplots()
ax.plot(x,size)
ax.set_xticks([y + 1 for y in range(len(name)+1)],
labels=['base', 'Augm 0.05', 'Augm 0.1', 'Augm 0.2', 'Augm 0.3', 'Augm 0.4', 'Augm 0.5',
'Augm 0.7',
'Augm 1', 'Augm all'], rotation=30)
plt.savefig('../fig/data_exploration/augmented_dataset_size.png')
def compute_peptide_properties(df, base_name, col='seq', format='alpha'):
if format!= 'alpha':
df[col] = df[col].map(numerical_to_alphabetical_str)
hydro=[]
isop=[]
molecular_w = []
for p in df[col]:
pept = pep.Peptide(p)
hydro.append(pept.hydrophobicity())
isop.append(pept.isoelectric_point())
molecular_w.append(pept.molecular_weight())
plt.hist(hydro,bins = 50)
plt.title("Hydrophobicity")
plt.savefig('../fig/data_exploration/hydrophobicity_{}.png'.format(base_name))
plt.clf()
plt.hist(hydro,bins = 50)
plt.title("Isoelectric point")
plt.savefig('../fig/data_exploration/isoelectric_point_{}.png'.format(base_name))
plt.clf()
plt.hist(hydro,bins = 50)
plt.title("Molecular weight")
plt.savefig('../fig/data_exploration/molecular_weight_{}.png'.format(base_name))
plt.clf()
if __name__ == '__main__' :
# calc_and_plot_res()
# base = ['plasma_plasma','plasma_prosit']
# # augmented = ['ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc','ISA_aug_all_ISA_noc']
# for f_suffix_name in base:
# for number in ['1','2','3','4']:
# df = pd.read_csv('../output/out_early_stop_{}_{}.csv'.format(f_suffix_name,number))
# # add_length(df)
# df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
# # histo_abs_error(df, display=False, save=True, path='../fig/model perf/histo_{}_{}.png'.format(f_suffix_name,number))
# scatter_rt(df, display=False, save=True, path='../fig/model perf/RT_pred_{}_{}.png'.format(f_suffix_name,number), color=True)
# # histo_length_by_error(df, bins=10, display=False, save=True, path='../fig/model perf/histo_length_{}_{}.png'.format(f_suffix_name,number))
# for number in range(10):
# df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number)))
# error_by_methionine(df)
# dataframe = pd.read_csv('../output/out_early_stop_plasma_prosit_0.csv')
# df = filter_outlier_rt(dataframe)
# df.to_csv('../data/data_PXD006109/data_prosit_outlier.csv', index=False)
#
# dataframe = pd.read_csv('../archive_output/ISA/out_ISA_noc_prosit_0.csv')
# df2 = filter_outlier_rt(dataframe)
# df2.to_csv('../data/data_ISA/data_prosit_outlier.csv', index=False)
# df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv')
# compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num')
## r2_list = []
# for index in range(10):
# dataframe = pd.read_csv( '../output/out_tranfert_prosit_isa_mox_'+str(index)+'.csv')
# r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
# r2_arr = np.array(r2_list)
# print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106
# df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv')
# compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num')
#
# df = pd.read_csv('../data/data_ISA/data_isa.csv')
# compute_peptide_properties(df,'ISA','sequence')
#
# df = pd.read_csv('../data/data_prosit/data.csv')
# compute_peptide_properties(df,'prosit','sequence')
#
# df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv')
# compute_peptide_properties(df,'plasma','sequence')
r2_list = []
for index in range(10):
dataframe = pd.read_csv( '../output/out_tranfert_prosit_ISA_'+str(index)+'.csv')
r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
r2_arr = np.array(r2_list)
print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106 et 0.9775968974701088 +/- 0.0022122290222140123 => moins bon que mélange
r2_list = []
for index in range(10):
dataframe = pd.read_csv( '../output/out_tranfert_prosit_plasma_'+str(index)+'.csv')
r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
r2_arr = np.array(r2_list)
print(r2_arr.mean(),'+/-',r2_arr.std()) #0.977876349023085 +/- 0.0040982548977069695 et 0.982352390283812 +/- 0.00036 => equivalent au mélange
pass