Skip to content
Snippets Groups Projects
Commit f114baba authored by Schneider Leo
Browse files

dataset aligned 30_01

parent 66693158
No related branches found
No related tags found
No related merge requests found
...@@ -42,7 +42,7 @@ def numerical_to_alphabetical(arr): ...@@ -42,7 +42,7 @@ def numerical_to_alphabetical(arr):
return seq return seq
def align(dataset, reference): def align(dataset, reference):
seq_ref = reference['sequence'] seq_ref = reference['Sequence']
seq_common = dataset['Sequence'] seq_common = dataset['Sequence']
seq_ref = seq_ref.tolist() seq_ref = seq_ref.tolist()
seq_common = seq_common.tolist() seq_common = seq_common.tolist()
...@@ -57,10 +57,10 @@ def align(dataset, reference): ...@@ -57,10 +57,10 @@ def align(dataset, reference):
indices_common = dict((k, i) for i, k in enumerate(seq_common)) indices_common = dict((k, i) for i, k in enumerate(seq_common))
indices_common = [indices_common[x] for x in inter] indices_common = [indices_common[x] for x in inter]
rt_ref = reference['irt'][ind_dict_ref].reset_index() rt_ref = reference['Retention time'][ind_dict_ref].reset_index()
rt_data = dataset['Retention time'][indices_common].reset_index() rt_data = dataset['Retention time'][indices_common].reset_index()
xout, yout, wout = loess_1d(np.array(rt_data['Retention time'].tolist()), np.array(rt_ref['irt'].tolist()), xout, yout, wout = loess_1d(np.array(rt_data['Retention time'].tolist()), np.array(rt_ref['Retention time'].tolist()),
xnew=dataset['Retention time'], xnew=dataset['Retention time'],
degree=1, frac=0.5, degree=1, frac=0.5,
npoints=None, rotate=False, sigy=None) npoints=None, rotate=False, sigy=None)
...@@ -68,38 +68,38 @@ def align(dataset, reference): ...@@ -68,38 +68,38 @@ def align(dataset, reference):
return dataset return dataset
data_ori = RT_Dataset(None, 'database/data_train.csv', 'train', 25).data data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True)
data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical) # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
data_train = load_data('msms/msms16_01.txt').reset_index(drop=True) data_train = load_data('msms/msms16_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True) # data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True)
data_align = align(data_train, data_ori) data_align = align(data_train, data_ori)
data_align.to_pickle('database/data_DIA_16_01_aligned.pkl') data_align.to_pickle('database/data_DIA_16_01_aligned30_01.pkl')
data_train = load_data('msms/msms17_01.txt').reset_index(drop=True) data_train = load_data('msms/msms17_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True) # data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True)
data_align = align(data_train, data_ori) data_align = align(data_train, data_ori)
data_align.to_pickle('database/data_DIA_17_01_aligned.pkl') data_align.to_pickle('database/data_DIA_17_01_aligned30_01.pkl')
data_train = load_data('msms/msms20_01.txt').reset_index(drop=True) data_train = load_data('msms/msms20_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True) # data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True)
data_align = align(data_train, data_ori) data_align = align(data_train, data_ori)
data_align.to_pickle('database/data_DIA_20_01_aligned.pkl') data_align.to_pickle('database/data_DIA_20_01_aligned30_01.pkl')
data_train = load_data('msms/msms23_01.txt').reset_index(drop=True) data_train = load_data('msms/msms23_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True) # data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True)
data_align = align(data_train, data_ori) data_align = align(data_train, data_ori)
data_align.to_pickle('database/data_DIA_23_01_aligned.pkl') data_align.to_pickle('database/data_DIA_23_01_aligned30_01.pkl')
data_train = load_data('msms/msms24_01.txt').reset_index(drop=True) data_train = load_data('msms/msms24_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True) # data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True)
data_align = align(data_train, data_ori) data_align = align(data_train, data_ori)
data_align.to_pickle('database/data_DIA_24_01_aligned.pkl') data_align.to_pickle('database/data_DIA_24_01_aligned30_01.pkl')
data_train = load_data('msms/msms30_01.txt').reset_index(drop=True) # data_train = load_data('msms/msms30_01.txt').reset_index(drop=True)
# data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True) # # data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True)
data_align = align(data_train, data_ori) # data_align = align(data_train, data_ori)
data_align.to_pickle('database/data_DIA_30_01_aligned.pkl') # data_align.to_pickle('database/data_DIA_30_01_aligned30_01.pkl')
# #
# plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1) # plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1)
# plt.savefig('test_align_2.png') # plt.savefig('test_align_2.png')
......
...@@ -139,19 +139,19 @@ def histo_abs_error(dataframe, display=False, save=False, path=None): ...@@ -139,19 +139,19 @@ def histo_abs_error(dataframe, display=False, save=False, path=None):
plt.savefig(path) plt.savefig(path)
def random_color_deterministic(df): def random_color_deterministic(df, column):
def rd10(str): def rd10(str):
color = list(mcolors.CSS4_COLORS) color = list(mcolors.CSS4_COLORS)
random.seed(str) random.seed(str)
return color[random.randint(0,147)] return color[random.randint(0,147)]
df['color']=df['seq'].map(rd10) df['color']=df[column].map(rd10)
def scatter_rt(dataframe, display=False, save=False, path=None, color = False): def scatter_rt(dataframe, display=False, save=False, path=None, color = False, col = 'seq'):
fig, ax = plt.subplots() fig, ax = plt.subplots()
if color : if color :
random_color_deterministic(dataframe) random_color_deterministic(dataframe, col)
ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1, color = dataframe['color']) ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1, color = dataframe['color'])
else : else :
ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1) ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1)
...@@ -243,6 +243,7 @@ def compare_error(df1, df2, display=False, save=False, path=None): ...@@ -243,6 +243,7 @@ def compare_error(df1, df2, display=False, save=False, path=None):
if save: if save:
plt.savefig(path) plt.savefig(path)
def add_length(dataframe): def add_length(dataframe):
def fonc(a): def fonc(a):
a = a.replace('[', '') a = a.replace('[', '')
...@@ -253,23 +254,30 @@ def add_length(dataframe): ...@@ -253,23 +254,30 @@ def add_length(dataframe):
dataframe['length']=dataframe['seq'].map(fonc) dataframe['length']=dataframe['seq'].map(fonc)
df = pd.read_csv('output/out_common_ISA_ISA_eval.csv') # df = pd.read_csv('output/out_common_ISA_ISA_eval.csv')
add_length(df) # add_length(df)
df['abs_error'] = np.abs(df['rt pred']-df['true rt']) # df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_ISA_eval.png') # histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_ISA_eval.png')
scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_eval.png', color=True) # scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_eval.png', color=True)
histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_ISA_eval.png') # histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_ISA_eval.png')
#
df = pd.read_csv('output/out_common_prosit_prosit_eval.csv') # df = pd.read_csv('output/out_common_prosit_prosit_eval.csv')
add_length(df) # add_length(df)
df['abs_error'] = np.abs(df['rt pred']-df['true rt']) # df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_prosit_eval.png') # histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_prosit_eval.png')
scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_prosit_eval.png', color=True) # scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_prosit_eval.png', color=True)
histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_prosit_eval.png') # histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_prosit_eval.png')
#
df = pd.read_csv('output/out_common_transfereval.csv') # df = pd.read_csv('output/out_common_transfereval.csv')
# add_length(df)
# df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_ISA_eval.png')
# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_ISA_eval.png', color=True)
# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_ISA_eval.png')
df = pd.read_csv('output/out_common_ISA_ISA_eval_2.csv')
add_length(df) add_length(df)
df['abs_error'] = np.abs(df['rt pred']-df['true rt']) df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_ISA_eval.png') histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_ISA_eval_2.png')
scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_ISA_eval.png', color=True) scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_eval_2_seq.png', color=True, col = 'seq')
histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_ISA_eval.png') histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_ISA_eval_2.png')
\ No newline at end of file \ No newline at end of file
File added
File added
...@@ -82,15 +82,15 @@ def mscatter(x,y, ax=None, m=None, **kw): ...@@ -82,15 +82,15 @@ def mscatter(x,y, ax=None, m=None, **kw):
#data gradient 3 : #data gradient 3 :
# 17/01 23/01 24/01 # 17/01 23/01 24/01
if __name__ == '__main__': if __name__ == '__main__':
data_1 = pd.read_pickle('database/data_DIA_16_01_aligned.pkl') data_1 = pd.read_pickle('database/data_DIA_16_01_aligned30_01.pkl')
data_1['file']= 1 data_1['file']= 1
data_2 = pd.read_pickle('database/data_DIA_17_01_aligned.pkl') data_2 = pd.read_pickle('database/data_DIA_17_01_aligned30_01.pkl')
data_2['file'] = 2 data_2['file'] = 2
data_3 = pd.read_pickle('database/data_DIA_20_01_aligned.pkl') data_3 = pd.read_pickle('database/data_DIA_20_01_aligned30_01.pkl')
data_3['file'] = 3 data_3['file'] = 3
data_4 = pd.read_pickle('database/data_DIA_23_01_aligned.pkl') data_4 = pd.read_pickle('database/data_DIA_23_01_aligned30_01.pkl')
data_4['file'] = 4 data_4['file'] = 4
data_5 = pd.read_pickle('database/data_DIA_24_01_aligned.pkl') data_5 = pd.read_pickle('database/data_DIA_24_01_aligned30_01.pkl')
data_5['file'] = 5 data_5['file'] = 5
data_6 = pd.read_pickle('database/data_DIA_30_01_aligned.pkl') data_6 = pd.read_pickle('database/data_DIA_30_01_aligned.pkl')
data_6['file'] = 6 data_6['file'] = 6
...@@ -115,8 +115,8 @@ if __name__ == '__main__': ...@@ -115,8 +115,8 @@ if __name__ == '__main__':
dataset_train = pd.concat(train_set).reset_index(drop=True) dataset_train = pd.concat(train_set).reset_index(drop=True)
dataset_test = pd.concat(test_set).reset_index(drop=True) dataset_test = pd.concat(test_set).reset_index(drop=True)
dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl') dataset_train.to_pickle('database/data_DIA_ISA_55_train_30_01.pkl')
dataset_test.to_pickle('database/data_DIA_ISA_55_test.pkl') dataset_test.to_pickle('database/data_DIA_ISA_55_test_30_01.pkl')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment