diff --git a/alignement.py b/alignement.py
index 1c11a0737517221c5edf939c0991ada89efe594c..4e290fb52501fc7e04826d0fe063ad6b9d48506d 100644
--- a/alignement.py
+++ b/alignement.py
@@ -42,7 +42,7 @@ def numerical_to_alphabetical(arr):
     return seq
 
 def align(dataset, reference):
-    seq_ref = reference['sequence']
+    seq_ref = reference['Sequence']
     seq_common = dataset['Sequence']
     seq_ref = seq_ref.tolist()
     seq_common = seq_common.tolist()
@@ -57,10 +57,10 @@ def align(dataset, reference):
     indices_common = dict((k, i) for i, k in enumerate(seq_common))
     indices_common = [indices_common[x] for x in inter]
 
-    rt_ref = reference['irt'][ind_dict_ref].reset_index()
+    rt_ref = reference['Retention time'][ind_dict_ref].reset_index()
     rt_data = dataset['Retention time'][indices_common].reset_index()
 
-    xout, yout, wout = loess_1d(np.array(rt_data['Retention time'].tolist()), np.array(rt_ref['irt'].tolist()),
+    xout, yout, wout = loess_1d(np.array(rt_data['Retention time'].tolist()), np.array(rt_ref['Retention time'].tolist()),
                                 xnew=dataset['Retention time'], degree=1, frac=0.5,
                                 npoints=None, rotate=False, sigy=None)
 
@@ -68,38 +68,38 @@ def align(dataset, reference):
     return dataset
 
 
-data_ori = RT_Dataset(None, 'database/data_train.csv', 'train', 25).data
-data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
+data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True)
+# data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
 
 data_train = load_data('msms/msms16_01.txt').reset_index(drop=True)
 # data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True)
 data_align = align(data_train, data_ori)
-data_align.to_pickle('database/data_DIA_16_01_aligned.pkl')
+data_align.to_pickle('database/data_DIA_16_01_aligned30_01.pkl')
 
 data_train = load_data('msms/msms17_01.txt').reset_index(drop=True)
 # data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True)
 data_align = align(data_train, data_ori)
-data_align.to_pickle('database/data_DIA_17_01_aligned.pkl')
+data_align.to_pickle('database/data_DIA_17_01_aligned30_01.pkl')
 
 data_train = load_data('msms/msms20_01.txt').reset_index(drop=True)
 # data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True)
 data_align = align(data_train, data_ori)
-data_align.to_pickle('database/data_DIA_20_01_aligned.pkl')
+data_align.to_pickle('database/data_DIA_20_01_aligned30_01.pkl')
 
 data_train = load_data('msms/msms23_01.txt').reset_index(drop=True)
 # data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True)
 data_align = align(data_train, data_ori)
-data_align.to_pickle('database/data_DIA_23_01_aligned.pkl')
+data_align.to_pickle('database/data_DIA_23_01_aligned30_01.pkl')
 
 data_train = load_data('msms/msms24_01.txt').reset_index(drop=True)
 # data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True)
 data_align = align(data_train, data_ori)
-data_align.to_pickle('database/data_DIA_24_01_aligned.pkl')
+data_align.to_pickle('database/data_DIA_24_01_aligned30_01.pkl')
 
-data_train = load_data('msms/msms30_01.txt').reset_index(drop=True)
-# data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True)
-data_align = align(data_train, data_ori)
-data_align.to_pickle('database/data_DIA_30_01_aligned.pkl')
+# data_train = load_data('msms/msms30_01.txt').reset_index(drop=True)
+# # data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True)
+# data_align = align(data_train, data_ori)
+# data_align.to_pickle('database/data_DIA_30_01_aligned30_01.pkl')
 #
 # plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1)
 # plt.savefig('test_align_2.png')
diff --git a/data_viz.py b/data_viz.py
index 9a609cc9b838e896c09ef4e47aa15c6d4dd69720..de2373da73dd6c09664588d71603657617218da6 100644
--- a/data_viz.py
+++ b/data_viz.py
@@ -139,19 +139,19 @@ def histo_abs_error(dataframe, display=False, save=False, path=None):
         plt.savefig(path)
 
 
-def random_color_deterministic(df):
+def random_color_deterministic(df, column):
     def rd10(str):
         color = list(mcolors.CSS4_COLORS)
         random.seed(str)
         return color[random.randint(0,147)]
 
-    df['color']=df['seq'].map(rd10)
+    df['color']=df[column].map(rd10)
 
 
-def scatter_rt(dataframe, display=False, save=False, path=None, color = False):
+def scatter_rt(dataframe, display=False, save=False, path=None, color = False, col = 'seq'):
     fig, ax = plt.subplots()
     if color :
-        random_color_deterministic(dataframe)
+        random_color_deterministic(dataframe, col)
         ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1, color = dataframe['color'])
     else :
         ax.scatter(dataframe['true rt'], dataframe['rt pred'], s=.1)
@@ -243,6 +243,7 @@ def compare_error(df1, df2, display=False, save=False, path=None):
     if save:
         plt.savefig(path)
 
+
 def add_length(dataframe):
     def fonc(a):
         a = a.replace('[', '')
@@ -253,23 +254,30 @@ def add_length(dataframe):
     dataframe['length']=dataframe['seq'].map(fonc)
 
 
-df = pd.read_csv('output/out_common_ISA_ISA_eval.csv')
-add_length(df)
-df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
-histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_ISA_eval.png')
-scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_eval.png', color=True)
-histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_ISA_eval.png')
-
-df = pd.read_csv('output/out_common_prosit_prosit_eval.csv')
-add_length(df)
-df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
-histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_prosit_eval.png')
-scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_prosit_eval.png', color=True)
-histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_prosit_eval.png')
-
-df = pd.read_csv('output/out_common_transfereval.csv')
+# df = pd.read_csv('output/out_common_ISA_ISA_eval.csv')
+# add_length(df)
+# df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
+# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_ISA_eval.png')
+# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_eval.png', color=True)
+# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_ISA_eval.png')
+#
+# df = pd.read_csv('output/out_common_prosit_prosit_eval.csv')
+# add_length(df)
+# df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
+# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_prosit_eval.png')
+# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_prosit_eval.png', color=True)
+# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_prosit_eval.png')
+#
+# df = pd.read_csv('output/out_common_transfereval.csv')
+# add_length(df)
+# df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
+# histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_ISA_eval.png')
+# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_ISA_eval.png', color=True)
+# histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_ISA_eval.png')
+
+df = pd.read_csv('output/out_common_ISA_ISA_eval_2.csv')
 add_length(df)
 df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
-histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_prosit_ISA_eval.png')
-scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_prosit_ISA_eval.png', color=True)
-histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_prosit_ISA_eval.png')
\ No newline at end of file
+histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_ISA_eval_2.png')
+scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_eval_2_seq.png', color=True, col = 'seq')
+histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_ISA_eval_2.png')
\ No newline at end of file
diff --git a/database/data_DIA_ISA_55_test_30_01.pkl b/database/data_DIA_ISA_55_test_30_01.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..b6091e333179587f777da5d16c2880e5bdda77d7
Binary files /dev/null and b/database/data_DIA_ISA_55_test_30_01.pkl differ
diff --git a/database/data_DIA_ISA_55_train_30_01.pkl b/database/data_DIA_ISA_55_train_30_01.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..0ee7e1b319fa2376ae34864af21725b36c7ec267
Binary files /dev/null and b/database/data_DIA_ISA_55_train_30_01.pkl differ
diff --git a/msms_processing.py b/msms_processing.py
index b202abb2aba113030d18362ddf0b73e11ed45881..05cbac233ff85d0d21efe0fd7eda470963c44344 100644
--- a/msms_processing.py
+++ b/msms_processing.py
@@ -82,15 +82,15 @@ def mscatter(x,y, ax=None, m=None, **kw):
 #data gradient 3 :
 # 17/01 23/01 24/01
 if __name__ == '__main__':
-    data_1 = pd.read_pickle('database/data_DIA_16_01_aligned.pkl')
+    data_1 = pd.read_pickle('database/data_DIA_16_01_aligned30_01.pkl')
     data_1['file']= 1
-    data_2 = pd.read_pickle('database/data_DIA_17_01_aligned.pkl')
+    data_2 = pd.read_pickle('database/data_DIA_17_01_aligned30_01.pkl')
     data_2['file'] = 2
-    data_3 = pd.read_pickle('database/data_DIA_20_01_aligned.pkl')
+    data_3 = pd.read_pickle('database/data_DIA_20_01_aligned30_01.pkl')
     data_3['file'] = 3
-    data_4 = pd.read_pickle('database/data_DIA_23_01_aligned.pkl')
+    data_4 = pd.read_pickle('database/data_DIA_23_01_aligned30_01.pkl')
     data_4['file'] = 4
-    data_5 = pd.read_pickle('database/data_DIA_24_01_aligned.pkl')
+    data_5 = pd.read_pickle('database/data_DIA_24_01_aligned30_01.pkl')
     data_5['file'] = 5
     data_6 = pd.read_pickle('database/data_DIA_30_01_aligned.pkl')
     data_6['file'] = 6
@@ -115,8 +115,8 @@ if __name__ == '__main__':
     dataset_train = pd.concat(train_set).reset_index(drop=True)
     dataset_test = pd.concat(test_set).reset_index(drop=True)
 
-    dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl')
-    dataset_test.to_pickle('database/data_DIA_ISA_55_test.pkl')
+    dataset_train.to_pickle('database/data_DIA_ISA_55_train_30_01.pkl')
+    dataset_test.to_pickle('database/data_DIA_ISA_55_test_30_01.pkl')