diff --git a/data_viz.py b/data_viz.py
index f8b8e912e1ba450ba69248f9ae6f471d31c126f8..a9060d07085c9ab404924e93bb1e56c935cd5fe1 100644
--- a/data_viz.py
+++ b/data_viz.py
@@ -275,14 +275,17 @@ def add_length(dataframe):
     dataframe['length']=dataframe['seq'].map(fonc)
 
-# df = pd.read_csv('output/out_common_ISA_ISA_eval_2.csv')
-# add_length(df)
+df = pd.read_csv('output/out_common_ISA_ISA_eval_2.csv')
+add_length(df)
+df['rt pred'] = 0
+for seq, gr in df.groupby('seq'):
+    df.loc[df['seq'] == seq, 'rt pred'] = gr['true rt'].mean()
 # df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
 # histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_ISA_eval.png')
-# scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_eval.png', color=True)
+scatter_rt(df, display=False, save=True, path='fig/custom model res/RT_pred_ISA_ISA_best_possible.png', color=True)
 # histo_length_by_error(df, bins=10, display=False, save=True, path='fig/custom model res/histo_length_ISA_ISA_eval.png')
 #
-df = pd.read_csv('output/out_common_ISA_augmented_3_eval.csv')
+# df = pd.read_csv('output/out_common_ISA_augmented_3_eval.csv')
 # add_length(df)
 # df['abs_error'] = np.abs(df['rt pred']-df['true rt'])
 # histo_abs_error(df, display=False, save=True, path='fig/custom model res/histo_ISA_augmented_3_eval.png')
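Note: the new per-sequence loop above fills 'rt pred' with each sequence's mean 'true rt' (the "best possible" predictor for the scatter plot), but it re-scans the whole frame once per sequence. A minimal equivalent sketch, assuming the same 'seq' and 'true rt' columns, that does the assignment in one vectorized pass:

    import pandas as pd

    df = pd.read_csv('output/out_common_ISA_ISA_eval_2.csv')
    # transform('mean') broadcasts each group's mean back onto its own rows,
    # so every row of a sequence receives that sequence's mean retention time
    df['rt pred'] = df.groupby('seq')['true rt'].transform('mean')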
diff --git a/local_integration_msms.py b/local_integration_msms.py
index 17c67ab23cd7c07c21226cffbd2bf1c3e5ecf53b..bba47338e9a3cfe188798bff67ebc6ecb904726a 100644
--- a/local_integration_msms.py
+++ b/local_integration_msms.py
@@ -119,21 +119,26 @@ if __name__ == "__main__":
     df_peak = df_peak[465 < df_peak['RT']]
     df_peak = df_peak[466 > df_peak['RT']]
-
-    df_peak2 = df[df['MSlevel'] == 2]
-    df_peak2 = df_peak2[750.1 < df_peak2['MS1_mz_max']]
-    df_peak2 = df_peak2[750.1 > df_peak2['MS1_mz_min']]
-    df_peak2 = df_peak2[463 < df_peak2['RT']]
-    df_peak2 = df_peak2[467 > df_peak2['RT']]
-
-    mz1, inty1 = integrate_ms_ms(df_peak, 1)
-    mz2, inty2 = integrate_ms_ms(df_peak2, 1)
-    plt.clf()
-    fig, ax = plt.subplots()
-    ax.plot(mz1, inty1, linewidth=0.3)
-    ax.plot(mz2, inty2, linewidth=0.3)
-    ax.set_xlim(200, 1800)
-    plt.savefig('spec_combined.png')
-    plt.clf()
+#
+#    df_peak2 = df[df['MSlevel'] == 2]
+#    df_peak2 = df_peak2[750.1 < df_peak2['MS1_mz_max']]
+#    df_peak2 = df_peak2[750.1 > df_peak2['MS1_mz_min']]
+#    df_peak2 = df_peak2[463 < df_peak2['RT']]
+#    df_peak2 = df_peak2[467 > df_peak2['RT']]
+#
+#    mz1, inty1 = integrate_ms_ms(df_peak, 1)
+#    mz2, inty2 = integrate_ms_ms(df_peak2, 1)
+#    plt.clf()
+#    fig, ax = plt.subplots()
+#    ax.plot(mz1, inty1, linewidth=0.3)
+#    ax.plot(mz2, inty2, linewidth=0.3)
+#    ax.set_xlim(200, 1800)
+#    plt.savefig('spec_combined.png')
+#    plt.clf()
+
+    df = pd.read_csv('data/staph140_maxquant.csv')
+    df['Retention time'] = df['Retention time'] * 60
+    df_filtered = df[df['Retention time'] > 463]
+    df_filtered = df_filtered[df_filtered['Retention time'] < 467]  # 358.1 358.32
\ No newline at end of file
diff --git a/msms_processing.py b/msms_processing.py
index 55cd7dcf2aac8223e7ab69b6b80af9a8b4629ccb..6437c3fb1c31dff6e16cb726884a3f4d90863ffe 100644
--- a/msms_processing.py
+++ b/msms_processing.py
@@ -8,11 +8,11 @@ import random
 
 def load_data(msms_filet_path='data/msms.txt', score_treshold=70):
     data = pd.read_csv(msms_filet_path, sep='\t')
-    data_compact = data[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']]
+    data_compact = data[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities', 'm/z', 'Precursor Intensity']]
     data_filtered = data_compact[data_compact['Score'] > score_treshold]
     data_filtered = data_filtered[data_filtered['Length'] < 26]
     data_filtered['Spectra'] = data_filtered.apply(lambda x: filter_intensity(x.Matches, x.Intensities), axis=1)
-    return data_filtered[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Spectra']]
+    return data_filtered[['Sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Spectra', 'm/z', 'Precursor Intensity']]
 
 
 def convert(l):
@@ -82,41 +82,43 @@ def mscatter(x,y, ax=None, m=None, **kw):
 #data gradient 3 :
 # 17/01 23/01 24/01
 if __name__ == '__main__':
-    data_1 = pd.read_pickle('database/data_DIA_16_01_aligned30_01.pkl')
-    data_1['file']= 1
-    data_2 = pd.read_pickle('database/data_DIA_17_01_aligned30_01.pkl')
-    data_2['file'] = 2
-    data_3 = pd.read_pickle('database/data_DIA_20_01_aligned30_01.pkl')
-    data_3['file'] = 3
-    data_4 = pd.read_pickle('database/data_DIA_23_01_aligned30_01.pkl')
-    data_4['file'] = 4
-    data_5 = pd.read_pickle('database/data_DIA_24_01_aligned30_01.pkl')
-    data_5['file'] = 5
-    data_6 = pd.read_pickle('database/data_DIA_30_01_aligned30_01.pkl')
-    data_6['file'] = 6
-    data = pd.concat([data_1, data_2, data_3, data_4, data_5, data_6], ignore_index=True)
-
-    num_total = len(data)
-    train_size = np.floor(0.8*num_total)
-    list_gr=[]
-    train_set = []
-    test_set=[]
-    s = 0
-    groups = data.groupby('Sequence')
-    for seq, gr in groups:
-        list_gr.append(gr)
-    random.shuffle(list_gr)
-    for gr in list_gr :
-        if s < train_size :
-            train_set.append(gr)
-            s+= len(gr)
-        else :
-            test_set.append(gr)
-
-    dataset_train = pd.concat(train_set).reset_index(drop=True)
-    dataset_test = pd.concat(test_set).reset_index(drop=True)
-    dataset_train.to_pickle('database/data_DIA_ISA_55_train_30_01.pkl')
-    dataset_test.to_pickle('database/data_DIA_ISA_55_test_30_01.pkl')
+    # data_1 = pd.read_pickle('database/data_DIA_16_01_aligned30_01.pkl')
+    # data_1['file']= 1
+    # data_2 = pd.read_pickle('database/data_DIA_17_01_aligned30_01.pkl')
+    # data_2['file'] = 2
+    # data_3 = pd.read_pickle('database/data_DIA_20_01_aligned30_01.pkl')
+    # data_3['file'] = 3
+    # data_4 = pd.read_pickle('database/data_DIA_23_01_aligned30_01.pkl')
+    # data_4['file'] = 4
+    # data_5 = pd.read_pickle('database/data_DIA_24_01_aligned30_01.pkl')
+    # data_5['file'] = 5
+    # data_6 = pd.read_pickle('database/data_DIA_30_01_aligned30_01.pkl')
+    # data_6['file'] = 6
+    # data = pd.concat([data_1, data_2, data_3, data_4, data_5, data_6], ignore_index=True)
+    #
+    # num_total = len(data)
+    # train_size = np.floor(0.8*num_total)
+    # list_gr=[]
+    # train_set = []
+    # test_set=[]
+    # s = 0
+    # groups = data.groupby('Sequence')
+    # for seq, gr in groups:
+    #     list_gr.append(gr)
+    # random.shuffle(list_gr)
+    # for gr in list_gr :
+    #     if s < train_size :
+    #         train_set.append(gr)
+    #         s+= len(gr)
+    #     else :
+    #         test_set.append(gr)
+    #
+    # dataset_train = pd.concat(train_set).reset_index(drop=True)
+    # dataset_test = pd.concat(test_set).reset_index(drop=True)
+    # dataset_train.to_pickle('database/data_DIA_ISA_55_train_30_01.pkl')
+    # dataset_test.to_pickle('database/data_DIA_ISA_55_test_30_01.pkl')
+
+    data_1 = load_data('data/msms.txt', 0)
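Note: the split that was commented out above assigns whole 'Sequence' groups to the training set until roughly 80% of the rows are used, so no sequence appears on both sides of the split. A sketch of the same idea using scikit-learn's GroupShuffleSplit instead of the manual shuffle-and-fill loop (an alternative, not what this repo uses; assumes scikit-learn is installed):

    import pandas as pd
    from sklearn.model_selection import GroupShuffleSplit

    data = pd.read_pickle('database/data_DIA_16_01_aligned30_01.pkl')
    # one 80/20 split in which no 'Sequence' value lands on both sides
    splitter = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=0)
    train_idx, test_idx = next(splitter.split(data, groups=data['Sequence']))
    dataset_train = data.iloc[train_idx].reset_index(drop=True)
    dataset_test = data.iloc[test_idx].reset_index(drop=True)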
diff --git a/prosit_data_merge.py b/prosit_data_merge.py
index 1404420178aa74b6ce73b4780f2075e7727696d5..02c2431318ef70cf27642427be4468fa4a00681c 100644
--- a/prosit_data_merge.py
+++ b/prosit_data_merge.py
@@ -41,7 +41,6 @@ def padding(dataframe, columns, length):
         return x + (length - len(x) + 2 * x.count('-')) * '_'
 
     for i in range(len(dataframe)):
-        print(i)
         if len(dataframe[columns][i]) > length + 2 * dataframe[columns][i].count('-'):
             dataframe.drop(i)
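Note: in the hunk above, dataframe.drop(i) returns a new frame that is immediately discarded (drop is not in-place), so over-length rows are never actually removed. A hedged sketch of what the filter appears to intend, reusing the hunk's own length test and padding formula (fits_length is a hypothetical helper, not in the repo; assumes columns names a single string column):

    # a row fits if its string is within length, after the 2-character
    # allowance that the hunk's formula grants for each '-'
    def fits_length(x, length):
        return len(x) <= length + 2 * x.count('-')

    mask = dataframe[columns].map(lambda x: fits_length(x, length))
    dataframe = dataframe[mask].reset_index(drop=True)
    # then pad every surviving string up to the effective length with '_'
    dataframe[columns] = dataframe[columns].map(
        lambda x: x + (length - len(x) + 2 * x.count('-')) * '_')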