diff --git a/alignement.py b/alignement.py index 579c2c0b0cc2a261a4dfc5035c28bf9488ad0ef0..94b782162844e37137f52e894b3caa211b7348d8 100644 --- a/alignement.py +++ b/alignement.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd from loess.loess_1d import loess_1d +import dataloader from dataloader import RT_Dataset from msms_processing import load_data import matplotlib.pyplot as plt @@ -68,43 +69,42 @@ def align(dataset, reference): return dataset -data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True) -# data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical) - -data_train = load_data('msms/msms16_01.txt').reset_index(drop=True) -# data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True) -data_align = align(data_train, data_ori) -data_align.to_pickle('database/data_DIA_16_01_aligned30_01.pkl') - -data_train = load_data('msms/msms17_01.txt').reset_index(drop=True) -# data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True) -data_align = align(data_train, data_ori) -data_align.to_pickle('database/data_DIA_17_01_aligned30_01.pkl') - -data_train = load_data('msms/msms20_01.txt').reset_index(drop=True) -# data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True) -data_align = align(data_train, data_ori) -data_align.to_pickle('database/data_DIA_20_01_aligned30_01.pkl') - -data_train = load_data('msms/msms23_01.txt').reset_index(drop=True) -# data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True) -data_align = align(data_train, data_ori) -data_align.to_pickle('database/data_DIA_23_01_aligned30_01.pkl') - -data_train = load_data('msms/msms24_01.txt').reset_index(drop=True) -# data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True) -data_align = align(data_train, data_ori) -data_align.to_pickle('database/data_DIA_24_01_aligned30_01.pkl') - -data_train = load_data('msms/msms30_01.txt').reset_index(drop=True) -# data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True) +# data_ori = load_data('msms/msms30_01.txt').reset_index(drop=True) +# # data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical) +# +# data_train = load_data('msms/msms16_01.txt').reset_index(drop=True) +# # data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True) # data_align = align(data_train, data_ori) -data_train.to_pickle('database/data_DIA_30_01_aligned30_01.pkl') +# data_align.to_pickle('database/data_DIA_16_01_aligned30_01.pkl') +# +# data_train = load_data('msms/msms17_01.txt').reset_index(drop=True) +# # data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True) +# data_align = align(data_train, data_ori) +# data_align.to_pickle('database/data_DIA_17_01_aligned30_01.pkl') +# +# data_train = load_data('msms/msms20_01.txt').reset_index(drop=True) +# # data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True) +# data_align = align(data_train, data_ori) +# data_align.to_pickle('database/data_DIA_20_01_aligned30_01.pkl') +# +# data_train = load_data('msms/msms23_01.txt').reset_index(drop=True) +# # data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True) +# data_align = align(data_train, data_ori) +# data_align.to_pickle('database/data_DIA_23_01_aligned30_01.pkl') +# +# data_train = load_data('msms/msms24_01.txt').reset_index(drop=True) +# # data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True) +# data_align = align(data_train, data_ori) +# data_align.to_pickle('database/data_DIA_24_01_aligned30_01.pkl') +# +# data_train = load_data('msms/msms30_01.txt').reset_index(drop=True) +# data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True) +# # data_align = align(data_train, data_ori) +# data_train.to_pickle('database/data_DIA_30_01_aligned30_01.pkl') # # plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1) # plt.savefig('test_align_2.png') # -# # dataset_ref = pd.read_pickle('database/data_01_16_DIA_ISA_55.pkl') # data_ref = Common_Dataset(dataset_ref, 25).data # dataset_2 = pd.read_pickle('database/data_01_20_DIA_ISA_55.pkl') @@ -139,3 +139,11 @@ data_train.to_pickle('database/data_DIA_30_01_aligned30_01.pkl') # dataset_test = pd.concat(list_test, ignore_index=True) # dataset_train.to_pickle('database/data_DIA_ISA_55_train.pkl') # dataset_train.to_pickle('database/data_DIA_ISA_55_test.pkl') + +data_train_1 = pd.read_pickle('database/data_DIA_ISA_55_train.pkl').reset_index(drop=True) +data_train_2 = pd.read_pickle('database/data_DIA_ISA_55_test.pkl').reset_index(drop=True) +data_ori = pd.read_csv('database/data_train.csv').reset_index(drop=True) +data_ori['Sequence']=data_ori['sequence'] +data_ori['Retention time']=data_ori['irt'] +data_train = pd.concat([data_train_2,data_train_1]).reset_index(drop=True) +data_align = align(data_train, data_ori) diff --git a/database/data_ISA_aligned_prosit.csv b/database/data_ISA_aligned_prosit.csv index ad48f91637989d9241567186af0a688cf8e801ca..569eaef45fd9044c3d842e6b9107be16ae9f2246 100644 Binary files a/database/data_ISA_aligned_prosit.csv and b/database/data_ISA_aligned_prosit.csv differ