diff --git a/alignement.py b/alignement.py
index 87a97b87700518a0b23ef66a569e41b30052cd3f..58c5969325cab3a60f86e4793584a4ba14f2ed37 100644
--- a/alignement.py
+++ b/alignement.py
@@ -7,7 +7,7 @@ from common_dataset import Common_Dataset
 import matplotlib.pyplot as plt
 
 ALPHABET_UNMOD = {
-    "_": 0,
+    "": 0,
     "A": 1,
     "C": 2,
     "D": 3,
@@ -67,14 +67,27 @@ def align(dataset, reference):
     return dataset
 
-data_ori = RT_Dataset(None, 'database/data.csv', 'train', 25).data
-data_ori ['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
-# data_ori = pd.read_pickle('database/data_01_16_DIA_ISA_55.pkl')
-# data_ori = Common_Dataset(data_ori, 30).data
-data_train = pd.read_pickle('database/data_DIA_ISA_55_train.pkl')
-# data_train = Common_Dataset(data_train, 30).data
-#
+data_ori = RT_Dataset(None, 'database/data_train.csv', 'train', 25).data
+data_ori['sequence'] = data_ori['sequence'].map(numerical_to_alphabetical)
+
+data_train = pd.read_pickle('database/data_DIA_16_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_16_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_17_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_17_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_20_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_20_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_23_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_23_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_24_01.pkl').reset_index(drop=True)
+data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_24_01_aligned.pkl')
+data_train = pd.read_pickle('database/data_DIA_30_01.pkl').reset_index(drop=True)
 data_align = align(data_train, data_ori)
+data_align.to_pickle('database/data_DIA_30_01_aligned.pkl')
 # plt.scatter(data_train['Retention time'], data_align['Retention time'], s=1)
 plt.savefig('test_align_2.png')
diff --git a/database/data_DIA_ISA_55_test.pkl b/database/data_DIA_ISA_55_test.pkl
index 96dc65f3e991f18cdca1a7e5214afa0adf3f68a3..b7186f96f9507a7ce52296e19e8c4b7b6da6a15a 100644
Binary files a/database/data_DIA_ISA_55_test.pkl and b/database/data_DIA_ISA_55_test.pkl differ
diff --git a/database/data_DIA_ISA_55_train.pkl b/database/data_DIA_ISA_55_train.pkl
index 96dc65f3e991f18cdca1a7e5214afa0adf3f68a3..435b980676ffb348a88160fa92b574236aad8f75 100644
Binary files a/database/data_DIA_ISA_55_train.pkl and b/database/data_DIA_ISA_55_train.pkl differ
diff --git a/msms_processing.py b/msms_processing.py
index 6c3bbb8dd3d06f0f8da5204ce487387e3822e88e..bde575fa0ed43c3f5a374af6d4cbb63c7ddc21ee 100644
--- a/msms_processing.py
+++ b/msms_processing.py
@@ -3,6 +3,7 @@ import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 import json
+import random
 
 
 def load_data(msms_filet_path='data/msms.txt', score_treshold=70):
@@ -76,13 +77,42 @@ def mscatter(x,y, ax=None, m=None, **kw):
             paths.append(path)
         sc.set_paths(paths)
     return sc
-
+# data, gradient 10:
+#   16/01, 20/01, 30/01
+# data, gradient 3:
+#   17/01, 23/01, 24/01
 if __name__ == '__main__':
-    pass
-    # data_2 = load_data('data/Custom_dataset/msmsHBM_UCGTs.txt', i)
-    # data_1 = load_data('data/Custom_dataset/msmsHBM_P450s.txt', i)
-    # data_3 = load_data('data/Custom_dataset/msmsMkBM_P450s.txt', i)
-    # data = pd.concat([data_1, data_2, data_3], ignore_index=True)
+    data_1 = pd.read_pickle('database/data_DIA_16_01_aligned.pkl')
+    data_2 = pd.read_pickle('database/data_DIA_17_01_aligned.pkl')
+    data_3 = pd.read_pickle('database/data_DIA_20_01_aligned.pkl')
+    data_4 = pd.read_pickle('database/data_DIA_23_01_aligned.pkl')
+    data_5 = pd.read_pickle('database/data_DIA_24_01_aligned.pkl')
+    data_6 = pd.read_pickle('database/data_DIA_30_01_aligned.pkl')
+    data = pd.concat([data_1, data_2, data_3, data_4, data_5, data_6], ignore_index=True)
+
+    num_total = len(data)
+    train_size = int(np.floor(0.8 * num_total))
+    list_gr = []
+    train_set = []
+    test_set = []
+    s = 0
+    groups = data.groupby('Sequence')
+    for seq, gr in groups:
+        list_gr.append(gr)
+    random.shuffle(list_gr)
+    for gr in list_gr:
+        if s < train_size:
+            train_set.append(gr)
+            s += len(gr)
+        else:
+            test_set.append(gr)
+
+    dataset_train = pd.concat(train_set).reset_index(drop=True)
+    dataset_test = pd.concat(test_set).reset_index(drop=True)
+
+
+
+
     # err_rt = []
     # err_spec = []
     # nb_data = []
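Note, not part of the patch: the six copy-pasted read_pickle / align / to_pickle blocks in alignement.py and the grouped 80/20 split in msms_processing.py could be factored into a loop and a helper. Below is a minimal sketch under the patch's assumptions: it reuses align() and data_ori from alignement.py and the file naming above, and the `seed` parameter is an addition here for reproducibility (the patch's random.shuffle call is unseeded).

```python
import random
import pandas as pd

# Run dates processed by the patch (DD_MM).
RUN_DATES = ['16_01', '17_01', '20_01', '23_01', '24_01', '30_01']


def align_all_runs(data_ori, dates=RUN_DATES):
    # Same steps as the six repeated blocks in alignement.py; align() and
    # the database/ file layout are taken from the patch.
    for date in dates:
        data_train = pd.read_pickle(f'database/data_DIA_{date}.pkl').reset_index(drop=True)
        data_align = align(data_train, data_ori)
        data_align.to_pickle(f'database/data_DIA_{date}_aligned.pkl')


def grouped_split(data, train_frac=0.8, seed=None):
    # Same logic as the __main__ block in msms_processing.py: shuffle the
    # per-sequence groups, then fill the train partition until it holds
    # roughly train_frac of the rows. `seed` is an addition of this sketch.
    groups = [gr for _, gr in data.groupby('Sequence')]
    random.Random(seed).shuffle(groups)
    train_size = int(train_frac * len(data))
    train_set, test_set, n_train = [], [], 0
    for gr in groups:
        if n_train < train_size:
            train_set.append(gr)
            n_train += len(gr)
        else:
            test_set.append(gr)
    dataset_train = pd.concat(train_set).reset_index(drop=True)
    dataset_test = pd.concat(test_set).reset_index(drop=True)
    return dataset_train, dataset_test
```

Splitting on shuffled Sequence groups rather than on individual rows keeps every observation of a peptide in a single partition, so retention times for a given sequence never leak from train into test.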