diff --git a/constant.py b/constant.py index 485b7345933f08c3e5f3dbaa5becd92b99f63a5d..7d32614f7c0dce8dc4bb0bce3480dc615813c12f 100644 --- a/constant.py +++ b/constant.py @@ -20,8 +20,8 @@ ALPHABET_UNMOD = { "V": 18, "W": 19, "Y": 20, - "CaC": 22, - "OxM": 21 + "OxM": 21, + "CaC": 22 } ALPHABET_UNMOD_REV = {v: k for k, v in ALPHABET_UNMOD.items()} \ No newline at end of file diff --git a/data/data_PXD006109/data_aligned_plasma.csv b/data/data_PXD006109/data_aligned_plasma.csv new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/data/data_exploration.py b/data/data_exploration.py index 3974273b127ad534af6861735b5af5d876100a1e..808983e9935aff445ed8135efbe81c0ab408dbde 100644 --- a/data/data_exploration.py +++ b/data/data_exploration.py @@ -1,8 +1,10 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd +from numpy.ma.core import shape + +from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV -from constant import ALPHABET_UNMOD def length_distribution(data, plot=False, save=False, f_name='fig/data_exploration/length_distribution.png'): maximum = 31 @@ -36,6 +38,9 @@ def aa_distribution(data, plot=False, save=False, f_name='fig/data_exploration/a freq = 100 * freq / freq.sum() + for i in range(len(freq)) : + print(freq[i],ALPHABET_UNMOD_REV[i]) + dict_freq = ALPHABET_UNMOD.copy() for aa in list(ALPHABET_UNMOD.keys()): dict_freq[aa] = freq[ALPHABET_UNMOD[aa]] @@ -62,25 +67,25 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e def main(): #data prosit - df = pd.read_csv('data_prosit/data.csv') - _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png') - _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png') - retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png') - df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() - _ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png') - _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png') - retention_time_distribution(df_unique['irt_scaled'], False, True, - '../fig/data_exploration/retention_time_distribution_prosit_unique.png') + # df = pd.read_csv('data_prosit/data.csv') + # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png') + # _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png') + # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png') + # df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() + # _ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png') + # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png') + # retention_time_distribution(df_unique['irt_scaled'], False, True, + # '../fig/data_exploration/retention_time_distribution_prosit_unique.png') #prosit no cysteine - df = pd.read_csv('data_prosit/data_noc.csv') - _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png') - _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png') - retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png') - df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() - _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png') - _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png') - retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png') + # df = pd.read_csv('data_prosit/data_noc.csv') + # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png') + # _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png') + # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png') + # df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() + # _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png') + # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png') + # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png') #isa # df = pd.read_csv('data_ISA/data_aligned_isa.csv') @@ -102,6 +107,16 @@ def main(): # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_unique.png') # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_unique.png') + #isa mox + df = pd.read_csv('data_ISA_mox/data_aligned_isa_noc.csv') + _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox.png') + _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox.png') + retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox.png') + df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean() + _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_noc_mox_unique.png') + _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_mox_unique.png') + retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_mox_unique.png') + if __name__ == '__main__': main() diff --git a/data/data_processing.py b/data/data_processing.py index fec3d58bf13cc69e28b148e1dd334ac7681314aa..b7495bdff889e133431d56dabadb54d5077e9036 100644 --- a/data/data_processing.py +++ b/data/data_processing.py @@ -18,24 +18,26 @@ def align(dataset, reference, column_dataset, column_ref, seq_data, seq_ref): ind_dict_ref = dict((k, i) for i, k in enumerate(seq_ref)) inter = set(ind_dict_ref).intersection(seq_common) + print(len(inter)) ind_dict_ref = [ind_dict_ref[x] for x in inter] indices_common = dict((k, i) for i, k in enumerate(seq_common)) indices_common = [indices_common[x] for x in inter] + rt_ref = reference_unique[column_ref][ind_dict_ref].reset_index() rt_data = dataset_unique[column_dataset][indices_common].reset_index() - plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist()) + plt.scatter(rt_data[column_dataset].tolist(),rt_ref[column_ref].tolist(),s=0.1) plt.savefig('test.png') xout, yout, wout = loess_1d(np.array(rt_data[column_dataset].tolist()), np.array(rt_ref[column_ref].tolist()), xnew=dataset[column_dataset], - degree=1, frac=0.5, + degree=1, frac=0.25, npoints=None, rotate=False, sigy=None) - plt.scatter(xout, yout) + plt.scatter(xout, yout, s=0.1) plt.savefig('test_2.png') dataset[column_dataset] = yout @@ -115,46 +117,41 @@ def numerical_to_alphabetical_str(s): def main(): ref = pd.read_csv('data_prosit/data.csv') - df_ISA = pd.read_csv('data_ISA_mox/data_isa.csv') - df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled','sequence', 'mod_sequence') - df_ISA_aligned.to_csv('data_ISA_mox/data_aligned_isa.csv', index=False) - - ref = pd.read_csv('data_prosit/data_noc.csv') - df_ISA = pd.read_csv('data_ISA_mox/data_isa_noc.csv') + df_ISA = pd.read_csv('data_PXD006109/data_plasma.csv') df_ISA_aligned = align(df_ISA, ref, 'irt_scaled', 'irt_scaled','sequence', 'mod_sequence') - df_ISA_aligned.to_csv('data_ISA_mox/data_aligned_isa_noc.csv', index=False) + df_ISA_aligned.to_csv('data_PXD006109/data_aligned_plasma.csv', index=False) if __name__ == '__main__': - # main() - - df_base = pd.read_csv('data_ISA/data_aligned_isa_noc.csv') - df_base = df_base[['sequence', 'irt_scaled','state']] - t = [0.7,1,10] - #reste 07 1 et all - name = ['07','1','all'] - for i in range(len(name)): - - - #creating augmented datasets - print('thresold {} en cours'.format(name[i])) - # - df_0 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_0.csv') - df_1 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_1.csv') - df_2 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_2.csv') - df_3 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_3.csv') - df_4 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_4.csv') - - list_df = [df_0,df_1,df_2,df_3,df_4] - df = select_best_data(list_df, t[i]) - df.to_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i])) - df = pd.read_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i])) - df['state'] = 'train' - df['sequence'] = df['sequence'].map(numerical_to_alphabetical_str) - df_augmented_1 = pd.concat([df, df_base], axis=0).reset_index(drop=True) - df_augmented_1.columns = ['sequence', 'irt_scaled','state'] - - df_augmented_1.to_csv('data_ISA_mox/isa_data_augmented_{}.csv'.format(name[i]), index=False) - print(df_augmented_1.shape) \ No newline at end of file + main() + + # df_base = pd.read_csv('data_ISA/data_aligned_isa_noc.csv') + # df_base = df_base[['sequence', 'irt_scaled','state']] + # t = [0.7,1,10] + # #reste 07 1 et all + # name = ['07','1','all'] + # for i in range(len(name)): + # + # + # #creating augmented datasets + # print('thresold {} en cours'.format(name[i])) + # # + # df_0 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_0.csv') + # df_1 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_1.csv') + # df_2 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_2.csv') + # df_3 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_3.csv') + # df_4 = pd.read_csv('../output/out_ISA_noc_mox_prosit_mod_4.csv') + # + # list_df = [df_0,df_1,df_2,df_3,df_4] + # df = select_best_data(list_df, t[i]) + # df.to_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i])) + # df = pd.read_pickle('data_ISA_mox/data_ISA_additionnal_{}.pkl'.format(name[i])) + # df['state'] = 'train' + # df['sequence'] = df['sequence'].map(numerical_to_alphabetical_str) + # df_augmented_1 = pd.concat([df, df_base], axis=0).reset_index(drop=True) + # df_augmented_1.columns = ['sequence', 'irt_scaled','state'] + # + # df_augmented_1.to_csv('data_ISA_mox/isa_data_augmented_{}.csv'.format(name[i]), index=False) + # print(df_augmented_1.shape) \ No newline at end of file diff --git a/data/data_viz.py b/data/data_viz.py index 9e0898330c1480fb5132775b85dbb2fd3d6cc88e..dfcb91c2247db9ededaa5d8c761f46f027b236a4 100644 --- a/data/data_viz.py +++ b/data/data_viz.py @@ -166,9 +166,10 @@ def plot_res(): def calc_and_plot_res(): all_data=[] base = 'out_early_stop_' - for name in ['ISA_noc_ISA_noc','ISA_aug_005_ISA_noc','ISA_aug_01_ISA_noc','ISA_aug_02_ISA_noc','ISA_aug_03_ISA_noc', - 'ISA_aug_04_ISA_noc','ISA_aug_05_ISA_noc','ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc', - 'ISA_aug_all_ISA_noc','prosit_ISA_noc']: + for name in ['ISA_noc_mox_ISA_noc_mox','ISA_noc_aug_005_mox_ISA_noc_mox','ISA_noc_aug_01_mox_ISA_noc_mox', + 'ISA_noc_aug_02_mox_ISA_noc_mox','ISA_noc_aug_03_mox_ISA_noc_mox','ISA_noc_aug_04_mox_ISA_noc_mox', + 'ISA_noc_aug_05_mox_ISA_noc_mox','ISA_noc_aug_07_mox_ISA_noc_mox','ISA_noc_aug_1_mox_ISA_noc_mox', + 'ISA_noc_aug_all_mox_ISA_noc_mox','prosit_mod_ISA_noc_mox']: print(name) r2_list=[] for index in range(10): @@ -184,7 +185,7 @@ def calc_and_plot_res(): axs.set_xticks([y + 1 for y in range(len(all_data))], labels=[ 'ISA_noc', 'Augm 0.05', 'Augm 0.1', 'Augm 0.2', 'Augm 0.3', 'Augm 0.4', 'Augm 0.5', 'Augm 0.7', 'Augm 1', 'Augm all', 'Prosit', ]) - plt.savefig('../fig/model perf/summary_early_stop.png') + plt.savefig('../fig/model perf/summary_early_stop_mox.png') def error_by_methionine(dataframe): def fonc(a): @@ -206,6 +207,7 @@ def error_by_methionine(dataframe): if __name__ == '__main__' : + calc_and_plot_res() # base = ['ISA_noc_ISA_noc','prosit_ISA_noc', 'ISA_noc_prosit', 'prosit_prosit'] # augmented = ['ISA_aug_07_ISA_noc','ISA_aug_1_ISA_noc','ISA_aug_all_ISA_noc'] # for f_suffix_name in augmented: @@ -218,6 +220,6 @@ if __name__ == '__main__' : # histo_length_by_error(df, bins=10, display=False, save=True, path='../fig/model perf/histo_length_{}_{}.png'.format(f_suffix_name,number)) # calc_and_plot_res() - for number in range(10): - df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number))) - error_by_methionine(df) + # for number in range(10): + # df = pd.read_csv('../output/out_{}_{}.csv'.format('early_stop_ISA_noc_prosit',str(number))) + # error_by_methionine(df) diff --git a/data/msms_processing.py b/data/msms_processing.py index 9cebfb6387b81a07dfdcf1f8ecb780bfd54b55c8..4ea3e2cadee8604122c4623746afe3e520705805 100644 --- a/data/msms_processing.py +++ b/data/msms_processing.py @@ -3,7 +3,7 @@ import numpy as np import random from constant import ALPHABET_UNMOD, ALPHABET_UNMOD_REV -def load_data(msms_filet_path='data/msms.txt', score_treshold=70): +def load_data(msms_filet_path='data/msms_plasma.txt', score_treshold=70): data = pd.read_csv(msms_filet_path, sep='\t') data_compact = data[['Modified sequence', 'Length', 'Charge', 'Retention time', 'Score', 'Matches', 'Intensities']] data_filtered = data_compact[data_compact['Score'] > score_treshold] @@ -23,7 +23,7 @@ def convert(l): num_str)] def convert_mod_to_prosit(s): - return s.replace('(Oxidation (M))','-OxM-').replace('(Acetyl (Protein N-term))','') #acetyl reliquat d'une mauvaise analyse, a enlever a terme + return s.replace('M(ox)','-OxM-').replace('(ac)','@') def numerical_to_alphabetical(arr): seq = '' @@ -31,6 +31,14 @@ def numerical_to_alphabetical(arr): seq+=ALPHABET_UNMOD_REV[arr[i]] return seq +def filter_acetyl(df): + df2 = df.copy() + df2['acetyl']=df['sequence'].map(lambda x: '@' in x) + df2 = df2[df2['acetyl']==False] + df_final = df2.drop('acetyl', axis=1) + df_final.reset_index(drop=True, inplace=True) + return df_final + def filter_cysteine(df): df2 = df.copy() df2['cysteine']=df['sequence'].map(lambda x: 'C' in x) @@ -79,19 +87,12 @@ def add_split_column(data, split=(0.7,0.15,0.15)): return data_split def main(): - df_03_02 = load_data('data_ISA_mox/msms_03_02.txt', 70) - df_16_01 = load_data('data_ISA_mox/msms_16_01.txt', 70) - df_20_01 = load_data('data_ISA_mox/msms_20_01.txt', 70) - df_30_01 = load_data('data_ISA_mox/msms_30_01.txt', 70) - merged_df = pd.concat([df_20_01, df_30_01, df_16_01, df_03_02], ignore_index=True) + df_plasma = load_data('data_PXD006109/msms_plasma.txt', 70) + merged_df = pd.concat([df_plasma], ignore_index=True) final_df = add_split_column(merged_df) - final_df.to_csv('data_ISA_mox/data_isa.csv', index=False) - df2 = filter_cysteine(final_df) - df2.to_csv('data_ISA_mox/data_isa_noc.csv', index=False) + final_df = filter_acetyl(final_df) + final_df.to_csv('data_PXD006109/data_plasma.csv', index=False) - # final_df= pd.read_csv('data_prosit/data.csv') - # df2 = filter_cysteine(final_df) - # df2.to_csv('data_prosit/data_noc.csv', index=False) if __name__ == '__main__': main() \ No newline at end of file