diff --git a/data/data_exploration.py b/data/data_exploration.py index 6a36320168ffd6af154345769c48642ec369ddec..3974273b127ad534af6861735b5af5d876100a1e 100644 --- a/data/data_exploration.py +++ b/data/data_exploration.py @@ -5,13 +5,13 @@ import pandas as pd from constant import ALPHABET_UNMOD def length_distribution(data, plot=False, save=False, f_name='fig/data_exploration/length_distribution.png'): - max = 31 - dist = np.zeros(max) + maximum = 31 + dist = np.zeros(maximum) for seq in data: dist[len(list(seq)) - seq.count('-') * 2] += 1 if plot or save: - plt.stairs(dist, range(max + 1), fill=True) + plt.stairs(dist, range(maximum + 1), fill=True) if plot: plt.show() if save: @@ -64,9 +64,9 @@ def main(): #data prosit df = pd.read_csv('data_prosit/data.csv') _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit.png') - _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png') + _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit.png') retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit.png') - df_unique = df[['sequence','irt_scaled']].groupby('sequence').mean() + df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() _ = length_distribution(df_unique.index, False, True, '../fig/data_exploration/length_distribution_prosit_unique.png') _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_unique.png') retention_time_distribution(df_unique['irt_scaled'], False, True, @@ -75,32 +75,32 @@ def main(): #prosit no cysteine df = pd.read_csv('data_prosit/data_noc.csv') _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_prosit_noc.png') - _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png') + _ = aa_distribution(df['mod_sequence'], False, True, '../fig/data_exploration/aa_distribution_prosit_noc.png') retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc.png') - df_unique = df[['sequence','irt_scaled']].groupby('sequence').mean() + df_unique = df[['mod_sequence','irt_scaled']].groupby('mod_sequence').mean() _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_prosit_noc_unique.png') _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_prosit_noc_unique.png') retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_prosit_noc_unique.png') #isa - df = pd.read_csv('data_ISA/data_aligned_isa.csv') - _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa.png') - _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa.png') - retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa.png') - df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean() - _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_unique.png') - _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_unique.png') - retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_unique.png') - - #isa no cystéine - df = pd.read_csv('data_ISA/data_aligned_isa_noc.csv') - _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa_noc.png') - _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa_noc.png') - retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc.png') - df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean() - _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_noc_unique.png') - _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_unique.png') - retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_unique.png') + # df = pd.read_csv('data_ISA/data_aligned_isa.csv') + # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa.png') + # _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa.png') + # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa.png') + # df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean() + # _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_unique.png') + # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_unique.png') + # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_unique.png') + # + # #isa no cystéine + # df = pd.read_csv('data_ISA/data_aligned_isa_noc.csv') + # _ = length_distribution(df['sequence'],False ,True, '../fig/data_exploration/length_distribution_isa_noc.png') + # _ = aa_distribution(df['sequence'], False, True, '../fig/data_exploration/aa_distribution_isa_noc.png') + # retention_time_distribution(df['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc.png') + # df_unique = df[['sequence', 'irt_scaled']].groupby('sequence').mean() + # _ = length_distribution(df_unique.index,False ,True, '../fig/data_exploration/length_distribution_isa_noc_unique.png') + # _ = aa_distribution(df_unique.index, False, True, '../fig/data_exploration/aa_distribution_isa_noc_unique.png') + # retention_time_distribution(df_unique['irt_scaled'], False, True, '../fig/data_exploration/retention_time_distribution_isa_noc_unique.png') if __name__ == '__main__':