def compute_peptide_properties(df, base_name, col='seq', format='alpha'):
    """Save histograms of three per-peptide properties computed from ``df[col]``.

    For every sequence in ``df[col]`` a ``pep.Peptide`` is built and its
    hydrophobicity, isoelectric point and molecular weight are collected.
    One 50-bin histogram per property is written to
    ``../fig/data_exploration/<property>_<base_name>.png``.

    Args:
        df: DataFrame holding the peptide sequences. It is not modified.
        base_name: Suffix appended to each output file name.
        col: Name of the column containing the sequences.
        format: ``'alpha'`` when the sequences can be passed to
            ``pep.Peptide`` as they are; any other value makes the column
            go through ``numerical_to_alphabetical_str`` first (presumably a
            numeric-encoding -> letter-string conversion; confirm against
            that helper).

    Returns:
        None. The only outputs are the three saved figures.
    """
    # Convert on a local Series so the caller's DataFrame is left untouched.
    sequences = df[col]
    if format != 'alpha':
        sequences = sequences.map(numerical_to_alphabetical_str)

    hydro = []
    isop = []
    molecular_w = []
    for p in sequences:
        pept = pep.Peptide(p)
        hydro.append(pept.hydrophobicity())
        isop.append(pept.isoelectric_point())
        molecular_w.append(pept.molecular_weight())

    # Each histogram must be drawn from its own list of values: plotting
    # `hydro` under every title would make the three figures identical.
    plots = (
        (hydro, "Hydrophobicity", 'hydrophobicity'),
        (isop, "Isoelectric point", 'isoelectric_point'),
        (molecular_w, "Molecular weight", 'molecular_weight'),
    )
    for values, title, prefix in plots:
        plt.hist(values, bins=50)
        plt.title(title)
        plt.savefig('../fig/data_exploration/{}_{}.png'.format(prefix, base_name))
        # Clear the current figure so the next histogram starts from a blank canvas.
        plt.clf()
compute_peptide_properties(df,'prosit','sequence') + + df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv') + compute_peptide_properties(df,'plasma','sequence') \ No newline at end of file diff --git a/main.py b/main.py index 472cdb9986316e56e8ce41b89173c328ff129c9c..9893370015b16305e26f6abaf6e919fcea0afe9e 100644 --- a/main.py +++ b/main.py @@ -106,7 +106,7 @@ def main(args): embedding_dim=args.embedding_dim, acti=args.activation, norm=args.norm_first) if args.model_weigh is not None : - model.load_state_dict(torch.load(args.model_weigh+'.pt', weights_only=True)) + model.load_state_dict(torch.load(args.model_weigh, weights_only=True)) if torch.cuda.is_available(): model = model.cuda()