diff --git a/dataset_comparison.py b/dataset_comparison.py
index f620d0d02e6524cb4e4b230c8eacf5c0dacbcf00..facbc640d38e403bd6413f9c164ad3ceea02c32c 100644
--- a/dataset_comparison.py
+++ b/dataset_comparison.py
@@ -1,55 +1,169 @@
 import pandas as pd
-from datasets import load_dataset, DatasetDict
-
-
-df_list =["Wilhelmlab/detectability-proteometools", "Wilhelmlab/detectability-wang","Wilhelmlab/detectability-sinitcyn"]
-df_flyer = pd.read_csv('ISA_data/df_flyer_no_miscleavage.csv')
-df_no_flyer = pd.read_csv('ISA_data/df_non_flyer_no_miscleavage.csv')
-
-for label_type in ['Classes fragment','Classes precursor', 'Classes MaxLFQ'] :
-    df_full = pd.concat([df_flyer,df_no_flyer])
-    df_size = df_full.shape[0]
-    nb_no_flyer = df_full[df_full[label_type]==0].shape[0]
-    nb_weak_flyer = df_full[df_full[label_type] == 1].shape[0]
-    nb_intermediate_flyer = df_full[df_full[label_type] == 2].shape[0]
-    nb_strong_flyer = df_full[df_full[label_type] == 3].shape[0]
-    print('df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(label_type,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
-
-l_inter_ISA=[]
-l_df_hg=[]
-for hf_data_name in df_list :
-
-    hf_dataset_split = load_dataset(hf_data_name)
-    l = [pd.DataFrame(hf_dataset_split[k]) for k in hf_dataset_split.keys()]
-    df_hg = pd.concat(l)
-
-    df_size = df_hg.shape[0]
-    nb_no_flyer = df_hg[df_hg['Classes']==0].shape[0]
-    nb_weak_flyer = df_hg[df_hg['Classes'] == 1].shape[0]
-    nb_intermediate_flyer = df_hg[df_hg['Classes'] == 2].shape[0]
-    nb_strong_flyer = df_hg[df_hg['Classes'] == 3].shape[0]
-    print('df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(hf_data_name,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
-
-    df_common = df_hg.join(df_full.set_index('Sequences'),on='Sequences',how='inner',lsuffix='_hg',rsuffix='_ISA')
-    size_inter = df_common.shape[0]
-    same_label = df_common[df_common['Classes']==df_common['Classes MaxLFQ']].shape[0]
-    l_inter_ISA.append(df_common)
-    print('Inter with ISA df size : {}, similar label : {:.2f}%'.format(size_inter,100*same_label/size_inter))
-
-    for df_hg_bis in l_df_hg :
-        df_common = df_hg.join(df_hg_bis.set_index('Sequences'), on='Sequences', how='inner', lsuffix='_hg',
-                               rsuffix='_hg_bis')
+from datasets import load_dataset
+from keras.src.utils.text_dataset import paths_and_labels_to_dataset
+from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
+import matplotlib.pyplot as plt
+
+def intra_dataset_varaition():
+    df_flyer_zeno = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
+    df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral_4.csv')
+
+    conf_matrix_zeno_maxlfq_precursor = confusion_matrix(df_flyer_zeno['Classes MaxLFQ'], df_flyer_zeno['Classes precursor'])
+    conf_matrix_zeno_maxlfq_fragments= confusion_matrix(df_flyer_zeno['Classes MaxLFQ'],df_flyer_zeno['Classes fragment'])
+    conf_matrix_zeno_fragments_precursor = confusion_matrix(df_flyer_zeno['Classes fragment'],df_flyer_zeno['Classes precursor'])
+
+    conf_matrix_disp = ConfusionMatrixDisplay(
+        confusion_matrix=conf_matrix_zeno_maxlfq_precursor, display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer']
+    )
+    fig, ax = plt.subplots()
+    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
+    plt.title("Confusion Matrix Zeno (maxlfq vs precursor)", y=1.04, fontsize=11)
vs precursor)", y=1.04, fontsize=11) + plt.savefig('confusion_matrix_zeno_maxlfq_precursor', bbox_inches="tight", dpi=80) + plt.close() + plt.clf() + + conf_matrix_disp = ConfusionMatrixDisplay( + confusion_matrix=conf_matrix_zeno_maxlfq_fragments, display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer'] + ) + fig, ax = plt.subplots() + conf_matrix_disp.plot(xticks_rotation=45, ax=ax) + plt.title("Confusion Matrix Zeno (maxlfq vs fragments)", y=1.04, fontsize=11) + plt.savefig('confusion_matrix_zeno_maxlfq_fragments', bbox_inches="tight", dpi=80) + plt.close() + plt.clf() + + conf_matrix_disp = ConfusionMatrixDisplay( + confusion_matrix=conf_matrix_zeno_fragments_precursor, display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer'] + ) + fig, ax = plt.subplots() + conf_matrix_disp.plot(xticks_rotation=45, ax=ax) + plt.title("Confusion Matrix Zeno (fragments vs precursor)", y=1.04, fontsize=11) + plt.savefig('confusion_matrix_zeno_fragments_precursor', bbox_inches="tight", dpi=80) + plt.close() + plt.clf() + + conf_matrix_astral_maxlfq_precursor = confusion_matrix(df_flyer_astral['Classes MaxLFQ'], df_flyer_astral['Classes precursor']) + conf_matrix_astral_maxlfq_fragments= confusion_matrix(df_flyer_astral['Classes MaxLFQ'],df_flyer_astral['Classes fragment']) + conf_matrix_astral_fragments_precursor = confusion_matrix(df_flyer_astral['Classes fragment'],df_flyer_astral['Classes precursor']) + + conf_matrix_disp = ConfusionMatrixDisplay( + confusion_matrix=conf_matrix_astral_maxlfq_precursor, display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer'] + ) + fig, ax = plt.subplots() + conf_matrix_disp.plot(xticks_rotation=45, ax=ax) + plt.title("Confusion Matrix astral (maxlfq vs precursor)", y=1.04, fontsize=11) + plt.savefig('confusion_matrix_astral_maxlfq_precursor', bbox_inches="tight", dpi=80) + plt.close() + plt.clf() + + conf_matrix_disp = ConfusionMatrixDisplay( + confusion_matrix=conf_matrix_astral_maxlfq_fragments, display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer'] + ) + fig, ax = plt.subplots() + conf_matrix_disp.plot(xticks_rotation=45, ax=ax) + plt.title("Confusion Matrix astral (maxlfq vs fragments)", y=1.04, fontsize=11) + plt.savefig('confusion_matrix_astral_maxlfq_fragments', bbox_inches="tight", dpi=80) + plt.close() + plt.clf() + + conf_matrix_disp = ConfusionMatrixDisplay( + confusion_matrix=conf_matrix_astral_fragments_precursor, display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer'] + ) + fig, ax = plt.subplots() + conf_matrix_disp.plot(xticks_rotation=45, ax=ax) + plt.title("Confusion Matrix astral (fragments vs precursor)", y=1.04, fontsize=11) + plt.savefig('confusion_matrix_astral_fragments_precursor', bbox_inches="tight", dpi=80) + plt.close() + plt.clf() + +def ISA_dataset_variation(): + df_flyer = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv') + df_no_flyer = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage.csv') + + df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral.csv') + df_no_flyer_astral = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage_astral.csv') + df_flyer_Zeno = df_flyer[['Sequences','Classes MaxLFQ']] + df_flyer_astral = df_flyer_astral[['Sequences','Classes MaxLFQ']] + df_no_flyer_Zeno = df_no_flyer[['Sequences','Classes MaxLFQ']] + df_no_flyer_astral = df_no_flyer_astral[['Sequences','Classes MaxLFQ']] + + df_zeno = pd.concat([df_flyer_Zeno,df_no_flyer_Zeno],axis=0) + df_astral = pd.concat([df_flyer_astral,df_no_flyer_astral],axis=0) + df_inter = 
+
+    df_inter_flyer = df_flyer_Zeno.join(df_flyer_astral.set_index('Sequences'),on='Sequences',how = 'inner',lsuffix ='zeno',rsuffix='astral')
+
+
+    conf_matrix= confusion_matrix(df_inter['Classes MaxLFQastral'],df_inter['Classes MaxLFQzeno'])
+
+    conf_matrix_flyer= confusion_matrix(df_inter_flyer['Classes MaxLFQastral'],df_inter_flyer['Classes MaxLFQzeno'])
+
+
+    conf_matrix_disp = ConfusionMatrixDisplay(
+        confusion_matrix=conf_matrix, display_labels=["Non Flyer", "Weak Flyer", "Medium Flyer", 'Strong Flyer']
+    )
+    fig, ax = plt.subplots()
+    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
+    plt.title("Confusion Matrix (astral vs zeno)", y=1.04, fontsize=11)
+    plt.savefig('confusion_matrix_zeno_astral', bbox_inches="tight", dpi=80)
+    plt.close()
+    plt.clf()
+
+    conf_matrix_disp = ConfusionMatrixDisplay(
+        confusion_matrix=conf_matrix_flyer, display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer']
+    )
+    fig, ax = plt.subplots()
+    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
+    plt.title("Confusion Matrix FLyer (astral vs zeno)", y=1.04, fontsize=11)
+    plt.savefig('confusion_matrix_flyer_zeno_astral', bbox_inches="tight", dpi=80)
+    plt.close()
+    plt.clf()
+
+def inter_dataset_corespondance():
+    df_flyer = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
+    df_no_flyer = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage.csv')
+    for label_type in ['Classes fragment','Classes precursor', 'Classes MaxLFQ'] :
+        df_full = pd.concat([df_flyer,df_no_flyer])
+        df_size = df_full.shape[0]
+        nb_no_flyer = df_full[df_full[label_type]==0].shape[0]
+        nb_weak_flyer = df_full[df_full[label_type] == 1].shape[0]
+        nb_intermediate_flyer = df_full[df_full[label_type] == 2].shape[0]
+        nb_strong_flyer = df_full[df_full[label_type] == 3].shape[0]
+        print('df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(label_type,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
+
+
+    df_list =["Wilhelmlab/detectability-proteometools", "Wilhelmlab/detectability-wang","Wilhelmlab/detectability-sinitcyn"]
+    l_inter_ISA=[]
+    l_df_hg=[]
+    for hf_data_name in df_list :
+
+        hf_dataset_split = load_dataset(hf_data_name)
+        l = [pd.DataFrame(hf_dataset_split[k]) for k in hf_dataset_split.keys()]
+        df_hg = pd.concat(l)
+
+        df_size = df_hg.shape[0]
+        nb_no_flyer = df_hg[df_hg['Classes']==0].shape[0]
+        nb_weak_flyer = df_hg[df_hg['Classes'] == 1].shape[0]
+        nb_intermediate_flyer = df_hg[df_hg['Classes'] == 2].shape[0]
+        nb_strong_flyer = df_hg[df_hg['Classes'] == 3].shape[0]
+        print('df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(hf_data_name,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
+
+        df_common = df_hg.join(df_full.set_index('Sequences'),on='Sequences',how='inner',lsuffix='_hg',rsuffix='_ISA')
         size_inter = df_common.shape[0]
-        same_label = df_common[df_common['Classes_hg'] == df_common['Classes_hg_bis']]
-        same_label_size = same_label.shape[0]
-        cf_matrix = pd.crosstab(df_common['Classes_hg'], df_common['Classes_hg_bis'])
-        print('Inter with df hg bis df size : {}, similar label : {:.2f}%'.format(size_inter, 100 * same_label_size / size_inter))
-        print(cf_matrix)
-    l_df_hg.append(df_hg)
-
-
-
-
+        same_label = df_common[df_common['Classes']==df_common['Classes MaxLFQ']].shape[0]
+        l_inter_ISA.append(df_common)
+        print('Inter with ISA df size : {}, similar label : {:.2f}%'.format(size_inter,100*same_label/size_inter))
+
+        for df_hg_bis in l_df_hg :
+            df_common = df_hg.join(df_hg_bis.set_index('Sequences'), on='Sequences', how='inner', lsuffix='_hg',
+                                   rsuffix='_hg_bis')
+            size_inter = df_common.shape[0]
+            same_label = df_common[df_common['Classes_hg'] == df_common['Classes_hg_bis']]
+            same_label_size = same_label.shape[0]
+            cf_matrix = pd.crosstab(df_common['Classes_hg'], df_common['Classes_hg_bis'])
+            print('Inter with df hg bis df size : {}, similar label : {:.2f}%'.format(size_inter, 100 * same_label_size / size_inter))
+            print(cf_matrix)
+        l_df_hg.append(df_hg)
diff --git a/dataset_extraction.py b/dataset_extraction.py
index b6610c7117e3ea09eecba0fecb26792d43548dd5..078e71fe2357e55ac34f238742fed658ab9f66a1 100644
--- a/dataset_extraction.py
+++ b/dataset_extraction.py
@@ -88,7 +88,7 @@ def build_dataset(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
 
 
 
-def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
+def build_dataset_astral(coverage_treshold = 20, min_peptide = 4):
     df = pd.read_excel('ISA_data/250505_Flyers_ASTRAL_mix_12_species.xlsx')
     df_non_flyer = pd.read_excel('ISA_data/250505_Non_flyers_ASTRAL_mix_12_species.xlsx')
     #No flyer
@@ -102,25 +102,49 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
 
 
     #Flyer
+    quantites_table = pd.read_csv('ISA_data/250505_mix_12_souches_lib_12_especes_conta_ASTRAL_BIOASTER_quantities.csv')
     df_filtered = df[~(pd.isna(df['Proteotypic ?']))]
     df_filtered = df_filtered[df_filtered['Coverage']>=coverage_treshold]
     df_filtered = df_filtered[pd.isna(df_filtered['Miscleavage ? '])]
     peptide_count=df_filtered.groupby(["Protein.Names"]).size().reset_index(name='counts')
+    quantites_table_filtered = quantites_table[quantites_table['Modified.Sequence'].isin(df_filtered['Stripped.Sequence'])]
     filtered_sequence = peptide_count[peptide_count['counts']>=min_peptide]["Protein.Names"]
     df_filtered = df_filtered[df_filtered["Protein.Names"].isin(filtered_sequence.to_list())]
+    df_filtered = pd.merge(quantites_table_filtered, df_filtered, how='inner', left_on='Modified.Sequence',
+                           right_on='Stripped.Sequence')
 
     df1_grouped = df_filtered.groupby("Protein.Names")
     dico_final={}
 
     # iterate over each group
     for group_name, df_group in df1_grouped:
-        seq = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['Stripped.Sequence'].to_list()
+        seq = df_group.sort_values(by=['Fragment.Quant.Raw'])['Stripped.Sequence'].to_list()
+        value_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['Fragment.Quant.Raw'].to_list()
+        value_prec = df_group.sort_values(by=['Precursor.Quantity'])['Precursor.Quantity'].to_list()
+        value_prec_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['Precursor.Quantity'].to_list()
         value_maxlfq = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
-        value_maxlfq_frag = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
+        value_maxlfq_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
+        threshold_weak_flyer_frag = value_frag[int(len(seq) / 3)]
+        threshold_medium_flyer_frag = value_frag[int(2*len(seq) / 3)]
+        threshold_weak_flyer_prec = value_prec[int(len(seq) / 3)]
+        threshold_medium_flyer_prec = value_prec[int(2 * len(seq) / 3)]
         threshold_weak_flyer_maxflq = value_maxlfq[int(len(seq) / 3)]
         threshold_medium_flyer_maxlfq = value_maxlfq[int(2 * len(seq) / 3)]
         prot = df_group['Protein.Group'].to_list()[0]
         for i in range(len(seq)):
+            if value_frag[i] < threshold_weak_flyer_frag :
+                label_frag = 1
+            elif value_frag[i] < threshold_medium_flyer_frag :
+                label_frag = 2
+            else :
+                label_frag = 3
+
+            if value_prec_frag[i] < threshold_weak_flyer_prec :
+                label_prec = 1
+            elif value_prec_frag[i] < threshold_medium_flyer_prec :
+                label_prec = 2
+            else :
+                label_prec = 3
 
             if value_maxlfq_frag[i] < threshold_weak_flyer_maxflq :
                 label_maxlfq = 1
@@ -129,14 +153,14 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
             else :
                 label_maxlfq = 3
 
-            dico_final[seq[i]] = (prot,label_maxlfq)
+            dico_final[seq[i]] = (prot,label_frag,label_prec,label_maxlfq)
 
-    df_final = pd.DataFrame.from_dict(dico_final, orient='index',columns=['Proteins', 'Classes MaxLFQ'])
+    df_final = pd.DataFrame.from_dict(dico_final, orient='index',columns=['Proteins', 'Classes fragment','Classes precursor', 'Classes MaxLFQ'])
     df_final['Sequences']=df_final.index
     df_final = df_final.reset_index()
-    df_final=df_final[['Sequences','Proteins', 'Classes MaxLFQ']]
-    df_final.to_csv('ISA_data/df_flyer_no_miscleavage_astral_15.csv', index=False)
-    df_non_flyer.to_csv('ISA_data/df_non_flyer_no_miscleavage_astral.csv', index=False)
+    df_final=df_final[['Sequences','Proteins','Classes fragment','Classes precursor', 'Classes MaxLFQ']]
+    df_final.to_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral_4.csv', index=False)
+    df_non_flyer.to_csv('ISA_data/datasets/df_non_flyer_no_miscleavage_astral.csv', index=False)
 
 
 def build_regression_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
@@ -244,10 +268,11 @@ def build_dataset_regression_zeno(coverage_treshold = 20, min_peptide = 4):
 
 
 if __name__ == '__main__':
-    df_size=[]
-    for min_pep in range(4,20):
-        df = build_regression_dataset_astral(coverage_treshold=20, min_peptide=min_pep)
-        df_size.append(df.shape[0])
-    plt.clf()
-    plt.bar([i for i in range(4,20)],df_size)
-    plt.savefig('number_of_peptides_thr.png')
+    # df_size=[]
+    # for min_pep in range(4,20):
+    #     df = build_regression_dataset_astral(coverage_treshold=20, min_peptide=min_pep)
+    #     df_size.append(df.shape[0])
+    # plt.clf()
+    # plt.bar([i for i in range(4,20)],df_size)
+    # plt.savefig('number_of_peptides_thr.png')
    build_dataset_astral()
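Note on usage, not part of the diff above: after this change dataset_comparison.py only defines the three comparison helpers and no longer runs anything at import time. A minimal driver along the following lines could invoke them; it is a sketch that assumes dataset_comparison.py and the ISA_data/datasets CSV files it reads are present, and the driver script itself is hypothetical.

# Hypothetical driver (not part of the commit above).
from dataset_comparison import (
    ISA_dataset_variation,
    inter_dataset_corespondance,
    intra_dataset_varaition,
)

if __name__ == '__main__':
    # Confusion matrices between the MaxLFQ / precursor / fragment labels
    # within the Zeno and Astral flyer tables.
    intra_dataset_varaition()
    # Confusion matrices of the MaxLFQ labels between the Zeno and Astral runs.
    ISA_dataset_variation()
    # Class distributions and label agreement between the ISA tables and the
    # Wilhelmlab detectability datasets loaded from Hugging Face.
    inter_dataset_corespondance()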