Skip to content
Snippets Groups Projects
Commit ef3118f7 authored by Schneider Leo's avatar Schneider Leo
Browse files

dataset exploration

parent 70c39f3f
No related branches found
No related tags found
No related merge requests found
# --- Module-level dataset exploration (scraped commit view; original indentation lost) ---
# Prints the flyer-class distribution of the ISA dataset for each labelling
# scheme, then does the same for three public HuggingFace detectability
# datasets and measures their peptide-sequence overlap with the ISA data.
# Code left byte-identical; comments only.
import pandas as pd
from datasets import load_dataset, DatasetDict
# HuggingFace dataset identifiers to compare against.
df_list =["Wilhelmlab/detectability-proteometools", "Wilhelmlab/detectability-wang","Wilhelmlab/detectability-sinitcyn"]
df_flyer = pd.read_csv('ISA_data/df_flyer_no_miscleavage.csv')
df_no_flyer = pd.read_csv('ISA_data/df_non_flyer_no_miscleavage.csv')
# Class codes used throughout: 0 = no flyer, 1 = weak, 2 = intermediate, 3 = strong.
for label_type in ['Classes fragment','Classes precursor', 'Classes MaxLFQ'] :
# Full ISA dataset = flyers + non-flyers (concat is loop-invariant; recomputed each pass).
df_full = pd.concat([df_flyer,df_no_flyer])
df_size = df_full.shape[0]
nb_no_flyer = df_full[df_full[label_type]==0].shape[0]
nb_weak_flyer = df_full[df_full[label_type] == 1].shape[0]
nb_intermediate_flyer = df_full[df_full[label_type] == 2].shape[0]
nb_strong_flyer = df_full[df_full[label_type] == 3].shape[0]
print('df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(label_type,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
l_inter_ISA=[]
l_df_hg=[]
for hf_data_name in df_list :
# Download the HF dataset and flatten all of its splits into one DataFrame.
hf_dataset_split = load_dataset(hf_data_name)
l = [pd.DataFrame(hf_dataset_split[k]) for k in hf_dataset_split.keys()]
df_hg = pd.concat(l)
df_size = df_hg.shape[0]
nb_no_flyer = df_hg[df_hg['Classes']==0].shape[0]
nb_weak_flyer = df_hg[df_hg['Classes'] == 1].shape[0]
nb_intermediate_flyer = df_hg[df_hg['Classes'] == 2].shape[0]
nb_strong_flyer = df_hg[df_hg['Classes'] == 3].shape[0]
print('df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(hf_data_name,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
# Overlap with ISA data: inner join on peptide sequence; suffixes only apply
# to overlapping column names, so 'Classes' (HF) and 'Classes MaxLFQ' (ISA) keep their names.
df_common = df_hg.join(df_full.set_index('Sequences'),on='Sequences',how='inner',lsuffix='_hg',rsuffix='_ISA')
size_inter = df_common.shape[0]
same_label = df_common[df_common['Classes']==df_common['Classes MaxLFQ']].shape[0]
l_inter_ISA.append(df_common)
print('Inter with ISA df size : {}, similar label : {:.2f}%'.format(size_inter,100*same_label/size_inter))
# Pairwise overlap with the HF datasets loaded in earlier iterations.
# NOTE(review): the loop body is truncated in this scraped view — the
# complete version appears inside inter_dataset_corespondance() below.
for df_hg_bis in l_df_hg :
df_common = df_hg.join(df_hg_bis.set_index('Sequences'), on='Sequences', how='inner', lsuffix='_hg',
rsuffix='_hg_bis')
from datasets import load_dataset
from keras.src.utils.text_dataset import paths_and_labels_to_dataset
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
def _save_confusion_matrix(conf_matrix, display_labels, title, out_file):
    # Render one confusion matrix to a PNG on disk and release the figure.
    conf_matrix_disp = ConfusionMatrixDisplay(
        confusion_matrix=conf_matrix, display_labels=display_labels
    )
    fig, ax = plt.subplots()
    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
    plt.title(title, y=1.04, fontsize=11)
    plt.savefig(out_file, bbox_inches="tight", dpi=80)
    plt.close()
    plt.clf()


def intra_dataset_varaition():
    """Compare the three flyer labelling schemes within each instrument dataset.

    For the Zeno and Astral flyer CSVs, plots pairwise confusion matrices
    between the 'Classes MaxLFQ', 'Classes precursor' and 'Classes fragment'
    labels and writes six PNG files to the working directory. Returns None.
    """
    # NOTE(review): function name keeps the original 'varaition' spelling so
    # existing callers don't break.
    df_flyer_zeno = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral_4.csv')
    # Flyer-only files: labels are 1..3, hence no "Non Flyer" class here.
    labels = ["Weak Flyer", "Medium Flyer", 'Strong Flyer']
    # (row column, col column, title fragment, filename fragment) per comparison.
    comparisons = [
        ('Classes MaxLFQ', 'Classes precursor', 'maxlfq vs precursor', 'maxlfq_precursor'),
        ('Classes MaxLFQ', 'Classes fragment', 'maxlfq vs fragments', 'maxlfq_fragments'),
        ('Classes fragment', 'Classes precursor', 'fragments vs precursor', 'fragments_precursor'),
    ]
    # Title capitalization ('Zeno' vs 'astral') preserved from the original output.
    datasets = [(df_flyer_zeno, 'Zeno', 'zeno'), (df_flyer_astral, 'astral', 'astral')]
    for df, title_name, file_name in datasets:
        for row_col, col_col, pair_title, pair_file in comparisons:
            cm = confusion_matrix(df[row_col], df[col_col])
            _save_confusion_matrix(
                cm,
                labels,
                "Confusion Matrix {} ({})".format(title_name, pair_title),
                'confusion_matrix_{}_{}'.format(file_name, pair_file),
            )
def ISA_dataset_variation():
    """Compare MaxLFQ flyer classes between the Zeno and Astral ISA datasets.

    Joins the two datasets on peptide sequence and writes two confusion-matrix
    PNGs to the working directory: one over all peptides (flyers + non-flyers)
    and one restricted to flyers. Returns None.
    """
    df_flyer = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_no_flyer = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage.csv')
    df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral.csv')
    df_no_flyer_astral = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage_astral.csv')
    # Keep only the columns needed for the label comparison.
    df_flyer_Zeno = df_flyer[['Sequences', 'Classes MaxLFQ']]
    df_flyer_astral = df_flyer_astral[['Sequences', 'Classes MaxLFQ']]
    df_no_flyer_Zeno = df_no_flyer[['Sequences', 'Classes MaxLFQ']]
    df_no_flyer_astral = df_no_flyer_astral[['Sequences', 'Classes MaxLFQ']]
    df_zeno = pd.concat([df_flyer_Zeno, df_no_flyer_Zeno], axis=0)
    df_astral = pd.concat([df_flyer_astral, df_no_flyer_astral], axis=0)
    # Inner join on sequence; the suffixes produce the columns
    # 'Classes MaxLFQzeno' and 'Classes MaxLFQastral'.
    df_inter = df_zeno.join(df_astral.set_index('Sequences'), on='Sequences',
                            how='inner', lsuffix='zeno', rsuffix='astral')
    df_inter_flyer = df_flyer_Zeno.join(df_flyer_astral.set_index('Sequences'), on='Sequences',
                                        how='inner', lsuffix='zeno', rsuffix='astral')
    conf_matrix = confusion_matrix(df_inter['Classes MaxLFQastral'], df_inter['Classes MaxLFQzeno'])
    conf_matrix_flyer = confusion_matrix(df_inter_flyer['Classes MaxLFQastral'], df_inter_flyer['Classes MaxLFQzeno'])
    # All peptides: 4 classes including "Non Flyer".
    conf_matrix_disp = ConfusionMatrixDisplay(
        confusion_matrix=conf_matrix,
        display_labels=["Non Flyer", "Weak Flyer", "Medium Flyer", 'Strong Flyer']
    )
    fig, ax = plt.subplots()
    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
    plt.title("Confusion Matrix (astral vs zeno)", y=1.04, fontsize=11)
    plt.savefig('confusion_matrix_zeno_astral', bbox_inches="tight", dpi=80)
    plt.close()
    plt.clf()
    # Flyers only: 3 classes.
    conf_matrix_disp = ConfusionMatrixDisplay(
        confusion_matrix=conf_matrix_flyer,
        display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer']
    )
    fig, ax = plt.subplots()
    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
    # Fixed title typo: was "Confusion Matrix FLyer (astral vs zeno)".
    plt.title("Confusion Matrix Flyer (astral vs zeno)", y=1.04, fontsize=11)
    plt.savefig('confusion_matrix_flyer_zeno_astral', bbox_inches="tight", dpi=80)
    plt.close()
    plt.clf()
def inter_dataset_corespondance():
    """Measure label agreement between the ISA data and public HF detectability datasets.

    Prints the class distribution of the ISA dataset for each labelling scheme,
    then for each HuggingFace dataset prints its class distribution, its
    sequence overlap with the ISA data (agreement of 'Classes' vs
    'Classes MaxLFQ'), and its pairwise overlap with the previously loaded
    HF datasets (agreement plus a cross-tabulation). Returns None.

    NOTE(review): reconstructed from a commit-diff rendering in which
    duplicated inner-loop lines had been interleaved before the loop existed
    (referencing 'Classes_hg_bis' on a join that used the '_ISA' suffix, a
    guaranteed KeyError). The name keeps the original 'corespondance' spelling
    so existing callers don't break.
    """
    df_flyer = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_no_flyer = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage.csv')
    # Full ISA dataset = flyers + non-flyers; hoisted out of the loop (invariant).
    df_full = pd.concat([df_flyer, df_no_flyer])
    df_size = df_full.shape[0]
    for label_type in ['Classes fragment', 'Classes precursor', 'Classes MaxLFQ']:
        # Class codes: 0 = no flyer, 1 = weak, 2 = intermediate, 3 = strong.
        counts = [df_full[df_full[label_type] == c].shape[0] for c in range(4)]
        print('df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(
            label_type, *(100 * c / df_size for c in counts)))
    df_list = ["Wilhelmlab/detectability-proteometools", "Wilhelmlab/detectability-wang", "Wilhelmlab/detectability-sinitcyn"]
    l_inter_ISA = []
    l_df_hg = []
    for hf_data_name in df_list:
        # Download the HF dataset and flatten all of its splits into one DataFrame.
        hf_dataset_split = load_dataset(hf_data_name)
        df_hg = pd.concat([pd.DataFrame(hf_dataset_split[k]) for k in hf_dataset_split.keys()])
        df_size_hg = df_hg.shape[0]
        counts = [df_hg[df_hg['Classes'] == c].shape[0] for c in range(4)]
        print('df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(
            hf_data_name, *(100 * c / df_size_hg for c in counts)))
        # Overlap with ISA: suffixes only rename overlapping columns, so
        # 'Classes' (HF) and 'Classes MaxLFQ' (ISA) keep their names.
        df_common = df_hg.join(df_full.set_index('Sequences'), on='Sequences',
                               how='inner', lsuffix='_hg', rsuffix='_ISA')
        size_inter = df_common.shape[0]
        # NOTE(review): raises ZeroDivisionError if the overlap is empty — confirm acceptable.
        same_label = df_common[df_common['Classes'] == df_common['Classes MaxLFQ']].shape[0]
        l_inter_ISA.append(df_common)
        print('Inter with ISA df size : {}, similar label : {:.2f}%'.format(size_inter, 100 * same_label / size_inter))
        # Pairwise overlap with HF datasets loaded in earlier iterations;
        # both sides have a 'Classes' column, so suffixes apply here.
        for df_hg_bis in l_df_hg:
            df_common = df_hg.join(df_hg_bis.set_index('Sequences'), on='Sequences',
                                   how='inner', lsuffix='_hg', rsuffix='_hg_bis')
            size_inter = df_common.shape[0]
            same_label_size = df_common[df_common['Classes_hg'] == df_common['Classes_hg_bis']].shape[0]
            cf_matrix = pd.crosstab(df_common['Classes_hg'], df_common['Classes_hg_bis'])
            print('Inter with df hg bis df size : {}, similar label : {:.2f}%'.format(size_inter, 100 * same_label_size / size_inter))
            print(cf_matrix)
        l_df_hg.append(df_hg)
......
......@@ -88,7 +88,7 @@ def build_dataset(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
# NOTE(review): this span is a commit-diff rendering, not plain source.
# Removed (old) and added (new) revision lines appear back-to-back, and
# '......@@' markers show where unchanged hunks were elided from view.
# Code is left byte-identical; comments only. Do not run as-is.
# Purpose (from the visible code): build a flyer-classification dataset from
# Astral measurements, labelling each peptide 1/2/3 (weak/medium/strong flyer)
# by intensity tertile within its protein group, for three quantification
# schemes (fragment, precursor, MaxLFQ), then write the result CSVs.
# Old signature (removed): took an unused f_name parameter.
def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
# New signature (added): f_name dropped.
def build_dataset_astral(coverage_treshold = 20, min_peptide = 4):
df = pd.read_excel('ISA_data/250505_Flyers_ASTRAL_mix_12_species.xlsx')
df_non_flyer = pd.read_excel('ISA_data/250505_Non_flyers_ASTRAL_mix_12_species.xlsx')
#No flyer
# NOTE(review): ~25 lines of the non-flyer handling are elided by the diff marker below.
......@@ -102,25 +102,49 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
#Flyer
quantites_table = pd.read_csv('ISA_data/250505_mix_12_souches_lib_12_especes_conta_ASTRAL_BIOASTER_quantities.csv')
# Keep only proteotypic peptides with sufficient protein coverage and no miscleavage.
df_filtered = df[~(pd.isna(df['Proteotypic ?']))]
df_filtered = df_filtered[df_filtered['Coverage']>=coverage_treshold]
df_filtered = df_filtered[pd.isna(df_filtered['Miscleavage ? '])]
peptide_count=df_filtered.groupby(["Protein.Names"]).size().reset_index(name='counts')
quantites_table_filtered = quantites_table[quantites_table['Modified.Sequence'].isin(df_filtered['Stripped.Sequence'])]
# Keep only proteins with at least min_peptide retained peptides.
filtered_sequence = peptide_count[peptide_count['counts']>=min_peptide]["Protein.Names"]
df_filtered = df_filtered[df_filtered["Protein.Names"].isin(filtered_sequence.to_list())]
df_filtered = pd.merge(quantites_table_filtered, df_filtered, how='inner', left_on='Modified.Sequence',
right_on='Stripped.Sequence')
df1_grouped = df_filtered.groupby("Protein.Names")
dico_final={}
# iterate over each group
for group_name, df_group in df1_grouped:
# Old line (removed): sorted sequences by the MaxLFQ run column.
seq = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['Stripped.Sequence'].to_list()
# New line (added): sorted sequences by raw fragment quantity instead.
seq = df_group.sort_values(by=['Fragment.Quant.Raw'])['Stripped.Sequence'].to_list()
value_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['Fragment.Quant.Raw'].to_list()
value_prec = df_group.sort_values(by=['Precursor.Quantity'])['Precursor.Quantity'].to_list()
# Precursor quantities taken in fragment-sorted order (aligned with seq).
value_prec_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['Precursor.Quantity'].to_list()
value_maxlfq = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
# Old line (removed) then new line (added): MaxLFQ values re-aligned to fragment order.
value_maxlfq_frag = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
value_maxlfq_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
# Tertile thresholds per quantification scheme: bottom third -> weak (1),
# middle third -> medium (2), top third -> strong (3).
threshold_weak_flyer_frag = value_frag[int(len(seq) / 3)]
threshold_medium_flyer_frag = value_frag[int(2*len(seq) / 3)]
threshold_weak_flyer_prec = value_prec[int(len(seq) / 3)]
threshold_medium_flyer_prec = value_prec[int(2 * len(seq) / 3)]
threshold_weak_flyer_maxflq = value_maxlfq[int(len(seq) / 3)]
threshold_medium_flyer_maxlfq = value_maxlfq[int(2 * len(seq) / 3)]
prot = df_group['Protein.Group'].to_list()[0]
for i in range(len(seq)):
if value_frag[i] < threshold_weak_flyer_frag :
label_frag = 1
elif value_frag[i] < threshold_medium_flyer_frag :
label_frag = 2
else :
label_frag = 3
if value_prec_frag[i] < threshold_weak_flyer_prec :
label_prec = 1
elif value_prec_frag[i] < threshold_medium_flyer_prec :
label_prec = 2
else :
label_prec = 3
if value_maxlfq_frag[i] < threshold_weak_flyer_maxflq :
label_maxlfq = 1
# NOTE(review): the 'elif' branch assigning label_maxlfq = 2 is elided by the diff marker below.
......@@ -129,14 +153,14 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
else :
label_maxlfq = 3
# Old line (removed): stored only the MaxLFQ label.
dico_final[seq[i]] = (prot,label_maxlfq)
# New line (added): stores all three labels per peptide sequence.
dico_final[seq[i]] = (prot,label_frag,label_prec,label_maxlfq)
# Old/new DataFrame construction (2 vs 4 value columns).
df_final = pd.DataFrame.from_dict(dico_final, orient='index',columns=['Proteins', 'Classes MaxLFQ'])
df_final = pd.DataFrame.from_dict(dico_final, orient='index',columns=['Proteins', 'Classes fragment','Classes precursor', 'Classes MaxLFQ'])
df_final['Sequences']=df_final.index
df_final = df_final.reset_index()
# Old output (removed): single-label CSV under ISA_data/.
df_final=df_final[['Sequences','Proteins', 'Classes MaxLFQ']]
df_final.to_csv('ISA_data/df_flyer_no_miscleavage_astral_15.csv', index=False)
df_non_flyer.to_csv('ISA_data/df_non_flyer_no_miscleavage_astral.csv', index=False)
# New output (added): three-label CSV under ISA_data/datasets/.
df_final=df_final[['Sequences','Proteins','Classes fragment','Classes precursor', 'Classes MaxLFQ']]
df_final.to_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral_4.csv', index=False)
df_non_flyer.to_csv('ISA_data/datasets/df_non_flyer_no_miscleavage_astral.csv', index=False)
def build_regression_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
......@@ -244,10 +268,11 @@ def build_dataset_regression_zeno(coverage_treshold = 20, min_peptide = 4):
if __name__ == '__main__':
    # Entry point: build the three-label Astral flyer dataset and write the CSVs.
    # The diff view retained both the old min_peptide threshold sweep (bar plot
    # of dataset size vs min_pep) and its commented-out new-revision copy; the
    # sweep is dead code in the current revision and has been removed.
    build_dataset_astral()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment