Skip to content
Snippets Groups Projects
Commit ef3118f7 authored by Schneider Leo's avatar Schneider Leo
Browse files

dataset exploration

parent 70c39f3f
No related branches found
No related tags found
No related merge requests found
# --- Module-level dataset exploration (scraped commit view; original indentation lost) ---
# Prints the flyer-class distribution of the ISA dataset for each labelling
# scheme, then does the same for three public HuggingFace detectability
# datasets and measures their peptide-sequence overlap with the ISA data.
# Code left byte-identical; comments only.
import pandas as pd
from datasets import load_dataset, DatasetDict
# HuggingFace dataset identifiers to compare against.
df_list =["Wilhelmlab/detectability-proteometools", "Wilhelmlab/detectability-wang","Wilhelmlab/detectability-sinitcyn"]
df_flyer = pd.read_csv('ISA_data/df_flyer_no_miscleavage.csv')
df_no_flyer = pd.read_csv('ISA_data/df_non_flyer_no_miscleavage.csv')
# Class codes used throughout: 0 = no flyer, 1 = weak, 2 = intermediate, 3 = strong.
for label_type in ['Classes fragment','Classes precursor', 'Classes MaxLFQ'] :
# Full ISA dataset = flyers + non-flyers (concat is loop-invariant; recomputed each pass).
df_full = pd.concat([df_flyer,df_no_flyer])
df_size = df_full.shape[0]
nb_no_flyer = df_full[df_full[label_type]==0].shape[0]
nb_weak_flyer = df_full[df_full[label_type] == 1].shape[0]
nb_intermediate_flyer = df_full[df_full[label_type] == 2].shape[0]
nb_strong_flyer = df_full[df_full[label_type] == 3].shape[0]
print('df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(label_type,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
l_inter_ISA=[]
l_df_hg=[]
for hf_data_name in df_list :
# Download the HF dataset and flatten all of its splits into one DataFrame.
hf_dataset_split = load_dataset(hf_data_name)
l = [pd.DataFrame(hf_dataset_split[k]) for k in hf_dataset_split.keys()]
df_hg = pd.concat(l)
df_size = df_hg.shape[0]
nb_no_flyer = df_hg[df_hg['Classes']==0].shape[0]
nb_weak_flyer = df_hg[df_hg['Classes'] == 1].shape[0]
nb_intermediate_flyer = df_hg[df_hg['Classes'] == 2].shape[0]
nb_strong_flyer = df_hg[df_hg['Classes'] == 3].shape[0]
print('df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(hf_data_name,100*nb_no_flyer/df_size,100*nb_weak_flyer/df_size,100*nb_intermediate_flyer/df_size,100*nb_strong_flyer/df_size))
# Overlap with ISA data: inner join on peptide sequence; suffixes only apply
# to overlapping column names, so 'Classes' (HF) and 'Classes MaxLFQ' (ISA) keep their names.
df_common = df_hg.join(df_full.set_index('Sequences'),on='Sequences',how='inner',lsuffix='_hg',rsuffix='_ISA')
size_inter = df_common.shape[0]
same_label = df_common[df_common['Classes']==df_common['Classes MaxLFQ']].shape[0]
l_inter_ISA.append(df_common)
print('Inter with ISA df size : {}, similar label : {:.2f}%'.format(size_inter,100*same_label/size_inter))
# Pairwise overlap with the HF datasets loaded in earlier iterations.
# NOTE(review): the loop body is truncated in this scraped view — the
# complete version appears inside inter_dataset_corespondance() below.
for df_hg_bis in l_df_hg :
df_common = df_hg.join(df_hg_bis.set_index('Sequences'), on='Sequences', how='inner', lsuffix='_hg',
rsuffix='_hg_bis')
from datasets import load_dataset
from keras.src.utils.text_dataset import paths_and_labels_to_dataset
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
def _save_confusion_matrix(conf_matrix, display_labels, title, out_file):
    # Render one confusion matrix to a PNG on disk and release the figure.
    conf_matrix_disp = ConfusionMatrixDisplay(
        confusion_matrix=conf_matrix, display_labels=display_labels
    )
    fig, ax = plt.subplots()
    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
    plt.title(title, y=1.04, fontsize=11)
    plt.savefig(out_file, bbox_inches="tight", dpi=80)
    plt.close()
    plt.clf()


def intra_dataset_varaition():
    """Compare the three flyer labelling schemes within each instrument dataset.

    For the Zeno and Astral flyer CSVs, plots pairwise confusion matrices
    between the 'Classes MaxLFQ', 'Classes precursor' and 'Classes fragment'
    labels and writes six PNG files to the working directory. Returns None.
    """
    # NOTE(review): function name keeps the original 'varaition' spelling so
    # existing callers don't break.
    df_flyer_zeno = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral_4.csv')
    # Flyer-only files: labels are 1..3, hence no "Non Flyer" class here.
    labels = ["Weak Flyer", "Medium Flyer", 'Strong Flyer']
    # (row column, col column, title fragment, filename fragment) per comparison.
    comparisons = [
        ('Classes MaxLFQ', 'Classes precursor', 'maxlfq vs precursor', 'maxlfq_precursor'),
        ('Classes MaxLFQ', 'Classes fragment', 'maxlfq vs fragments', 'maxlfq_fragments'),
        ('Classes fragment', 'Classes precursor', 'fragments vs precursor', 'fragments_precursor'),
    ]
    # Title capitalization ('Zeno' vs 'astral') preserved from the original output.
    datasets = [(df_flyer_zeno, 'Zeno', 'zeno'), (df_flyer_astral, 'astral', 'astral')]
    for df, title_name, file_name in datasets:
        for row_col, col_col, pair_title, pair_file in comparisons:
            cm = confusion_matrix(df[row_col], df[col_col])
            _save_confusion_matrix(
                cm,
                labels,
                "Confusion Matrix {} ({})".format(title_name, pair_title),
                'confusion_matrix_{}_{}'.format(file_name, pair_file),
            )
def ISA_dataset_variation():
    """Compare MaxLFQ flyer classes between the Zeno and Astral ISA datasets.

    Joins the two datasets on peptide sequence and writes two confusion-matrix
    PNGs to the working directory: one over all peptides (flyers + non-flyers)
    and one restricted to flyers. Returns None.
    """
    df_flyer = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_no_flyer = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage.csv')
    df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral.csv')
    df_no_flyer_astral = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage_astral.csv')
    # Keep only the columns needed for the label comparison.
    df_flyer_Zeno = df_flyer[['Sequences', 'Classes MaxLFQ']]
    df_flyer_astral = df_flyer_astral[['Sequences', 'Classes MaxLFQ']]
    df_no_flyer_Zeno = df_no_flyer[['Sequences', 'Classes MaxLFQ']]
    df_no_flyer_astral = df_no_flyer_astral[['Sequences', 'Classes MaxLFQ']]
    df_zeno = pd.concat([df_flyer_Zeno, df_no_flyer_Zeno], axis=0)
    df_astral = pd.concat([df_flyer_astral, df_no_flyer_astral], axis=0)
    # Inner join on sequence; the suffixes produce the columns
    # 'Classes MaxLFQzeno' and 'Classes MaxLFQastral'.
    df_inter = df_zeno.join(df_astral.set_index('Sequences'), on='Sequences',
                            how='inner', lsuffix='zeno', rsuffix='astral')
    df_inter_flyer = df_flyer_Zeno.join(df_flyer_astral.set_index('Sequences'), on='Sequences',
                                        how='inner', lsuffix='zeno', rsuffix='astral')
    conf_matrix = confusion_matrix(df_inter['Classes MaxLFQastral'], df_inter['Classes MaxLFQzeno'])
    conf_matrix_flyer = confusion_matrix(df_inter_flyer['Classes MaxLFQastral'], df_inter_flyer['Classes MaxLFQzeno'])
    # All peptides: 4 classes including "Non Flyer".
    conf_matrix_disp = ConfusionMatrixDisplay(
        confusion_matrix=conf_matrix,
        display_labels=["Non Flyer", "Weak Flyer", "Medium Flyer", 'Strong Flyer']
    )
    fig, ax = plt.subplots()
    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
    plt.title("Confusion Matrix (astral vs zeno)", y=1.04, fontsize=11)
    plt.savefig('confusion_matrix_zeno_astral', bbox_inches="tight", dpi=80)
    plt.close()
    plt.clf()
    # Flyers only: 3 classes.
    conf_matrix_disp = ConfusionMatrixDisplay(
        confusion_matrix=conf_matrix_flyer,
        display_labels=["Weak Flyer", "Medium Flyer", 'Strong Flyer']
    )
    fig, ax = plt.subplots()
    conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
    # Fixed title typo: was "Confusion Matrix FLyer (astral vs zeno)".
    plt.title("Confusion Matrix Flyer (astral vs zeno)", y=1.04, fontsize=11)
    plt.savefig('confusion_matrix_flyer_zeno_astral', bbox_inches="tight", dpi=80)
    plt.close()
    plt.clf()
def inter_dataset_corespondance():
    """Measure label agreement between the ISA data and public HF detectability datasets.

    Prints the class distribution of the ISA dataset for each labelling scheme,
    then for each HuggingFace dataset prints its class distribution, its
    sequence overlap with the ISA data (agreement of 'Classes' vs
    'Classes MaxLFQ'), and its pairwise overlap with the previously loaded
    HF datasets (agreement plus a cross-tabulation). Returns None.

    NOTE(review): reconstructed from a commit-diff rendering in which
    duplicated inner-loop lines had been interleaved before the loop existed
    (referencing 'Classes_hg_bis' on a join that used the '_ISA' suffix, a
    guaranteed KeyError). The name keeps the original 'corespondance' spelling
    so existing callers don't break.
    """
    df_flyer = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_no_flyer = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage.csv')
    # Full ISA dataset = flyers + non-flyers; hoisted out of the loop (invariant).
    df_full = pd.concat([df_flyer, df_no_flyer])
    df_size = df_full.shape[0]
    for label_type in ['Classes fragment', 'Classes precursor', 'Classes MaxLFQ']:
        # Class codes: 0 = no flyer, 1 = weak, 2 = intermediate, 3 = strong.
        counts = [df_full[df_full[label_type] == c].shape[0] for c in range(4)]
        print('df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(
            label_type, *(100 * c / df_size for c in counts)))
    df_list = ["Wilhelmlab/detectability-proteometools", "Wilhelmlab/detectability-wang", "Wilhelmlab/detectability-sinitcyn"]
    l_inter_ISA = []
    l_df_hg = []
    for hf_data_name in df_list:
        # Download the HF dataset and flatten all of its splits into one DataFrame.
        hf_dataset_split = load_dataset(hf_data_name)
        df_hg = pd.concat([pd.DataFrame(hf_dataset_split[k]) for k in hf_dataset_split.keys()])
        df_size_hg = df_hg.shape[0]
        counts = [df_hg[df_hg['Classes'] == c].shape[0] for c in range(4)]
        print('df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%'.format(
            hf_data_name, *(100 * c / df_size_hg for c in counts)))
        # Overlap with ISA: suffixes only rename overlapping columns, so
        # 'Classes' (HF) and 'Classes MaxLFQ' (ISA) keep their names.
        df_common = df_hg.join(df_full.set_index('Sequences'), on='Sequences',
                               how='inner', lsuffix='_hg', rsuffix='_ISA')
        size_inter = df_common.shape[0]
        # NOTE(review): raises ZeroDivisionError if the overlap is empty — confirm acceptable.
        same_label = df_common[df_common['Classes'] == df_common['Classes MaxLFQ']].shape[0]
        l_inter_ISA.append(df_common)
        print('Inter with ISA df size : {}, similar label : {:.2f}%'.format(size_inter, 100 * same_label / size_inter))
        # Pairwise overlap with HF datasets loaded in earlier iterations;
        # both sides have a 'Classes' column, so suffixes apply here.
        for df_hg_bis in l_df_hg:
            df_common = df_hg.join(df_hg_bis.set_index('Sequences'), on='Sequences',
                                   how='inner', lsuffix='_hg', rsuffix='_hg_bis')
            size_inter = df_common.shape[0]
            same_label_size = df_common[df_common['Classes_hg'] == df_common['Classes_hg_bis']].shape[0]
            cf_matrix = pd.crosstab(df_common['Classes_hg'], df_common['Classes_hg_bis'])
            print('Inter with df hg bis df size : {}, similar label : {:.2f}%'.format(size_inter, 100 * same_label_size / size_inter))
            print(cf_matrix)
        l_df_hg.append(df_hg)
......
......@@ -88,7 +88,7 @@ def build_dataset(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
# NOTE(review): this span is a commit-diff rendering, not plain source.
# Removed (old) and added (new) revision lines appear back-to-back, and
# '......@@' markers show where unchanged hunks were elided from view.
# Code is left byte-identical; comments only. Do not run as-is.
# Purpose (from the visible code): build a flyer-classification dataset from
# Astral measurements, labelling each peptide 1/2/3 (weak/medium/strong flyer)
# by intensity tertile within its protein group, for three quantification
# schemes (fragment, precursor, MaxLFQ), then write the result CSVs.
# Old signature (removed): took an unused f_name parameter.
def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
# New signature (added): f_name dropped.
def build_dataset_astral(coverage_treshold = 20, min_peptide = 4):
df = pd.read_excel('ISA_data/250505_Flyers_ASTRAL_mix_12_species.xlsx')
df_non_flyer = pd.read_excel('ISA_data/250505_Non_flyers_ASTRAL_mix_12_species.xlsx')
#No flyer
# NOTE(review): ~25 lines of the non-flyer handling are elided by the diff marker below.
......@@ -102,25 +102,49 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
#Flyer
quantites_table = pd.read_csv('ISA_data/250505_mix_12_souches_lib_12_especes_conta_ASTRAL_BIOASTER_quantities.csv')
# Keep only proteotypic peptides with sufficient protein coverage and no miscleavage.
df_filtered = df[~(pd.isna(df['Proteotypic ?']))]
df_filtered = df_filtered[df_filtered['Coverage']>=coverage_treshold]
df_filtered = df_filtered[pd.isna(df_filtered['Miscleavage ? '])]
peptide_count=df_filtered.groupby(["Protein.Names"]).size().reset_index(name='counts')
quantites_table_filtered = quantites_table[quantites_table['Modified.Sequence'].isin(df_filtered['Stripped.Sequence'])]
# Keep only proteins with at least min_peptide retained peptides.
filtered_sequence = peptide_count[peptide_count['counts']>=min_peptide]["Protein.Names"]
df_filtered = df_filtered[df_filtered["Protein.Names"].isin(filtered_sequence.to_list())]
df_filtered = pd.merge(quantites_table_filtered, df_filtered, how='inner', left_on='Modified.Sequence',
right_on='Stripped.Sequence')
df1_grouped = df_filtered.groupby("Protein.Names")
dico_final={}
# iterate over each group
for group_name, df_group in df1_grouped:
# Old line (removed): sorted sequences by the MaxLFQ run column.
seq = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['Stripped.Sequence'].to_list()
# New line (added): sorted sequences by raw fragment quantity instead.
seq = df_group.sort_values(by=['Fragment.Quant.Raw'])['Stripped.Sequence'].to_list()
value_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['Fragment.Quant.Raw'].to_list()
value_prec = df_group.sort_values(by=['Precursor.Quantity'])['Precursor.Quantity'].to_list()
# Precursor quantities taken in fragment-sorted order (aligned with seq).
value_prec_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['Precursor.Quantity'].to_list()
value_maxlfq = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
# Old line (removed) then new line (added): MaxLFQ values re-aligned to fragment order.
value_maxlfq_frag = df_group.sort_values(by=['20250129_ISA_MIX-1_48SPD_001'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
value_maxlfq_frag = df_group.sort_values(by=['Fragment.Quant.Raw'])['20250129_ISA_MIX-1_48SPD_001'].to_list()
# Tertile thresholds per quantification scheme: bottom third -> weak (1),
# middle third -> medium (2), top third -> strong (3).
threshold_weak_flyer_frag = value_frag[int(len(seq) / 3)]
threshold_medium_flyer_frag = value_frag[int(2*len(seq) / 3)]
threshold_weak_flyer_prec = value_prec[int(len(seq) / 3)]
threshold_medium_flyer_prec = value_prec[int(2 * len(seq) / 3)]
threshold_weak_flyer_maxflq = value_maxlfq[int(len(seq) / 3)]
threshold_medium_flyer_maxlfq = value_maxlfq[int(2 * len(seq) / 3)]
prot = df_group['Protein.Group'].to_list()[0]
for i in range(len(seq)):
if value_frag[i] < threshold_weak_flyer_frag :
label_frag = 1
elif value_frag[i] < threshold_medium_flyer_frag :
label_frag = 2
else :
label_frag = 3
if value_prec_frag[i] < threshold_weak_flyer_prec :
label_prec = 1
elif value_prec_frag[i] < threshold_medium_flyer_prec :
label_prec = 2
else :
label_prec = 3
if value_maxlfq_frag[i] < threshold_weak_flyer_maxflq :
label_maxlfq = 1
# NOTE(review): the 'elif' branch assigning label_maxlfq = 2 is elided by the diff marker below.
......@@ -129,14 +153,14 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
else :
label_maxlfq = 3
# Old line (removed): stored only the MaxLFQ label.
dico_final[seq[i]] = (prot,label_maxlfq)
# New line (added): stores all three labels per peptide sequence.
dico_final[seq[i]] = (prot,label_frag,label_prec,label_maxlfq)
# Old/new DataFrame construction (2 vs 4 value columns).
df_final = pd.DataFrame.from_dict(dico_final, orient='index',columns=['Proteins', 'Classes MaxLFQ'])
df_final = pd.DataFrame.from_dict(dico_final, orient='index',columns=['Proteins', 'Classes fragment','Classes precursor', 'Classes MaxLFQ'])
df_final['Sequences']=df_final.index
df_final = df_final.reset_index()
# Old output (removed): single-label CSV under ISA_data/.
df_final=df_final[['Sequences','Proteins', 'Classes MaxLFQ']]
df_final.to_csv('ISA_data/df_flyer_no_miscleavage_astral_15.csv', index=False)
df_non_flyer.to_csv('ISA_data/df_non_flyer_no_miscleavage_astral.csv', index=False)
# New output (added): three-label CSV under ISA_data/datasets/.
df_final=df_final[['Sequences','Proteins','Classes fragment','Classes precursor', 'Classes MaxLFQ']]
df_final.to_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral_4.csv', index=False)
df_non_flyer.to_csv('ISA_data/datasets/df_non_flyer_no_miscleavage_astral.csv', index=False)
def build_regression_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
......@@ -244,10 +268,11 @@ def build_dataset_regression_zeno(coverage_treshold = 20, min_peptide = 4):
if __name__ == '__main__':
    # Entry point: build the three-label Astral flyer dataset and write the CSVs.
    # The diff view retained both the old min_peptide threshold sweep (bar plot
    # of dataset size vs min_pep) and its commented-out new-revision copy; the
    # sweep is dead code in the current revision and has been removed.
    build_dataset_astral()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment