Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
Peptide Detectability
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Léo Schneider
Peptide Detectability
Commits
ef3118f7
Commit
ef3118f7
authored
4 weeks ago
by
Schneider Leo
Browse files
Options
Downloads
Patches
Plain Diff
dataset exploration
parent
70c39f3f
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
dataset_comparison.py
+164
-50
164 additions, 50 deletions
dataset_comparison.py
dataset_extraction.py
+40
-15
40 additions, 15 deletions
dataset_extraction.py
with
204 additions
and
65 deletions
dataset_comparison.py
+
164
−
50
View file @
ef3118f7
import
pandas
as
pd
from
datasets
import
load_dataset
,
DatasetDict
df_list
=
[
"
Wilhelmlab/detectability-proteometools
"
,
"
Wilhelmlab/detectability-wang
"
,
"
Wilhelmlab/detectability-sinitcyn
"
]
df_flyer
=
pd
.
read_csv
(
'
ISA_data/df_flyer_no_miscleavage.csv
'
)
df_no_flyer
=
pd
.
read_csv
(
'
ISA_data/df_non_flyer_no_miscleavage.csv
'
)
for
label_type
in
[
'
Classes fragment
'
,
'
Classes precursor
'
,
'
Classes MaxLFQ
'
]
:
df_full
=
pd
.
concat
([
df_flyer
,
df_no_flyer
])
df_size
=
df_full
.
shape
[
0
]
nb_no_flyer
=
df_full
[
df_full
[
label_type
]
==
0
].
shape
[
0
]
nb_weak_flyer
=
df_full
[
df_full
[
label_type
]
==
1
].
shape
[
0
]
nb_intermediate_flyer
=
df_full
[
df_full
[
label_type
]
==
2
].
shape
[
0
]
nb_strong_flyer
=
df_full
[
df_full
[
label_type
]
==
3
].
shape
[
0
]
print
(
'
df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%
'
.
format
(
label_type
,
100
*
nb_no_flyer
/
df_size
,
100
*
nb_weak_flyer
/
df_size
,
100
*
nb_intermediate_flyer
/
df_size
,
100
*
nb_strong_flyer
/
df_size
))
l_inter_ISA
=
[]
l_df_hg
=
[]
for
hf_data_name
in
df_list
:
hf_dataset_split
=
load_dataset
(
hf_data_name
)
l
=
[
pd
.
DataFrame
(
hf_dataset_split
[
k
])
for
k
in
hf_dataset_split
.
keys
()]
df_hg
=
pd
.
concat
(
l
)
df_size
=
df_hg
.
shape
[
0
]
nb_no_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
0
].
shape
[
0
]
nb_weak_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
1
].
shape
[
0
]
nb_intermediate_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
2
].
shape
[
0
]
nb_strong_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
3
].
shape
[
0
]
print
(
'
df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%
'
.
format
(
hf_data_name
,
100
*
nb_no_flyer
/
df_size
,
100
*
nb_weak_flyer
/
df_size
,
100
*
nb_intermediate_flyer
/
df_size
,
100
*
nb_strong_flyer
/
df_size
))
df_common
=
df_hg
.
join
(
df_full
.
set_index
(
'
Sequences
'
),
on
=
'
Sequences
'
,
how
=
'
inner
'
,
lsuffix
=
'
_hg
'
,
rsuffix
=
'
_ISA
'
)
size_inter
=
df_common
.
shape
[
0
]
same_label
=
df_common
[
df_common
[
'
Classes
'
]
==
df_common
[
'
Classes MaxLFQ
'
]].
shape
[
0
]
l_inter_ISA
.
append
(
df_common
)
print
(
'
Inter with ISA df size : {}, similar label : {:.2f}%
'
.
format
(
size_inter
,
100
*
same_label
/
size_inter
))
for
df_hg_bis
in
l_df_hg
:
df_common
=
df_hg
.
join
(
df_hg_bis
.
set_index
(
'
Sequences
'
),
on
=
'
Sequences
'
,
how
=
'
inner
'
,
lsuffix
=
'
_hg
'
,
rsuffix
=
'
_hg_bis
'
)
from
datasets
import
load_dataset
from
keras.src.utils.text_dataset
import
paths_and_labels_to_dataset
from
sklearn.metrics
import
ConfusionMatrixDisplay
,
confusion_matrix
import
matplotlib.pyplot
as
plt
def intra_dataset_varaition():
    """Plot how the labelling schemes agree inside each ISA flyer table.

    The Zeno and Astral flyer tables are read from ``ISA_data/datasets``.
    For each table the 'Classes MaxLFQ', 'Classes precursor' and
    'Classes fragment' columns are compared pairwise with a confusion
    matrix, and every matrix is saved as a figure (six figures in total).

    Figures are written with relative file names and no extension, so they
    land in the current working directory in matplotlib's default format.
    Returns nothing.
    """
    flyer_class_names = ["Weak Flyer", "Medium Flyer", 'Strong Flyer']

    # (short name, column) of the first labelling, then of the second one.
    # The short names are reused in both the plot title and the file name.
    label_pairs = [
        ('maxlfq', 'Classes MaxLFQ', 'precursor', 'Classes precursor'),
        ('maxlfq', 'Classes MaxLFQ', 'fragments', 'Classes fragment'),
        ('fragments', 'Classes fragment', 'precursor', 'Classes precursor'),
    ]

    # Both tables are loaded up front so that a missing file aborts the run
    # before any figure has been written.
    df_flyer_zeno = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral_4.csv')

    # (name used in titles, name used in file names, table)
    instruments = [
        ('Zeno', 'zeno', df_flyer_zeno),
        ('astral', 'astral', df_flyer_astral),
    ]

    for title_name, file_name, df_labels in instruments:
        # All three matrices are computed before plotting, so a missing
        # column fails before any figure of this instrument is saved.
        matrices = [
            confusion_matrix(df_labels[first_col], df_labels[second_col])
            for _, first_col, _, second_col in label_pairs
        ]

        for (first_name, _, second_name, _), matrix in zip(label_pairs, matrices):
            conf_matrix_disp = ConfusionMatrixDisplay(
                confusion_matrix=matrix,
                display_labels=flyer_class_names,
            )
            fig, ax = plt.subplots()
            conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
            plt.title(
                "Confusion Matrix {} ({} vs {})".format(title_name, first_name, second_name),
                y=1.04,
                fontsize=11,
            )
            plt.savefig(
                'confusion_matrix_{}_{}_{}'.format(file_name, first_name, second_name),
                bbox_inches="tight",
                dpi=80,
            )
            # Close this exact figure. Calling plt.clf() after plt.close()
            # would make pyplot create a new empty figure that stays open.
            plt.close(fig)
def ISA_dataset_variation():
    """Plot how 'Classes MaxLFQ' labels agree between Zeno and Astral data.

    The flyer and non-flyer tables of both acquisitions are read from
    ``ISA_data/datasets`` and reduced to their 'Sequences' and
    'Classes MaxLFQ' columns. Sequences present in both acquisitions are
    matched with an inner join, and two confusion matrices are saved as
    figures:

    * ``confusion_matrix_zeno_astral``: flyer and non-flyer rows together,
    * ``confusion_matrix_flyer_zeno_astral``: flyer rows only.

    The Astral labels are passed as the first argument of
    ``confusion_matrix`` and the Zeno labels as the second. Figures are
    written with relative file names and no extension. Returns nothing.
    """
    label_columns = ['Sequences', 'Classes MaxLFQ']

    df_flyer = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage.csv')
    df_no_flyer = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage.csv')
    df_flyer_astral = pd.read_csv('ISA_data/datasets/df_flyer_no_miscleavage_astral.csv')
    df_no_flyer_astral = pd.read_csv('ISA_data/datasets/df_non_flyer_no_miscleavage_astral.csv')

    # Keep only the peptide sequence and its MaxLFQ-based class.
    df_flyer_Zeno = df_flyer[label_columns]
    df_flyer_astral = df_flyer_astral[label_columns]
    df_no_flyer_Zeno = df_no_flyer[label_columns]
    df_no_flyer_astral = df_no_flyer_astral[label_columns]

    df_zeno = pd.concat([df_flyer_Zeno, df_no_flyer_Zeno], axis=0)
    df_astral = pd.concat([df_flyer_astral, df_no_flyer_astral], axis=0)

    # The class column exists on both sides of the join, so the suffixes
    # rename it to 'Classes MaxLFQzeno' and 'Classes MaxLFQastral'.
    df_inter = df_zeno.join(
        df_astral.set_index('Sequences'),
        on='Sequences', how='inner', lsuffix='zeno', rsuffix='astral',
    )
    df_inter_flyer = df_flyer_Zeno.join(
        df_flyer_astral.set_index('Sequences'),
        on='Sequences', how='inner', lsuffix='zeno', rsuffix='astral',
    )

    conf_matrix = confusion_matrix(
        df_inter['Classes MaxLFQastral'], df_inter['Classes MaxLFQzeno'])
    conf_matrix_flyer = confusion_matrix(
        df_inter_flyer['Classes MaxLFQastral'], df_inter_flyer['Classes MaxLFQzeno'])

    # (matrix, axis labels, plot title, output file name)
    plots = [
        (
            conf_matrix,
            ["Non Flyer", "Weak Flyer", "Medium Flyer", 'Strong Flyer'],
            "Confusion Matrix (astral vs zeno)",
            'confusion_matrix_zeno_astral',
        ),
        (
            conf_matrix_flyer,
            ["Weak Flyer", "Medium Flyer", 'Strong Flyer'],
            "Confusion Matrix FLyer (astral vs zeno)",
            'confusion_matrix_flyer_zeno_astral',
        ),
    ]

    for matrix, class_names, title, out_name in plots:
        conf_matrix_disp = ConfusionMatrixDisplay(
            confusion_matrix=matrix, display_labels=class_names)
        fig, ax = plt.subplots()
        conf_matrix_disp.plot(xticks_rotation=45, ax=ax)
        plt.title(title, y=1.04, fontsize=11)
        plt.savefig(out_name, bbox_inches="tight", dpi=80)
        # Close this exact figure. Calling plt.clf() after plt.close()
        # would make pyplot create a new empty figure that stays open.
        plt.close(fig)
def
inter_dataset_corespondance
():
df_flyer
=
pd
.
read_csv
(
'
ISA_data/datasets/df_flyer_no_miscleavage.csv
'
)
df_no_flyer
=
pd
.
read_csv
(
'
ISA_data/datasets/df_non_flyer_no_miscleavage.csv
'
)
for
label_type
in
[
'
Classes fragment
'
,
'
Classes precursor
'
,
'
Classes MaxLFQ
'
]
:
df_full
=
pd
.
concat
([
df_flyer
,
df_no_flyer
])
df_size
=
df_full
.
shape
[
0
]
nb_no_flyer
=
df_full
[
df_full
[
label_type
]
==
0
].
shape
[
0
]
nb_weak_flyer
=
df_full
[
df_full
[
label_type
]
==
1
].
shape
[
0
]
nb_intermediate_flyer
=
df_full
[
df_full
[
label_type
]
==
2
].
shape
[
0
]
nb_strong_flyer
=
df_full
[
df_full
[
label_type
]
==
3
].
shape
[
0
]
print
(
'
df ISA {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%
'
.
format
(
label_type
,
100
*
nb_no_flyer
/
df_size
,
100
*
nb_weak_flyer
/
df_size
,
100
*
nb_intermediate_flyer
/
df_size
,
100
*
nb_strong_flyer
/
df_size
))
df_list
=
[
"
Wilhelmlab/detectability-proteometools
"
,
"
Wilhelmlab/detectability-wang
"
,
"
Wilhelmlab/detectability-sinitcyn
"
]
l_inter_ISA
=
[]
l_df_hg
=
[]
for
hf_data_name
in
df_list
:
hf_dataset_split
=
load_dataset
(
hf_data_name
)
l
=
[
pd
.
DataFrame
(
hf_dataset_split
[
k
])
for
k
in
hf_dataset_split
.
keys
()]
df_hg
=
pd
.
concat
(
l
)
df_size
=
df_hg
.
shape
[
0
]
nb_no_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
0
].
shape
[
0
]
nb_weak_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
1
].
shape
[
0
]
nb_intermediate_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
2
].
shape
[
0
]
nb_strong_flyer
=
df_hg
[
df_hg
[
'
Classes
'
]
==
3
].
shape
[
0
]
print
(
'
df {} class repartition : No flyer {:.2f}%, Weak flyer {:.2f}%, Intermediate flyer {:.2f}%, Strong flyer {:.2f}%
'
.
format
(
hf_data_name
,
100
*
nb_no_flyer
/
df_size
,
100
*
nb_weak_flyer
/
df_size
,
100
*
nb_intermediate_flyer
/
df_size
,
100
*
nb_strong_flyer
/
df_size
))
df_common
=
df_hg
.
join
(
df_full
.
set_index
(
'
Sequences
'
),
on
=
'
Sequences
'
,
how
=
'
inner
'
,
lsuffix
=
'
_hg
'
,
rsuffix
=
'
_ISA
'
)
size_inter
=
df_common
.
shape
[
0
]
same_label
=
df_common
[
df_common
[
'
Classes_hg
'
]
==
df_common
[
'
Classes_hg_bis
'
]]
same_label_size
=
same_label
.
shape
[
0
]
cf_matrix
=
pd
.
crosstab
(
df_common
[
'
Classes_hg
'
],
df_common
[
'
Classes_hg_bis
'
])
print
(
'
Inter with df hg bis df size : {}, similar label : {:.2f}%
'
.
format
(
size_inter
,
100
*
same_label_size
/
size_inter
))
print
(
cf_matrix
)
l_df_hg
.
append
(
df_hg
)
same_label
=
df_common
[
df_common
[
'
Classes
'
]
==
df_common
[
'
Classes MaxLFQ
'
]].
shape
[
0
]
l_inter_ISA
.
append
(
df_common
)
print
(
'
Inter with ISA df size : {}, similar label : {:.2f}%
'
.
format
(
size_inter
,
100
*
same_label
/
size_inter
))
for
df_hg_bis
in
l_df_hg
:
df_common
=
df_hg
.
join
(
df_hg_bis
.
set_index
(
'
Sequences
'
),
on
=
'
Sequences
'
,
how
=
'
inner
'
,
lsuffix
=
'
_hg
'
,
rsuffix
=
'
_hg_bis
'
)
size_inter
=
df_common
.
shape
[
0
]
same_label
=
df_common
[
df_common
[
'
Classes_hg
'
]
==
df_common
[
'
Classes_hg_bis
'
]]
same_label_size
=
same_label
.
shape
[
0
]
cf_matrix
=
pd
.
crosstab
(
df_common
[
'
Classes_hg
'
],
df_common
[
'
Classes_hg_bis
'
])
print
(
'
Inter with df hg bis df size : {}, similar label : {:.2f}%
'
.
format
(
size_inter
,
100
*
same_label_size
/
size_inter
))
print
(
cf_matrix
)
l_df_hg
.
append
(
df_hg
)
...
...
This diff is collapsed.
Click to expand it.
dataset_extraction.py
+
40
−
15
View file @
ef3118f7
...
...
@@ -88,7 +88,7 @@ def build_dataset(coverage_treshold = 20, min_peptide = 4, f_name='out_df.csv'):
def
build_dataset_astral
(
coverage_treshold
=
20
,
min_peptide
=
4
,
f_name
=
'
out_df.csv
'
):
def
build_dataset_astral
(
coverage_treshold
=
20
,
min_peptide
=
4
):
df
=
pd
.
read_excel
(
'
ISA_data/250505_Flyers_ASTRAL_mix_12_species.xlsx
'
)
df_non_flyer
=
pd
.
read_excel
(
'
ISA_data/250505_Non_flyers_ASTRAL_mix_12_species.xlsx
'
)
#No flyer
...
...
@@ -102,25 +102,49 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
#Flyer
quantites_table
=
pd
.
read_csv
(
'
ISA_data/250505_mix_12_souches_lib_12_especes_conta_ASTRAL_BIOASTER_quantities.csv
'
)
df_filtered
=
df
[
~
(
pd
.
isna
(
df
[
'
Proteotypic ?
'
]))]
df_filtered
=
df_filtered
[
df_filtered
[
'
Coverage
'
]
>=
coverage_treshold
]
df_filtered
=
df_filtered
[
pd
.
isna
(
df_filtered
[
'
Miscleavage ?
'
])]
peptide_count
=
df_filtered
.
groupby
([
"
Protein.Names
"
]).
size
().
reset_index
(
name
=
'
counts
'
)
quantites_table_filtered
=
quantites_table
[
quantites_table
[
'
Modified.Sequence
'
].
isin
(
df_filtered
[
'
Stripped.Sequence
'
])]
filtered_sequence
=
peptide_count
[
peptide_count
[
'
counts
'
]
>=
min_peptide
][
"
Protein.Names
"
]
df_filtered
=
df_filtered
[
df_filtered
[
"
Protein.Names
"
].
isin
(
filtered_sequence
.
to_list
())]
df_filtered
=
pd
.
merge
(
quantites_table_filtered
,
df_filtered
,
how
=
'
inner
'
,
left_on
=
'
Modified.Sequence
'
,
right_on
=
'
Stripped.Sequence
'
)
df1_grouped
=
df_filtered
.
groupby
(
"
Protein.Names
"
)
dico_final
=
{}
# iterate over each group
for
group_name
,
df_group
in
df1_grouped
:
seq
=
df_group
.
sort_values
(
by
=
[
'
20250129_ISA_MIX-1_48SPD_001
'
])[
'
Stripped.Sequence
'
].
to_list
()
seq
=
df_group
.
sort_values
(
by
=
[
'
Fragment.Quant.Raw
'
])[
'
Stripped.Sequence
'
].
to_list
()
value_frag
=
df_group
.
sort_values
(
by
=
[
'
Fragment.Quant.Raw
'
])[
'
Fragment.Quant.Raw
'
].
to_list
()
value_prec
=
df_group
.
sort_values
(
by
=
[
'
Precursor.Quantity
'
])[
'
Precursor.Quantity
'
].
to_list
()
value_prec_frag
=
df_group
.
sort_values
(
by
=
[
'
Fragment.Quant.Raw
'
])[
'
Precursor.Quantity
'
].
to_list
()
value_maxlfq
=
df_group
.
sort_values
(
by
=
[
'
20250129_ISA_MIX-1_48SPD_001
'
])[
'
20250129_ISA_MIX-1_48SPD_001
'
].
to_list
()
value_maxlfq_frag
=
df_group
.
sort_values
(
by
=
[
'
20250129_ISA_MIX-1_48SPD_001
'
])[
'
20250129_ISA_MIX-1_48SPD_001
'
].
to_list
()
value_maxlfq_frag
=
df_group
.
sort_values
(
by
=
[
'
Fragment.Quant.Raw
'
])[
'
20250129_ISA_MIX-1_48SPD_001
'
].
to_list
()
threshold_weak_flyer_frag
=
value_frag
[
int
(
len
(
seq
)
/
3
)]
threshold_medium_flyer_frag
=
value_frag
[
int
(
2
*
len
(
seq
)
/
3
)]
threshold_weak_flyer_prec
=
value_prec
[
int
(
len
(
seq
)
/
3
)]
threshold_medium_flyer_prec
=
value_prec
[
int
(
2
*
len
(
seq
)
/
3
)]
threshold_weak_flyer_maxflq
=
value_maxlfq
[
int
(
len
(
seq
)
/
3
)]
threshold_medium_flyer_maxlfq
=
value_maxlfq
[
int
(
2
*
len
(
seq
)
/
3
)]
prot
=
df_group
[
'
Protein.Group
'
].
to_list
()[
0
]
for
i
in
range
(
len
(
seq
)):
if
value_frag
[
i
]
<
threshold_weak_flyer_frag
:
label_frag
=
1
elif
value_frag
[
i
]
<
threshold_medium_flyer_frag
:
label_frag
=
2
else
:
label_frag
=
3
if
value_prec_frag
[
i
]
<
threshold_weak_flyer_prec
:
label_prec
=
1
elif
value_prec_frag
[
i
]
<
threshold_medium_flyer_prec
:
label_prec
=
2
else
:
label_prec
=
3
if
value_maxlfq_frag
[
i
]
<
threshold_weak_flyer_maxflq
:
label_maxlfq
=
1
...
...
@@ -129,14 +153,14 @@ def build_dataset_astral(coverage_treshold = 20, min_peptide = 4, f_name='out_df
else
:
label_maxlfq
=
3
dico_final
[
seq
[
i
]]
=
(
prot
,
label_maxlfq
)
dico_final
[
seq
[
i
]]
=
(
prot
,
label_
frag
,
label_prec
,
label_
maxlfq
)
df_final
=
pd
.
DataFrame
.
from_dict
(
dico_final
,
orient
=
'
index
'
,
columns
=
[
'
Proteins
'
,
'
Classes MaxLFQ
'
])
df_final
=
pd
.
DataFrame
.
from_dict
(
dico_final
,
orient
=
'
index
'
,
columns
=
[
'
Proteins
'
,
'
Classes
fragment
'
,
'
Classes precursor
'
,
'
Classes
MaxLFQ
'
])
df_final
[
'
Sequences
'
]
=
df_final
.
index
df_final
=
df_final
.
reset_index
()
df_final
=
df_final
[[
'
Sequences
'
,
'
Proteins
'
,
'
Classes MaxLFQ
'
]]
df_final
.
to_csv
(
'
ISA_data/df_flyer_no_miscleavage_astral_
15
.csv
'
,
index
=
False
)
df_non_flyer
.
to_csv
(
'
ISA_data/df_non_flyer_no_miscleavage_astral.csv
'
,
index
=
False
)
df_final
=
df_final
[[
'
Sequences
'
,
'
Proteins
'
,
'
Classes fragment
'
,
'
Classes precursor
'
,
'
Classes MaxLFQ
'
]]
df_final
.
to_csv
(
'
ISA_data/
datasets/
df_flyer_no_miscleavage_astral_
4
.csv
'
,
index
=
False
)
df_non_flyer
.
to_csv
(
'
ISA_data/
datasets/
df_non_flyer_no_miscleavage_astral.csv
'
,
index
=
False
)
def
build_regression_dataset_astral
(
coverage_treshold
=
20
,
min_peptide
=
4
,
f_name
=
'
out_df.csv
'
):
...
...
@@ -244,10 +268,11 @@ def build_dataset_regression_zeno(coverage_treshold = 20, min_peptide = 4):
if
__name__
==
'
__main__
'
:
df_size
=
[]
for
min_pep
in
range
(
4
,
20
):
df
=
build_regression_dataset_astral
(
coverage_treshold
=
20
,
min_peptide
=
min_pep
)
df_size
.
append
(
df
.
shape
[
0
])
plt
.
clf
()
plt
.
bar
([
i
for
i
in
range
(
4
,
20
)],
df_size
)
plt
.
savefig
(
'
number_of_peptides_thr.png
'
)
# df_size=[]
# for min_pep in range(4,20):
# df = build_regression_dataset_astral(coverage_treshold=20, min_peptide=min_pep)
# df_size.append(df.shape[0])
# plt.clf()
# plt.bar([i for i in range(4,20)],df_size)
# plt.savefig('number_of_peptides_thr.png')
build_dataset_astral
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment