Léo Schneider / pseudo_image / Commits / c83febd6

Commit c83febd6 authored 2 months ago by Schneider Leo

fix : optimal peptide set extraction

parent dc91f876
Showing 1 changed file: image_ref/utils.py (+72 additions, -54 deletions)
@@ -7,6 +7,14 @@ from analyse_diann_digestion import load_lib
import matplotlib.image as mpimg
import re

+SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter hormaechei', 'Klebsiella aerogenes',
+           'Klebsiella variicola', 'Raoultella ornithinolytica', 'Citrobacter farmeri', 'Enterobacter asburiae',
+           'Enterobacter kobei', 'Klebsiella michiganensis', 'Morganella morganii', 'Salmonella enterica',
+           'Citrobacter amalonaticus', 'Citrobacter freundii', 'Enterobacter chengduensis', 'Escherichia coli',
+           'Klebsiella oxytoca', 'Proteus mirabilis', 'Serratia marcescens', 'Citrobacter braakii',
+           'Citrobacter koseri', 'Enterobacter cloacae', 'Hafnia alvei', 'Klebsiella pneumoniae',
+           'Providencia stuartii']
+
ALPHABET_UNMOD = {
    "A": 1,
    "C": 2,
@@ -30,7 +38,6 @@ ALPHABET_UNMOD = {
    "Y": 20,
}

MASSES_MONO = {
    "A": 71.03711,
    "C": 103.00919,
@@ -54,7 +61,6 @@ MASSES_MONO = {
    "Y": 163.06333,
}

MASSES_AVG = {
    "A": 71.0788,
    "C": 103.1388,
@@ -78,6 +84,7 @@ MASSES_AVG = {
    "Y": 163.1760,
}

# trypsin cut after K or R (if not followed by P)
def cut(seq):
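The body of cut() falls outside this hunk. A minimal sketch of the rule stated in the comment (cleave after K or R unless the next residue is P), for illustration only and not necessarily the committed implementation:

def cut_sketch(seq):
    # collect the indices just after each K/R that is not followed by P
    ind = []
    for i, aa in enumerate(seq):
        if aa in ('K', 'R') and (i + 1 == len(seq) or seq[i + 1] != 'P'):
            ind.append(i + 1)
    return ind

# e.g. cut_sketch('MKAVRPLSTR') -> [2, 10]
# (no cut after the R at position 4 because the next residue is P)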
@@ -102,12 +109,14 @@ def cut_with_ind(seq, ind_list):
    return l

def digest(seq):
    ind = cut(seq)
    return cut_with_ind(seq, ind)

def fasta_similarity(path_fasta_1, path_fasta_2):
    list_seq_1 = []
    list_seq_2 = []
    for record in fastapy.parse(path_fasta_1):
        list_seq_1.extend(digest(record.seq))
@@ -121,18 +130,20 @@ def fasta_similarity(path_fasta_1, path_fasta_2):
    plt.show()
    plt.savefig('fasta_similarity.png')

def split_string(input_string):
    # Use regular expression to split the string at underscore followed by uppercase letter
-    return re.split(r'_(?=[A-Zc])', input_string)
+    return re.split(r'_(?=[A-Z])|c(?=[A-Z])', input_string)

def build_database_ref_peptide():
    l = []
    with open('../data/label_raw/250107_FASTA_RP_GroEL_GroES_Tuf_5pct_assemble_peptides_list.txt', 'r') as f:
        for line in f:
            if line != '\n':
                if '>' in line:
                    # typo ??
                    line = line.replace('no_family', 'No_family')
                    line = line.replace('no_order', 'No_order')
                    split_line = line.split('_')
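The split_string() change is the header-parsing part of the fix. Assuming a header field of the form Species_Genus_Family_Order joined by underscores (the exact layout of the peptide-list file is not shown in this diff), the two patterns differ when a species epithet starts with a lowercase 'c':

import re

header_field = 'Citrobacter_cronae_Citrobacter_Enterobacteriaceae_Enterobacterales'  # hypothetical example

# old pattern also splits at '_' before a lowercase 'c', breaking the species name apart
re.split(r'_(?=[A-Zc])', header_field)
# -> ['Citrobacter', 'cronae', 'Citrobacter', 'Enterobacteriaceae', 'Enterobacterales']

# new pattern only splits at '_' before an uppercase letter, keeping 'Citrobacter_cronae' together;
# the added alternative c(?=[A-Z]) would also split on (and drop) a 'c' glued directly to a capital
# letter, which does not fire on this example
re.split(r'_(?=[A-Z])|c(?=[A-Z])', header_field)
# -> ['Citrobacter_cronae', 'Citrobacter', 'Enterobacteriaceae', 'Enterobacterales']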
@@ -140,16 +151,17 @@ def build_database_ref_peptide():
                    err = split_line[1]
                    prev = split_line[2]
                    split_line = split_string(line.split(' ')[1])
                    spe = split_line[0].replace('_', ' ')
                    gen = split_line[1].replace('_', ' ')
                    fam = split_line[2].replace('_', ' ')
                    o = split_line[3].replace('_', ' ')
                else:
                    seq = line.split(' ')[1]
                    l.append({'Sequence': seq, 'Protein code': prot, 'Error treshold': err, 'Prevalance': prev,
                              'Specie': spe, 'Genus': gen, 'Family': fam, 'Order': o})
    return pd.DataFrame(l)

def compute_mass(seq, isotop):
    m = 0
    if isotop == 'mono':
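Only the first and last lines of compute_mass() are visible across these hunks. A sketch consistent with the visible context line m += MASSES_AVG[char] * seq.count(char), shown for illustration only (whether a terminal water mass is added is not visible here):

def compute_mass_sketch(seq, isotop):
    # per-residue sum over the amino acids present in the sequence
    masses = MASSES_MONO if isotop == 'mono' else MASSES_AVG
    m = 0
    for char in set(seq):
        m += masses[char] * seq.count(char)
    return m

# e.g. compute_mass_sketch('AC', 'mono') -> 71.03711 + 103.00919 = 174.0463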
@@ -160,55 +172,53 @@ def compute_mass(seq, isotop):
            m += MASSES_AVG[char] * seq.count(char)
    return m

def build_ref_image(path_fasta, possible_charge, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, rt_pred):
    # build peptide list
    list_seq_1 = []
    for record in fastapy.parse(path_fasta):
        list_seq_1.append(record.seq)
    list_peptides = []
    for prot in list_seq_1:
        list_peptides.extend(digest(prot))

    # compute m/z ration
    mz_ratio = {}
    i = 0
    list_peptides = list(set(list_peptides))
    for seq in list_peptides:
        mz_ratio['seq'] = []
        for charge in possible_charge:
            ratio = compute_mass(seq, 'avg') / charge
            if ms1_end_mz > ratio > ms1_start_mz:
                mz_ratio['seq'].append(ratio)
        i += 1
        print(i)

    # assocy predict rt
    data = []

    # predict detectability (optional)

    # build image
    total_ms1_mz = ms1_end_mz - ms1_start_mz
    n_bin_ms1 = int(total_ms1_mz // bin_mz)
    im = np.zeros([max_cycle, n_bin_ms1])
    max_rt = np.max(rt_pred)
    min_rt = np.min(rt_pred)
    total_rt = max_rt - min_rt
    for (rt, mz_ratio) in data:
        im[int((rt - min_rt / total_rt) * max_cycle), int(((mz_ratio - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
    return im

def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, max_cycle, min_rt=None, max_rt=None):
    df = load_lib(path_parqet)
    df = df[['Stripped.Sequence', 'Precursor.Charge', 'RT', 'Precursor.Mz']]
    df_unique = df.drop_duplicates()

    # build image
    total_ms1_mz = ms1_end_mz - ms1_start_mz
    n_bin_ms1 = int(total_ms1_mz // bin_mz)
    im = np.zeros([max_cycle, n_bin_ms1])
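build_ref_image_from_diann() maps each precursor to a single pixel: the retention-time fraction picks the row (cycle) and the m/z fraction picks the column (bin), and the hardcoded 0 <= bin < 900 guard in the next hunk drops precursors outside the m/z window. A worked example with the parameters used in __main__ below (ms1_start_mz=350, ms1_end_mz=1250, bin_mz=1, max_cycle=663) and hypothetical values RT=30, min_rt=10, max_rt=70, Precursor.Mz=800.25:

total_ms1_mz = 1250 - 350              # 900
n_bin_ms1 = int(900 // 1)              # 900 columns
total_rt = 70 - 10 + 1e-3              # 60.001

row = int((30 - 10) / total_rt * 663)      # int(220.99...) = 220
col = int(((800.25 - 350) / 900) * 900)    # int(450.25)    = 450
# the precursor sets im[220, 450] = 1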
@@ -216,30 +226,38 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
        max_rt = np.max(df_unique['RT'])
    if min_rt is None:
        min_rt = np.min(df_unique['RT'])
    total_rt = max_rt - min_rt + 1e-3
    for row in df_unique.iterrows():
        if 900 > int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1) >= 0:
            im[int((row[1]['RT'] - min_rt) / total_rt * max_cycle), int(((row[1]['Precursor.Mz'] - ms1_start_mz) / total_ms1_mz) * n_bin_ms1)] = 1
    return im


if __name__ == '__main__':
    # fasta_similarity('fasta/uniparc_proteome_UP000033376_2025_03_14.fasta','fasta/uniparc_proteome_UP000033499_2025_03_14.fasta')
    # im = build_ref_image_from_diann('fasta/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet', ms1_end_mz=1250, ms1_start_mz=350, bin_mz=1, max_cycle=663, rt_pred=[])
    # plt.clf()
    # mpimg.imsave('test_img.png', im)
    df = build_database_ref_peptide()
-    df_full = load_lib('fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
-    min_rt = df_full['RT'].min()
-    max_rt = df_full['RT'].max()
-    for spe in ['Proteus mirabilis', 'Klebsiella pneumoniae', 'Klebsiella oxytoca', 'Enterobacter hormaechei', 'Citrobacter freundii']:
-        im = build_ref_image_from_diann(
-            'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
-            ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
-        plt.clf()
-        mpimg.imsave(spe + '.png', im)
-        np.save(spe + '.npy', im)
+    for spe in SPECIES:
+        df_spe = df[df['Specie'] == spe]
+        spe_list = df_spe['Sequence'].to_list()
+        with open('fasta/optimal peptide set/' + spe + '.fasta', "w") as f:
+            if not spe_list:
+                print('Empty : ', spe)
+            for pep in spe_list:
+                f.write(pep)
+    #
+    # df_full = load_lib(
+    #     'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
+    # min_rt = df_full['RT'].min()
+    # max_rt = df_full['RT'].max()
+    #
+    # for spe in SPECIES:
+    #     im = build_ref_image_from_diann(
+    #         'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
+    #         ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
+    #     plt.clf()
+    #     mpimg.imsave('img_ref/' + spe + '.png', im)
+    #     np.save(spe + '.npy', im)