Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
pseudo_image
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Léo Schneider
pseudo_image
Commits
498b38c4
Commit
498b38c4
authored
1 week ago
by
Schneider Leo
Browse files
Options
Downloads
Patches
Plain Diff
clean : utils.py
parent
10edb8ba
No related branches found
No related tags found
Loading
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
image_ref/utils.py
+50
-135
50 additions, 135 deletions
image_ref/utils.py
with
50 additions
and
135 deletions
image_ref/utils.py
+
50
−
135
View file @
498b38c4
...
...
@@ -15,76 +15,6 @@ SPECIES = ['Citrobacter cronae', 'Citrobacter portucalensis', 'Enterobacter horm
'
Citrobacter koseri
'
,
'
Enterobacter cloacae
'
,
'
Hafnia alvei
'
,
'
Klebsiella pneumoniae
'
,
'
Providencia stuartii
'
]
ALPHABET_UNMOD
=
{
"
A
"
:
1
,
"
C
"
:
2
,
"
D
"
:
3
,
"
E
"
:
4
,
"
F
"
:
5
,
"
G
"
:
6
,
"
H
"
:
7
,
"
I
"
:
8
,
"
K
"
:
9
,
"
L
"
:
10
,
"
M
"
:
11
,
"
N
"
:
12
,
"
P
"
:
13
,
"
Q
"
:
14
,
"
R
"
:
15
,
"
S
"
:
16
,
"
T
"
:
17
,
"
V
"
:
18
,
"
W
"
:
19
,
"
Y
"
:
20
,
}
MASSES_MONO
=
{
"
A
"
:
71.03711
,
"
C
"
:
103.00919
,
"
D
"
:
115.02694
,
"
E
"
:
129.04259
,
"
F
"
:
147.06841
,
"
G
"
:
57.02146
,
"
H
"
:
137.05891
,
"
I
"
:
113.08406
,
"
K
"
:
128.09496
,
"
L
"
:
113.08406
,
"
M
"
:
131.04049
,
"
N
"
:
114.04293
,
"
P
"
:
97.05276
,
"
Q
"
:
128.05858
,
"
R
"
:
156.1875
,
"
S
"
:
87.03203
,
"
T
"
:
101.04768
,
"
V
"
:
99.06841
,
"
W
"
:
186.07931
,
"
Y
"
:
163.06333
,
}
MASSES_AVG
=
{
"
A
"
:
71.0788
,
"
C
"
:
103.1388
,
"
D
"
:
115.0886
,
"
E
"
:
129.1155
,
"
F
"
:
147.1766
,
"
G
"
:
57.0519
,
"
H
"
:
137.1411
,
"
I
"
:
113.1594
,
"
K
"
:
128.1741
,
"
L
"
:
113.1594
,
"
M
"
:
131.1926
,
"
N
"
:
114.1038
,
"
P
"
:
97.1167
,
"
Q
"
:
128.1307
,
"
R
"
:
156.1875
,
"
S
"
:
87.0782
,
"
T
"
:
101.1051
,
"
V
"
:
99.1326
,
"
W
"
:
186.2132
,
"
Y
"
:
163.1760
,
}
# trypsin cut after K or R (if not followed by P)
def
cut
(
seq
):
...
...
@@ -142,6 +72,9 @@ def build_database_ref_peptide():
for
line
in
f
:
if
line
!=
'
\n
'
:
if
'
>
'
in
line
:
#remove new line
line
=
line
.
replace
(
'
\n
'
,
''
)
# typo ??
line
=
line
.
replace
(
'
no_family
'
,
'
No_family
'
)
line
=
line
.
replace
(
'
no_order
'
,
'
No_order
'
)
...
...
@@ -157,66 +90,46 @@ def build_database_ref_peptide():
o
=
split_line
[
3
].
replace
(
'
_
'
,
'
'
)
else
:
seq
=
line
.
split
(
'
'
)[
1
]
#remove new lines
seq
=
seq
.
replace
(
'
\n
'
,
''
)
l
.
append
({
'
Sequence
'
:
seq
,
'
Protein code
'
:
prot
,
'
Error treshold
'
:
err
,
'
Prevalance
'
:
prev
,
'
Specie
'
:
spe
,
'
Genus
'
:
gen
,
'
Family
'
:
fam
,
'
Order
'
:
o
})
return
pd
.
DataFrame
(
l
)
def
compute_mass
(
seq
,
isotop
):
m
=
0
if
isotop
==
'
mono
'
:
for
char
in
MASSES_MONO
.
keys
():
m
+=
MASSES_MONO
[
char
]
*
seq
.
count
(
char
)
if
isotop
==
'
avg
'
:
for
char
in
MASSES_AVG
.
keys
():
m
+=
MASSES_AVG
[
char
]
*
seq
.
count
(
char
)
return
m
df
=
pd
.
DataFrame
(
l
)
return
df
.
drop_duplicates
()
def
build_ref_image
(
path_fasta
,
possible_charge
,
ms1_end_mz
,
ms1_start_mz
,
bin_mz
,
max_cycle
,
rt_pred
):
# build peptide list
list_seq_1
=
[]
for
record
in
fastapy
.
parse
(
path_fasta
):
list_seq_1
.
append
(
record
.
seq
)
list_peptides
=
[]
for
prot
in
list_seq_1
:
list_peptides
.
extend
(
digest
(
prot
))
# compute m/z ration
mz_ratio
=
{}
i
=
0
list_peptides
=
list
(
set
(
list_peptides
))
for
seq
in
list_peptides
:
mz_ratio
[
'
seq
'
]
=
[]
for
charge
in
possible_charge
:
ratio
=
compute_mass
(
seq
,
'
avg
'
)
/
charge
if
ms1_end_mz
>
ratio
>
ms1_start_mz
:
mz_ratio
[
'
seq
'
].
append
(
ratio
)
i
+=
1
print
(
i
)
# assocy predict rt
data
=
[]
def
compute_common_peptide
(
df_path
,
species
):
df
=
pd
.
read_csv
(
df_path
)
df_filter
=
df
[
df
[
'
Specie
'
].
isin
(
species
)]
df_filter
=
df_filter
[[
'
Sequence
'
,
'
Specie
'
]].
drop_duplicates
()
df_filter
[
'
Count
'
]
=
df_filter
.
groupby
(
'
Sequence
'
)[
'
Sequence
'
].
transform
(
'
count
'
)
df_count
=
df_filter
[[
'
Sequence
'
,
'
Count
'
]].
drop_duplicates
()
return
df_count
# predict detectability (optional)
def
build_ref_image_from_diann
(
path_parqet
,
ms1_end_mz
,
ms1_start_mz
,
bin_mz
,
max_cycle
,
min_rt
=
None
,
max_rt
=
None
):
df
=
load_lib
(
path_parqet
)
df
=
df
[[
'
Stripped.Sequence
'
,
'
Precursor.Charge
'
,
'
RT
'
,
'
Precursor.Mz
'
]]
df_unique
=
df
.
drop_duplicates
()
# build image
total_ms1_mz
=
ms1_end_mz
-
ms1_start_mz
n_bin_ms1
=
int
(
total_ms1_mz
//
bin_mz
)
im
=
np
.
zeros
([
max_cycle
,
n_bin_ms1
])
max_rt
=
np
.
max
(
rt_pred
)
min_rt
=
np
.
min
(
rt_pred
)
total_rt
=
max_rt
-
min_rt
for
(
rt
,
mz_ratio
)
in
data
:
im
[
int
((
rt
-
min_rt
/
total_rt
)
*
max_cycle
),
int
(((
mz_ratio
-
ms1_start_mz
)
/
total_ms1_mz
)
*
n_bin_ms1
)]
=
1
if
max_rt
is
None
:
max_rt
=
np
.
max
(
df_unique
[
'
RT
'
])
if
min_rt
is
None
:
min_rt
=
np
.
min
(
df_unique
[
'
RT
'
])
total_rt
=
max_rt
-
min_rt
+
1e-3
for
row
in
df_unique
.
iterrows
():
if
900
>
int
(((
row
[
1
][
'
Precursor.Mz
'
]
-
ms1_start_mz
)
/
total_ms1_mz
)
*
n_bin_ms1
)
>=
0
:
im
[
int
((
row
[
1
][
'
RT
'
]
-
min_rt
)
/
total_rt
*
max_cycle
),
int
(
((
row
[
1
][
'
Precursor.Mz
'
]
-
ms1_start_mz
)
/
total_ms1_mz
)
*
n_bin_ms1
)]
=
1
return
im
'
Klebsiella michiganensis
'
,
def
build_ref_image_from_diann
(
path_parqet
,
ms1_end_mz
,
ms1_start_mz
,
bin_mz
,
max_cycle
,
min_rt
=
None
,
max_rt
=
None
):
def
build_ref_image_from_diann_global
(
path_parqet
,
target_seq
,
ms1_end_mz
,
ms1_start_mz
,
bin_mz
,
max_cycle
,
min_rt
=
None
,
max_rt
=
None
):
df
=
load_lib
(
path_parqet
)
df
=
df
[[
'
Stripped.Sequence
'
,
'
Precursor.Charge
'
,
'
RT
'
,
'
Precursor.Mz
'
]]
df
=
df
[
df
[
'
Stripped.Sequence
'
].
isin
(
target_seq
)]
df_unique
=
df
.
drop_duplicates
()
# build image
total_ms1_mz
=
ms1_end_mz
-
ms1_start_mz
...
...
@@ -236,29 +149,31 @@ def build_ref_image_from_diann(path_parqet, ms1_end_mz, ms1_start_mz, bin_mz, ma
if
__name__
==
'
__main__
'
:
df
=
build_database_ref_peptide
()
#
Write fasta file
#
for spe in SPECIES:
#
#
df_spe = df[df['Specie']==spe]
#
spe_list = df_spe['Sequence'].to_list()
#
with open('fasta/optimal peptide set/'+spe+'.fasta',"w") as f:
#
df = build_database_ref_peptide()
#
df.to_csv("dataset_species_ref_peptides.csv", index=False)
#
#Write fasta file
#
with open('fasta/optimal peptide set/gobal_peptide_set.fasta', "w") as f:
#
for spe in SPECIES:
#
df_spe = df[df['Specie']==spe]
#
spe_list = df_spe['Sequence'].drop_duplicate().to_list()
# if not spe_list:
# print('Empty : ',spe)
# for pep in spe_list :
# f.write(pep)
df_count
=
compute_common_peptide
(
"
dataset_species_ref_peptides.csv
"
,
SPECIES
)
#
#Create ref img
df_full
=
load_lib
(
'
fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet
'
)
min_rt
=
df_full
[
'
RT
'
].
min
()
max_rt
=
df_full
[
'
RT
'
].
max
()
for
spe
in
SPECIES
:
im
=
build_ref_image_from_diann
(
'
fasta/optimal peptide set/
'
+
spe
+
'
.parquet
'
,
ms1_end_mz
=
1250
,
ms1_start_mz
=
350
,
bin_mz
=
1
,
max_cycle
=
663
,
min_rt
=
min_rt
,
max_rt
=
max_rt
)
plt
.
clf
()
mpimg
.
imsave
(
'
img_ref/
'
+
spe
+
'
.png
'
,
im
)
np
.
save
(
spe
+
'
.npy
'
,
im
)
#
df_full = load_lib(
#
'fasta/full proteom/steigerwaltii variants/uniparc_proteome_UP000033376_2025_03_14.predicted.parquet')
#
min_rt = df_full['RT'].min()
#
max_rt = df_full['RT'].max()
#
#
for spe in SPECIES:
#
im = build_ref_image_from_diann(
#
'fasta/optimal peptide set/' + spe + '.parquet', ms1_end_mz=1250,
#
ms1_start_mz=350, bin_mz=1, max_cycle=663, min_rt=min_rt, max_rt=max_rt)
#
plt.clf()
#
mpimg.imsave('img_ref/' + spe + '.png', im)
#
np.save(spe + '.npy', im)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment