Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
DIA augmentation
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Léo Schneider
DIA augmentation
Commits
2dbfe726
Commit
2dbfe726
authored
5 months ago
by
Schneider Leo
Browse files
Options
Downloads
Patches
Plain Diff
data viz
parent
73ce0ee0
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
data/data_exploration.py
+27
-2
27 additions, 2 deletions
data/data_exploration.py
data/data_viz.py
+34
-14
34 additions, 14 deletions
data/data_viz.py
with
61 additions
and
16 deletions
data/data_exploration.py
+
27
−
2
View file @
2dbfe726
...
@@ -2,7 +2,7 @@ import numpy as np
...
@@ -2,7 +2,7 @@ import numpy as np
import
matplotlib.pyplot
as
plt
import
matplotlib.pyplot
as
plt
import
pandas
as
pd
import
pandas
as
pd
from
numpy.ma.core
import
shape
from
numpy.ma.core
import
shape
from
sklearn.metrics
import
r2_score
from
constant
import
ALPHABET_UNMOD
,
ALPHABET_UNMOD_REV
from
constant
import
ALPHABET_UNMOD
,
ALPHABET_UNMOD_REV
...
@@ -75,6 +75,13 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e
...
@@ -75,6 +75,13 @@ def retention_time_distribution(data, plot=False, save=False, f_name='fig/data_e
plt
.
clf
()
plt
.
clf
()
plt
.
close
()
plt
.
close
()
def
optimal_prediction_r2
(
df
,
seq_col
,
rt_col
):
df_group_1
=
df
.
groupby
([
seq_col
])[
rt_col
].
mean
().
to_frame
().
reset_index
()
df_group_1
[
'
mean rt
'
]
=
df_group_1
[
rt_col
]
df_group_1
=
df_group_1
[[
seq_col
,
'
mean rt
'
]]
df_merged
=
df_group_1
.
merge
(
df
,
how
=
'
inner
'
,
on
=
seq_col
)
return
r2_score
(
df_merged
[
'
mean rt
'
],
df_merged
[
rt_col
])
def
main
():
def
main
():
#data prosit
#data prosit
# df = pd.read_csv('data_prosit/data.csv')
# df = pd.read_csv('data_prosit/data.csv')
...
@@ -134,6 +141,24 @@ def main():
...
@@ -134,6 +141,24 @@ def main():
_
=
aa_distribution
(
df
[
'
seq
'
],
False
,
True
,
'
../fig/data_exploration/aa_distribution_ISA_prosit_outlier.png
'
)
_
=
aa_distribution
(
df
[
'
seq
'
],
False
,
True
,
'
../fig/data_exploration/aa_distribution_ISA_prosit_outlier.png
'
)
retention_time_distribution
(
df
[
'
true rt
'
],
False
,
True
,
'
../fig/data_exploration/retention_time_distribution_ISA_prosit_outlier.png
'
)
retention_time_distribution
(
df
[
'
true rt
'
],
False
,
True
,
'
../fig/data_exploration/retention_time_distribution_ISA_prosit_outlier.png
'
)
#compare variance of outliers vs others petides
seq_out_list
=
df
[
'
seq
'
].
to_list
()
df_prosit
=
pd
.
read_csv
(
'
data_prosit/data_noc.csv
'
)
if
__name__
==
'
__main__
'
:
if
__name__
==
'
__main__
'
:
main
()
df
=
pd
.
read_csv
(
'
data_PXD006109/plasma/data_prosit_outlier.csv
'
)
df
[
'
seq
'
]
=
df
[
'
seq
'
].
map
(
numerical_to_alphabetical_str
)
# compare variance of outliers vs others petides
seq_out_list
=
df
[
'
seq
'
].
to_list
()
df_prosit
=
pd
.
read_csv
(
'
data_prosit/data.csv
'
)
df_prosit
[
'
outlier
'
]
=
df_prosit
[
'
sequence
'
].
map
(
lambda
x
:
x
in
seq_out_list
)
df_prosit_outlier
=
df_prosit
[
df_prosit
[
'
outlier
'
]
==
True
]
df_agg_out
=
df_prosit_outlier
.
groupby
(
pd
.
Grouper
(
key
=
'
mod_sequence
'
))[
'
irt_scaled
'
].
agg
([
'
mean
'
,
'
median
'
,
'
var
'
]).
reset_index
()
df_agg_pro
=
df_prosit
.
groupby
(
pd
.
Grouper
(
key
=
'
mod_sequence
'
))[
'
irt_scaled
'
].
agg
([
'
mean
'
,
'
median
'
,
'
var
'
]).
reset_index
()
# main()
plt
.
hist
(
df_agg_out
[
'
var
'
])
plt
.
savefig
(
'
var_outlier.png
'
)
plt
.
hist
(
df_agg_pro
[
'
var
'
])
plt
.
savefig
(
'
var_pro.png
'
)
\ No newline at end of file
This diff is collapsed.
Click to expand it.
data/data_viz.py
+
34
−
14
View file @
2dbfe726
...
@@ -316,17 +316,37 @@ if __name__ == '__main__' :
...
@@ -316,17 +316,37 @@ if __name__ == '__main__' :
# dataframe = pd.read_csv('../archive_output/ISA/out_ISA_noc_prosit_0.csv')
# dataframe = pd.read_csv('../archive_output/ISA/out_ISA_noc_prosit_0.csv')
# df2 = filter_outlier_rt(dataframe)
# df2 = filter_outlier_rt(dataframe)
# df2.to_csv('../data/data_ISA/data_prosit_outlier.csv', index=False)
# df2.to_csv('../data/data_ISA/data_prosit_outlier.csv', index=False)
df
=
pd
.
read_csv
(
'
../data/data_PXD006109/data_prosit_outlier.csv
'
)
# df = pd.read_csv('../data/data_PXD006109/data_prosit_outlier.csv')
compute_peptide_properties
(
df
,
'
plasma_prosit_outlier
'
,
'
seq
'
,
'
num
'
)
# compute_peptide_properties(df, 'plasma_prosit_outlier', 'seq', 'num')
## r2_list = []
df
=
pd
.
read_csv
(
'
../data/data_ISA/data_prosit_outlier.csv
'
)
# for index in range(10):
compute_peptide_properties
(
df
,
'
ISA_prosit_outlier
'
,
'
seq
'
,
'
num
'
)
# dataframe = pd.read_csv( '../output/out_tranfert_prosit_isa_mox_'+str(index)+'.csv')
# r2_list.append(r2_score(dataframe['true rt'], dataframe['rt pred']))
df
=
pd
.
read_csv
(
'
../data/data_ISA/data_isa.csv
'
)
# r2_arr = np.array(r2_list)
compute_peptide_properties
(
df
,
'
ISA
'
,
'
sequence
'
)
# print(r2_arr.mean(),'+/-',r2_arr.std()) #0.979362058986253 +/- 0.004342685753968106
# df = pd.read_csv('../data/data_ISA/data_prosit_outlier.csv')
df
=
pd
.
read_csv
(
'
../data/data_prosit/data.csv
'
)
# compute_peptide_properties(df,'ISA_prosit_outlier','seq', 'num')
compute_peptide_properties
(
df
,
'
prosit
'
,
'
sequence
'
)
#
# df = pd.read_csv('../data/data_ISA/data_isa.csv')
df
=
pd
.
read_csv
(
'
../data/data_PXD006109/plasma/data_plasma.csv
'
)
# compute_peptide_properties(df,'ISA','sequence')
compute_peptide_properties
(
df
,
'
plasma
'
,
'
sequence
'
)
#
\ No newline at end of file
# df = pd.read_csv('../data/data_prosit/data.csv')
# compute_peptide_properties(df,'prosit','sequence')
#
# df = pd.read_csv('../data/data_PXD006109/plasma/data_plasma.csv')
# compute_peptide_properties(df,'plasma','sequence')
r2_list
=
[]
for
index
in
range
(
10
):
dataframe
=
pd
.
read_csv
(
'
../output/out_tranfert_prosit_ISA_
'
+
str
(
index
)
+
'
.csv
'
)
r2_list
.
append
(
r2_score
(
dataframe
[
'
true rt
'
],
dataframe
[
'
rt pred
'
]))
r2_arr
=
np
.
array
(
r2_list
)
print
(
r2_arr
.
mean
(),
'
+/-
'
,
r2_arr
.
std
())
#0.979362058986253 +/- 0.004342685753968106 et 0.9775968974701088 +/- 0.0022122290222140123 => moins bon que mélange
r2_list
=
[]
for
index
in
range
(
10
):
dataframe
=
pd
.
read_csv
(
'
../output/out_tranfert_prosit_plasma_
'
+
str
(
index
)
+
'
.csv
'
)
r2_list
.
append
(
r2_score
(
dataframe
[
'
true rt
'
],
dataframe
[
'
rt pred
'
]))
r2_arr
=
np
.
array
(
r2_list
)
print
(
r2_arr
.
mean
(),
'
+/-
'
,
r2_arr
.
std
())
#0.977876349023085 +/- 0.0040982548977069695 et 0.982352390283812 +/- 0.00036 => equivalent au mélange
pass
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment