Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Mosaicatcher Update
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Thomas Weber
Mosaicatcher Update
Commits
1e901eff
Commit
1e901eff
authored
7 years ago
by
Sascha Meiers
Browse files
Options
Downloads
Patches
Plain Diff
Evaluation script that makes a few plots
parent
8118a911
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Snakefile
+23
-1
23 additions, 1 deletion
Snakefile
utils/evaluate_simulation.R
+141
-0
141 additions, 0 deletions
utils/evaluate_simulation.R
with
164 additions
and
1 deletion
Snakefile
+
23
−
1
View file @
1e901eff
...
...
@@ -34,11 +34,33 @@ rule all:
window = [50000, 100000], chrom = config['chromosomes'][0]) # Specifically run this only for one chrom because it is super slow
################################################################################
# Simulation of count data #
################################################################################
rule simul:
input:
expand("sv_calls/{sample}-{window}/{window}_fixed.{segments}.SV_probs.chr1.pdf",
sample = ["simulation7","simulation8","simulation9"],
window = [50000, 100000],
segments = ["few","medium","many"]),
expand("plots/{sample}-{window}/{window}_fixed.pdf",
sample = ["simulation7","simulation8","simulation9"],
window = [50000, 100000]),
expand("evaluation/simulation{seed}_{window}.{segments}.pdf",
seed = [7,8,9],
window = [50000, 100000],
segments = ["few","medium","many"])
rule evaluate_simulation:
input:
prob = "sv_probabilities/simulation{seed}-{window}/{window}_fixed.{bpdens}/probabilities.txt",
simul = "simulation/variants/genome{seed}-{window}.txt"
output:
"evaluation/simulation{seed}_{window}.{bpdens}.pdf"
script:
"utils/evaluate_simulation.R"
rule simulate_genome:
output:
tsv="simulation/genome/genome{seed}.tsv"
...
...
This diff is collapsed.
Click to expand it.
utils/evaluate_simulation.R
0 → 100644
+
141
−
0
View file @
1e901eff
library
(
data.table
)
library
(
assertthat
)
library
(
ggplot2
)
library
(
cowplot
)
MIN_LLR
=
0.5
# min. log likelihood ratio of SV prob. over REF prob.
MIN_RCO
=
0.8
# min. reciprocal overlap of true and detected SV
sv_input
=
"sv_probabilities/simulation7-100000/100000_fixed.few/probabilities.txt"
real_input
=
"simulation/variants/genome7-100000.txt"
sv_input
=
snakemake
@
input
[[
"prob"
]]
real_input
=
snakemake
@
input
[[
"simul"
]]
svs
=
fread
(
sv_input
)
real
=
fread
(
real_input
)
### call SVs from SV probabilities
# rename p_ref column to be distinguished from SV classes
svs
[,
`:=`
(
ref_prob
=
p_ref
,
p_ref
=
NULL
)
]
# melt to one entry per SV class
svs
=
melt.data.table
(
svs
,
measure.vars
=
colnames
(
svs
)[
grepl
(
'^p_'
,
colnames
(
svs
))],
variable.name
=
"SV_class"
,
value.name
=
"SV_prob"
,
variable.factor
=
FALSE
)
# cast again to one entry per locus and cell, now with
setkey
(
svs
,
chrom
,
start
,
end
,
sample
,
cell
)
svs
=
svs
[,
.
(
SV_class
=
SV_class
[
which.max
(
SV_prob
)],
SV_llr
=
log
(
max
(
SV_prob
))
-
log
(
ref_prob
[
1
])),
by
=
.
(
chrom
,
start
,
end
,
sample
,
cell
)]
### Find reciprocal overlap of SV calls and true SVs
# list of simulated loci (only chrom, start, end)
LOCI
=
real
[,
.
(
VAF
=
.N
),
by
=
.
(
chrom
,
start
,
end
,
SV_type
)]
# list of predicted loci (chrom, start, end) --> only use real SV calls here (MIN_LLR criterion) !
called_loci
=
unique
(
svs
[
SV_llr
>=
MIN_LLR
,
.
(
chrom
,
start
,
end
)])
# Merge all combinations of loci ...
combined
=
merge
(
LOCI
[,
.
(
chrom
,
start
,
end
)],
called_loci
,
by
=
"chrom"
,
all
=
T
,
suffixes
=
c
(
""
,
".call"
),
allow.cartesian
=
T
)
# ... to then filter those that have a reciprocal overlap of at least MIN_RCO
combined
=
combined
[
!
is.na
(
start
)
&
end.call
>
start
&
start.call
<
end
&
(
pmin
(
end
,
end.call
)
-
pmax
(
start
,
start.call
))
/
(
pmax
(
end
,
end.call
)
-
pmin
(
start
,
start.call
))
>=
MIN_RCO
]
# annotate simulated loci with matched real loci, if they exist.
LOCI
=
merge
(
LOCI
,
combined
,
by
=
c
(
"chrom"
,
"start"
,
"end"
),
all.x
=
T
)
LOCI
[,
id
:=
1
:
.N
]
assert_that
(
all
(
LOCI
[,
.
(
chrom
,
start
,
end
)]
==
unique
(
real
[,
.
(
chrom
,
start
,
end
)])))
# Rename called SV classes
unique
(
svs
$
SV_class
)
svs
[
SV_class
==
"p_inv_hom"
,
SV_class
:=
"hom_inv"
]
svs
[
SV_class
==
"p_inv_h1"
,
SV_class
:=
"het_inv"
]
svs
[
SV_class
==
"p_inv_h2"
,
SV_class
:=
"het_inv"
]
svs
[
SV_class
==
"p_del_hom"
,
SV_class
:=
"hom_del"
]
svs
[
SV_class
==
"p_del_h1"
,
SV_class
:=
"het_del"
]
svs
[
SV_class
==
"p_del_h2"
,
SV_class
:=
"het_del"
]
svs
[
SV_class
==
"p_dup_hom"
,
SV_class
:=
"hom_dup"
]
svs
[
SV_class
==
"p_dup_h1"
,
SV_class
:=
"het_dup"
]
svs
[
SV_class
==
"p_dup_h2"
,
SV_class
:=
"het_dup"
]
### Determine sensitivity (recall)
# annotate "real" and "svs" with locus ids for easier overlap
real
=
merge
(
real
,
LOCI
[,
.
(
chrom
,
start
,
end
,
id
)],
by
=
c
(
"chrom"
,
"start"
,
"end"
),
all
=
T
)
svs
=
merge
(
svs
,
LOCI
[,
.
(
chrom
,
start
=
start.call
,
end
=
end.call
,
id
)],
by
=
c
(
"chrom"
,
"start"
,
"end"
),
all.x
=
T
)
# recall = svs correctly detected / svs simulated
# precision = svs correctly detected / all detected svs
#
recall
=
merge
(
real
[,
.
(
id
,
sample
,
cell
,
simulated
=
SV_type
)],
svs
[
!
is.na
(
id
)
&
SV_llr
>=
MIN_LLR
,
.
(
id
,
sample
,
cell
,
called
=
SV_class
)],
by
=
c
(
"id"
,
"sample"
,
"cell"
),
all.x
=
T
)
recall
=
recall
[,
.
(
recall.correct
=
sum
(
simulated
[
!
is.na
(
called
)]
==
called
[
!
is.na
(
called
)]),
recall.any
=
sum
(
!
is.na
(
called
)),
recall.total
=
.N
),
by
=
.
(
id
)]
LOCI
=
merge
(
LOCI
,
recall
,
by
=
"id"
)
prec
=
svs
[
SV_llr
>=
MIN_LLR
,
.N
,
by
=
.
(
chrom
,
start
,
end
,
id
,
SV_class
)]
# Plot: number of breakpoints that were detected
plt_obj
=
LOCI
[,
.
(
size
=
end
-
start
,
VAF
,
SV_type
,
detected
=
!
is.na
(
start.call
))]
plt1
<-
ggplot
(
plt_obj
)
+
geom_point
(
size
=
2
,
aes
(
size
,
VAF
,
col
=
detected
))
+
xlab
(
"SV size"
)
+
scale_color_discrete
(
name
=
paste
(
"based on"
,
MIN_RCO
*
100
,
"% rec. ovl"
))
+
theme
(
legend.position
=
"bottom"
)
+
ggtitle
(
paste0
(
"Were the breakpoints detected? "
,
nrow
(
plt_obj
[
detected
==
TRUE
]),
"/"
,
nrow
(
plt_obj
)))
# Plot: Recall on detected loci
plt_obj
=
LOCI
[
!
is.na
(
start.call
),
.
(
size
=
end
-
start
,
VAF
,
SV_type
,
n
=
recall.total
,
k
=
recall.correct
)]
plt2
=
ggplot
(
plt_obj
)
+
geom_point
(
aes
(
size
,
VAF
,
col
=
k
/
n
))
+
scale_color_gradientn
(
name
=
"Recall (correct SV type)"
,
colours
=
c
(
"blue"
,
"orange"
,
"green"
))
+
facet_wrap
(
~
SV_type
,
nrow
=
1
)
+
theme
(
legend.position
=
"bottom"
,
legend.key.width
=
unit
(
2
,
"cm"
))
+
xlab
(
"SV size"
)
+
ggtitle
(
"Recall"
)
plt2
plt_obj
=
prec
[
is.na
(
id
),
.N
,
by
=
.
(
chrom
,
start
,
end
)]
plt_obj
[,
size
:=
factor
((
end
-
start
)
<
1e6
,
c
(
T
,
F
),
c
(
"< 1Mb"
,
">= 1Mb"
))]
plt3
=
ggplot
(
plt_obj
)
+
geom_histogram
(
aes
(
N
),
binwidth
=
1
)
+
facet_wrap
(
~
size
)
+
ggtitle
(
paste0
(
"SV calls outside simulated region ("
,
sum
(
plt_obj
$
N
),
")"
))
+
xlab
(
"Num. calls at locus"
)
plt3
plt_obj
=
prec
[
is.na
(
id
)]
plt4
=
ggplot
(
plt_obj
)
+
geom_bar
(
aes
(
SV_class
))
+
ggtitle
(
paste0
(
"Wrongly called SV classes"
))
plt4
final
=
ggdraw
()
+
draw_label
(
sv_input
,
x
=
0.5
,
y
=
.97
)
+
draw_plot
(
plt1
,
0
,
0.4
,
0.33
,
0.4
)
+
draw_plot
(
plt2
,
0
,
0
,
1
,
0.4
)
+
draw_plot
(
plt3
,
0.33
,
0.4
,
0.33
,
0.4
)
+
draw_plot
(
plt4
,
0.66
,
0.4
,
0.33
,
0.4
)
final
ggsave
(
snakemake
@
output
[[
1
]],
final
,
width
=
29.7
/
1.5
,
height
=
21
/
1.5
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment