Skip to content
Snippets Groups Projects
Commit 1e901eff authored by Sascha Meiers's avatar Sascha Meiers
Browse files

Evaluation script that makes a few plots

parent 8118a911
No related branches found
No related tags found
No related merge requests found
......@@ -34,11 +34,33 @@ rule all:
window = [50000, 100000], chrom = config['chromosomes'][0]) # Specifically run this only for one chrom because it is super slow
################################################################################
# Simulation of count data #
################################################################################
rule simul:
input:
expand("sv_calls/{sample}-{window}/{window}_fixed.{segments}.SV_probs.chr1.pdf",
sample = ["simulation7","simulation8","simulation9"],
window = [50000, 100000],
segments = ["few","medium","many"]),
expand("plots/{sample}-{window}/{window}_fixed.pdf",
sample = ["simulation7","simulation8","simulation9"],
window = [50000, 100000]),
expand("evaluation/simulation{seed}_{window}.{segments}.pdf",
seed = [7,8,9],
window = [50000, 100000],
segments = ["few","medium","many"])
rule evaluate_simulation:
input:
prob = "sv_probabilities/simulation{seed}-{window}/{window}_fixed.{bpdens}/probabilities.txt",
simul = "simulation/variants/genome{seed}-{window}.txt"
output:
"evaluation/simulation{seed}_{window}.{bpdens}.pdf"
script:
"utils/evaluate_simulation.R"
rule simulate_genome:
output:
tsv="simulation/genome/genome{seed}.tsv"
......
library(data.table)
library(assertthat)
library(ggplot2)
library(cowplot)
MIN_LLR = 0.5 # min. log likelihood ratio of SV prob. over REF prob.
MIN_RCO = 0.8 # min. reciprocal overlap of true and detected SV
sv_input = "sv_probabilities/simulation7-100000/100000_fixed.few/probabilities.txt"
real_input = "simulation/variants/genome7-100000.txt"
sv_input = snakemake@input[["prob"]]
real_input = snakemake@input[["simul"]]
svs = fread(sv_input)
real = fread(real_input)
### call SVs from SV probabilities
# rename p_ref column to be distinguished from SV classes
svs[, `:=`(ref_prob = p_ref, p_ref = NULL) ]
# melt to one entry per SV class
svs = melt.data.table(svs, measure.vars = colnames(svs)[grepl('^p_', colnames(svs))], variable.name = "SV_class", value.name = "SV_prob", variable.factor = FALSE)
# cast again to one entry per locus and cell, now with
setkey(svs, chrom, start, end, sample, cell)
svs = svs[, .(SV_class = SV_class[which.max(SV_prob)],
SV_llr = log(max(SV_prob)) - log(ref_prob[1])),
by = .(chrom, start, end, sample, cell)]
### Find reciprocal overlap of SV calls and true SVs
# list of simulated loci (only chrom, start, end)
LOCI = real[, .(VAF = .N), by = .(chrom, start, end, SV_type)]
# list of predicted loci (chrom, start, end) --> only use real SV calls here (MIN_LLR criterion) !
called_loci = unique(svs[SV_llr >= MIN_LLR, .(chrom, start, end)])
# Merge all combinations of loci ...
combined = merge(LOCI[, .(chrom, start, end)], called_loci, by = "chrom", all =T, suffixes = c("",".call"), allow.cartesian = T)
# ... to then filter those that have a reciprocal overlap of at least MIN_RCO
combined = combined[!is.na(start) & end.call > start & start.call < end & (pmin(end, end.call) - pmax(start, start.call)) / (pmax(end, end.call) - pmin(start, start.call)) >= MIN_RCO]
# annotate simulated loci with matched real loci, if they exist.
LOCI = merge(LOCI, combined, by = c("chrom","start","end"), all.x = T)
LOCI[, id := 1:.N]
assert_that(all(LOCI[,.(chrom, start, end)] == unique(real[, .(chrom, start, end)])))
# Rename called SV classes
unique(svs$SV_class)
svs[SV_class == "p_inv_hom", SV_class := "hom_inv"]
svs[SV_class == "p_inv_h1", SV_class := "het_inv"]
svs[SV_class == "p_inv_h2", SV_class := "het_inv"]
svs[SV_class == "p_del_hom", SV_class := "hom_del"]
svs[SV_class == "p_del_h1", SV_class := "het_del"]
svs[SV_class == "p_del_h2", SV_class := "het_del"]
svs[SV_class == "p_dup_hom", SV_class := "hom_dup"]
svs[SV_class == "p_dup_h1", SV_class := "het_dup"]
svs[SV_class == "p_dup_h2", SV_class := "het_dup"]
### Determine sensitivity (recall)
# annotate "real" and "svs" with locus ids for easier overlap
real = merge(real, LOCI[, .(chrom, start, end, id)], by =c("chrom","start","end"), all = T)
svs = merge(svs, LOCI[, .(chrom, start = start.call, end = end.call, id)], by =c("chrom","start","end"), all.x = T)
# recall = svs correctly detected / svs simulated
# precision = svs correctly detected / all detected svs
#
recall = merge(real[, .(id, sample, cell, simulated = SV_type)],
svs[!is.na(id) & SV_llr >= MIN_LLR, .(id, sample, cell, called = SV_class)],
by = c("id","sample","cell"), all.x = T)
recall = recall[, .(recall.correct = sum(simulated[!is.na(called)] == called[!is.na(called)]),
recall.any = sum(!is.na(called)),
recall.total = .N),
by = .(id)]
LOCI = merge(LOCI, recall, by = "id")
prec = svs[SV_llr >= MIN_LLR, .N, by = .(chrom, start, end, id, SV_class)]
# Plot: number of breakpoints that were detected
plt_obj = LOCI[, .(size = end-start, VAF, SV_type, detected = !is.na(start.call))]
plt1 <- ggplot(plt_obj) +
geom_point(size = 2, aes(size, VAF, col = detected)) +
xlab("SV size") +
scale_color_discrete(name = paste("based on",MIN_RCO*100,"% rec. ovl")) +
theme(legend.position = "bottom") +
ggtitle(paste0("Were the breakpoints detected? ", nrow(plt_obj[detected==TRUE]), "/", nrow(plt_obj)))
# Plot: Recall on detected loci
plt_obj = LOCI[!is.na(start.call), .(size = end - start, VAF, SV_type, n = recall.total, k = recall.correct)]
plt2 = ggplot(plt_obj) +
geom_point(aes(size, VAF, col = k/n)) +
scale_color_gradientn(name = "Recall (correct SV type)", colours = c("blue","orange","green")) +
facet_wrap(~SV_type, nrow = 1) +
theme(legend.position = "bottom", legend.key.width = unit(2,"cm")) +
xlab("SV size") +
ggtitle("Recall")
plt2
plt_obj = prec[is.na(id), .N, by = .(chrom,start,end)]
plt_obj[, size := factor((end - start) < 1e6, c(T,F), c("< 1Mb", ">= 1Mb"))]
plt3 = ggplot(plt_obj) +
geom_histogram(aes(N), binwidth = 1) +
facet_wrap(~size) +
ggtitle(paste0("SV calls outside simulated region (", sum(plt_obj$N), ")")) +
xlab("Num. calls at locus")
plt3
plt_obj = prec[is.na(id)]
plt4 = ggplot(plt_obj) +
geom_bar(aes(SV_class)) +
ggtitle(paste0("Wrongly called SV classes"))
plt4
final = ggdraw() +
draw_label(sv_input, x=0.5, y=.97) +
draw_plot(plt1, 0, 0.4, 0.33, 0.4) +
draw_plot(plt2, 0, 0, 1, 0.4) +
draw_plot(plt3, 0.33, 0.4, 0.33, 0.4) +
draw_plot(plt4, 0.66, 0.4, 0.33, 0.4)
final
ggsave(snakemake@output[[1]], final, width = 29.7/1.5, height = 21/1.5)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment