Skip to content
Snippets Groups Projects
Commit 12efaf98 authored by Sascha Meiers's avatar Sascha Meiers
Browse files

Making a simple SV caller

parent 722f8a76
No related branches found
No related tags found
No related merge requests found
......@@ -40,7 +40,7 @@ rule simul:
expand("sv_calls/simulation{seed}-{window}/{window}_fixed.{segments}/{method}.{chrom}.pdf",
seed = list(range(7)),
window = [50000],
segments = ["medium"],
segments = ["few","medium"],
method = METHODS,
chrom = config["chromosomes"]),
expand("plots/simulation{seed}-{window}/{window}_fixed.pdf",
......@@ -303,7 +303,7 @@ rule mosaiClassifier_make_call:
"sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
output:
"sv_calls/{sample}/{windows}.{bpdens}/simpleCalls.txt"
shell:
script:
"utils/mosaiClassifier_call.snakemake.R"
rule mosaiClassifier_calc_probs:
......
sink(snakemake@log[[1]])
library(data.table)
library(assertthat)
source("utils/mosaiClassifier/mosaiClassifier.R")
# delete: probs = readRDS("sv_probabilities/simulation1-50000/50000_fixed.medium/probabilities.Rdata")
makeSVCallSimple <- function(probs, llr_thr = 1) {
# Do post-processing incl. priors + normalization + regularization
probs = mosaiClassifierPostProcessing(probs)
setkey(probs, chrom, start, end, sample, cell)
# annotate the ref_hom posterior probability per segment / cell
probs[,
ref_hom_pp := .SD[haplo_name == "ref_hom", nb_hap_pp],
by = .(chrom, start, end, sample, cell)]
# order the different haplotype states based on their posterior prob. (nb_hap_pp)
# and keep only the two most likely states
probs[, rank := NULL]
probs[,
rank := frank(-nb_hap_pp, ties.method = "first"),
by = .(chrom, start, end, sample, cell)]
probs <- probs[rank <= 2]
# Sort by rank within each group
setkey(probs, chrom, start, end, sample, cell, rank)
# select first and 2nd SV call
probs[, `:=`(sv_call_name = haplo_name[rank==1],
sv_call_haplotype = haplotype[rank==1],
sv_call_name_2nd = haplo_name[rank==2],
sv_call_haplotype_2nd = haplotype[rank==2],
llr_to_ref = log(nb_hap_pp[rank==1]) - log(ref_hom_pp[rank==1]),
llr_to_2nd = log(nb_hap_pp[rank==1]) - log(nb_hap_pp[rank==2]))]
probs <- probs[rank == 1]
# Clean up table
probs <- probs[, .(chrom, start, end, sample, cell, class, sv_call_name, sv_call_haplotype, sv_call_name_2nd, sv_call_haplotype_2nd, llr_to_ref, llr_to_2nd)]
return(probs[sv_call_name != "ref_hom" & llr_to_ref > llr_thr])
}
f_maryam = snakemake@input[["probs"]]
f_maryam
f_info = snakemake@input[["info"]]
f_info
sample = snakemake@params[["sample_name"]]
sample
f_bamNames = snakemake@input[["bamNames"]]
f_bamNames
d = fread(f_maryam, header = T)
print("Maryam's data:")
d
f = fread(f_info)
print("Info data:")
f
b = fread(f_bamNames)
b = b[order(cell_id),]
print("bamNames data:")
b
# Map cell number to cell name
assert_that(max(d$cells) <= nrow(f))
assert_that(max(d$cells) <= max(b$cell_id))
assert_that(all(1:nrow(b) == b$cell_id)) # Make sure that bamNames are sorted and exactly match numbers 1...n
d$cell = b[d$cells,]$cell_name
d$sample = sample
d
# Get log likelihood ratio of major SV classes
d <- d[, .(chrom = chr,
start = format(start, scientific=F),
end = format(end, scientific=F),
sample,
cell,
type = toupper(types),
p_del_hom = log(`0000`) - log(`1010`),
p_del_h1 = log(`0010`) - log(`1010`),
p_del_h2 = log(`1000`) - log(`1010`),
p_inv_hom = log(`0101`) - log(`1010`),
p_inv_h1 = log(`0110`) - log(`1010`),
p_inv_h2 = log(`1001`) - log(`1010`),
p_dup_hom = log(`2020`) - log(`1010`),
p_dup_h1 = log(`2010`) - log(`1010`),
p_dup_h2 = log(`1020`) - log(`1010`)) ]
# keep only entries with an SV call
# log likelihood ratio >= 1
LLR = 1
e = melt(d,
id.vars = c("chrom","start","end","sample","cell"),
measure.vars = c("p_del_h1", "p_del_h2", "p_inv_hom", "p_inv_h1", "p_inv_h2", "p_dup_hom", "p_dup_h1", "p_dup_h2"),
variable.name = "SV_class",
value.name = "loglikratio",
variable.factor = F)
e = e[loglikratio>= LLR, .SD[order(loglikratio, decreasing = T)][1,], by = .(chrom, start, end, sample, cell)]
e[, SV_class := substr(SV_class,3,nchar(SV_class))]
write.table(e, file = snakemake@output[[1]], quote=F, col.names = T, row.names = F, sep = "\t")
......@@ -241,8 +241,8 @@ mosaiClassifierPostProcessing <- function(probs, haplotypeMode=F, regularization
# testing if there are some segments with zero probability for all haplotypes
segs_max_hap_nb_probs <- probs[,
.(sample, chrom, cell, start, end, max_nb_hap_ll=rep(max(nb_hap_ll), .N)),
by=.(sample, chrom, cell, start, end)]
.(sample, cell, chrom, start, end, max_nb_hap_ll=rep(max(nb_hap_ll), .N)),
by=.(sample, cell, chrom, start, end)]
message(paste("the number of segments with 0 prob for all haplotypes = ",
segs_max_hap_nb_probs[max_nb_hap_ll==0, .N]))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment