-
Sascha Meiers authoredSascha Meiers authored
mosaiClassifier.R 10.10 KiB
library(dplyr)
library(data.table)
library(assertthat)
source("utils/mosaiClassifier/getStrandStates.R")
source("utils/mosaiClassifier/getCountsPerSegment.R")
source("utils/mosaiClassifier/generateHaploStates.R")
source("utils/mosaiClassifier/getDispParAndSegType.R")
source("utils/mosaiClassifier/haploAndGenoName.R")
mosaiClassifierPrepare <- function(counts, info, strand, segs) {
##############################################################################
# Check input data
#
assert_that(is.data.table(counts),
"chrom" %in% colnames(counts),
"start" %in% colnames(counts),
"end" %in% colnames(counts),
"sample"%in% colnames(counts),
"cell" %in% colnames(counts)) %>% invisible
setkey(counts, sample, cell, chrom, start, end)
assert_that(is.data.table(info),
"sample"%in% colnames(info),
"cell" %in% colnames(info),
"nb_p" %in% colnames(info),
"nb_r" %in% colnames(info),
"nb_a" %in% colnames(info),
"pass1" %in% colnames(info)) %>% invisible
setkey(info,sample,cell)
assert_that(is.data.table(strand),
"sample"%in% colnames(strand),
"cell" %in% colnames(strand),
"chrom" %in% colnames(strand),
"start" %in% colnames(strand),
"end" %in% colnames(strand),
"class" %in% colnames(strand)) %>% invisible
setkey(strand, sample, cell, chrom, start, end)
# segs
assert_that(is.data.table(segs),
"chrom" %in% colnames(segs),
"bps" %in% colnames(segs)) %>% invisible
setkey(segs, chrom, bps)
# Kick out non-PASS cells
if (nrow(info[pass1 != 1])> 0) {
message("[SV classifier] Kicking out ",
nrow(info[pass1 != 1]),
" low quality cells. ",
nrow(info[pass1 == 1]),
" remain.")
info <- info[pass1 == 1,]
counts <- counts[ paste(sample,cell) %in% info[,paste(sample,cell)] ]
}
# Check that set of cells in counts is the same as in info
assert_that(all(unique(counts[,.(sample,cell)]) == unique(info[,.(sample,cell)]))) %>% invisible
##############################################################################
# Estimation mean read count per bin for dispersion parameters
#
message("[MosaiClassifier] Problem size: ",
nrow(info),
" cells x ",
nrow(segs),
" segments.")
# Get trimmed mean of counts per bin to estimate the r parameter
# When calculating this, ignore the "None" bins
info <- suppressWarnings(
merge(info,
counts[, .(mean = mean((w+c)[class != "None"], trim = 0.05)),
by = .(sample, cell)],
by = c("sample","cell")) )
##############################################################################
# Expand table to (all cells) x (all segments)
#
# add a "from" column which contians the "to" breakpoint from the prev. segment each
segs[, from := shift(bps,fill = 0) + 1, by = chrom]
# rename the "bps" column to "to"
segs[, `:=`(to = bps, bps = NULL, k = NULL)]
# Add coordinates
addPositions(segs, counts)
# Expand the table to (cells) x (segments) and annotate with NB params
# --> take each row in "segs" and cbind it to a whole "info" table
probs <- segs[,
cbind(.SD, info[,.(sample, cell, nb_p, mean)]),
by = .(chrom,from)]
##############################################################################
# Annotate each segment and cell with the strand state
#
message("[MosaiClassifier] Annotating strand-state")
probs = addStrandStates(probs, strand)
##############################################################################
# Annotate the observed and expected counts in each segment / cell
#
message("[MosaiClassifier] Annotating expected coverage")
probs[, expected := (to - from +1)*mean, by = .(sample, cell, chrom, from, to)]
message("[MosaiClassifier] Annotating observed W/C counts")
probs <- addCountsPerSegment(probs, counts)
probs[, scalar := 1]
# Clean up table
probs[, `:=`(from = NULL,
to = NULL,
mean = NULL)]
return(probs)
}
mosaiClassifierCalcProbs <- function(probs, maximumCN=4, haplotypeMode=F, alpha=0.05) {
assert_that(is.data.table(probs),
"sample" %in% colnames(probs),
"cell" %in% colnames(probs),
"chrom" %in% colnames(probs),
"start" %in% colnames(probs),
"end" %in% colnames(probs),
"nb_p" %in% colnames(probs),
"expected" %in% colnames(probs),
"scalar" %in% colnames(probs),
"W" %in% colnames(probs),
"C" %in% colnames(probs),
!("haplotype" %in% colnames(probs)),
!("haplo_name" %in% colnames(probs)),
!("nb_hap_ll" %in% colnames(probs)))
# defining the vector of all possible haplotypes
hapStatus <- NULL
for (j in 0:maximumCN){
hapStatus <- c(hapStatus, allStatus(3,j))
}
for (j in 1:length(hapStatus)){
hapStatus[j] <- paste(decodeStatus(hapStatus[j]), collapse = '')
}
# creating a datatable containing all possible combinations of strand states and haplotypes,
# and setting their segTypes
hapStrandStates <- data.table()
for (st in c("CC","WW","WC","CW")){
hapStrandStates <- rbind(hapStrandStates,
data.table(class=st, haplotype=hapStatus,
segtype=t(sapply(hapStatus, function(x) getSegType(st, x)))))
}
# naming third and forth columns
colnames(hapStrandStates)[3:4]=c("Wcn", "Ccn")
# Add SCE states where all possibilities are equal (by setting Wcn and Ccn to 0):
hapStrandStates = rbind(hapStrandStates,
data.table(class = "?",
haplotype = hapStatus,
Wcn = 1,
Ccn = 1))
# Make sure that all states listed in probs are one of "WW","WC","CW","CC", or "?"
assert_that(all(unique(probs$class) %in% c("WW","WC","CW","CC","?"))) %>% invisible
# adding haplotype and genotype name columns
hapStrandStates[,haplo_name:=.(sapply(haplotype, get_hap_name))]
hapStrandStates[,geno_name:=.(sapply(haplo_name, haplo_to_geno_name))]
# sort based on state
setkey(hapStrandStates, class)
##### mering probs and haplotype strand states
message("[MosaiClassifier] Expand table by ",
length(hapStatus),
" possible haplotype states")
probs <- merge(probs,
hapStrandStates,
by = "class",
allow.cartesian = T)
###########
# reshuffling the columns
probs <- probs[,.(sample, cell, chrom, start, end, start, end, class, nb_p, expected,
W, C, scalar, haplotype, Wcn, Ccn, haplo_name, geno_name)]
# computing dispersion parameters seperately for each segment and W and C counts ("Wcn" and "Ccn")
message("[MosaiClassifier] Calculate dispersion parameters")
probs <- add_dispPar(probs, alpha)
# compute NB haplotype likelihoods
message("[MosaiClassifier] Calculate NB likelihoods")
probs[, nb_hap_ll := dnbinom(W, size = disp_w, prob = nb_p)
* dnbinom(C, size = disp_c, prob = nb_p)]
# computing sister haplotype (haplotype with the same genotype) for each haplotype
sister.haps <- sapply(hapStatus, sisterHaplotype)
sister.hap.pos <- match(sister.haps, hapStatus)
# compute the set of symmetric haplotypes (haplotypes that are equal to their sister haplotype)
symmetric.haps <- hapStatus[which(sister.haps==hapStatus)]
# averaging the nb probs of sister haplotypes, when haplotype specific strand states are not known
# adding genotype likelihoods, if haplotype mode is false
if (!haplotypeMode) {
message("[MosaiClassifier] Haplotype-unaware mode. H1 and H2 events are averaged.")
probs[,nb_hap_ll:=.((nb_hap_ll+nb_hap_ll[sister.hap.pos])/2), by=.(sample, cell, chrom, start, end)]
}
# computing genotype likelihoods
probs[,nb_gt_ll:=.(nb_hap_ll+nb_hap_ll[sister.hap.pos]), by=.(sample, cell, chrom, start, end)]
# deviding the gt likelihoods of symmetric haplotypes by 2
probs[haplotype %in% symmetric.haps, nb_gt_ll:=.(nb_gt_ll/2)]
# Clean up table:
probs[, `:=`(Wcn = NULL,
Ccn = NULL,
disp_w = NULL,
disp_c = NULL)]
return(probs)
}
mosaiClassifierPostProcessing <- function(probs, haplotypeMode=F, regularizationFactor=1e-10)
{
assert_that(is.data.table(probs))
# check the colnames
# testing if there are some segments with zero probability for all haplotypes
segs_max_hap_nb_probs <- probs[,
.(sample, cell, chrom, start, end, max_nb_hap_ll=rep(max(nb_hap_ll), .N)),
by=.(sample, cell, chrom, start, end)]
message(paste("the number of segments with 0 prob for all haplotypes = ",
segs_max_hap_nb_probs[max_nb_hap_ll==0, .N]))
# add prior probs to the table
probs[,prior:=100L]
probs[haplo_name=="ref_hom",prior:=200L]
probs[haplo_name=="complex",prior:=1L]
# compute the posteriori probs (add new columns)
probs[,nb_hap_pp:=.(nb_hap_ll*prior)][,nb_gt_pp:=.(nb_gt_ll*prior)]
# set a uniform prob on sce segs and the segs_max_hap_nb_probs=0
probs[segs_max_hap_nb_probs$max_nb_hap_ll==0,nb_hap_pp:=1L]
probs[class=="?", nb_hap_pp:=1L]
probs[segs_max_hap_nb_probs$max_nb_hap_ll==0,nb_gt_pp:=1L]
probs[class=="?", nb_gt_pp:=1L]
# normalizing nb_hap_pp and nb_gt_pp to 1 per sample, cell, and segment
probs[, nb_hap_pp := nb_hap_pp/sum(nb_hap_pp), by=.(sample, cell, chrom, start, end)]
probs[, nb_gt_pp := nb_gt_pp/sum(nb_gt_pp), by=.(sample, cell, chrom, start, end)]
# get the number of haplotypes
numHaps <- length(unique(probs$haplotype))
# regularizing nb_hap_ll to set the min possible likelihood to a constant small number
probs[,nb_hap_pp:=.((regularizationFactor/numHaps)+nb_hap_pp*(1-regularizationFactor))]
probs[,nb_gt_pp:=.((regularizationFactor/numHaps)+nb_gt_pp*(1-regularizationFactor))]
return(probs)
}