diff --git a/Snakefile b/Snakefile index c61fecf3f398a634e14bffa1f3ab0f086b3d235a..d59643bdac80f1110b27015e054e8dc0b64dc54f 100644 --- a/Snakefile +++ b/Snakefile @@ -40,7 +40,7 @@ rule simul: expand("sv_calls/simulation{seed}-{window}/{window}_fixed.{segments}/{method}.{chrom}.pdf", seed = list(range(7)), window = [50000], - segments = ["medium"], + segments = ["few","medium"], method = METHODS, chrom = config["chromosomes"]), expand("plots/simulation{seed}-{window}/{window}_fixed.pdf", @@ -303,7 +303,7 @@ rule mosaiClassifier_make_call: "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata" output: "sv_calls/{sample}/{windows}.{bpdens}/simpleCalls.txt" - shell: + script: "utils/mosaiClassifier_call.snakemake.R" rule mosaiClassifier_calc_probs: diff --git a/utils/mosaiClassifier/makeSVcallsSimple.R b/utils/mosaiClassifier/makeSVcallsSimple.R index 80bd77fc7d546dfb3adb8f2e515ca38c60dc6fd3..a32104d449393c86a9ba4c5530c1df6837546130 100644 --- a/utils/mosaiClassifier/makeSVcallsSimple.R +++ b/utils/mosaiClassifier/makeSVcallsSimple.R @@ -1,63 +1,44 @@ -sink(snakemake@log[[1]]) library(data.table) library(assertthat) +source("utils/mosaiClassifier/mosaiClassifier.R") + +# delete: probs = readRDS("sv_probabilities/simulation1-50000/50000_fixed.medium/probabilities.Rdata") + +makeSVCallSimple <- function(probs, llr_thr = 1) { + + # Do post-processing incl. priors + normalization + regularization + probs = mosaiClassifierPostProcessing(probs) + setkey(probs, chrom, start, end, sample, cell) + + # annotate the ref_hom posterior probability per segment / cell + probs[, + ref_hom_pp := .SD[haplo_name == "ref_hom", nb_hap_pp], + by = .(chrom, start, end, sample, cell)] + + # order the different haplotype states based on their posterior prob. (nb_hap_pp) + # and keep only the two most likely states + probs[, rank := NULL] + probs[, + rank := frank(-nb_hap_pp, ties.method = "first"), + by = .(chrom, start, end, sample, cell)] + probs <- probs[rank <= 2] + # Sort by rank within each group + setkey(probs, chrom, start, end, sample, cell, rank) + + # select first and 2nd SV call + probs[, `:=`(sv_call_name = haplo_name[rank==1], + sv_call_haplotype = haplotype[rank==1], + sv_call_name_2nd = haplo_name[rank==2], + sv_call_haplotype_2nd = haplotype[rank==2], + llr_to_ref = log(nb_hap_pp[rank==1]) - log(ref_hom_pp[rank==1]), + llr_to_2nd = log(nb_hap_pp[rank==1]) - log(nb_hap_pp[rank==2]))] + probs <- probs[rank == 1] + + + # Clean up table + probs <- probs[, .(chrom, start, end, sample, cell, class, sv_call_name, sv_call_haplotype, sv_call_name_2nd, sv_call_haplotype_2nd, llr_to_ref, llr_to_2nd)] + + return(probs[sv_call_name != "ref_hom" & llr_to_ref > llr_thr]) +} + -f_maryam = snakemake@input[["probs"]] -f_maryam -f_info = snakemake@input[["info"]] -f_info -sample = snakemake@params[["sample_name"]] -sample -f_bamNames = snakemake@input[["bamNames"]] -f_bamNames - -d = fread(f_maryam, header = T) -print("Maryam's data:") -d -f = fread(f_info) -print("Info data:") -f -b = fread(f_bamNames) -b = b[order(cell_id),] -print("bamNames data:") -b - -# Map cell number to cell name -assert_that(max(d$cells) <= nrow(f)) -assert_that(max(d$cells) <= max(b$cell_id)) -assert_that(all(1:nrow(b) == b$cell_id)) # Make sure that bamNames are sorted and exactly match numbers 1...n - -d$cell = b[d$cells,]$cell_name -d$sample = sample -d - -# Get log likelihood ratio of major SV classes -d <- d[, .(chrom = chr, - start = format(start, scientific=F), - end = format(end, scientific=F), - sample, - cell, - type = toupper(types), - p_del_hom = log(`0000`) - log(`1010`), - p_del_h1 = log(`0010`) - log(`1010`), - p_del_h2 = log(`1000`) - log(`1010`), - p_inv_hom = log(`0101`) - log(`1010`), - p_inv_h1 = log(`0110`) - log(`1010`), - p_inv_h2 = log(`1001`) - log(`1010`), - p_dup_hom = log(`2020`) - log(`1010`), - p_dup_h1 = log(`2010`) - log(`1010`), - p_dup_h2 = log(`1020`) - log(`1010`)) ] - - -# keep only entries with an SV call -# log likelihood ratio >= 1 -LLR = 1 -e = melt(d, - id.vars = c("chrom","start","end","sample","cell"), - measure.vars = c("p_del_h1", "p_del_h2", "p_inv_hom", "p_inv_h1", "p_inv_h2", "p_dup_hom", "p_dup_h1", "p_dup_h2"), - variable.name = "SV_class", - value.name = "loglikratio", - variable.factor = F) -e = e[loglikratio>= LLR, .SD[order(loglikratio, decreasing = T)][1,], by = .(chrom, start, end, sample, cell)] -e[, SV_class := substr(SV_class,3,nchar(SV_class))] -write.table(e, file = snakemake@output[[1]], quote=F, col.names = T, row.names = F, sep = "\t") diff --git a/utils/mosaiClassifier/mosaiClassifier.R b/utils/mosaiClassifier/mosaiClassifier.R index 6aaeaa7fff34814c81a7c1679fbb59b1bc5175c5..445c8267ebb11bb997d648278099030134fc4998 100644 --- a/utils/mosaiClassifier/mosaiClassifier.R +++ b/utils/mosaiClassifier/mosaiClassifier.R @@ -241,8 +241,8 @@ mosaiClassifierPostProcessing <- function(probs, haplotypeMode=F, regularization # testing if there are some segments with zero probability for all haplotypes segs_max_hap_nb_probs <- probs[, - .(sample, chrom, cell, start, end, max_nb_hap_ll=rep(max(nb_hap_ll), .N)), - by=.(sample, chrom, cell, start, end)] + .(sample, cell, chrom, start, end, max_nb_hap_ll=rep(max(nb_hap_ll), .N)), + by=.(sample, cell, chrom, start, end)] message(paste("the number of segments with 0 prob for all haplotypes = ", segs_max_hap_nb_probs[max_nb_hap_ll==0, .N]))