From 70ddcf131687bbf9bd7be896e9f1768d43793033 Mon Sep 17 00:00:00 2001
From: maryamghr <maryam@lap-13-67.mmci.uni-saarland.de>
Date: Tue, 17 Jul 2018 17:33:45 +0200
Subject: [PATCH] added scripts and rules for computing haplotagged read counts
 probabilities

---
 Snakefile                       | 10 +++++
 utils/haplotagProbs.R           | 77 +++++++++++++++++++++++++++++++++
 utils/haplotagProbs.snakemake.R |  8 ++++
 3 files changed, 95 insertions(+)
 create mode 100644 utils/haplotagProbs.R
 create mode 100644 utils/haplotagProbs.snakemake.R

diff --git a/Snakefile b/Snakefile
index 9f5afef..8116d8e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -711,6 +711,16 @@ rule create_haplotag_table:
     script:
         "utils/haplotagTable.snakemake.R"
 
+rule create_haplotag_likelihoods:
+	input:
+		haplotag_table='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
+		sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata'
+	output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.data'
+	log:
+        "log/create_haplotag_likelihoods/{sample}.log"
+	script:
+		"utils/haplotagProbs.snakemake.R"
+
 
 ################################################################################
 # Call SNVs                                                                    #
diff --git a/utils/haplotagProbs.R b/utils/haplotagProbs.R
new file mode 100644
index 0000000..09cfe4c
--- /dev/null
+++ b/utils/haplotagProbs.R
@@ -0,0 +1,77 @@
+suppressMessages(library(mc2d))
+suppressMessages(library(dplyr))
+suppressMessages(library(data.table))
+suppressMessages(library(assertthat))
+source("utils/mosaiClassifier/getDispParAndSegType.R")
+
+# defining multinomial parameters based on haplosegType and alpha
+getMultinomialParams <- function(haploSegType, alpha)
+{
+  CNs <- NULL
+  if (haploSegType=="?"){
+    CNs <- rep(-1, 4)
+  }
+  
+  else{
+    CNs <- as.numeric(unlist(strsplit(haploSegType, "")))
+    CNs[CNs==0]=alpha
+    CNs <- CNs/sum(CNs)
+  }
+  
+  return(list(multi.p.W.h1=CNs[1],
+              multi.p.C.h1=CNs[2],
+              multi.p.W.h2=CNs[3],
+              multi.p.C.h2=CNs[4]))
+}
+
+# adding an extra column for multinomial probabilities
+addHaploCountProbs <- function(probs, haploCounts, alpha)
+{
+  assert_that(is.data.table(probs),
+              "sample" %in% colnames(probs),
+              "cell"   %in% colnames(probs),
+              "chrom"  %in% colnames(probs),
+              "start"  %in% colnames(probs),
+              "end"    %in% colnames(probs),
+              "nb_p"   %in% colnames(probs),
+              "expected"     %in% colnames(probs),
+              "W"      %in% colnames(probs),
+              "C"      %in% colnames(probs),
+              "class" %in% colnames(probs),
+              "haplotype"  %in% colnames(probs)) %>% invisible
+  
+  # sorting probs
+  setkey(probs, sample, chrom, start, end, cell, haplotype)
+  
+  # adding segTypes and haplo segTypes
+  probs[, `:=`(segtype=paste0(getSegType(class[1], haplotype[1]), collapse = ''),
+               haploSegType=ifelse(class=="?", "?", paste0(c(getSegType(substr(class[1],1,1), substr(haplotype[1],1,2)),
+                                                             getSegType(substr(class[1],2,2), substr(haplotype[1],3,4))), collapse = ''))),
+        by = .(class, haplotype)]
+  
+  # merging probs and haplotype counts
+  probs <- merge(probs,
+                 haploCounts[,.(chrom, start, end, cell, watson.H1, crick.H1, watson.H2, crick.H2)],
+                 by=c("chrom", "start", "end", "cell"),
+                 allow.cartesian = T,
+                 all.x = T)
+  
+  # define multinomial probabilities for each of the four different outcomes based on haplo segtype
+  probs[, c("multi.p.W.h1", "multi.p.C.h1", "multi.p.W.h2", "multi.p.C.h2"):=getMultinomialParams(haploSegType[1], alpha), 
+        by=haploSegType]
+  
+  # add a column for multinomial probabilities of haplotagged read counts
+  probs.new <- probs[!is.na(watson.H1) & !is.na(multi.p.W.h1)]
+  
+  probs.new[, multi.prob:=dmultinomial(x=as.matrix(probs.new[,.(watson.H1, crick.H1, watson.H2, crick.H2)]), 
+                                       prob=as.matrix(probs.new[,.(multi.p.W.h1, multi.p.C.h1, multi.p.W.h2, multi.p.C.h2)]))]
+  
+  # merge probs.new and probs
+  probs <- merge(probs, 
+                 probs.new[, .(chrom, start, end, cell, sample, haplotype, multi.prob)],
+                 by=c("chrom", "start", "end", "cell", "sample", "haplotype"),
+                 all.x = T,
+                 allow.cartesian = T)
+  
+  return(probs)
+}
\ No newline at end of file
diff --git a/utils/haplotagProbs.snakemake.R b/utils/haplotagProbs.snakemake.R
new file mode 100644
index 0000000..14e14df
--- /dev/null
+++ b/utils/haplotagProbs.snakemake.R
@@ -0,0 +1,8 @@
+sink(snakemake@log[[1]])
+
+haplotagCounts <- snakemake@input[[haplotag_table]]
+probs <- snakemake@input[[sv_probs_table]]
+
+probs <- addHaploCountProbs(probs, haploCounts, alpha)
+
+save(probs, file=snakemake@output[[1]])
\ No newline at end of file
-- 
GitLab