From 6af66f000a711b33ac70961b88b93f7a2e7d06ec Mon Sep 17 00:00:00 2001
From: Tobias Marschall <tobias.marschall@0ohm.net>
Date: Fri, 13 Jul 2018 09:00:57 +0200
Subject: [PATCH] Create haplotag count tables

---
 Snakefile                       | 38 +++++++++++++++++++++-----
 utils/haplotagTable.R           | 47 +++++++--------------------------
 utils/haplotagTable.snakemake.R | 10 +++++++
 3 files changed, 52 insertions(+), 43 deletions(-)
 create mode 100644 utils/haplotagTable.snakemake.R

diff --git a/Snakefile b/Snakefile
index 030a0df..a11bac7 100644
--- a/Snakefile
+++ b/Snakefile
@@ -60,13 +60,16 @@ rule all:
                bpdens = ["few","medium","many"],
                method = METHODS),
         expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLES, chrom = config["chromosomes"]),
-        expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{af}.pdf", 
+        expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{af}.pdf",
                sample = SAMPLES,
                window = [50000, 100000],
                bpdens = ["few","medium","many"],
                method = METHODS,
                af = ["high","med","low","rare"]),
-        ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[sample] for sample in SAMPLES],
+        expand("haplotag/table/{sample}/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv",
+               sample = SAMPLES,
+               window = [50000, 100000],
+               bpdens = ["few","medium","many"]),
 
 
 ################################################################################
@@ -686,6 +689,29 @@ rule haplotag_bams:
     shell:
         "whatshap haplotag -o {output.bam} -r {input.ref} {input.vcf} {input.bam} > {log} 2>{log}"
 
+rule create_haplotag_segment_bed:
+    input:
+        segments="segmentation2/{sample}/{size,[0-9]+}{what}.{bpdens}.txt",
+    output:
+        bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens}.bed",
+    shell:
+        "awk 'BEGIN {{s={wildcards.size};OFS=\"\\t\"}} $2!=c {{prev=0}} NR>1 {{print $2,prev*s+1,($3+1)*s; prev=$3+1; c=$2}}' {input.segments} > {output.bed}"
+
+rule create_haplotag_table:
+    input:
+        bams=lambda wc: ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
+        bais=lambda wc: ['haplotag/bam/{}/{}.bam.bai'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
+        bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed"
+    output:
+        tsv='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
+    params:
+        bam_path='haplotag/bam/{sample}/',
+    log:
+        "log/create_haplotag_table/{sample}.log"
+    script:
+        "utils/haplotagTable.snakemake.R"
+
+
 ################################################################################
 # Call SNVs                                                                    #
 ################################################################################
@@ -702,13 +728,13 @@ rule mergeBams:
     shell:
         config["samtools"] + " merge -@ {threads} {output} {input} 2>&1 > {log}"
 
-rule indexMergedBam:
+rule index_bam:
     input:
-        "snv_calls/{sample}/merged.bam"
+        "{file}.bam"
     output:
-        "snv_calls/{sample}/merged.bam.bai"
+        "{file}.bam.bai"
     log:
-        "log/indexMergedBam/{sample}.log"
+        "{file}.bam.log"
     shell:
         config["samtools"] + " index {input} 2> {log}"
 
diff --git a/utils/haplotagTable.R b/utils/haplotagTable.R
index 180758c..5f9256d 100755
--- a/utils/haplotagTable.R
+++ b/utils/haplotagTable.R
@@ -4,38 +4,6 @@ library(ggplot2)
 library(cowplot)
 library(BiocParallel)
 
-## load required function [see below!!!]
-
-## run the command
-#tab <- getHaplotagTable(sv.table = "/media/porubsky/Elements/StrandSeqNation/C7/HaplotaggedBams/biAllelic_llr4_100kb_fixed_many.txt", bam.path = "/media/porubsky/Elements/StrandSeqNation/C7/HaplotaggedBams/")
-#This new function takes as an input bed file of genomic regions without the header line <chr start end>
-tab <- getHaplotagTable2(bedFile = "<regions.bed>", bam.path = "<bams>", CPUs=4)
-
-## data quality check [Not compete!!!]
-#final.table <- tab
-#final.table$total.reads <- final.table$crick.count + final.table$watson.count
-#final.table$haplotagged <- apply(final.table[,c(10:13)], 1, sum)
-#final.table$haplotagged.perc <- (final.table$haplotagged / final.table$total.reads)*100
-
-## tagged <- as.data.frame(table(final.table$haplotagged.perc > 0))
-#tagged$ID <- c("untagged", "tagged")
-#p1 <- ggplot(tagged, aes(x="", y=Freq, fill=ID)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + theme_bw()
-
-#final.table.cov <- split(final.table, final.table$haplotagged.perc > 0)
-#final.table$regionID <- paste(final.table$chrom, final.table$start, final.table$end, sep="_")
-#final.table.perRegion <- split(final.table, final.table$regionID)
-## get number of informative regions
-#inform.regions <- table(sapply(final.table.perRegion, function(x) sum(x$haplotagged)) == 0)
-#inform.regions <- as.data.frame(inform.regions)
-#inform.regions$ID <- c('Informative', 'Uninformative')
-#p2 <- ggplot(inform.regions, aes(x="", y=Freq, fill=ID)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + theme_bw()
-#plot_grid(p1, p2, nrow = 1)
-
-
-#########################
-### ==> FUNCTIONS <== ###
-#########################
-
 #' Print haplotagged read counts
 #'
 #' This function will take \code{list} of haplotagged bams files and will return \link{data.frame}
@@ -100,12 +68,17 @@ getHaplotagTable <- function(sv.table=NULL, bam.path=NULL) {
 #' @param CPUs Number of CPUs to use for data processing.
 #' @author David Porubsky
 
-getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4) {
+getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destination=NULL) {
   
   suppressPackageStartupMessages({
     requireNamespace("tools")
   })
   
+  message("Creating haplotag table")
+  message("BED file: ", bedFile)
+  message("BAM path: ", bam.path)
+  message("Output file: ", file.destination)
+
   ## read the SV table
   regions <- read.table(bedFile, header = FALSE, stringsAsFactors = FALSE)
   regions.gr <- GRanges(seqnames=regions$V1, ranges=IRanges(start=regions$V2, end=regions$V3))
@@ -136,9 +109,9 @@ getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4) {
   }
   
   final.table <- do.call(rbind, all.counts)
-  file.base <- gsub(sv.table, pattern = "\\.txt|\\.bed|\\.csv|\\.tsv", replacement = "")
-  file.destination <- paste0(file.base, "_haplotaggedCounts.txt") 
-  write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE)
+  if (!is.null(file.destination)){
+    write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE)
+  }
   message("DONE!!!")
   
   return(final.table)
@@ -248,4 +221,4 @@ bamregion2GRanges <- function(bamfile, bamindex=bamfile, region=NULL, pairedEndR
   data <- GenomeInfoDb::keepSeqlevels(data, seqlevels(region), pruning.mode="coarse")
   
   return(data)
-}
\ No newline at end of file
+}
diff --git a/utils/haplotagTable.snakemake.R b/utils/haplotagTable.snakemake.R
new file mode 100644
index 0000000..7286ab3
--- /dev/null
+++ b/utils/haplotagTable.snakemake.R
@@ -0,0 +1,10 @@
+log <- file(snakemake@log[[1]], open='wt')
+sink(file=log, type='message')
+sink(file=log, type='output')
+
+source("utils/haplotagTable.R")
+
+
+tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.path = snakemake@params[["bam_path"]], file.destination = snakemake@output[["tsv"]])
+
+#SVplotting(snakemake@input[["sv_calls"]], snakemake@output[["barplot_high"]], snakemake@output[["barplot_med"]], snakemake@output[["barplot_low"]], snakemake@output[["barplot_rare"]])
-- 
GitLab