Generate haplotag count tables in parallel

997a7b69 · Tobias Marschall · 70ddcf13 · 997a7b69 · 997a7b69 · 997a7b69
Commit 997a7b69 authored 6 years ago by Tobias Marschall
--- a/Snakefile
+++ b/Snakefile
@@ -66,7 +66,7 @@ rule all:
               bpdens = ["few","medium","many"],
               method = METHODS,
               af = ["high","med","low","rare"]),
-        expand("haplotag/table/{sample}/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv",
+        expand("haplotag/table/{sample}/full/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv",
               sample = SAMPLES,
               window = [50000, 100000],
               bpdens = ["few","medium","many"]),
@@ -691,7 +691,7 @@ rule haplotag_bams:

 rule create_haplotag_segment_bed:
    input:
-        segments="segmentation2/{sample}/{size,[0-9]+}{what}.{bpdens}.txt",
+        segments="segmentation2/{sample}/{size}{what}.{bpdens}.txt",
    output:
        bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens}.bed",
    shell:
@@ -699,27 +699,34 @@ rule create_haplotag_segment_bed:

 rule create_haplotag_table:
    input:
-        bams=lambda wc: ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
-        bais=lambda wc: ['haplotag/bam/{}/{}.bam.bai'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
+        bam='haplotag/bam/{sample}/{cell}.bam',
+        bai='haplotag/bam/{sample}/{cell}.bam.bai',
        bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed"
    output:
-        tsv='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
-    params:
-        bam_path='haplotag/bam/{sample}/',
+        tsv='haplotag/table/{sample}/by-cell/haplotag-counts.{cell}.{windows}.{bpdens}.tsv'
    log:
-        "log/create_haplotag_table/{sample}.log"
+        "log/create_haplotag_table/{sample}.{cell}.log"
    script:
        "utils/haplotagTable.snakemake.R"

+rule merge_haplotag_tables:
+    input:
+        tsvs=lambda wc: ['haplotag/table/{}/by-cell/haplotag-counts.{}.{}.{}.tsv'.format(sample,cell,wc.windows,wc.bpdens) for cell in BAM_PER_SAMPLE[wc.sample]],
+    output:
+        tsv='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens}.tsv'
+    shell:
+        '(head -n1 {input.tsvs[0]} && tail -q -n +2 {input.tsvs}) > {output.tsv}'
+
+
 rule create_haplotag_likelihoods:
-	input:
-		haplotag_table='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
-		sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata'
-	output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.data'
-	log:
+    input:
+        haplotag_table='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
+        sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata'
+    output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.data'
+    log:
        "log/create_haplotag_likelihoods/{sample}.log"
-	script:
-		"utils/haplotagProbs.snakemake.R"
+    script:
+        "utils/haplotagProbs.snakemake.R"


 ################################################################################

--- a/utils/haplotagTable.R
+++ b/utils/haplotagTable.R
@@ -65,11 +65,11 @@ getHaplotagTable <- function(sv.table=NULL, bam.path=NULL) {
 #' counts of reads per haplotype.
 #'
 #' @param bedFile A path to a table in bed format with regions to count haplotagged reads.
-#' @param bam.path A path to the haplotagged bam files.
+#' @param bam.file BAM file name
 #' @param CPUs Number of CPUs to use for data processing.
 #' @author David Porubsky

-getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destination=NULL) {
+getHaplotagTable2 <- function(bedFile=NULL, bam.file=NULL, CPUs=4, file.destination=NULL) {
  
  suppressPackageStartupMessages({
    requireNamespace("tools")
@@ -77,54 +77,44 @@ getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destinat
  
  message("Creating haplotag table")
  message("BED file: ", bedFile)
-  message("BAM path: ", bam.path)
+  message("BAM file: ", bam.file)
  message("Output file: ", file.destination)

  ## read the SV table
  regions <- read.table(bedFile, header = FALSE, stringsAsFactors = FALSE)
  regions.gr <- GRanges(seqnames=regions$V1, ranges=IRanges(start=regions$V2, end=regions$V3))
-  ## list all bam files to count haplotagged reads in
-  haplotag.bams <- list.files(path = bam.path, pattern = "\\.bam$", full.names = T)
  
-  all.counts <- list()
-  for (i in 1:length(haplotag.bams)) {
-    bam <- haplotag.bams[i]
-    filename <- basename(bam)
-    cell.id <- unlist(strsplit(filename, "\\."))[1]
-    message("Processing bamfile ", filename, " ...", appendLF=F); ptm <- proc.time()    
-    
-    ## read in reads for selected regions
-    fragments <- bamregion2GRanges(bamfile = bam, region = regions.gr, pairedEndReads = T, min.mapq = 10, filterAltAlign = TRUE)
-    fragments$HP[is.na(fragments$HP)] <- 0 #set missing haplotag to zero
-    ## split reads per selected region
-    hits <- findOverlaps(regions.gr, fragments)
-    fragments.per.region <- split(fragments[subjectHits(hits)], queryHits(hits))
-    # subset regions to the regions that have non-zero read count
-    regions <- regions[unique(queryHits(hits)),]
-    ## count haplotagged reads in selected regions
-    #use parallel execution with a given number of CPUs
-    counts <- bplapply(fragments.per.region, getHapReadCount, BPPARAM = MulticoreParam(CPUs))
-    counts.df <- do.call(rbind, counts)

-    
-    # cbind regions to counts.df
-    counts.df <- cbind(cell=cell.id, regions, counts.df)
-    # renaming the regions columns
-    colnames(counts.df)[2:4] <- c("chrom", "start", "end")
-    
-    ## export final table of haplotagged read counts
-    all.counts[[i]] <- counts.df
-    
-    time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
-  }
+  filename <- basename(bam.file)
+  cell.id <- unlist(strsplit(filename, "\\."))[1]
+  message("Processing bamfile ", filename, " ...", appendLF=F); ptm <- proc.time()
+
+  ## read in reads for selected regions
+  fragments <- bamregion2GRanges(bamfile = bam.file, region = regions.gr, pairedEndReads = T, min.mapq = 10, filterAltAlign = TRUE)
+  fragments$HP[is.na(fragments$HP)] <- 0 #set missing haplotag to zero
+  ## split reads per selected region
+  hits <- findOverlaps(regions.gr, fragments)
+  fragments.per.region <- split(fragments[subjectHits(hits)], queryHits(hits))
+  # subset regions to the regions that have non-zero read count
+  regions <- regions[unique(queryHits(hits)),]
+  ## count haplotagged reads in selected regions
+  #use parallel execution with a given number of CPUs
+  counts <- bplapply(fragments.per.region, getHapReadCount, BPPARAM = MulticoreParam(CPUs))
+  counts.df <- do.call(rbind, counts)
+
+  # cbind regions to counts.df
+  counts.df <- cbind(cell=cell.id, regions, counts.df)
+  # renaming the regions columns
+  colnames(counts.df)[2:4] <- c("chrom", "start", "end")
  
-  final.table <- do.call(rbind, all.counts)
+  time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
+
  if (!is.null(file.destination)){
-    write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE)
+    write.table(counts.df, file = file.destination, quote = FALSE, row.names = FALSE)
  }
  message("DONE!!!")
  
-  return(final.table)
+  return(counts.df)
 }

 #' Count haplotype specific reads

--- a/utils/haplotagTable.snakemake.R
+++ b/utils/haplotagTable.snakemake.R
@@ -5,6 +5,6 @@ sink(file=log, type='output')
 source("utils/haplotagTable.R")


-tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.path = snakemake@params[["bam_path"]], file.destination = snakemake@output[["tsv"]])
+tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.file = snakemake@input[["bam"]], file.destination = snakemake@output[["tsv"]])

 #SVplotting(snakemake@input[["sv_calls"]], snakemake@output[["barplot_high"]], snakemake@output[["barplot_med"]], snakemake@output[["barplot_low"]], snakemake@output[["barplot_rare"]])