diff --git a/Snakefile b/Snakefile index 8116d8e0b049ad14ece5a9c07f85a1eb0cd6d52e..80e371b7c8dd35d22916ae20aeabee6cafe388f0 100644 --- a/Snakefile +++ b/Snakefile @@ -66,7 +66,7 @@ rule all: bpdens = ["few","medium","many"], method = METHODS, af = ["high","med","low","rare"]), - expand("haplotag/table/{sample}/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv", + expand("haplotag/table/{sample}/full/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv", sample = SAMPLES, window = [50000, 100000], bpdens = ["few","medium","many"]), @@ -691,7 +691,7 @@ rule haplotag_bams: rule create_haplotag_segment_bed: input: - segments="segmentation2/{sample}/{size,[0-9]+}{what}.{bpdens}.txt", + segments="segmentation2/{sample}/{size}{what}.{bpdens}.txt", output: bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens}.bed", shell: @@ -699,27 +699,34 @@ rule create_haplotag_segment_bed: rule create_haplotag_table: input: - bams=lambda wc: ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]], - bais=lambda wc: ['haplotag/bam/{}/{}.bam.bai'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]], + bam='haplotag/bam/{sample}/{cell}.bam', + bai='haplotag/bam/{sample}/{cell}.bam.bai', bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed" output: - tsv='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv' - params: - bam_path='haplotag/bam/{sample}/', + tsv='haplotag/table/{sample}/by-cell/haplotag-counts.{cell}.{windows}.{bpdens}.tsv' log: - "log/create_haplotag_table/{sample}.log" + "log/create_haplotag_table/{sample}.{cell}.log" script: "utils/haplotagTable.snakemake.R" +rule merge_haplotag_tables: + input: + tsvs=lambda wc: ['haplotag/table/{}/by-cell/haplotag-counts.{}.{}.{}.tsv'.format(sample,cell,wc.windows,wc.bpdens) for cell in BAM_PER_SAMPLE[wc.sample]], + output: + tsv='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens}.tsv' + shell: + '(head -n1 {input.tsvs[0]} && tail -q -n +2 {input.tsvs}) > {output.tsv}' + + rule create_haplotag_likelihoods: - input: - haplotag_table='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv' - sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata' - output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.data' - log: + input: + haplotag_table='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv' + sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata' + output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.data' + log: "log/create_haplotag_likelihoods/{sample}.log" - script: - "utils/haplotagProbs.snakemake.R" + script: + "utils/haplotagProbs.snakemake.R" ################################################################################ diff --git a/utils/haplotagTable.R b/utils/haplotagTable.R index 936d2cc7d518a6fd859c4e8780307b188f9b0b10..2fea2c08d57b343797455024773705a17669d7d5 100755 --- a/utils/haplotagTable.R +++ b/utils/haplotagTable.R @@ -65,11 +65,11 @@ getHaplotagTable <- function(sv.table=NULL, bam.path=NULL) { #' counts of reads per haplotype. #' #' @param bedFile A path to a table in bed format with regions to count haplotagged reads. -#' @param bam.path A path to the haplotagged bam files. +#' @param bam.file BAM file name #' @param CPUs Number of CPUs to use for data processing. #' @author David Porubsky -getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destination=NULL) { +getHaplotagTable2 <- function(bedFile=NULL, bam.file=NULL, CPUs=4, file.destination=NULL) { suppressPackageStartupMessages({ requireNamespace("tools") @@ -77,54 +77,44 @@ getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destinat message("Creating haplotag table") message("BED file: ", bedFile) - message("BAM path: ", bam.path) + message("BAM file: ", bam.file) message("Output file: ", file.destination) ## read the SV table regions <- read.table(bedFile, header = FALSE, stringsAsFactors = FALSE) regions.gr <- GRanges(seqnames=regions$V1, ranges=IRanges(start=regions$V2, end=regions$V3)) - ## list all bam files to count haplotagged reads in - haplotag.bams <- list.files(path = bam.path, pattern = "\\.bam$", full.names = T) - all.counts <- list() - for (i in 1:length(haplotag.bams)) { - bam <- haplotag.bams[i] - filename <- basename(bam) - cell.id <- unlist(strsplit(filename, "\\."))[1] - message("Processing bamfile ", filename, " ...", appendLF=F); ptm <- proc.time() - - ## read in reads for selected regions - fragments <- bamregion2GRanges(bamfile = bam, region = regions.gr, pairedEndReads = T, min.mapq = 10, filterAltAlign = TRUE) - fragments$HP[is.na(fragments$HP)] <- 0 #set missing haplotag to zero - ## split reads per selected region - hits <- findOverlaps(regions.gr, fragments) - fragments.per.region <- split(fragments[subjectHits(hits)], queryHits(hits)) - # subset regions to the regions that have non-zero read count - regions <- regions[unique(queryHits(hits)),] - ## count haplotagged reads in selected regions - #use parallel execution with a given number of CPUs - counts <- bplapply(fragments.per.region, getHapReadCount, BPPARAM = MulticoreParam(CPUs)) - counts.df <- do.call(rbind, counts) - - # cbind regions to counts.df - counts.df <- cbind(cell=cell.id, regions, counts.df) - # renaming the regions columns - colnames(counts.df)[2:4] <- c("chrom", "start", "end") - - ## export final table of haplotagged read counts - all.counts[[i]] <- counts.df - - time <- proc.time() - ptm; message(" ",round(time[3],2),"s") - } + filename <- basename(bam.file) + cell.id <- unlist(strsplit(filename, "\\."))[1] + message("Processing bamfile ", filename, " ...", appendLF=F); ptm <- proc.time() + + ## read in reads for selected regions + fragments <- bamregion2GRanges(bamfile = bam.file, region = regions.gr, pairedEndReads = T, min.mapq = 10, filterAltAlign = TRUE) + fragments$HP[is.na(fragments$HP)] <- 0 #set missing haplotag to zero + ## split reads per selected region + hits <- findOverlaps(regions.gr, fragments) + fragments.per.region <- split(fragments[subjectHits(hits)], queryHits(hits)) + # subset regions to the regions that have non-zero read count + regions <- regions[unique(queryHits(hits)),] + ## count haplotagged reads in selected regions + #use parallel execution with a given number of CPUs + counts <- bplapply(fragments.per.region, getHapReadCount, BPPARAM = MulticoreParam(CPUs)) + counts.df <- do.call(rbind, counts) + + # cbind regions to counts.df + counts.df <- cbind(cell=cell.id, regions, counts.df) + # renaming the regions columns + colnames(counts.df)[2:4] <- c("chrom", "start", "end") - final.table <- do.call(rbind, all.counts) + time <- proc.time() - ptm; message(" ",round(time[3],2),"s") + if (!is.null(file.destination)){ - write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE) + write.table(counts.df, file = file.destination, quote = FALSE, row.names = FALSE) } message("DONE!!!") - return(final.table) + return(counts.df) } #' Count haplotype specific reads diff --git a/utils/haplotagTable.snakemake.R b/utils/haplotagTable.snakemake.R index 7286ab38a635b6c4017b3e8c31234e2674f0bb2b..f9be24bf2d83d1495576598edbfdcf8fdaddd3ab 100644 --- a/utils/haplotagTable.snakemake.R +++ b/utils/haplotagTable.snakemake.R @@ -5,6 +5,6 @@ sink(file=log, type='output') source("utils/haplotagTable.R") -tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.path = snakemake@params[["bam_path"]], file.destination = snakemake@output[["tsv"]]) +tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.file = snakemake@input[["bam"]], file.destination = snakemake@output[["tsv"]]) #SVplotting(snakemake@input[["sv_calls"]], snakemake@output[["barplot_high"]], snakemake@output[["barplot_med"]], snakemake@output[["barplot_low"]], snakemake@output[["barplot_rare"]])