Skip to content
Snippets Groups Projects
Commit 997a7b69 authored by Tobias Marschall's avatar Tobias Marschall
Browse files

Generate haplotag count tables in parallel

parent 70ddcf13
No related branches found
No related tags found
No related merge requests found
......@@ -66,7 +66,7 @@ rule all:
bpdens = ["few","medium","many"],
method = METHODS,
af = ["high","med","low","rare"]),
expand("haplotag/table/{sample}/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv",
expand("haplotag/table/{sample}/full/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv",
sample = SAMPLES,
window = [50000, 100000],
bpdens = ["few","medium","many"]),
......@@ -691,7 +691,7 @@ rule haplotag_bams:
rule create_haplotag_segment_bed:
input:
segments="segmentation2/{sample}/{size,[0-9]+}{what}.{bpdens}.txt",
segments="segmentation2/{sample}/{size}{what}.{bpdens}.txt",
output:
bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens}.bed",
shell:
......@@ -699,27 +699,34 @@ rule create_haplotag_segment_bed:
rule create_haplotag_table:
input:
bams=lambda wc: ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
bais=lambda wc: ['haplotag/bam/{}/{}.bam.bai'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
bam='haplotag/bam/{sample}/{cell}.bam',
bai='haplotag/bam/{sample}/{cell}.bam.bai',
bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed"
output:
tsv='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
params:
bam_path='haplotag/bam/{sample}/',
tsv='haplotag/table/{sample}/by-cell/haplotag-counts.{cell}.{windows}.{bpdens}.tsv'
log:
"log/create_haplotag_table/{sample}.log"
"log/create_haplotag_table/{sample}.{cell}.log"
script:
"utils/haplotagTable.snakemake.R"
rule merge_haplotag_tables:
input:
tsvs=lambda wc: ['haplotag/table/{}/by-cell/haplotag-counts.{}.{}.{}.tsv'.format(sample,cell,wc.windows,wc.bpdens) for cell in BAM_PER_SAMPLE[wc.sample]],
output:
tsv='haplotag/table/{sample}/full/haplotag-counts.{windows}.{bpdens}.tsv'
shell:
'(head -n1 {input.tsvs[0]} && tail -q -n +2 {input.tsvs}) > {output.tsv}'
rule create_haplotag_likelihoods:
input:
haplotag_table='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata'
output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.data'
log:
input:
haplotag_table='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
sv_probs_table = 'sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata'
output: 'haplotag/table/{sample}/haplotag-likelihoods.{windows}.{bpdens}.data'
log:
"log/create_haplotag_likelihoods/{sample}.log"
script:
"utils/haplotagProbs.snakemake.R"
script:
"utils/haplotagProbs.snakemake.R"
################################################################################
......
......@@ -65,11 +65,11 @@ getHaplotagTable <- function(sv.table=NULL, bam.path=NULL) {
#' counts of reads per haplotype.
#'
#' @param bedFile A path to a table in bed format with regions to count haplotagged reads.
#' @param bam.path A path to the haplotagged bam files.
#' @param bam.file BAM file name
#' @param CPUs Number of CPUs to use for data processing.
#' @author David Porubsky
getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destination=NULL) {
getHaplotagTable2 <- function(bedFile=NULL, bam.file=NULL, CPUs=4, file.destination=NULL) {
suppressPackageStartupMessages({
requireNamespace("tools")
......@@ -77,54 +77,44 @@ getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destinat
message("Creating haplotag table")
message("BED file: ", bedFile)
message("BAM path: ", bam.path)
message("BAM file: ", bam.file)
message("Output file: ", file.destination)
## read the SV table
regions <- read.table(bedFile, header = FALSE, stringsAsFactors = FALSE)
regions.gr <- GRanges(seqnames=regions$V1, ranges=IRanges(start=regions$V2, end=regions$V3))
## list all bam files to count haplotagged reads in
haplotag.bams <- list.files(path = bam.path, pattern = "\\.bam$", full.names = T)
all.counts <- list()
for (i in 1:length(haplotag.bams)) {
bam <- haplotag.bams[i]
filename <- basename(bam)
cell.id <- unlist(strsplit(filename, "\\."))[1]
message("Processing bamfile ", filename, " ...", appendLF=F); ptm <- proc.time()
## read in reads for selected regions
fragments <- bamregion2GRanges(bamfile = bam, region = regions.gr, pairedEndReads = T, min.mapq = 10, filterAltAlign = TRUE)
fragments$HP[is.na(fragments$HP)] <- 0 #set missing haplotag to zero
## split reads per selected region
hits <- findOverlaps(regions.gr, fragments)
fragments.per.region <- split(fragments[subjectHits(hits)], queryHits(hits))
# subset regions to the regions that have non-zero read count
regions <- regions[unique(queryHits(hits)),]
## count haplotagged reads in selected regions
#use parallel execution with a given number of CPUs
counts <- bplapply(fragments.per.region, getHapReadCount, BPPARAM = MulticoreParam(CPUs))
counts.df <- do.call(rbind, counts)
# cbind regions to counts.df
counts.df <- cbind(cell=cell.id, regions, counts.df)
# renaming the regions columns
colnames(counts.df)[2:4] <- c("chrom", "start", "end")
## export final table of haplotagged read counts
all.counts[[i]] <- counts.df
time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
}
filename <- basename(bam.file)
cell.id <- unlist(strsplit(filename, "\\."))[1]
message("Processing bamfile ", filename, " ...", appendLF=F); ptm <- proc.time()
## read in reads for selected regions
fragments <- bamregion2GRanges(bamfile = bam.file, region = regions.gr, pairedEndReads = T, min.mapq = 10, filterAltAlign = TRUE)
fragments$HP[is.na(fragments$HP)] <- 0 #set missing haplotag to zero
## split reads per selected region
hits <- findOverlaps(regions.gr, fragments)
fragments.per.region <- split(fragments[subjectHits(hits)], queryHits(hits))
# subset regions to the regions that have non-zero read count
regions <- regions[unique(queryHits(hits)),]
## count haplotagged reads in selected regions
#use parallel execution with a given number of CPUs
counts <- bplapply(fragments.per.region, getHapReadCount, BPPARAM = MulticoreParam(CPUs))
counts.df <- do.call(rbind, counts)
# cbind regions to counts.df
counts.df <- cbind(cell=cell.id, regions, counts.df)
# renaming the regions columns
colnames(counts.df)[2:4] <- c("chrom", "start", "end")
final.table <- do.call(rbind, all.counts)
time <- proc.time() - ptm; message(" ",round(time[3],2),"s")
if (!is.null(file.destination)){
write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE)
write.table(counts.df, file = file.destination, quote = FALSE, row.names = FALSE)
}
message("DONE!!!")
return(final.table)
return(counts.df)
}
#' Count haplotype specific reads
......
......@@ -5,6 +5,6 @@ sink(file=log, type='output')
source("utils/haplotagTable.R")
tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.path = snakemake@params[["bam_path"]], file.destination = snakemake@output[["tsv"]])
tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.file = snakemake@input[["bam"]], file.destination = snakemake@output[["tsv"]])
#SVplotting(snakemake@input[["sv_calls"]], snakemake@output[["barplot_high"]], snakemake@output[["barplot_med"]], snakemake@output[["barplot_low"]], snakemake@output[["barplot_rare"]])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment