From 6af66f000a711b33ac70961b88b93f7a2e7d06ec Mon Sep 17 00:00:00 2001 From: Tobias Marschall <tobias.marschall@0ohm.net> Date: Fri, 13 Jul 2018 09:00:57 +0200 Subject: [PATCH] Create haplotag count tables --- Snakefile | 38 +++++++++++++++++++++----- utils/haplotagTable.R | 47 +++++++-------------------------- utils/haplotagTable.snakemake.R | 10 +++++++ 3 files changed, 52 insertions(+), 43 deletions(-) create mode 100644 utils/haplotagTable.snakemake.R diff --git a/Snakefile b/Snakefile index 030a0df..a11bac7 100644 --- a/Snakefile +++ b/Snakefile @@ -60,13 +60,16 @@ rule all: bpdens = ["few","medium","many"], method = METHODS), expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLES, chrom = config["chromosomes"]), - expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{af}.pdf", + expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{af}.pdf", sample = SAMPLES, window = [50000, 100000], bpdens = ["few","medium","many"], method = METHODS, af = ["high","med","low","rare"]), - ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[sample] for sample in SAMPLES], + expand("haplotag/table/{sample}/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv", + sample = SAMPLES, + window = [50000, 100000], + bpdens = ["few","medium","many"]), ################################################################################ @@ -686,6 +689,29 @@ rule haplotag_bams: shell: "whatshap haplotag -o {output.bam} -r {input.ref} {input.vcf} {input.bam} > {log} 2>{log}" +rule create_haplotag_segment_bed: + input: + segments="segmentation2/{sample}/{size,[0-9]+}{what}.{bpdens}.txt", + output: + bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens}.bed", + shell: + "awk 'BEGIN {{s={wildcards.size};OFS=\"\\t\"}} $2!=c {{prev=0}} NR>1 {{print $2,prev*s+1,($3+1)*s; prev=$3+1; c=$2}}' {input.segments} > {output.bed}" + +rule create_haplotag_table: + input: + bams=lambda wc: ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]], + bais=lambda wc: ['haplotag/bam/{}/{}.bam.bai'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]], + bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed" + output: + tsv='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv' + params: + bam_path='haplotag/bam/{sample}/', + log: + "log/create_haplotag_table/{sample}.log" + script: + "utils/haplotagTable.snakemake.R" + + ################################################################################ # Call SNVs # ################################################################################ @@ -702,13 +728,13 @@ rule mergeBams: shell: config["samtools"] + " merge -@ {threads} {output} {input} 2>&1 > {log}" -rule indexMergedBam: +rule index_bam: input: - "snv_calls/{sample}/merged.bam" + "{file}.bam" output: - "snv_calls/{sample}/merged.bam.bai" + "{file}.bam.bai" log: - "log/indexMergedBam/{sample}.log" + "{file}.bam.log" shell: config["samtools"] + " index {input} 2> {log}" diff --git a/utils/haplotagTable.R b/utils/haplotagTable.R index 180758c..5f9256d 100755 --- a/utils/haplotagTable.R +++ b/utils/haplotagTable.R @@ -4,38 +4,6 @@ library(ggplot2) library(cowplot) library(BiocParallel) -## load required function [see below!!!] - -## run the command -#tab <- getHaplotagTable(sv.table = "/media/porubsky/Elements/StrandSeqNation/C7/HaplotaggedBams/biAllelic_llr4_100kb_fixed_many.txt", bam.path = "/media/porubsky/Elements/StrandSeqNation/C7/HaplotaggedBams/") -#This new function takes as an input bed file of genomic regions without the header line <chr start end> -tab <- getHaplotagTable2(bedFile = "<regions.bed>", bam.path = "<bams>", CPUs=4) - -## data quality check [Not compete!!!] -#final.table <- tab -#final.table$total.reads <- final.table$crick.count + final.table$watson.count -#final.table$haplotagged <- apply(final.table[,c(10:13)], 1, sum) -#final.table$haplotagged.perc <- (final.table$haplotagged / final.table$total.reads)*100 - -## tagged <- as.data.frame(table(final.table$haplotagged.perc > 0)) -#tagged$ID <- c("untagged", "tagged") -#p1 <- ggplot(tagged, aes(x="", y=Freq, fill=ID)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + theme_bw() - -#final.table.cov <- split(final.table, final.table$haplotagged.perc > 0) -#final.table$regionID <- paste(final.table$chrom, final.table$start, final.table$end, sep="_") -#final.table.perRegion <- split(final.table, final.table$regionID) -## get number of informative regions -#inform.regions <- table(sapply(final.table.perRegion, function(x) sum(x$haplotagged)) == 0) -#inform.regions <- as.data.frame(inform.regions) -#inform.regions$ID <- c('Informative', 'Uninformative') -#p2 <- ggplot(inform.regions, aes(x="", y=Freq, fill=ID)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + theme_bw() -#plot_grid(p1, p2, nrow = 1) - - -######################### -### ==> FUNCTIONS <== ### -######################### - #' Print haplotagged read counts #' #' This function will take \code{list} of haplotagged bams files and will return \link{data.frame} @@ -100,12 +68,17 @@ getHaplotagTable <- function(sv.table=NULL, bam.path=NULL) { #' @param CPUs Number of CPUs to use for data processing. #' @author David Porubsky -getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4) { +getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destination=NULL) { suppressPackageStartupMessages({ requireNamespace("tools") }) + message("Creating haplotag table") + message("BED file: ", bedFile) + message("BAM path: ", bam.path) + message("Output file: ", file.destination) + ## read the SV table regions <- read.table(bedFile, header = FALSE, stringsAsFactors = FALSE) regions.gr <- GRanges(seqnames=regions$V1, ranges=IRanges(start=regions$V2, end=regions$V3)) @@ -136,9 +109,9 @@ getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4) { } final.table <- do.call(rbind, all.counts) - file.base <- gsub(sv.table, pattern = "\\.txt|\\.bed|\\.csv|\\.tsv", replacement = "") - file.destination <- paste0(file.base, "_haplotaggedCounts.txt") - write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE) + if (!is.null(file.destination)){ + write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE) + } message("DONE!!!") return(final.table) @@ -248,4 +221,4 @@ bamregion2GRanges <- function(bamfile, bamindex=bamfile, region=NULL, pairedEndR data <- GenomeInfoDb::keepSeqlevels(data, seqlevels(region), pruning.mode="coarse") return(data) -} \ No newline at end of file +} diff --git a/utils/haplotagTable.snakemake.R b/utils/haplotagTable.snakemake.R new file mode 100644 index 0000000..7286ab3 --- /dev/null +++ b/utils/haplotagTable.snakemake.R @@ -0,0 +1,10 @@ +log <- file(snakemake@log[[1]], open='wt') +sink(file=log, type='message') +sink(file=log, type='output') + +source("utils/haplotagTable.R") + + +tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.path = snakemake@params[["bam_path"]], file.destination = snakemake@output[["tsv"]]) + +#SVplotting(snakemake@input[["sv_calls"]], snakemake@output[["barplot_high"]], snakemake@output[["barplot_med"]], snakemake@output[["barplot_low"]], snakemake@output[["barplot_rare"]]) -- GitLab