Skip to content
Snippets Groups Projects
Commit 6af66f00 authored by Tobias Marschall's avatar Tobias Marschall
Browse files

Create haplotag count tables

parent 1bb8aa37
No related branches found
No related tags found
No related merge requests found
......@@ -60,13 +60,16 @@ rule all:
bpdens = ["few","medium","many"],
method = METHODS),
expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLES, chrom = config["chromosomes"]),
expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{af}.pdf",
expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{af}.pdf",
sample = SAMPLES,
window = [50000, 100000],
bpdens = ["few","medium","many"],
method = METHODS,
af = ["high","med","low","rare"]),
['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[sample] for sample in SAMPLES],
expand("haplotag/table/{sample}/haplotag-counts.{window}_fixed_norm.{bpdens}.tsv",
sample = SAMPLES,
window = [50000, 100000],
bpdens = ["few","medium","many"]),
################################################################################
......@@ -686,6 +689,29 @@ rule haplotag_bams:
shell:
"whatshap haplotag -o {output.bam} -r {input.ref} {input.vcf} {input.bam} > {log} 2>{log}"
rule create_haplotag_segment_bed:
input:
segments="segmentation2/{sample}/{size,[0-9]+}{what}.{bpdens}.txt",
output:
bed="haplotag/bed/{sample}/{size,[0-9]+}{what}.{bpdens}.bed",
shell:
"awk 'BEGIN {{s={wildcards.size};OFS=\"\\t\"}} $2!=c {{prev=0}} NR>1 {{print $2,prev*s+1,($3+1)*s; prev=$3+1; c=$2}}' {input.segments} > {output.bed}"
rule create_haplotag_table:
input:
bams=lambda wc: ['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
bais=lambda wc: ['haplotag/bam/{}/{}.bam.bai'.format(sample,bam) for bam in BAM_PER_SAMPLE[wc.sample]],
bed = "haplotag/bed/{sample}/{windows}.{bpdens}.bed"
output:
tsv='haplotag/table/{sample}/haplotag-counts.{windows}.{bpdens}.tsv'
params:
bam_path='haplotag/bam/{sample}/',
log:
"log/create_haplotag_table/{sample}.log"
script:
"utils/haplotagTable.snakemake.R"
################################################################################
# Call SNVs #
################################################################################
......@@ -702,13 +728,13 @@ rule mergeBams:
shell:
config["samtools"] + " merge -@ {threads} {output} {input} 2>&1 > {log}"
rule indexMergedBam:
rule index_bam:
input:
"snv_calls/{sample}/merged.bam"
"{file}.bam"
output:
"snv_calls/{sample}/merged.bam.bai"
"{file}.bam.bai"
log:
"log/indexMergedBam/{sample}.log"
"{file}.bam.log"
shell:
config["samtools"] + " index {input} 2> {log}"
......
......@@ -4,38 +4,6 @@ library(ggplot2)
library(cowplot)
library(BiocParallel)
## load required function [see below!!!]
## run the command
#tab <- getHaplotagTable(sv.table = "/media/porubsky/Elements/StrandSeqNation/C7/HaplotaggedBams/biAllelic_llr4_100kb_fixed_many.txt", bam.path = "/media/porubsky/Elements/StrandSeqNation/C7/HaplotaggedBams/")
#This new function takes as an input bed file of genomic regions without the header line <chr start end>
tab <- getHaplotagTable2(bedFile = "<regions.bed>", bam.path = "<bams>", CPUs=4)
## data quality check [Not compete!!!]
#final.table <- tab
#final.table$total.reads <- final.table$crick.count + final.table$watson.count
#final.table$haplotagged <- apply(final.table[,c(10:13)], 1, sum)
#final.table$haplotagged.perc <- (final.table$haplotagged / final.table$total.reads)*100
## tagged <- as.data.frame(table(final.table$haplotagged.perc > 0))
#tagged$ID <- c("untagged", "tagged")
#p1 <- ggplot(tagged, aes(x="", y=Freq, fill=ID)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + theme_bw()
#final.table.cov <- split(final.table, final.table$haplotagged.perc > 0)
#final.table$regionID <- paste(final.table$chrom, final.table$start, final.table$end, sep="_")
#final.table.perRegion <- split(final.table, final.table$regionID)
## get number of informative regions
#inform.regions <- table(sapply(final.table.perRegion, function(x) sum(x$haplotagged)) == 0)
#inform.regions <- as.data.frame(inform.regions)
#inform.regions$ID <- c('Informative', 'Uninformative')
#p2 <- ggplot(inform.regions, aes(x="", y=Freq, fill=ID)) + geom_bar(width = 1, stat = "identity") + coord_polar("y", start=0) + theme_bw()
#plot_grid(p1, p2, nrow = 1)
#########################
### ==> FUNCTIONS <== ###
#########################
#' Print haplotagged read counts
#'
#' This function will take \code{list} of haplotagged bams files and will return \link{data.frame}
......@@ -100,12 +68,17 @@ getHaplotagTable <- function(sv.table=NULL, bam.path=NULL) {
#' @param CPUs Number of CPUs to use for data processing.
#' @author David Porubsky
getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4) {
getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4, file.destination=NULL) {
suppressPackageStartupMessages({
requireNamespace("tools")
})
message("Creating haplotag table")
message("BED file: ", bedFile)
message("BAM path: ", bam.path)
message("Output file: ", file.destination)
## read the SV table
regions <- read.table(bedFile, header = FALSE, stringsAsFactors = FALSE)
regions.gr <- GRanges(seqnames=regions$V1, ranges=IRanges(start=regions$V2, end=regions$V3))
......@@ -136,9 +109,9 @@ getHaplotagTable2 <- function(bedFile=NULL, bam.path=NULL, CPUs=4) {
}
final.table <- do.call(rbind, all.counts)
file.base <- gsub(sv.table, pattern = "\\.txt|\\.bed|\\.csv|\\.tsv", replacement = "")
file.destination <- paste0(file.base, "_haplotaggedCounts.txt")
write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE)
if (!is.null(file.destination)){
write.table(final.table, file = file.destination, quote = FALSE, row.names = FALSE)
}
message("DONE!!!")
return(final.table)
......@@ -248,4 +221,4 @@ bamregion2GRanges <- function(bamfile, bamindex=bamfile, region=NULL, pairedEndR
data <- GenomeInfoDb::keepSeqlevels(data, seqlevels(region), pruning.mode="coarse")
return(data)
}
\ No newline at end of file
}
log <- file(snakemake@log[[1]], open='wt')
sink(file=log, type='message')
sink(file=log, type='output')
source("utils/haplotagTable.R")
tab <- getHaplotagTable2(bedFile = snakemake@input[["bed"]], bam.path = snakemake@params[["bam_path"]], file.destination = snakemake@output[["tsv"]])
#SVplotting(snakemake@input[["sv_calls"]], snakemake@output[["barplot_high"]], snakemake@output[["barplot_med"]], snakemake@output[["barplot_low"]], snakemake@output[["barplot_rare"]])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment