Skip to content
Snippets Groups Projects
Commit 6335805c authored by Sascha Meiers's avatar Sascha Meiers
Browse files

add new SV plot and prepare Snakefile for new SV classifier

parent 14072656
No related branches found
No related tags found
No related merge requests found
{
"chromosomes" : ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"],
"chromosomes" : ["chrX","chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"],
"reference" : "/g/korbel/shared/datasets/refgenomes/human/hg19_chr1_22XYM.fa",
"reference" : "/g/korbel/shared/datasets/refgenomes/human/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna",
"mosaicatcher" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/build/mosaicatcher",
"plot_script" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/qc.R",
"plot_segments" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/plot_segments.R",
......@@ -12,7 +12,7 @@
"snv_calls" : {},
"exclude_file" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/hg19.exclude",
"exclude_file" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/GCA_000001405.15_GRCh38_no_alt_analysis_set.exclude",
"variable_bins" : {
"50000" : "utils/variable_bins.GRCh38.50kb.bed",
"100000" : "utils/variable_bins.GRCh38.100kb.bed"
......@@ -25,7 +25,7 @@
"many" : 0.5
},
"paired_end" : false,
"paired_end" : true,
"simulation_min_vaf" : 0.01,
"simulation_max_vaf" : 1.0,
......
......@@ -17,21 +17,19 @@ import os.path
# * plot all single cell libraries in different window sizes
# * calculate a segmentation into potential SVs using Mosaicatcher
METHODS = ["simpleCalls"]
rule all:
input:
expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLE, window = [50000, 100000, 200000, 500000]),
expand("plots/{sample}/{window}_variable.pdf", sample = SAMPLE, window = [50000, 100000]),
expand("segmentation2/{sample}/{window}_fixed.{bpdens}.txt", sample = SAMPLE,
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("segmentation2/{sample}/{window}_variable.{bpdens}.chr1.txt", sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"]),
expand("strand_states/{sample}/final.txt", sample = SAMPLE),
expand("sv_calls/{sample}/{window}_fixed.{bpdens}.SV_probs.{chrom}.pdf", sample = SAMPLE, chrom = config['chromosomes'],
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("sv_calls/{sample}/{window}_variable.{bpdens}.SV_probs.chr1.pdf", sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"]),
expand("segmentation/{sample}/{window}_fixed/{chrom}.pdf", sample = SAMPLE,
window = [50000, 100000], chrom = config['chromosomes'][0]) # Specifically run this only for one chrom because it is super slow
expand("sv_calls/{sample}/{window}_fixed.{bpdens}/{method}.{chrom}.pdf",
sample = SAMPLE,
chrom = config["chromosomes"],
window = [100000],
bpdens = ["few","medium","many"],
method = METHODS)
################################################################################
......@@ -189,16 +187,12 @@ rule plot_mosaic_counts:
rule plot_SV_calls:
input:
counts = "counts/{sample}/{windows}.txt.gz",
probs = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.txt"
calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt"
output:
expand("sv_calls/{{sample}}/{{windows}}.{{bpdens}}.SV_probs.{chrom}.pdf", chrom = config['chromosomes'] )
log:
"log/{sample}/plot_SV_call.{windows}.{bpdens}.txt"
params:
plot_command = "Rscript " + config["sv_plot_script"]
"sv_calls/{sample}/{windows}.{bpdens}/{method}.{chrom}.pdf"
shell:
"""
{params.plot_command} {input.counts} {input.probs} sv_calls/{wildcards.sample}/{wildcards.windows}.{wildcards.bpdens}.SV_probs > {log} 2>&1
Rscript utils/chrom.R calls={input.calls} {input.counts} {wildcards.chrom} {output}
"""
......@@ -304,21 +298,6 @@ rule segmentation:
{input} > {log} 2>&1
"""
rule plot_segmentation:
input:
counts = "counts/{sample}/{file_name}.txt.gz",
segments = "segmentation/{sample}/{file_name}.txt"
output:
"segmentation/{sample}/{file_name}/{chrom}.pdf"
log:
"log/{sample}/plot_segmentation.{file_name}.{chrom}.txt"
params:
command = config["plot_segments"]
shell:
"""
Rscript {params.command} {input.counts} {input.segments} {wildcards.chrom} {output} > {log} 2>&1
"""
# Pick a few segmentations and prepare the input files for SV classification
......@@ -424,6 +403,31 @@ rule convert_SVprob_output:
# "utils/sv_classifier.R"
################################################################################
# New SV classification based on a combination of Sascha's and Maryam's method #
################################################################################
rule mosaiClassifier_make_call:
input:
"sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
output:
"sv_calls/{sample}/{windows}.{bpdens}/simpleCalls.txt"
shell:
"""
Rscript utils/mosaiClassifier.makeCall.R {input} {output}
"""
rule mosaiClassifier_calc_probs:
input:
counts = "counts/{sample}/{windows}.txt.gz",
info = "counts/{sample}/{windows}.info",
states = "strand_states/{sample}/final.txt",
bp = "segmentation2/{sample}/{windows}.{bpdens}.txt"
output:
output = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
script:
"utils/mosaiClassifier.main.R"
################################################################################
# Strand states & phasing #
......
#
# Copyright (C) 2017 Sascha Meiers
# Distributed under the MIT software license, see the accompanying
# file LICENSE.md or http://www.opensource.org/licenses/mit-license.php.
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
suppressMessages(library(ggplot2))
library(scales) %>% invisible
library(assertthat) %>% invisible
library(stringr) %>% invisible
################################################################################
# Settings #
################################################################################
zcat_command = "zcat"
format_Mb <- function(x) {paste(comma(x/1e6), "Mb")}
### Colors for background
manual_colors = c(
# duplications
simul_hom_dup = "firebrick4",
dup_hom = muted("firebrick4", 70, 50),
simul_het_dup = "firebrick2",
dup_h1 = muted("firebrick2", 90, 30),
dup_h2 = muted("firebrick2", 90, 30),
# deletions
simul_hom_del = "dodgerblue4",
del_hom = muted("dodgerblue4", 50, 60),
simul_het_del = "dodgerblue2",
del_h1 = muted("dodgerblue2", 80, 50),
del_h2 = muted("dodgerblue2", 80, 50),
# inversions
simul_hom_inv = "chartreuse4",
inv_hom = muted("chartreuse4", 80, 50),
simul_het_inv = "chartreuse2",
inv_h1 = muted("chartreuse2", 100, 60),
inv_h2 = muted("chartreuse2", 100, 60),
# other SVs
simul_false_del = "darkgrey",
simul_inv_dup = "darkgoldenrod2",
idup_h1 = muted("darkgoldenrod2", 80, 70),
idup_h2 = muted("darkgoldenrod2", 80, 70),
# background
bg1 = "#ffffff",
bg2 = "khaki2")
################################################################################
# Usage #
################################################################################
print_usage_and_stop = function(msg = NULL) {
if(!is.null(msg))
message(msg)
message("Plot Strand-seq counts of all cells for a single chromosome. ")
message(" ")
message("Usage: ")
message(" Rscript chrom.R [OPTIONS] <count-file> <chrom> <out.pdf> ")
message(" ")
message("OPTIONS (no spaces around `=`): ")
message(" per-page=<int> Number of cells to be printed per page ")
message(" segments=<file> Show the segmentation in the plots ")
message(" calls=<file> Highlight SV calls provided in a table ")
message(" truth=<file> Mark the `true`` SVs provided in a table ")
message(" ")
message("Generates one plot per chromosome listing all cells below another, separated ")
message("into pages. If an SV probability file is provided (2), segments are colored ")
message("according to the predicted SV classes. Note that only certain classes are ")
message("accepted and the script will yield an error if others are provided. ")
message("Similarly, a segmentation file can be specified, yet it must contain exactly ")
message("one segmentation (MosaiCatcher reports segmentations for various total numbers ")
message("of breakpoints).")
options( show.error.messages = F)
stop()
}
################################################################################
# Command Line Arguments #
################################################################################
args <- c("segments=../../20180417_ss_simulations/run_20180401/segmentation2/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15.txt",
"truth=../../20180417_ss_simulations/run_20180401/simulation_new/seed5_size200000-400000_vaf20-50/variants-50000.txt",
"per-page=12",
"calls=../../20180417_ss_simulations/run_20180401/sv_calls/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15/simple_llr2.txt",
"../../20180417_ss_simulations/run_20180401/counts/seed5_size200000-400000_vaf20-50-50000/50000_fixed.txt.gz",
"chr1",
"test.pdf")
args <- commandArgs(trailingOnly = T)
if (length(args)< 3) print_usage_and_stop("[Error] Too few arguments!")
f_counts = args[length(args)-2]
CHROM = args[length(args)-1]
f_out = args[length(args)]
f_segments = NULL
f_calls = NULL
f_truth = NULL
cells_per_page <- 8
if (length(args)>3) {
if (!all(grepl("^(calls|segments|per-page|truth)=", args[1:(length(args)-3)]))) {
print_usage_and_stop("[Error]: Options must be one of `calls`, `segments`, `per-page`, or `truth`") }
for (op in args[1:(length(args)-3)]) {
if (grepl("^segments=", op)) f_segments = str_sub(op, 10)
if (grepl("^calls=", op)) f_calls = str_sub(op, 7)
if (grepl("^truth=", op)) f_truth = str_sub(op, 7)
if (grepl("^per-page=", op)) {
pp = as.integer(str_sub(op, 10));
if (pp>0 && pp < 50) { cells_per_page = pp }
}
}
}
################################################################################
# Read & check input data #
################################################################################
### Check counts table
message(" * Reading count data ", f_counts, "...")
if (grepl('\\.gz$',f_counts)) {
counts = fread(paste(zcat_command, f_counts))
} else {
counts = fread(f_counts)
}
assert_that("chrom" %in% colnames(counts),
"start" %in% colnames(counts),
"end" %in% colnames(counts),
"class" %in% colnames(counts),
"sample" %in% colnames(counts),
"cell" %in% colnames(counts),
"w" %in% colnames(counts),
"c" %in% colnames(counts)) %>% invisible
counts[, sample_cell := paste(sample, "-", cell)]
setkey(counts, chrom, sample_cell)
bins = unique(counts[, .(chrom, start, end)])
### Check CHROM:
assert_that(CHROM %in% unique(counts$chrom))
counts = counts[chrom == CHROM]
### Check SV call file
if (!is.null(f_calls)) {
message(" * Reading SV calls from ", f_calls, "...")
svs = fread(f_calls)
assert_that("chrom" %in% colnames(svs),
"start" %in% colnames(svs),
"end" %in% colnames(svs),
"sample" %in% colnames(svs),
"cell" %in% colnames(svs),
"SV_class" %in% colnames(svs)) %>% invisible
assert_that(all(svs$SV_class %in% names(manual_colors))) %>% invisible
svs[, sample_cell := paste(sample, "-", cell)]
assert_that(all(unique(svs$sample_cell) %in% unique(counts$sample_cell))) %>% invisible
svs = svs[chrom == CHROM]
}
### Check segment table
if (!is.null(f_segments)) {
message(" * Reading segmentation file from ", f_segments, "...")
seg = fread(f_segments)
assert_that("chrom" %in% colnames(seg),
"bps" %in% colnames(seg)) %>% invisible
if ("k" %in% colnames(seg)) {
seg[, assert_that(length(unique(k))==1), by = .(chrom)] %>% invisible }
seg = merge(seg, bins[,.N, by =chrom][, .(chrom, N = c(0,cumsum(N)[1:(.N-1)]))], by = "chrom")
seg[, `:=`(from = c(1,bps[1:(.N-1)]+1), to = bps), by = chrom]
seg[, `:=`(start = bins[from + N]$start,
end = bins[to + N]$end )]
seg[, SV_class := rep(c("bg1","bg2"),.N)[1:.N], by = chrom]
seg = seg[chrom == CHROM]
}
### Check simulated variants
if (!is.null(f_truth)) {
message(" * Reading simulated variants from ", f_truth, "...")
simul = fread(f_truth)
assert_that("chrom" %in% colnames(simul),
"start" %in% colnames(simul),
"end" %in% colnames(simul),
"sample" %in% colnames(simul),
"cell" %in% colnames(simul),
"SV_type" %in% colnames(simul)) %>% invisible
simul[, `:=`(SV_class = paste0("simul_",SV_type), SV_type = NULL, sample_cell = paste(sample, "-", cell))]
simul[, sample_cell := paste(sample, "-", cell)]
assert_that(all(unique(simul$sample_cell) %in% unique(counts$sample_cell))) %>% invisible
simul = simul[chrom == CHROM]
}
################################################################################
# Actual plot #
################################################################################
# Plot always a few cells per page!
y_lim = 3 * counts[,median(w+c)]
n_cells = length(unique(counts[,sample_cell]))
i = 1
message(" * Plotting ", CHROM, " (", f_out, ")")
cairo_pdf(f_out, width=14, height=10, onefile = T)
while (i <= n_cells) {
# Subset to this set of cells:
CELLS = unique(counts[,.(sample_cell)])[i:(min(i+cells_per_page-1,n_cells))]
setkey(CELLS, sample_cell)
# Subset counts
local_counts = counts[CELLS, on = .(sample_cell), nomatch = 0]
# Start major plot
plt <- ggplot(local_counts)
# Add background colors for segments, if available:
if (!is.null(f_segments)) {
# Segments need to be multiplied by "CELLS"
local_seg = CELLS[, cbind(seg, sample_cell), by = sample_cell]
if (nrow(local_seg)>0) {
plt <- plt +
geom_rect(data = local_seg, alpha = 0.4,
aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = SV_class))
}
}
# Add colors for SV calls, if available
if (!is.null(f_calls)) {
local_svs = svs[CELLS, on = .(sample_cell), nomatch = 0]
if (nrow(local_svs)>0) {
plt <- plt +
geom_rect(data = local_svs, alpha = 1,
aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = SV_class))
}
}
# Add bars for true SVs, if available
if (!is.null(f_truth)) {
local_sim = simul[CELLS, on = .(sample_cell), nomatch = 0]
if (nrow(local_sim)>0) {
plt <- plt +
geom_rect(data = local_sim,
aes(xmin = start, xmax = end, ymin = y_lim, ymax = Inf, fill = SV_class))
}
}
plt <- plt +
geom_rect(aes(xmin = start, xmax=end, ymin=0, ymax = -w), fill='sandybrown') +
geom_rect(aes(xmin = start, xmax=end, ymin=0, ymax = c), fill='paleturquoise4') +
facet_wrap(~ sample_cell, ncol = 1) +
ylab("Watson | Crick") + xlab(NULL) +
scale_x_continuous(breaks = pretty_breaks(12), labels = format_Mb) +
scale_y_continuous(breaks = pretty_breaks(3)) +
coord_cartesian(ylim = c(-y_lim, y_lim)) +
scale_fill_manual(values = manual_colors) +
theme_minimal() +
theme(panel.spacing = unit(0, "lines"),
axis.ticks.x = element_blank(),
strip.background = element_rect(color = "#eeeeee", fill = "#eeeeee"),
strip.text = element_text(size = 5),
legend.position = "bottom") +
ggtitle(paste("data:", basename(f_counts), "chromosome:", CHROM))
print(plt)
i = i + cells_per_page
} # while
dev.off()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment