From 6335805ca57278e384d3f47021b90c941c84cb24 Mon Sep 17 00:00:00 2001 From: Sascha Meiers <meiers@embl.de> Date: Wed, 16 May 2018 10:11:30 +0200 Subject: [PATCH] add new SV plot and prepare Snakefile for new SV classifier --- Snake.config.json | 8 +- Snakefile | 70 ++++++------ utils/chrom.R | 282 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 323 insertions(+), 37 deletions(-) create mode 100644 utils/chrom.R diff --git a/Snake.config.json b/Snake.config.json index 6c64413..843a536 100644 --- a/Snake.config.json +++ b/Snake.config.json @@ -1,7 +1,7 @@ { - "chromosomes" : ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"], + "chromosomes" : ["chrX","chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"], - "reference" : "/g/korbel/shared/datasets/refgenomes/human/hg19_chr1_22XYM.fa", + "reference" : "/g/korbel/shared/datasets/refgenomes/human/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna", "mosaicatcher" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/build/mosaicatcher", "plot_script" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/qc.R", "plot_segments" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/plot_segments.R", @@ -12,7 +12,7 @@ "snv_calls" : {}, - "exclude_file" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/hg19.exclude", + "exclude_file" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/GCA_000001405.15_GRCh38_no_alt_analysis_set.exclude", "variable_bins" : { "50000" : "utils/variable_bins.GRCh38.50kb.bed", "100000" : "utils/variable_bins.GRCh38.100kb.bed" @@ -25,7 +25,7 @@ "many" : 0.5 }, - "paired_end" : false, + "paired_end" : true, "simulation_min_vaf" : 0.01, "simulation_max_vaf" : 1.0, diff --git a/Snakefile b/Snakefile index 6bed2b2..9e69dbb 100644 --- a/Snakefile +++ b/Snakefile @@ -17,21 +17,19 @@ import os.path # * plot all single cell libraries in different window sizes # * calculate a segmentation into potential SVs using Mosaicatcher + +METHODS = ["simpleCalls"] + rule all: input: expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLE, window = [50000, 100000, 200000, 500000]), expand("plots/{sample}/{window}_variable.pdf", sample = SAMPLE, window = [50000, 100000]), - expand("segmentation2/{sample}/{window}_fixed.{bpdens}.txt", sample = SAMPLE, - window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]), - #expand("segmentation2/{sample}/{window}_variable.{bpdens}.chr1.txt", sample = SAMPLE, - # window = [50000, 100000], bpdens = ["few","medium","many"]), - expand("strand_states/{sample}/final.txt", sample = SAMPLE), - expand("sv_calls/{sample}/{window}_fixed.{bpdens}.SV_probs.{chrom}.pdf", sample = SAMPLE, chrom = config['chromosomes'], - window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]), - #expand("sv_calls/{sample}/{window}_variable.{bpdens}.SV_probs.chr1.pdf", sample = SAMPLE, - # window = [50000, 100000], bpdens = ["few","medium","many"]), - expand("segmentation/{sample}/{window}_fixed/{chrom}.pdf", sample = SAMPLE, - window = [50000, 100000], chrom = config['chromosomes'][0]) # Specifically run this only for one chrom because it is super slow + expand("sv_calls/{sample}/{window}_fixed.{bpdens}/{method}.{chrom}.pdf", + sample = SAMPLE, + chrom = config["chromosomes"], + window = [100000], + bpdens = ["few","medium","many"], + method = METHODS) ################################################################################ @@ -189,16 +187,12 @@ rule plot_mosaic_counts: rule plot_SV_calls: input: counts = "counts/{sample}/{windows}.txt.gz", - probs = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.txt" + calls = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt" output: - expand("sv_calls/{{sample}}/{{windows}}.{{bpdens}}.SV_probs.{chrom}.pdf", chrom = config['chromosomes'] ) - log: - "log/{sample}/plot_SV_call.{windows}.{bpdens}.txt" - params: - plot_command = "Rscript " + config["sv_plot_script"] + "sv_calls/{sample}/{windows}.{bpdens}/{method}.{chrom}.pdf" shell: """ - {params.plot_command} {input.counts} {input.probs} sv_calls/{wildcards.sample}/{wildcards.windows}.{wildcards.bpdens}.SV_probs > {log} 2>&1 + Rscript utils/chrom.R calls={input.calls} {input.counts} {wildcards.chrom} {output} """ @@ -304,21 +298,6 @@ rule segmentation: {input} > {log} 2>&1 """ -rule plot_segmentation: - input: - counts = "counts/{sample}/{file_name}.txt.gz", - segments = "segmentation/{sample}/{file_name}.txt" - output: - "segmentation/{sample}/{file_name}/{chrom}.pdf" - log: - "log/{sample}/plot_segmentation.{file_name}.{chrom}.txt" - params: - command = config["plot_segments"] - shell: - """ - Rscript {params.command} {input.counts} {input.segments} {wildcards.chrom} {output} > {log} 2>&1 - """ - # Pick a few segmentations and prepare the input files for SV classification @@ -424,6 +403,31 @@ rule convert_SVprob_output: # "utils/sv_classifier.R" +################################################################################ +# New SV classification based on a combination of Sascha's and Maryam's method # +################################################################################ + +rule mosaiClassifier_make_call: + input: + "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata" + output: + "sv_calls/{sample}/{windows}.{bpdens}/simpleCalls.txt" + shell: + """ + Rscript utils/mosaiClassifier.makeCall.R {input} {output} + """ + +rule mosaiClassifier_calc_probs: + input: + counts = "counts/{sample}/{windows}.txt.gz", + info = "counts/{sample}/{windows}.info", + states = "strand_states/{sample}/final.txt", + bp = "segmentation2/{sample}/{windows}.{bpdens}.txt" + output: + output = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata" + script: + "utils/mosaiClassifier.main.R" + ################################################################################ # Strand states & phasing # diff --git a/utils/chrom.R b/utils/chrom.R new file mode 100644 index 0000000..e13a821 --- /dev/null +++ b/utils/chrom.R @@ -0,0 +1,282 @@ +# +# Copyright (C) 2017 Sascha Meiers +# Distributed under the MIT software license, see the accompanying +# file LICENSE.md or http://www.opensource.org/licenses/mit-license.php. + +suppressMessages(library(dplyr)) +suppressMessages(library(data.table)) +suppressMessages(library(ggplot2)) +library(scales) %>% invisible +library(assertthat) %>% invisible +library(stringr) %>% invisible + + +################################################################################ +# Settings # +################################################################################ + +zcat_command = "zcat" +format_Mb <- function(x) {paste(comma(x/1e6), "Mb")} + +### Colors for background +manual_colors = c( + # duplications + simul_hom_dup = "firebrick4", + dup_hom = muted("firebrick4", 70, 50), + simul_het_dup = "firebrick2", + dup_h1 = muted("firebrick2", 90, 30), + dup_h2 = muted("firebrick2", 90, 30), + # deletions + simul_hom_del = "dodgerblue4", + del_hom = muted("dodgerblue4", 50, 60), + simul_het_del = "dodgerblue2", + del_h1 = muted("dodgerblue2", 80, 50), + del_h2 = muted("dodgerblue2", 80, 50), + # inversions + simul_hom_inv = "chartreuse4", + inv_hom = muted("chartreuse4", 80, 50), + simul_het_inv = "chartreuse2", + inv_h1 = muted("chartreuse2", 100, 60), + inv_h2 = muted("chartreuse2", 100, 60), + # other SVs + simul_false_del = "darkgrey", + simul_inv_dup = "darkgoldenrod2", + idup_h1 = muted("darkgoldenrod2", 80, 70), + idup_h2 = muted("darkgoldenrod2", 80, 70), + # background + bg1 = "#ffffff", + bg2 = "khaki2") + + + + +################################################################################ +# Usage # +################################################################################ + +print_usage_and_stop = function(msg = NULL) { + if(!is.null(msg)) + message(msg) + message("Plot Strand-seq counts of all cells for a single chromosome. ") + message(" ") + message("Usage: ") + message(" Rscript chrom.R [OPTIONS] <count-file> <chrom> <out.pdf> ") + message(" ") + message("OPTIONS (no spaces around `=`): ") + message(" per-page=<int> Number of cells to be printed per page ") + message(" segments=<file> Show the segmentation in the plots ") + message(" calls=<file> Highlight SV calls provided in a table ") + message(" truth=<file> Mark the `true`` SVs provided in a table ") + message(" ") + message("Generates one plot per chromosome listing all cells below another, separated ") + message("into pages. If an SV probability file is provided (2), segments are colored ") + message("according to the predicted SV classes. Note that only certain classes are ") + message("accepted and the script will yield an error if others are provided. ") + message("Similarly, a segmentation file can be specified, yet it must contain exactly ") + message("one segmentation (MosaiCatcher reports segmentations for various total numbers ") + message("of breakpoints).") + options( show.error.messages = F) + stop() +} + + + +################################################################################ +# Command Line Arguments # +################################################################################ + +args <- c("segments=../../20180417_ss_simulations/run_20180401/segmentation2/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15.txt", + "truth=../../20180417_ss_simulations/run_20180401/simulation_new/seed5_size200000-400000_vaf20-50/variants-50000.txt", + "per-page=12", + "calls=../../20180417_ss_simulations/run_20180401/sv_calls/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15/simple_llr2.txt", + "../../20180417_ss_simulations/run_20180401/counts/seed5_size200000-400000_vaf20-50-50000/50000_fixed.txt.gz", + "chr1", + "test.pdf") + +args <- commandArgs(trailingOnly = T) + +if (length(args)< 3) print_usage_and_stop("[Error] Too few arguments!") + +f_counts = args[length(args)-2] +CHROM = args[length(args)-1] +f_out = args[length(args)] + +f_segments = NULL +f_calls = NULL +f_truth = NULL +cells_per_page <- 8 + +if (length(args)>3) { + if (!all(grepl("^(calls|segments|per-page|truth)=", args[1:(length(args)-3)]))) { + print_usage_and_stop("[Error]: Options must be one of `calls`, `segments`, `per-page`, or `truth`") } + for (op in args[1:(length(args)-3)]) { + if (grepl("^segments=", op)) f_segments = str_sub(op, 10) + if (grepl("^calls=", op)) f_calls = str_sub(op, 7) + if (grepl("^truth=", op)) f_truth = str_sub(op, 7) + if (grepl("^per-page=", op)) { + pp = as.integer(str_sub(op, 10)); + if (pp>0 && pp < 50) { cells_per_page = pp } + } + } +} + + +################################################################################ +# Read & check input data # +################################################################################ + +### Check counts table + message(" * Reading count data ", f_counts, "...") + if (grepl('\\.gz$',f_counts)) { + counts = fread(paste(zcat_command, f_counts)) + } else { + counts = fread(f_counts) + } + assert_that("chrom" %in% colnames(counts), + "start" %in% colnames(counts), + "end" %in% colnames(counts), + "class" %in% colnames(counts), + "sample" %in% colnames(counts), + "cell" %in% colnames(counts), + "w" %in% colnames(counts), + "c" %in% colnames(counts)) %>% invisible + counts[, sample_cell := paste(sample, "-", cell)] + setkey(counts, chrom, sample_cell) + bins = unique(counts[, .(chrom, start, end)]) + +### Check CHROM: +assert_that(CHROM %in% unique(counts$chrom)) +counts = counts[chrom == CHROM] + +### Check SV call file +if (!is.null(f_calls)) { + message(" * Reading SV calls from ", f_calls, "...") + svs = fread(f_calls) + assert_that("chrom" %in% colnames(svs), + "start" %in% colnames(svs), + "end" %in% colnames(svs), + "sample" %in% colnames(svs), + "cell" %in% colnames(svs), + "SV_class" %in% colnames(svs)) %>% invisible + assert_that(all(svs$SV_class %in% names(manual_colors))) %>% invisible + svs[, sample_cell := paste(sample, "-", cell)] + assert_that(all(unique(svs$sample_cell) %in% unique(counts$sample_cell))) %>% invisible + svs = svs[chrom == CHROM] +} + +### Check segment table +if (!is.null(f_segments)) { + message(" * Reading segmentation file from ", f_segments, "...") + seg = fread(f_segments) + assert_that("chrom" %in% colnames(seg), + "bps" %in% colnames(seg)) %>% invisible + if ("k" %in% colnames(seg)) { + seg[, assert_that(length(unique(k))==1), by = .(chrom)] %>% invisible } + + seg = merge(seg, bins[,.N, by =chrom][, .(chrom, N = c(0,cumsum(N)[1:(.N-1)]))], by = "chrom") + seg[, `:=`(from = c(1,bps[1:(.N-1)]+1), to = bps), by = chrom] + seg[, `:=`(start = bins[from + N]$start, + end = bins[to + N]$end )] + seg[, SV_class := rep(c("bg1","bg2"),.N)[1:.N], by = chrom] + seg = seg[chrom == CHROM] +} + + + +### Check simulated variants +if (!is.null(f_truth)) { + message(" * Reading simulated variants from ", f_truth, "...") + simul = fread(f_truth) + assert_that("chrom" %in% colnames(simul), + "start" %in% colnames(simul), + "end" %in% colnames(simul), + "sample" %in% colnames(simul), + "cell" %in% colnames(simul), + "SV_type" %in% colnames(simul)) %>% invisible + simul[, `:=`(SV_class = paste0("simul_",SV_type), SV_type = NULL, sample_cell = paste(sample, "-", cell))] + simul[, sample_cell := paste(sample, "-", cell)] + assert_that(all(unique(simul$sample_cell) %in% unique(counts$sample_cell))) %>% invisible + simul = simul[chrom == CHROM] +} + + + + + +################################################################################ +# Actual plot # +################################################################################ + + +# Plot always a few cells per page! +y_lim = 3 * counts[,median(w+c)] +n_cells = length(unique(counts[,sample_cell])) +i = 1 + + +message(" * Plotting ", CHROM, " (", f_out, ")") +cairo_pdf(f_out, width=14, height=10, onefile = T) +while (i <= n_cells) { + + # Subset to this set of cells: + CELLS = unique(counts[,.(sample_cell)])[i:(min(i+cells_per_page-1,n_cells))] + setkey(CELLS, sample_cell) + + # Subset counts + local_counts = counts[CELLS, on = .(sample_cell), nomatch = 0] + + # Start major plot + plt <- ggplot(local_counts) + + # Add background colors for segments, if available: + if (!is.null(f_segments)) { + # Segments need to be multiplied by "CELLS" + local_seg = CELLS[, cbind(seg, sample_cell), by = sample_cell] + if (nrow(local_seg)>0) { + plt <- plt + + geom_rect(data = local_seg, alpha = 0.4, + aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = SV_class)) + } + } + + # Add colors for SV calls, if available + if (!is.null(f_calls)) { + local_svs = svs[CELLS, on = .(sample_cell), nomatch = 0] + if (nrow(local_svs)>0) { + plt <- plt + + geom_rect(data = local_svs, alpha = 1, + aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = SV_class)) + } + } + + # Add bars for true SVs, if available + if (!is.null(f_truth)) { + local_sim = simul[CELLS, on = .(sample_cell), nomatch = 0] + if (nrow(local_sim)>0) { + plt <- plt + + geom_rect(data = local_sim, + aes(xmin = start, xmax = end, ymin = y_lim, ymax = Inf, fill = SV_class)) + } + } + + plt <- plt + + geom_rect(aes(xmin = start, xmax=end, ymin=0, ymax = -w), fill='sandybrown') + + geom_rect(aes(xmin = start, xmax=end, ymin=0, ymax = c), fill='paleturquoise4') + + facet_wrap(~ sample_cell, ncol = 1) + + ylab("Watson | Crick") + xlab(NULL) + + scale_x_continuous(breaks = pretty_breaks(12), labels = format_Mb) + + scale_y_continuous(breaks = pretty_breaks(3)) + + coord_cartesian(ylim = c(-y_lim, y_lim)) + + scale_fill_manual(values = manual_colors) + + theme_minimal() + + theme(panel.spacing = unit(0, "lines"), + axis.ticks.x = element_blank(), + strip.background = element_rect(color = "#eeeeee", fill = "#eeeeee"), + strip.text = element_text(size = 5), + legend.position = "bottom") + + ggtitle(paste("data:", basename(f_counts), "chromosome:", CHROM)) + + print(plt) + i = i + cells_per_page +} # while +dev.off() -- GitLab