From 6335805ca57278e384d3f47021b90c941c84cb24 Mon Sep 17 00:00:00 2001
From: Sascha Meiers <meiers@embl.de>
Date: Wed, 16 May 2018 10:11:30 +0200
Subject: [PATCH] add new SV plot and prepare Snakefile for new SV classifier

---
 Snake.config.json |   8 +-
 Snakefile         |  70 ++++++------
 utils/chrom.R     | 282 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 323 insertions(+), 37 deletions(-)
 create mode 100644 utils/chrom.R

diff --git a/Snake.config.json b/Snake.config.json
index 6c64413..843a536 100644
--- a/Snake.config.json
+++ b/Snake.config.json
@@ -1,7 +1,7 @@
 {
-    "chromosomes"   : ["chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"],
+    "chromosomes"   : ["chrX","chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"],
 
-    "reference"     : "/g/korbel/shared/datasets/refgenomes/human/hg19_chr1_22XYM.fa",
+    "reference"     : "/g/korbel/shared/datasets/refgenomes/human/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna",
     "mosaicatcher"  : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/build/mosaicatcher",
     "plot_script"   : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/qc.R",
     "plot_segments" : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/plot_segments.R",
@@ -12,7 +12,7 @@
 
     "snv_calls"     : {},
 
-    "exclude_file"  : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/hg19.exclude",
+    "exclude_file"  : "/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/GCA_000001405.15_GRCh38_no_alt_analysis_set.exclude",
     "variable_bins" : {
         "50000"     : "utils/variable_bins.GRCh38.50kb.bed",
         "100000"    : "utils/variable_bins.GRCh38.100kb.bed"
@@ -25,7 +25,7 @@
         "many"      : 0.5
     },
 
-    "paired_end"    : false,
+    "paired_end"    : true,
 
     "simulation_min_vaf" : 0.01,
     "simulation_max_vaf" : 1.0,
diff --git a/Snakefile b/Snakefile
index 6bed2b2..9e69dbb 100644
--- a/Snakefile
+++ b/Snakefile
@@ -17,21 +17,19 @@ import os.path
 # * plot all single cell libraries in different window sizes
 # * calculate a segmentation into potential SVs using Mosaicatcher
 
+
+METHODS = ["simpleCalls"]
+
 rule all:
     input:
         expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLE, window = [50000, 100000, 200000, 500000]),
         expand("plots/{sample}/{window}_variable.pdf", sample = SAMPLE, window = [50000, 100000]),
-        expand("segmentation2/{sample}/{window}_fixed.{bpdens}.txt", sample = SAMPLE,
-               window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
-        #expand("segmentation2/{sample}/{window}_variable.{bpdens}.chr1.txt", sample = SAMPLE,
-        #       window = [50000, 100000], bpdens = ["few","medium","many"]),
-        expand("strand_states/{sample}/final.txt", sample = SAMPLE),
-        expand("sv_calls/{sample}/{window}_fixed.{bpdens}.SV_probs.{chrom}.pdf", sample = SAMPLE, chrom = config['chromosomes'],
-               window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
-        #expand("sv_calls/{sample}/{window}_variable.{bpdens}.SV_probs.chr1.pdf", sample = SAMPLE,
-        #       window = [50000, 100000], bpdens = ["few","medium","many"]),
-        expand("segmentation/{sample}/{window}_fixed/{chrom}.pdf", sample = SAMPLE,
-                window = [50000, 100000], chrom = config['chromosomes'][0])    # Specifically run this only for one chrom because it is super slow
+        expand("sv_calls/{sample}/{window}_fixed.{bpdens}/{method}.{chrom}.pdf",
+               sample = SAMPLE,
+               chrom = config["chromosomes"],
+               window = [100000],
+               bpdens = ["few","medium","many"],
+               method = METHODS)
 
 
 ################################################################################
@@ -189,16 +187,12 @@ rule plot_mosaic_counts:
 rule plot_SV_calls:
     input:
         counts = "counts/{sample}/{windows}.txt.gz",
-        probs  = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.txt"
+        calls  = "sv_calls/{sample}/{windows}.{bpdens}/{method}.txt"
     output:
-        expand("sv_calls/{{sample}}/{{windows}}.{{bpdens}}.SV_probs.{chrom}.pdf", chrom = config['chromosomes'] )
-    log:
-        "log/{sample}/plot_SV_call.{windows}.{bpdens}.txt"
-    params:
-        plot_command = "Rscript " + config["sv_plot_script"]
+        "sv_calls/{sample}/{windows}.{bpdens}/{method}.{chrom}.pdf"
     shell:
         """
-        {params.plot_command} {input.counts} {input.probs} sv_calls/{wildcards.sample}/{wildcards.windows}.{wildcards.bpdens}.SV_probs > {log} 2>&1
+        Rscript utils/chrom.R calls={input.calls} {input.counts} {wildcards.chrom} {output}
         """
 
 
@@ -304,21 +298,6 @@ rule segmentation:
         {input} > {log} 2>&1
         """
 
-rule plot_segmentation:
-    input:
-        counts = "counts/{sample}/{file_name}.txt.gz",
-        segments = "segmentation/{sample}/{file_name}.txt"
-    output:
-        "segmentation/{sample}/{file_name}/{chrom}.pdf"
-    log:
-        "log/{sample}/plot_segmentation.{file_name}.{chrom}.txt"
-    params:
-        command = config["plot_segments"]
-    shell:
-        """
-        Rscript {params.command} {input.counts} {input.segments} {wildcards.chrom} {output} > {log} 2>&1
-        """
-
 
 
 # Pick a few segmentations and prepare the input files for SV classification
@@ -424,6 +403,31 @@ rule convert_SVprob_output:
 #         "utils/sv_classifier.R"
 
 
+################################################################################
+# New SV classification based on a combination of Sascha's and Maryam's method #
+################################################################################
+
+rule mosaiClassifier_make_call:
+    input:
+        "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
+    output:
+        "sv_calls/{sample}/{windows}.{bpdens}/simpleCalls.txt"
+    shell:
+        """
+        Rscript utils/mosaiClassifier.makeCall.R {input} {output}
+        """
+
+rule mosaiClassifier_calc_probs:
+    input:
+        counts = "counts/{sample}/{windows}.txt.gz",
+        info   = "counts/{sample}/{windows}.info",
+        states = "strand_states/{sample}/final.txt",
+        bp     = "segmentation2/{sample}/{windows}.{bpdens}.txt"
+    output:
+        output = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
+    script:
+        "utils/mosaiClassifier.main.R"
+
 
 ################################################################################
 # Strand states & phasing                                                      #
diff --git a/utils/chrom.R b/utils/chrom.R
new file mode 100644
index 0000000..e13a821
--- /dev/null
+++ b/utils/chrom.R
@@ -0,0 +1,282 @@
+#
+# Copyright (C) 2017 Sascha Meiers
+# Distributed under the MIT software license, see the accompanying
+# file LICENSE.md or http://www.opensource.org/licenses/mit-license.php.
+
+suppressMessages(library(dplyr))
+suppressMessages(library(data.table))
+suppressMessages(library(ggplot2))
+library(scales) %>% invisible
+library(assertthat) %>% invisible
+library(stringr) %>% invisible
+
+
+################################################################################
+# Settings                                                                     #
+################################################################################
+
+zcat_command = "zcat"
+format_Mb   <- function(x) {paste(comma(x/1e6), "Mb")}
+
+### Colors for background
+manual_colors = c(  
+    # duplications
+  simul_hom_dup   = "firebrick4",
+  dup_hom         = muted("firebrick4", 70, 50),
+  simul_het_dup   = "firebrick2",
+  dup_h1          = muted("firebrick2", 90, 30),
+  dup_h2          = muted("firebrick2", 90, 30),
+    # deletions
+  simul_hom_del   = "dodgerblue4",
+  del_hom         = muted("dodgerblue4", 50, 60),
+  simul_het_del   = "dodgerblue2",
+  del_h1          = muted("dodgerblue2", 80, 50),
+  del_h2          = muted("dodgerblue2", 80, 50),
+    # inversions
+  simul_hom_inv   = "chartreuse4",
+  inv_hom         = muted("chartreuse4", 80, 50),
+  simul_het_inv   = "chartreuse2",
+  inv_h1          = muted("chartreuse2", 100, 60),
+  inv_h2          = muted("chartreuse2", 100, 60),
+    # other SVs
+  simul_false_del = "darkgrey",
+  simul_inv_dup   = "darkgoldenrod2",
+  idup_h1         = muted("darkgoldenrod2", 80, 70),
+  idup_h2         = muted("darkgoldenrod2", 80, 70),
+    # background
+  bg1 = "#ffffff",
+  bg2 = "khaki2")
+
+
+
+
+################################################################################
+# Usage                                                                        #
+################################################################################
+
+print_usage_and_stop = function(msg = NULL) {
+  if(!is.null(msg))
+    message(msg)
+  message("Plot Strand-seq counts of all cells for a single chromosome.                    ")
+  message("                                                                                ")
+  message("Usage:                                                                          ")
+  message("    Rscript chrom.R [OPTIONS] <count-file> <chrom> <out.pdf>                    ")
+  message("                                                                                ")
+  message("OPTIONS (no spaces around `=`):                                                 ")
+  message("    per-page=<int>        Number of cells to be printed per page                ")
+  message("    segments=<file>       Show the segmentation in the plots                    ")
+  message("    calls=<file>          Highlight SV calls provided in a table                ")
+  message("    truth=<file>          Mark the `true`` SVs provided in a table              ")
+  message("                                                                                ")
+  message("Generates one plot per chromosome listing all cells below another, separated    ")
+  message("into pages. If an SV probability file is provided (2), segments are colored     ")
+  message("according to the predicted SV classes. Note that only certain classes are       ")
+  message("accepted and the script will yield an error if others are provided.             ")
+  message("Similarly, a segmentation file can be specified, yet it must contain exactly    ")
+  message("one segmentation (MosaiCatcher reports segmentations for various total numbers  ")
+  message("of breakpoints).")
+  options( show.error.messages = F)
+  stop()
+}
+
+
+
+################################################################################
+# Command Line Arguments                                                       #
+################################################################################
+
+args <- c("segments=../../20180417_ss_simulations/run_20180401/segmentation2/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15.txt",
+          "truth=../../20180417_ss_simulations/run_20180401/simulation_new/seed5_size200000-400000_vaf20-50/variants-50000.txt",
+          "per-page=12",
+          "calls=../../20180417_ss_simulations/run_20180401/sv_calls/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15/simple_llr2.txt",
+          "../../20180417_ss_simulations/run_20180401/counts/seed5_size200000-400000_vaf20-50-50000/50000_fixed.txt.gz",
+          "chr1",
+          "test.pdf")
+
+args <- commandArgs(trailingOnly = T)
+
+if (length(args)< 3) print_usage_and_stop("[Error] Too few arguments!")
+
+f_counts = args[length(args)-2]
+CHROM    = args[length(args)-1]
+f_out    = args[length(args)]
+
+f_segments = NULL
+f_calls    = NULL
+f_truth    = NULL
+cells_per_page <- 8
+
+if (length(args)>3) {
+  if (!all(grepl("^(calls|segments|per-page|truth)=", args[1:(length(args)-3)]))) {
+    print_usage_and_stop("[Error]: Options must be one of `calls`, `segments`, `per-page`, or `truth`") }
+  for (op in args[1:(length(args)-3)]) {
+    if (grepl("^segments=", op)) f_segments = str_sub(op, 10)
+    if (grepl("^calls=", op))    f_calls = str_sub(op, 7)
+    if (grepl("^truth=", op))    f_truth = str_sub(op, 7)
+    if (grepl("^per-page=", op)) {
+      pp = as.integer(str_sub(op, 10)); 
+      if (pp>0 && pp < 50) { cells_per_page = pp }
+    } 
+  }
+}
+
+
+################################################################################
+# Read & check input data                                                      #
+################################################################################
+
+### Check counts table
+  message(" * Reading count data ", f_counts, "...")
+  if (grepl('\\.gz$',f_counts)) {
+    counts = fread(paste(zcat_command, f_counts))
+  } else {
+    counts = fread(f_counts)
+  }
+  assert_that("chrom"  %in% colnames(counts),
+              "start"  %in% colnames(counts),
+              "end"    %in% colnames(counts),
+              "class"  %in% colnames(counts),
+              "sample" %in% colnames(counts),
+              "cell"   %in% colnames(counts),
+              "w"      %in% colnames(counts),
+              "c"      %in% colnames(counts)) %>% invisible
+  counts[, sample_cell := paste(sample, "-", cell)]
+  setkey(counts, chrom, sample_cell)
+  bins = unique(counts[, .(chrom, start, end)])
+
+### Check CHROM:
+assert_that(CHROM %in% unique(counts$chrom))
+counts = counts[chrom == CHROM]
+
+### Check SV call file
+if (!is.null(f_calls)) {
+  message(" * Reading SV calls from ", f_calls, "...")
+  svs = fread(f_calls)
+  assert_that("chrom"    %in% colnames(svs),
+              "start"    %in% colnames(svs),
+              "end"      %in% colnames(svs),
+              "sample"   %in% colnames(svs),
+              "cell"     %in% colnames(svs),
+              "SV_class" %in% colnames(svs)) %>% invisible
+  assert_that(all(svs$SV_class %in% names(manual_colors))) %>% invisible
+  svs[, sample_cell := paste(sample, "-", cell)]
+  assert_that(all(unique(svs$sample_cell) %in% unique(counts$sample_cell))) %>% invisible
+  svs = svs[chrom == CHROM]
+}
+
+### Check segment table
+if (!is.null(f_segments)) {
+  message(" * Reading segmentation file from ", f_segments, "...")
+  seg = fread(f_segments)
+  assert_that("chrom" %in% colnames(seg),
+              "bps"   %in% colnames(seg)) %>% invisible
+  if ("k" %in% colnames(seg)) {
+    seg[, assert_that(length(unique(k))==1), by = .(chrom)] %>% invisible }
+
+  seg = merge(seg, bins[,.N, by =chrom][, .(chrom, N = c(0,cumsum(N)[1:(.N-1)]))], by = "chrom")
+  seg[, `:=`(from = c(1,bps[1:(.N-1)]+1), to = bps), by = chrom]
+  seg[, `:=`(start = bins[from + N]$start,
+             end   = bins[to   + N]$end  )]
+  seg[, SV_class := rep(c("bg1","bg2"),.N)[1:.N], by = chrom]
+  seg = seg[chrom == CHROM]
+}
+
+
+
+### Check simulated variants
+if (!is.null(f_truth)) {
+  message(" * Reading simulated variants from ", f_truth, "...")
+  simul = fread(f_truth)
+  assert_that("chrom"   %in% colnames(simul),
+              "start"   %in% colnames(simul),
+              "end"     %in% colnames(simul),
+              "sample"  %in% colnames(simul),
+              "cell"    %in% colnames(simul),
+              "SV_type" %in% colnames(simul)) %>% invisible
+  simul[, `:=`(SV_class = paste0("simul_",SV_type), SV_type = NULL, sample_cell = paste(sample, "-", cell))]
+  simul[, sample_cell := paste(sample, "-", cell)]
+  assert_that(all(unique(simul$sample_cell) %in% unique(counts$sample_cell))) %>% invisible
+  simul = simul[chrom == CHROM]
+}
+
+
+
+
+
+################################################################################
+# Actual plot                                                                  #
+################################################################################
+
+
+# Plot always a few cells per page!
+y_lim = 3 * counts[,median(w+c)]
+n_cells = length(unique(counts[,sample_cell]))
+i = 1
+
+
+message(" * Plotting ", CHROM, " (", f_out, ")")
+cairo_pdf(f_out, width=14, height=10, onefile = T)
+while (i <= n_cells) {
+
+    # Subset to this set of cells:
+    CELLS = unique(counts[,.(sample_cell)])[i:(min(i+cells_per_page-1,n_cells))]
+    setkey(CELLS, sample_cell)
+
+    # Subset counts
+    local_counts = counts[CELLS, on = .(sample_cell), nomatch = 0]
+
+    # Start major plot
+    plt <- ggplot(local_counts)
+
+    # Add background colors for segments, if available:
+    if (!is.null(f_segments)) {
+      # Segments need to be multiplied by "CELLS"
+      local_seg = CELLS[, cbind(seg, sample_cell), by = sample_cell]
+      if (nrow(local_seg)>0) {
+          plt <- plt +
+              geom_rect(data = local_seg, alpha = 0.4,
+                        aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = SV_class)) 
+      }
+    }
+    
+    # Add colors for SV calls, if available
+    if (!is.null(f_calls)) {
+      local_svs = svs[CELLS, on = .(sample_cell), nomatch = 0]
+      if (nrow(local_svs)>0) {
+        plt <- plt +
+          geom_rect(data = local_svs, alpha = 1,
+                    aes(xmin = start, xmax = end, ymin = -Inf, ymax = Inf, fill = SV_class))
+      }
+    }
+    
+    # Add bars for true SVs, if available
+    if (!is.null(f_truth)) {
+      local_sim = simul[CELLS, on = .(sample_cell), nomatch = 0]
+      if (nrow(local_sim)>0) {
+        plt <- plt +
+          geom_rect(data = local_sim,
+                    aes(xmin = start, xmax = end, ymin = y_lim, ymax = Inf, fill = SV_class))
+      }
+    }
+
+    plt <- plt +
+        geom_rect(aes(xmin = start, xmax=end, ymin=0, ymax = -w), fill='sandybrown') +
+        geom_rect(aes(xmin = start, xmax=end, ymin=0, ymax =  c), fill='paleturquoise4') +
+        facet_wrap(~ sample_cell, ncol = 1) +
+        ylab("Watson | Crick") + xlab(NULL) +
+        scale_x_continuous(breaks = pretty_breaks(12), labels = format_Mb) +
+        scale_y_continuous(breaks = pretty_breaks(3)) +
+        coord_cartesian(ylim = c(-y_lim, y_lim)) +
+        scale_fill_manual(values = manual_colors) +
+        theme_minimal() +
+        theme(panel.spacing = unit(0, "lines"),
+              axis.ticks.x = element_blank(),
+              strip.background = element_rect(color = "#eeeeee", fill = "#eeeeee"),
+              strip.text = element_text(size = 5),
+              legend.position = "bottom") +
+        ggtitle(paste("data:", basename(f_counts), "chromosome:", CHROM))
+
+    print(plt)
+    i = i + cells_per_page
+} # while
+dev.off()
-- 
GitLab