Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
Mosaicatcher Update
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Thomas Weber
Mosaicatcher Update
Commits
6335805c
Commit
6335805c
authored
6 years ago
by
Sascha Meiers
Browse files
Options
Downloads
Patches
Plain Diff
add new SV plot and prepare Snakefile for new SV classifier
parent
14072656
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
Snake.config.json
+4
-4
4 additions, 4 deletions
Snake.config.json
Snakefile
+37
-33
37 additions, 33 deletions
Snakefile
utils/chrom.R
+282
-0
282 additions, 0 deletions
utils/chrom.R
with
323 additions
and
37 deletions
Snake.config.json
+
4
−
4
View file @
6335805c
{
"chromosomes"
:
[
"chr1"
,
"chr2"
,
"chr3"
,
"chr4"
,
"chr5"
,
"chr6"
,
"chr7"
,
"chr8"
,
"chr9"
,
"chr10"
,
"chr11"
,
"chr12"
,
"chr13"
,
"chr14"
,
"chr15"
,
"chr16"
,
"chr17"
,
"chr18"
,
"chr19"
,
"chr20"
,
"chr21"
,
"chr22"
],
"chromosomes"
:
[
"
chrX"
,
"
chr1"
,
"chr2"
,
"chr3"
,
"chr4"
,
"chr5"
,
"chr6"
,
"chr7"
,
"chr8"
,
"chr9"
,
"chr10"
,
"chr11"
,
"chr12"
,
"chr13"
,
"chr14"
,
"chr15"
,
"chr16"
,
"chr17"
,
"chr18"
,
"chr19"
,
"chr20"
,
"chr21"
,
"chr22"
],
"reference"
:
"/g/korbel/shared/datasets/refgenomes/human/
hg19_chr1_22XYM
.fa"
,
"reference"
:
"/g/korbel/shared/datasets/refgenomes/human/
GCA_000001405.15_GRCh38_no_alt_analysis_set
.f
n
a"
,
"mosaicatcher"
:
"/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/build/mosaicatcher"
,
"plot_script"
:
"/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/qc.R"
,
"plot_segments"
:
"/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/R/plot_segments.R"
,
...
...
@@ -12,7 +12,7 @@
"snv_calls"
:
{},
"exclude_file"
:
"/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/
hg19
.exclude"
,
"exclude_file"
:
"/g/korbel/meiers/tools/mosaicatcher/mosaicatcher/data/exclude/
GCA_000001405.15_GRCh38_no_alt_analysis_set
.exclude"
,
"variable_bins"
:
{
"50000"
:
"utils/variable_bins.GRCh38.50kb.bed"
,
"100000"
:
"utils/variable_bins.GRCh38.100kb.bed"
...
...
@@ -25,7 +25,7 @@
"many"
:
0.5
},
"paired_end"
:
fals
e
,
"paired_end"
:
tru
e
,
"simulation_min_vaf"
:
0.01
,
"simulation_max_vaf"
:
1.0
,
...
...
This diff is collapsed.
Click to expand it.
Snakefile
+
37
−
33
View file @
6335805c
...
...
@@ -17,21 +17,19 @@ import os.path
# * plot all single cell libraries in different window sizes
# * calculate a segmentation into potential SVs using Mosaicatcher
METHODS = ["simpleCalls"]
rule all:
input:
expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLE, window = [50000, 100000, 200000, 500000]),
expand("plots/{sample}/{window}_variable.pdf", sample = SAMPLE, window = [50000, 100000]),
expand("segmentation2/{sample}/{window}_fixed.{bpdens}.txt", sample = SAMPLE,
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("segmentation2/{sample}/{window}_variable.{bpdens}.chr1.txt", sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"]),
expand("strand_states/{sample}/final.txt", sample = SAMPLE),
expand("sv_calls/{sample}/{window}_fixed.{bpdens}.SV_probs.{chrom}.pdf", sample = SAMPLE, chrom = config['chromosomes'],
window = [50000, 100000, 200000, 500000], bpdens = ["few","medium","many"]),
#expand("sv_calls/{sample}/{window}_variable.{bpdens}.SV_probs.chr1.pdf", sample = SAMPLE,
# window = [50000, 100000], bpdens = ["few","medium","many"]),
expand("segmentation/{sample}/{window}_fixed/{chrom}.pdf", sample = SAMPLE,
window = [50000, 100000], chrom = config['chromosomes'][0]) # Specifically run this only for one chrom because it is super slow
expand("sv_calls/{sample}/{window}_fixed.{bpdens}/{method}.{chrom}.pdf",
sample = SAMPLE,
chrom = config["chromosomes"],
window = [100000],
bpdens = ["few","medium","many"],
method = METHODS)
################################################################################
...
...
@@ -189,16 +187,12 @@ rule plot_mosaic_counts:
rule plot_SV_calls:
input:
counts = "counts/{sample}/{windows}.txt.gz",
prob
s = "sv_
probabilitie
s/{sample}/{windows}.{bpdens}/
probabilities
.txt"
call
s = "sv_
call
s/{sample}/{windows}.{bpdens}/
{method}
.txt"
output:
expand("sv_calls/{{sample}}/{{windows}}.{{bpdens}}.SV_probs.{chrom}.pdf", chrom = config['chromosomes'] )
log:
"log/{sample}/plot_SV_call.{windows}.{bpdens}.txt"
params:
plot_command = "Rscript " + config["sv_plot_script"]
"sv_calls/{sample}/{windows}.{bpdens}/{method}.{chrom}.pdf"
shell:
"""
{params.plot_command}
{input.c
ount
s} {input.
probs} sv_calls/{wildcards.sample}/{wildcards.windows}.{wildcards.bpdens}.SV_probs > {log} 2>&1
Rscript utils/chrom.R calls=
{input.c
all
s} {input.
counts} {wildcards.chrom} {output}
"""
...
...
@@ -304,21 +298,6 @@ rule segmentation:
{input} > {log} 2>&1
"""
rule plot_segmentation:
input:
counts = "counts/{sample}/{file_name}.txt.gz",
segments = "segmentation/{sample}/{file_name}.txt"
output:
"segmentation/{sample}/{file_name}/{chrom}.pdf"
log:
"log/{sample}/plot_segmentation.{file_name}.{chrom}.txt"
params:
command = config["plot_segments"]
shell:
"""
Rscript {params.command} {input.counts} {input.segments} {wildcards.chrom} {output} > {log} 2>&1
"""
# Pick a few segmentations and prepare the input files for SV classification
...
...
@@ -424,6 +403,31 @@ rule convert_SVprob_output:
# "utils/sv_classifier.R"
################################################################################
# New SV classification based on a combination of Sascha's and Maryam's method #
################################################################################
rule mosaiClassifier_make_call:
input:
"sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
output:
"sv_calls/{sample}/{windows}.{bpdens}/simpleCalls.txt"
shell:
"""
Rscript utils/mosaiClassifier.makeCall.R {input} {output}
"""
rule mosaiClassifier_calc_probs:
input:
counts = "counts/{sample}/{windows}.txt.gz",
info = "counts/{sample}/{windows}.info",
states = "strand_states/{sample}/final.txt",
bp = "segmentation2/{sample}/{windows}.{bpdens}.txt"
output:
output = "sv_probabilities/{sample}/{windows}.{bpdens}/probabilities.Rdata"
script:
"utils/mosaiClassifier.main.R"
################################################################################
# Strand states & phasing #
...
...
This diff is collapsed.
Click to expand it.
utils/chrom.R
0 → 100644
+
282
−
0
View file @
6335805c
#
# Copyright (C) 2017 Sascha Meiers
# Distributed under the MIT software license, see the accompanying
# file LICENSE.md or http://www.opensource.org/licenses/mit-license.php.
suppressMessages
(
library
(
dplyr
))
suppressMessages
(
library
(
data.table
))
suppressMessages
(
library
(
ggplot2
))
library
(
scales
)
%>%
invisible
library
(
assertthat
)
%>%
invisible
library
(
stringr
)
%>%
invisible
################################################################################
# Settings #
################################################################################
zcat_command
=
"zcat"
format_Mb
<-
function
(
x
)
{
paste
(
comma
(
x
/
1e6
),
"Mb"
)}
### Colors for background
manual_colors
=
c
(
# duplications
simul_hom_dup
=
"firebrick4"
,
dup_hom
=
muted
(
"firebrick4"
,
70
,
50
),
simul_het_dup
=
"firebrick2"
,
dup_h1
=
muted
(
"firebrick2"
,
90
,
30
),
dup_h2
=
muted
(
"firebrick2"
,
90
,
30
),
# deletions
simul_hom_del
=
"dodgerblue4"
,
del_hom
=
muted
(
"dodgerblue4"
,
50
,
60
),
simul_het_del
=
"dodgerblue2"
,
del_h1
=
muted
(
"dodgerblue2"
,
80
,
50
),
del_h2
=
muted
(
"dodgerblue2"
,
80
,
50
),
# inversions
simul_hom_inv
=
"chartreuse4"
,
inv_hom
=
muted
(
"chartreuse4"
,
80
,
50
),
simul_het_inv
=
"chartreuse2"
,
inv_h1
=
muted
(
"chartreuse2"
,
100
,
60
),
inv_h2
=
muted
(
"chartreuse2"
,
100
,
60
),
# other SVs
simul_false_del
=
"darkgrey"
,
simul_inv_dup
=
"darkgoldenrod2"
,
idup_h1
=
muted
(
"darkgoldenrod2"
,
80
,
70
),
idup_h2
=
muted
(
"darkgoldenrod2"
,
80
,
70
),
# background
bg1
=
"#ffffff"
,
bg2
=
"khaki2"
)
################################################################################
# Usage #
################################################################################
print_usage_and_stop
=
function
(
msg
=
NULL
)
{
if
(
!
is.null
(
msg
))
message
(
msg
)
message
(
"Plot Strand-seq counts of all cells for a single chromosome. "
)
message
(
" "
)
message
(
"Usage: "
)
message
(
" Rscript chrom.R [OPTIONS] <count-file> <chrom> <out.pdf> "
)
message
(
" "
)
message
(
"OPTIONS (no spaces around `=`): "
)
message
(
" per-page=<int> Number of cells to be printed per page "
)
message
(
" segments=<file> Show the segmentation in the plots "
)
message
(
" calls=<file> Highlight SV calls provided in a table "
)
message
(
" truth=<file> Mark the `true`` SVs provided in a table "
)
message
(
" "
)
message
(
"Generates one plot per chromosome listing all cells below another, separated "
)
message
(
"into pages. If an SV probability file is provided (2), segments are colored "
)
message
(
"according to the predicted SV classes. Note that only certain classes are "
)
message
(
"accepted and the script will yield an error if others are provided. "
)
message
(
"Similarly, a segmentation file can be specified, yet it must contain exactly "
)
message
(
"one segmentation (MosaiCatcher reports segmentations for various total numbers "
)
message
(
"of breakpoints)."
)
options
(
show.error.messages
=
F
)
stop
()
}
################################################################################
# Command Line Arguments #
################################################################################
args
<-
c
(
"segments=../../20180417_ss_simulations/run_20180401/segmentation2/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15.txt"
,
"truth=../../20180417_ss_simulations/run_20180401/simulation_new/seed5_size200000-400000_vaf20-50/variants-50000.txt"
,
"per-page=12"
,
"calls=../../20180417_ss_simulations/run_20180401/sv_calls/seed5_size200000-400000_vaf20-50-50000/50000_fixed.fraction15/simple_llr2.txt"
,
"../../20180417_ss_simulations/run_20180401/counts/seed5_size200000-400000_vaf20-50-50000/50000_fixed.txt.gz"
,
"chr1"
,
"test.pdf"
)
args
<-
commandArgs
(
trailingOnly
=
T
)
if
(
length
(
args
)
<
3
)
print_usage_and_stop
(
"[Error] Too few arguments!"
)
f_counts
=
args
[
length
(
args
)
-2
]
CHROM
=
args
[
length
(
args
)
-1
]
f_out
=
args
[
length
(
args
)]
f_segments
=
NULL
f_calls
=
NULL
f_truth
=
NULL
cells_per_page
<-
8
if
(
length
(
args
)
>
3
)
{
if
(
!
all
(
grepl
(
"^(calls|segments|per-page|truth)="
,
args
[
1
:
(
length
(
args
)
-3
)])))
{
print_usage_and_stop
(
"[Error]: Options must be one of `calls`, `segments`, `per-page`, or `truth`"
)
}
for
(
op
in
args
[
1
:
(
length
(
args
)
-3
)])
{
if
(
grepl
(
"^segments="
,
op
))
f_segments
=
str_sub
(
op
,
10
)
if
(
grepl
(
"^calls="
,
op
))
f_calls
=
str_sub
(
op
,
7
)
if
(
grepl
(
"^truth="
,
op
))
f_truth
=
str_sub
(
op
,
7
)
if
(
grepl
(
"^per-page="
,
op
))
{
pp
=
as.integer
(
str_sub
(
op
,
10
));
if
(
pp
>
0
&&
pp
<
50
)
{
cells_per_page
=
pp
}
}
}
}
################################################################################
# Read & check input data #
################################################################################
### Check counts table
message
(
" * Reading count data "
,
f_counts
,
"..."
)
if
(
grepl
(
'\\.gz$'
,
f_counts
))
{
counts
=
fread
(
paste
(
zcat_command
,
f_counts
))
}
else
{
counts
=
fread
(
f_counts
)
}
assert_that
(
"chrom"
%in%
colnames
(
counts
),
"start"
%in%
colnames
(
counts
),
"end"
%in%
colnames
(
counts
),
"class"
%in%
colnames
(
counts
),
"sample"
%in%
colnames
(
counts
),
"cell"
%in%
colnames
(
counts
),
"w"
%in%
colnames
(
counts
),
"c"
%in%
colnames
(
counts
))
%>%
invisible
counts
[,
sample_cell
:=
paste
(
sample
,
"-"
,
cell
)]
setkey
(
counts
,
chrom
,
sample_cell
)
bins
=
unique
(
counts
[,
.
(
chrom
,
start
,
end
)])
### Check CHROM:
assert_that
(
CHROM
%in%
unique
(
counts
$
chrom
))
counts
=
counts
[
chrom
==
CHROM
]
### Check SV call file
if
(
!
is.null
(
f_calls
))
{
message
(
" * Reading SV calls from "
,
f_calls
,
"..."
)
svs
=
fread
(
f_calls
)
assert_that
(
"chrom"
%in%
colnames
(
svs
),
"start"
%in%
colnames
(
svs
),
"end"
%in%
colnames
(
svs
),
"sample"
%in%
colnames
(
svs
),
"cell"
%in%
colnames
(
svs
),
"SV_class"
%in%
colnames
(
svs
))
%>%
invisible
assert_that
(
all
(
svs
$
SV_class
%in%
names
(
manual_colors
)))
%>%
invisible
svs
[,
sample_cell
:=
paste
(
sample
,
"-"
,
cell
)]
assert_that
(
all
(
unique
(
svs
$
sample_cell
)
%in%
unique
(
counts
$
sample_cell
)))
%>%
invisible
svs
=
svs
[
chrom
==
CHROM
]
}
### Check segment table
if
(
!
is.null
(
f_segments
))
{
message
(
" * Reading segmentation file from "
,
f_segments
,
"..."
)
seg
=
fread
(
f_segments
)
assert_that
(
"chrom"
%in%
colnames
(
seg
),
"bps"
%in%
colnames
(
seg
))
%>%
invisible
if
(
"k"
%in%
colnames
(
seg
))
{
seg
[,
assert_that
(
length
(
unique
(
k
))
==
1
),
by
=
.
(
chrom
)]
%>%
invisible
}
seg
=
merge
(
seg
,
bins
[,
.N
,
by
=
chrom
][,
.
(
chrom
,
N
=
c
(
0
,
cumsum
(
N
)[
1
:
(
.N
-1
)]))],
by
=
"chrom"
)
seg
[,
`:=`
(
from
=
c
(
1
,
bps
[
1
:
(
.N
-1
)]
+1
),
to
=
bps
),
by
=
chrom
]
seg
[,
`:=`
(
start
=
bins
[
from
+
N
]
$
start
,
end
=
bins
[
to
+
N
]
$
end
)]
seg
[,
SV_class
:=
rep
(
c
(
"bg1"
,
"bg2"
),
.N
)[
1
:
.N
],
by
=
chrom
]
seg
=
seg
[
chrom
==
CHROM
]
}
### Check simulated variants
if
(
!
is.null
(
f_truth
))
{
message
(
" * Reading simulated variants from "
,
f_truth
,
"..."
)
simul
=
fread
(
f_truth
)
assert_that
(
"chrom"
%in%
colnames
(
simul
),
"start"
%in%
colnames
(
simul
),
"end"
%in%
colnames
(
simul
),
"sample"
%in%
colnames
(
simul
),
"cell"
%in%
colnames
(
simul
),
"SV_type"
%in%
colnames
(
simul
))
%>%
invisible
simul
[,
`:=`
(
SV_class
=
paste0
(
"simul_"
,
SV_type
),
SV_type
=
NULL
,
sample_cell
=
paste
(
sample
,
"-"
,
cell
))]
simul
[,
sample_cell
:=
paste
(
sample
,
"-"
,
cell
)]
assert_that
(
all
(
unique
(
simul
$
sample_cell
)
%in%
unique
(
counts
$
sample_cell
)))
%>%
invisible
simul
=
simul
[
chrom
==
CHROM
]
}
################################################################################
# Actual plot #
################################################################################
# Plot always a few cells per page!
y_lim
=
3
*
counts
[,
median
(
w
+
c
)]
n_cells
=
length
(
unique
(
counts
[,
sample_cell
]))
i
=
1
message
(
" * Plotting "
,
CHROM
,
" ("
,
f_out
,
")"
)
cairo_pdf
(
f_out
,
width
=
14
,
height
=
10
,
onefile
=
T
)
while
(
i
<=
n_cells
)
{
# Subset to this set of cells:
CELLS
=
unique
(
counts
[,
.
(
sample_cell
)])[
i
:
(
min
(
i
+
cells_per_page
-1
,
n_cells
))]
setkey
(
CELLS
,
sample_cell
)
# Subset counts
local_counts
=
counts
[
CELLS
,
on
=
.
(
sample_cell
),
nomatch
=
0
]
# Start major plot
plt
<-
ggplot
(
local_counts
)
# Add background colors for segments, if available:
if
(
!
is.null
(
f_segments
))
{
# Segments need to be multiplied by "CELLS"
local_seg
=
CELLS
[,
cbind
(
seg
,
sample_cell
),
by
=
sample_cell
]
if
(
nrow
(
local_seg
)
>
0
)
{
plt
<-
plt
+
geom_rect
(
data
=
local_seg
,
alpha
=
0.4
,
aes
(
xmin
=
start
,
xmax
=
end
,
ymin
=
-
Inf
,
ymax
=
Inf
,
fill
=
SV_class
))
}
}
# Add colors for SV calls, if available
if
(
!
is.null
(
f_calls
))
{
local_svs
=
svs
[
CELLS
,
on
=
.
(
sample_cell
),
nomatch
=
0
]
if
(
nrow
(
local_svs
)
>
0
)
{
plt
<-
plt
+
geom_rect
(
data
=
local_svs
,
alpha
=
1
,
aes
(
xmin
=
start
,
xmax
=
end
,
ymin
=
-
Inf
,
ymax
=
Inf
,
fill
=
SV_class
))
}
}
# Add bars for true SVs, if available
if
(
!
is.null
(
f_truth
))
{
local_sim
=
simul
[
CELLS
,
on
=
.
(
sample_cell
),
nomatch
=
0
]
if
(
nrow
(
local_sim
)
>
0
)
{
plt
<-
plt
+
geom_rect
(
data
=
local_sim
,
aes
(
xmin
=
start
,
xmax
=
end
,
ymin
=
y_lim
,
ymax
=
Inf
,
fill
=
SV_class
))
}
}
plt
<-
plt
+
geom_rect
(
aes
(
xmin
=
start
,
xmax
=
end
,
ymin
=
0
,
ymax
=
-
w
),
fill
=
'sandybrown'
)
+
geom_rect
(
aes
(
xmin
=
start
,
xmax
=
end
,
ymin
=
0
,
ymax
=
c
),
fill
=
'paleturquoise4'
)
+
facet_wrap
(
~
sample_cell
,
ncol
=
1
)
+
ylab
(
"Watson | Crick"
)
+
xlab
(
NULL
)
+
scale_x_continuous
(
breaks
=
pretty_breaks
(
12
),
labels
=
format_Mb
)
+
scale_y_continuous
(
breaks
=
pretty_breaks
(
3
))
+
coord_cartesian
(
ylim
=
c
(
-
y_lim
,
y_lim
))
+
scale_fill_manual
(
values
=
manual_colors
)
+
theme_minimal
()
+
theme
(
panel.spacing
=
unit
(
0
,
"lines"
),
axis.ticks.x
=
element_blank
(),
strip.background
=
element_rect
(
color
=
"#eeeeee"
,
fill
=
"#eeeeee"
),
strip.text
=
element_text
(
size
=
5
),
legend.position
=
"bottom"
)
+
ggtitle
(
paste
(
"data:"
,
basename
(
f_counts
),
"chromosome:"
,
CHROM
))
print
(
plt
)
i
=
i
+
cells_per_page
}
# while
dev.off
()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment