Skip to content
Snippets Groups Projects
Commit 2ecc73fa authored by Tobias Marschall's avatar Tobias Marschall
Browse files

Added rules to re-genotype a set of given SNVs (e.g. from 1000G) and to do...

Added rules to re-genotype a set of given SNVs (e.g. from 1000G) and to do haplotagging of BAM files. Slightly reorganized input file structure: now expect two separate folders "bam/{sample}/selected" and "bam/{sample}/all", where "selected" should contain the good Strand-seq libraries, where all should contain all libraries that can be used for SNV calling/genotyping.
parent cdb20b80
No related branches found
No related tags found
No related merge requests found
......@@ -15,6 +15,8 @@
"RPE1-WT" : "/g/korbel2/StrandSeq/20180606_RPE1-WT/regenSNVs_GRCh38sites_RPEWTstrandS_chr1-22plusX.filtered.vcf.gz"
},
"snv_sites_to_genotype": "",
"variable_bins" : {
"50000" : "utils/variable_bins.GRCh38.50kb.bed",
"100000" : "utils/variable_bins.GRCh38.100kb.bed"
......
import math
from collections import defaultdict
configfile: "Snake.config.json"
SAMPLE,BAM = glob_wildcards("bam/{sample}/{bam}.bam")
BAM_PER_SAMPLE = dict([(s,[]) for s in SAMPLE])
for i in range(len(SAMPLE)):
BAM_PER_SAMPLE[SAMPLE[i]].append(BAM[i])
print("Detected {} samples:".format(len(set(SAMPLE))))
for s in set(SAMPLE):
print(" {}:\t{} cells".format(s, len(BAM_PER_SAMPLE[s])))
SAMPLE,BAM = glob_wildcards("bam/{sample}/selected/{bam}.bam")
SAMPLES = sorted(set(SAMPLE))
BAM_PER_SAMPLE = defaultdict(list)
for sample,bam in zip(SAMPLE,BAM):
BAM_PER_SAMPLE[sample].append(bam)
ALLBAMS_PER_SAMPLE = defaultdict(list)
for sample in SAMPLES:
ALLBAMS_PER_SAMPLE[sample] = glob_wildcards("bam/{}/all/{{bam}}.bam".format(sample)).bam
print("Detected {} samples:".format(len(SAMPLES)))
for s in SAMPLES:
print(" {}:\t{} cells\t {} selected cells".format(s, len(ALLBAMS_PER_SAMPLE[s]), len(BAM_PER_SAMPLE[s])))
import os.path
......@@ -41,21 +51,22 @@ localrules:
rule all:
input:
expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLE, window = [50000, 100000, 200000, 500000]),
expand("plots/{sample}/{window}_fixed_norm.pdf", sample = SAMPLE, window = [50000, 100000, 200000]),
expand("plots/{sample}/{window}_fixed.pdf", sample = SAMPLES, window = [50000, 100000, 200000, 500000]),
expand("plots/{sample}/{window}_fixed_norm.pdf", sample = SAMPLES, window = [50000, 100000, 200000]),
expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_calls/{method}.{chrom}.pdf",
sample = SAMPLE,
chrom = config["chromosomes"],
window = [50000, 100000],
bpdens = ["few","medium","many"],
method = METHODS),
expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLE, chrom = config["chromosomes"]),
expand("ploidy/{sample}/ploidy.{chrom}.txt", sample = SAMPLES, chrom = config["chromosomes"]),
expand("sv_calls/{sample}/{window}_fixed_norm.{bpdens}/plots/sv_consistency/{method}.consistency-barplot-{af}.pdf",
sample = SAMPLE,
sample = SAMPLES,
window = [50000, 100000],
bpdens = ["few","medium","many"],
method = METHODS,
af = ["high","med","low","rare"]),
['haplotag/bam/{}/{}.bam'.format(sample,bam) for bam in BAM_PER_SAMPLE[sample] for sample in SAMPLES],
################################################################################
......@@ -288,7 +299,7 @@ rule generate_exclude_file_1:
output:
temp("log/exclude_file.temp")
input:
bam = expand("bam/{sample}/{bam}.bam", sample = SAMPLE[0], bam = BAM[0])
bam = expand("bam/{sample}/selected/{bam}.bam", sample = SAMPLES[0], bam = ALLBAMS_PER_SAMPLE[SAMPLES[0]][0])
log:
"log/generate_exclude_file_1.log"
params:
......@@ -317,8 +328,8 @@ rule generate_exclude_file_2:
rule mosaic_count_fixed:
input:
bam = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
bai = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
bam = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
bai = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
excl = "log/exclude_file"
output:
counts = "counts/{sample}/{window}_fixed.txt.gz",
......@@ -342,8 +353,8 @@ rule mosaic_count_fixed:
rule mosaic_count_variable:
input:
bam = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]),
bai = lambda wc: expand("bam/" + wc.sample + "/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]),
bam = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]),
bai = lambda wc: expand("bam/" + wc.sample + "/selected/{bam}.bam.bai", bam = BAM_PER_SAMPLE[wc.sample]),
bed = lambda wc: config["variable_bins"][str(wc.window)],
excl = "log/exclude_file"
output:
......@@ -567,7 +578,10 @@ rule prepare_strandphaser_config_per_chrom:
def locate_snv_vcf(wildcards):
if "snv_calls" not in config or wildcards.sample not in config["snv_calls"] or config["snv_calls"][wildcards.sample] == "":
return "snv_calls/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
if "snv_sites_to_genotype" in config and config["snv_sites_to_genotype"] != "":
return "snv_genotyping/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
else:
return "snv_calls/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
else:
return "external_snv_calls/{}/{}.vcf".format(wildcards.sample, wildcards.chrom)
......@@ -577,9 +591,10 @@ rule run_strandphaser_per_chrom:
snppositions = locate_snv_vcf,
configfile = "strand_states/{sample}/StrandPhaseR.{chrom}.config",
strandphaser = "utils/R-packages/StrandPhaseR/R/StrandPhaseR",
bamfolder = "bam/{sample}/"
bamfolder = "bam/{sample}/selected"
output:
"strand_states/{sample}/StrandPhaseR_analysis.{chrom}/Phased/phased_haps.txt"
"strand_states/{sample}/StrandPhaseR_analysis.{chrom}/Phased/phased_haps.txt",
"strand_states/{sample}/StrandPhaseR_analysis.{chrom}/VCFfiles/{chrom}_phased.vcf",
log:
"log/run_strandphaser_per_chrom/{sample}/{chrom}.log"
shell:
......@@ -594,6 +609,16 @@ rule run_strandphaser_per_chrom:
> {log} 2>&1
"""
rule merge_strandphaser_vcfs:
input:
vcf=expand("strand_states/{{sample}}/StrandPhaseR_analysis.{chrom}/VCFfiles/{chrom}_phased.vcf", chrom=config["chromosomes"])
output:
vcf='phased-snvs/{sample}.vcf.gz'
log:
"log/merge_strandphaser_vcfs/{sample}.log"
shell:
"(bcftools concat -a | bcftools view -o {output.vcf} -O z --genotype het --types snps - ) > {log} 2>&1"
rule combine_strandphaser_output:
......@@ -625,6 +650,21 @@ rule convert_strandphaser_output:
"utils/helper.convert_strandphaser_output.R"
################################################################################
# Haplotagging #
################################################################################
rule haplotag_bams:
input:
vcf='phased-snvs/{sample}.vcf.gz',
bam='bam/{sample}/selected/{bam}.bam',
ref = config["reference"],
output:
bam='haplotag/bam/{sample}/{bam}.bam',
log:
"log/haplotag_bams/{sample}.log"
shell:
"whatshap haplotag -o {output.bam} -r {input.ref} {input.vcf} {input.bam} > {log} 2>{log}"
################################################################################
# Call SNVs #
......@@ -632,7 +672,7 @@ rule convert_strandphaser_output:
rule mergeBams:
input:
lambda wc: expand("bam/" + wc.sample + "/{bam}.bam", bam = BAM_PER_SAMPLE[wc.sample]) if wc.sample in BAM_PER_SAMPLE else "FOOBAR",
lambda wc: expand("bam/" + wc.sample + "/all/{bam}.bam", bam = ALLBAMS_PER_SAMPLE[wc.sample]) if wc.sample in ALLBAMS_PER_SAMPLE else "FOOBAR",
output:
"snv_calls/{sample}/merged.bam"
log:
......@@ -657,7 +697,7 @@ rule call_SNVs_bcftools_chrom:
bam = "snv_calls/{sample}/merged.bam",
bai = "snv_calls/{sample}/merged.bam.bai"
output:
"snv_calls/{sample}/{chrom}.vcf"
"snv_calls/{sample}/{chrom,chr[0-9A-Z]+}.vcf"
log:
"log/call_SNVs_bcftools_chrom/{sample}/{chrom}.log"
params:
......@@ -670,6 +710,23 @@ rule call_SNVs_bcftools_chrom:
| {params.bcftools} call -mv - | {params.bcftools} view --genotype het --types snps - > {output} 2> {log}
"""
rule regenotype_SNVs:
input:
bam = "snv_calls/{sample}/merged.bam",
bai = "snv_calls/{sample}/merged.bam.bai",
fa = config["reference"],
sites = config["snv_sites_to_genotype"],
output:
vcf = "snv_genotyping/{sample}/{chrom,chr[0-9A-Z]+}.vcf"
log:
"log/snv_genotyping/{sample}/{chrom}.log"
params:
bcftools = config["bcftools"]
shell:
"""
(freebayes -f {input.fa} -r {wildcards.chrom} -@ {input.sites} --only-use-input-alleles {input.bam} --genotype-qualities | {params.bcftools} view --exclude-uncalled --genotype het --types snps --include "QUAL>=10" - > {output.vcf}) 2> {log}
"""
rule merge_SNV_calls:
input:
expand("snv_calls/{{sample}}/{chrom}.vcf", chrom = config['chromosomes'])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment