Skip to content
Snippets Groups Projects
Commit ba94e52d authored by Tobias Marschall's avatar Tobias Marschall
Browse files

Added Jan's script for post-filtering of SV calls

parent ff357ff1
No related branches found
No related tags found
No related merge requests found
......@@ -33,20 +33,34 @@ import os.path
METHODS = [
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.005_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.01_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.02_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.03_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.04_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.05_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.005_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.01_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.02_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.03_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.04_regfactor6",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.05_regfactor6",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0_regfactor6_filterFALSE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.005_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.01_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.02_regfactor6_filterFALSE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.03_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.04_regfactor6_filterFALSE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.05_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0_regfactor6_filterFALSE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.005_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.01_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.02_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.03_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.04_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.05_regfactor6_filterFALSE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0_regfactor6_filterTRUE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.005_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.01_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.02_regfactor6_filterTRUE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.03_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.04_regfactor6_filterTRUE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsFALSE_gtcutoff0.05_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0_regfactor6_filterTRUE",
"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.005_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.01_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.02_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.03_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.04_regfactor6_filterTRUE",
#"simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.05_regfactor6_filterTRUE",
]
BPDENS = [
......@@ -563,7 +577,7 @@ rule mosaiClassifier_make_call:
input:
probs = 'haplotag/table/{sample}/haplotag-likelihoods.{window}_fixed_norm.{bpdens}.Rdata'
output:
"sv_calls/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.txt"
"sv_calls/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}_filterFALSE.txt"
params:
minFrac_used_bins = 0.8
log:
......@@ -571,6 +585,15 @@ rule mosaiClassifier_make_call:
script:
"utils/mosaiClassifier_call.snakemake.R"
rule filter_calls:
input:
calls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens}/simpleCalls_llr{llr}_poppriors{pop_priors}_haplotags{use_haplotags}_gtcutoff{gtcutoff}_regfactor{regfactor}_filterFALSE.txt"
output:
calls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}_filterTRUE.txt"
shell:
'utils/filter_MosaiCatcher_calls.pl {input.calls} | awk \'BEGIN {{OFS="\\t"}} $15!="FAIL" {{$15=""; print}}\' > {output.calls}'
rule mosaiClassifier_calc_probs:
input:
counts = "counts/{sample}/{windows}.txt.gz",
......
#!/usr/bin/perl -w
use strict;
my $min_N_inv = 3;
#Use input file with format "chrom start end sample cell class scalar num_bins sv_call_name sv_call_haplotype sv_call_name_2nd sv_call_haplotype_2nd llr_to_ref llr_to_2nd"
#
# e.g.
#
my $input_file = $ARGV[0];
if (!$ARGV[0]) {
print STDERR "Input filname missinf=g (use e.g. sv_calls_txt_file_all/RPE1-WT/100000_fixed_norm.selected_j0.01_s0.1/simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.05_regfactor6.txt )\n";
print STDERR "Filters inversions unless they are seen at least $min_N_inv times.\n";
print STDERR "Filters deletions only seen in WW and CC chromosomes\n";
print STDERR "Filters duplications only seen in WW and CC chromosomes (but gives inv-dups seen in such context a PASS)\n";
exit;
}
print STDERR "Processing $input_file...\n";
#--- read input file
my (%STARTs, %ENDs, %SV_TYPEs, %STRAND_STATESs, %FILTER, %REPORTs);
open FH, "$input_file" or die;
while (<FH>) {
chomp;
next if ($_ =~ /^chrom/); #ignore first line
my ($chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd) = split (/\t/, $_); #split by TAB
push (@{$STARTs{$chrom}}, $start);
push (@{$ENDs{$chrom}}, $end);
push (@{$SV_TYPEs{$chrom}}, $sv_call_name);
push (@{$STRAND_STATESs{$chrom}}, $strand_state_class);
push (@{$REPORTs{$chrom}}, $_);
}
close FH;
#--- 1. filter DEL calls only seen on WW or CC chromosomes
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
next unless ($SV_TYPEs{$chrom}[$i] =~ /del_h[12]/); #only look at het-del calls
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
#print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n";
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW');
}
}
#--- 2. filter INV calls seen less than Min_N_inv times
foreach my $chrom (sort keys %STARTs) {
my %inv_recurrence=();
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
next unless ($SV_TYPEs{$chrom}[$i] =~ /inv_h[12]/); #only look at het-inv calls
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
#print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n";
$inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}++;
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $min_N_inv);
}
}
#--- 3. filter DUP calls only seen on WW or CC chromosomes
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
next unless ($SV_TYPEs{$chrom}[$i] =~ /dup_h[12]/); #only look at het-dup calls
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}) or $SV_TYPEs{$chrom}[$i] =~ /idup_h[12]/); #initialize
#print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n";
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW');
}
}
#--- output file with filters
print "chrom\tstart\tend\tsample\tcell\tstrand_state_class\tscalar\tnum_bins\tsv_call_name\tsv_call_haplotype\tsv_call_name_2nd\tsv_call_haplotype_2nd\tllr_to_ref\tllr_to_2nd\tpass/fail\n";
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'PASS' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
print "$REPORTs{$chrom}[$i]\t", $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}, "\n";
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment