From 268d8d55b953c983a6ea974d261c56227f55f1d1 Mon Sep 17 00:00:00 2001
From: Tobias Marschall <tobias.marschall@0ohm.net>
Date: Sat, 29 Sep 2018 11:13:13 +0200
Subject: [PATCH] Add AF to SV call tables

---
 Snakefile                           |  2 +-
 utils/filter_MosaiCatcher_calls.pl  |  6 +++---
 utils/mosaiClassifier/makeSVcalls.R | 11 ++++++++---
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/Snakefile b/Snakefile
index 10934a9..d4579d5 100644
--- a/Snakefile
+++ b/Snakefile
@@ -591,7 +591,7 @@ rule filter_calls:
     output: 
         calls = "sv_calls/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}_filterTRUE.txt"
     shell:
-        'utils/filter_MosaiCatcher_calls.pl {input.calls} | awk \'BEGIN {{OFS="\\t"}} $15!="FAIL" {{$15=""; print}}\' > {output.calls}'
+        'utils/filter_MosaiCatcher_calls.pl {input.calls} | awk \'BEGIN {{OFS="\\t"}} $16!="FAIL" {{$16=""; print}}\' > {output.calls}'
 
 
 rule mosaiClassifier_calc_probs:
diff --git a/utils/filter_MosaiCatcher_calls.pl b/utils/filter_MosaiCatcher_calls.pl
index bd2a95f..e7e0957 100755
--- a/utils/filter_MosaiCatcher_calls.pl
+++ b/utils/filter_MosaiCatcher_calls.pl
@@ -3,7 +3,7 @@ use strict;
 
 my $min_N_inv = 3;
 
-#Use input file with format "chrom	start	end	sample	cell	class	scalar	num_bins	sv_call_name	sv_call_haplotype	sv_call_name_2nd	sv_call_haplotype_2nd	llr_to_ref	llr_to_2nd"
+#Use input file with format "chrom	start	end	sample	cell	class	scalar	num_bins	sv_call_name	sv_call_haplotype	sv_call_name_2nd	sv_call_haplotype_2nd	llr_to_ref	llr_to_2nd	af"
 #
 # e.g. 
 # 
@@ -27,7 +27,7 @@ open FH, "$input_file" or die;
 while (<FH>) {
 	chomp;
 	next if ($_ =~ /^chrom/); #ignore first line
-	my ($chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd) = split (/\t/, $_); #split by TAB
+	my ($chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af) = split (/\t/, $_); #split by TAB
 	push (@{$STARTs{$chrom}}, $start);
 	push (@{$ENDs{$chrom}}, $end);
 	push (@{$SV_TYPEs{$chrom}}, $sv_call_name);
@@ -74,7 +74,7 @@ foreach my $chrom (sort keys %STARTs) {
 
 
 #--- output file with filters
-print "chrom\tstart\tend\tsample\tcell\tstrand_state_class\tscalar\tnum_bins\tsv_call_name\tsv_call_haplotype\tsv_call_name_2nd\tsv_call_haplotype_2nd\tllr_to_ref\tllr_to_2nd\tpass/fail\n";
+print "chrom\tstart\tend\tsample\tcell\tstrand_state_class\tscalar\tnum_bins\tsv_call_name\tsv_call_haplotype\tsv_call_name_2nd\tsv_call_haplotype_2nd\tllr_to_ref\tllr_to_2nd\taf\tpass/fail\n";
 foreach my $chrom (sort keys %STARTs) {
         for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
 		$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'PASS' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
diff --git a/utils/mosaiClassifier/makeSVcalls.R b/utils/mosaiClassifier/makeSVcalls.R
index af32890..e374b82 100644
--- a/utils/mosaiClassifier/makeSVcalls.R
+++ b/utils/mosaiClassifier/makeSVcalls.R
@@ -77,11 +77,16 @@ makeSVCallSimple <- function(probs, llr_thr = 1, use.pop.priors = FALSE, use.hap
        by = .(chrom, start, end, sample, cell)]
   probs <- probs[rank == 1]
 
+  # compute the number of cells (used below to compute AFs)
+  ncells = length(unique(probs[,cell]))
 
-  # Clean up table
-  probs <- probs[, .(chrom, start, end, sample, cell, class, scalar, num_bins, sv_call_name, sv_call_haplotype, sv_call_name_2nd, sv_call_haplotype_2nd, llr_to_ref, llr_to_2nd)]
+  # select calls to retain and clean up table by removing unnecessary columns
+  probs <- probs[sv_call_name != "ref_hom" & llr_to_ref > llr_thr, .(chrom, start, end, sample, cell, class, scalar, num_bins, sv_call_name, sv_call_haplotype, sv_call_name_2nd, sv_call_haplotype_2nd, llr_to_ref, llr_to_2nd)]
 
-  return(probs[sv_call_name != "ref_hom" & llr_to_ref > llr_thr])
+  # compute allele frequencies
+  probs[, af := .N / ncells , by = .(chrom, start, end, sample, sv_call_name)]
+
+  return(probs)
 }
 
 forceBiallelic <- function(probs, penalize_factor = 0.1)
-- 
GitLab