From 4c5619d4968c8a99cf5be65813e22d95657616f4 Mon Sep 17 00:00:00 2001
From: Tobias Marschall <tobias.marschall@0ohm.net>
Date: Tue, 9 Oct 2018 20:05:23 +0200
Subject: [PATCH] Update on filter/merge scripts sent by Jan today.

---
 utils/filter_MosaiCatcher_calls.pl            | 28 ++++++++++++++++++-
 ...ls_of_same_AF_and_generate_output_table.pl |  3 ++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/utils/filter_MosaiCatcher_calls.pl b/utils/filter_MosaiCatcher_calls.pl
index 849bc89..e2687c4 100755
--- a/utils/filter_MosaiCatcher_calls.pl
+++ b/utils/filter_MosaiCatcher_calls.pl
@@ -3,7 +3,7 @@ use strict;
 
 my $min_N_inv = 3;
 my $min_WC = 0.2;
-my $safe_llr_to_ref = 20;
+my $safe_llr_to_ref = 50;
 my $SegDup_file = "./utils/segdups/segDups_hg38_UCSCtrack.bed.gz";
 my $MaxSegDup_overlap = 0.5;
 
@@ -118,6 +118,10 @@ close FH_1;
 #-- 5. filter Del events overlapping SegDups to much (based on defined value)
 print STDERR "Testing for overlap with SegDups...\n";
 #my %SEGDUPOV;
+my $UCSC_artefact_chr="chr2";
+my $UCSC_artefact_start=90400000;
+my $UCSC_artefact_end=91400000;
+
 foreach my $chrom (sort keys %STARTs) {
 	my %Seen;
         for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {	
@@ -151,6 +155,28 @@ foreach my $chrom (sort keys %STARTs) {
 		$overlap/=length($overlap_string);
 		#$SEGDUPOV{$chrom}{$stD}{$enD}=$overlap;
 		$FILTER{$chrom}{$stD}{$enD}= sprintf ("FAIL(SegDup:%4.2f)", $overlap) if ($overlap > $MaxSegDup_overlap);		
+		#------------------------------------------------------------
+		#-- remove build38 / ucsc 'hole' region in annotated assembly
+		#------------------------------------------------------------
+		next unless ($chrom eq $UCSC_artefact_chr);
+		my $UCSC_exception=0;
+		if (($stD<=$UCSC_artefact_start) && ($UCSC_artefact_start<=$enD)) {
+                                my ($EndPos) = sort {$a<=>$b} ($enD, $UCSC_artefact_end); #sort smaller EndVal
+                                my $DupOvL = $EndPos-$UCSC_artefact_start+1;
+                                substr ($overlap_string, $UCSC_artefact_start-$stD+1, $DupOvL) = sprintf "1"x$DupOvL; #fill in "1" for each overlapping base
+                		$UCSC_exception=1; 
+		} elsif (($UCSC_artefact_start<=$stD) && ($stD<=$UCSC_artefact_end)) {
+                       my ($EndPos) = sort {$a<=>$b} ($enD, $UCSC_artefact_end); #sort smaller EndVal
+                       my $DupOvL = $EndPos-$stD+1;
+                       substr ($overlap_string, 0, $DupOvL) = sprintf "1"x$DupOvL; #fill in "1" for each overlapping base
+                	$UCSC_exception = 1;
+		}
+		next unless ($UCSC_exception);
+		for (my $M=0; $M<length($overlap_string); $M++) {
+                     $overlap+=1 if (substr($overlap_string, $M, 1) eq "1");
+                }
+		$overlap/=length($overlap_string);
+		$FILTER{$chrom}{$stD}{$enD}= sprintf ("FAIL(SegD/UCSC:%4.2f)", $overlap) if ($overlap > $MaxSegDup_overlap);
 	}
 }
 
diff --git a/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl b/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl
index 9b7951a..d316ebf 100755
--- a/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl
+++ b/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl
@@ -2,12 +2,14 @@
 use strict;
 
 
+my $min_AF_for_grouping =0.1;
 my $AF_simiarity_threshold = 0.25;
 my $input_file = $ARGV[0];
 
 if (!$ARGV[0]) {
 	print STDERR "inputfile missing\n";
 	print STDERR "-> cluster into groups by chromosome, if similar AF (relative threshold=$AF_simiarity_threshold), if same SV event, if SVs are directly adjacent, and if at least one cell shared\n";
+	print STDERR "Ignores SVs with VAF<$min_AF_for_grouping, which are never grouped\n";
 	die;
 }
 
@@ -78,6 +80,7 @@ my $group_name = 0;
 foreach my $chrom (sort keys %STARTs) {
 	my $previous_shared = 0;
         for (my $i=0; $i<@{$STARTs{$chrom}}-1; $i++) {
+		next if ($averaged_AF{$chrom}{$STARTs{$chrom}[$i]} < $min_AF_for_grouping); #ignore events with too low VAF
 		unless ($ENDs{$chrom}[$i] == $STARTs{$chrom}[$i+1]) { #only consider events that are directly adjacent
 			$previous_shared=0;#initialize
 			next;
-- 
GitLab