From d8d9414242bc170171b64c2b45d40656fccf02e3 Mon Sep 17 00:00:00 2001
From: Tobias Marschall <tobias.marschall@0ohm.net>
Date: Thu, 1 Nov 2018 12:36:35 +0100
Subject: [PATCH] Further update by Jan to filtering/merging

---
 utils/filter_MosaiCatcher_calls.pl            |  4 +-
 ...ls_of_same_AF_and_generate_output_table.pl | 38 ++++++++++++++-----
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/utils/filter_MosaiCatcher_calls.pl b/utils/filter_MosaiCatcher_calls.pl
index 506b8d3..fd8efe5 100755
--- a/utils/filter_MosaiCatcher_calls.pl
+++ b/utils/filter_MosaiCatcher_calls.pl
@@ -3,7 +3,7 @@ use strict;
 
 my $min_N_inv = 3;
 my $min_WC = 1/3;
-my $safe_llr_to_ref = 50;
+my $safe_llr_to_ref = 30;
 my $SegDup_file = "./utils/segdups/segDups_hg38_UCSCtrack.bed.gz";
 my $MaxSegDup_overlap = 0.5;
 
@@ -20,7 +20,7 @@ if (!$ARGV[0]) {
 	print STDERR "Filters deletions seen in not more than $min_WC WC chromosomes\n";
 	print STDERR "Filters duplications seen in not more than $min_WC WC chromosomes (but gives inv-dups seen in such context a PASS)\n";
 	print STDERR "Del and Dup events with llr_to_ref>=$safe_llr_to_ref will never be masked\n";
-	printf STDERR "uses SegDup file $SegDup_file and removes all Dels overlapping with SegDups by >%3.1f%\n", $MaxSegDup_overlap*100;
+	printf STDERR "uses SegDup file $SegDup_file and removes all Dels overlapping with SegDups by >%4.1f percent \n", $MaxSegDup_overlap*100;
 	exit;
 }
 
diff --git a/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl b/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl
index d316ebf..bc349e0 100755
--- a/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl
+++ b/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl
@@ -2,10 +2,10 @@
 use strict;
 
 
-my $min_AF_for_grouping =0.1;
+my $min_AF_for_grouping =0.03;
 my $AF_simiarity_threshold = 0.25;
 my $input_file = $ARGV[0];
-
+my $safe_llr_to_ref = 30;
 if (!$ARGV[0]) {
 	print STDERR "inputfile missing\n";
 	print STDERR "-> cluster into groups by chromosome, if similar AF (relative threshold=$AF_simiarity_threshold), if same SV event, if SVs are directly adjacent, and if at least one cell shared\n";
@@ -14,7 +14,7 @@ if (!$ARGV[0]) {
 }
 
 #--- read input file
-my (%MAIN_HAP, %Sv_call_haplotype, %Num_bins, %Group_ID, %STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE, %Llr_to_2nds, %Llr_to_refs);
+my (%MAIN_HAP, %PASS_FAIL, %Sv_call_haplotype, %Num_bins, %Group_ID, %STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE, %Llr_to_2nds, %Llr_to_refs);
 print STDERR "Reading $input_file...\n";
 open FH, "$input_file" or die;
 while (<FH>) {
@@ -22,13 +22,14 @@ while (<FH>) {
         next if ($_ =~ /^chrom/); #ignore first line
         my ($chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail) = split (/[\t ]+/, $_); #split by TAB/whitespace
 	#print STDERR "$chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail\n";
-	next unless ($passfail =~ /PASS/); #ignore FAIL
+	#next unless ($passfail =~ /PASS/); #ignore FAIL
 	unless ($SEEN{$chrom}{$start}) {
 		push (@{$STARTs{$chrom}}, $start);
         	push (@{$ENDs{$chrom}}, $end);
 		$SEEN{$chrom}{$start}=1;
 	}
-        push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name);
+        push (@{$PASS_FAIL{$chrom}{$start}}, $passfail); 
+	push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name);
 	push (@{$CELLs{$chrom}{$start}}, $cell);
 	push (@{$AFs{$chrom}{$start}}, $af);
 	push (@{$Llr_to_refs{$chrom}{$start}}, $llr_to_ref);
@@ -124,10 +125,16 @@ foreach my $chrom (sort keys %STARTs) {
 		if (exists ($Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) {
 			#next if ($seen{$Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}});
 			$last_group_ID = $Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]};
-			my (@myStarts, @myEnds);
+			my (@myStarts, @myEnds, @myPassFail, $myLlr_to_refs);
 			my $segments = "$chrom:$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]";
 			push (@myStarts, $STARTs{$chrom}[$i]); 
 			push (@myEnds, $ENDs{$chrom}[$i]);
+			my $new_passfail;
+			#print STDERR "tst\n";
+			#print STDERR "---> $PASS_FAIL{$chrom}[$i][0]\n";
+			#print STDERR "---> @{$PASS_FAIL{$chrom}[$i]}\n";
+			($new_passfail) = sort {$b cmp $a} (@{$PASS_FAIL{$chrom}{$STARTs{$chrom}[$i]}});
+			push (@myPassFail, $new_passfail);
 			my $end =0;
 			my $y=$i;
 				#-- prepare consensus report for merged calls
@@ -135,7 +142,7 @@ foreach my $chrom (sort keys %STARTs) {
                         my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
                         my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
                         my ($num_bins) = sort {$b <=> $a} @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
-
+			$myLlr_to_refs+= $llr_to_ref; #add up log likelihoods
 			while (!$end) { #get other events falling onto same group ID
 				$y++; #increment
 				if ($y==@{$STARTs{$chrom}}) {
@@ -152,22 +159,35 @@ foreach my $chrom (sort keys %STARTs) {
 					}
 					push (@myStarts, $STARTs{$chrom}[$y]);                                         
                         		push (@myEnds, $ENDs{$chrom}[$y]);
+					my $new_passfail;
+                        		($new_passfail) = sort {$b cmp $a} @{$PASS_FAIL{$chrom}{$STARTs{$chrom}[$y]}};
+                        		push (@myPassFail, $new_passfail);
+					
 					$segments .= "|$chrom:$STARTs{$chrom}[$y]-$ENDs{$chrom}[$y]";
 					($af) = sort {$b <=> $a} ($af, @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}); #use maximum in this consensus report
 		                        ($llr_to_ref) = sort {$b <=> $a} ($llr_to_ref, @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
                		                ($llr_to_2nd) = sort {$b <=> $a} ($llr_to_2nd, @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
                                         ($num_bins) = sort {$b <=> $a} ($num_bins, @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
-
+					$myLlr_to_refs+= $llr_to_ref; #add up log likelihoods
 				} else {
 					$y--;
 					$i=$y; #jump forward
 					$end=1;
 				}
 			}
-			print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]}, $MAIN_HAP{$chrom}{$myStarts[0]}, $llr_to_ref, $llr_to_2nd, $af, \[Group\_$Group_ID{$chrom}{$myStarts[0]}{$myEnds[0]}\/$segments\]\n";
+			#--> check here for PASS-FAIL in Groups using @myPassFail
+			my $GroupPassFail;
+			($GroupPassFail) = sort {$b cmp $a} @myPassFail;
+			unless ($GroupPassFail =~ /PASS/) {
+				next unless ($myLlr_to_refs >= $safe_llr_to_ref);
+			}
+			print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]}, $MAIN_HAP{$chrom}{$myStarts[0]}, $myLlr_to_refs, N/A, $af, \[Group\_$Group_ID{$chrom}{$myStarts[0]}{$myEnds[0]}\/$segments\]\n";
 			#print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]]}\n";	
 		} else {
 			#-- prepare consensus report for singlish calls (calls not falling into a merge-group)
+			my $passfail_single;
+			($passfail_single) = sort {$a cmp $b} @{$PASS_FAIL{$chrom}{$STARTs{$chrom}[$i]}}; #remove singlish calls with a FAIL (introduced 31/10)
+			next unless ($passfail_single =~ /PASS/); 
 			my ($af) = sort {$b <=> $a} @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}; #use maximum in this consensus report
 			my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
 			my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
-- 
GitLab