From d8d9414242bc170171b64c2b45d40656fccf02e3 Mon Sep 17 00:00:00 2001 From: Tobias Marschall <tobias.marschall@0ohm.net> Date: Thu, 1 Nov 2018 12:36:35 +0100 Subject: [PATCH] Further update by Jan to filtering/merging --- utils/filter_MosaiCatcher_calls.pl | 4 +- ...ls_of_same_AF_and_generate_output_table.pl | 38 ++++++++++++++----- 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/utils/filter_MosaiCatcher_calls.pl b/utils/filter_MosaiCatcher_calls.pl index 506b8d3..fd8efe5 100755 --- a/utils/filter_MosaiCatcher_calls.pl +++ b/utils/filter_MosaiCatcher_calls.pl @@ -3,7 +3,7 @@ use strict; my $min_N_inv = 3; my $min_WC = 1/3; -my $safe_llr_to_ref = 50; +my $safe_llr_to_ref = 30; my $SegDup_file = "./utils/segdups/segDups_hg38_UCSCtrack.bed.gz"; my $MaxSegDup_overlap = 0.5; @@ -20,7 +20,7 @@ if (!$ARGV[0]) { print STDERR "Filters deletions seen in not more than $min_WC WC chromosomes\n"; print STDERR "Filters duplications seen in not more than $min_WC WC chromosomes (but gives inv-dups seen in such context a PASS)\n"; print STDERR "Del and Dup events with llr_to_ref>=$safe_llr_to_ref will never be masked\n"; - printf STDERR "uses SegDup file $SegDup_file and removes all Dels overlapping with SegDups by >%3.1f%\n", $MaxSegDup_overlap*100; + printf STDERR "uses SegDup file $SegDup_file and removes all Dels overlapping with SegDups by >%4.1f percent \n", $MaxSegDup_overlap*100; exit; } diff --git a/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl b/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl index d316ebf..bc349e0 100755 --- a/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl +++ b/utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl @@ -2,10 +2,10 @@ use strict; -my $min_AF_for_grouping =0.1; +my $min_AF_for_grouping =0.03; my $AF_simiarity_threshold = 0.25; my $input_file = $ARGV[0]; - +my $safe_llr_to_ref = 30; if (!$ARGV[0]) { print STDERR "inputfile missing\n"; print STDERR "-> cluster into groups by chromosome, if similar AF (relative threshold=$AF_simiarity_threshold), if same SV event, if SVs are directly adjacent, and if at least one cell shared\n"; @@ -14,7 +14,7 @@ if (!$ARGV[0]) { } #--- read input file -my (%MAIN_HAP, %Sv_call_haplotype, %Num_bins, %Group_ID, %STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE, %Llr_to_2nds, %Llr_to_refs); +my (%MAIN_HAP, %PASS_FAIL, %Sv_call_haplotype, %Num_bins, %Group_ID, %STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE, %Llr_to_2nds, %Llr_to_refs); print STDERR "Reading $input_file...\n"; open FH, "$input_file" or die; while (<FH>) { @@ -22,13 +22,14 @@ while (<FH>) { next if ($_ =~ /^chrom/); #ignore first line my ($chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail) = split (/[\t ]+/, $_); #split by TAB/whitespace #print STDERR "$chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail\n"; - next unless ($passfail =~ /PASS/); #ignore FAIL + #next unless ($passfail =~ /PASS/); #ignore FAIL unless ($SEEN{$chrom}{$start}) { push (@{$STARTs{$chrom}}, $start); push (@{$ENDs{$chrom}}, $end); $SEEN{$chrom}{$start}=1; } - push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name); + push (@{$PASS_FAIL{$chrom}{$start}}, $passfail); + push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name); push (@{$CELLs{$chrom}{$start}}, $cell); push (@{$AFs{$chrom}{$start}}, $af); push (@{$Llr_to_refs{$chrom}{$start}}, $llr_to_ref); @@ -124,10 +125,16 @@ foreach my $chrom (sort keys %STARTs) { if (exists ($Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { #next if ($seen{$Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}); $last_group_ID = $Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}; - my (@myStarts, @myEnds); + my (@myStarts, @myEnds, @myPassFail, $myLlr_to_refs); my $segments = "$chrom:$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]"; push (@myStarts, $STARTs{$chrom}[$i]); push (@myEnds, $ENDs{$chrom}[$i]); + my $new_passfail; + #print STDERR "tst\n"; + #print STDERR "---> $PASS_FAIL{$chrom}[$i][0]\n"; + #print STDERR "---> @{$PASS_FAIL{$chrom}[$i]}\n"; + ($new_passfail) = sort {$b cmp $a} (@{$PASS_FAIL{$chrom}{$STARTs{$chrom}[$i]}}); + push (@myPassFail, $new_passfail); my $end =0; my $y=$i; #-- prepare consensus report for merged calls @@ -135,7 +142,7 @@ foreach my $chrom (sort keys %STARTs) { my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report my ($num_bins) = sort {$b <=> $a} @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report - + $myLlr_to_refs+= $llr_to_ref; #add up log likelihoods while (!$end) { #get other events falling onto same group ID $y++; #increment if ($y==@{$STARTs{$chrom}}) { @@ -152,22 +159,35 @@ foreach my $chrom (sort keys %STARTs) { } push (@myStarts, $STARTs{$chrom}[$y]); push (@myEnds, $ENDs{$chrom}[$y]); + my $new_passfail; + ($new_passfail) = sort {$b cmp $a} @{$PASS_FAIL{$chrom}{$STARTs{$chrom}[$y]}}; + push (@myPassFail, $new_passfail); + $segments .= "|$chrom:$STARTs{$chrom}[$y]-$ENDs{$chrom}[$y]"; ($af) = sort {$b <=> $a} ($af, @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}); #use maximum in this consensus report ($llr_to_ref) = sort {$b <=> $a} ($llr_to_ref, @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report ($llr_to_2nd) = sort {$b <=> $a} ($llr_to_2nd, @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report ($num_bins) = sort {$b <=> $a} ($num_bins, @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report - + $myLlr_to_refs+= $llr_to_ref; #add up log likelihoods } else { $y--; $i=$y; #jump forward $end=1; } } - print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]}, $MAIN_HAP{$chrom}{$myStarts[0]}, $llr_to_ref, $llr_to_2nd, $af, \[Group\_$Group_ID{$chrom}{$myStarts[0]}{$myEnds[0]}\/$segments\]\n"; + #--> check here for PASS-FAIL in Groups using @myPassFail + my $GroupPassFail; + ($GroupPassFail) = sort {$b cmp $a} @myPassFail; + unless ($GroupPassFail =~ /PASS/) { + next unless ($myLlr_to_refs >= $safe_llr_to_ref); + } + print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]}, $MAIN_HAP{$chrom}{$myStarts[0]}, $myLlr_to_refs, N/A, $af, \[Group\_$Group_ID{$chrom}{$myStarts[0]}{$myEnds[0]}\/$segments\]\n"; #print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]]}\n"; } else { #-- prepare consensus report for singlish calls (calls not falling into a merge-group) + my $passfail_single; + ($passfail_single) = sort {$a cmp $b} @{$PASS_FAIL{$chrom}{$STARTs{$chrom}[$i]}}; #remove singlish calls with a FAIL (introduced 31/10) + next unless ($passfail_single =~ /PASS/); my ($af) = sort {$b <=> $a} @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}; #use maximum in this consensus report my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report -- GitLab