diff --git a/utils/filter_MosaiCatcher_calls.pl b/utils/filter_MosaiCatcher_calls.pl index e7e095736f6036755e4a83611bb9ae17748ff88c..65449e9cd203a7f571b5fc98348f9f7755e5bb01 100755 --- a/utils/filter_MosaiCatcher_calls.pl +++ b/utils/filter_MosaiCatcher_calls.pl @@ -2,6 +2,8 @@ use strict; my $min_N_inv = 3; +my $min_WC = 0.2; +my $safe_llr_to_ref = 20; #Use input file with format "chrom start end sample cell class scalar num_bins sv_call_name sv_call_haplotype sv_call_name_2nd sv_call_haplotype_2nd llr_to_ref llr_to_2nd af" # @@ -13,8 +15,9 @@ my $input_file = $ARGV[0]; if (!$ARGV[0]) { print STDERR "Input filname missinf=g (use e.g. sv_calls_txt_file_all/RPE1-WT/100000_fixed_norm.selected_j0.01_s0.1/simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.05_regfactor6.txt )\n"; print STDERR "Filters inversions unless they are seen at least $min_N_inv times.\n"; - print STDERR "Filters deletions only seen in WW and CC chromosomes\n"; - print STDERR "Filters duplications only seen in WW and CC chromosomes (but gives inv-dups seen in such context a PASS)\n"; + print STDERR "Filters deletions seen in less then $min_WC WC chromosomes\n"; + print STDERR "Filters duplications seen in less then $min_WC WC chromosomes (but gives inv-dups seen in such context a PASS)\n"; + print STDERR "Del and Dup events with llr_to_ref>=$safe_llr_to_ref will never be masked\n"; exit; } @@ -22,7 +25,7 @@ print STDERR "Processing $input_file...\n"; #--- read input file -my (%STARTs, %ENDs, %SV_TYPEs, %STRAND_STATESs, %FILTER, %REPORTs); +my (%STARTs, %ENDs, %SV_TYPEs, %STRAND_STATESs, %FILTER, %REPORTs, %FILTER_ARR, %FILTER_ARR_DUP, %TESTED_DEL, %TESTED_DUP, %inv_recurrence, %LLR_TO_REF); open FH, "$input_file" or die; while (<FH>) { chomp; @@ -32,42 +35,58 @@ while (<FH>) { push (@{$ENDs{$chrom}}, $end); push (@{$SV_TYPEs{$chrom}}, $sv_call_name); push (@{$STRAND_STATESs{$chrom}}, $strand_state_class); + #$_ =~ /[\t ]*/\t/; push (@{$REPORTs{$chrom}}, $_); + + # Del and Dup with high llr will never be filtered. Record highest llr for event at certain position + $LLR_TO_REF{$chrom}{$start}{$end} = $llr_to_ref unless (exists ($LLR_TO_REF{$chrom}{$start}{$end})); + $llr_to_ref = 100 if ($llr_to_ref =~ /Inf/); + $LLR_TO_REF{$chrom}{$start}{$end} = $llr_to_ref if ($llr_to_ref > $LLR_TO_REF{$chrom}{$start}{$end}); } close FH; -#--- 1. filter DEL calls only seen on WW or CC chromosomes +#--- 1. filter DEL calls seen primarily on WW or CC chromosomes foreach my $chrom (sort keys %STARTs) { for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) { next unless ($SV_TYPEs{$chrom}[$i] =~ /del_h[12]/); #only look at het-del calls - $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize + #FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize #print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n"; - $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW'); + if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW') { + push (@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 1); + } else { + push (@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 0); + } + $TESTED_DEL{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}=1; } } #--- 2. filter INV calls seen less than Min_N_inv times foreach my $chrom (sort keys %STARTs) { - my %inv_recurrence=(); for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) { - next unless ($SV_TYPEs{$chrom}[$i] =~ /inv_h[12]/); #only look at het-inv calls - $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize + next unless ($SV_TYPEs{$chrom}[$i] =~ /inv_h[12o]/); #only look at het-inv or hom-inv calls + #$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize #print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n"; - $inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}++; - $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $min_N_inv); + $inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}++; + #$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $min_N_inv); } } -#--- 3. filter DUP calls only seen on WW or CC chromosomes +#--- 3. filter DUP calls seen primarily on WW or CC chromosomes foreach my $chrom (sort keys %STARTs) { for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) { - next unless ($SV_TYPEs{$chrom}[$i] =~ /dup_h[12]/); #only look at het-dup calls - $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}) or $SV_TYPEs{$chrom}[$i] =~ /idup_h[12]/); #initialize - #print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n"; - $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW'); + $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($SV_TYPEs{$chrom}[$i] =~ /^idup_h[12]/); #inverted duplications trigger 'pass' + next unless ($SV_TYPEs{$chrom}[$i] =~ /^dup_h[12]/); #look at het-dup calls + #$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW'); + + if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW') { + push (@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 1); + } else { + push (@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 0); + } + $TESTED_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}=1; } } @@ -77,7 +96,33 @@ foreach my $chrom (sort keys %STARTs) { print "chrom\tstart\tend\tsample\tcell\tstrand_state_class\tscalar\tnum_bins\tsv_call_name\tsv_call_haplotype\tsv_call_name_2nd\tsv_call_haplotype_2nd\tllr_to_ref\tllr_to_2nd\taf\tpass/fail\n"; foreach my $chrom (sort keys %STARTs) { for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) { - $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'PASS' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize + if (!exists ($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { + + + if (exists ($TESTED_DEL{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { #check DELs to identify events seen not largely in CC and WW chromosomes + my $iterate=0; + foreach my $val (@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}) { + $iterate+=$val/@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}; + } + + $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= sprintf ("FAIL(WC:%4.2f)", $iterate) if ($iterate < $min_WC); + $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= "PASS" if ($iterate >= $min_WC or $LLR_TO_REF{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $safe_llr_to_ref); + } + if (exists ($TESTED_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { #check DUP to identify events seen not largely in CC and WW chromosomes + my $iterate=0; + foreach my $val (@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}) { + $iterate+=$val/@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}; + } + $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= sprintf ("FAIL(WC:%4.2f)", $iterate) if ($iterate < $min_WC); + $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= "PASS" if ($iterate >= $min_WC or $LLR_TO_REF{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $safe_llr_to_ref); + } + } + unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { + $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'PASS'; + if (exists ($inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { + $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = sprintf ("FAIL(#inv:" . $inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} . ")") if ($inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} < $min_N_inv); + } + } print "$REPORTs{$chrom}[$i]\t", $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}, "\n"; } } diff --git a/utils/group_nearby_calls_of_same_AF.pl b/utils/group_nearby_calls_of_same_AF.pl new file mode 100755 index 0000000000000000000000000000000000000000..57e8f972bf51b1cd611a88dc2fbd88ad378abae9 --- /dev/null +++ b/utils/group_nearby_calls_of_same_AF.pl @@ -0,0 +1,103 @@ +#!/usr/bin/perl -w +use strict; + + +my $AF_simiarity_threshold = 0.25; +my $input_file = $ARGV[0]; + +if (!$ARGV[0]) { + print STDERR "inputfile missing\n"; + print STDERR "-> cluster into groups by chromosome, if similar AF (relative threshold=$AF_simiarity_threshold), if same SV event, if SVs are directly adjacent, and if at least one cell shared\n"; + die; +} + +#--- read input file +my (%STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE); +print STDERR "Reading $input_file...\n"; +open FH, "$input_file" or die; +while (<FH>) { + chomp; + next if ($_ =~ /^chrom/); #ignore first line + my ($chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail) = split (/[\t ]+/, $_); #split by TAB/whitespace + #print STDERR "$chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail\n"; + next unless ($passfail =~ /PASS/); #ignore FAIL + unless ($SEEN{$chrom}{$start}) { + push (@{$STARTs{$chrom}}, $start); + push (@{$ENDs{$chrom}}, $end); + $SEEN{$chrom}{$start}=1; + } + push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name); + push (@{$CELLs{$chrom}{$start}}, $cell); + push (@{$AFs{$chrom}{$start}}, $af); +} +%SEEN=(); #reinitialize +close FH; + + +#-- compute average AF per event +foreach my $chrom (sort keys %STARTs) { + #print STDERR "test\n"; + for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) { + $averaged_AF{$chrom}{$STARTs{$chrom}[$i]} = &average (@{$AFs{$chrom}{$STARTs{$chrom}[$i]}}); + } +} + +#-- define majority SV type (haplotype-independent) +foreach my $chrom (sort keys %STARTs) { + for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) { + my (%count_abundance, $main_SV_type); + foreach my $callname (@{$SV_TYPEs{$chrom}{$STARTs{$chrom}[$i]}}) { + $callname =~ s/_h[12]//; + #print STDERR "$callname\n"; + $count_abundance{$callname}++; + } + ($main_SV_type) = sort {$count_abundance{$b}<=>$count_abundance{$a}} keys %count_abundance; + $MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]} = $main_SV_type; + } +} + +#--- cluster into groups by chromosome, if similar averaged AF, same primary SV type, directly adjacent call, and at least one cell shared +my $group_name = 0; +foreach my $chrom (sort keys %STARTs) { + my $previous_shared = 0; + for (my $i=0; $i<@{$STARTs{$chrom}}-1; $i++) { + unless ($ENDs{$chrom}[$i] == $STARTs{$chrom}[$i+1]) { #only consider events that are directly adjacent + $previous_shared=0;#initialize + next; + } + unless (($averaged_AF{$chrom}{$STARTs{$chrom}[$i]}/$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}>= (1-$AF_simiarity_threshold)) and ($averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}/$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}>= (1-$AF_simiarity_threshold))) { #compare AFs + $previous_shared=0;#initialize + next; + } + unless ($MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]} eq $MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i+1]}) { #must be same primary SV type + $previous_shared=0;#initialize + next; + } + my $shared_cells=0; + foreach my $cell1 (@{$CELLs{$chrom}{$STARTs{$chrom}[$i]}}) { + last if ($shared_cells==1); + foreach my $cell2 (@{$CELLs{$chrom}{$STARTs{$chrom}[$i+1]}}) { + $shared_cells = 1 if ($cell1 eq $cell2); + } + } + if ($shared_cells == 0) { + $previous_shared=0;#initialize + next; + } + $group_name++ unless ($previous_shared); + print "SHARED: $chrom\t$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]|$STARTs{$chrom}[$i+1]-$ENDs{$chrom}[$i+1] (AF=$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}|$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}) (SV_type=$MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}) group_name=$group_name\n"; + $previous_shared = 1; + } +} + +#--------------- +#--- subroutines +#--------------- +sub average { + my $av=0; + foreach my $cnt (@_) { + $av+=$cnt/scalar(@_); + } + return $av; +} +