Skip to content
Snippets Groups Projects
Commit 421ea9f0 authored by Tobias Marschall's avatar Tobias Marschall
Browse files

Updated versions of Jan's scripts for filtering/merging.

parent b6854542
No related branches found
No related tags found
No related merge requests found
......@@ -2,6 +2,8 @@
use strict;
my $min_N_inv = 3;
my $min_WC = 0.2;
my $safe_llr_to_ref = 20;
#Use input file with format "chrom start end sample cell class scalar num_bins sv_call_name sv_call_haplotype sv_call_name_2nd sv_call_haplotype_2nd llr_to_ref llr_to_2nd af"
#
......@@ -13,8 +15,9 @@ my $input_file = $ARGV[0];
if (!$ARGV[0]) {
print STDERR "Input filname missinf=g (use e.g. sv_calls_txt_file_all/RPE1-WT/100000_fixed_norm.selected_j0.01_s0.1/simpleCalls_llr4_poppriorsTRUE_haplotagsTRUE_gtcutoff0.05_regfactor6.txt )\n";
print STDERR "Filters inversions unless they are seen at least $min_N_inv times.\n";
print STDERR "Filters deletions only seen in WW and CC chromosomes\n";
print STDERR "Filters duplications only seen in WW and CC chromosomes (but gives inv-dups seen in such context a PASS)\n";
print STDERR "Filters deletions seen in less then $min_WC WC chromosomes\n";
print STDERR "Filters duplications seen in less then $min_WC WC chromosomes (but gives inv-dups seen in such context a PASS)\n";
print STDERR "Del and Dup events with llr_to_ref>=$safe_llr_to_ref will never be masked\n";
exit;
}
......@@ -22,7 +25,7 @@ print STDERR "Processing $input_file...\n";
#--- read input file
my (%STARTs, %ENDs, %SV_TYPEs, %STRAND_STATESs, %FILTER, %REPORTs);
my (%STARTs, %ENDs, %SV_TYPEs, %STRAND_STATESs, %FILTER, %REPORTs, %FILTER_ARR, %FILTER_ARR_DUP, %TESTED_DEL, %TESTED_DUP, %inv_recurrence, %LLR_TO_REF);
open FH, "$input_file" or die;
while (<FH>) {
chomp;
......@@ -32,42 +35,58 @@ while (<FH>) {
push (@{$ENDs{$chrom}}, $end);
push (@{$SV_TYPEs{$chrom}}, $sv_call_name);
push (@{$STRAND_STATESs{$chrom}}, $strand_state_class);
#$_ =~ /[\t ]*/\t/;
push (@{$REPORTs{$chrom}}, $_);
# Del and Dup with high llr will never be filtered. Record highest llr for event at certain position
$LLR_TO_REF{$chrom}{$start}{$end} = $llr_to_ref unless (exists ($LLR_TO_REF{$chrom}{$start}{$end}));
$llr_to_ref = 100 if ($llr_to_ref =~ /Inf/);
$LLR_TO_REF{$chrom}{$start}{$end} = $llr_to_ref if ($llr_to_ref > $LLR_TO_REF{$chrom}{$start}{$end});
}
close FH;
#--- 1. filter DEL calls only seen on WW or CC chromosomes
#--- 1. filter DEL calls seen primarily on WW or CC chromosomes
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
next unless ($SV_TYPEs{$chrom}[$i] =~ /del_h[12]/); #only look at het-del calls
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
#FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
#print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n";
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW');
if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW') {
push (@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 1);
} else {
push (@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 0);
}
$TESTED_DEL{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}=1;
}
}
#--- 2. filter INV calls seen less than Min_N_inv times
foreach my $chrom (sort keys %STARTs) {
my %inv_recurrence=();
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
next unless ($SV_TYPEs{$chrom}[$i] =~ /inv_h[12]/); #only look at het-inv calls
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
next unless ($SV_TYPEs{$chrom}[$i] =~ /inv_h[12o]/); #only look at het-inv or hom-inv calls
#$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
#print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n";
$inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}++;
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $min_N_inv);
$inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}++;
#$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($inv_recurrence{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $min_N_inv);
}
}
#--- 3. filter DUP calls only seen on WW or CC chromosomes
#--- 3. filter DUP calls seen primarily on WW or CC chromosomes
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
next unless ($SV_TYPEs{$chrom}[$i] =~ /dup_h[12]/); #only look at het-dup calls
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'FAIL' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}) or $SV_TYPEs{$chrom}[$i] =~ /idup_h[12]/); #initialize
#print STDERR "$chrom $SV_TYPEs{$chrom}[$i]\n";
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW');
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($SV_TYPEs{$chrom}[$i] =~ /^idup_h[12]/); #inverted duplications trigger 'pass'
next unless ($SV_TYPEs{$chrom}[$i] =~ /^dup_h[12]/); #look at het-dup calls
#$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = 'PASS' if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW');
if ($STRAND_STATESs{$chrom}[$i] eq 'WC' or $STRAND_STATESs{$chrom}[$i] eq 'CW') {
push (@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 1);
} else {
push (@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}, 0);
}
$TESTED_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}=1;
}
}
......@@ -77,7 +96,33 @@ foreach my $chrom (sort keys %STARTs) {
print "chrom\tstart\tend\tsample\tcell\tstrand_state_class\tscalar\tnum_bins\tsv_call_name\tsv_call_haplotype\tsv_call_name_2nd\tsv_call_haplotype_2nd\tllr_to_ref\tllr_to_2nd\taf\tpass/fail\n";
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'PASS' unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})); #initialize
if (!exists ($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) {
if (exists ($TESTED_DEL{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { #check DELs to identify events seen not largely in CC and WW chromosomes
my $iterate=0;
foreach my $val (@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}) {
$iterate+=$val/@{$FILTER_ARR{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}};
}
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= sprintf ("FAIL(WC:%4.2f)", $iterate) if ($iterate < $min_WC);
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= "PASS" if ($iterate >= $min_WC or $LLR_TO_REF{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $safe_llr_to_ref);
}
if (exists ($TESTED_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) { #check DUP to identify events seen not largely in CC and WW chromosomes
my $iterate=0;
foreach my $val (@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}}) {
$iterate+=$val/@{$FILTER_ARR_DUP{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}};
}
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= sprintf ("FAIL(WC:%4.2f)", $iterate) if ($iterate < $min_WC);
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= "PASS" if ($iterate >= $min_WC or $LLR_TO_REF{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} >= $safe_llr_to_ref);
}
}
unless (exists($FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) {
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}= 'PASS';
if (exists ($inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) {
$FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} = sprintf ("FAIL(#inv:" . $inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} . ")") if ($inv_recurrence{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]} < $min_N_inv);
}
}
print "$REPORTs{$chrom}[$i]\t", $FILTER{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}, "\n";
}
}
#!/usr/bin/perl -w
use strict;
my $AF_simiarity_threshold = 0.25;
my $input_file = $ARGV[0];
if (!$ARGV[0]) {
print STDERR "inputfile missing\n";
print STDERR "-> cluster into groups by chromosome, if similar AF (relative threshold=$AF_simiarity_threshold), if same SV event, if SVs are directly adjacent, and if at least one cell shared\n";
die;
}
#--- read input file
my (%STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE);
print STDERR "Reading $input_file...\n";
open FH, "$input_file" or die;
while (<FH>) {
chomp;
next if ($_ =~ /^chrom/); #ignore first line
my ($chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail) = split (/[\t ]+/, $_); #split by TAB/whitespace
#print STDERR "$chrom, $start, $end, $sample, $cell, $strand_state_class, $scalar, $num_bins, $sv_call_name, $sv_call_haplotype, $sv_call_name_2nd, $sv_call_haplotype_2nd, $llr_to_ref, $llr_to_2nd, $af, $passfail\n";
next unless ($passfail =~ /PASS/); #ignore FAIL
unless ($SEEN{$chrom}{$start}) {
push (@{$STARTs{$chrom}}, $start);
push (@{$ENDs{$chrom}}, $end);
$SEEN{$chrom}{$start}=1;
}
push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name);
push (@{$CELLs{$chrom}{$start}}, $cell);
push (@{$AFs{$chrom}{$start}}, $af);
}
%SEEN=(); #reinitialize
close FH;
#-- compute average AF per event
foreach my $chrom (sort keys %STARTs) {
#print STDERR "test\n";
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
$averaged_AF{$chrom}{$STARTs{$chrom}[$i]} = &average (@{$AFs{$chrom}{$STARTs{$chrom}[$i]}});
}
}
#-- define majority SV type (haplotype-independent)
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
my (%count_abundance, $main_SV_type);
foreach my $callname (@{$SV_TYPEs{$chrom}{$STARTs{$chrom}[$i]}}) {
$callname =~ s/_h[12]//;
#print STDERR "$callname\n";
$count_abundance{$callname}++;
}
($main_SV_type) = sort {$count_abundance{$b}<=>$count_abundance{$a}} keys %count_abundance;
$MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]} = $main_SV_type;
}
}
#--- cluster into groups by chromosome, if similar averaged AF, same primary SV type, directly adjacent call, and at least one cell shared
my $group_name = 0;
foreach my $chrom (sort keys %STARTs) {
my $previous_shared = 0;
for (my $i=0; $i<@{$STARTs{$chrom}}-1; $i++) {
unless ($ENDs{$chrom}[$i] == $STARTs{$chrom}[$i+1]) { #only consider events that are directly adjacent
$previous_shared=0;#initialize
next;
}
unless (($averaged_AF{$chrom}{$STARTs{$chrom}[$i]}/$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}>= (1-$AF_simiarity_threshold)) and ($averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}/$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}>= (1-$AF_simiarity_threshold))) { #compare AFs
$previous_shared=0;#initialize
next;
}
unless ($MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]} eq $MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i+1]}) { #must be same primary SV type
$previous_shared=0;#initialize
next;
}
my $shared_cells=0;
foreach my $cell1 (@{$CELLs{$chrom}{$STARTs{$chrom}[$i]}}) {
last if ($shared_cells==1);
foreach my $cell2 (@{$CELLs{$chrom}{$STARTs{$chrom}[$i+1]}}) {
$shared_cells = 1 if ($cell1 eq $cell2);
}
}
if ($shared_cells == 0) {
$previous_shared=0;#initialize
next;
}
$group_name++ unless ($previous_shared);
print "SHARED: $chrom\t$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]|$STARTs{$chrom}[$i+1]-$ENDs{$chrom}[$i+1] (AF=$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}|$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}) (SV_type=$MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}) group_name=$group_name\n";
$previous_shared = 1;
}
}
#---------------
#--- subroutines
#---------------
sub average {
my $av=0;
foreach my $cnt (@_) {
$av+=$cnt/scalar(@_);
}
return $av;
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment