Skip to content
Snippets Groups Projects
Commit 5699687e authored by Tobias Marschall's avatar Tobias Marschall
Browse files

Replace Jan's merging script by new one (sent by mail Oct 4; 19:11)

parent c8e5c181
No related branches found
No related tags found
No related merge requests found
......@@ -652,7 +652,7 @@ rule postprocessing_merge:
output:
calls = "postprocessing/merge/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.txt"
shell:
'utils/group_nearby_calls_of_same_AF.pl {input.calls} > {output.calls}'
'utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl {input.calls} > {output.calls}'
################################################################################
......
......@@ -12,7 +12,7 @@ if (!$ARGV[0]) {
}
#--- read input file
my (%STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE);
my (%MAIN_HAP, %Sv_call_haplotype, %Num_bins, %Group_ID, %STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE, %Llr_to_2nds, %Llr_to_refs);
print STDERR "Reading $input_file...\n";
open FH, "$input_file" or die;
while (<FH>) {
......@@ -29,6 +29,10 @@ while (<FH>) {
push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name);
push (@{$CELLs{$chrom}{$start}}, $cell);
push (@{$AFs{$chrom}{$start}}, $af);
push (@{$Llr_to_refs{$chrom}{$start}}, $llr_to_ref);
push (@{$Llr_to_2nds{$chrom}{$start}}, $llr_to_2nd);
push (@{$Num_bins{$chrom}{$start}}, $num_bins);
push (@{$Sv_call_haplotype{$chrom}{$start}}, $sv_call_haplotype);
}
%SEEN=(); #reinitialize
close FH;
......@@ -56,6 +60,19 @@ foreach my $chrom (sort keys %STARTs) {
}
}
#-- define majority haplotype
foreach my $chrom (sort keys %STARTs) {
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
my (%count_abundance, $main_hap);
foreach my $hapname (@{$Sv_call_haplotype{$chrom}{$STARTs{$chrom}[$i]}}) {
$count_abundance{$hapname}++;
}
($main_hap) = sort {$count_abundance{$b}<=>$count_abundance{$a}} keys %count_abundance;
$MAIN_HAP{$chrom}{$STARTs{$chrom}[$i]} = $main_hap;
}
}
#--- cluster into groups by chromosome, if similar averaged AF, same primary SV type, directly adjacent call, and at least one cell shared
my $group_name = 0;
foreach my $chrom (sort keys %STARTs) {
......@@ -85,11 +102,81 @@ foreach my $chrom (sort keys %STARTs) {
next;
}
$group_name++ unless ($previous_shared);
print "SHARED: $chrom\t$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]|$STARTs{$chrom}[$i+1]-$ENDs{$chrom}[$i+1] (AF=$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}|$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}) (SV_type=$MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}) group_name=$group_name\n";
print STDERR "Grouping/merging $chrom\t$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]|$STARTs{$chrom}[$i+1]-$ENDs{$chrom}[$i+1] (AF=$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}|$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}) (SV_type=$MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}) group_name=$group_name ...\n";
$previous_shared = 1;
$Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}=$group_name;
$Group_ID{$chrom}{$STARTs{$chrom}[$i+1]}{$ENDs{$chrom}[$i+1]}=$group_name;
}
}
#--------- Generating output
print STDERR "--\nGenerating Results_Output_Table\n";
print "chrom, start, end, num_bins, consensus_sv, consensus_sv_call_haplotype, llr_to_ref_max, llr_to_2nd_max, af, segments\n";
foreach my $chrom (sort keys %STARTs) {
my $previous_shared = 0;
my $last_group_ID;
for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
if (exists ($Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) {
#next if ($seen{$Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}});
$last_group_ID = $Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]};
my (@myStarts, @myEnds);
my $segments = "$chrom:$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]";
push (@myStarts, $STARTs{$chrom}[$i]);
push (@myEnds, $ENDs{$chrom}[$i]);
my $end =0;
my $y=$i;
#-- prepare consensus report for merged calls
my ($af) = sort {$b <=> $a} @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}; #use maximum in this consensus report
my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
my ($num_bins) = sort {$b <=> $a} @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
while (!$end) { #get other events falling onto same group ID
$y++; #increment
if ($y==@{$STARTs{$chrom}}) {
$y--;
$i=$y;
last;
}
#print "($y)|", scalar (@{$STARTs{$chrom}}), " $chrom|$Group_ID{$chrom}{$STARTs{$chrom}[$y]}{$ENDs{$chrom}[$y]}|--->";
if (exists ($Group_ID{$chrom}{$STARTs{$chrom}[$y]}{$ENDs{$chrom}[$y]})) {
if ($last_group_ID ne $Group_ID{$chrom}{$STARTs{$chrom}[$y]}{$ENDs{$chrom}[$y]}) { #new group starts
$y--;
$i=$y;
last;
}
push (@myStarts, $STARTs{$chrom}[$y]);
push (@myEnds, $ENDs{$chrom}[$y]);
$segments .= "|$chrom:$STARTs{$chrom}[$y]-$ENDs{$chrom}[$y]";
($af) = sort {$b <=> $a} ($af, @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}); #use maximum in this consensus report
($llr_to_ref) = sort {$b <=> $a} ($llr_to_ref, @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
($llr_to_2nd) = sort {$b <=> $a} ($llr_to_2nd, @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
($num_bins) = sort {$b <=> $a} ($num_bins, @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
} else {
$y--;
$i=$y; #jump forward
$end=1;
}
}
print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]}, $MAIN_HAP{$chrom}{$myStarts[0]}, $llr_to_ref, $llr_to_2nd, $af, \[Group\_$Group_ID{$chrom}{$myStarts[0]}{$myEnds[0]}\/$segments\]\n";
#print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]]}\n";
} else {
#-- prepare consensus report for singlish calls (calls not falling into a merge-group)
my ($af) = sort {$b <=> $a} @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}; #use maximum in this consensus report
my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
my ($num_bins) = sort {$b <=> $a} @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
print "$chrom, $STARTs{$chrom}[$i], $ENDs{$chrom}[$i], $num_bins, $MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}, $MAIN_HAP{$chrom}{$STARTs{$chrom}[$i]}, $llr_to_ref, $llr_to_2nd, $af, [$chrom:$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]]\n";
}
}
}
#---------------
#--- subroutines
#---------------
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment