Replace Jan's merging script by new one (sent by mail Oct 4; 19:11)

5699687e · Tobias Marschall · c8e5c181 · 5699687e · 5699687e
Commit 5699687e authored 6 years ago by Tobias Marschall
--- a/Snakefile
+++ b/Snakefile
@@ -652,7 +652,7 @@ rule postprocessing_merge:
    output: 
        calls = "postprocessing/merge/{sample}/{window}_fixed_norm.{bpdens,selected_j[0-9\\.]+_s[0-9\\.]+}/simpleCalls_llr{llr}_poppriors{pop_priors,(TRUE|FALSE)}_haplotags{use_haplotags,(TRUE|FALSE)}_gtcutoff{gtcutoff,[0-9\\.]+}_regfactor{regfactor,[0-9]+}.txt"
    shell:
-        'utils/group_nearby_calls_of_same_AF.pl {input.calls}  > {output.calls}'
+        'utils/group_nearby_calls_of_same_AF_and_generate_output_table.pl {input.calls}  > {output.calls}'


 ################################################################################

--- a/utils/group_nearby_calls_of_same_AF.pl
+++ b/utils/group_nearby_calls_of_same_AF.pl
@@ -12,7 +12,7 @@ if (!$ARGV[0]) {
 }

 #--- read input file
-my (%STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE);
+my (%MAIN_HAP, %Sv_call_haplotype, %Num_bins, %Group_ID, %STARTs, %ENDs, %SV_TYPEs, %CELLs, %AFs, %SEEN, %averaged_AF, %MAIN_SV_TYPE, %Llr_to_2nds, %Llr_to_refs);
 print STDERR "Reading $input_file...\n";
 open FH, "$input_file" or die;
 while (<FH>) {
@@ -29,6 +29,10 @@ while (<FH>) {
        push (@{$SV_TYPEs{$chrom}{$start}}, $sv_call_name);
 	push (@{$CELLs{$chrom}{$start}}, $cell);
 	push (@{$AFs{$chrom}{$start}}, $af);
+	push (@{$Llr_to_refs{$chrom}{$start}}, $llr_to_ref);
+	push (@{$Llr_to_2nds{$chrom}{$start}}, $llr_to_2nd);
+	push (@{$Num_bins{$chrom}{$start}}, $num_bins);
+	push (@{$Sv_call_haplotype{$chrom}{$start}}, $sv_call_haplotype);
 }
 %SEEN=(); #reinitialize
 close FH;
@@ -56,6 +60,19 @@ foreach my $chrom (sort keys %STARTs) {
 	}
 }

+#-- define majority haplotype
+foreach my $chrom (sort keys %STARTs) {
+        for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
+                my (%count_abundance, $main_hap);
+                foreach my $hapname (@{$Sv_call_haplotype{$chrom}{$STARTs{$chrom}[$i]}}) {
+                        $count_abundance{$hapname}++;
+                }
+                ($main_hap) = sort {$count_abundance{$b}<=>$count_abundance{$a}} keys %count_abundance;
+                $MAIN_HAP{$chrom}{$STARTs{$chrom}[$i]} = $main_hap;
+        }
+}
+
+
 #--- cluster into groups by chromosome, if similar averaged AF, same primary SV type, directly adjacent call, and at least one cell shared
 my $group_name = 0;
 foreach my $chrom (sort keys %STARTs) {
@@ -85,11 +102,81 @@ foreach my $chrom (sort keys %STARTs) {
 			next;
 		}
 		$group_name++ unless ($previous_shared);
-		print "SHARED: $chrom\t$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]|$STARTs{$chrom}[$i+1]-$ENDs{$chrom}[$i+1] (AF=$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}|$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}) (SV_type=$MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}) group_name=$group_name\n";	
+		print STDERR "Grouping/merging $chrom\t$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]|$STARTs{$chrom}[$i+1]-$ENDs{$chrom}[$i+1] (AF=$averaged_AF{$chrom}{$STARTs{$chrom}[$i]}|$averaged_AF{$chrom}{$STARTs{$chrom}[$i+1]}) (SV_type=$MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}) group_name=$group_name ...\n";	
 		$previous_shared = 1;
+		$Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}=$group_name;
+		$Group_ID{$chrom}{$STARTs{$chrom}[$i+1]}{$ENDs{$chrom}[$i+1]}=$group_name;
+	}
+}
+
+
+#--------- Generating output
+print STDERR "--\nGenerating Results_Output_Table\n";
+print "chrom, start, end, num_bins, consensus_sv, consensus_sv_call_haplotype, llr_to_ref_max, llr_to_2nd_max, af, segments\n";
+
+foreach my $chrom (sort keys %STARTs) {
+        my $previous_shared = 0;
+	my $last_group_ID;
+        for (my $i=0; $i<@{$STARTs{$chrom}}; $i++) {
+		if (exists ($Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]})) {
+			#next if ($seen{$Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]}});
+			$last_group_ID = $Group_ID{$chrom}{$STARTs{$chrom}[$i]}{$ENDs{$chrom}[$i]};
+			my (@myStarts, @myEnds);
+			my $segments = "$chrom:$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]";
+			push (@myStarts, $STARTs{$chrom}[$i]); 
+			push (@myEnds, $ENDs{$chrom}[$i]);
+			my $end =0;
+			my $y=$i;
+				#-- prepare consensus report for merged calls
+			my ($af) = sort {$b <=> $a} @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}; #use maximum in this consensus report
+                        my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
+                        my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
+                        my ($num_bins) = sort {$b <=> $a} @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
+
+			while (!$end) { #get other events falling onto same group ID
+				$y++; #increment
+				if ($y==@{$STARTs{$chrom}}) {
+					$y--;
+					$i=$y;
+					last;
+				}
+				#print "($y)|", scalar (@{$STARTs{$chrom}}), " $chrom|$Group_ID{$chrom}{$STARTs{$chrom}[$y]}{$ENDs{$chrom}[$y]}|--->";
+				if (exists ($Group_ID{$chrom}{$STARTs{$chrom}[$y]}{$ENDs{$chrom}[$y]})) {
+					if ($last_group_ID ne $Group_ID{$chrom}{$STARTs{$chrom}[$y]}{$ENDs{$chrom}[$y]}) { #new group starts
+						$y--;
+                                        	$i=$y;
+                                        	last;
+					}
+					push (@myStarts, $STARTs{$chrom}[$y]);                                         
+                        		push (@myEnds, $ENDs{$chrom}[$y]);
+					$segments .= "|$chrom:$STARTs{$chrom}[$y]-$ENDs{$chrom}[$y]";
+					($af) = sort {$b <=> $a} ($af, @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}); #use maximum in this consensus report
+		                        ($llr_to_ref) = sort {$b <=> $a} ($llr_to_ref, @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
+               		                ($llr_to_2nd) = sort {$b <=> $a} ($llr_to_2nd, @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
+                                        ($num_bins) = sort {$b <=> $a} ($num_bins, @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}); #use max in this consensus report
+
+				} else {
+					$y--;
+					$i=$y; #jump forward
+					$end=1;
+				}
+			}
+			print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]}, $MAIN_HAP{$chrom}{$myStarts[0]}, $llr_to_ref, $llr_to_2nd, $af, \[Group\_$Group_ID{$chrom}{$myStarts[0]}{$myEnds[0]}\/$segments\]\n";
+			#print "$chrom, $myStarts[0], $myEnds[-1], $num_bins, $MAIN_SV_TYPE{$chrom}{$myStarts[0]]}\n";	
+		} else {
+			#-- prepare consensus report for singlish calls (calls not falling into a merge-group)
+			my ($af) = sort {$b <=> $a} @{$AFs{$chrom}{$STARTs{$chrom}[$i]}}; #use maximum in this consensus report
+			my ($llr_to_ref) = sort {$b <=> $a} @{$Llr_to_refs{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
+			my ($llr_to_2nd) = sort {$b <=> $a} @{$Llr_to_2nds{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
+			my ($num_bins) = sort {$b <=> $a} @{$Num_bins{$chrom}{$STARTs{$chrom}[$i]}}; #use max in this consensus report
+			print "$chrom, $STARTs{$chrom}[$i], $ENDs{$chrom}[$i], $num_bins, $MAIN_SV_TYPE{$chrom}{$STARTs{$chrom}[$i]}, $MAIN_HAP{$chrom}{$STARTs{$chrom}[$i]}, $llr_to_ref, $llr_to_2nd, $af, [$chrom:$STARTs{$chrom}[$i]-$ENDs{$chrom}[$i]]\n";
+			
+		}
 	}
 }

+
+
 #---------------
 #--- subroutines
 #---------------