From 646bf2adb2a32e9c861998f4138bb1b916d40ef1 Mon Sep 17 00:00:00 2001
From: Tobias Marschall <>
Date: Fri, 5 Oct 2018 08:37:26 +0200
Subject: [PATCH] First attempt on cell mixing evaluation

 Snakefile                     | 15 +++++++
 utils/ | 76 +++++++++++++++++++++++++++++++++++
 2 files changed, 91 insertions(+)
 create mode 100755 utils/

diff --git a/Snakefile b/Snakefile
index dc38402..9855049 100644
--- a/Snakefile
+++ b/Snakefile
@@ -110,6 +110,10 @@ rule all:
                window = [100000],
                bpdens = BPDENS,
                method = list(set(m.replace('_filterTRUE','').replace('_filterFALSE','') for m in METHODS))),
+#        expand("cell-mixing-eval/{window}_fixed_norm.{bpdens}/{method}.tsv",
+#               window = [100000],
+#               bpdens = BPDENS,
+#               method = METHODS),
@@ -1023,3 +1027,14 @@ rule aggregate_summary_statistics:
         "(head -n1 {input.tsv[0]} && (tail -n1 -q {input.tsv} | sort -k1) ) > {output}"
+# For one (windows, bpdens, method) combination, run the cell mixing evaluation
+# script on the SV calls of all samples against the manually curated clonal events.
+rule evaluate_cell_mixing:
+    input:
+        # One call file per sample: expand() only fills in {sample}; the doubled
+        # braces keep windows/bpdens/method as wildcards of this rule.
+        sv_calls = expand('sv_calls/{sample}/{{windows}}.{{bpdens}}/{{method}}.txt', sample=SAMPLES),
+        truth = '../input-data/ground_truth/RPE-BM510_manual/clonal-events.tsv',
+    output:
+        tsv = 'cell-mixing-eval/{windows}.{bpdens}/{method}.tsv'
+    log:
+        'cell-mixing-eval/{windows}.{bpdens}/{method}.log'
+    run:
+        # Callset labels are the sample names, in the same order as input.sv_calls.
+        names = ','.join(SAMPLES)
+        # The script's stdout becomes the output table, its stderr goes to the log file.
+        shell('utils/ --names {names} {input.truth} {input.sv_calls} > {output.tsv} 2> {log}')
diff --git a/utils/ b/utils/
new file mode 100755
index 0000000..12beedd
--- /dev/null
+++ b/utils/
@@ -0,0 +1,76 @@
+#!/usr/bin/env python
+import sys
+from argparse import ArgumentParser
+import pandas as pd
+import numpy as np
+def matching_cells_1bpoverlap(calls, true_events):
+	'''Return the number of true calls that have at least 1bp overlap with a predicted call.'''
+	found = 0
+	print('Searching for recovered ground truth SVs (clonal)', file=sys.stderr)
+	cell_counts = []
+	# TODO: this algorithm is quadratic time. Could do linear, but fast enough for now.
+	for chrom, start, end, sv_type in true_events[['chrom','start','end','sv_type']].values:
+		matching_calls = calls[(calls.chrom==chrom) & (calls.start<end) & (calls.end>start)]
+		n = len(matching_calls.groupby(by='cell'))
+		cell_counts.append(n)
+		print('   ground truth call {}:{}-{} ({}) had a matching SV in {} cells'.format(chrom,start,end,sv_type,n), file=sys.stderr)
+	return cell_counts
+def sensitivity_1bpoverlap_single_cell(calls, true_events):
+	'''Return the number of true calls that have at least 1bp overlap with a predicted call.'''
+	found = 0
+	print('Searching for recovered ground truth SVs (single cell)', file=sys.stderr)
+	# TODO: this algorithm is quadratic time. Could do linear, but fast enough for now.
+	for chrom, start, end, cell, sv_type in true_events[['chrom','start','end','cell','sv_type']].values:
+		if len(calls[(calls.cell==cell) & (calls.chrom==chrom) & (calls.start<end) & (calls.end>start) & ((calls.sv_call_name == (sv_type+'_h1')) | (calls.sv_call_name == (sv_type+'_h2')))]) > 0:
+			print('   single-cell ground truth call {}:{}-{} ({},{}) was recovered'.format(chrom,start,end,sv_type,cell), file=sys.stderr)
+			found += 1
+		else:
+			print('   single-cell ground truth call {}:{}-{} ({},{}) was missed'.format(chrom,start,end,sv_type,cell), file=sys.stderr)
+	return found
+def main():
+	parser = ArgumentParser(prog='', description=__doc__)
+	parser.add_argument('--names', default=None,
+		help='Callset names.')
+	parser.add_argument('groundtruth', metavar='GROUNDTRUTH', help='Ground truth set of variants to compare to')
+	parser.add_argument('callsets', metavar='CALLSETS', nargs='+', help='Callset files (tsv) as output by MosaiClassifier')
+	args = parser.parse_args()
+	if args.names is None:
+		names = args.callsets
+	else:
+		names = args.names.split(',')
+		assert len(names) == len(args.callsets)
+	true_events = pd.read_csv(args.groundtruth, sep='\t')
+	#results['true_clonal_events_recovered'] = sensitivity_1bpoverlap(sv_table[>=0.8], true_events)
+	#results['true_clonal_recall'] = results['true_clonal_events_recovered'] / len(true_events)
+	#result_order += ['true_clonal_events_recovered', 'true_clonal_recall']
+	fieldnames = ['callset'] + ['{}:{}-{}'.format(chrom,start,end) for chrom, start, end in true_events[['chrom','start','end']].values]
+	print(*fieldnames, sep='\t')
+	for callset_filename, name in zip(args.callsets, names):
+		print('Reading', callset_filename, file=sys.stderr)
+		calls = pd.read_csv(callset_filename, sep='\t')
+		print('Detected {} total calls and {} unique calls in {} single cells'.format(
+			len(calls),
+			len(calls.groupby(by=['chrom','start','end','sv_call_name'])),
+			len(calls.groupby(by='cell')),
+		), file=sys.stderr)
+		cell_counts = matching_cells_1bpoverlap(calls, true_events)
+		print(name, *cell_counts, sep='\t')
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == '__main__':
+	main()