From 646bf2adb2a32e9c861998f4138bb1b916d40ef1 Mon Sep 17 00:00:00 2001 From: Tobias Marschall <tobias.marschall@0ohm.net> Date: Fri, 5 Oct 2018 08:37:26 +0200 Subject: [PATCH] First attempt on cell mixing evaluation --- Snakefile | 15 +++++++ utils/evaluate_cell_mixing.py | 76 +++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100755 utils/evaluate_cell_mixing.py diff --git a/Snakefile b/Snakefile index dc38402..9855049 100644 --- a/Snakefile +++ b/Snakefile @@ -110,6 +110,10 @@ rule all: window = [100000], bpdens = BPDENS, method = list(set(m.replace('_filterTRUE','').replace('_filterFALSE','') for m in METHODS))), +# expand("cell-mixing-eval/{window}_fixed_norm.{bpdens}/{method}.tsv", +# window = [100000], +# bpdens = BPDENS, +# method = METHODS), ################################################################################ @@ -1023,3 +1027,14 @@ rule aggregate_summary_statistics: shell: "(head -n1 {input.tsv[0]} && (tail -n1 -q {input.tsv} | sort -k1) ) > {output}" +rule evaluate_cell_mixing: + input: + sv_calls = expand('sv_calls/{sample}/{{windows}}.{{bpdens}}/{{method}}.txt', sample=SAMPLES), + truth = '../input-data/ground_truth/RPE-BM510_manual/clonal-events.tsv', + output: + tsv = 'cell-mixing-eval/{windows}.{bpdens}/{method}.tsv' + log: + 'cell-mixing-eval/{windows}.{bpdens}/{method}.log' + run: + names = ','.join(SAMPLES) + shell('utils/evaluate_cell_mixing.py --names {names} {input.truth} {input.sv_calls} > {output.tsv} 2> {log}') diff --git a/utils/evaluate_cell_mixing.py b/utils/evaluate_cell_mixing.py new file mode 100755 index 0000000..12beedd --- /dev/null +++ b/utils/evaluate_cell_mixing.py @@ -0,0 +1,76 @@ +#!/usr/bin/env python + +import sys +from argparse import ArgumentParser +import pandas as pd +import numpy as np + + +def matching_cells_1bpoverlap(calls, true_events): + '''Return the number of true calls that have at least 1bp overlap with a predicted call.''' + found = 0 + print('Searching for recovered ground truth SVs (clonal)', file=sys.stderr) + cell_counts = [] + # TODO: this algorithm is quadratic time. Could do linear, but fast enough for now. + for chrom, start, end, sv_type in true_events[['chrom','start','end','sv_type']].values: + matching_calls = calls[(calls.chrom==chrom) & (calls.start<end) & (calls.end>start)] + n = len(matching_calls.groupby(by='cell')) + cell_counts.append(n) + print(' ground truth call {}:{}-{} ({}) had a matching SV in {} cells'.format(chrom,start,end,sv_type,n), file=sys.stderr) + return cell_counts + + +def sensitivity_1bpoverlap_single_cell(calls, true_events): + '''Return the number of true calls that have at least 1bp overlap with a predicted call.''' + found = 0 + print('Searching for recovered ground truth SVs (single cell)', file=sys.stderr) + # TODO: this algorithm is quadratic time. Could do linear, but fast enough for now. + for chrom, start, end, cell, sv_type in true_events[['chrom','start','end','cell','sv_type']].values: + if len(calls[(calls.cell==cell) & (calls.chrom==chrom) & (calls.start<end) & (calls.end>start) & ((calls.sv_call_name == (sv_type+'_h1')) | (calls.sv_call_name == (sv_type+'_h2')))]) > 0: + print(' single-cell ground truth call {}:{}-{} ({},{}) was recovered'.format(chrom,start,end,sv_type,cell), file=sys.stderr) + found += 1 + else: + print(' single-cell ground truth call {}:{}-{} ({},{}) was missed'.format(chrom,start,end,sv_type,cell), file=sys.stderr) + return found + + +def main(): + parser = ArgumentParser(prog='evaluate_cell_mixing.py', description=__doc__) + + parser.add_argument('--names', default=None, + help='Callset names.') + + parser.add_argument('groundtruth', metavar='GROUNDTRUTH', help='Ground truth set of variants to compare to') + parser.add_argument('callsets', metavar='CALLSETS', nargs='+', help='Callset files (tsv) as output by MosaiClassifier') + + args = parser.parse_args() + + if args.names is None: + names = args.callsets + else: + names = args.names.split(',') + assert len(names) == len(args.callsets) + + true_events = pd.read_csv(args.groundtruth, sep='\t') + #results['true_clonal_events_recovered'] = sensitivity_1bpoverlap(sv_table[sv_table.af>=0.8], true_events) + #results['true_clonal_recall'] = results['true_clonal_events_recovered'] / len(true_events) + #result_order += ['true_clonal_events_recovered', 'true_clonal_recall'] + + fieldnames = ['callset'] + ['{}:{}-{}'.format(chrom,start,end) for chrom, start, end in true_events[['chrom','start','end']].values] + print(*fieldnames, sep='\t') + + for callset_filename, name in zip(args.callsets, names): + print('Reading', callset_filename, file=sys.stderr) + calls = pd.read_csv(callset_filename, sep='\t') + + print('Detected {} total calls and {} unique calls in {} single cells'.format( + len(calls), + len(calls.groupby(by=['chrom','start','end','sv_call_name'])), + len(calls.groupby(by='cell')), + ), file=sys.stderr) + + cell_counts = matching_cells_1bpoverlap(calls, true_events) + print(name, *cell_counts, sep='\t') + +if __name__ == '__main__': + main() -- GitLab