From 693f7a25e4ed9cfced86504756d4695d158828a1 Mon Sep 17 00:00:00 2001 From: Constantin Pape <constantin.pape@iwr.uni-heidelberg.de> Date: Fri, 7 Jun 2019 16:01:34 +0200 Subject: [PATCH] Update gene script and call it for cell master table --- scripts/attributes/genes.py | 30 ++++++++++++++++++------------ scripts/attributes/map_objects.py | 2 +- scripts/attributes/master.py | 16 +++++++++++++--- 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/scripts/attributes/genes.py b/scripts/attributes/genes.py index 15e64ff..726c8a0 100755 --- a/scripts/attributes/genes.py +++ b/scripts/attributes/genes.py @@ -1,4 +1,3 @@ -import os import csv import h5py import numpy as np @@ -6,6 +5,12 @@ from vigra.analysis import extractRegionFeatures from vigra.sampling import resize +# TODO +# wrap this in a cluster_tools task in order to run remotely +# fix blatant inefficiencis (size loop) +# make test to check against original table + + def get_bbs(data): num_cells = (np.max(data)).astype('int') + 1 cells_bbs = [[] for i in range(num_cells)] @@ -23,6 +28,8 @@ def get_bbs(data): return cells_bbs +# TODO very inefficient, can use "Count" feature of vigra features instead +# and then just do this in `get_bbs` as well def get_cell_sizes(data): max_label = (np.max(data)).astype('uint32') cell_sizes = [0] * (max_label + 1) @@ -58,25 +65,24 @@ def get_cell_expression(segm_data, all_genes): def write_genes_table(segm_file, genes_file, table_file, labels): - DSET = 't00000/s00/4/cells' - NEW_SHAPE = (570, 518, 550) - GENES_DSET = 'genes' - NAMES_DSET = 'gene_names' - - genes_table_file = os.path.splitext(table_file)[0] + "_genes2" + os.path.splitext(table_file)[1] + dset = 't00000/s00/4/cells' + new_shape = (570, 518, 550) + genes_dset = 'genes' + names_dset = 'gene_names' with h5py.File(segm_file, 'r') as f: - segment_data = f[DSET][:] + segment_data = f[dset][:] + # TODO loading the whole thing into ram takes a lot of memory with h5py.File(genes_file, 'r') as f: - all_genes = f[GENES_DSET][:] - gene_names = [i.decode('utf-8') for i in f[NAMES_DSET]] + all_genes = f[genes_dset][:] + gene_names = [i.decode('utf-8') for i in f[names_dset]] num_genes = len(gene_names) - downsampled_data = resize(segment_data.astype("float32"), shape=NEW_SHAPE, order=0).astype('uint16') + downsampled_data = resize(segment_data.astype("float32"), shape=new_shape, order=0).astype('uint16') avail_labels, expression = get_cell_expression(downsampled_data, all_genes) - with open(genes_table_file, 'w') as genes_table: + with open(table_file, 'w') as genes_table: csv_writer = csv.writer(genes_table, delimiter='\t') _ = csv_writer.writerow(['label_id'] + gene_names) for label in labels: diff --git a/scripts/attributes/map_objects.py b/scripts/attributes/map_objects.py index b817eef..01a4770 100644 --- a/scripts/attributes/map_objects.py +++ b/scripts/attributes/map_objects.py @@ -47,5 +47,5 @@ def map_objects(label_ids, seg_path, seg_key, map_out, data.append(labels[:, None]) col_names = ['label_id'] + map_names - data = np.concatenate([label_ids[:, None]] + data, axis=0) + data = np.concatenate([label_ids[:, None]] + data, axis=1) write_csv(map_out, data, col_names) diff --git a/scripts/attributes/master.py b/scripts/attributes/master.py index f02335a..4ceb1ea 100644 --- a/scripts/attributes/master.py +++ b/scripts/attributes/master.py @@ -3,6 +3,7 @@ import h5py from .base_attributes import base_attributes from .map_objects import map_objects +from .genes import write_genes_table from ..files import get_h5_path_from_xml @@ -21,8 +22,8 @@ def make_cell_tables(folder, name, tmp_folder, resolution, table_folder = os.path.join(folder, 'tables', name) os.makedirs(table_folder, exist_ok=True) - seg_path = get_seg_path(folder, name) seg_key = 't00000/s00/0/cells' + seg_path = get_seg_path(folder, name, seg_key) # make the basic attributes table base_out = os.path.join(table_folder, 'default.csv') @@ -33,16 +34,25 @@ def make_cell_tables(folder, name, tmp_folder, resolution, # make table with mapping to other objects # nuclei, cellular models (TODO), ... map_out = os.path.join(table_folder, 'objects.csv') - map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels')] + map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels', seg_key)] map_keys = [seg_key] map_names = ['nucleus_id'] map_objects(label_ids, seg_path, seg_key, map_out, map_paths, map_keys, map_names, tmp_folder, target, max_jobs) + # make table with gene mapping + # TODO we need to make sure that this file is always copied / updated + # before this is called ! + aux_gene_xml = os.path.join(folder, 'misc', 'meds_all_genes.xml') + aux_gene_path = get_h5_path_from_xml(aux_gene_xml) + if not os.path.exists(aux_gene_path): + raise RuntimeError("Can't find auxiliary gene file") + gene_out = os.path.join(table_folder, 'genes.csv') + write_genes_table(seg_path, aux_gene_path, gene_out, label_ids) + # TODO additional tables: # regions / semantics - # gene expression # ??? -- GitLab