From 7fe3e58daffff7c83642732a3cec3aba93fba3a4 Mon Sep 17 00:00:00 2001 From: Constantin Pape <c.pape@gmx.net> Date: Sun, 14 Jul 2019 22:58:20 +0200 Subject: [PATCH] Refactor morphology feature implementation in separate file --- scripts/extension/attributes/morphology.py | 218 ++-------------- .../extension/attributes/morphology_impl.py | 238 ++++++++++++++++++ 2 files changed, 252 insertions(+), 204 deletions(-) create mode 100644 scripts/extension/attributes/morphology_impl.py diff --git a/scripts/extension/attributes/morphology.py b/scripts/extension/attributes/morphology.py index 0cb522a..2f39c0b 100644 --- a/scripts/extension/attributes/morphology.py +++ b/scripts/extension/attributes/morphology.py @@ -4,23 +4,15 @@ import os import sys import json -# this is a task called by multiple processes, -# so we need to restrict the number of threads used by numpy -from cluster_tools.utils.numpy_utils import set_numpy_threads -set_numpy_threads(1) -import numpy as np - import luigi -import pandas as pd import nifty.tools as nt -from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area -from skimage.transform import resize -from skimage.util import pad +import pandas as pd import cluster_tools.utils.volume_utils as vu import cluster_tools.utils.function_utils as fu from cluster_tools.utils.task_utils import DummyTask from cluster_tools.cluster_tasks import SlurmTask, LocalTask +from scripts.extension.attributes.morphology_impl import morphology_impl # # Morphology Attribute Tasks @@ -116,161 +108,6 @@ class MorphologySlurm(MorphologyBase, SlurmTask): # -# get shape of full data & downsampling factor -def get_scale_factor(path, key_full, key, resolution): - with vu.file_reader(path, 'r') as f: - full_shape = f[key_full].shape - shape = f[key].shape - - # scale factor for downsampling - scale_factor = [res * (fs / sh) - for res, fs, sh in zip(resolution, - full_shape, - shape)] - return scale_factor - - -def filter_table(table, min_size, max_size): - if max_size is None: - table = table.loc[table['n_pixels'] >= min_size, :] - else: - criteria = np.logical_and(table['n_pixels'] > min_size, table['n_pixels'] < max_size) - table = table.loc[criteria, :] - return table - - -def filter_table_from_mapping(table, mapping_path): - # read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id - mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64') - # remove zero labels from this table too, if exist - mapping = mapping[np.logical_and(mapping[:, 0] != 0, - mapping[:, 1] != 0)] - table = table.loc[np.isin(table['label_id'], mapping[:, 0]), :] - return table - - -def load_data(ds, row, scale): - # compute the bounding box from the row information - mins = [row.bb_min_z, row.bb_min_y, row.bb_min_x] - maxs = [row.bb_max_z, row.bb_max_y, row.bb_max_x] - mins = [int(mi / sca) for mi, sca in zip(mins, scale)] - maxs = [int(ma / sca) + 1 for ma, sca in zip(maxs, scale)] - bb = tuple(slice(mi, ma) for mi, ma in zip(mins, maxs)) - # load the data from the bounding box - return ds[bb] - - -def morphology_row_features(mask, scale): - - # Calculate stats from skimage - ski_morph = regionprops(mask.astype('uint8')) - - # volume in pixels - volume_in_pix = ski_morph[0]['area'] - - # extent - extent = ski_morph[0]['extent'] - - # The mesh calculation below fails if an edge of the segmentation is right up against the - # edge of the volume - gives an open, rather than a closed surface - # Pad by a few pixels to avoid this - mask = pad(mask, 10, mode='constant') - - # surface area of mesh around object (other ways to calculate better?) - verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale)) - surface_area = mesh_surface_area(verts, faces) - - # volume in microns - volume_in_microns = np.prod(scale)*volume_in_pix - - # sphericity (as in morpholibj) - # Should run from zero to one - sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3) - - return [volume_in_microns, extent, surface_area, sphericity] - - -def intensity_row_features(raw, mask): - intensity_vals_in_mask = raw[mask] - # mean and stdev - use float64 to avoid silent overflow errors - mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64) - st_dev = np.std(intensity_vals_in_mask, dtype=np.float64) - return mean_intensity, st_dev - - -# compute morphology (and intensity features) for label range -def morphology_features_for_label_range(table, ds, ds_raw, - scale_factor_seg, scale_factor_raw, - label_begin, label_end): - label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end) - sub_table = table.loc[label_range, :] - stats = [] - for row in sub_table.itertuples(index=False): - label_id = int(row.label_id) - - # load the segmentation data from the bounding box corresponding - # to this row - seg = load_data(ds, row, scale_factor_seg) - - # compute the segmentation mask and check that we have - # foreground in the mask - seg_mask = seg == label_id - if seg_mask.sum() == 0: - # if the seg mask is empty, we simply skip this label-id - continue - - # compute the morphology features from the segmentation mask - result = [float(label_id)] + morphology_row_features(seg_mask, scale_factor_seg) - - # compute the intensiry features from raw data and segmentation mask - if ds_raw is not None: - raw = load_data(ds_raw, row, scale_factor_raw) - # resize the segmentation mask if it does not fit the raw data - if seg_mask.shape != raw.shape: - seg_mask = resize(seg_mask, raw.shape, - order=0, mode='reflect', - anti_aliasing=True, preserve_range=True).astype('bool') - result += intensity_row_features(raw, seg_mask) - stats.append(result) - return stats - - -def compute_morphology_features(table, segmentation_path, raw_path, - seg_key, raw_key, - scale_factor_seg, scale_factor_raw, - blocking, block_list): - - if raw_path != '': - assert raw_key is not None and scale_factor_raw is not None - f_raw = vu.file_reader(raw_path, 'r') - ds_raw = f_raw[raw_key] - else: - f_raw = ds_raw = None - - with vu.file_reader(segmentation_path, 'r') as f: - ds = f[seg_key] - - stats = [] - for block_id in block_list: - block = blocking.getBlock(block_id) - label_a, label_b = block.begin[0], block.end[0] - fu.log("Computing features from label-id %i to %i" % (label_a, label_b)) - stats.extend(morphology_features_for_label_range(table, ds, ds_raw, - scale_factor_seg, scale_factor_raw, - label_a, label_b)) - if f_raw is not None: - f_raw.close() - - # convert to pandas table and add column names - stats = pd.DataFrame(stats) - columns = ['label_id', - 'shape_volume_in_microns', 'shape_extent', 'shape_surface_area', 'shape_sphericity'] - if raw_path != '': - columns += ['intensity_mean', 'intensity_st_dev'] - stats.columns = columns - return stats - - def morphology(job_id, config_path): fu.log("start processing job %i" % job_id) @@ -294,49 +131,22 @@ def morphology(job_id, config_path): block_list = config['block_list'] - # keys to segmentation and raw data for the different scales - seg_key_full = 't00000/s00/0/cells' - seg_key = 't00000/s00/%i/cells' % seg_scale - raw_key_full = 't00000/s00/0/cells' - raw_key = 't00000/s00/%i/cells' % raw_scale - - # get scale factor for the segmentation - scale_factor_seg = get_scale_factor(segmentation_path, seg_key_full, seg_key, resolution) - - # get scale factor for raw data (if it's given) - if raw_path != '': - fu.log("Have raw path; compute intensity features") - # NOTE for now we can hard-code the resolution for the raw data here, - # but we might need to change this if we get additional dataset(s) - raw_resolution = [0.025, 0.01, 0.01] - scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution) - else: - fu.log("Don't have raw path; do not compute intensity features") - raw_resolution = scale_factor_raw = None - # read the base table table = pd.read_csv(in_table_path, sep='\t') - n_labels = table.shape[0] - fu.log("Initial number of labels: %i" % n_labels) - # remove zero label if it exists - table = table.loc[table['label_id'] != 0, :] - - # if we have a mappin, only keep objects in the mapping - # (i.e cells that have assigned nuclei) - if mapping_path != '': - fu.log("Have mapping path %s" % mapping_path) - table = filter_table_from_mapping(table, mapping_path) - fu.log("Number of labels after filter with mapping: %i" % table.shape[0]) - # filter by size - table = filter_table(table, min_size, max_size) - fu.log("Number of labels after size filte: %i" % table.shape[0]) + # get the label ranges for this job + n_labels = table.shape[0] blocking = nt.blocking([0], [n_labels], [1000]) - - fu.log("Computing morphology features") - stats = compute_morphology_features(table, segmentation_path, raw_path, - seg_key, raw_key, scale_factor_seg, scale_factor_raw, - blocking, block_list) + label_starts, label_stops = [], [] + for block_id in block_list: + block = blocking.getBlock(block_id) + label_starts.append(block.begin) + label_stops.append(block.end) + + stats = morphology_impl(segmentation_path, raw_path, table, mapping_path, + min_size, max_size, + resolution, raw_scale, seg_scale, + label_starts, label_stops) output_path = output_prefix + '_job%i.csv' % job_id fu.log("Save result to %s" % output_path) diff --git a/scripts/extension/attributes/morphology_impl.py b/scripts/extension/attributes/morphology_impl.py new file mode 100644 index 0000000..cfaab70 --- /dev/null +++ b/scripts/extension/attributes/morphology_impl.py @@ -0,0 +1,238 @@ +#! /bin/python + +# this is a task called by multiple processes, +# so we need to restrict the number of threads used by numpy +from cluster_tools.utils.numpy_utils import set_numpy_threads +set_numpy_threads(1) +import numpy as np + +import h5py +import pandas as pd +from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area +from skimage.transform import resize +from skimage.util import pad + + +# get shape of full data & downsampling factor +def get_scale_factor(path, key_full, key, resolution): + with h5py.File(path, 'r') as f: + full_shape = f[key_full].shape + shape = f[key].shape + + # scale factor for downsampling + scale_factor = [res * (fs / sh) + for res, fs, sh in zip(resolution, + full_shape, + shape)] + return scale_factor + + +def filter_table(table, min_size, max_size): + if max_size is None: + table = table.loc[table['n_pixels'] >= min_size, :] + else: + criteria = np.logical_and(table['n_pixels'] > min_size, table['n_pixels'] < max_size) + table = table.loc[criteria, :] + return table + + +def filter_table_from_mapping(table, mapping_path): + # read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id + mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64') + # remove zero labels from this table too, if exist + mapping = mapping[np.logical_and(mapping[:, 0] != 0, + mapping[:, 1] != 0)] + table = table.loc[np.isin(table['label_id'], mapping[:, 0]), :] + return table + + +def load_data(ds, row, scale): + # compute the bounding box from the row information + mins = [row.bb_min_z, row.bb_min_y, row.bb_min_x] + maxs = [row.bb_max_z, row.bb_max_y, row.bb_max_x] + mins = [int(mi / sca) for mi, sca in zip(mins, scale)] + maxs = [int(ma / sca) + 1 for ma, sca in zip(maxs, scale)] + bb = tuple(slice(mi, ma) for mi, ma in zip(mins, maxs)) + # load the data from the bounding box + return ds[bb] + + +def morphology_row_features(mask, scale): + + # Calculate stats from skimage + ski_morph = regionprops(mask.astype('uint8')) + + # volume in pixels + volume_in_pix = ski_morph[0]['area'] + + # extent + extent = ski_morph[0]['extent'] + + # The mesh calculation below fails if an edge of the segmentation is right up against the + # edge of the volume - gives an open, rather than a closed surface + # Pad by a few pixels to avoid this + mask = pad(mask, 10, mode='constant') + + # surface area of mesh around object (other ways to calculate better?) + verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale)) + surface_area = mesh_surface_area(verts, faces) + + # volume in microns + volume_in_microns = np.prod(scale)*volume_in_pix + + # sphericity (as in morpholibj) + # Should run from zero to one + sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3) + + return [volume_in_microns, extent, surface_area, sphericity] + + +def intensity_row_features(raw, mask): + intensity_vals_in_mask = raw[mask] + # mean and stdev - use float64 to avoid silent overflow errors + mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64) + st_dev = np.std(intensity_vals_in_mask, dtype=np.float64) + return mean_intensity, st_dev + + +# compute morphology (and intensity features) for label range +def morphology_features_for_label_range(table, ds, ds_raw, + scale_factor_seg, scale_factor_raw, + label_begin, label_end): + label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end) + sub_table = table.loc[label_range, :] + stats = [] + for row in sub_table.itertuples(index=False): + label_id = int(row.label_id) + + # load the segmentation data from the bounding box corresponding + # to this row + seg = load_data(ds, row, scale_factor_seg) + + # compute the segmentation mask and check that we have + # foreground in the mask + seg_mask = seg == label_id + if seg_mask.sum() == 0: + # if the seg mask is empty, we simply skip this label-id + continue + + # compute the morphology features from the segmentation mask + result = [float(label_id)] + morphology_row_features(seg_mask, scale_factor_seg) + + # compute the intensiry features from raw data and segmentation mask + if ds_raw is not None: + raw = load_data(ds_raw, row, scale_factor_raw) + # resize the segmentation mask if it does not fit the raw data + if seg_mask.shape != raw.shape: + seg_mask = resize(seg_mask, raw.shape, + order=0, mode='reflect', + anti_aliasing=True, preserve_range=True).astype('bool') + result += intensity_row_features(raw, seg_mask) + stats.append(result) + return stats + + +def compute_morphology_features(table, segmentation_path, raw_path, + seg_key, raw_key, + scale_factor_seg, scale_factor_raw, + label_starts, label_stops): + + if raw_path != '': + assert raw_key is not None and scale_factor_raw is not None + f_raw = h5py.File(raw_path, 'r') + ds_raw = f_raw[raw_key] + else: + f_raw = ds_raw = None + + with h5py.File(segmentation_path, 'r') as f: + ds = f[seg_key] + + stats = [] + for label_a, label_b in zip(label_starts, label_stops): + # fu.log("Computing features from label-id %i to %i" % (label_a, label_b)) + stats.extend(morphology_features_for_label_range(table, ds, ds_raw, + scale_factor_seg, scale_factor_raw, + label_a, label_b)) + if f_raw is not None: + f_raw.close() + + # convert to pandas table and add column names + stats = pd.DataFrame(stats) + columns = ['label_id', + 'shape_volume_in_microns', 'shape_extent', 'shape_surface_area', 'shape_sphericity'] + if raw_path != '': + columns += ['intensity_mean', 'intensity_st_dev'] + stats.columns = columns + return stats + + +def morphology_impl(segmentation_path, raw_path, table, mapping_path, + min_size, max_size, + resolution, raw_scale, seg_scale, + label_starts, label_stops): + """ Compute morphology features for a segmentation. + + Can compute features for multiple label ranges. If you want to + compute features for the full label range, pass + 'label_starts=[0]' and 'label_stops=[number_of_labels]' + + Arguments: + segmentation_path [str] - path to segmentation stored as h5. + raw_path [str] - path to raw data stored as h5. + Pass 'None' if you don't want to compute features based on raw data. + table [pd.DataFrame] - table with default attributes + (sizes, center of mass and bounding boxes) for segmentation + mapping_path [str] - path to - path to nucleus id mapping. + Pass 'None' if not relevant. + min_size [int] - minimal size for objects used in calcualtion + max_size [int] - maximum size for objects used in calcualtion + resolution [listlike] - resolution in nanometer. + Must be given in [Z, Y, X]. + raw_scale [int] - scale level of the raw data + seg_scale [int] - scale level of the segmentation + label_starts [listlike] - list with label start positions + label_stops [listlike] - list with label stop positions + """ + + # keys to segmentation and raw data for the different scales + seg_key_full = 't00000/s00/0/cells' + seg_key = 't00000/s00/%i/cells' % seg_scale + raw_key_full = 't00000/s00/0/cells' + raw_key = 't00000/s00/%i/cells' % raw_scale + + # get scale factor for the segmentation + scale_factor_seg = get_scale_factor(segmentation_path, seg_key_full, seg_key, resolution) + + # get scale factor for raw data (if it's given) + if raw_path != '': + # fu.log("Have raw path; compute intensity features") + # NOTE for now we can hard-code the resolution for the raw data here, + # but we might need to change this if we get additional dataset(s) + raw_resolution = [0.025, 0.01, 0.01] + scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution) + else: + # fu.log("Don't have raw path; do not compute intensity features") + raw_resolution = scale_factor_raw = None + + # remove zero label if it exists + table = table.loc[table['label_id'] != 0, :] + + # if we have a mappin, only keep objects in the mapping + # (i.e cells that have assigned nuclei) + if mapping_path != '': + # fu.log("Have mapping path %s" % mapping_path) + table = filter_table_from_mapping(table, mapping_path) + # fu.log("Number of labels after filter with mapping: %i" % table.shape[0]) + # filter by size + table = filter_table(table, min_size, max_size) + # fu.log("Number of labels after size filte: %i" % table.shape[0]) + + # fu.log("Computing morphology features") + stats = compute_morphology_features(table, segmentation_path, raw_path, + seg_key, raw_key, scale_factor_seg, scale_factor_raw, + label_starts, label_stops) + return stats + + +if __name__ == '__main__': + pass -- GitLab