morphology_impl.py

from datetime import datetime

#TODO - uncomment this part
# this is a task called by multiple processes,
# so we need to restrict the number of threads used by numpy
#from cluster_tools.utils.numpy_utils import set_numpy_threads
#set_numpy_threads(1)
import numpy as np

import h5py
import pandas as pd
from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area
from skimage.transform import resize
from skimage.util import pad


def log(msg):
    print("%s: %s" % (str(datetime.now()), msg))


# get shape of full data & downsampling factor
def get_scale_factor(path, key_full, key, resolution):
    with h5py.File(path, 'r') as f:
        full_shape = f[key_full].shape
        shape = f[key].shape

    # scale factor for downsampling
    scale_factor = [res * (fs / sh)
                    for res, fs, sh in zip(resolution,
                                           full_shape,
                                           shape)]
    return scale_factor


# changed to shape_pixel_size, is the plan to go to n_pixels in future?
def filter_table(table, min_size, max_size):
    if max_size is None:
        table = table.loc[table['shape_pixelsize'] >= min_size, :]
    else:
        criteria = np.logical_and(table['shape_pixelsize'] > min_size, table['shape_pixelsize'] < max_size)
        table = table.loc[criteria, :]
    return table


def filter_table_from_mapping(table, mapping_path):
    # read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id
    mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64')
    # remove zero labels from this table too, if exist
    mapping = mapping[np.logical_and(mapping[:, 0] != 0,
                                     mapping[:, 1] != 0)]
    table = table.loc[np.isin(table['label_id'], mapping[:, 0]), :]
    return table


def load_data(ds, row, scale):
    # compute the bounding box from the row information
    mins = [row.bb_min_z, row.bb_min_y, row.bb_min_x]
    maxs = [row.bb_max_z, row.bb_max_y, row.bb_max_x]
    mins = [int(mi / sca) for mi, sca in zip(mins, scale)]
    maxs = [int(ma / sca) + 1 for ma, sca in zip(maxs, scale)]
    bb = tuple(slice(mi, ma) for mi, ma in zip(mins, maxs))
    # load the data from the bounding box
    return ds[bb]


def morphology_row_features(mask, scale):

    # Calculate stats from skimage
    ski_morph = regionprops(mask.astype('uint8'))

    # volume in pixels
    volume_in_pix = ski_morph[0]['area']

    # extent
    extent = ski_morph[0]['extent']

    # The mesh calculation below fails if an edge of the segmentation is right up against the
    # edge of the volume - gives an open, rather than a closed surface
    # Pad by a few pixels to avoid this
    mask = pad(mask, 10, mode='constant')

    # surface area of mesh around object (other ways to calculate better?)
    verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale))
    surface_area = mesh_surface_area(verts, faces)

    # volume in microns
    volume_in_microns = np.prod(scale)*volume_in_pix

    # sphericity (as in morpholibj)
    # Should run from zero to one
    sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3)

    return [volume_in_microns, extent, surface_area, sphericity]


def intensity_row_features(raw, mask):
    intensity_vals_in_mask = raw[mask]
    # mean and stdev - use float64 to avoid silent overflow errors
    mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64)
    st_dev = np.std(intensity_vals_in_mask, dtype=np.float64)
    return mean_intensity, st_dev


# compute morphology (and intensity features) for label range
def morphology_features_for_label_range(table, ds, ds_raw,
                                        scale_factor_seg, scale_factor_raw,
                                        label_begin, label_end):
    label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end)
    sub_table = table.loc[label_range, :]
    stats = []
    for row in sub_table.itertuples(index=False):
        label_id = int(row.label_id)

        # load the segmentation data from the bounding box corresponding
        # to this row
        seg = load_data(ds, row, scale_factor_seg)

        # compute the segmentation mask and check that we have
        # foreground in the mask
        seg_mask = seg == label_id
        if seg_mask.sum() == 0:
            # if the seg mask is empty, we simply skip this label-id
            continue

        # compute the morphology features from the segmentation mask
        result = [float(label_id)] + morphology_row_features(seg_mask, scale_factor_seg)

        # compute the intensiry features from raw data and segmentation mask
        if ds_raw is not None:
            raw = load_data(ds_raw, row, scale_factor_raw)
            # resize the segmentation mask if it does not fit the raw data
            if seg_mask.shape != raw.shape:
                seg_mask = resize(seg_mask, raw.shape,
                                  order=0, mode='reflect',
                                  anti_aliasing=True, preserve_range=True).astype('bool')
            result += intensity_row_features(raw, seg_mask)
        stats.append(result)
    return stats


def compute_morphology_features(table, segmentation_path, raw_path,
                                seg_key, raw_key,
                                scale_factor_seg, scale_factor_raw,
                                label_starts, label_stops):

    if raw_path != '':
        assert raw_key is not None and scale_factor_raw is not None
        f_raw = h5py.File(raw_path, 'r')
        ds_raw = f_raw[raw_key]
    else:
        f_raw = ds_raw = None

    with h5py.File(segmentation_path, 'r') as f:
        ds = f[seg_key]

        stats = []
        for label_a, label_b in zip(label_starts, label_stops):
            log("Computing features from label-id %i to %i" % (label_a, label_b))
            stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
                                                             scale_factor_seg, scale_factor_raw,
                                                             label_a, label_b))
    if f_raw is not None:
        f_raw.close()

    # convert to pandas table and add column names
    stats = pd.DataFrame(stats)
    columns = ['label_id',
               'shape_volume_in_microns', 'shape_extent', 'shape_surface_area', 'shape_sphericity']
    if raw_path != '':
        columns += ['intensity_mean', 'intensity_st_dev']
    stats.columns = columns
    return stats


def morphology_impl(segmentation_path, raw_path, table, mapping_path,
                    min_size, max_size,
                    resolution, raw_scale, seg_scale,
                    label_starts, label_stops):
    """ Compute morphology features for a segmentation.

    Can compute features for multiple label ranges. If you want to
    compute features for the full label range, pass
    'label_starts=[0]' and 'label_stops=[number_of_labels]'

    Arguments:
        segmentation_path [str] - path to segmentation stored as h5.
        raw_path [str] - path to raw data stored as h5.
            Pass 'None' if you don't want to compute features based on raw data.
        table [pd.DataFrame] - table with default attributes
            (sizes, center of mass and bounding boxes) for segmentation
        mapping_path [str] - path to - path to nucleus id mapping.
            Pass 'None' if not relevant.
        min_size [int] - minimal size for objects used in calcualtion
        max_size [int] - maximum size for objects used in calcualtion
        resolution [listlike] - resolution in nanometer.
            Must be given in [Z, Y, X].
        raw_scale [int] - scale level of the raw data
        seg_scale [int] - scale level of the segmentation
        label_starts [listlike] - list with label start positions
        label_stops [listlike] - list with label stop positions
    """

    # keys to segmentation and raw data for the different scales
    seg_key_full = 't00000/s00/0/cells'
    seg_key = 't00000/s00/%i/cells' % seg_scale
    raw_key_full = 't00000/s00/0/cells'
    raw_key = 't00000/s00/%i/cells' % raw_scale

    # get scale factor for the segmentation
    scale_factor_seg = get_scale_factor(segmentation_path, seg_key_full, seg_key, resolution)

    # get scale factor for raw data (if it's given)
    if raw_path != '':
        log("Have raw path; compute intensity features")
        # NOTE for now we can hard-code the resolution for the raw data here,
        # but we might need to change this if we get additional dataset(s)
        raw_resolution = [0.025, 0.01, 0.01]
        scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
    else:
        log("Don't have raw path; do not compute intensity features")
        raw_resolution = scale_factor_raw = None

    # remove zero label if it exists
    table = table.loc[table['label_id'] != 0, :]

    # if we have a mappin, only keep objects in the mapping
    # (i.e cells that have assigned nuclei)
    if mapping_path != '':
        log("Have mapping path %s" % mapping_path)
        table = filter_table_from_mapping(table, mapping_path)
        log("Number of labels after filter with mapping: %i" % table.shape[0])
    # filter by size
    table = filter_table(table, min_size, max_size)
    log("Number of labels after size filte: %i" % table.shape[0])

    log("Computing morphology features")
    stats = compute_morphology_features(table, segmentation_path, raw_path,
                                        seg_key, raw_key, scale_factor_seg, scale_factor_raw,
                                        label_starts, label_stops)
    return stats


if __name__ == '__main__':
    pass