morphology_impl.py

from datetime import datetime

# this is a task called by multiple processes,
# so we need to restrict the number of threads used by numpy
from elf.util import set_numpy_threads
set_numpy_threads(1)

import numpy as np

import vigra
import h5py
import pandas as pd
from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area
from skimage.util import pad
from scipy.ndimage.morphology import distance_transform_edt
from mahotas.features import haralick
from skimage.morphology import label, remove_small_objects


def log(msg):
    print("%s: %s" % (str(datetime.now()), msg))


# get shape of full data & downsampling factor
def get_scale_factor(path, key_full, key, resolution):
    with h5py.File(path, 'r') as f:
        full_shape = f[key_full].shape
        shape = f[key].shape

    # scale factor for downsampling
    scale_factor = [res * (fs / sh)
                    for res, fs, sh in zip(resolution,
                                           full_shape,
                                           shape)]
    return scale_factor


def filter_table(table, min_size, max_size):
    if max_size is None:
        table = table.loc[table['n_pixels'] >= min_size, :]
    else:
        criteria = np.logical_and(table['n_pixels'] > min_size, table['n_pixels'] < max_size)
        table = table.loc[criteria, :]
    return table


# some cell segmentations have a sensible total pixel size but very large bounding boxes i.e. they are small spots
# distributed over a large region, not one cell > filter for these cases by capping the bounding box size
def filter_table_bb(table, max_bb):
    total_bb = (table['bb_max_z'] - table['bb_min_z']) * (table['bb_max_y'] - table['bb_min_y']) * (
            table['bb_max_x'] - table['bb_min_x'])

    table = table.loc[total_bb < max_bb, :]

    return table


def filter_table_from_mapping(table, mapping_path):
    # read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id
    mapping = pd.read_csv(mapping_path, sep='\t')

    # remove zero labels from this table too, if exist
    mapping = mapping.loc[np.logical_and(mapping.iloc[:, 0] != 0,
                                         mapping.iloc[:, 1] != 0), :]
    table = table.loc[np.isin(table['label_id'], mapping['label_id']), :]

    # add a column for the 'nucleus_id' of the mapped nucleus (use this later to exclude the area covered
    # by the nucleus)
    table = table.join(mapping.set_index('label_id'), on='label_id', how='left')

    return table


# regions here are regions to exclude
def filter_table_region(table, region_path, regions=('empty', 'yolk', 'neuropil', 'cuticle')):
    region_mapping = pd.read_csv(region_path, sep='\t')

    # remove zero label if it exists
    region_mapping = region_mapping.loc[region_mapping['label_id'] != 0, :]

    for region in regions:
        region_mapping = region_mapping.loc[region_mapping[region] == 0, :]

    table = table.loc[np.isin(table['label_id'], region_mapping['label_id']), :]

    return table


def run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path):
    # remove zero label if present
    table = table.loc[table['label_id'] != 0, :]

    # filter to only keep cells with assigned nuclei
    if mapping_path is not None:
        log("Have mapping path %s" % mapping_path)
        table = filter_table_from_mapping(table, mapping_path)
        log("Number of labels after filter with mapping: %i" % table.shape[0])

    # filter to exclude certain regions
    if region_mapping_path is not None:
        log("Have region mapping path %s" % region_mapping_path)
        table = filter_table_region(table, region_mapping_path)
        log("Number of labels after region filter: %i" % table.shape[0])

    # filter by size of object (no. pixels)
    if min_size is not None or max_size is not None:
        table = filter_table(table, min_size, max_size)
        log("Number of labels after size filter: %i" % table.shape[0])

    # filter by bounding box size
    if max_bb is not None:
        table = filter_table_bb(table, max_bb)
        log("Number of labels after bounding box size filter %i" % table.shape[0])

    return table


def load_data(ds, row, scale):
    # compute the bounding box from the row information
    mins = [row.bb_min_z, row.bb_min_y, row.bb_min_x]
    maxs = [row.bb_max_z, row.bb_max_y, row.bb_max_x]
    mins = [int(mi / sca) for mi, sca in zip(mins, scale)]
    maxs = [int(ma / sca) + 1 for ma, sca in zip(maxs, scale)]
    bb = tuple(slice(mi, ma) for mi, ma in zip(mins, maxs))
    # load the data from the bounding box
    return ds[bb]


def generate_column_names(raw_path, chromatin_path, exclude_path):
    columns = ['label_id']
    morph_columns = ['shape_volume_in_microns', 'shape_extent', 'shape_equiv_diameter',
                     'shape_major_axis', 'shape_minor_axis', 'shape_surface_area', 'shape_sphericity',
                     'shape_max_radius']

    columns += morph_columns

    if raw_path is not None:
        intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr',
                             'intensity_total']
        texture_columns = ['texture_hara%s' % x for x in range(1, 14)]

        columns += intensity_columns

        if exclude_path is None:
            # radial intensity columns
            for val in [25, 50, 75, 100]:
                columns += ['%s_%s' % (var, val) for var in intensity_columns]

        columns += texture_columns

    if chromatin_path is not None:
        edt_columns = ['shape_edt_mean', 'shape_edt_stdev', 'shape_edt_median', 'shape_edt_iqr']
        edt_columns += ['shape_percent_%s' % var for var in [25, 50, 75, 100]]

        for phase in ['_het', '_eu']:
            columns += [var + phase for var in morph_columns]
            columns += [var + phase for var in edt_columns]

            if raw_path is not None:
                columns += [var + phase for var in intensity_columns]
                columns += [var + phase for var in texture_columns]

    return columns


def morphology_row_features(mask, scale):
    # Calculate stats from skimage
    ski_morph = regionprops(mask.astype('uint8'))

    volume_in_pix = ski_morph[0]['area']
    volume_in_microns = np.prod(scale) * volume_in_pix
    extent = ski_morph[0]['extent']
    equiv_diameter = ski_morph[0]['equivalent_diameter']
    major_axis = ski_morph[0]['major_axis_length']
    minor_axis = ski_morph[0]['minor_axis_length']

    # The mesh calculation below fails if an edge of the segmentation is right up against the
    # edge of the volume - gives an open, rather than a closed surface
    # Pad by a few pixels to avoid this
    mask = pad(mask, 10, mode='constant')

    # surface area of mesh around object (other ways to calculate better?)
    verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale))
    surface_area = mesh_surface_area(verts, faces)

    # sphericity (as in morpholibj)
    # Should run from zero to one
    sphericity = (36 * np.pi * (float(volume_in_microns) ** 2)) / (float(surface_area) ** 3)

    # max radius = max distance from pixel to outside
    edt = distance_transform_edt(mask, sampling=scale, return_distances=True)
    max_radius = np.max(edt)

    return (volume_in_microns, extent, equiv_diameter, major_axis,
            minor_axis, surface_area, sphericity, max_radius)


def intensity_row_features(raw, mask):
    intensity_vals_in_mask = raw[mask]
    # mean and stdev - use float64 to avoid silent overflow errors
    mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64)
    st_dev = np.std(intensity_vals_in_mask, dtype=np.float64)
    median_intensity = np.median(intensity_vals_in_mask)

    quartile_75, quartile_25 = np.percentile(intensity_vals_in_mask, [75, 25])
    interquartile_range_intensity = quartile_75 - quartile_25

    total = np.sum(intensity_vals_in_mask, dtype=np.float64)

    return mean_intensity, st_dev, median_intensity, interquartile_range_intensity, total


def radial_intensity_row_features(raw, mask, scale, stops=(0.0, 0.25, 0.5, 0.75, 1.0)):
    result = ()

    edt = distance_transform_edt(mask, sampling=scale, return_distances=True)
    edt = edt / np.max(edt)

    bottoms = stops[0:len(stops) - 1]
    tops = stops[1:]

    radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)]

    for m in radial_masks:
        result += intensity_row_features(raw, m)

    return result


def texture_row_features(raw, mask):
    # errors if there are small, isolated spots (because I'm using ignore zeros as true)
    # so here remove components that are < 10 pixels
    # may still error in some cases
    labelled = label(mask)
    if len(np.unique(labelled)) > 2:
        labelled = remove_small_objects(labelled, min_size=10)
        mask = labelled != 0
        mask = mask.astype('uint8')

    # set regions outside mask to zero
    raw_copy = raw.copy()
    raw_copy[mask == 0] = 0

    try:
        hara = haralick(raw_copy, ignore_zeros=True, return_mean=True, distance=2)

    except ValueError:
        log('Texture computation failed - can happen when using ignore_zeros')
        hara = (0.,) * 13

    return tuple(hara)


def radial_distribution(edt, mask, stops=(0.0, 0.25, 0.5, 0.75, 1.0)):
    result = ()

    bottoms = stops[0:len(stops) - 1]
    tops = stops[1:]

    radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)]

    for m in radial_masks:
        # percent of that zone that is covered by the mask
        result += (np.sum(mask[m]) / np.sum(m),)

    return result


def chromatin_row_features(chromatin, edt, raw, scale_chromatin):
    result = ()

    result += morphology_row_features(chromatin, scale_chromatin)

    # edt stats i.e. stats on distribution of chromatin, dropping the total value
    result += intensity_row_features(edt, chromatin)[:-1]
    result += radial_distribution(edt, chromatin)

    if raw is not None:
        # resize the chromatin masks if not same size as raw
        chromatin_type = chromatin.dtype
        if chromatin.shape != raw.shape:
            chromatin = chromatin.astype('float32')
            chromatin = vigra.sampling.resize(chromatin, shape=raw.shape, order=0)
            chromatin = chromatin.astype(chromatin_type)

        result += intensity_row_features(raw, chromatin)
        result += texture_row_features(raw, chromatin)

    return result


# compute morphology (and intensity features) for label range
def morphology_features_for_label_range(table, ds, ds_raw,
                                        ds_chromatin,
                                        ds_exclude,
                                        scale_factor_seg, scale_factor_raw,
                                        scale_factor_chromatin,
                                        scale_factor_exclude,
                                        label_begin, label_end):
    label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end)
    sub_table = table.loc[label_range, :]
    stats = []
    for row in sub_table.itertuples(index=False):
        log(str(row.label_id))
        label_id = int(row.label_id)

        # load the segmentation data from the bounding box corresponding
        # to this row
        seg = load_data(ds, row, scale_factor_seg)

        # compute the segmentation mask and check that we have
        # foreground in the mask
        seg_mask = seg == label_id
        if seg_mask.sum() == 0:
            # if the seg mask is empty, we simply skip this label-id
            continue

        # compute the morphology features from the segmentation mask
        result = (float(label_id),) + morphology_row_features(seg_mask, scale_factor_seg)

        if ds_exclude is not None:
            exclude = load_data(ds_exclude, row, scale_factor_exclude)
            exclude_type = exclude.dtype

            # resize to fit seg
            if exclude.shape != seg_mask.shape:
                exclude = exclude.astype('float32')
                exclude = vigra.sampling.resize(exclude, shape=seg_mask.shape, order=0)
                exclude = exclude.astype(exclude_type)

            # binary for correct nucleus
            exclude = exclude == int(row.nucleus_id)

            # remove nucleus area form seg_mask
            seg_mask[exclude] = False

        # compute the intensity features from raw data and segmentation mask
        if ds_raw is not None:
            raw = load_data(ds_raw, row, scale_factor_raw)

            # resize the segmentation mask if it does not fit the raw data
            seg_mask_type = seg_mask.dtype

            if seg_mask.shape != raw.shape:
                seg_mask = seg_mask.astype('float32')
                seg_mask = vigra.sampling.resize(seg_mask, shape=raw.shape, order=0)
                seg_mask = seg_mask.astype(seg_mask_type)

            result += intensity_row_features(raw, seg_mask)
            # doesn't make sense to run the radial intensity if the nucleus area is being excluded, as the euclidean
            # distance transform then gives distance from the outside & nuclear surface - hard to interpret
            if ds_exclude is None:
                result += radial_intensity_row_features(raw, seg_mask, scale_factor_raw)
            result += texture_row_features(raw, seg_mask)

        if ds_chromatin is not None:
            chromatin = load_data(ds_chromatin, row, scale_factor_chromatin)

            # set to 1 (heterochromatin), 2 (euchromatin)
            heterochromatin = chromatin == label_id + 12000
            euchromatin = chromatin == label_id

            # skip if no chromatin segmentation
            total_heterochromatin = heterochromatin.sum()
            total_euchromatin = euchromatin.sum()
            if total_heterochromatin == 0 and total_euchromatin.sum() == 0:
                continue

            # euclidean distance transform for whole nucleus, normalised to run from 0 to 1
            whole_nucleus = np.logical_or(heterochromatin, euchromatin)
            edt = distance_transform_edt(whole_nucleus, sampling=scale_factor_chromatin, return_distances=True)
            edt = edt / np.max(edt)

            if ds_raw is None:
                raw = None

            if total_heterochromatin != 0:
                result += chromatin_row_features(heterochromatin, edt, raw, scale_factor_chromatin)
            else:
                result += (0.,) * 36

            if total_euchromatin != 0:
                result += chromatin_row_features(euchromatin, edt, raw, scale_factor_chromatin)
            else:
                result += (0.,) * 36

        stats.append(result)
    return stats


def morphology_impl_nucleus(nucleus_segmentation_path, raw_path, chromatin_path,
                            table,
                            min_size, max_size,
                            max_bb,
                            nucleus_resolution, chromatin_resolution,
                            nucleus_seg_scale, raw_scale, chromatin_scale,
                            label_starts, label_stops):
    """ Compute morphology features for nucleus segmentation.

       Can compute features for multiple label ranges. If you want to
       compute features for the full label range, pass
       'label_starts=[0]' and 'label_stops=[number_of_labels]'

       Arguments:
           nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5.
           raw_path [str] - path to raw data stored as h5.
               Pass 'None' if you don't want to compute features based on raw data.
           chromatin_path [str] - path to chromatin segmentation data stored as h5.
           table [pd.DataFrame] - table with default attributes
               (sizes, center of mass and bounding boxes) for segmentation
           min_size [int] - minimal size for objects used in calculation
           max_size [int] - maximum size for objects used in calculation
           max_bb [int] - maximum total volume of bounding box for objects used in calculation
           nucleus_resolution [listlike] - resolution in nanometer.
               Must be given in [Z, Y, X].
           chromatin_resolution [listlike] - resolution in nanometer.
               Must be given in [Z, Y, X].
           nucleus_seg_scale [int] - scale level of the segmentation.
           raw_scale [int] - scale level of the raw data
           chromatin_scale [int] - scale level of the segmentation.
           label_starts [listlike] - list with label start positions
           label_stops [listlike] - list with label stop positions
       """

    # keys for the different scales
    nucleus_seg_key_full = 't00000/s00/0/cells'
    nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale
    raw_key_full = 't00000/s00/0/cells'
    raw_key = 't00000/s00/%i/cells' % raw_scale
    chromatin_key_full = 't00000/s00/0/cells'
    chromatin_key = 't00000/s00/%i/cells' % chromatin_scale

    # filter table
    table = run_all_filters(table, min_size, max_size, max_bb, None, None)

    # get scale factors
    if raw_path is not None:
        log("Have raw path; compute intensity features")
        # NOTE for now we can hard-code the resolution for the raw data here,
        # but we might need to change this if we get additional dataset(s)
        raw_resolution = [0.025, 0.01, 0.01]
        scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
        f_raw = h5py.File(raw_path, 'r')
        ds_raw = f_raw[raw_key]
    else:
        log("Don't have raw path; do not compute intensity features")
        scale_factor_raw = f_raw = ds_raw = None

    if chromatin_path is not None:
        log("Have chromatin path; compute chromatin features")
        scale_factor_chromatin = get_scale_factor(chromatin_path, chromatin_key_full, chromatin_key,
                                                  chromatin_resolution)
        f_chromatin = h5py.File(chromatin_path, 'r')
        ds_chromatin = f_chromatin[chromatin_key]
    else:
        log("Don't have chromatin path; do not compute chromatin features")
        scale_factor_chromatin = f_chromatin = ds_chromatin = None

    log("Computing morphology features")
    scale_factor_nucleus_seg = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
                                                nucleus_resolution)
    with h5py.File(nucleus_segmentation_path, 'r') as f:
        ds = f[nucleus_seg_key]

        stats = []
        for label_a, label_b in zip(label_starts, label_stops):
            log("Computing features from label-id %i to %i" % (label_a, label_b))
            stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
                                                             ds_chromatin,
                                                             None,
                                                             scale_factor_nucleus_seg, scale_factor_raw,
                                                             scale_factor_chromatin,
                                                             None,
                                                             label_a, label_b))

    for var in f_raw, f_chromatin:
        if var is not None:
            var.close()

    # convert to pandas table and add column names
    stats = pd.DataFrame(stats)
    stats.columns = generate_column_names(raw_path, chromatin_path, None)

    return stats


def morphology_impl_cell(cell_segmentation_path, raw_path,
                         nucleus_segmentation_path,
                         table, mapping_path,
                         region_mapping_path,
                         min_size, max_size,
                         max_bb,
                         cell_resolution, nucleus_resolution,
                         cell_seg_scale, raw_scale, nucleus_seg_scale,
                         label_starts, label_stops):
    """ Compute morphology features for cell segmentation.

       Can compute features for multiple label ranges. If you want to
       compute features for the full label range, pass
       'label_starts=[0]' and 'label_stops=[number_of_labels]'

       Arguments:
           cell_segmentation_path [str] - path to cell segmentation stored as h5.
           raw_path [str] - path to raw data stored as h5.
               Pass 'None' if you don't want to compute features based on raw data.
           nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5.
               Pass 'None' if you don't want to exclude the region covered by the nucleus from intensity &
               texture calculations.
           table [pd.DataFrame] - table with default attributes
               (sizes, center of mass and bounding boxes) for segmentation
           mapping_path [str] - path to - path to nucleus id mapping.
               Pass 'None' if not relevant.
           region_mapping_path [str] - path to - path to cellid to region mapping
               Pass 'None' if not relevant
           min_size [int] - minimal size for objects used in calculation
           max_size [int] - maximum size for objects used in calculation
           max_bb [int] - maximum total volume of bounding box for objects used in calculation
           cell_resolution [listlike] - resolution in nanometer.
               Must be given in [Z, Y, X].
           nucleus_resolution [listlike] - resolution in nanometer.
               Must be given in [Z, Y, X].
           cell_seg_scale [int] - scale level of the segmentation
           raw_scale [int] - scale level of the raw data
           nucleus_seg_scale [int] - scale level of the segmentation.
           label_starts [listlike] - list with label start positions
           label_stops [listlike] - list with label stop positions
       """

    # keys for the different scales
    cell_seg_key_full = 't00000/s00/0/cells'
    cell_seg_key = 't00000/s00/%i/cells' % cell_seg_scale
    raw_key_full = 't00000/s00/0/cells'
    raw_key = 't00000/s00/%i/cells' % raw_scale
    nucleus_seg_key_full = 't00000/s00/0/cells'
    nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale

    # filter table
    table = run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path)

    # get scale factors
    if raw_path is not None:
        log("Have raw path; compute intensity features")
        # NOTE for now we can hard-code the resolution for the raw data here,
        # but we might need to change this if we get additional dataset(s)
        raw_resolution = [0.025, 0.01, 0.01]
        scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
        f_raw = h5py.File(raw_path, 'r')
        ds_raw = f_raw[raw_key]
    else:
        log("Don't have raw path; do not compute intensity features")
        scale_factor_raw = f_raw = ds_raw = None

    if nucleus_segmentation_path is not None:
        log("Have nucleus path; exclude nucleus for intensity measures")
        scale_factor_nucleus = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
                                                nucleus_resolution)
        f_nucleus = h5py.File(nucleus_segmentation_path, 'r')
        ds_nucleus = f_nucleus[nucleus_seg_key]
    else:
        log("Don't have exclude path; don't exclude nucleus area for intensity measures")
        scale_factor_nucleus = f_nucleus = ds_nucleus = None

    log("Computing morphology features")
    scale_factor_cell_seg = get_scale_factor(cell_segmentation_path, cell_seg_key_full, cell_seg_key, cell_resolution)
    with h5py.File(cell_segmentation_path, 'r') as f:
        ds = f[cell_seg_key]

        stats = []
        for label_a, label_b in zip(label_starts, label_stops):
            log("Computing features from label-id %i to %i" % (label_a, label_b))
            stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
                                                             None,
                                                             ds_nucleus,
                                                             scale_factor_cell_seg, scale_factor_raw,
                                                             None,
                                                             scale_factor_nucleus,
                                                             label_a, label_b))

    for var in f_raw, f_nucleus:
        if var is not None:
            var.close()

    # convert to pandas table and add column names
    stats = pd.DataFrame(stats)
    stats.columns = generate_column_names(raw_path, None, nucleus_segmentation_path)

    return stats


if __name__ == '__main__':
    pass