Skip to content
Snippets Groups Projects

Morphology

Merged Kimberly Isobel Meechan requested to merge morphology into master
2 files
+ 439
89
Compare changes
  • Side-by-side
  • Inline
Files
2
@@ -4,13 +4,17 @@ from datetime import datetime
# so we need to restrict the number of threads used by numpy
from elf.util import set_numpy_threads
set_numpy_threads(1)
import numpy as np
import vigra
import h5py
import pandas as pd
from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area
from skimage.transform import resize
from skimage.util import pad
from scipy.ndimage.morphology import distance_transform_edt
from mahotas.features import haralick
from skimage.morphology import label, remove_small_objects
def log(msg):
@@ -40,13 +44,74 @@ def filter_table(table, min_size, max_size):
return table
# some cell segmentations have a sensible total pixel size but very large bounding boxes i.e. they are small spots
# distributed over a large region, not one cell > filter for these cases by capping the bounding box size
def filter_table_bb(table, max_bb):
total_bb = (table['bb_max_z'] - table['bb_min_z']) * (table['bb_max_y'] - table['bb_min_y']) * (
table['bb_max_x'] - table['bb_min_x'])
table = table.loc[total_bb < max_bb, :]
return table
def filter_table_from_mapping(table, mapping_path):
# read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id
mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64')
mapping = pd.read_csv(mapping_path, sep='\t')
# remove zero labels from this table too, if exist
mapping = mapping[np.logical_and(mapping[:, 0] != 0,
mapping[:, 1] != 0)]
table = table.loc[np.isin(table['label_id'], mapping[:, 0]), :]
mapping = mapping.loc[np.logical_and(mapping.iloc[:, 0] != 0,
mapping.iloc[:, 1] != 0), :]
table = table.loc[np.isin(table['label_id'], mapping['label_id']), :]
# add a column for the 'nucleus_id' of the mapped nucleus (use this later to exclude the area covered
# by the nucleus)
table = table.join(mapping.set_index('label_id'), on='label_id', how='left')
return table
# regions here are regions to exclude
def filter_table_region(table, region_path, regions=('empty', 'yolk', 'neuropil', 'cuticle')):
region_mapping = pd.read_csv(region_path, sep='\t')
# remove zero label if it exists
region_mapping = region_mapping.loc[region_mapping['label_id'] != 0, :]
for region in regions:
region_mapping = region_mapping.loc[region_mapping[region] == 0, :]
table = table.loc[np.isin(table['label_id'], region_mapping['label_id']), :]
return table
def run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path):
# remove zero label if present
table = table.loc[table['label_id'] != 0, :]
# filter to only keep cells with assigned nuclei
if mapping_path is not None:
log("Have mapping path %s" % mapping_path)
table = filter_table_from_mapping(table, mapping_path)
log("Number of labels after filter with mapping: %i" % table.shape[0])
# filter to exclude certain regions
if region_mapping_path is not None:
log("Have region mapping path %s" % region_mapping_path)
table = filter_table_region(table, region_mapping_path)
log("Number of labels after region filter: %i" % table.shape[0])
# filter by size of object (no. pixels)
if min_size is not None or max_size is not None:
table = filter_table(table, min_size, max_size)
log("Number of labels after size filter: %i" % table.shape[0])
# filter by bounding box size
if max_bb is not None:
table = filter_table_bb(table, max_bb)
log("Number of labels after bounding box size filter %i" % table.shape[0])
return table
@@ -61,16 +126,53 @@ def load_data(ds, row, scale):
return ds[bb]
def morphology_row_features(mask, scale):
def generate_column_names(raw_path, chromatin_path, exclude_path):
columns = ['label_id']
morph_columns = ['shape_volume_in_microns', 'shape_extent', 'shape_equiv_diameter',
'shape_major_axis', 'shape_minor_axis', 'shape_surface_area', 'shape_sphericity',
'shape_max_radius']
columns += morph_columns
if raw_path is not None:
intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr',
'intensity_total']
texture_columns = ['texture_hara%s' % x for x in range(1, 14)]
columns += intensity_columns
if exclude_path is None:
# radial intensity columns
for val in [25, 50, 75, 100]:
columns += ['%s_%s' % (var, val) for var in intensity_columns]
columns += texture_columns
if chromatin_path is not None:
edt_columns = ['shape_edt_mean', 'shape_edt_stdev', 'shape_edt_median', 'shape_edt_iqr']
edt_columns += ['shape_percent_%s' % var for var in [25, 50, 75, 100]]
for phase in ['_het', '_eu']:
columns += [var + phase for var in morph_columns]
columns += [var + phase for var in edt_columns]
if raw_path is not None:
columns += [var + phase for var in intensity_columns]
columns += [var + phase for var in texture_columns]
return columns
def morphology_row_features(mask, scale):
# Calculate stats from skimage
ski_morph = regionprops(mask.astype('uint8'))
# volume in pixels
volume_in_pix = ski_morph[0]['area']
# extent
volume_in_microns = np.prod(scale) * volume_in_pix
extent = ski_morph[0]['extent']
equiv_diameter = ski_morph[0]['equivalent_diameter']
major_axis = ski_morph[0]['major_axis_length']
minor_axis = ski_morph[0]['minor_axis_length']
# The mesh calculation below fails if an edge of the segmentation is right up against the
# edge of the volume - gives an open, rather than a closed surface
@@ -81,14 +183,16 @@ def morphology_row_features(mask, scale):
verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale))
surface_area = mesh_surface_area(verts, faces)
# volume in microns
volume_in_microns = np.prod(scale)*volume_in_pix
# sphericity (as in morpholibj)
# Should run from zero to one
sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3)
sphericity = (36 * np.pi * (float(volume_in_microns) ** 2)) / (float(surface_area) ** 3)
return [volume_in_microns, extent, surface_area, sphericity]
# max radius = max distance from pixel to outside
edt = distance_transform_edt(mask, sampling=scale, return_distances=True)
max_radius = np.max(edt)
return (volume_in_microns, extent, equiv_diameter, major_axis,
minor_axis, surface_area, sphericity, max_radius)
def intensity_row_features(raw, mask):
@@ -96,17 +200,108 @@ def intensity_row_features(raw, mask):
# mean and stdev - use float64 to avoid silent overflow errors
mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64)
st_dev = np.std(intensity_vals_in_mask, dtype=np.float64)
return mean_intensity, st_dev
median_intensity = np.median(intensity_vals_in_mask)
quartile_75, quartile_25 = np.percentile(intensity_vals_in_mask, [75, 25])
interquartile_range_intensity = quartile_75 - quartile_25
total = np.sum(intensity_vals_in_mask, dtype=np.float64)
return mean_intensity, st_dev, median_intensity, interquartile_range_intensity, total
def radial_intensity_row_features(raw, mask, scale, stops=(0.0, 0.25, 0.5, 0.75, 1.0)):
result = ()
edt = distance_transform_edt(mask, sampling=scale, return_distances=True)
edt = edt / np.max(edt)
bottoms = stops[0:len(stops) - 1]
tops = stops[1:]
radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)]
for m in radial_masks:
result += intensity_row_features(raw, m)
return result
def texture_row_features(raw, mask):
# errors if there are small, isolated spots (because I'm using ignore zeros as true)
# so here remove components that are < 10 pixels
# may still error in some cases
labelled = label(mask)
if len(np.unique(labelled)) > 2:
labelled = remove_small_objects(labelled, min_size=10)
mask = labelled != 0
mask = mask.astype('uint8')
# set regions outside mask to zero
raw_copy = raw.copy()
raw_copy[mask == 0] = 0
try:
hara = haralick(raw_copy, ignore_zeros=True, return_mean=True, distance=2)
except ValueError:
log('Texture computation failed - can happen when using ignore_zeros')
hara = (0.,) * 13
return tuple(hara)
def radial_distribution(edt, mask, stops=(0.0, 0.25, 0.5, 0.75, 1.0)):
result = ()
bottoms = stops[0:len(stops) - 1]
tops = stops[1:]
radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)]
for m in radial_masks:
# percent of that zone that is covered by the mask
result += (np.sum(mask[m]) / np.sum(m),)
return result
def chromatin_row_features(chromatin, edt, raw, scale_chromatin):
result = ()
result += morphology_row_features(chromatin, scale_chromatin)
# edt stats i.e. stats on distribution of chromatin, dropping the total value
result += intensity_row_features(edt, chromatin)[:-1]
result += radial_distribution(edt, chromatin)
if raw is not None:
# resize the chromatin masks if not same size as raw
chromatin_type = chromatin.dtype
if chromatin.shape != raw.shape:
chromatin = chromatin.astype('float32')
chromatin = vigra.sampling.resize(chromatin, shape=raw.shape, order=0)
chromatin = chromatin.astype(chromatin_type)
result += intensity_row_features(raw, chromatin)
result += texture_row_features(raw, chromatin)
return result
# compute morphology (and intensity features) for label range
def morphology_features_for_label_range(table, ds, ds_raw,
ds_chromatin,
ds_exclude,
scale_factor_seg, scale_factor_raw,
scale_factor_chromatin,
scale_factor_exclude,
label_begin, label_end):
label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end)
sub_table = table.loc[label_range, :]
stats = []
for row in sub_table.itertuples(index=False):
log(str(row.label_id))
label_id = int(row.label_id)
# load the segmentation data from the bounding box corresponding
@@ -121,120 +316,274 @@ def morphology_features_for_label_range(table, ds, ds_raw,
continue
# compute the morphology features from the segmentation mask
result = [float(label_id)] + morphology_row_features(seg_mask, scale_factor_seg)
result = (float(label_id),) + morphology_row_features(seg_mask, scale_factor_seg)
if ds_exclude is not None:
exclude = load_data(ds_exclude, row, scale_factor_exclude)
exclude_type = exclude.dtype
# resize to fit seg
if exclude.shape != seg_mask.shape:
exclude = exclude.astype('float32')
exclude = vigra.sampling.resize(exclude, shape=seg_mask.shape, order=0)
exclude = exclude.astype(exclude_type)
# compute the intensiry features from raw data and segmentation mask
# binary for correct nucleus
exclude = exclude == int(row.nucleus_id)
# remove nucleus area form seg_mask
seg_mask[exclude] = False
# compute the intensity features from raw data and segmentation mask
if ds_raw is not None:
raw = load_data(ds_raw, row, scale_factor_raw)
# resize the segmentation mask if it does not fit the raw data
seg_mask_type = seg_mask.dtype
if seg_mask.shape != raw.shape:
seg_mask = resize(seg_mask, raw.shape,
order=0, mode='reflect',
anti_aliasing=True, preserve_range=True).astype('bool')
seg_mask = seg_mask.astype('float32')
seg_mask = vigra.sampling.resize(seg_mask, shape=raw.shape, order=0)
seg_mask = seg_mask.astype(seg_mask_type)
result += intensity_row_features(raw, seg_mask)
# doesn't make sense to run the radial intensity if the nucleus area is being excluded, as the euclidean
# distance transform then gives distance from the outside & nuclear surface - hard to interpret
if ds_exclude is None:
result += radial_intensity_row_features(raw, seg_mask, scale_factor_raw)
result += texture_row_features(raw, seg_mask)
if ds_chromatin is not None:
chromatin = load_data(ds_chromatin, row, scale_factor_chromatin)
# set to 1 (heterochromatin), 2 (euchromatin)
heterochromatin = chromatin == label_id + 12000
euchromatin = chromatin == label_id
# skip if no chromatin segmentation
total_heterochromatin = heterochromatin.sum()
total_euchromatin = euchromatin.sum()
if total_heterochromatin == 0 and total_euchromatin.sum() == 0:
continue
# euclidean distance transform for whole nucleus, normalised to run from 0 to 1
whole_nucleus = np.logical_or(heterochromatin, euchromatin)
edt = distance_transform_edt(whole_nucleus, sampling=scale_factor_chromatin, return_distances=True)
edt = edt / np.max(edt)
if ds_raw is None:
raw = None
if total_heterochromatin != 0:
result += chromatin_row_features(heterochromatin, edt, raw, scale_factor_chromatin)
else:
result += (0.,) * 36
if total_euchromatin != 0:
result += chromatin_row_features(euchromatin, edt, raw, scale_factor_chromatin)
else:
result += (0.,) * 36
stats.append(result)
return stats
def compute_morphology_features(table, segmentation_path, raw_path,
seg_key, raw_key,
scale_factor_seg, scale_factor_raw,
label_starts, label_stops):
def morphology_impl_nucleus(nucleus_segmentation_path, raw_path, chromatin_path,
table,
min_size, max_size,
max_bb,
nucleus_resolution, chromatin_resolution,
nucleus_seg_scale, raw_scale, chromatin_scale,
label_starts, label_stops):
""" Compute morphology features for nucleus segmentation.
Can compute features for multiple label ranges. If you want to
compute features for the full label range, pass
'label_starts=[0]' and 'label_stops=[number_of_labels]'
Arguments:
nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5.
raw_path [str] - path to raw data stored as h5.
Pass 'None' if you don't want to compute features based on raw data.
chromatin_path [str] - path to chromatin segmentation data stored as h5.
table [pd.DataFrame] - table with default attributes
(sizes, center of mass and bounding boxes) for segmentation
min_size [int] - minimal size for objects used in calculation
max_size [int] - maximum size for objects used in calculation
max_bb [int] - maximum total volume of bounding box for objects used in calculation
nucleus_resolution [listlike] - resolution in nanometer.
Must be given in [Z, Y, X].
chromatin_resolution [listlike] - resolution in nanometer.
Must be given in [Z, Y, X].
nucleus_seg_scale [int] - scale level of the segmentation.
raw_scale [int] - scale level of the raw data
chromatin_scale [int] - scale level of the segmentation.
label_starts [listlike] - list with label start positions
label_stops [listlike] - list with label stop positions
"""
# keys for the different scales
nucleus_seg_key_full = 't00000/s00/0/cells'
nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale
raw_key_full = 't00000/s00/0/cells'
raw_key = 't00000/s00/%i/cells' % raw_scale
chromatin_key_full = 't00000/s00/0/cells'
chromatin_key = 't00000/s00/%i/cells' % chromatin_scale
# filter table
table = run_all_filters(table, min_size, max_size, max_bb, None, None)
if raw_path != '':
assert raw_key is not None and scale_factor_raw is not None
# get scale factors
if raw_path is not None:
log("Have raw path; compute intensity features")
# NOTE for now we can hard-code the resolution for the raw data here,
# but we might need to change this if we get additional dataset(s)
raw_resolution = [0.025, 0.01, 0.01]
scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
f_raw = h5py.File(raw_path, 'r')
ds_raw = f_raw[raw_key]
else:
f_raw = ds_raw = None
log("Don't have raw path; do not compute intensity features")
scale_factor_raw = f_raw = ds_raw = None
if chromatin_path is not None:
log("Have chromatin path; compute chromatin features")
scale_factor_chromatin = get_scale_factor(chromatin_path, chromatin_key_full, chromatin_key,
chromatin_resolution)
f_chromatin = h5py.File(chromatin_path, 'r')
ds_chromatin = f_chromatin[chromatin_key]
else:
log("Don't have chromatin path; do not compute chromatin features")
scale_factor_chromatin = f_chromatin = ds_chromatin = None
with h5py.File(segmentation_path, 'r') as f:
ds = f[seg_key]
log("Computing morphology features")
scale_factor_nucleus_seg = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
nucleus_resolution)
with h5py.File(nucleus_segmentation_path, 'r') as f:
ds = f[nucleus_seg_key]
stats = []
for label_a, label_b in zip(label_starts, label_stops):
log("Computing features from label-id %i to %i" % (label_a, label_b))
stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
scale_factor_seg, scale_factor_raw,
ds_chromatin,
None,
scale_factor_nucleus_seg, scale_factor_raw,
scale_factor_chromatin,
None,
label_a, label_b))
if f_raw is not None:
f_raw.close()
for var in f_raw, f_chromatin:
if var is not None:
var.close()
# convert to pandas table and add column names
stats = pd.DataFrame(stats)
columns = ['label_id',
'shape_volume_in_microns', 'shape_extent', 'shape_surface_area', 'shape_sphericity']
if raw_path != '':
columns += ['intensity_mean', 'intensity_st_dev']
stats.columns = columns
stats.columns = generate_column_names(raw_path, chromatin_path, None)
return stats
def morphology_impl(segmentation_path, raw_path, table, mapping_path,
min_size, max_size,
resolution, raw_scale, seg_scale,
label_starts, label_stops):
""" Compute morphology features for a segmentation.
Can compute features for multiple label ranges. If you want to
compute features for the full label range, pass
'label_starts=[0]' and 'label_stops=[number_of_labels]'
Arguments:
segmentation_path [str] - path to segmentation stored as h5.
raw_path [str] - path to raw data stored as h5.
Pass 'None' if you don't want to compute features based on raw data.
table [pd.DataFrame] - table with default attributes
(sizes, center of mass and bounding boxes) for segmentation
mapping_path [str] - path to - path to nucleus id mapping.
Pass 'None' if not relevant.
min_size [int] - minimal size for objects used in calcualtion
max_size [int] - maximum size for objects used in calcualtion
resolution [listlike] - resolution in nanometer.
Must be given in [Z, Y, X].
raw_scale [int] - scale level of the raw data
seg_scale [int] - scale level of the segmentation
label_starts [listlike] - list with label start positions
label_stops [listlike] - list with label stop positions
"""
# keys to segmentation and raw data for the different scales
seg_key_full = 't00000/s00/0/cells'
seg_key = 't00000/s00/%i/cells' % seg_scale
def morphology_impl_cell(cell_segmentation_path, raw_path,
nucleus_segmentation_path,
table, mapping_path,
region_mapping_path,
min_size, max_size,
max_bb,
cell_resolution, nucleus_resolution,
cell_seg_scale, raw_scale, nucleus_seg_scale,
label_starts, label_stops):
""" Compute morphology features for cell segmentation.
Can compute features for multiple label ranges. If you want to
compute features for the full label range, pass
'label_starts=[0]' and 'label_stops=[number_of_labels]'
Arguments:
cell_segmentation_path [str] - path to cell segmentation stored as h5.
raw_path [str] - path to raw data stored as h5.
Pass 'None' if you don't want to compute features based on raw data.
nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5.
Pass 'None' if you don't want to exclude the region covered by the nucleus from intensity &
texture calculations.
table [pd.DataFrame] - table with default attributes
(sizes, center of mass and bounding boxes) for segmentation
mapping_path [str] - path to - path to nucleus id mapping.
Pass 'None' if not relevant.
region_mapping_path [str] - path to - path to cellid to region mapping
Pass 'None' if not relevant
min_size [int] - minimal size for objects used in calculation
max_size [int] - maximum size for objects used in calculation
max_bb [int] - maximum total volume of bounding box for objects used in calculation
cell_resolution [listlike] - resolution in nanometer.
Must be given in [Z, Y, X].
nucleus_resolution [listlike] - resolution in nanometer.
Must be given in [Z, Y, X].
cell_seg_scale [int] - scale level of the segmentation
raw_scale [int] - scale level of the raw data
nucleus_seg_scale [int] - scale level of the segmentation.
label_starts [listlike] - list with label start positions
label_stops [listlike] - list with label stop positions
"""
# keys for the different scales
cell_seg_key_full = 't00000/s00/0/cells'
cell_seg_key = 't00000/s00/%i/cells' % cell_seg_scale
raw_key_full = 't00000/s00/0/cells'
raw_key = 't00000/s00/%i/cells' % raw_scale
nucleus_seg_key_full = 't00000/s00/0/cells'
nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale
# get scale factor for the segmentation
scale_factor_seg = get_scale_factor(segmentation_path, seg_key_full, seg_key, resolution)
# filter table
table = run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path)
# get scale factor for raw data (if it's given)
if raw_path != '':
# get scale factors
if raw_path is not None:
log("Have raw path; compute intensity features")
# NOTE for now we can hard-code the resolution for the raw data here,
# but we might need to change this if we get additional dataset(s)
raw_resolution = [0.025, 0.01, 0.01]
scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
f_raw = h5py.File(raw_path, 'r')
ds_raw = f_raw[raw_key]
else:
log("Don't have raw path; do not compute intensity features")
raw_resolution = scale_factor_raw = None
scale_factor_raw = f_raw = ds_raw = None
if nucleus_segmentation_path is not None:
log("Have nucleus path; exclude nucleus for intensity measures")
scale_factor_nucleus = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
nucleus_resolution)
f_nucleus = h5py.File(nucleus_segmentation_path, 'r')
ds_nucleus = f_nucleus[nucleus_seg_key]
else:
log("Don't have exclude path; don't exclude nucleus area for intensity measures")
scale_factor_nucleus = f_nucleus = ds_nucleus = None
# remove zero label if it exists
table = table.loc[table['label_id'] != 0, :]
log("Computing morphology features")
scale_factor_cell_seg = get_scale_factor(cell_segmentation_path, cell_seg_key_full, cell_seg_key, cell_resolution)
with h5py.File(cell_segmentation_path, 'r') as f:
ds = f[cell_seg_key]
# if we have a mappin, only keep objects in the mapping
# (i.e cells that have assigned nuclei)
if mapping_path != '':
log("Have mapping path %s" % mapping_path)
table = filter_table_from_mapping(table, mapping_path)
log("Number of labels after filter with mapping: %i" % table.shape[0])
# filter by size
table = filter_table(table, min_size, max_size)
log("Number of labels after size filte: %i" % table.shape[0])
stats = []
for label_a, label_b in zip(label_starts, label_stops):
log("Computing features from label-id %i to %i" % (label_a, label_b))
stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
None,
ds_nucleus,
scale_factor_cell_seg, scale_factor_raw,
None,
scale_factor_nucleus,
label_a, label_b))
for var in f_raw, f_nucleus:
if var is not None:
var.close()
# convert to pandas table and add column names
stats = pd.DataFrame(stats)
stats.columns = generate_column_names(raw_path, None, nucleus_segmentation_path)
log("Computing morphology features")
stats = compute_morphology_features(table, segmentation_path, raw_path,
seg_key, raw_key, scale_factor_seg, scale_factor_raw,
label_starts, label_stops)
return stats
Loading