diff --git a/.gitignore b/.gitignore index e5bf8f03e82af0f1a632deda1492ff3a08a52d52..271a141778e84ab904e8074b63320776d3b45e75 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,11 @@ *.h5 __pycache__/ tmp* +.idea/* *.DS_Store data/dev-* *.tif *.swp software *.zip -*.n5 +*.n5 \ No newline at end of file diff --git a/scripts/extension/attributes/morphology_impl.py b/scripts/extension/attributes/morphology_impl.py index 0a4bc1ccb9d51384eb7eab63ca66aedec258027a..3cf094733fc88c8aca6db54b8b6c783098f51e7a 100644 --- a/scripts/extension/attributes/morphology_impl.py +++ b/scripts/extension/attributes/morphology_impl.py @@ -4,13 +4,17 @@ from datetime import datetime # so we need to restrict the number of threads used by numpy from elf.util import set_numpy_threads set_numpy_threads(1) + import numpy as np +import vigra import h5py import pandas as pd from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area -from skimage.transform import resize from skimage.util import pad +from scipy.ndimage.morphology import distance_transform_edt +from mahotas.features import haralick +from skimage.morphology import label, remove_small_objects def log(msg): @@ -40,13 +44,74 @@ def filter_table(table, min_size, max_size): return table +# some cell segmentations have a sensible total pixel size but very large bounding boxes i.e. they are small spots +# distributed over a large region, not one cell > filter for these cases by capping the bounding box size +def filter_table_bb(table, max_bb): + total_bb = (table['bb_max_z'] - table['bb_min_z']) * (table['bb_max_y'] - table['bb_min_y']) * ( + table['bb_max_x'] - table['bb_min_x']) + + table = table.loc[total_bb < max_bb, :] + + return table + + def filter_table_from_mapping(table, mapping_path): # read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id - mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64') + mapping = pd.read_csv(mapping_path, sep='\t') + # remove zero labels from this table too, if exist - mapping = mapping[np.logical_and(mapping[:, 0] != 0, - mapping[:, 1] != 0)] - table = table.loc[np.isin(table['label_id'], mapping[:, 0]), :] + mapping = mapping.loc[np.logical_and(mapping.iloc[:, 0] != 0, + mapping.iloc[:, 1] != 0), :] + table = table.loc[np.isin(table['label_id'], mapping['label_id']), :] + + # add a column for the 'nucleus_id' of the mapped nucleus (use this later to exclude the area covered + # by the nucleus) + table = table.join(mapping.set_index('label_id'), on='label_id', how='left') + + return table + + +# regions here are regions to exclude +def filter_table_region(table, region_path, regions=('empty', 'yolk', 'neuropil', 'cuticle')): + region_mapping = pd.read_csv(region_path, sep='\t') + + # remove zero label if it exists + region_mapping = region_mapping.loc[region_mapping['label_id'] != 0, :] + + for region in regions: + region_mapping = region_mapping.loc[region_mapping[region] == 0, :] + + table = table.loc[np.isin(table['label_id'], region_mapping['label_id']), :] + + return table + + +def run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path): + # remove zero label if present + table = table.loc[table['label_id'] != 0, :] + + # filter to only keep cells with assigned nuclei + if mapping_path is not None: + log("Have mapping path %s" % mapping_path) + table = filter_table_from_mapping(table, mapping_path) + log("Number of labels after filter with mapping: %i" % table.shape[0]) + + # filter to exclude certain regions + if region_mapping_path is not None: + log("Have region mapping path %s" % region_mapping_path) + table = filter_table_region(table, region_mapping_path) + log("Number of labels after region filter: %i" % table.shape[0]) + + # filter by size of object (no. pixels) + if min_size is not None or max_size is not None: + table = filter_table(table, min_size, max_size) + log("Number of labels after size filter: %i" % table.shape[0]) + + # filter by bounding box size + if max_bb is not None: + table = filter_table_bb(table, max_bb) + log("Number of labels after bounding box size filter %i" % table.shape[0]) + return table @@ -61,16 +126,53 @@ def load_data(ds, row, scale): return ds[bb] -def morphology_row_features(mask, scale): +def generate_column_names(raw_path, chromatin_path, exclude_path): + columns = ['label_id'] + morph_columns = ['shape_volume_in_microns', 'shape_extent', 'shape_equiv_diameter', + 'shape_major_axis', 'shape_minor_axis', 'shape_surface_area', 'shape_sphericity', + 'shape_max_radius'] + + columns += morph_columns + + if raw_path is not None: + intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr', + 'intensity_total'] + texture_columns = ['texture_hara%s' % x for x in range(1, 14)] + + columns += intensity_columns + + if exclude_path is None: + # radial intensity columns + for val in [25, 50, 75, 100]: + columns += ['%s_%s' % (var, val) for var in intensity_columns] + + columns += texture_columns + + if chromatin_path is not None: + edt_columns = ['shape_edt_mean', 'shape_edt_stdev', 'shape_edt_median', 'shape_edt_iqr'] + edt_columns += ['shape_percent_%s' % var for var in [25, 50, 75, 100]] + for phase in ['_het', '_eu']: + columns += [var + phase for var in morph_columns] + columns += [var + phase for var in edt_columns] + + if raw_path is not None: + columns += [var + phase for var in intensity_columns] + columns += [var + phase for var in texture_columns] + + return columns + + +def morphology_row_features(mask, scale): # Calculate stats from skimage ski_morph = regionprops(mask.astype('uint8')) - # volume in pixels volume_in_pix = ski_morph[0]['area'] - - # extent + volume_in_microns = np.prod(scale) * volume_in_pix extent = ski_morph[0]['extent'] + equiv_diameter = ski_morph[0]['equivalent_diameter'] + major_axis = ski_morph[0]['major_axis_length'] + minor_axis = ski_morph[0]['minor_axis_length'] # The mesh calculation below fails if an edge of the segmentation is right up against the # edge of the volume - gives an open, rather than a closed surface @@ -81,14 +183,16 @@ def morphology_row_features(mask, scale): verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale)) surface_area = mesh_surface_area(verts, faces) - # volume in microns - volume_in_microns = np.prod(scale)*volume_in_pix - # sphericity (as in morpholibj) # Should run from zero to one - sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3) + sphericity = (36 * np.pi * (float(volume_in_microns) ** 2)) / (float(surface_area) ** 3) - return [volume_in_microns, extent, surface_area, sphericity] + # max radius = max distance from pixel to outside + edt = distance_transform_edt(mask, sampling=scale, return_distances=True) + max_radius = np.max(edt) + + return (volume_in_microns, extent, equiv_diameter, major_axis, + minor_axis, surface_area, sphericity, max_radius) def intensity_row_features(raw, mask): @@ -96,17 +200,108 @@ def intensity_row_features(raw, mask): # mean and stdev - use float64 to avoid silent overflow errors mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64) st_dev = np.std(intensity_vals_in_mask, dtype=np.float64) - return mean_intensity, st_dev + median_intensity = np.median(intensity_vals_in_mask) + + quartile_75, quartile_25 = np.percentile(intensity_vals_in_mask, [75, 25]) + interquartile_range_intensity = quartile_75 - quartile_25 + + total = np.sum(intensity_vals_in_mask, dtype=np.float64) + + return mean_intensity, st_dev, median_intensity, interquartile_range_intensity, total + + +def radial_intensity_row_features(raw, mask, scale, stops=(0.0, 0.25, 0.5, 0.75, 1.0)): + result = () + + edt = distance_transform_edt(mask, sampling=scale, return_distances=True) + edt = edt / np.max(edt) + + bottoms = stops[0:len(stops) - 1] + tops = stops[1:] + + radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)] + + for m in radial_masks: + result += intensity_row_features(raw, m) + + return result + + +def texture_row_features(raw, mask): + # errors if there are small, isolated spots (because I'm using ignore zeros as true) + # so here remove components that are < 10 pixels + # may still error in some cases + labelled = label(mask) + if len(np.unique(labelled)) > 2: + labelled = remove_small_objects(labelled, min_size=10) + mask = labelled != 0 + mask = mask.astype('uint8') + + # set regions outside mask to zero + raw_copy = raw.copy() + raw_copy[mask == 0] = 0 + + try: + hara = haralick(raw_copy, ignore_zeros=True, return_mean=True, distance=2) + + except ValueError: + log('Texture computation failed - can happen when using ignore_zeros') + hara = (0.,) * 13 + + return tuple(hara) + + +def radial_distribution(edt, mask, stops=(0.0, 0.25, 0.5, 0.75, 1.0)): + result = () + + bottoms = stops[0:len(stops) - 1] + tops = stops[1:] + + radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)] + + for m in radial_masks: + # percent of that zone that is covered by the mask + result += (np.sum(mask[m]) / np.sum(m),) + + return result + + +def chromatin_row_features(chromatin, edt, raw, scale_chromatin): + result = () + + result += morphology_row_features(chromatin, scale_chromatin) + + # edt stats i.e. stats on distribution of chromatin, dropping the total value + result += intensity_row_features(edt, chromatin)[:-1] + result += radial_distribution(edt, chromatin) + + if raw is not None: + # resize the chromatin masks if not same size as raw + chromatin_type = chromatin.dtype + if chromatin.shape != raw.shape: + chromatin = chromatin.astype('float32') + chromatin = vigra.sampling.resize(chromatin, shape=raw.shape, order=0) + chromatin = chromatin.astype(chromatin_type) + + result += intensity_row_features(raw, chromatin) + result += texture_row_features(raw, chromatin) + + return result # compute morphology (and intensity features) for label range def morphology_features_for_label_range(table, ds, ds_raw, + ds_chromatin, + ds_exclude, scale_factor_seg, scale_factor_raw, + scale_factor_chromatin, + scale_factor_exclude, label_begin, label_end): label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end) sub_table = table.loc[label_range, :] stats = [] for row in sub_table.itertuples(index=False): + log(str(row.label_id)) label_id = int(row.label_id) # load the segmentation data from the bounding box corresponding @@ -121,120 +316,274 @@ def morphology_features_for_label_range(table, ds, ds_raw, continue # compute the morphology features from the segmentation mask - result = [float(label_id)] + morphology_row_features(seg_mask, scale_factor_seg) + result = (float(label_id),) + morphology_row_features(seg_mask, scale_factor_seg) + + if ds_exclude is not None: + exclude = load_data(ds_exclude, row, scale_factor_exclude) + exclude_type = exclude.dtype + + # resize to fit seg + if exclude.shape != seg_mask.shape: + exclude = exclude.astype('float32') + exclude = vigra.sampling.resize(exclude, shape=seg_mask.shape, order=0) + exclude = exclude.astype(exclude_type) - # compute the intensiry features from raw data and segmentation mask + # binary for correct nucleus + exclude = exclude == int(row.nucleus_id) + + # remove nucleus area form seg_mask + seg_mask[exclude] = False + + # compute the intensity features from raw data and segmentation mask if ds_raw is not None: raw = load_data(ds_raw, row, scale_factor_raw) + # resize the segmentation mask if it does not fit the raw data + seg_mask_type = seg_mask.dtype + if seg_mask.shape != raw.shape: - seg_mask = resize(seg_mask, raw.shape, - order=0, mode='reflect', - anti_aliasing=True, preserve_range=True).astype('bool') + seg_mask = seg_mask.astype('float32') + seg_mask = vigra.sampling.resize(seg_mask, shape=raw.shape, order=0) + seg_mask = seg_mask.astype(seg_mask_type) + result += intensity_row_features(raw, seg_mask) + # doesn't make sense to run the radial intensity if the nucleus area is being excluded, as the euclidean + # distance transform then gives distance from the outside & nuclear surface - hard to interpret + if ds_exclude is None: + result += radial_intensity_row_features(raw, seg_mask, scale_factor_raw) + result += texture_row_features(raw, seg_mask) + + if ds_chromatin is not None: + chromatin = load_data(ds_chromatin, row, scale_factor_chromatin) + + # set to 1 (heterochromatin), 2 (euchromatin) + heterochromatin = chromatin == label_id + 12000 + euchromatin = chromatin == label_id + + # skip if no chromatin segmentation + total_heterochromatin = heterochromatin.sum() + total_euchromatin = euchromatin.sum() + if total_heterochromatin == 0 and total_euchromatin.sum() == 0: + continue + + # euclidean distance transform for whole nucleus, normalised to run from 0 to 1 + whole_nucleus = np.logical_or(heterochromatin, euchromatin) + edt = distance_transform_edt(whole_nucleus, sampling=scale_factor_chromatin, return_distances=True) + edt = edt / np.max(edt) + + if ds_raw is None: + raw = None + + if total_heterochromatin != 0: + result += chromatin_row_features(heterochromatin, edt, raw, scale_factor_chromatin) + else: + result += (0.,) * 36 + + if total_euchromatin != 0: + result += chromatin_row_features(euchromatin, edt, raw, scale_factor_chromatin) + else: + result += (0.,) * 36 + stats.append(result) return stats -def compute_morphology_features(table, segmentation_path, raw_path, - seg_key, raw_key, - scale_factor_seg, scale_factor_raw, - label_starts, label_stops): +def morphology_impl_nucleus(nucleus_segmentation_path, raw_path, chromatin_path, + table, + min_size, max_size, + max_bb, + nucleus_resolution, chromatin_resolution, + nucleus_seg_scale, raw_scale, chromatin_scale, + label_starts, label_stops): + """ Compute morphology features for nucleus segmentation. + + Can compute features for multiple label ranges. If you want to + compute features for the full label range, pass + 'label_starts=[0]' and 'label_stops=[number_of_labels]' + + Arguments: + nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5. + raw_path [str] - path to raw data stored as h5. + Pass 'None' if you don't want to compute features based on raw data. + chromatin_path [str] - path to chromatin segmentation data stored as h5. + table [pd.DataFrame] - table with default attributes + (sizes, center of mass and bounding boxes) for segmentation + min_size [int] - minimal size for objects used in calculation + max_size [int] - maximum size for objects used in calculation + max_bb [int] - maximum total volume of bounding box for objects used in calculation + nucleus_resolution [listlike] - resolution in nanometer. + Must be given in [Z, Y, X]. + chromatin_resolution [listlike] - resolution in nanometer. + Must be given in [Z, Y, X]. + nucleus_seg_scale [int] - scale level of the segmentation. + raw_scale [int] - scale level of the raw data + chromatin_scale [int] - scale level of the segmentation. + label_starts [listlike] - list with label start positions + label_stops [listlike] - list with label stop positions + """ + + # keys for the different scales + nucleus_seg_key_full = 't00000/s00/0/cells' + nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale + raw_key_full = 't00000/s00/0/cells' + raw_key = 't00000/s00/%i/cells' % raw_scale + chromatin_key_full = 't00000/s00/0/cells' + chromatin_key = 't00000/s00/%i/cells' % chromatin_scale + + # filter table + table = run_all_filters(table, min_size, max_size, max_bb, None, None) - if raw_path != '': - assert raw_key is not None and scale_factor_raw is not None + # get scale factors + if raw_path is not None: + log("Have raw path; compute intensity features") + # NOTE for now we can hard-code the resolution for the raw data here, + # but we might need to change this if we get additional dataset(s) + raw_resolution = [0.025, 0.01, 0.01] + scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution) f_raw = h5py.File(raw_path, 'r') ds_raw = f_raw[raw_key] else: - f_raw = ds_raw = None + log("Don't have raw path; do not compute intensity features") + scale_factor_raw = f_raw = ds_raw = None + + if chromatin_path is not None: + log("Have chromatin path; compute chromatin features") + scale_factor_chromatin = get_scale_factor(chromatin_path, chromatin_key_full, chromatin_key, + chromatin_resolution) + f_chromatin = h5py.File(chromatin_path, 'r') + ds_chromatin = f_chromatin[chromatin_key] + else: + log("Don't have chromatin path; do not compute chromatin features") + scale_factor_chromatin = f_chromatin = ds_chromatin = None - with h5py.File(segmentation_path, 'r') as f: - ds = f[seg_key] + log("Computing morphology features") + scale_factor_nucleus_seg = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key, + nucleus_resolution) + with h5py.File(nucleus_segmentation_path, 'r') as f: + ds = f[nucleus_seg_key] stats = [] for label_a, label_b in zip(label_starts, label_stops): log("Computing features from label-id %i to %i" % (label_a, label_b)) stats.extend(morphology_features_for_label_range(table, ds, ds_raw, - scale_factor_seg, scale_factor_raw, + ds_chromatin, + None, + scale_factor_nucleus_seg, scale_factor_raw, + scale_factor_chromatin, + None, label_a, label_b)) - if f_raw is not None: - f_raw.close() + + for var in f_raw, f_chromatin: + if var is not None: + var.close() # convert to pandas table and add column names stats = pd.DataFrame(stats) - columns = ['label_id', - 'shape_volume_in_microns', 'shape_extent', 'shape_surface_area', 'shape_sphericity'] - if raw_path != '': - columns += ['intensity_mean', 'intensity_st_dev'] - stats.columns = columns + stats.columns = generate_column_names(raw_path, chromatin_path, None) + return stats -def morphology_impl(segmentation_path, raw_path, table, mapping_path, - min_size, max_size, - resolution, raw_scale, seg_scale, - label_starts, label_stops): - """ Compute morphology features for a segmentation. - - Can compute features for multiple label ranges. If you want to - compute features for the full label range, pass - 'label_starts=[0]' and 'label_stops=[number_of_labels]' - - Arguments: - segmentation_path [str] - path to segmentation stored as h5. - raw_path [str] - path to raw data stored as h5. - Pass 'None' if you don't want to compute features based on raw data. - table [pd.DataFrame] - table with default attributes - (sizes, center of mass and bounding boxes) for segmentation - mapping_path [str] - path to - path to nucleus id mapping. - Pass 'None' if not relevant. - min_size [int] - minimal size for objects used in calcualtion - max_size [int] - maximum size for objects used in calcualtion - resolution [listlike] - resolution in nanometer. - Must be given in [Z, Y, X]. - raw_scale [int] - scale level of the raw data - seg_scale [int] - scale level of the segmentation - label_starts [listlike] - list with label start positions - label_stops [listlike] - list with label stop positions - """ - - # keys to segmentation and raw data for the different scales - seg_key_full = 't00000/s00/0/cells' - seg_key = 't00000/s00/%i/cells' % seg_scale +def morphology_impl_cell(cell_segmentation_path, raw_path, + nucleus_segmentation_path, + table, mapping_path, + region_mapping_path, + min_size, max_size, + max_bb, + cell_resolution, nucleus_resolution, + cell_seg_scale, raw_scale, nucleus_seg_scale, + label_starts, label_stops): + """ Compute morphology features for cell segmentation. + + Can compute features for multiple label ranges. If you want to + compute features for the full label range, pass + 'label_starts=[0]' and 'label_stops=[number_of_labels]' + + Arguments: + cell_segmentation_path [str] - path to cell segmentation stored as h5. + raw_path [str] - path to raw data stored as h5. + Pass 'None' if you don't want to compute features based on raw data. + nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5. + Pass 'None' if you don't want to exclude the region covered by the nucleus from intensity & + texture calculations. + table [pd.DataFrame] - table with default attributes + (sizes, center of mass and bounding boxes) for segmentation + mapping_path [str] - path to - path to nucleus id mapping. + Pass 'None' if not relevant. + region_mapping_path [str] - path to - path to cellid to region mapping + Pass 'None' if not relevant + min_size [int] - minimal size for objects used in calculation + max_size [int] - maximum size for objects used in calculation + max_bb [int] - maximum total volume of bounding box for objects used in calculation + cell_resolution [listlike] - resolution in nanometer. + Must be given in [Z, Y, X]. + nucleus_resolution [listlike] - resolution in nanometer. + Must be given in [Z, Y, X]. + cell_seg_scale [int] - scale level of the segmentation + raw_scale [int] - scale level of the raw data + nucleus_seg_scale [int] - scale level of the segmentation. + label_starts [listlike] - list with label start positions + label_stops [listlike] - list with label stop positions + """ + + # keys for the different scales + cell_seg_key_full = 't00000/s00/0/cells' + cell_seg_key = 't00000/s00/%i/cells' % cell_seg_scale raw_key_full = 't00000/s00/0/cells' raw_key = 't00000/s00/%i/cells' % raw_scale + nucleus_seg_key_full = 't00000/s00/0/cells' + nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale - # get scale factor for the segmentation - scale_factor_seg = get_scale_factor(segmentation_path, seg_key_full, seg_key, resolution) + # filter table + table = run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path) - # get scale factor for raw data (if it's given) - if raw_path != '': + # get scale factors + if raw_path is not None: log("Have raw path; compute intensity features") # NOTE for now we can hard-code the resolution for the raw data here, # but we might need to change this if we get additional dataset(s) raw_resolution = [0.025, 0.01, 0.01] scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution) + f_raw = h5py.File(raw_path, 'r') + ds_raw = f_raw[raw_key] else: log("Don't have raw path; do not compute intensity features") - raw_resolution = scale_factor_raw = None + scale_factor_raw = f_raw = ds_raw = None + + if nucleus_segmentation_path is not None: + log("Have nucleus path; exclude nucleus for intensity measures") + scale_factor_nucleus = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key, + nucleus_resolution) + f_nucleus = h5py.File(nucleus_segmentation_path, 'r') + ds_nucleus = f_nucleus[nucleus_seg_key] + else: + log("Don't have exclude path; don't exclude nucleus area for intensity measures") + scale_factor_nucleus = f_nucleus = ds_nucleus = None - # remove zero label if it exists - table = table.loc[table['label_id'] != 0, :] + log("Computing morphology features") + scale_factor_cell_seg = get_scale_factor(cell_segmentation_path, cell_seg_key_full, cell_seg_key, cell_resolution) + with h5py.File(cell_segmentation_path, 'r') as f: + ds = f[cell_seg_key] - # if we have a mappin, only keep objects in the mapping - # (i.e cells that have assigned nuclei) - if mapping_path != '': - log("Have mapping path %s" % mapping_path) - table = filter_table_from_mapping(table, mapping_path) - log("Number of labels after filter with mapping: %i" % table.shape[0]) - # filter by size - table = filter_table(table, min_size, max_size) - log("Number of labels after size filte: %i" % table.shape[0]) + stats = [] + for label_a, label_b in zip(label_starts, label_stops): + log("Computing features from label-id %i to %i" % (label_a, label_b)) + stats.extend(morphology_features_for_label_range(table, ds, ds_raw, + None, + ds_nucleus, + scale_factor_cell_seg, scale_factor_raw, + None, + scale_factor_nucleus, + label_a, label_b)) + + for var in f_raw, f_nucleus: + if var is not None: + var.close() + + # convert to pandas table and add column names + stats = pd.DataFrame(stats) + stats.columns = generate_column_names(raw_path, None, nucleus_segmentation_path) - log("Computing morphology features") - stats = compute_morphology_features(table, segmentation_path, raw_path, - seg_key, raw_key, scale_factor_seg, scale_factor_raw, - label_starts, label_stops) return stats