from datetime import datetime # this is a task called by multiple processes, # so we need to restrict the number of threads used by numpy from elf.util import set_numpy_threads set_numpy_threads(1) import numpy as np import vigra import h5py import pandas as pd from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area from skimage.util import pad from scipy.ndimage.morphology import distance_transform_edt from mahotas.features import haralick from skimage.morphology import label, remove_small_objects def log(msg): print("%s: %s" % (str(datetime.now()), msg)) # get shape of full data & downsampling factor def get_scale_factor(path, key_full, key, resolution): with h5py.File(path, 'r') as f: full_shape = f[key_full].shape shape = f[key].shape # scale factor for downsampling scale_factor = [res * (fs / sh) for res, fs, sh in zip(resolution, full_shape, shape)] return scale_factor def filter_table(table, min_size, max_size): if max_size is None: table = table.loc[table['n_pixels'] >= min_size, :] else: criteria = np.logical_and(table['n_pixels'] > min_size, table['n_pixels'] < max_size) table = table.loc[criteria, :] return table # some cell segmentations have a sensible total pixel size but very large bounding boxes i.e. they are small spots # distributed over a large region, not one cell > filter for these cases by capping the bounding box size def filter_table_bb(table, max_bb): total_bb = (table['bb_max_z'] - table['bb_min_z']) * (table['bb_max_y'] - table['bb_min_y']) * ( table['bb_max_x'] - table['bb_min_x']) table = table.loc[total_bb < max_bb, :] return table def filter_table_from_mapping(table, mapping_path): # read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id mapping = pd.read_csv(mapping_path, sep='\t') # remove zero labels from this table too, if exist mapping = mapping.loc[np.logical_and(mapping.iloc[:, 0] != 0, mapping.iloc[:, 1] != 0), :] table = table.loc[np.isin(table['label_id'], mapping['label_id']), :] # add a column for the 'nucleus_id' of the mapped nucleus (use this later to exclude the area covered # by the nucleus) table = table.join(mapping.set_index('label_id'), on='label_id', how='left') return table # regions here are regions to exclude def filter_table_region(table, region_path, regions=('empty', 'yolk', 'neuropil', 'cuticle')): region_mapping = pd.read_csv(region_path, sep='\t') # remove zero label if it exists region_mapping = region_mapping.loc[region_mapping['label_id'] != 0, :] for region in regions: region_mapping = region_mapping.loc[region_mapping[region] == 0, :] table = table.loc[np.isin(table['label_id'], region_mapping['label_id']), :] return table def run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path): # remove zero label if present table = table.loc[table['label_id'] != 0, :] # filter to only keep cells with assigned nuclei if mapping_path is not None: log("Have mapping path %s" % mapping_path) table = filter_table_from_mapping(table, mapping_path) log("Number of labels after filter with mapping: %i" % table.shape[0]) # filter to exclude certain regions if region_mapping_path is not None: log("Have region mapping path %s" % region_mapping_path) table = filter_table_region(table, region_mapping_path) log("Number of labels after region filter: %i" % table.shape[0]) # filter by size of object (no. pixels) if min_size is not None or max_size is not None: table = filter_table(table, min_size, max_size) log("Number of labels after size filter: %i" % table.shape[0]) # filter by bounding box size if max_bb is not None: table = filter_table_bb(table, max_bb) log("Number of labels after bounding box size filter %i" % table.shape[0]) return table def load_data(ds, row, scale): # compute the bounding box from the row information mins = [row.bb_min_z, row.bb_min_y, row.bb_min_x] maxs = [row.bb_max_z, row.bb_max_y, row.bb_max_x] mins = [int(mi / sca) for mi, sca in zip(mins, scale)] maxs = [int(ma / sca) + 1 for ma, sca in zip(maxs, scale)] bb = tuple(slice(mi, ma) for mi, ma in zip(mins, maxs)) # load the data from the bounding box return ds[bb] def generate_column_names(raw_path, chromatin_path, exclude_path): columns = ['label_id'] morph_columns = ['shape_volume_in_microns', 'shape_extent', 'shape_equiv_diameter', 'shape_major_axis', 'shape_minor_axis', 'shape_surface_area', 'shape_sphericity', 'shape_max_radius'] columns += morph_columns if raw_path is not None: intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr', 'intensity_total'] texture_columns = ['texture_hara%s' % x for x in range(1, 14)] columns += intensity_columns if exclude_path is None: # radial intensity columns for val in [25, 50, 75, 100]: columns += ['%s_%s' % (var, val) for var in intensity_columns] columns += texture_columns if chromatin_path is not None: edt_columns = ['shape_edt_mean', 'shape_edt_stdev', 'shape_edt_median', 'shape_edt_iqr'] edt_columns += ['shape_percent_%s' % var for var in [25, 50, 75, 100]] for phase in ['_het', '_eu']: columns += [var + phase for var in morph_columns] columns += [var + phase for var in edt_columns] if raw_path is not None: columns += [var + phase for var in intensity_columns] columns += [var + phase for var in texture_columns] return columns def morphology_row_features(mask, scale): # Calculate stats from skimage ski_morph = regionprops(mask.astype('uint8')) volume_in_pix = ski_morph[0]['area'] volume_in_microns = np.prod(scale) * volume_in_pix extent = ski_morph[0]['extent'] equiv_diameter = ski_morph[0]['equivalent_diameter'] major_axis = ski_morph[0]['major_axis_length'] minor_axis = ski_morph[0]['minor_axis_length'] # The mesh calculation below fails if an edge of the segmentation is right up against the # edge of the volume - gives an open, rather than a closed surface # Pad by a few pixels to avoid this mask = pad(mask, 10, mode='constant') # surface area of mesh around object (other ways to calculate better?) verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale)) surface_area = mesh_surface_area(verts, faces) # sphericity (as in morpholibj) # Should run from zero to one sphericity = (36 * np.pi * (float(volume_in_microns) ** 2)) / (float(surface_area) ** 3) # max radius = max distance from pixel to outside edt = distance_transform_edt(mask, sampling=scale, return_distances=True) max_radius = np.max(edt) return (volume_in_microns, extent, equiv_diameter, major_axis, minor_axis, surface_area, sphericity, max_radius) def intensity_row_features(raw, mask): intensity_vals_in_mask = raw[mask] # mean and stdev - use float64 to avoid silent overflow errors mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64) st_dev = np.std(intensity_vals_in_mask, dtype=np.float64) median_intensity = np.median(intensity_vals_in_mask) quartile_75, quartile_25 = np.percentile(intensity_vals_in_mask, [75, 25]) interquartile_range_intensity = quartile_75 - quartile_25 total = np.sum(intensity_vals_in_mask, dtype=np.float64) return mean_intensity, st_dev, median_intensity, interquartile_range_intensity, total def radial_intensity_row_features(raw, mask, scale, stops=(0.0, 0.25, 0.5, 0.75, 1.0)): result = () edt = distance_transform_edt(mask, sampling=scale, return_distances=True) edt = edt / np.max(edt) bottoms = stops[0:len(stops) - 1] tops = stops[1:] radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)] for m in radial_masks: result += intensity_row_features(raw, m) return result def texture_row_features(raw, mask): # errors if there are small, isolated spots (because I'm using ignore zeros as true) # so here remove components that are < 10 pixels # may still error in some cases labelled = label(mask) if len(np.unique(labelled)) > 2: labelled = remove_small_objects(labelled, min_size=10) mask = labelled != 0 mask = mask.astype('uint8') # set regions outside mask to zero raw_copy = raw.copy() raw_copy[mask == 0] = 0 try: hara = haralick(raw_copy, ignore_zeros=True, return_mean=True, distance=2) except ValueError: log('Texture computation failed - can happen when using ignore_zeros') hara = (0.,) * 13 return tuple(hara) def radial_distribution(edt, mask, stops=(0.0, 0.25, 0.5, 0.75, 1.0)): result = () bottoms = stops[0:len(stops) - 1] tops = stops[1:] radial_masks = [np.logical_and(edt > b, edt <= t) for b, t in zip(bottoms, tops)] for m in radial_masks: # percent of that zone that is covered by the mask result += (np.sum(mask[m]) / np.sum(m),) return result def chromatin_row_features(chromatin, edt, raw, scale_chromatin): result = () result += morphology_row_features(chromatin, scale_chromatin) # edt stats i.e. stats on distribution of chromatin, dropping the total value result += intensity_row_features(edt, chromatin)[:-1] result += radial_distribution(edt, chromatin) if raw is not None: # resize the chromatin masks if not same size as raw chromatin_type = chromatin.dtype if chromatin.shape != raw.shape: chromatin = chromatin.astype('float32') chromatin = vigra.sampling.resize(chromatin, shape=raw.shape, order=0) chromatin = chromatin.astype(chromatin_type) result += intensity_row_features(raw, chromatin) result += texture_row_features(raw, chromatin) return result # compute morphology (and intensity features) for label range def morphology_features_for_label_range(table, ds, ds_raw, ds_chromatin, ds_exclude, scale_factor_seg, scale_factor_raw, scale_factor_chromatin, scale_factor_exclude, label_begin, label_end): label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end) sub_table = table.loc[label_range, :] stats = [] for row in sub_table.itertuples(index=False): log(str(row.label_id)) label_id = int(row.label_id) # load the segmentation data from the bounding box corresponding # to this row seg = load_data(ds, row, scale_factor_seg) # compute the segmentation mask and check that we have # foreground in the mask seg_mask = seg == label_id if seg_mask.sum() == 0: # if the seg mask is empty, we simply skip this label-id continue # compute the morphology features from the segmentation mask result = (float(label_id),) + morphology_row_features(seg_mask, scale_factor_seg) if ds_exclude is not None: exclude = load_data(ds_exclude, row, scale_factor_exclude) exclude_type = exclude.dtype # resize to fit seg if exclude.shape != seg_mask.shape: exclude = exclude.astype('float32') exclude = vigra.sampling.resize(exclude, shape=seg_mask.shape, order=0) exclude = exclude.astype(exclude_type) # binary for correct nucleus exclude = exclude == int(row.nucleus_id) # remove nucleus area form seg_mask seg_mask[exclude] = False # compute the intensity features from raw data and segmentation mask if ds_raw is not None: raw = load_data(ds_raw, row, scale_factor_raw) # resize the segmentation mask if it does not fit the raw data seg_mask_type = seg_mask.dtype if seg_mask.shape != raw.shape: seg_mask = seg_mask.astype('float32') seg_mask = vigra.sampling.resize(seg_mask, shape=raw.shape, order=0) seg_mask = seg_mask.astype(seg_mask_type) result += intensity_row_features(raw, seg_mask) # doesn't make sense to run the radial intensity if the nucleus area is being excluded, as the euclidean # distance transform then gives distance from the outside & nuclear surface - hard to interpret if ds_exclude is None: result += radial_intensity_row_features(raw, seg_mask, scale_factor_raw) result += texture_row_features(raw, seg_mask) if ds_chromatin is not None: chromatin = load_data(ds_chromatin, row, scale_factor_chromatin) # set to 1 (heterochromatin), 2 (euchromatin) heterochromatin = chromatin == label_id + 12000 euchromatin = chromatin == label_id # skip if no chromatin segmentation total_heterochromatin = heterochromatin.sum() total_euchromatin = euchromatin.sum() if total_heterochromatin == 0 and total_euchromatin.sum() == 0: continue # euclidean distance transform for whole nucleus, normalised to run from 0 to 1 whole_nucleus = np.logical_or(heterochromatin, euchromatin) edt = distance_transform_edt(whole_nucleus, sampling=scale_factor_chromatin, return_distances=True) edt = edt / np.max(edt) if ds_raw is None: raw = None if total_heterochromatin != 0: result += chromatin_row_features(heterochromatin, edt, raw, scale_factor_chromatin) else: result += (0.,) * 36 if total_euchromatin != 0: result += chromatin_row_features(euchromatin, edt, raw, scale_factor_chromatin) else: result += (0.,) * 36 stats.append(result) return stats def morphology_impl_nucleus(nucleus_segmentation_path, raw_path, chromatin_path, table, min_size, max_size, max_bb, nucleus_resolution, chromatin_resolution, nucleus_seg_scale, raw_scale, chromatin_scale, label_starts, label_stops): """ Compute morphology features for nucleus segmentation. Can compute features for multiple label ranges. If you want to compute features for the full label range, pass 'label_starts=[0]' and 'label_stops=[number_of_labels]' Arguments: nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5. raw_path [str] - path to raw data stored as h5. Pass 'None' if you don't want to compute features based on raw data. chromatin_path [str] - path to chromatin segmentation data stored as h5. table [pd.DataFrame] - table with default attributes (sizes, center of mass and bounding boxes) for segmentation min_size [int] - minimal size for objects used in calculation max_size [int] - maximum size for objects used in calculation max_bb [int] - maximum total volume of bounding box for objects used in calculation nucleus_resolution [listlike] - resolution in nanometer. Must be given in [Z, Y, X]. chromatin_resolution [listlike] - resolution in nanometer. Must be given in [Z, Y, X]. nucleus_seg_scale [int] - scale level of the segmentation. raw_scale [int] - scale level of the raw data chromatin_scale [int] - scale level of the segmentation. label_starts [listlike] - list with label start positions label_stops [listlike] - list with label stop positions """ # keys for the different scales nucleus_seg_key_full = 't00000/s00/0/cells' nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale raw_key_full = 't00000/s00/0/cells' raw_key = 't00000/s00/%i/cells' % raw_scale chromatin_key_full = 't00000/s00/0/cells' chromatin_key = 't00000/s00/%i/cells' % chromatin_scale # filter table table = run_all_filters(table, min_size, max_size, max_bb, None, None) # get scale factors if raw_path is not None: log("Have raw path; compute intensity features") # NOTE for now we can hard-code the resolution for the raw data here, # but we might need to change this if we get additional dataset(s) raw_resolution = [0.025, 0.01, 0.01] scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution) f_raw = h5py.File(raw_path, 'r') ds_raw = f_raw[raw_key] else: log("Don't have raw path; do not compute intensity features") scale_factor_raw = f_raw = ds_raw = None if chromatin_path is not None: log("Have chromatin path; compute chromatin features") scale_factor_chromatin = get_scale_factor(chromatin_path, chromatin_key_full, chromatin_key, chromatin_resolution) f_chromatin = h5py.File(chromatin_path, 'r') ds_chromatin = f_chromatin[chromatin_key] else: log("Don't have chromatin path; do not compute chromatin features") scale_factor_chromatin = f_chromatin = ds_chromatin = None log("Computing morphology features") scale_factor_nucleus_seg = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key, nucleus_resolution) with h5py.File(nucleus_segmentation_path, 'r') as f: ds = f[nucleus_seg_key] stats = [] for label_a, label_b in zip(label_starts, label_stops): log("Computing features from label-id %i to %i" % (label_a, label_b)) stats.extend(morphology_features_for_label_range(table, ds, ds_raw, ds_chromatin, None, scale_factor_nucleus_seg, scale_factor_raw, scale_factor_chromatin, None, label_a, label_b)) for var in f_raw, f_chromatin: if var is not None: var.close() # convert to pandas table and add column names stats = pd.DataFrame(stats) stats.columns = generate_column_names(raw_path, chromatin_path, None) return stats def morphology_impl_cell(cell_segmentation_path, raw_path, nucleus_segmentation_path, table, mapping_path, region_mapping_path, min_size, max_size, max_bb, cell_resolution, nucleus_resolution, cell_seg_scale, raw_scale, nucleus_seg_scale, label_starts, label_stops): """ Compute morphology features for cell segmentation. Can compute features for multiple label ranges. If you want to compute features for the full label range, pass 'label_starts=[0]' and 'label_stops=[number_of_labels]' Arguments: cell_segmentation_path [str] - path to cell segmentation stored as h5. raw_path [str] - path to raw data stored as h5. Pass 'None' if you don't want to compute features based on raw data. nucleus_segmentation_path [str] - path to nucleus segmentation data stored as h5. Pass 'None' if you don't want to exclude the region covered by the nucleus from intensity & texture calculations. table [pd.DataFrame] - table with default attributes (sizes, center of mass and bounding boxes) for segmentation mapping_path [str] - path to - path to nucleus id mapping. Pass 'None' if not relevant. region_mapping_path [str] - path to - path to cellid to region mapping Pass 'None' if not relevant min_size [int] - minimal size for objects used in calculation max_size [int] - maximum size for objects used in calculation max_bb [int] - maximum total volume of bounding box for objects used in calculation cell_resolution [listlike] - resolution in nanometer. Must be given in [Z, Y, X]. nucleus_resolution [listlike] - resolution in nanometer. Must be given in [Z, Y, X]. cell_seg_scale [int] - scale level of the segmentation raw_scale [int] - scale level of the raw data nucleus_seg_scale [int] - scale level of the segmentation. label_starts [listlike] - list with label start positions label_stops [listlike] - list with label stop positions """ # keys for the different scales cell_seg_key_full = 't00000/s00/0/cells' cell_seg_key = 't00000/s00/%i/cells' % cell_seg_scale raw_key_full = 't00000/s00/0/cells' raw_key = 't00000/s00/%i/cells' % raw_scale nucleus_seg_key_full = 't00000/s00/0/cells' nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale # filter table table = run_all_filters(table, min_size, max_size, max_bb, mapping_path, region_mapping_path) # get scale factors if raw_path is not None: log("Have raw path; compute intensity features") # NOTE for now we can hard-code the resolution for the raw data here, # but we might need to change this if we get additional dataset(s) raw_resolution = [0.025, 0.01, 0.01] scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution) f_raw = h5py.File(raw_path, 'r') ds_raw = f_raw[raw_key] else: log("Don't have raw path; do not compute intensity features") scale_factor_raw = f_raw = ds_raw = None if nucleus_segmentation_path is not None: log("Have nucleus path; exclude nucleus for intensity measures") scale_factor_nucleus = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key, nucleus_resolution) f_nucleus = h5py.File(nucleus_segmentation_path, 'r') ds_nucleus = f_nucleus[nucleus_seg_key] else: log("Don't have exclude path; don't exclude nucleus area for intensity measures") scale_factor_nucleus = f_nucleus = ds_nucleus = None log("Computing morphology features") scale_factor_cell_seg = get_scale_factor(cell_segmentation_path, cell_seg_key_full, cell_seg_key, cell_resolution) with h5py.File(cell_segmentation_path, 'r') as f: ds = f[cell_seg_key] stats = [] for label_a, label_b in zip(label_starts, label_stops): log("Computing features from label-id %i to %i" % (label_a, label_b)) stats.extend(morphology_features_for_label_range(table, ds, ds_raw, None, ds_nucleus, scale_factor_cell_seg, scale_factor_raw, None, scale_factor_nucleus, label_a, label_b)) for var in f_raw, f_nucleus: if var is not None: var.close() # convert to pandas table and add column names stats = pd.DataFrame(stats) stats.columns = generate_column_names(raw_path, None, nucleus_segmentation_path) return stats if __name__ == '__main__': pass