From 8c04dd4448c8421c48ebb7a530c2bbe41af977c9 Mon Sep 17 00:00:00 2001 From: Kimberly Meechan <kimberly.meechan@embl.de> Date: Wed, 17 Jul 2019 10:52:45 +0200 Subject: [PATCH] More cell features added - rough --- .../extension/attributes/morphology_impl.py | 181 ++++++++++++++---- 1 file changed, 145 insertions(+), 36 deletions(-) diff --git a/scripts/extension/attributes/morphology_impl.py b/scripts/extension/attributes/morphology_impl.py index c21566d..496042f 100644 --- a/scripts/extension/attributes/morphology_impl.py +++ b/scripts/extension/attributes/morphology_impl.py @@ -1,10 +1,13 @@ from datetime import datetime -#TODO - uncomment this part +# TODO - uncomment this part # this is a task called by multiple processes, # so we need to restrict the number of threads used by numpy -#from cluster_tools.utils.numpy_utils import set_numpy_threads -#set_numpy_threads(1) +# from cluster_tools.utils.numpy_utils import set_numpy_threads +# set_numpy_threads(1) +from tmp.numpy_utils import set_numpy_threads + +set_numpy_threads(1) import numpy as np import h5py @@ -44,16 +47,46 @@ def filter_table(table, min_size, max_size): table = table.loc[criteria, :] return table +# some cell segmentations have a sensible total pixel size but very large bounding boxes i.e. they are small spots +# distributed over a large region, not one cell > filter for these cases by capping the bounding box size +def filter_table_bb(table, max_bb): + + total_bb = (table['bb_max_z'] - table['bb_min_z']) * (table['bb_max_y'] - table['bb_min_y']) * ( + table['bb_max_x'] - table['bb_min_x']) + + table = table.loc[total_bb < max_bb, :] + + return table def filter_table_from_mapping(table, mapping_path): # read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id - mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64') + mapping = pd.read_csv(mapping_path, sep='\t') + + # mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64') # remove zero labels from this table too, if exist - mapping = mapping[np.logical_and(mapping[:, 0] != 0, - mapping[:, 1] != 0)] - table = table.loc[np.isin(table['label_id'], mapping[:, 0]), :] + mapping = mapping.loc[np.logical_and(mapping.iloc[:, 0] != 0, + mapping.iloc[:, 1] != 0), :] + table = table.loc[np.isin(table['label_id'], mapping['label_id']), :] + + # add a column for the id of the mapped nucleus (use this later to exclude the area covered by the nucleus) + table = table.join(mapping.set_index('label_id'), on='label_id', how='left') + + return table + + +def filter_table_region(table, region_path, regions=('empty', 'yolk', 'neuropil', 'cuticle')): + # regions here are regions to exclude + + region_mapping = pd.read_csv(region_path, sep='\t') + + # remove zero label if it exists + region_mapping = region_mapping.loc[region_mapping['label_id'] != 0, :] + + for region in regions: + region_mapping = region_mapping.loc[region_mapping[region] == 0, :] + + table = table.loc[np.isin(table['label_id'], region_mapping['label_id']), :] - # TODO - ADD a column for the nucleus id from the mapping return table @@ -69,7 +102,6 @@ def load_data(ds, row, scale): def morphology_row_features(mask, scale): - # Calculate stats from skimage ski_morph = regionprops(mask.astype('uint8')) @@ -82,8 +114,17 @@ def morphology_row_features(mask, scale): # extent extent = ski_morph[0]['extent'] - # volume of convex hull - convex_area = ski_morph[0]['convex_area'] + try: + # volume of convex hull + convex_area = ski_morph[0]['convex_area'] + + # solidity + solidity = ski_morph[0]['solidity'] + + except MemoryError: + log("Too large to compute convex hull") + convex_area = 0 + solidity = 0 # equivalent diameter equiv_diameter = ski_morph[0]['equivalent_diameter'] @@ -94,9 +135,6 @@ def morphology_row_features(mask, scale): # minor axis length minor_axis = ski_morph[0]['minor_axis_length'] - # solidity - solidity = ski_morph[0]['solidity'] - # The mesh calculation below fails if an edge of the segmentation is right up against the # edge of the volume - gives an open, rather than a closed surface # Pad by a few pixels to avoid this @@ -108,7 +146,7 @@ def morphology_row_features(mask, scale): # sphericity (as in morpholibj) # Should run from zero to one - sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3) + sphericity = (36 * np.pi * (float(volume_in_microns) ** 2)) / (float(surface_area) ** 3) # max radius - max distance from pixel to outside edt = distance_transform_edt(mask, sampling=scale, return_distances=True) @@ -132,7 +170,8 @@ def intensity_row_features(raw, mask): return mean_intensity, st_dev, median_intensity, interquartile_range_intensity, total -def radial_intensity_row_features(raw, mask, scale, stops = (0.0, 0.25, 0.5, 0.75, 1.0)): + +def radial_intensity_row_features(raw, mask, scale, stops=(0.0, 0.25, 0.5, 0.75, 1.0)): result = () edt = distance_transform_edt(mask, sampling=scale, return_distances=True) @@ -152,6 +191,7 @@ def radial_intensity_row_features(raw, mask, scale, stops = (0.0, 0.25, 0.5, 0.7 def texture_row_features(raw, mask): # errors if there are small, isolated spots (because I'm using ignore zeros as true) # so here remove components that are < 10 pixels + # may still error in some cases labelled = label(mask) if len(np.unique(labelled)) > 2: labelled = remove_small_objects(labelled, min_size=10) @@ -162,11 +202,17 @@ def texture_row_features(raw, mask): raw_copy = raw.copy() raw_copy[mask == 0] = 0 - hara = haralick(raw_copy, ignore_zeros=True, return_mean=True, distance=2) + try: + hara = haralick(raw_copy, ignore_zeros=True, return_mean=True, distance=2) + + except ValueError: + log('Texture computation failed - can happen when using ignore_zeros') + hara = (0.,)*13 return tuple(hara) -def radial_distribution (edt, mask, stops = (0.0, 0.25, 0.5, 0.75, 1.0)): + +def radial_distribution(edt, mask, stops=(0.0, 0.25, 0.5, 0.75, 1.0)): result = () bottoms = stops[0:len(stops) - 1] @@ -179,7 +225,8 @@ def radial_distribution (edt, mask, stops = (0.0, 0.25, 0.5, 0.75, 1.0)): return result -def chromatin_row_features (chromatin, edt, raw, scale_chromatin, scale_raw): + +def chromatin_row_features(chromatin, edt, raw, scale_chromatin, scale_raw): """ :param chromatin: numpy ndarray mask, boolean @@ -199,8 +246,8 @@ def chromatin_row_features (chromatin, edt, raw, scale_chromatin, scale_raw): # resize the chromatin masks if not same size as raw if chromatin.shape != raw.shape: chromatin = resize(chromatin, raw.shape, - order=0, mode='reflect', - anti_aliasing=True, preserve_range=True).astype('bool') + order=0, mode='reflect', + anti_aliasing=True, preserve_range=True).astype('bool') result += intensity_row_features(raw, chromatin) result += texture_row_features(raw, chromatin) @@ -211,13 +258,16 @@ def chromatin_row_features (chromatin, edt, raw, scale_chromatin, scale_raw): # compute morphology (and intensity features) for label range def morphology_features_for_label_range(table, ds, ds_raw, ds_chromatin, + ds_exclude, scale_factor_seg, scale_factor_raw, scale_factor_chromatin, + scale_factor_exclude, label_begin, label_end): label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end) sub_table = table.loc[label_range, :] stats = [] for row in sub_table.itertuples(index=False): + log(str(row.label_id)) label_id = int(row.label_id) # load the segmentation data from the bounding box corresponding @@ -234,7 +284,24 @@ def morphology_features_for_label_range(table, ds, ds_raw, # compute the morphology features from the segmentation mask result = (float(label_id),) + morphology_row_features(seg_mask, scale_factor_seg) - # TODO - if exclude, remove that part form mask before continuing, use added nucleus_id column + if ds_exclude is not None: + exclude = load_data(ds_exclude, row, scale_factor_exclude) + + # resize to fit seg + if exclude.shape != seg_mask.shape: + exclude = resize(exclude, seg_mask.shape, + order=0, mode='reflect', + anti_aliasing=True, preserve_range=True).astype('bool') + + # binary for correct nucleus + exclude = exclude == int(row.nucleus_id) + + # remove nucleus area form seg_mask + seg_mask[exclude] = False + + # just testing + from skimage.io import imsave + imsave('Z:\\Kimberly\\temp\\test_ex' + str(row.label_id) + '.tiff', seg_mask.astype('uint8')) # compute the intensiry features from raw data and segmentation mask if ds_raw is not None: @@ -266,31 +333,31 @@ def morphology_features_for_label_range(table, ds, ds_raw, # euclidean distance transform for whole nucleus, normalised to run from 0 to 1 whole_nucleus = np.logical_or(heterochromatin, euchromatin) edt = distance_transform_edt(whole_nucleus, sampling=scale_factor_chromatin, return_distances=True) - edt = edt/np.max(edt) + edt = edt / np.max(edt) if total_heterochromatin != 0: result += chromatin_row_features(heterochromatin, edt, raw, scale_factor_chromatin, scale_factor_raw) else: - result += (0.,)*36 + result += (0.,) * 36 if total_euchromatin != 0: result += chromatin_row_features(euchromatin, edt, raw, scale_factor_chromatin, scale_factor_raw) else: - result += (0.,)*36 - + result += (0.,) * 36 stats.append(result) return stats def compute_morphology_features(table, segmentation_path, raw_path, - chromatin_path, + chromatin_path, exclude_nuc_path, seg_key, raw_key, chromatin_key, + exclude_key, scale_factor_seg, scale_factor_raw, scale_factor_chromatin, + scale_factor_exclude, label_starts, label_stops): - if raw_path != '': assert raw_key is not None and scale_factor_raw is not None f_raw = h5py.File(raw_path, 'r') @@ -305,8 +372,13 @@ def compute_morphology_features(table, segmentation_path, raw_path, else: f_chromatin = ds_chromatin = None + if exclude_nuc_path != '': + assert exclude_key is not None and scale_factor_exclude is not None + f_exclude = h5py.File(exclude_nuc_path, 'r') + ds_exclude = f_exclude[exclude_key] - # TODO - opening of exclude dataset + else: + f_exclude = ds_exclude = None with h5py.File(segmentation_path, 'r') as f: ds = f[seg_key] @@ -316,8 +388,10 @@ def compute_morphology_features(table, segmentation_path, raw_path, log("Computing features from label-id %i to %i" % (label_a, label_b)) stats.extend(morphology_features_for_label_range(table, ds, ds_raw, ds_chromatin, + ds_exclude, scale_factor_seg, scale_factor_raw, scale_factor_chromatin, + scale_factor_exclude, label_a, label_b)) if f_raw is not None: f_raw.close() @@ -325,6 +399,9 @@ def compute_morphology_features(table, segmentation_path, raw_path, if f_chromatin is not None: f_chromatin.close() + if f_exclude is not None: + f_exclude.close() + # convert to pandas table and add column names stats = pd.DataFrame(stats) columns = ['label_id'] @@ -335,8 +412,9 @@ def compute_morphology_features(table, segmentation_path, raw_path, columns += morph_columns if raw_path != '': - intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr', 'intensity_total'] - texture_columns = ['texture_hara' + str(x) for x in range(1,14)] + intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr', + 'intensity_total'] + texture_columns = ['texture_hara' + str(x) for x in range(1, 14)] columns += intensity_columns # radial intensity columns @@ -359,10 +437,13 @@ def compute_morphology_features(table, segmentation_path, raw_path, def morphology_impl(segmentation_path, raw_path, chromatin_path, + exclude_nuc_path, table, mapping_path, + region_mapping_path, min_size, max_size, + max_bb, resolution, raw_scale, seg_scale, - chromatin_scale, + chromatin_scale, exclude_nuc_scale, label_starts, label_stops): """ Compute morphology features for a segmentation. @@ -378,6 +459,8 @@ def morphology_impl(segmentation_path, raw_path, chromatin_path, (sizes, center of mass and bounding boxes) for segmentation mapping_path [str] - path to - path to nucleus id mapping. Pass 'None' if not relevant. + region_mapping_path [str] - path to - path to cellid to region mapping + Pass 'None' if not relevant min_size [int] - minimal size for objects used in calcualtion max_size [int] - maximum size for objects used in calcualtion resolution [listlike] - resolution in nanometer. @@ -395,6 +478,8 @@ def morphology_impl(segmentation_path, raw_path, chromatin_path, raw_key = 't00000/s00/%i/cells' % raw_scale chromatin_key_full = 't00000/s00/0/cells' chromatin_key = 't00000/s00/%i/cells' % chromatin_scale + exclude_key_full = 't00000/s00/0/cells' + exclude_key = 't00000/s00/%i/cells' % exclude_nuc_scale # get scale factor for the segmentation scale_factor_seg = get_scale_factor(segmentation_path, seg_key_full, seg_key, resolution) @@ -416,11 +501,24 @@ def morphology_impl(segmentation_path, raw_path, chromatin_path, # NOTE for now we can hard-code the resolution for the chromatin data here, # but we might need to change this if we get additional dataset(s) chromatin_resolution = [0.025, 0.02, 0.02] - scale_factor_chromatin = get_scale_factor(chromatin_path, chromatin_key_full, chromatin_key, chromatin_resolution) + scale_factor_chromatin = get_scale_factor(chromatin_path, chromatin_key_full, chromatin_key, + chromatin_resolution) else: log("Don't have chromatin path; do not compute chromatin features") chromatin_resolution = scale_factor_chromatin = None + # get scale factor for nuclei segmentation (to be used to exclude nucleus) if given + if exclude_nuc_path != '': + log("Have nucleus path; exclude nucleus for intensity measures") + # NOTE for now we can hard-code the resolution for the nuclei data here, + # but we might need to change this if we get additional dataset(s) + exclude_resolution = [0.1, 0.08, 0.08] + scale_factor_exclude = get_scale_factor(exclude_nuc_path, exclude_key_full, exclude_key, + exclude_resolution) + else: + log("Don't have exclude path; don't exclude nucleus area for intensity measures") + scale_factor_exclude = scale_factor_exclude = None + # remove zero label if it exists table = table.loc[table['label_id'] != 0, :] @@ -430,18 +528,29 @@ def morphology_impl(segmentation_path, raw_path, chromatin_path, log("Have mapping path %s" % mapping_path) table = filter_table_from_mapping(table, mapping_path) log("Number of labels after filter with mapping: %i" % table.shape[0]) + + if region_mapping_path != '': + log("Have region mapping path %s" % region_mapping_path) + table = filter_table_region(table, region_mapping_path) + log("Number of labels after region filter: %i" % table.shape[0]) + # filter by size table = filter_table(table, min_size, max_size) - log("Number of labels after size filte: %i" % table.shape[0]) + log("Number of labels after size filter: %i" % table.shape[0]) - # TODO - add filtering for region - just add another path like the mapping_path one + # filter by bounding box size + if max_bb is not None: + table = filter_table_bb(table, max_bb) + log("Number of labels after bounding box size filter %i" % table.shape[0]) log("Computing morphology features") stats = compute_morphology_features(table, segmentation_path, raw_path, - chromatin_path, + chromatin_path, exclude_nuc_path, seg_key, raw_key, chromatin_key, + exclude_key, scale_factor_seg, scale_factor_raw, scale_factor_chromatin, + scale_factor_exclude, label_starts, label_stops) return stats -- GitLab