Newer
Older
from datetime import datetime
#TODO - uncomment this part
# this is a task called by multiple processes,
# so we need to restrict the number of threads used by numpy
#from cluster_tools.utils.numpy_utils import set_numpy_threads
#set_numpy_threads(1)
import numpy as np
import h5py
import pandas as pd
from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area
from skimage.transform import resize
from skimage.util import pad
def log(msg):
print("%s: %s" % (str(datetime.now()), msg))
# get shape of full data & downsampling factor
def get_scale_factor(path, key_full, key, resolution):
with h5py.File(path, 'r') as f:
full_shape = f[key_full].shape
shape = f[key].shape
# scale factor for downsampling
scale_factor = [res * (fs / sh)
for res, fs, sh in zip(resolution,
full_shape,
shape)]
return scale_factor
# changed to shape_pixel_size, is the plan to go to n_pixels in future?
def filter_table(table, min_size, max_size):
if max_size is None:
table = table.loc[table['shape_pixelsize'] >= min_size, :]
criteria = np.logical_and(table['shape_pixelsize'] > min_size, table['shape_pixelsize'] < max_size)
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
table = table.loc[criteria, :]
return table
def filter_table_from_mapping(table, mapping_path):
# read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id
mapping = np.genfromtxt(mapping_path, skip_header=1, delimiter='\t')[:, :2].astype('uint64')
# remove zero labels from this table too, if exist
mapping = mapping[np.logical_and(mapping[:, 0] != 0,
mapping[:, 1] != 0)]
table = table.loc[np.isin(table['label_id'], mapping[:, 0]), :]
return table
def load_data(ds, row, scale):
# compute the bounding box from the row information
mins = [row.bb_min_z, row.bb_min_y, row.bb_min_x]
maxs = [row.bb_max_z, row.bb_max_y, row.bb_max_x]
mins = [int(mi / sca) for mi, sca in zip(mins, scale)]
maxs = [int(ma / sca) + 1 for ma, sca in zip(maxs, scale)]
bb = tuple(slice(mi, ma) for mi, ma in zip(mins, maxs))
# load the data from the bounding box
return ds[bb]
def morphology_row_features(mask, scale):
# Calculate stats from skimage
ski_morph = regionprops(mask.astype('uint8'))
# volume in pixels
volume_in_pix = ski_morph[0]['area']
# extent
extent = ski_morph[0]['extent']
# The mesh calculation below fails if an edge of the segmentation is right up against the
# edge of the volume - gives an open, rather than a closed surface
# Pad by a few pixels to avoid this
mask = pad(mask, 10, mode='constant')
# surface area of mesh around object (other ways to calculate better?)
verts, faces, normals, values = marching_cubes_lewiner(mask, spacing=tuple(scale))
surface_area = mesh_surface_area(verts, faces)
# volume in microns
volume_in_microns = np.prod(scale)*volume_in_pix
# sphericity (as in morpholibj)
# Should run from zero to one
sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3)
return [volume_in_microns, extent, surface_area, sphericity]
def intensity_row_features(raw, mask):
intensity_vals_in_mask = raw[mask]
# mean and stdev - use float64 to avoid silent overflow errors
mean_intensity = np.mean(intensity_vals_in_mask, dtype=np.float64)
st_dev = np.std(intensity_vals_in_mask, dtype=np.float64)
return mean_intensity, st_dev
# compute morphology (and intensity features) for label range
def morphology_features_for_label_range(table, ds, ds_raw,
scale_factor_seg, scale_factor_raw,
label_begin, label_end):
label_range = np.logical_and(table['label_id'] >= label_begin, table['label_id'] < label_end)
sub_table = table.loc[label_range, :]
stats = []
for row in sub_table.itertuples(index=False):
label_id = int(row.label_id)
# load the segmentation data from the bounding box corresponding
# to this row
seg = load_data(ds, row, scale_factor_seg)
# compute the segmentation mask and check that we have
# foreground in the mask
seg_mask = seg == label_id
if seg_mask.sum() == 0:
# if the seg mask is empty, we simply skip this label-id
continue
# compute the morphology features from the segmentation mask
result = [float(label_id)] + morphology_row_features(seg_mask, scale_factor_seg)
# compute the intensiry features from raw data and segmentation mask
if ds_raw is not None:
raw = load_data(ds_raw, row, scale_factor_raw)
# resize the segmentation mask if it does not fit the raw data
if seg_mask.shape != raw.shape:
seg_mask = resize(seg_mask, raw.shape,
order=0, mode='reflect',
anti_aliasing=True, preserve_range=True).astype('bool')
result += intensity_row_features(raw, seg_mask)
stats.append(result)
return stats
def compute_morphology_features(table, segmentation_path, raw_path,
seg_key, raw_key,
scale_factor_seg, scale_factor_raw,
label_starts, label_stops):
if raw_path != '':
assert raw_key is not None and scale_factor_raw is not None
f_raw = h5py.File(raw_path, 'r')
ds_raw = f_raw[raw_key]
else:
f_raw = ds_raw = None
with h5py.File(segmentation_path, 'r') as f:
ds = f[seg_key]
stats = []
for label_a, label_b in zip(label_starts, label_stops):
log("Computing features from label-id %i to %i" % (label_a, label_b))
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
scale_factor_seg, scale_factor_raw,
label_a, label_b))
if f_raw is not None:
f_raw.close()
# convert to pandas table and add column names
stats = pd.DataFrame(stats)
columns = ['label_id',
'shape_volume_in_microns', 'shape_extent', 'shape_surface_area', 'shape_sphericity']
if raw_path != '':
columns += ['intensity_mean', 'intensity_st_dev']
stats.columns = columns
return stats
def morphology_impl(segmentation_path, raw_path, table, mapping_path,
min_size, max_size,
resolution, raw_scale, seg_scale,
label_starts, label_stops):
""" Compute morphology features for a segmentation.
Can compute features for multiple label ranges. If you want to
compute features for the full label range, pass
'label_starts=[0]' and 'label_stops=[number_of_labels]'
Arguments:
segmentation_path [str] - path to segmentation stored as h5.
raw_path [str] - path to raw data stored as h5.
Pass 'None' if you don't want to compute features based on raw data.
table [pd.DataFrame] - table with default attributes
(sizes, center of mass and bounding boxes) for segmentation
mapping_path [str] - path to - path to nucleus id mapping.
Pass 'None' if not relevant.
min_size [int] - minimal size for objects used in calcualtion
max_size [int] - maximum size for objects used in calcualtion
resolution [listlike] - resolution in nanometer.
Must be given in [Z, Y, X].
raw_scale [int] - scale level of the raw data
seg_scale [int] - scale level of the segmentation
label_starts [listlike] - list with label start positions
label_stops [listlike] - list with label stop positions
"""
# keys to segmentation and raw data for the different scales
seg_key_full = 't00000/s00/0/cells'
seg_key = 't00000/s00/%i/cells' % seg_scale
raw_key_full = 't00000/s00/0/cells'
raw_key = 't00000/s00/%i/cells' % raw_scale
# get scale factor for the segmentation
scale_factor_seg = get_scale_factor(segmentation_path, seg_key_full, seg_key, resolution)
# get scale factor for raw data (if it's given)
if raw_path != '':
log("Have raw path; compute intensity features")
# NOTE for now we can hard-code the resolution for the raw data here,
# but we might need to change this if we get additional dataset(s)
raw_resolution = [0.025, 0.01, 0.01]
scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
else:
log("Don't have raw path; do not compute intensity features")
raw_resolution = scale_factor_raw = None
# remove zero label if it exists
table = table.loc[table['label_id'] != 0, :]
# if we have a mappin, only keep objects in the mapping
# (i.e cells that have assigned nuclei)
if mapping_path != '':
log("Have mapping path %s" % mapping_path)
table = filter_table_from_mapping(table, mapping_path)
log("Number of labels after filter with mapping: %i" % table.shape[0])
# filter by size
table = filter_table(table, min_size, max_size)
log("Number of labels after size filte: %i" % table.shape[0])
log("Computing morphology features")
stats = compute_morphology_features(table, segmentation_path, raw_path,
seg_key, raw_key, scale_factor_seg, scale_factor_raw,
label_starts, label_stops)
return stats
if __name__ == '__main__':
pass