From aad7c22fc7a3e2e2777616ac8a7fc3f2933a2e62 Mon Sep 17 00:00:00 2001
From: Kimberly Meechan <kimberly.meechan@embl.de>
Date: Tue, 28 Jan 2020 11:11:34 +0100
Subject: [PATCH] moved column generation into separate function, merged
 compute_morphology_features function into main function

---
 .../extension/attributes/morphology_impl.py   | 352 +++++++++++-------
 1 file changed, 210 insertions(+), 142 deletions(-)

diff --git a/scripts/extension/attributes/morphology_impl.py b/scripts/extension/attributes/morphology_impl.py
index d9e33fe..734db38 100644
--- a/scripts/extension/attributes/morphology_impl.py
+++ b/scripts/extension/attributes/morphology_impl.py
@@ -37,7 +37,6 @@ def get_scale_factor(path, key_full, key, resolution):
                                            shape)]
     return scale_factor
 
-
 def filter_table(table, min_size, max_size):
     if max_size is None:
         table = table.loc[table['n_pixels'] >= min_size, :]
@@ -99,6 +98,40 @@ def load_data(ds, row, scale):
     # load the data from the bounding box
     return ds[bb]
 
+def generate_column_names (raw_path, chromatin_path):
+    columns = ['label_id']
+    morph_columns = ['shape_volume_in_microns', 'shape_extent', 'shape_equiv_diameter',
+                     'shape_major_axis', 'shape_minor_axis', 'shape_surface_area', 'shape_sphericity',
+                     'shape_max_radius']
+
+    columns += morph_columns
+
+    if raw_path is not None:
+        intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr',
+                             'intensity_total']
+        texture_columns = ['texture_hara%s' % x for x in range(1, 14)]
+
+        columns += intensity_columns
+
+        # radial intensity columns
+        for val in [25, 50, 75, 100]:
+            columns += ['%s_%s' % (var, val) for var in intensity_columns]
+        columns += texture_columns
+
+    if chromatin_path is not None:
+        edt_columns = ['shape_edt_mean', 'shape_edt_stdev', 'shape_edt_median', 'shape_edt_iqr']
+        edt_columns += ['shape_percent_%s' % var for var in [25, 50, 75, 100]]
+
+        for phase in ['_het', '_eu']:
+            columns += [var + phase for var in morph_columns]
+            columns += [var + phase for var in edt_columns]
+
+            if raw_path is not None:
+                columns += [var + phase for var in intensity_columns]
+                columns += [var + phase for var in texture_columns]
+
+    return columns
+
 # @profile
 def morphology_row_features(mask, scale):
     # Calculate stats from skimage
@@ -315,96 +348,6 @@ def morphology_features_for_label_range(table, ds, ds_raw,
         stats.append(result)
     return stats
 
-
-def compute_morphology_features(table, segmentation_path, raw_path,
-                                chromatin_path, exclude_nuc_path,
-                                seg_key, raw_key, chromatin_key,
-                                exclude_key,
-                                scale_factor_seg, scale_factor_raw,
-                                scale_factor_chromatin,
-                                scale_factor_exclude,
-                                label_starts, label_stops):
-    if raw_path != '':
-        assert raw_key is not None and scale_factor_raw is not None
-        f_raw = h5py.File(raw_path, 'r')
-        ds_raw = f_raw[raw_key]
-    else:
-        f_raw = ds_raw = None
-
-    if chromatin_path != '':
-        assert chromatin_key is not None and scale_factor_chromatin is not None
-        f_chromatin = h5py.File(chromatin_path, 'r')
-        ds_chromatin = f_chromatin[chromatin_key]
-    else:
-        f_chromatin = ds_chromatin = None
-
-    if exclude_nuc_path != '':
-        assert exclude_key is not None and scale_factor_exclude is not None
-        f_exclude = h5py.File(exclude_nuc_path, 'r')
-        ds_exclude = f_exclude[exclude_key]
-
-    else:
-        f_exclude = ds_exclude = None
-
-    with h5py.File(segmentation_path, 'r') as f:
-        ds = f[seg_key]
-
-        stats = []
-        for label_a, label_b in zip(label_starts, label_stops):
-            log("Computing features from label-id %i to %i" % (label_a, label_b))
-            stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
-                                                             ds_chromatin,
-                                                             ds_exclude,
-                                                             scale_factor_seg, scale_factor_raw,
-                                                             scale_factor_chromatin,
-                                                             scale_factor_exclude,
-                                                             label_a, label_b))
-    if f_raw is not None:
-        f_raw.close()
-
-    if f_chromatin is not None:
-        f_chromatin.close()
-
-    if f_exclude is not None:
-        f_exclude.close()
-
-    # convert to pandas table and add column names
-    stats = pd.DataFrame(stats)
-    columns = ['label_id']
-    morph_columns = ['shape_volume_in_microns', 'shape_extent', 'shape_equiv_diameter',
-                     'shape_major_axis', 'shape_minor_axis', 'shape_surface_area', 'shape_sphericity',
-                     'shape_max_radius']
-
-    columns += morph_columns
-
-    if raw_path != '':
-        intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr',
-                             'intensity_total']
-        texture_columns = ['texture_hara%s' % x for x in range(1, 14)]
-
-        columns += intensity_columns
-        # radial intensity columns
-        for val in [25, 50, 75, 100]:
-            columns += ['%s_%s' % (var, val) for var in intensity_columns]
-        columns += texture_columns
-
-    if chromatin_path != '':
-        edt_columns = ['shape_edt_mean', 'shape_edt_stdev', 'shape_edt_median', 'shape_edt_iqr']
-        edt_columns += ['shape_percent_%s' % var for var in [25, 50, 75, 100]]
-
-        for phase in ['_het', '_eu']:
-            columns += [var + phase for var in morph_columns]
-            columns += [var + phase for var in edt_columns]
-
-            if raw_path != '':
-                columns += [var + phase for var in intensity_columns]
-                columns += [var + phase for var in texture_columns]
-
-    stats.columns = columns
-    return stats
-
-
-
 def morphology_impl_nucleus (nucleus_segmentation_path, raw_path, chromatin_path,
                     table,
                     min_size, max_size,
@@ -421,50 +364,69 @@ def morphology_impl_nucleus (nucleus_segmentation_path, raw_path, chromatin_path
     chromatin_key_full = 't00000/s00/0/cells'
     chromatin_key = 't00000/s00/%i/cells' % chromatin_scale
 
-    # get scale factors
-    scale_factor_nucleus_seg = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
-                                                nucleus_resolution)
+    # remove zero label if it exists
+    table = table.loc[table['label_id'] != 0, :]
+
+    # filter by size
+    if min_size is not None or max_size is not None:
+        table = filter_table(table, min_size, max_size)
+        log("Number of labels after size filter: %i" % table.shape[0])
+
+    # filter by bounding box size
+    if max_bb is not None:
+        table = filter_table_bb(table, max_bb)
+        log("Number of labels after bounding box size filter %i" % table.shape[0])
 
+    # get scale factors
     if raw_path is not None:
         log("Have raw path; compute intensity features")
         # NOTE for now we can hard-code the resolution for the raw data here,
         # but we might need to change this if we get additional dataset(s)
         raw_resolution = [0.025, 0.01, 0.01]
         scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
+        f_raw = h5py.File(raw_path, 'r')
+        ds_raw = f_raw[raw_key]
     else:
         log("Don't have raw path; do not compute intensity features")
         scale_factor_raw = None
+        f_raw = ds_raw = None
 
     if chromatin_path is not None:
         log("Have chromatin path; compute chromatin features")
         scale_factor_chromatin = get_scale_factor(chromatin_path, chromatin_key_full, chromatin_key,
                                                   chromatin_resolution)
+        f_chromatin = h5py.File(chromatin_path, 'r')
+        ds_chromatin = f_chromatin[chromatin_key]
     else:
         log("Don't have chromatin path; do not compute chromatin features")
         scale_factor_chromatin = None
+        f_chromatin = ds_chromatin = None
 
-    # remove zero label if it exists
-    table = table.loc[table['label_id'] != 0, :]
+    log("Computing morphology features")
+    scale_factor_nucleus_seg = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
+                                                nucleus_resolution)
+    with h5py.File(nucleus_segmentation_path, 'r') as f:
+        ds = f[nucleus_seg_key]
 
-    # filter by size
-    if min_size is not None or max_size is not None:
-        table = filter_table(table, min_size, max_size)
-        log("Number of labels after size filter: %i" % table.shape[0])
+        stats = []
+        for label_a, label_b in zip(label_starts, label_stops):
+            log("Computing features from label-id %i to %i" % (label_a, label_b))
+            stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
+                                                             ds_chromatin,
+                                                             ds_exclude,
+                                                             scale_factor_seg, scale_factor_raw,
+                                                             scale_factor_chromatin,
+                                                             scale_factor_exclude,
+                                                             label_a, label_b))
 
-    # filter by bounding box size
-    if max_bb is not None:
-        table = filter_table_bb(table, max_bb)
-        log("Number of labels after bounding box size filter %i" % table.shape[0])
+    for var in f_raw, f_chromatin:
+        if var is not None:
+            var.close()
+
+    # convert to pandas table and add column names
+    stats = pd.DataFrame(stats)
+    stats.columns = generate_column_names(raw_path, chromatin_path)
 
-    log("Computing morphology features")
-    stats = compute_morphology_features(table, segmentation_path, raw_path,
-                                        chromatin_path, exclude_nuc_path,
-                                        seg_key, raw_key, chromatin_key,
-                                        exclude_key,
-                                        scale_factor_seg, scale_factor_raw,
-                                        scale_factor_chromatin,
-                                        scale_factor_exclude,
-                                        label_starts, label_stops)
     return stats
 
 def morphology_impl_cell (cell_segmentation_path, raw_path,
@@ -485,27 +447,6 @@ def morphology_impl_cell (cell_segmentation_path, raw_path,
     nucleus_seg_key_full = 't00000/s00/0/cells'
     nucleus_seg_key = 't00000/s00/%i/cells' % nucleus_seg_scale
 
-    # get scale factors
-    cell_seg_scale_factor = get_scale_factor(cell_segmentation_path, cell_seg_key_full, cell_seg_key, cell_resolution)
-
-    if raw_path is not None:
-        log("Have raw path; compute intensity features")
-        # NOTE for now we can hard-code the resolution for the raw data here,
-        # but we might need to change this if we get additional dataset(s)
-        raw_resolution = [0.025, 0.01, 0.01]
-        scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
-    else:
-        log("Don't have raw path; do not compute intensity features")
-        scale_factor_raw = None
-
-    if nucleus_segmentation_path is not None:
-        log("Have nucleus path; exclude nucleus for intensity measures")
-        scale_factor_nucleus = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
-                                                nucleus_resolution)
-    else:
-        log("Don't have exclude path; don't exclude nucleus area for intensity measures")
-        scale_factor_nucleus = None
-
     # remove zero label if it exists
     table = table.loc[table['label_id'] != 0, :]
 
@@ -531,15 +472,55 @@ def morphology_impl_cell (cell_segmentation_path, raw_path,
         table = filter_table_bb(table, max_bb)
         log("Number of labels after bounding box size filter %i" % table.shape[0])
 
+    # get scale factors
+    if raw_path is not None:
+        log("Have raw path; compute intensity features")
+        # NOTE for now we can hard-code the resolution for the raw data here,
+        # but we might need to change this if we get additional dataset(s)
+        raw_resolution = [0.025, 0.01, 0.01]
+        scale_factor_raw = get_scale_factor(raw_path, raw_key_full, raw_key, raw_resolution)
+        f_raw = h5py.File(raw_path, 'r')
+        ds_raw = f_raw[raw_key]
+    else:
+        log("Don't have raw path; do not compute intensity features")
+        scale_factor_raw = None
+        f_raw = ds_raw = None
+
+    if nucleus_segmentation_path is not None:
+        log("Have nucleus path; exclude nucleus for intensity measures")
+        scale_factor_nucleus = get_scale_factor(nucleus_segmentation_path, nucleus_seg_key_full, nucleus_seg_key,
+                                                nucleus_resolution)
+        f_nucleus = h5py.File(nucleus_segmentation_path, 'r')
+        ds_nucleus = f_exclude[nucleus_seg_key]
+    else:
+        log("Don't have exclude path; don't exclude nucleus area for intensity measures")
+        scale_factor_nucleus = None
+        f_nucleus = ds_nucleus = None
+
     log("Computing morphology features")
-    stats = compute_morphology_features(table, segmentation_path, raw_path,
-                                        chromatin_path, exclude_nuc_path,
-                                        seg_key, raw_key, chromatin_key,
-                                        exclude_key,
-                                        scale_factor_seg, scale_factor_raw,
-                                        scale_factor_chromatin,
-                                        scale_factor_exclude,
-                                        label_starts, label_stops)
+    cell_seg_scale_factor = get_scale_factor(cell_segmentation_path, cell_seg_key_full, cell_seg_key, cell_resolution)
+    with h5py.File(cell_segmentation_path, 'r') as f:
+        ds = f[cell_seg_key]
+
+        stats = []
+        for label_a, label_b in zip(label_starts, label_stops):
+            log("Computing features from label-id %i to %i" % (label_a, label_b))
+            stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
+                                                             ds_chromatin,
+                                                             ds_exclude,
+                                                             scale_factor_seg, scale_factor_raw,
+                                                             scale_factor_chromatin,
+                                                             scale_factor_exclude,
+                                                             label_a, label_b))
+
+    for var in f_raw, f_nucleus:
+        if var is not None:
+            var.close()
+
+    # convert to pandas table and add column names
+    stats = pd.DataFrame(stats)
+    stats.columns = generate_column_names(raw_path, None)
+
     return stats
 
 
@@ -668,6 +649,93 @@ def morphology_impl(segmentation_path, raw_path, chromatin_path,
                                         label_starts, label_stops)
     return stats
 
+def compute_morphology_features(table, segmentation_path, raw_path,
+                                chromatin_path, exclude_nuc_path,
+                                seg_key, raw_key, chromatin_key,
+                                exclude_key,
+                                scale_factor_seg, scale_factor_raw,
+                                scale_factor_chromatin,
+                                scale_factor_exclude,
+                                label_starts, label_stops):
+    if raw_path != '':
+        assert raw_key is not None and scale_factor_raw is not None
+        f_raw = h5py.File(raw_path, 'r')
+        ds_raw = f_raw[raw_key]
+    else:
+        f_raw = ds_raw = None
+
+    if chromatin_path != '':
+        assert chromatin_key is not None and scale_factor_chromatin is not None
+        f_chromatin = h5py.File(chromatin_path, 'r')
+        ds_chromatin = f_chromatin[chromatin_key]
+    else:
+        f_chromatin = ds_chromatin = None
+
+    if exclude_nuc_path != '':
+        assert exclude_key is not None and scale_factor_exclude is not None
+        f_exclude = h5py.File(exclude_nuc_path, 'r')
+        ds_exclude = f_exclude[exclude_key]
+
+    else:
+        f_exclude = ds_exclude = None
+
+    with h5py.File(segmentation_path, 'r') as f:
+        ds = f[seg_key]
+
+        stats = []
+        for label_a, label_b in zip(label_starts, label_stops):
+            log("Computing features from label-id %i to %i" % (label_a, label_b))
+            stats.extend(morphology_features_for_label_range(table, ds, ds_raw,
+                                                             ds_chromatin,
+                                                             ds_exclude,
+                                                             scale_factor_seg, scale_factor_raw,
+                                                             scale_factor_chromatin,
+                                                             scale_factor_exclude,
+                                                             label_a, label_b))
+    if f_raw is not None:
+        f_raw.close()
+
+    if f_chromatin is not None:
+        f_chromatin.close()
+
+    if f_exclude is not None:
+        f_exclude.close()
+
+    # convert to pandas table and add column names
+    stats = pd.DataFrame(stats)
+    columns = ['label_id']
+    morph_columns = ['shape_volume_in_microns', 'shape_extent', 'shape_equiv_diameter',
+                     'shape_major_axis', 'shape_minor_axis', 'shape_surface_area', 'shape_sphericity',
+                     'shape_max_radius']
+
+    columns += morph_columns
+
+    if raw_path != '':
+        intensity_columns = ['intensity_mean', 'intensity_st_dev', 'intensity_median', 'intensity_iqr',
+                             'intensity_total']
+        texture_columns = ['texture_hara%s' % x for x in range(1, 14)]
+
+        columns += intensity_columns
+        # radial intensity columns
+        for val in [25, 50, 75, 100]:
+            columns += ['%s_%s' % (var, val) for var in intensity_columns]
+        columns += texture_columns
+
+    if chromatin_path != '':
+        edt_columns = ['shape_edt_mean', 'shape_edt_stdev', 'shape_edt_median', 'shape_edt_iqr']
+        edt_columns += ['shape_percent_%s' % var for var in [25, 50, 75, 100]]
+
+        for phase in ['_het', '_eu']:
+            columns += [var + phase for var in morph_columns]
+            columns += [var + phase for var in edt_columns]
+
+            if raw_path != '':
+                columns += [var + phase for var in intensity_columns]
+                columns += [var + phase for var in texture_columns]
+
+    stats.columns = columns
+    return stats
+
 
 
 if __name__ == '__main__':
-- 
GitLab