From fd712ab5f3c380c24cbf94fbd7e0a57e68791ecb Mon Sep 17 00:00:00 2001
From: Kimberly Meechan <kimberly.meechan@embl.de>
Date: Tue, 11 Jun 2019 12:59:05 +0200
Subject: [PATCH] Add morphology script

---
 scripts/attributes/morphology.py | 269 +++++++++++++++++++++++++++++++
 1 file changed, 269 insertions(+)
 create mode 100644 scripts/attributes/morphology.py

diff --git a/scripts/attributes/morphology.py b/scripts/attributes/morphology.py
new file mode 100644
index 0000000..dba5bac
--- /dev/null
+++ b/scripts/attributes/morphology.py
@@ -0,0 +1,269 @@
+#calculate morphology stats for cells / nuclei
+
+import pandas as pd
+import h5py
+import numpy as np
+from skimage.measure import regionprops, marching_cubes_lewiner, mesh_surface_area
+from skimage.transform import resize
+from skimage.io import imsave
+from skimage.util import pad
+
+def calculate_slice (scale, minmax):
+    """
+    Takes a scale (get this from xml file of object), and list of min and 
+    max coordinates in microns [min_coord_x_microns, min_coord_y_microns,
+    min_coord_z_microns, max_coord_x_microns, max_coord_y_microns, 
+    max_coord_z_microns].
+    Return a slice to use to extract
+    that bounding box from the original image.
+    """
+    #Get mins and flip order - so now z, y, x
+    mins = minmax[0:3][::-1]
+    #Get maxes and flip order - so now z, y, x
+    maxs = minmax[3:][::-1]
+    
+    #flip order of the scale
+    scale = scale[::-1]
+    
+    #convert min and max in microns to pixel ranges
+    mins = [int(mi / sca) for mi, sca in zip(mins, scale)]
+    maxs = [int(ma / sca) + 1 for ma, sca in zip(maxs, scale)]
+    
+    return( tuple(slice(mi, ma) for mi, ma in zip(mins, maxs)) )
+    
+  
+def calculate_stats_binary (mask, scale):
+    """
+    Calculate morphology statistics from the binary mask of an object
+    1 in object, 0 outside. 
+    mask - numpy array for image, should be binary, 1 in object, 0 outside
+    scale - list of form [x, y, z] stating scale in microns of image
+    """
+    #Calculate stats from skimage
+    ski_morph = regionprops(mask)
+    
+    #volume in pixels
+    volume_in_pix = ski_morph[0]['area']
+    
+    #extent
+    extent = ski_morph[0]['extent']
+    
+    #The mesh calculation below fails if an edge of the segmentation is right up against the
+    #edge of the volume - gives an open, rather than a closed surface
+    #Pad by a few pixels to avoid this
+    mask = pad(mask, 10, mode = 'constant')
+    
+    #surface area of mesh around object (other ways to calculate better?)
+    #spacing z, y , x to match mask dimensions
+    verts, faces, normals, values = marching_cubes_lewiner(mask, spacing = (scale[2], scale[1] , scale[0]))
+    surface_area = mesh_surface_area(verts, faces)
+    
+    #volume in microns
+    volume_in_microns = scale[0]*scale[1]*scale[2]*volume_in_pix
+    
+    #sphericity (as in morpholibj)
+    #Should run from zero to one
+    sphericity = (36*np.pi*(float(volume_in_microns)**2))/(float(surface_area)**3)
+    
+    return (volume_in_microns, extent, surface_area, sphericity)
+
+
+def calculate_stats_intensity (raw, mask):
+    """
+    Calculate intensity stats on the raw EM data for the region covered by the
+    provided binary mask. Mask and raw must have the same dimensions!
+    
+    raw - numpy array of raw image data
+    mask - numpy array of binary mask, 1 in object, 0 outside
+    """
+    
+    intensity_vals_in_mask = raw[mask == 1]
+    
+    #mean and stdev - use float64 to avoid silent overflow errors
+    mean_intensity = np.mean(intensity_vals_in_mask, dtype = np.float64)
+    st_dev = np.std(intensity_vals_in_mask, dtype = np.float64)
+    
+    return (mean_intensity, st_dev)
+
+
+#TODO - possibility that downscaled version of cell segmentation may not include some of the smallest cells in the table -
+#will fail if this happens. 
+def calculate_row_stats (row, seg_path, input_type, morphology, intensity):
+    """
+    Calculate all morphology stats for one row of the table
+    row - named tuple of values from that row
+    input_type - 'nucleus' or 'cell'
+    morphology - whether to calculate stats based on shape of segmentation: volume, extent, surface area, sphericity
+    intensity - whether to calculate stats based on raw data covered by segmentation: mean_intensity, st_dev_intensity
+    """
+    if row.label_id % 100 == 0:
+        print('Processing ' + str(row.label_id))
+    
+    #tuple to hold result
+    result = (row.label_id,)
+    
+    if input_type == 'nucleus':
+        full_seg_scale = [0.08, 0.08, 0.1]
+        #use full res of segmentation - 0.08, 0.08, 0.1
+        seg_level = 0
+        #Use the /t00000/s00/3/cells for raw data - gives pixel size similar to the nuclear segmentation
+        raw_level = 3
+        
+    elif input_type == 'cell':
+        full_seg_scale = [0.02, 0.02, 0.025]
+        #look at 4x downsampled segmentation - close to 0.08, 0.08, 0.1
+        seg_level = 2
+    
+    #min max bounding box in microns for segmentation
+    minmax_seg = [row.bb_min_x, row.bb_min_y, row.bb_min_z, row.bb_max_x, row.bb_max_y, row.bb_max_z]
+    
+    with h5py.File(seg_path, 'r') as f:
+        
+        #get shape of full data & downsampled
+        full_data_shape = f['t00000/s00/0/cells'].shape
+        down_data_shape = f['t00000/s00/' + str(seg_level) + '/cells'].shape
+            
+        #scale for downsampled
+        down_scale = [full_seg_scale[0]*(full_data_shape[2]/down_data_shape[2]),
+                      full_seg_scale[1]*(full_data_shape[1]/down_data_shape[1]),
+                      full_seg_scale[2]*(full_data_shape[0]/down_data_shape[0])]
+            
+        #slice for seg file
+        seg_slice = calculate_slice(down_scale, minmax_seg)
+            
+        #get specified layer in bdv file
+        data = f['t00000/s00/' + str(seg_level) + '/cells']
+        img_array = data[seg_slice]
+        
+     
+    #make into a binary mask
+    binary = img_array == int(row.label_id)
+    binary = binary.astype('uint8')
+    
+    img_array = None
+    
+    #calculate morphology stats straight from segmentation
+    if morphology == True:
+
+        result = result + calculate_stats_binary(binary, down_scale)
+    
+    #calculate intensity statistics
+    if intensity == True:
+        
+        #scale for full resolution raw data
+        full_raw_scale = [0.01, 0.01, 0.025]
+            
+        with h5py.File('/g/arendt/EM_6dpf_segmentation/EM-Prospr/em-raw-full-res.h5', 'r') as f:
+            
+            #get shape of full data & downsampled
+            full_data_shape = f['t00000/s00/0/cells'].shape
+            down_data_shape = f['t00000/s00/' + str(raw_level) + '/cells'].shape
+            
+            #scale for donwampled
+            down_scale = [full_raw_scale[0]*(full_data_shape[2]/down_data_shape[2]),
+                          full_raw_scale[1]*(full_data_shape[1]/down_data_shape[1]),
+                          full_raw_scale[2]*(full_data_shape[0]/down_data_shape[0])]
+            
+            #slice for raw file
+            raw_slice = calculate_slice(down_scale, minmax_seg)
+            
+            #get specified layer in bdv file
+            data = f['t00000/s00/' + str(raw_level) + '/cells']
+            img_array = data[raw_slice]
+            
+        
+        if img_array.shape != binary.shape:
+
+            #rescale the binary to match the raw, using nearest neighbour interpolation (order = 0)
+            binary = resize(binary, img_array.shape, order = 0, mode = 'reflect', anti_aliasing = True, preserve_range = True)
+            binary = binary.astype('uint8')
+        
+        
+        #Calculate statistics on raw data
+        result = result + calculate_stats_intensity(img_array, binary)
+    
+    
+    return (result)
+
+    
+def calculate_morphology_stats(table, seg_path, input_type, morphology = True, intensity = True):
+    '''
+    Calculate statistics based on EM & segmentation.
+    table - input table (pandas dataframe) 
+    input_type - 'nucleus' or 'cell'
+    morphology - whether to calculate stats based on shape of segmentation: volume_in_microns, extent, surface area, sphericity
+    intensity - whether to calculate stats based on raw data covered by segmentation: mean_intensity, st_dev_intensity
+    '''
+    
+    #size cutoffs (at the moment just browsed through segmentation and chose sensible values)
+    #not very stringent, still some rather large / small things included
+    if input_type == 'nucleus':
+        table = table.loc[table['shape_pixelsize'] >= 18313.0, :]
+        
+    elif input_type == 'cell':
+        criteria = np.logical_and(table['shape_pixelsize'] > 88741.0, table['shape_pixelsize'] < 600000000.0)
+        table = table.loc[criteria, :]
+    
+    stats = [calculate_row_stats(row, seg_path, input_type, morphology, intensity) for row in table.itertuples(index=False)]
+    
+    #convert back to pandas dataframe
+    stats = pd.DataFrame(stats)
+    
+    #names for columns
+    columns = ['label_id']
+    
+    if morphology == True:
+        columns = columns + ['shape_volume_in_microns', 'shape_extent', 'shape_surface_area', 'shape_sphericity']
+        
+    if intensity == True:
+        columns = columns + ['intensity_mean', 'intensity_st_dev']
+    
+    #set column names
+    stats.columns = columns
+    
+    return (stats)
+
+
+def write_morphology_table (cell_seg_path, nucleus_seg_path, cell_table_path, nucleus_table_path, cell_nuc_mapping_path, nucleus_out_path, cell_out_path):
+    """
+    Write csv files of morphology stats for both the nucleus and cell segmentation
+    
+    cell_seg_path - string, file path to cell segmentation
+    nucleus_seg_path - string, file path to nucleus segmentation
+    cell_table_path - string, file path to cell table
+    nucleus_table_path - string, file path to nucleus table
+    cell_nuc_mapping_path - string, file path to numpy array mapping cells to nuclei (first column cell id, second nucleus id)
+    nucleus_out_path - string, file path to save new nucleus table
+    cell_out_path - string, file path to save new cell table
+    """
+    
+    cell_table = pd.read_csv(cell_table_path, sep = '\t')
+    
+    nuclei_table = pd.read_csv(nucleus_table_path, sep = '\t')
+
+    #remove zero label form both tables if it exists
+    nuclei_table = nuclei_table.loc[nuclei_table['label_id'] != 0, :]
+    cell_table = cell_table.loc[cell_table['label_id'] != 0, :]
+    
+    #read in numpy array of mapping of cells to nuclei - first column cell id, second nucleus id
+    cell_nucleus_mapping = np.load(cell_nuc_mapping_path)
+    #remove zero labels from this table too, if exist
+    cell_nucleus_mapping = cell_nucleus_mapping[np.logical_and(cell_nucleus_mapping[:,0] != 0, cell_nucleus_mapping[:, 1] != 0)]
+    
+    #only keep cells in the cell_nuc_mapping (i.e those that have assigned nuclei)
+    cell_table = cell_table.loc[np.isin(cell_table['label_id'], cell_nucleus_mapping[:, 0]), :]
+    
+    #calculate stats for both tables and save results to csv
+    result_nuclei = calculate_morphology_stats(nuclei_table, nucleus_seg_path, 'nucleus', morphology = True, intensity = True)
+    result_nuclei.to_csv(nucleus_out_path, index = False, sep = '\t')
+    
+    result_cells = calculate_morphology_stats(cell_table, cell_seg_path, 'cell', morphology = True, intensity = False)
+    result_cells.to_csv(cell_out_path, index = False, sep = '\t')
+    
+    
+    
+    
+    
+    
+    
+    
-- 
GitLab