Commit 3f682f72 authored by Constantin Pape

Add script for chunk estimates

parent 710ea4fb
 import numpy as np
 import pandas as pd
-from make_cell_table import right_nephr_ids, left_nephr_ids
 from matplotlib import pyplot as plt
 
-TABLE_PATH = '../../data/0.6.2/tables/sbem-6dpf-1-whole-segmented-cilia-labels/cell_mapping.csv'
+
+def get_nephr_ids(version):
+    table_path = '../../data/%s/tables/sbem-6dpf-1-whole-segmented-cells-labels/regions.csv' % version
+    table = pd.read_csv(table_path, sep='\t')
+    nephr_ids = table['nephridia'].values
+    right_nephr_ids = np.where(nephr_ids == 1)[0]
+    left_nephr_ids = np.where(nephr_ids == 2)[0]
+    return right_nephr_ids, left_nephr_ids
 
 
-def check_cell_ids():
-    table = pd.read_csv(TABLE_PATH, sep='\t')
+def check_cell_ids(version):
+    table_path = '../../data/%s/tables/sbem-6dpf-1-whole-segmented-cilia-labels/cell_mapping.csv' % version
+    right_nephr_ids, left_nephr_ids = get_nephr_ids(version)
+    table = pd.read_csv(table_path, sep='\t')
     cell_ids = table['cell_id'].values
     matched_right = []
@@ -35,16 +44,18 @@ def check_cell_ids():
     print("With cilia:", len(matched_left))
 
 
-def plot_cilia_per_cell():
+def plot_cilia_per_cell(version):
     counts_left = []
     counts_right = []
-    table = pd.read_csv(TABLE_PATH, sep='\t')
+    table_path = '../../data/%s/tables/sbem-6dpf-1-whole-segmented-cilia-labels/cell_mapping.csv' % version
+    table = pd.read_csv(table_path, sep='\t')
     cell_ids = table['cell_id']
     cell_ids = cell_ids[cell_ids != 0]
     cell_ids = cell_ids[~np.isnan(cell_ids)]
     cell_ids = cell_ids.astype('uint32')
+    right_nephr_ids, left_nephr_ids = get_nephr_ids(version)
     unique_cell_ids = np.unique(cell_ids)
     total_count = 0
@@ -84,5 +95,6 @@ def plot_cilia_per_cell():
 if __name__ == '__main__':
-    check_cell_ids()
-    plot_cilia_per_cell()
+    version = '0.6.5'
+    check_cell_ids(version)
+    plot_cilia_per_cell(version)
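
For reference, a minimal sketch (toy data, not the real regions.csv) of the
selection logic in get_nephr_ids: the 'nephridia' column encodes 1 for the
right and 2 for the left nephridium, and np.where returns the matching row
positions (these coincide with label ids only if the table rows are ordered
by label id):

import numpy as np
import pandas as pd

table = pd.DataFrame({'label_id': [0, 1, 2, 3, 4],
                      'nephridia': [0, 1, 2, 1, 0]})
nephr_ids = table['nephridia'].values
right_nephr_ids = np.where(nephr_ids == 1)[0]  # -> array([1, 3])
left_nephr_ids = np.where(nephr_ids == 2)[0]   # -> array([2])
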
@@ -214,7 +214,7 @@ def copy_segmentations(in_folder, out_folder, segmentations_to_copy, output_root
         # path in bucket is the relative path from out_file to output_root
         path_in_bucket = os.path.relpath(out_file, output_root)
         out_file = os.path.join(s3_folder, seg_name + '.xml')
-        make_xml_s3(in_file, out_file, path_in_bucket, s3_config, shape, resolution)
+        make_xml_s3(in_file, out_file, path_in_bucket, None, shape, resolution)
 
         # check if we need to copy tables
         seg_table_in = os.path.join(table_in, seg_name)
@@ -258,7 +258,7 @@ def copy_folder_for_s3(version, images_to_copy, segmentations_to_copy, output_root
                        segmentations_to_copy, output_root)
 
 
-if __name__ == '__main__':
+def make_test_folder():
     res = [.1, .08, .08]
     im_names = {'sbem-6dpf-1-whole-raw': {'start_scale': 3, 'resolution': res},
                 'prospr-6dpf-1-whole-AChE-MED': {'resolution': [.55, .55, .55]}}
@@ -268,3 +268,91 @@ if __name__ == '__main__':
     out = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/test_n5'
     s3_config = {}
     copy_folder_for_s3('0.6.5', im_names, seg_names, out, s3_config)
+
+
+def make_different_chunkings():
+    path = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/rawdata/sbem-6dpf-1-whole-raw.h5'
+    xml_path = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/0.6.5/images/sbem-6dpf-1-whole-raw.xml'
+    output_root = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/test_n5'
+    data_out_folder = os.path.join(output_root, 'rawdata')
+
+    start_scale = 3
+    resolution = [.1, .08, .08]
+
+    chunk_shapes = [(32, 256, 256), (32, 128, 128), (64, 64, 64)]  # chunkings from earlier runs (indices 1-3)
+    chunk_shapes = [(128, 128, 128)]  # this run only adds index 4
+    for ii, chunks in enumerate(chunk_shapes, 4):
+        out_path = os.path.join(data_out_folder, 'sbem-6dpf-1-whole-raw-%i.n5' % ii)
+        copy_file_to_bdv_n5(path, out_path, resolution, chunks, start_scale)
+
+        # make the xml
+        path_in_bucket = os.path.relpath(out_path, output_root)
+        with open_file(out_path, 'r') as f:
+            shape = f['setup0/timepoint0/s0'].shape
+        out_path = os.path.join(data_out_folder, 'sbem-6dpf-1-whole-raw-%i.xml' % ii)
+        make_xml_s3(xml_path, out_path, path_in_bucket, None, shape, resolution)
+
+
+def iterate_chunks(path, key):
+    with open_file(path, 'r') as f:
+        ds = f[key]
+        n_chunks = ds.number_of_chunks
+
+    # n5 stores each non-empty chunk as a separate file, so walking the
+    # dataset folder and counting files gives the number of filled chunks
+    ds_path = os.path.join(path, key)
+    chunk_sizes = []
+    for root, dirs, files in os.walk(ds_path):
+        for name in files:
+            if name == 'attributes.json':
+                continue
+            size = os.path.getsize(os.path.join(root, name))
+            chunk_sizes.append(size)
+
+    n_filled = len(chunk_sizes)
+    percent_filled = float(n_filled) / n_chunks  # fraction in [0, 1], not a percentage
+    return percent_filled, chunk_sizes
+
+
+def check_block_shapes():
+    import nifty.tools as nt
+    full_path = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/rawdata/sbem-6dpf-1-whole-raw.h5'
+    key = 't00000/s00/0/cells'
+    with open_file(full_path, 'r') as f:
+        shape = f[key].shape
+
+    prefix = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data/test_n5/rawdata/sbem-6dpf-1-whole-raw-%i.n5'
+    ds_key = 'setup0/timepoint0/s0'
+
+    block_shapes = [[32, 256, 256], [32, 128, 128], [64, 64, 64], [128, 128, 128]]
+    for ii, block_shape in enumerate(block_shapes, 1):
+        path = prefix % ii
+        percent_filled, sizes = iterate_chunks(path, ds_key)
+        # extrapolate the filled fraction measured on the test copy to the full volume
+        n_total = nt.blocking([0, 0, 0], shape, block_shape).numberOfBlocks
+        n_filled = int(n_total * percent_filled)
+        print("Chunk-shape:", block_shape)
+        print("Nr. chunks at highest res:", n_filled)
+        print("Mean chunk size in MB:", np.mean(sizes) / 1.e6, "+-", np.std(sizes) / 1.e6)
+        print("Min/max chunk size in MB:", np.min(sizes) / 1.e6, "/", np.max(sizes) / 1.e6)
+        print()
+
+
+def estimate_chunk_sizes():
+    # reference values from the 128^3 chunking (cf. check_block_shapes)
+    ref_chunk_shape = [128, 128, 128]
+    ref_chunk_size = float(np.prod(ref_chunk_shape))
+    ref_chunk_mb = 1.0160599442530536
+    ref_n_chunks = 1553606.
+
+    start = 64
+    stop = 128
+    step = 16
+    for chunk_len in range(start, stop + step, step):
+        print("Chunks: %i^3" % chunk_len)
+        rel_size = chunk_len ** 3 / ref_chunk_size
+        chunk_mb = ref_chunk_mb * rel_size
+        n_chunks = ref_n_chunks / rel_size
+        print("Nr. chunks at highest res:", int(n_chunks))
+        print("Mean chunk size in MB:", chunk_mb)
+        print()
+
+
+if __name__ == '__main__':
+    # make_test_folder()
+    # make_different_chunkings()
+    # check_block_shapes()
+    estimate_chunk_sizes()
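
As a sanity check on the scaling in estimate_chunk_sizes: mean chunk size grows
linearly with chunk volume and the chunk count shrinks by the same factor,
relative to the measured 128^3 reference. A standalone version of the
arithmetic for a single chunk length, reusing the reference values above:

ref_chunk_mb = 1.0160599442530536  # measured mean size of a 128^3 chunk in MB
ref_n_chunks = 1553606.            # estimated number of 128^3 chunks at full res
rel_size = 64 ** 3 / float(128 ** 3)  # = 0.125
print(ref_chunk_mb * rel_size)        # ~0.127 MB per 64^3 chunk
print(int(ref_n_chunks / rel_size))   # 12428848 chunks at highest res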