From cd2fe9b81b525af99a46093c033b20a92e16c7d0 Mon Sep 17 00:00:00 2001
From: Christopher Rhodes <christopher.rhodes@embl.de>
Date: Thu, 12 Oct 2023 10:53:51 +0200
Subject: [PATCH] Moved more of batch runner into utility methods

---
 .../chaeo/examples/batch_run_patches.py       | 77 +++++++------------
 extensions/chaeo/util.py                      | 59 +++++++++++++-
 extensions/chaeo/workflows.py                 | 40 +++++++++-
 3 files changed, 123 insertions(+), 53 deletions(-)

diff --git a/extensions/chaeo/examples/batch_run_patches.py b/extensions/chaeo/examples/batch_run_patches.py
index f0edacd7..15ef8c99 100644
--- a/extensions/chaeo/examples/batch_run_patches.py
+++ b/extensions/chaeo/examples/batch_run_patches.py
@@ -1,14 +1,8 @@
 from pathlib import Path
-import re
-# from time import localtime, strftime
 
-import pandas as pd
-
-from extensions.chaeo.util import autonumber_new_directory, get_matching_files
+from extensions.chaeo.util import autonumber_new_directory, get_matching_files, loop_workflow
 from extensions.chaeo.workflows import export_patches_from_multichannel_zstack
 
-from model_server.accessors import InMemoryDataAccessor, write_accessor_data_to_file
-
 if __name__ == '__main__':
     where_czi = 'z:/rhodes/projects/proj0004-marine-photoactivation/data/exp0038/AutoMic/20230906-163415/Selection'
 
@@ -17,48 +11,33 @@ if __name__ == '__main__':
         'batch-output'
     )
 
-    csv_args = {'mode': 'w', 'header': True} # when creating file
     px_ilp = Path.home() / 'model-server' / 'ilastik' / 'AF405-bodies_boundaries.ilp'
 
-    #TODO: try/catch blocks and error handling around workflow calls
-    #TODO: pack JSON-serializable workflow inputs
-
-    input_files = get_matching_files(where_czi, 'czi', coord_filter={'P': (0, 10)})
-    for ff in input_files:
-
-        export_kwargs = {
-            'input_zstack_path': Path(where_czi) / ff.__str__(),
-            'ilastik_project_file': px_ilp.__str__(),
-            'pxmap_threshold': 0.25,
-            'pixel_class': 0,
-            'zmask_channel': 0,
-            'patches_channel': 4,
-            'where_output': where_output,
-            'mask_type': 'boxes',
-            'zmask_filters': {'area': (1e3, 1e8)},
-            'zmask_expand_box_by': (128, 3),
-            'export_pixel_probabilities': False,
-            'export_2d_patches_for_training': True,
-            'export_2d_patches_for_annotation': False,
-            'export_3d_patches': False,
-            'export_annotated_zstack': False,
-            'export_patch_masks': False,
-            'export_patch_label_maps': True,
-        }
-
-        result = export_patches_from_multichannel_zstack(**export_kwargs)
-
-        # parse and record results
-        df = result['dataframe']
-        df['source_path'] = ff
-        df.to_csv(where_output / 'df_objects.csv', index=False, **csv_args)
-        pd.DataFrame(result['timer_results'], index=[0]).to_csv(where_output / 'timer_results.csv', **csv_args)
-        pd.json_normalize(export_kwargs).to_csv(where_output / 'workflow_params.csv', **csv_args)
-        csv_args = {'mode': 'a', 'header': False} # append to CSV from here on
+    params = {
+        'ilastik_project_file': px_ilp.__str__(),
+        'pxmap_threshold': 0.25,
+        'pixel_class': 0,
+        'zmask_channel': 0,
+        'patches_channel': 4,
+        'mask_type': 'boxes',
+        'zmask_filters': {'area': (1e3, 1e8)},
+        'zmask_expand_box_by': (128, 3),
+        'export_pixel_probabilities': False,
+        'export_2d_patches_for_training': True,
+        'export_2d_patches_for_annotation': False,
+        'export_3d_patches': False,
+        'export_annotated_zstack': False,
+        'export_patch_masks': False,
+        'export_patch_label_maps': True,
+    }
+
+    input_files = get_matching_files(where_czi, 'czi', coord_filter={'P': (0, 10)}, )
+
+    loop_workflow(
+        input_files,
+        where_output,
+        export_patches_from_multichannel_zstack,
+        params,
+    )
 
-        # export intermediate data if flagged
-        for k in result['interm'].keys():
-            write_accessor_data_to_file(
-                where_output / k / (ff.stem + '.tif'),
-                InMemoryDataAccessor(result['interm'][k])
-            )
\ No newline at end of file
+    print('Finished')
\ No newline at end of file
diff --git a/extensions/chaeo/util.py b/extensions/chaeo/util.py
index 402d05dd..7a8406db 100644
--- a/extensions/chaeo/util.py
+++ b/extensions/chaeo/util.py
@@ -2,6 +2,10 @@ from pathlib import Path
 import re
 from time import localtime, strftime
 
+import pandas as pd
+
+from model_server.accessors import InMemoryDataAccessor, write_accessor_data_to_file
+
 def autonumber_new_directory(where: str, prefix: str) -> str:
     yyyymmdd = strftime('%Y%m%d', localtime())
 
@@ -10,8 +14,9 @@ def autonumber_new_directory(where: str, prefix: str) -> str:
         ma = re.match(f'{prefix}-{yyyymmdd}-([\d]+)', ff.name)
         if ma:
             idx = max(idx, int(ma.groups()[0]) + 1)
-
-    return (Path(where) / f'batch-output-{yyyymmdd}-{idx:04d}').__str__()
+    new_path = (Path(where) / f'batch-output-{yyyymmdd}-{idx:04d}')
+    new_path.mkdir(parents=True, exist_ok=False)
+    return new_path.__str__()
 
 def get_matching_files(where: str, ext: str, coord_filter: dict={}) -> str:
     files = []
@@ -33,4 +38,52 @@ def get_matching_files(where: str, ext: str, coord_filter: dict={}) -> str:
         if is_filtered_out(ff):
             continue
         files.append(ff.__str__())
-    return files
\ No newline at end of file
+    return files
+
+
+def loop_workflow(files, where_output, workflow_func, params,
+                  write_intermediate_products=True):
+    """
+    Run workflow_func on each input file and collect its tabular results in per-batch CSV files.
+    :param files: iterable of input file paths; each is passed on as 'input_zstack_path'
+    :param where_output: directory that receives the CSV files and intermediate products
+    :param workflow_func: callable accepting keyword arguments and returning a dict with keys
+        'dataframe' and 'timer_results', plus 'interm' when intermediate products are written
+    :param params: dict of additional keyword arguments passed to every workflow_func call
+    :param write_intermediate_products: if True, write each item of result['interm'] to a .tif file
+    """
+    failures = []
+    csv_started = False  # True once a successful result has created the CSV files
+    for ff in files:
+        export_kwargs = {'input_zstack_path': ff, 'where_output': where_output, **params}
+
+        # record failure information and carry on with the next file
+        try:
+            result = workflow_func(**export_kwargs)
+        except Exception as e:
+            failures.append({'input_file': ff, 'error_message': str(e)})
+            print(f'Caught failure on {ff}:\n{e}')
+            continue
+
+        # record dataframes associated with workflow results
+        batch_csv = {
+            'workflow_data': result['dataframe'],
+            'timer_results': pd.DataFrame(result['timer_results'], index=[0]),
+            'workflow_parameters': pd.json_normalize(export_kwargs),
+        }
+        # headers go with the first success, not the first file, because the first file may fail
+        csv_args = {'mode': 'a', 'header': False} if csv_started else {'mode': 'w', 'header': True}
+        for k, df in batch_csv.items():
+            df['input_file'] = ff
+            df.to_csv(Path(where_output) / f'{k}.csv', index=False, **csv_args)
+        csv_started = True
+
+        # export intermediate data if flagged
+        if write_intermediate_products:
+            for k, data in result['interm'].items():
+                write_accessor_data_to_file(
+                    Path(where_output) / k / (Path(ff).stem + '.tif'),
+                    InMemoryDataAccessor(data)
+                )
+
+    pd.DataFrame(failures).to_csv(Path(where_output) / 'failures.csv')
\ No newline at end of file
diff --git a/extensions/chaeo/workflows.py b/extensions/chaeo/workflows.py
index 9a290ec5..332f8f9f 100644
--- a/extensions/chaeo/workflows.py
+++ b/extensions/chaeo/workflows.py
@@ -12,6 +12,7 @@ from model_server.accessors import generate_file_accessor, InMemoryDataAccessor,
 from model_server.workflows import Timer
 
 # TODO: unpack and validate inputs
+# TODO: expose channel indices and color balance vectors to caller
 def export_patches_from_multichannel_zstack(
         input_zstack_path: str,
         ilastik_project_file: str,
@@ -106,6 +107,23 @@ def export_patches_from_multichannel_zstack(
         # prepopulate patch UUID
         df['patch_id'] = df.apply(lambda _: uuid4(), axis=1)
 
+    if export_2d_patches_for_training:
+        files = export_multichannel_patches_from_zstack(
+            Path(where_output) / '2d_patches',
+            stack.get_one_channel_data(4),
+            zmask_meta,
+            prefix=fstem,
+            rescale_clip=0.001,
+            make_3d=False,
+            focus_metric='max_sobel',
+        )
+        df_patches = pd.DataFrame(files)
+        ti.click('export_2d_patches')
+        # associate 2d patches, dropping labeled objects that were not exported as patches
+        df = pd.merge(df, df_patches, left_index=True, right_on='df_index').drop(columns='df_index')
+        # prepopulate patch UUID
+        df['patch_id'] = df.apply(lambda _: uuid4(), axis=1)
+
     if export_patch_masks:
         files = export_patch_masks_from_zstack(
             Path(where_output) / 'patch_masks',
@@ -143,4 +161,24 @@ def export_patches_from_multichannel_zstack(
         'timer_results': ti.events,
         'dataframe': df,
         'interm': interm,
-    }
\ No newline at end of file
+    }
+
+def transfer_ecotaxa_labels_to_patch_object_maps(
+        path_to_patches: str,
+        path_to_ecotaxa_tsv: str,
+        path_output: str,
+) -> Dict:
+    """Read the EcoTaxa TSV table and create a file accessor for every entry in the patch directory.
+    Work in progress: path_output is not used and no value is returned yet."""
+    patch_dir = Path(path_to_patches)
+    text_columns = ('object_annotation_date', 'object_annotation_time', 'object_annotation_category_id')
+    ecotaxa_table = pd.read_csv(
+        path_to_ecotaxa_tsv,
+        sep='\t',
+        header=[0, 1],
+        dtype={(column, '[t]'): str for column in text_columns},
+    )
+    for patch_file in patch_dir.iterdir():
+        # NOTE(review): the accessor is created but not used further in this version
+        patch_accessor = generate_file_accessor(patch_file)
+
-- 
GitLab