Implemented training-test split in label mask export

da091a5d · Christopher Randolph Rhodes · 05af7b71 · da091a5d · da091a5d
Commit da091a5d authored 1 year ago by Christopher Randolph Rhodes
--- a/extensions/chaeo/examples/label_patches.py
+++ b/extensions/chaeo/examples/label_patches.py
@@ -11,3 +11,4 @@ if __name__ == '__main__':
        ecotaxa_tsv='c:/Users/rhodes/projects/proj0011-plankton-seg/exp0013/ecotaxa_export_10468_20231012_0930.tsv',
        where_output=autonumber_new_directory(root, 'labeled_patches')
    )
+    print('Finished')
--- a/extensions/chaeo/workflows.py
+++ b/extensions/chaeo/workflows.py
@@ -4,6 +4,7 @@ from uuid import uuid4

 import numpy as np
 import pandas as pd
+from sklearn.model_selection import train_test_split

 from extensions.ilastik.models import IlastikPixelClassifierModel
 from extensions.chaeo.annotators import draw_boxes_on_3d_image
@@ -173,7 +174,11 @@ def transfer_ecotaxa_labels_to_patch_stacks(
        ecotaxa_tsv: str,
        where_output: str,
        patch_size: tuple = (256, 256),
+        tr_split=0.6,
 ) -> Dict:
+    assert tr_split > 0.5 # reduce chance that low-probability objects are omitted from training
+
+    # read patch metadata
    df_obj = pd.read_csv(
        object_csv,
    )
@@ -188,6 +193,8 @@ def transfer_ecotaxa_labels_to_patch_stacks(
        }
    )
    df_merge = pd.merge(df_obj, df_ecotaxa, left_on='patch_id', right_on='object_id')
+
+    # assign each unique lowest-level annotation to a class index
    se_unique = pd.Series(
        df_merge.object_annotation_hierarchy.unique()
    )
@@ -202,6 +209,7 @@ def transfer_ecotaxa_labels_to_patch_stacks(
        'annotation_class': df_split.loc[:, 1].str.lower()
    })

+    # join patch filenames and annotation classes
    df_pf = pd.merge(
        df_merge[['patch_filename', 'object_annotation_hierarchy']],
        df_labels,
@@ -210,13 +218,28 @@ def transfer_ecotaxa_labels_to_patch_stacks(
    )
    df_pl = df_pf[df_pf['object_annotation_hierarchy'].notnull()]

-    zstack = np.zeros((*patch_size, 1, len(df_pl)), dtype='uint8')
-
+    # export annotation classes and their summary stats
+    df_tr, df_te = train_test_split(df_pl, train_size=tr_split)
    df_labels['counts'] = df_pl['annotation_class_id'].value_counts()
-    df_labels.to_csv(Path(where_output) / 'labels_key.csv')
-
-    # export patches as z-stack
-    for fi, pl in enumerate(df_pl.itertuples(name='PatchFile')):
+    df_labels = pd.merge(
+        df_labels,
+        pd.DataFrame(
+            [df_tr.annotation_class_id.value_counts(), df_te.annotation_class_id.value_counts()],
+            index=['to_train', 'to_test']
+        ).T,
+        left_on='annotation_class_id',
+        right_index=True,
+        how='outer'
+    )
+    df_labels.loc[df_labels.to_train.isna(), 'to_train'] = 0
+    df_labels.loc[df_labels.to_test.isna(), 'to_test'] = 0
+    for col in ['to_train', 'to_test', 'counts']:
+        df_labels.loc[df_labels[col].isna(), col] = 0
+    df_labels.to_csv(Path(where_output) / 'labels_key.csv', index=False)
+
+    # export patches as a single z-stack
+    zstack = np.zeros((*patch_size, 1, len(df_tr)), dtype='uint8')
+    for fi, pl in enumerate(df_tr.itertuples(name='PatchFile')):
        fn = pl._asdict()['patch_filename']
        ac = pl._asdict()['annotation_class_id']
        acc_bm = generate_file_accessor(Path(where_masks) / fn)
@@ -224,8 +247,6 @@ def transfer_ecotaxa_labels_to_patch_stacks(
        assert acc_bm.chroma == 1
        assert acc_bm.nz == 1
        zstack[:, :, 0, fi] = (acc_bm.data[:, :, 0, 0] == 255) * ac
-
-    # export masks as z-stack
    write_accessor_data_to_file(Path(where_output) / 'zstack_object_label.tif', InMemoryDataAccessor(zstack))