From 196d0db101abecb5e6716a69e0539af7c5140f42 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@iwr.uni-heidelberg.de>
Date: Fri, 7 Feb 2020 11:48:08 +0100
Subject: [PATCH] Clean up xml utils and remove bdv server file scripts

---
 mmpb/attributes/master.py                     |  12 +-
 mmpb/attributes/region_attributes.py          |  12 +-
 mmpb/export/extract_subvolume.py              |   5 +-
 mmpb/export/map_segmentation_ids.py           |   4 +-
 mmpb/files/bdv_server.py                      |  49 ------
 mmpb/files/checks.py                          |   4 +-
 mmpb/files/copy_helper.py                     |  13 +-
 mmpb/files/for_upload.py                      |   7 +-
 mmpb/files/migration.py                       |  51 ++++--
 mmpb/files/name_lookup.py                     |   9 +-
 mmpb/files/sources.py                         |   8 +-
 mmpb/files/xml_utils.py                       | 160 ++++++++++--------
 .../correction/cillia_correction_tool.py      |   4 +-
 13 files changed, 173 insertions(+), 165 deletions(-)
 delete mode 100644 mmpb/files/bdv_server.py

diff --git a/mmpb/attributes/master.py b/mmpb/attributes/master.py
index 89bd8a0..7fe9bdb 100644
--- a/mmpb/attributes/master.py
+++ b/mmpb/attributes/master.py
@@ -1,5 +1,6 @@
 import os
 import h5py
+from pybdv.metadata import get_data_path
 
 from .base_attributes import base_attributes, propagate_attributes, write_additional_table_file
 from .cell_nucleus_mapping import map_cells_to_nuclei
@@ -7,12 +8,11 @@ from .genes import gene_assignment_table, vc_assignment_table
 from .morphology import write_morphology_cells, write_morphology_nuclei
 from .region_attributes import region_attributes, extrapolated_intensities
 from .cilia_attributes import cilia_morphology
-from ..files.xml_utils import get_h5_path_from_xml
 
 
 def get_seg_path(folder, name, key=None):
     xml_path = os.path.join(folder, 'segmentations', '%s.xml' % name)
-    path = get_h5_path_from_xml(xml_path, return_absolute_path=True)
+    path = get_data_path(xml_path, return_absolute_path=True)
     assert os.path.exists(path), path
     if key is not None:
         with h5py.File(path, 'r') as f:
@@ -43,7 +43,7 @@ def make_cell_tables(old_folder, folder, name, tmp_folder, resolution,
 
     # make table with gene mapping
     aux_gene_xml = os.path.join(folder, 'misc', 'prospr-6dpf-1-whole_meds_all_genes.xml')
-    aux_gene_path = get_h5_path_from_xml(aux_gene_xml, return_absolute_path=True)
+    aux_gene_path = get_data_path(aux_gene_xml, return_absolute_path=True)
     if not os.path.exists(aux_gene_path):
         raise RuntimeError("Can't find auxiliary gene file @ %s" % aux_gene_path)
     gene_out = os.path.join(table_folder, 'genes.csv')
@@ -79,7 +79,7 @@ def make_cell_tables(old_folder, folder, name, tmp_folder, resolution,
 
     # mapping to extrapolated intensities
     extrapol_mask = os.path.join(folder, 'images', 'sbem-6dpf-1-whole-mask-extrapolated.xml')
-    extrapol_mask = get_h5_path_from_xml(extrapol_mask, return_absolute_path=True)
+    extrapol_mask = get_data_path(extrapol_mask, return_absolute_path=True)
     extrapol_out = os.path.join(table_folder, 'extrapolated_intensity_correction.csv')
     extrapolated_intensities(seg_path, 't00000/s00/3/cells',
                              extrapol_mask, 't00000/s00/0/cells',
@@ -114,7 +114,7 @@ def make_nuclei_tables(old_folder, folder, name, tmp_folder, resolution,
 
     # make the morphology attribute table
     xml_raw = os.path.join(folder, 'images', 'sbem-6dpf-1-whole-raw.xml')
-    raw_path = get_h5_path_from_xml(xml_raw, return_absolute_path=True)
+    raw_path = get_data_path(xml_raw, return_absolute_path=True)
     cell_seg_path = get_seg_path(folder, 'sbem-6dpf-1-whole-segmented-cells-labels')
     chromatin_seg_path = get_seg_path(folder, 'sbem-6dpf-1-whole-segmented-chromatin-labels')
     morpho_out = os.path.join(table_folder, 'morphology.csv')
@@ -125,7 +125,7 @@ def make_nuclei_tables(old_folder, folder, name, tmp_folder, resolution,
 
     # mapping to extrapolated intensities
     extrapol_mask = os.path.join(folder, 'segmentations', 'sbem-6dpf-mask-extrapolated.xml')
-    extrapol_mask = get_h5_path_from_xml(extrapol_mask, return_absolute_path=True)
+    extrapol_mask = get_data_path(extrapol_mask, return_absolute_path=True)
     extrapol_out = os.path.join(table_folder, 'extrapolated_intensity_correction.csv')
     extrapolated_intensities(seg_path, 't00000/s00/1/cells',
                              extrapol_mask, 't00000/s00/0/cells',
diff --git a/mmpb/attributes/region_attributes.py b/mmpb/attributes/region_attributes.py
index 6855f33..99811b1 100644
--- a/mmpb/attributes/region_attributes.py
+++ b/mmpb/attributes/region_attributes.py
@@ -3,9 +3,9 @@ import glob
 import numpy as np
 import pandas as pd
 import h5py
+from pybdv.metadata import get_data_path
 
 from .util import write_csv, node_labels, normalize_overlap_dict
-from ..files.xml_utils import get_h5_path_from_xml
 
 
 def write_region_table(label_ids, label_list, semantic_mapping_list, out_path):
@@ -55,17 +55,19 @@ def muscle_attributes(muscle_path, key_muscle,
     return muscle_labels, semantic_muscle
 
 
+# TODO add nephridia
 def region_attributes(seg_path, region_out,
                       image_folder, segmentation_folder,
                       label_ids, tmp_folder, target, max_jobs,
                       key_seg='t00000/s00/2/cells'):
+    assert False, "Add nephridia before running this!"
     key_tissue = 't00000/s00/0/cells'
 
     # 1.) compute the mapping to carved regions
     #
     carved_path = os.path.join(segmentation_folder,
                                'sbem-6dpf-1-whole-segmented-tissue-labels.xml')
-    carved_path = get_h5_path_from_xml(carved_path, return_absolute_path=True)
+    carved_path = get_data_path(carved_path, return_absolute_path=True)
     carved_labels = node_labels(seg_path, key_seg,
                                 carved_path, key_tissue,
                                 'carved-regions', tmp_folder,
@@ -81,7 +83,7 @@ def region_attributes(seg_path, region_out,
 
     # 2.) compute the mapping to muscles
     muscle_path = os.path.join(segmentation_folder, 'sbem-6dpf-1-whole-segmented-muscle.xml')
-    muscle_path = get_h5_path_from_xml(muscle_path, return_absolute_path=True)
+    muscle_path = get_data_path(muscle_path, return_absolute_path=True)
     # need to be more lenient with the overlap criterion for the muscle mapping
     muscle_labels, semantic_muscle = muscle_attributes(muscle_path, key_tissue,
                                                        seg_path, key_seg,
@@ -92,7 +94,7 @@ def region_attributes(seg_path, region_out,
     # 3.) map all the segmented prospr regions
     region_paths = glob.glob(os.path.join(image_folder, "prospr-6dpf-1-whole-segmented-*"))
     region_names = [os.path.splitext(pp.split('-')[-1])[0].lower() for pp in region_paths]
-    region_paths = [get_h5_path_from_xml(rp, return_absolute_path=True)
+    region_paths = [get_data_path(rp, return_absolute_path=True)
                     for rp in region_paths]
     for rpath, rname in zip(region_paths, region_names):
         rlabels = node_labels(seg_path, key_seg,
@@ -104,7 +106,7 @@ def region_attributes(seg_path, region_out,
 
     # 4.) map the midgut segmentation
     midgut_path = os.path.join(segmentation_folder, 'sbem-6dpf-1-whole-segmented-midgut.xml')
-    midgut_path = get_h5_path_from_xml(midgut_path, return_absolute_path=True)
+    midgut_path = get_data_path(midgut_path, return_absolute_path=True)
     midgut_labels = node_labels(seg_path, key_seg, midgut_path, key_tissue,
                                 'midgut', tmp_folder, target, max_jobs)
     label_list.append(midgut_labels)
diff --git a/mmpb/export/extract_subvolume.py b/mmpb/export/extract_subvolume.py
index c964e7b..f78d9ef 100644
--- a/mmpb/export/extract_subvolume.py
+++ b/mmpb/export/extract_subvolume.py
@@ -1,8 +1,7 @@
 import os
 import h5py
 import imageio
-
-from ..files.xml_utils import get_h5_path_from_xml
+from pybdv.metadata import get_data_path
 
 
 def parse_coordinate(coord):
@@ -79,7 +78,7 @@ def cutout_data(tag, name, scale, bb_start, bb_stop):
     assert all(sta < sto for sta, sto in zip(bb_start, bb_stop))
 
     path = os.path.join('data', tag, name_to_path(name))
-    path = get_h5_path_from_xml(path, return_absolute_path=True)
+    path = get_data_path(path, return_absolute_path=True)
     resolution = get_res_level(scale)
 
     base_scale = name_to_base_scale(name)
diff --git a/mmpb/export/map_segmentation_ids.py b/mmpb/export/map_segmentation_ids.py
index 5b86099..a3386fc 100644
--- a/mmpb/export/map_segmentation_ids.py
+++ b/mmpb/export/map_segmentation_ids.py
@@ -4,7 +4,7 @@ import luigi
 import z5py
 
 from cluster_tools.node_labels import NodeLabelWorkflow
-from ..files.xml_utils import get_h5_path_from_xml
+from pybdv.metadata import get_data_path
 from ..default_config import write_default_global_config
 
 
@@ -23,7 +23,7 @@ def get_seg_path(folder, name):
     path = os.path.join(data_folder, '%s.xml' % name)
     # read h5 path from the xml
     if os.path.exists(path):
-        path = get_h5_path_from_xml(path, return_absolute_path=True)
+        path = get_data_path(path, return_absolute_path=True)
         if not os.path.exists(path):
             raise RuntimeError("Invalid path in xml")
         return path
diff --git a/mmpb/files/bdv_server.py b/mmpb/files/bdv_server.py
deleted file mode 100644
index f153b69..0000000
--- a/mmpb/files/bdv_server.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import os
-from .xml_utils import get_h5_path_from_xml
-from .sources import get_privates, get_image_names, get_segmentation_names
-
-
-def add_to_bdv_config(name, path, bdv_config, relative_paths, ref_dir):
-
-    # make sure that the h5path linked in the xml exists
-    if not os.path.exists(path):
-        return bdv_config
-    h5path = get_h5_path_from_xml(path, return_absolute_path=True)
-    if not os.path.exists(h5path):
-        msg = 'Path to h5-file in xml does not exist - %s, %s' % (path, h5path)
-        # raise RuntimeError(msg)
-        return bdv_config
-
-    if relative_paths:
-        path = os.path.relpath(path, ref_dir)
-    bdv_config[name] = path
-    return bdv_config
-
-
-def make_bdv_server_file(folder, out_path, relative_paths=True):
-    """ Make the bigserver config file for a given release.
-    """
-    privates = get_privates()
-    image_names = get_image_names()
-    seg_names = get_segmentation_names()
-    ref_dir = os.path.split(out_path)[0]
-
-    bdv_config = {}
-    for name in image_names:
-        if name in privates:
-            continue
-        path = os.path.join(folder, 'images', '%s.xml' % name)
-        bdv_config = add_to_bdv_config(name, path, bdv_config,
-                                       relative_paths, ref_dir)
-
-    for name in seg_names:
-        if name in privates:
-            continue
-        path = os.path.join(folder, 'segmentations', '%s.xml' % name)
-        bdv_config = add_to_bdv_config(name, path, bdv_config,
-                                       relative_paths, ref_dir)
-
-    with open(out_path, 'w') as f:
-        for name, path in bdv_config.items():
-            line = '%s\t%s\n' % (name, path)
-            f.write(line)
diff --git a/mmpb/files/checks.py b/mmpb/files/checks.py
index 073ab88..17e6d27 100644
--- a/mmpb/files/checks.py
+++ b/mmpb/files/checks.py
@@ -2,7 +2,7 @@ import os
 import pandas as pd
 import z5py
 
-from .xml_utils import get_h5_path_from_xml
+from pybdv.metadata import get_data_path
 
 
 # TODO check more attributes in the xml to make sure that this actually is
@@ -11,7 +11,7 @@ def check_bdv(path):
     ext = os.path.splitext(path)[1]
     if ext != '.xml':
         return False
-    h5_path = get_h5_path_from_xml(path, return_absolute_path=True)
+    h5_path = get_data_path(path, return_absolute_path=True)
     if not os.path.exists(h5_path):
         return False
     return True
diff --git a/mmpb/files/copy_helper.py b/mmpb/files/copy_helper.py
index 9411984..ed5b8cc 100644
--- a/mmpb/files/copy_helper.py
+++ b/mmpb/files/copy_helper.py
@@ -4,10 +4,10 @@ import numpy as np
 
 from elf.io import open_file
 from pybdv.converter import copy_dataset
-from pybdv.metadata import write_n5_metadata
+from pybdv.metadata import write_n5_metadata, get_data_path
 from pybdv.util import get_key, get_number_of_scales, get_scale_factors
 
-from .xml_utils import copy_xml_with_newpath, get_h5_path_from_xml
+from .xml_utils import copy_xml_with_newpath
 from .sources import get_image_names, get_segmentation_names, get_segmentations
 from ..attributes.base_attributes import write_additional_table_file
 
@@ -30,7 +30,7 @@ def make_squashed_link(src_file, dst_file, override=False):
 
 
 def copy_file(xml_in, xml_out):
-    h5path = get_h5_path_from_xml(xml_in, return_absolute_path=True)
+    h5path = get_data_path(xml_in, return_absolute_path=True)
     xml_dir = os.path.split(xml_out)[0]
     h5path = os.path.relpath(h5path, start=xml_dir)
     copy_xml_with_newpath(xml_in, xml_out, h5path, path_type='relative')
@@ -175,7 +175,10 @@ def copy_to_bdv_n5(in_file, out_file, chunks, resolution,
         else:
             chunks_ = chunks
 
-        copy_dataset(in_file, in_key, out_file, out_key, False,
-                     chunks_, n_threads)
+        print(chunks_)
+        copy_dataset(in_file, in_key, out_file, out_key,
+                     convert_dtype=False,
+                     chunks=chunks_,
+                     n_threads=n_threads)
 
     write_n5_metadata(out_file, scale_factors, resolution, setup_id=0)
diff --git a/mmpb/files/for_upload.py b/mmpb/files/for_upload.py
index d944193..87aa108 100644
--- a/mmpb/files/for_upload.py
+++ b/mmpb/files/for_upload.py
@@ -3,12 +3,11 @@ import xml.etree.ElementTree as ET
 import numpy as np
 from shutil import copyfile
 
-from mmpb.files.xml_utils import get_h5_path_from_xml
 from glob import glob
 from elf.io import open_file
 from pybdv.converter import copy_dataset
 from pybdv.util import get_key, get_number_of_scales, get_scale_factors
-from pybdv.metadata import write_n5_metadata, get_resolution, indent_xml
+from pybdv.metadata import write_n5_metadata, get_resolution, indent_xml, get_data_path
 
 
 def normalize_scale_factors(scale_factors, start_scale):
@@ -140,7 +139,7 @@ def copy_images(in_folder, out_folder, data_out_folder,
 
     for im_name, in_file in zip(image_names, files_to_copy):
         print("Copying", im_name, "...")
-        in_h5 = get_h5_path_from_xml(in_file, True)
+        in_h5 = get_data_path(in_file, True)
         # TODO we don't want to always copy to rawdata, but instead we
         # need to copy to the correct path extract from the old path
         out_file = os.path.join(data_out_folder, im_name + '.n5')
@@ -191,7 +190,7 @@ def copy_segmentations(in_folder, out_folder, segmentations_to_copy, output_root
 
     for seg_name, in_file in zip(seg_names, files_to_copy):
         print("Copying", seg_name, "...")
-        in_h5 = get_h5_path_from_xml(in_file, True)
+        in_h5 = get_data_path(in_file, True)
         # TODO we don't want to always copy to rawdata, but instead we
         # need to copy to the correct path extract from the old path
         out_file = os.path.join(s3_folder, seg_name + '.n5')
diff --git a/mmpb/files/migration.py b/mmpb/files/migration.py
index e32097f..1ae5b19 100644
--- a/mmpb/files/migration.py
+++ b/mmpb/files/migration.py
@@ -2,10 +2,10 @@ import os
 import json
 import shutil
 from glob import glob
-from pybdv.metadata import get_resolution
+from pybdv.metadata import get_resolution, get_data_path
 from mmpb.files.name_lookup import (look_up_filename, get_image_properties,
                                     DYNAMIC_SEGMENTATIONS, get_dynamic_segmentation_properties)
-from mmpb.files.xml_utils import get_h5_path_from_xml, copy_xml_with_newpath
+from mmpb.files.xml_utils import copy_xml_with_newpath
 from mmpb.files.copy_helper import copy_to_bdv_n5
 
 ROOT = '/g/arendt/EM_6dpf_segmentation/platy-browser-data/data'
@@ -30,7 +30,7 @@ def move_image_file(image_folder, xml_path):
     new_name = look_up_filename(name)
 
     # get the linked hdf5 path
-    image_path = get_h5_path_from_xml(xml_path, return_absolute_path=True)
+    image_path = get_data_path(xml_path, return_absolute_path=True)
 
     # move the xml to 'images/local'
     new_xml_path = os.path.join(image_folder, 'local', new_name + '.xml')
@@ -217,7 +217,7 @@ def migrate_rawfolder():
         new_name = look_up_filename(name)
 
         # get the linked hdf5 path
-        image_path = get_h5_path_from_xml(xml_path, return_absolute_path=True)
+        image_path = get_data_path(xml_path, return_absolute_path=True)
 
         # move the xml to 'images/local'
         new_xml_path = os.path.join(raw_folder, new_name + '.xml')
@@ -261,7 +261,7 @@ def make_n5_files(version):
     # special chunk sizes
     chunk_dict = {'sbem-6dpf-1-whole-raw': None}  # don't copy raw yet
 
-    paths_to_remove = []
+    copied = []
 
     xmls = glob(os.path.join(version_folder, 'images', 'local', '*.xml'))
     for xml in xmls:
@@ -271,24 +271,40 @@ def make_n5_files(version):
         if chunks is None:
             continue
 
-        h5_path = get_h5_path_from_xml(xml, return_absolute_path=True)
+        h5_path = get_data_path(xml, return_absolute_path=True)
         n5_path = os.path.splitext(h5_path)[0] + '.n5'
+        copied.append(h5_path)
         if os.path.exists(n5_path):
             continue
 
         # load resolution from xml
-        resolution = get_resolution(xml)
-        copy_to_bdv_n5(h5_path, n5_path, resolution, chunks)
+        resolution = get_resolution(xml, 0)
+        copy_to_bdv_n5(h5_path, n5_path, chunks, resolution)
 
-        paths_to_remove.append(h5_path)
+    return copied
 
-    return paths_to_remove
 
-
-# TODO
 # switch xmls to n5 format if n5 file at image location exists
 def update_n5_xmls(version):
-    pass
+    """Switch the local image xmls to the n5 file, if it exists next to the h5."""
+    version_folder = os.path.join(ROOT, version)
+    xmls = glob(os.path.join(version_folder, 'images', 'local', '*.xml'))
+    for xml in xmls:
+        # splitext returns (root, ext); the n5 file shares the root of the h5 file
+        rel_root, ext = os.path.splitext(get_data_path(xml))
+        # is this already n5? -> continue
+        if ext == '.n5':
+            continue
+
+        # get the absolute path and check if the corresponding n5 file exists
+        data_abs_path = get_data_path(xml, return_absolute_path=True)
+        new_abs_path = os.path.splitext(data_abs_path)[0] + '.n5'
+        # n5 file is not there? -> continue
+        if not os.path.exists(new_abs_path):
+            continue
+
+        # write the new relative path (the xml is updated in place)
+        copy_xml_with_newpath(xml, xml, rel_root + '.n5', data_format='bdv.n5')
 
 
 def make_remote_xmls(version):
@@ -354,6 +370,11 @@ if __name__ == '__main__':
     # migrate_version(version)
 
     version = '0.6.5'
-    paths_to_remove = make_n5_files(version)
-    print(paths_to_remove)
+    copied = make_n5_files(version)
+    with open('/g/kreshuk/pape/copied_to_n5.json', 'w') as f:
+        json.dump(copied, f)
+    # x = json.dumps(copied, indent=2, sort_keys=True)
+    # print(x)
+
+    # version = '0.6.5'
     # update_n5_xmls(version)
diff --git a/mmpb/files/name_lookup.py b/mmpb/files/name_lookup.py
index 38bfe12..8a68fd8 100644
--- a/mmpb/files/name_lookup.py
+++ b/mmpb/files/name_lookup.py
@@ -66,9 +66,12 @@ def update_name_lut():
     image_names = os.listdir(os.path.join(folder, 'images'))
     image_names = [os.path.splitext(name)[0] for name in image_names
                    if os.path.splitext(name)[1] == '.xml']
-    seg_names = os.listdir(os.path.join(folder, 'segmentations'))
-    seg_names = [os.path.splitext(name)[0] for name in seg_names
-                 if os.path.splitext(name)[1] == '.xml']
+    if os.path.exists(os.path.join(folder, 'segmentations')):
+        seg_names = os.listdir(os.path.join(folder, 'segmentations'))
+        seg_names = [os.path.splitext(name)[0] for name in seg_names
+                     if os.path.splitext(name)[1] == '.xml']
+    else:
+        seg_names = []
 
     file_names = image_names + seg_names
     for name in file_names:
diff --git a/mmpb/files/sources.py b/mmpb/files/sources.py
index 6ff3f7e..5b2cbf5 100644
--- a/mmpb/files/sources.py
+++ b/mmpb/files/sources.py
@@ -1,9 +1,11 @@
 import json
 import os
 from shutil import copyfile
+from pybdv.metadata import get_data_path
+
 from .checks import check_bdv, check_tables, check_paintera
 from ..check_attributes import check_attributes
-from .xml_utils import get_h5_path_from_xml, copy_xml_with_newpath
+from .xml_utils import copy_xml_with_newpath
 
 RAW_FOLDER = 'data/rawdata'
 SOURCE_FILE = 'data/sources.json'
@@ -129,7 +131,7 @@ def add_image(source_name, name, input_path, copy_data=True, is_private=False):
     if output_name in names:
         raise ValueError("Name %s is already taken" % output_name)
 
-    h5_path = get_h5_path_from_xml(input_path, return_absolute_path=True)
+    h5_path = get_data_path(input_path, return_absolute_path=True)
     name_h5 = '%s.h5' % output_name
     out_xml = os.path.join(RAW_FOLDER, '%s.xml' % output_name)
     out_h5 = os.path.join(RAW_FOLDER, name_h5)
@@ -236,7 +238,7 @@ def add_segmentation(source_name, name, segmentation_path=None,
 
     # copy the segmentation data if we have a static segmentation
     if is_static:
-        h5_path = get_h5_path_from_xml(segmentation_path, return_absolute_path=True)
+        h5_path = get_data_path(segmentation_path, return_absolute_path=True)
         name_h5 = '%s.h5' % output_name
         out_xml = os.path.join(RAW_FOLDER, '%s.xml' % output_name)
         out_h5 = os.path.join(RAW_FOLDER, name_h5)
diff --git a/mmpb/files/xml_utils.py b/mmpb/files/xml_utils.py
index 105ebf4..087672e 100644
--- a/mmpb/files/xml_utils.py
+++ b/mmpb/files/xml_utils.py
@@ -1,79 +1,36 @@
-import os
 import xml.etree.ElementTree as ET
-
-
-# pretty print xml, from:
-# http://effbot.org/zone/element-lib.htm#prettyprint
-def indent_xml(elem, level=0):
-    i = "\n" + level*"  "
-    if len(elem):
-        if not elem.text or not elem.text.strip():
-            elem.text = i + "  "
-        if not elem.tail or not elem.tail.strip():
-            elem.tail = i
-        for elem in elem:
-            indent_xml(elem, level+1)
-        if not elem.tail or not elem.tail.strip():
-            elem.tail = i
-    else:
-        if level and (not elem.tail or not elem.tail.strip()):
-            elem.tail = i
-
-
-def get_h5_path_from_xml(xml_path, return_absolute_path=False):
-    # xml horror ...
-    et_root = ET.parse(xml_path).getroot()
-    et = et_root[1]
-    et = et[0]
-    et = et[0]
-    path = et.text
-    # this assumes relative path in xml
-    if return_absolute_path:
-        path = os.path.join(os.path.split(xml_path)[0], path)
-        path = os.path.abspath(os.path.relpath(path))
-    return path
+from pybdv.metadata import get_data_path, indent_xml
 
 
 def copy_xml_with_abspath(xml_in, xml_out):
-    # get the h5 path from the xml
-    et_root = ET.parse(xml_in).getroot()
-    et = et_root[1]
-    et = et[0]
-    et = et[0]
-    path = et.text
-
-    # NOTE we assume that this is a relative path to the xml's dir
-    # would be better to actually read this from the data
-    xml_dir = os.path.split(xml_in)[0]
-    path = os.path.join(xml_dir, path)
-    path = os.path.abspath(os.path.relpath(path))
-    if not os.path.exists(path):
-        raise RuntimeError("Could not parse proper path from xml")
-
-    # write new xml with the absolute path
-    et.text = path
-    et.set('type', 'absolute')
-    indent_xml(et_root)
-    tree = ET.ElementTree(et_root)
-    tree.write(xml_out)
+    path = get_data_path(xml_in, return_absolute_path=True)
+    copy_xml_with_newpath(xml_in, xml_out, path,
+                          path_type='absolute')
 
 
-def copy_xml_with_newpath(xml_in, xml_out, h5path, path_type='relative'):
+def copy_xml_with_newpath(xml_in, xml_out, data_path,
+                          path_type='relative', data_format='bdv.hdf5'):
     assert path_type in ('absolute', 'relative')
-    # get the h5 path from the xml
-    et_root = ET.parse(xml_in).getroot()
-    et = et_root[1]
-    et = et[0]
-    et = et[0]
-    # write new xml with the new path
-    et.text = h5path
+    # get the path node in the xml tree
+    root = ET.parse(xml_in).getroot()
+    seqdesc = root.find('SequenceDescription')
+    imgload = seqdesc.find('ImageLoader')
+    imgload.set('format', data_format)
+    et = imgload.find('hdf5')
+    if et is None:
+        et = imgload.find('n5')
+    if et is None:
+        raise RuntimeError("Could not find data node")
+    et.tag = data_format.split('.')[-1]
+    et.text = data_path
     et.set('type', path_type)
-    indent_xml(et_root)
-    tree = ET.ElementTree(et_root)
+
+    indent_xml(root)
+    tree = ET.ElementTree(root)
     tree.write(xml_out)
 
 
-def write_simple_xml(xml_path, h5_path, path_type='absolute'):
+def write_simple_xml(xml_path, data_path, path_type='absolute'):
     # write top-level data
     root = ET.Element('SpimData')
     root.set('version', '0.2')
@@ -86,8 +43,79 @@ def write_simple_xml(xml_path, h5_path, path_type='absolute'):
     imgload.set('format', 'bdv.hdf5')
     el = ET.SubElement(imgload, 'hdf5')
     el.set('type', path_type)
-    el.text = h5_path
+    el.text = data_path
 
     indent_xml(root)
     tree = ET.ElementTree(root)
     tree.write(xml_path)
+
+
+# should be generalized and moved to pybdv at some point
+def write_s3_xml(in_xml, out_xml, path_in_bucket,
+                 region='us-west-2',
+                 service_endpoint='https://s3.embl.de',
+                 bucket_name='platybrowser',
+                 shape=None, resolution=None):
+    """Write a copy of in_xml that loads its data from s3 ('bdv.n5.s3')."""
+    nt = 1
+
+    # parse the input xml; the parsed tree is modified and written to out_xml
+    tree = ET.parse(in_xml)
+    root = tree.getroot()
+
+    # load the sequence description
+    seqdesc = root.find('SequenceDescription')
+
+    # update the image loader: remove the old one before adding the s3 loader
+    imgload = seqdesc.find('ImageLoader')
+    seqdesc.remove(imgload)
+
+    # write the new image loader (appended as last child of the description)
+    imgload = ET.SubElement(seqdesc, 'ImageLoader')
+    bdv_dtype = 'bdv.n5.s3'
+    imgload.set('format', bdv_dtype)
+    el = ET.SubElement(imgload, 'Key')
+    el.text = path_in_bucket
+
+    el = ET.SubElement(imgload, 'SigningRegion')
+    el.text = region
+    el = ET.SubElement(imgload, 'ServiceEndpoint')
+    el.text = service_endpoint
+    el = ET.SubElement(imgload, 'BucketName')
+    el.text = bucket_name
+
+    # load the view descriptions
+    viewsets = seqdesc.find('ViewSetups')
+    vs = viewsets.find('ViewSetup')
+
+    oz, oy, ox = 0.0, 0.0, 0.0
+    # resolution is zyx, the xml stores 'x y z': write it if given, else read it
+    vox = vs.find('voxelSize')
+    if resolution is None:
+        resolution = vox.find('size').text
+        resolution = [float(res) for res in resolution.split()][::-1]
+        dz, dy, dx = resolution
+    else:
+        dz, dy, dx = resolution
+        voxs = vox.find('size')
+        voxs.text = '{} {} {}'.format(dx, dy, dz)
+
+    # write the shape (zyx) only if it is given; the default None keeps the old size
+    if shape is not None:
+        nz, ny, nx = tuple(shape)
+        vss = vs.find('size')
+        vss.text = '{} {} {}'.format(nx, ny, nz)
+
+    # load the registration description and write the affines
+    vregs = root.find('ViewRegistrations')
+    for t in range(nt):
+        vreg = vregs.find('ViewRegistration')
+        vt = vreg.find('ViewTransform')
+        vt.set('type', 'affine')
+        vta = vt.find('affine')
+        vta.text = '{} 0.0 0.0 {} 0.0 {} 0.0 {} 0.0 0.0 {} {}'.format(dx, ox,
+                                                                      dy, oy,
+                                                                      dz, oz)
+    indent_xml(root)
+    tree = ET.ElementTree(root)
+    tree.write(out_xml)
diff --git a/mmpb/segmentation/correction/cillia_correction_tool.py b/mmpb/segmentation/correction/cillia_correction_tool.py
index 63fba36..aa0a8c4 100644
--- a/mmpb/segmentation/correction/cillia_correction_tool.py
+++ b/mmpb/segmentation/correction/cillia_correction_tool.py
@@ -13,11 +13,11 @@ import napari
 
 from heimdall import view, to_source
 from elf.io import open_file
-from mmpb.files.xml_utils import get_h5_path_from_xml
+from pybdv.metadata import get_data_path
 
 
 def xml_to_h5_path(xml_path):
-    path = get_h5_path_from_xml(xml_path, return_absolute_path=True)
+    path = get_data_path(xml_path, return_absolute_path=True)
     return path
 
 
-- 
GitLab