# ===========================================================================
# mmpb/files/migration.py
#
# Migration scripts for moving the platy-browser data from the old layout
# ('images' + 'segmentations' folders with per-file xml/h5 pairs) to the new
# layout ('images/local', 'images/remote' and a single images.json dict).
# ===========================================================================
import os
import json
import shutil
from glob import glob
from mmpb.files.name_lookup import look_up_filename, get_image_properties
from mmpb.files.xml_utils import get_h5_path_from_xml, copy_xml_with_newpath

# ROOT = '/g/arendt/...'
ROOT = '/home/pape/Work/my_projects/platy-browser-data/data'
# When True, only print the actions that would be performed.
DRY_RUN = True


def new_folder_structure(folder):
    """Create the new 'images/local' and 'images/remote' sub-folders.

    Expects ``folder/images`` to exist already.
    """
    image_folder = os.path.join(folder, 'images')
    local_folder = os.path.join(image_folder, 'local')
    remote_folder = os.path.join(image_folder, 'remote')
    if DRY_RUN:
        print("Creating folder", local_folder)
        print("Creating folder", remote_folder)
    else:
        assert os.path.exists(image_folder), image_folder
        # FIX: create the local folder (the original re-created the already
        # existing image_folder and never made 'images/local')
        os.makedirs(local_folder, exist_ok=True)
        os.makedirs(remote_folder, exist_ok=True)


def move_image_file(image_folder, xml_path):
    """Move an xml (and, if stored next to it, the linked hdf5 file) into
    'images/local', renaming it according to the name look-up table.

    Returns a one-entry dict {new_name: image_properties} to be merged into
    the images.json dict.
    """
    name = os.path.splitext(os.path.split(xml_path)[1])[0]
    new_name = look_up_filename(name)
    if new_name is None:
        new_name = name

    # get the linked hdf5 path
    image_path = get_h5_path_from_xml(xml_path, return_absolute_path=True)

    # move the xml to 'images/local'
    new_xml_path = os.path.join(image_folder, 'local', new_name + '.xml')
    if DRY_RUN:
        print("Moving", xml_path, "to", new_xml_path)
    else:
        shutil.move(xml_path, new_xml_path)

    # the hdf5 file lives next to the xml iff its relative path has a single
    # component.
    # FIX: os.relpath -> os.path.relpath (os.relpath does not exist) and the
    # condition was inverted ('> 1' means the path leaves the xml folder)
    rel_components = os.path.relpath(image_path,
                                     os.path.split(xml_path)[0]).split('/')
    h5_is_local = len(rel_components) == 1

    # if the hdf5 file is in the same folder, move it to 'images/local' as well
    if h5_is_local:
        new_image_path = os.path.join(image_folder, 'local', new_name + '.h5')
        if DRY_RUN:
            print("Moving", image_path, "to", new_image_path)
        else:
            assert os.path.exists(image_path), image_path
            shutil.move(image_path, new_image_path)
    # if not, construct the new correct data path
    else:
        # the new image path might be in rawdata; in this case there is no
        # '/local' subfolder; if it is in a version folder, it is in '/local'
        im_root, im_name = os.path.split(image_path)
        new_image_path = os.path.join(im_root, new_name + '.h5')
        if not os.path.exists(new_image_path):
            new_image_path = os.path.join(im_root, 'local', new_name + '.h5')

    # update the h5 path stored in the new xml (needed in both cases, because
    # the file name may have changed)
    new_rel_data_path = os.path.relpath(new_image_path,
                                        os.path.split(new_xml_path)[0])
    if DRY_RUN:
        print("Setting new xml path to", new_rel_data_path)
    else:
        assert os.path.exists(new_image_path), new_image_path
        # set path in xml
        copy_xml_with_newpath(new_xml_path, new_xml_path, new_rel_data_path)

    return {new_name: get_image_properties(name)}


def update_image_dict(image_folder, image_dict):
    """Merge ``image_dict`` into 'images/images.json' (existing entries win)
    and write the result back.
    """
    dict_out_file = os.path.join(image_folder, 'images.json')
    if os.path.exists(dict_out_file):
        with open(dict_out_file) as f:
            image_dict.update(json.load(f))

    with open(dict_out_file, 'w') as f:
        # FIX: the dict to serialize was missing; json.dump(f, ...) tried to
        # serialize the file handle itself
        json.dump(image_dict, f, sort_keys=True, indent=2)


def update_image_data(folder):
    """Move all image xmls of a version folder into the new layout and
    record their properties in images.json.
    """
    image_dict = {}
    image_folder = os.path.join(folder, 'images')
    xmls = glob(os.path.join(image_folder, "*.xml"))

    for xml in xmls:
        image_properties = move_image_file(image_folder, xml)
        image_dict.update(image_properties)

    if DRY_RUN:
        print("New image dict:")
        print(image_dict)
    else:
        update_image_dict(image_folder, image_dict)


def update_segmentation_data(folder):
    """Move all segmentation xmls of a version folder into 'images/local'
    and record their properties in images.json.
    """
    image_dict = {}
    image_folder = os.path.join(folder, 'images')
    seg_folder = os.path.join(folder, 'segmentations')
    xmls = glob(os.path.join(seg_folder, "*.xml"))

    for xml in xmls:
        image_properties = move_image_file(image_folder, xml)
        image_dict.update(image_properties)

    if DRY_RUN:
        print("New image dict:")
        print(image_dict)
    else:
        update_image_dict(image_folder, image_dict)

    # TODO need to update tables:
    # - rename the table folders correctly
    # - fix links to account for the updated names


def clean_up(version_folder):
    """Remove the now-obsolete 'segmentations' folder (must be empty) and
    the bdv server config.
    """
    # remove segmentation folder (needs to be empty!)
    seg_folder = os.path.join(version_folder, 'segmentations')
    os.rmdir(seg_folder)

    # remove bdv server config
    bdv_server_config = os.path.join(version_folder, 'misc', 'bdv_server.txt')
    if os.path.exists(bdv_server_config):
        os.remove(bdv_server_config)


# migrate version folder from old to new data layout
def migrate_version(version):
    """Migrate a single version folder to the new data layout."""
    version_folder = os.path.join(ROOT, version)

    # 1.) make new folder structure
    new_folder_structure(version_folder)

    # 2.) iterate over all images and segmentations, replace names (if necessary),
    # move the files and make the new images.json dict
    update_image_data(version_folder)

    # 3.) iterate over all table links and repair them
    update_segmentation_data(version_folder)

    # 4.) clean up:
    # - remove segmentations folder (make sure it's empty)
    # - remove bdv server config
    clean_up(version_folder)


# migrate all the data in the raw folder
def migrate_rawfolder():
    """Rename all xml/h5 pairs in 'rawdata' according to the look-up table
    and fix the data paths stored in the xmls.
    """
    raw_folder = os.path.join(ROOT, 'rawdata')
    xmls = glob(os.path.join(raw_folder, "*.xml"))

    for xml_path in xmls:
        name = os.path.splitext(os.path.split(xml_path)[1])[0]
        new_name = look_up_filename(name)
        if new_name is None:
            new_name = name

        # get the linked hdf5 path
        image_path = get_h5_path_from_xml(xml_path, return_absolute_path=True)

        # rename the xml in place (rawdata has no '/local' subfolder)
        new_xml_path = os.path.join(raw_folder, new_name + '.xml')
        if DRY_RUN:
            print("Moving", xml_path, "to", new_xml_path)
        else:
            shutil.move(xml_path, new_xml_path)

        # rename the hdf5 file in place
        new_image_path = os.path.join(raw_folder, new_name + '.h5')
        if DRY_RUN:
            print("Moving", image_path, "to", new_image_path)
        else:
            assert os.path.exists(image_path), image_path
            shutil.move(image_path, new_image_path)

        # xml and h5 are siblings, so the relative path is just the file name
        new_rel_data_path = new_name + '.h5'
        if DRY_RUN:
            print("Setting new xml path to", new_rel_data_path)
        else:
            assert os.path.exists(new_image_path), new_image_path
            # set path in xml
            copy_xml_with_newpath(new_xml_path, new_xml_path, new_rel_data_path)


# iterate over all the xmls in this version, follow the links
# and replace h5 files with n5 (if necessary)
def to_n5(version):
    pass


def make_remote_xmls(version):
    pass


def remove_deprecated_data():
    """Remove data sets that should not be uploaded (yet):
    cats-neuropil, traces; xray is not part of any version yet, but the raw
    data needs to be moved by hand.
    """

    def remove_deprecated_seg(folder, pattern):
        # remove the single matching segmentation xml
        # FIX: use the 'folder' parameter; the original read the loop
        # variable 'vfolder' via closure and ignored its argument
        trace_files = glob(os.path.join(folder, 'segmentations', pattern))
        if len(trace_files) > 0:
            assert len(trace_files) == 1
            if DRY_RUN:
                print("Remove", trace_files[0])
            else:
                os.remove(trace_files[0])

        # remove the single matching table folder
        trace_files = glob(os.path.join(folder, 'tables', pattern))
        if len(trace_files) > 0:
            assert len(trace_files) == 1
            if DRY_RUN:
                print("Remove", trace_files[0])
            else:
                shutil.rmtree(trace_files[0])

    # remove xmls from the version folders
    # (data from rawfolder should be backed up by hand!)
    version_folders = glob(os.path.join(ROOT, "0.*"))
    for vfolder in version_folders:
        remove_deprecated_seg(vfolder, '*traces*')
        remove_deprecated_seg(vfolder, '*cats*')


if __name__ == '__main__':
    # TODO before doing any migration:
    # - replace the chromatin segmentation and table
    # - check nephridia segmentation

    # remove the data we don't want to upload (yet):
    remove_deprecated_data()

    # migrate_rawfolder()

    # version = '0.0.0'
    # migrate_version(version)


# ===========================================================================
# mmpb/files/name_lookup.py
#
# Look-up tables mapping old file names to the new naming scheme, plus the
# default display properties for each image.
# ===========================================================================

# From Detlev's mail: NOV/ENR working names -> published gene names
NEW_GENE_NAMES = {
    "NOV1": "globin-like",
    "NOV2": "NOV2",
    "NOV6": "Stathmin",
    "NOV15": "OLM2A",
    "NOV18": "CEPU1",
    "NOV29": "NOV29",
    "NOV45": "TMTC3",
    "NOV50": "GDPD1",
    "NOV52": "KANL3",
    "ENR1": "PGAM",
    "ENR2": "RYR2",
    "ENR3": "JKIP3",
    "ENR4": "SND1",
    "ENR6": "Nucleolin",
    "ENR8": "Non-muscle-MHC",
    "ENR9": "NOE1",
    "ENR10": "UPP",
    "ENR12": "UNC22",
    "ENR13": "NDUS1",
    "ENR16": "ODO2",
    "ENR19": "FXL21",
    "ENR20": "PPIB",
    "ENR22": "CO1A1",
    "ENR25": "Synaptopodin",
    "ENR29": "USP9X",
    "ENR30": "CCVD",
    "ENR31": "Leucin-rich",
    "ENR32": "GRIK3",
    "ENR34": "MTHFSD",
    "ENR39": "RPC2",
    "ENR46": "Calexcitin2",
    "ENR54": "Boule-like",
    "ENR57": "Junctophilin1",
    "ENR62": "NB5R3",
    "ENR64": "PSMF1",
    "ENR69": "BCA1",
    "ENR71": "Patched"
}

# ROOT = '/g/arendt/...'
ROOT = '/home/pape/Work/my_projects/platy-browser-data/data'

# old file name -> new file name
FILE_NAME_LUT = {}
# old file name -> display properties dict
IMAGE_PROPERTIES = {}


# we need to make the following updates to names:
# - prospr -> gene names need to be replaced according to the list
#   (including the two region names!) and be lowercase
# - prospr -> get rid of the '-MED' postfix
# - segmentations -> get rid of '-labels' postfix
def update_name_lut():
    """Populate FILE_NAME_LUT from the files of the last version folder."""
    global FILE_NAME_LUT

    # update files according to the last version folder
    folder = os.path.join(ROOT, '0.6.5')
    image_names = os.listdir(os.path.join(folder, 'images'))
    image_names = [os.path.splitext(name)[0] for name in image_names
                   if os.path.splitext(name)[1] == '.xml']
    seg_names = os.listdir(os.path.join(folder, 'segmentations'))
    seg_names = [os.path.splitext(name)[0] for name in seg_names
                 if os.path.splitext(name)[1] == '.xml']

    file_names = image_names + seg_names
    for name in file_names:
        new_name = name

        # get rid of '-MED'
        if '-MED' in new_name:
            new_name = new_name.replace('-MED', '')
        # get rid of '-labels'
        if '-labels' in new_name:
            new_name = new_name.replace('-labels', '')
        # get rid of the misspelled '-ariande' tag
        if '-ariande' in new_name:
            new_name = new_name.replace('-ariande', '')

        # update the gene / region names for prospr
        # and make everything lowercase
        if new_name.startswith('prospr'):

            # replace gene names / region names
            # (components after 'prospr-<stage>-<id>-<modality>')
            gene_name = new_name.split('-')[4:]
            # FIX: guard against names with fewer than 5 components before
            # indexing gene_name[0]
            if gene_name and gene_name[0] in NEW_GENE_NAMES:
                gene_name = [NEW_GENE_NAMES[gene_name[0]]]
            elif len(gene_name) > 1 and gene_name[1] == 'PNS':
                gene_name = ['segmented', 'lateralectoderm']
            elif len(gene_name) > 1 and gene_name[1] == 'Stomodeum':
                gene_name = ['segmented', 'foregut']
            new_name = '-'.join(new_name.split('-')[:4] + gene_name)

            # make lowercase
            new_name = new_name.lower()

        FILE_NAME_LUT[name] = new_name


def update_image_properties():
    """Populate IMAGE_PROPERTIES with default display settings, keyed by the
    old file names (matching how get_image_properties is called).
    """
    global IMAGE_PROPERTIES
    for name in FILE_NAME_LUT:
        properties = {}

        # prospr: Color Magenta,
        # value range 0 - 1000
        if name.startswith('prospr'):
            if 'virtual-cells' in name:
                vc_table_folder = 'tables/%s' % name
                properties.update({'ColorMap': 'Glasbey', 'TableFolder': vc_table_folder})
            else:
                properties.update({'Color': 'Magenta', 'MinValue': 0, 'MaxValue': 1000})
        # TODO handle segmentations / masks
        # TODO segmented / mask is not consistent, get rid of '-mask' tag?
        # em-raw: Color White,
        # value range 0 - 255
        else:
            properties.update({'Color': 'White', 'MinValue': 0, 'MaxValue': 255})

        IMAGE_PROPERTIES[name] = properties


# populate the look-up tables at import time
update_name_lut()
update_image_properties()


def look_up_filename(file_name):
    """Return the new name for ``file_name``, or None if it is unknown."""
    return FILE_NAME_LUT.get(file_name, None)


def get_image_properties(name):
    """Return the display properties for the (old) file name ``name``."""
    return IMAGE_PROPERTIES[name]


if __name__ == '__main__':
    # x = json.dumps(FILE_NAME_LUT, sort_keys=True, indent=2)
    # print(x)
    with open('/home/pape/new_names.json', 'w') as f:
        json.dump(FILE_NAME_LUT, f, sort_keys=True, indent=2)