Update gene script and call it for cell master table

693f7a25 · Constantin Pape · c08565cb · 693f7a25 · 693f7a25 · 693f7a25
Commit 693f7a25 authored 5 years ago by Constantin Pape
--- a/scripts/attributes/genes.py
+++ b/scripts/attributes/genes.py
-import os
 import csv
 import h5py
 import numpy as np
@@ -6,6 +5,12 @@ from vigra.analysis import extractRegionFeatures
 from vigra.sampling import resize


+# TODO
+# wrap this in a cluster_tools task in order to run remotely
+# fix blatant inefficiencis (size loop)
+# make test to check against original table
+
+
 def get_bbs(data):
    num_cells = (np.max(data)).astype('int') + 1
    cells_bbs = [[] for i in range(num_cells)]
@@ -23,6 +28,8 @@ def get_bbs(data):
    return cells_bbs


+# TODO very inefficient, can use "Count" feature of vigra features instead
+# and then just do this in `get_bbs` as well
 def get_cell_sizes(data):
    max_label = (np.max(data)).astype('uint32')
    cell_sizes = [0] * (max_label + 1)
@@ -58,25 +65,24 @@ def get_cell_expression(segm_data, all_genes):


 def write_genes_table(segm_file, genes_file, table_file, labels):
-    DSET = 't00000/s00/4/cells'
-    NEW_SHAPE = (570, 518, 550)
-    GENES_DSET = 'genes'
-    NAMES_DSET = 'gene_names'
-
-    genes_table_file = os.path.splitext(table_file)[0] + "_genes2" + os.path.splitext(table_file)[1]
+    dset = 't00000/s00/4/cells'
+    new_shape = (570, 518, 550)
+    genes_dset = 'genes'
+    names_dset = 'gene_names'

    with h5py.File(segm_file, 'r') as f:
-        segment_data = f[DSET][:]
+        segment_data = f[dset][:]

+    # TODO loading the whole thing into ram takes a lot of memory
    with h5py.File(genes_file, 'r') as f:
-        all_genes = f[GENES_DSET][:]
-        gene_names = [i.decode('utf-8') for i in f[NAMES_DSET]]
+        all_genes = f[genes_dset][:]
+        gene_names = [i.decode('utf-8') for i in f[names_dset]]

    num_genes = len(gene_names)
-    downsampled_data = resize(segment_data.astype("float32"), shape=NEW_SHAPE, order=0).astype('uint16')
+    downsampled_data = resize(segment_data.astype("float32"), shape=new_shape, order=0).astype('uint16')
    avail_labels, expression = get_cell_expression(downsampled_data, all_genes)

-    with open(genes_table_file, 'w') as genes_table:
+    with open(table_file, 'w') as genes_table:
        csv_writer = csv.writer(genes_table, delimiter='\t')
        _ = csv_writer.writerow(['label_id'] + gene_names)
        for label in labels:

--- a/scripts/attributes/map_objects.py
+++ b/scripts/attributes/map_objects.py
@@ -47,5 +47,5 @@ def map_objects(label_ids, seg_path, seg_key, map_out,
        data.append(labels[:, None])

    col_names = ['label_id'] + map_names
-    data = np.concatenate([label_ids[:, None]] + data, axis=0)
+    data = np.concatenate([label_ids[:, None]] + data, axis=1)
    write_csv(map_out, data, col_names)
--- a/scripts/attributes/master.py
+++ b/scripts/attributes/master.py
@@ -3,6 +3,7 @@ import h5py

 from .base_attributes import base_attributes
 from .map_objects import map_objects
+from .genes import write_genes_table
 from ..files import get_h5_path_from_xml


@@ -21,8 +22,8 @@ def make_cell_tables(folder, name, tmp_folder, resolution,
    table_folder = os.path.join(folder, 'tables', name)
    os.makedirs(table_folder, exist_ok=True)

-    seg_path = get_seg_path(folder, name)
    seg_key = 't00000/s00/0/cells'
+    seg_path = get_seg_path(folder, name, seg_key)

    # make the basic attributes table
    base_out = os.path.join(table_folder, 'default.csv')
@@ -33,16 +34,25 @@ def make_cell_tables(folder, name, tmp_folder, resolution,
    # make table with mapping to other objects
    # nuclei, cellular models (TODO), ...
    map_out = os.path.join(table_folder, 'objects.csv')
-    map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels')]
+    map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels', seg_key)]
    map_keys = [seg_key]
    map_names = ['nucleus_id']
    map_objects(label_ids, seg_path, seg_key, map_out,
                map_paths, map_keys, map_names,
                tmp_folder, target, max_jobs)

+    # make table with gene mapping
+    # TODO we need to make sure that this file is always copied / updated
+    # before this is called !
+    aux_gene_xml = os.path.join(folder, 'misc', 'meds_all_genes.xml')
+    aux_gene_path = get_h5_path_from_xml(aux_gene_xml)
+    if not os.path.exists(aux_gene_path):
+        raise RuntimeError("Can't find auxiliary gene file")
+    gene_out = os.path.join(table_folder, 'genes.csv')
+    write_genes_table(seg_path, aux_gene_path, gene_out, label_ids)
+
    # TODO additional tables:
    # regions / semantics
-    # gene expression
    # ???