From 693f7a25e4ed9cfced86504756d4695d158828a1 Mon Sep 17 00:00:00 2001
From: Constantin Pape <constantin.pape@iwr.uni-heidelberg.de>
Date: Fri, 7 Jun 2019 16:01:34 +0200
Subject: [PATCH] Update gene script and call it for cell master table

---
 scripts/attributes/genes.py       | 30 ++++++++++++++++++------------
 scripts/attributes/map_objects.py |  2 +-
 scripts/attributes/master.py      | 16 +++++++++++++---
 3 files changed, 32 insertions(+), 16 deletions(-)

diff --git a/scripts/attributes/genes.py b/scripts/attributes/genes.py
index 15e64ff..726c8a0 100755
--- a/scripts/attributes/genes.py
+++ b/scripts/attributes/genes.py
@@ -1,4 +1,3 @@
-import os
 import csv
 import h5py
 import numpy as np
@@ -6,6 +5,12 @@ from vigra.analysis import extractRegionFeatures
 from vigra.sampling import resize
 
 
+# TODO
+# wrap this in a cluster_tools task in order to run remotely
+# fix blatant inefficiencis (size loop)
+# make test to check against original table
+
+
 def get_bbs(data):
     num_cells = (np.max(data)).astype('int') + 1
     cells_bbs = [[] for i in range(num_cells)]
@@ -23,6 +28,8 @@ def get_bbs(data):
     return cells_bbs
 
 
+# TODO very inefficient, can use "Count" feature of vigra features instead
+# and then just do this in `get_bbs` as well
 def get_cell_sizes(data):
     max_label = (np.max(data)).astype('uint32')
     cell_sizes = [0] * (max_label + 1)
@@ -58,25 +65,24 @@ def get_cell_expression(segm_data, all_genes):
 
 
 def write_genes_table(segm_file, genes_file, table_file, labels):
-    DSET = 't00000/s00/4/cells'
-    NEW_SHAPE = (570, 518, 550)
-    GENES_DSET = 'genes'
-    NAMES_DSET = 'gene_names'
-
-    genes_table_file = os.path.splitext(table_file)[0] + "_genes2" + os.path.splitext(table_file)[1]
+    dset = 't00000/s00/4/cells'
+    new_shape = (570, 518, 550)
+    genes_dset = 'genes'
+    names_dset = 'gene_names'
 
     with h5py.File(segm_file, 'r') as f:
-        segment_data = f[DSET][:]
+        segment_data = f[dset][:]
 
+    # TODO loading the whole thing into ram takes a lot of memory
     with h5py.File(genes_file, 'r') as f:
-        all_genes = f[GENES_DSET][:]
-        gene_names = [i.decode('utf-8') for i in f[NAMES_DSET]]
+        all_genes = f[genes_dset][:]
+        gene_names = [i.decode('utf-8') for i in f[names_dset]]
 
     num_genes = len(gene_names)
-    downsampled_data = resize(segment_data.astype("float32"), shape=NEW_SHAPE, order=0).astype('uint16')
+    downsampled_data = resize(segment_data.astype("float32"), shape=new_shape, order=0).astype('uint16')
     avail_labels, expression = get_cell_expression(downsampled_data, all_genes)
 
-    with open(genes_table_file, 'w') as genes_table:
+    with open(table_file, 'w') as genes_table:
         csv_writer = csv.writer(genes_table, delimiter='\t')
         _ = csv_writer.writerow(['label_id'] + gene_names)
         for label in labels:
diff --git a/scripts/attributes/map_objects.py b/scripts/attributes/map_objects.py
index b817eef..01a4770 100644
--- a/scripts/attributes/map_objects.py
+++ b/scripts/attributes/map_objects.py
@@ -47,5 +47,5 @@ def map_objects(label_ids, seg_path, seg_key, map_out,
         data.append(labels[:, None])
 
     col_names = ['label_id'] + map_names
-    data = np.concatenate([label_ids[:, None]] + data, axis=0)
+    data = np.concatenate([label_ids[:, None]] + data, axis=1)
     write_csv(map_out, data, col_names)
diff --git a/scripts/attributes/master.py b/scripts/attributes/master.py
index f02335a..4ceb1ea 100644
--- a/scripts/attributes/master.py
+++ b/scripts/attributes/master.py
@@ -3,6 +3,7 @@ import h5py
 
 from .base_attributes import base_attributes
 from .map_objects import map_objects
+from .genes import write_genes_table
 from ..files import get_h5_path_from_xml
 
 
@@ -21,8 +22,8 @@ def make_cell_tables(folder, name, tmp_folder, resolution,
     table_folder = os.path.join(folder, 'tables', name)
     os.makedirs(table_folder, exist_ok=True)
 
-    seg_path = get_seg_path(folder, name)
     seg_key = 't00000/s00/0/cells'
+    seg_path = get_seg_path(folder, name, seg_key)
 
     # make the basic attributes table
     base_out = os.path.join(table_folder, 'default.csv')
@@ -33,16 +34,25 @@ def make_cell_tables(folder, name, tmp_folder, resolution,
     # make table with mapping to other objects
     # nuclei, cellular models (TODO), ...
     map_out = os.path.join(table_folder, 'objects.csv')
-    map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels')]
+    map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels', seg_key)]
     map_keys = [seg_key]
     map_names = ['nucleus_id']
     map_objects(label_ids, seg_path, seg_key, map_out,
                 map_paths, map_keys, map_names,
                 tmp_folder, target, max_jobs)
 
+    # make table with gene mapping
+    # TODO we need to make sure that this file is always copied / updated
+    # before this is called !
+    aux_gene_xml = os.path.join(folder, 'misc', 'meds_all_genes.xml')
+    aux_gene_path = get_h5_path_from_xml(aux_gene_xml)
+    if not os.path.exists(aux_gene_path):
+        raise RuntimeError("Can't find auxiliary gene file")
+    gene_out = os.path.join(table_folder, 'genes.csv')
+    write_genes_table(seg_path, aux_gene_path, gene_out, label_ids)
+
     # TODO additional tables:
     # regions / semantics
-    # gene expression
     # ???
 
 
-- 
GitLab