Skip to content
Snippets Groups Projects
Commit 693f7a25 authored by Constantin Pape's avatar Constantin Pape
Browse files

Update gene script and call it for cell master table

parent c08565cb
No related branches found
No related tags found
No related merge requests found
import os
import csv
import h5py
import numpy as np
......@@ -6,6 +5,12 @@ from vigra.analysis import extractRegionFeatures
from vigra.sampling import resize
# TODO
# wrap this in a cluster_tools task in order to run remotely
# fix blatant inefficiencis (size loop)
# make test to check against original table
def get_bbs(data):
num_cells = (np.max(data)).astype('int') + 1
cells_bbs = [[] for i in range(num_cells)]
......@@ -23,6 +28,8 @@ def get_bbs(data):
return cells_bbs
# TODO very inefficient, can use "Count" feature of vigra features instead
# and then just do this in `get_bbs` as well
def get_cell_sizes(data):
max_label = (np.max(data)).astype('uint32')
cell_sizes = [0] * (max_label + 1)
......@@ -58,25 +65,24 @@ def get_cell_expression(segm_data, all_genes):
def write_genes_table(segm_file, genes_file, table_file, labels):
DSET = 't00000/s00/4/cells'
NEW_SHAPE = (570, 518, 550)
GENES_DSET = 'genes'
NAMES_DSET = 'gene_names'
genes_table_file = os.path.splitext(table_file)[0] + "_genes2" + os.path.splitext(table_file)[1]
dset = 't00000/s00/4/cells'
new_shape = (570, 518, 550)
genes_dset = 'genes'
names_dset = 'gene_names'
with h5py.File(segm_file, 'r') as f:
segment_data = f[DSET][:]
segment_data = f[dset][:]
# TODO loading the whole thing into ram takes a lot of memory
with h5py.File(genes_file, 'r') as f:
all_genes = f[GENES_DSET][:]
gene_names = [i.decode('utf-8') for i in f[NAMES_DSET]]
all_genes = f[genes_dset][:]
gene_names = [i.decode('utf-8') for i in f[names_dset]]
num_genes = len(gene_names)
downsampled_data = resize(segment_data.astype("float32"), shape=NEW_SHAPE, order=0).astype('uint16')
downsampled_data = resize(segment_data.astype("float32"), shape=new_shape, order=0).astype('uint16')
avail_labels, expression = get_cell_expression(downsampled_data, all_genes)
with open(genes_table_file, 'w') as genes_table:
with open(table_file, 'w') as genes_table:
csv_writer = csv.writer(genes_table, delimiter='\t')
_ = csv_writer.writerow(['label_id'] + gene_names)
for label in labels:
......
......@@ -47,5 +47,5 @@ def map_objects(label_ids, seg_path, seg_key, map_out,
data.append(labels[:, None])
col_names = ['label_id'] + map_names
data = np.concatenate([label_ids[:, None]] + data, axis=0)
data = np.concatenate([label_ids[:, None]] + data, axis=1)
write_csv(map_out, data, col_names)
......@@ -3,6 +3,7 @@ import h5py
from .base_attributes import base_attributes
from .map_objects import map_objects
from .genes import write_genes_table
from ..files import get_h5_path_from_xml
......@@ -21,8 +22,8 @@ def make_cell_tables(folder, name, tmp_folder, resolution,
table_folder = os.path.join(folder, 'tables', name)
os.makedirs(table_folder, exist_ok=True)
seg_path = get_seg_path(folder, name)
seg_key = 't00000/s00/0/cells'
seg_path = get_seg_path(folder, name, seg_key)
# make the basic attributes table
base_out = os.path.join(table_folder, 'default.csv')
......@@ -33,16 +34,25 @@ def make_cell_tables(folder, name, tmp_folder, resolution,
# make table with mapping to other objects
# nuclei, cellular models (TODO), ...
map_out = os.path.join(table_folder, 'objects.csv')
map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels')]
map_paths = [get_seg_path(folder, 'em-segmented-nuclei-labels', seg_key)]
map_keys = [seg_key]
map_names = ['nucleus_id']
map_objects(label_ids, seg_path, seg_key, map_out,
map_paths, map_keys, map_names,
tmp_folder, target, max_jobs)
# make table with gene mapping
# TODO we need to make sure that this file is always copied / updated
# before this is called !
aux_gene_xml = os.path.join(folder, 'misc', 'meds_all_genes.xml')
aux_gene_path = get_h5_path_from_xml(aux_gene_xml)
if not os.path.exists(aux_gene_path):
raise RuntimeError("Can't find auxiliary gene file")
gene_out = os.path.join(table_folder, 'genes.csv')
write_genes_table(seg_path, aux_gene_path, gene_out, label_ids)
# TODO additional tables:
# regions / semantics
# gene expression
# ???
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment