From b75430bcb91c6c644da1a51770fce99419301c68 Mon Sep 17 00:00:00 2001
From: vzinche <valentyna.zinchenko@embl.de>
Date: Fri, 7 Jun 2019 10:21:01 +0200
Subject: [PATCH] script to write gene table

---
 scripts/attributes/genes.py | 92 +++++++++++++++++++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100755 scripts/attributes/genes.py

diff --git a/scripts/attributes/genes.py b/scripts/attributes/genes.py
new file mode 100755
index 0000000..a7a77ec
--- /dev/null
+++ b/scripts/attributes/genes.py
@@ -0,0 +1,92 @@
+import os
+import csv
+import h5py
+import numpy as np
+from vigra.analysis import extractRegionFeatures
+from vigra.sampling import resize
+
+
+def get_bbs(data):
+    shape = data.shape
+    num_cells = (np.max(data)).astype('int') + 1
+    cells_bbs = [[] for i in range(num_cells)]
+    mins_and_maxs = extractRegionFeatures(data.astype('float32'), data.astype('uint32'),
+            features = ['Coord<Maximum >', 'Coord<Minimum >'])
+    mins = mins_and_maxs['Coord<Minimum >'].astype('uint32')
+    maxs = mins_and_maxs['Coord<Maximum >'].astype('uint32') + 1
+    for cell in range(num_cells):
+        cell_bb = []
+        cell_min = mins[cell]
+        cell_max = maxs[cell]
+        for axis in range(3):
+            cell_bb.append(slice(cell_min[axis], cell_max[axis]))
+        cells_bbs[cell] = tuple(cell_bb)
+    return cells_bbs
+
+
+def get_cell_sizes(data):
+    max_label = (np.max(data)).astype('uint32')
+    cell_sizes = [0] * (max_label + 1)
+    Z, X, Y = data.shape
+    for z in range(Z):
+        for x in range(X):
+            for y in range(Y):
+                label = data[z,x,y]
+                cell_sizes[label] += 1
+    cell_sizes = np.array(cell_sizes)
+    return cell_sizes
+
+
+def get_cell_expression(segm_data, all_genes):
+    num_genes = all_genes.shape[0]
+    labels = list(np.unique(segm_data))
+    cells_expression = np.zeros((len(labels), num_genes), dtype='float32')
+    cell_sizes = get_cell_sizes(segm_data)
+    cell_bbs = get_bbs(segm_data)
+    for cell_idx in range(len(labels)):
+        cell_label = labels[cell_idx]
+        if cell_label == 0: continue
+        cell_size = cell_sizes[cell_label]
+        bb = cell_bbs[cell_label]
+        cell_masked = (segm_data[bb]==cell_label)
+        genes_in_cell = all_genes[tuple([slice(0,None),] +  list(bb))]
+        for gene in range(num_genes):
+            gene_expr = genes_in_cell[gene]
+            try:
+                gene_expr_sum = np.sum(gene_expr[cell_masked]>0)
+            except:
+                print(gene_expr.shape, cell_masked.shape)
+            cells_expression[cell_idx, gene] = gene_expr_sum / cell_size
+    return labels, cells_expression
+
+
+def write_genes_table(segm_file, genes_file, table_file, labels):
+    DSET = 't00000/s00/4/cells'
+    NEW_SHAPE = (570,518,550)
+    GENES_DSET = 'genes'
+    NAMES_DSET = 'gene_names'
+
+    genes_table_file = os.path.splitext(table_file)[0] + "_genes2" + os.path.splitext(table_file)[1]
+
+    with h5py.File(segm_file, 'r') as f:
+        segment_data = f[DSET][:]
+
+    with h5py.File(genes_file, 'r') as f:
+        all_genes = f[GENES_DSET][:]
+        gene_names = [i.decode('utf-8') for i in f[NAMES_DSET]]
+
+    num_genes = len(gene_names)
+    downsampled_data = resize(segment_data.astype("float32"), shape = NEW_SHAPE, order=0).astype('uint16')
+    avail_labels, expression = get_cell_expression(downsampled_data, all_genes)
+
+    with open(genes_table_file, 'w') as genes_table:
+        csv_writer = csv.writer(genes_table, delimiter='\t')
+        _ = csv_writer.writerow(['label_id',] + gene_names)
+        for label in labels:
+            if label in avail_labels:
+                idx = avail_labels.index(label)
+                _ = csv_writer.writerow([label, ] + list(expression[idx]))
+            else:
+                _ = csv_writer.writerow([label, ] + [0] * num_genes)
+
+
-- 
GitLab