Commit 0fc62ee9 authored by Sudeep Sahadevan's avatar Sudeep Sahadevan
Browse files

clean up

parent cb7d81c6
%% Cell type:markdown id:7ca3358f tags:
## Functional programming and `toolz` use cases
These examples are adapted from Juan Nunez-Iglesias' [Linux conference Australia talk in 2016](https://www.youtube.com/watch?v=INAvD4Kmdbc). Refer to [his github repo](https://github.com/jni/functional-programming-in-python) for additional examples.
%% Cell type:markdown id:4c90d8bc tags:
%% Cell type:code id:3cf0fc1d tags:
``` python
from itertools import groupby
from toolz import curry
from toolz.functoolz import compose, pipe
from toolz.curried import map as mapc
from toolz.curried import filter as filterc
```
%% Cell type:markdown id:55b8d5af tags:
### Plot the distribution of codons in a given fasta file
#### dataset
A random selection of 100 protein coding transcripts from human genome
`data/transcripts.fa`
%% Cell type:code id:80213242 tags:
%% Cell type:code id:402ef208 tags:
``` python
from itertools import groupby
from toolz import curry
from toolz.functoolz import compose
from toolz.curried import map as mapc
def read_fasta(fa):
'''
https://www.biostars.org/p/710/
'''
with open(fa,'r') as fh:
faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
for grp in faiter:
# header = grp.__next__()[1:].strip()
# seq = "".join(s.strip() for s in faiter.__next__())
seq = pipe(faiter.__next__(), mapc(lambda s: s.strip()), curry(''.join))
yield seq
```
%% Cell type:code id:72501725 tags:
%% Cell type:code id:8e077037 tags:
``` python
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt
# partition: partition sequence into tuples of length n partition(n, sequence)
# concat: Concatenate iterables,
from toolz.itertoolz import partition, concat
from toolz.functoolz import pipe
```
%% Cell type:code id:632df94c tags:
%% Cell type:code id:b1f3499c tags:
``` python
triplet_fn = mapc(curry(partition)(3))
triplet_counts = pipe('data/transcripts.fa',read_fasta, triplet_fn, concat, mapc(''.join), Counter)
```
%% Cell type:code id:7ee92688 tags:
%% Cell type:code id:b121ba56 tags:
``` python
plt.figure(figsize=(14, 14))
triplet_df = pd.DataFrame( triplet_counts.most_common(50), columns = ['triplet', 'count'] )
triplet_df = pd.DataFrame( triplet_counts.most_common(32), columns = ['triplet', 'count'] )
triplet_plot = sns.barplot(x = 'count', y = 'triplet', data = triplet_df)
```
%% Output
%% Cell type:markdown id:298d0c0d tags:
%% Cell type:markdown id:354039c6 tags:
### Streaming and filtering read count data
#### Airway dataset
Himes BE, Jiang X, Wagner P, Hu R, Wang Q, Klanderman B, Whitaker RM, Duan Q, Lasky-Su J, Nikolos C, Jester W, Johnson M, Panettieri R Jr, Tantisira KG, Weiss ST, Lu Q. “RNA-Seq Transcriptome Profiling Identifies CRISPLD2 as a Glucocorticoid Responsive Gene that Modulates Cytokine Function in Airway Smooth Muscle Cells.” [PLoS One. 2014 Jun 13;9(6):e99625](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0099625)
Airway smooth muscle cells were treated with dexamethasone, a synthetic steroid with anti-inflammatory effects and expression levels were measured using RNA-seq.
`data/airway.csv`
%% Cell type:code id:cb5b2da3 tags:
``` python
import pandas as pd
```
%% Cell type:code id:17d48aeb tags:
``` python
airway = pd.read_csv('data/airway.csv',sep="\t", header=None, index_col = 0)
```
%% Cell type:code id:aa45fc7b tags:
``` python
airway
```
%% Output
1 2 3 4 5 6 7 8
0
ENSG00000000003 679 448 873 408 1138 1047 770 572
ENSG00000000005 0 0 0 0 0 0 0 0
ENSG00000000419 467 515 621 365 587 799 417 508
ENSG00000000457 260 211 263 164 245 331 233 229
ENSG00000000460 60 55 40 35 78 63 76 60
... ... ... ... ... ... ... ... ...
LRG_94 0 0 0 0 0 0 0 0
LRG_96 0 0 0 0 0 0 0 0
LRG_97 0 0 0 0 0 0 0 0
LRG_98 0 0 0 0 0 0 0 0
LRG_99 0 0 0 0 0 0 0 0
[64102 rows x 8 columns]
%% Cell type:markdown id:3e57eebe tags:
### Find all genes with mean read count $\ge$ 50
#### Using `pandas`
%% Cell type:code id:5d5ed72f tags:
``` python
airway_mean = airway.mean(axis=1)
airway_mean[airway_mean>=50]
```
%% Output
0
ENSG00000000003 741.875
ENSG00000000419 534.875
ENSG00000000457 242.000
ENSG00000000460 58.375
ENSG00000000971 6034.750
...
ENSG00000273179 174.875
ENSG00000273230 119.500
ENSG00000273270 137.375
ENSG00000273290 168.500
ENSG00000273344 65.500
Length: 12671, dtype: float64
%% Cell type:markdown id:06b09f77 tags:
#### Using `for` loop and `map`
%% Cell type:code id:4e46b096 tags:
``` python
from statistics import mean
```
%% Cell type:code id:6de40095 tags:
``` python
airway_genes = []
with open('data/airway.csv','r') as fh:
for f in fh:
seq_data = f.strip().split('\t')
count_data = map(int, seq_data[1:])
count_mean = mean(count_data)
if count_mean > 50.0:
airway_genes.append((seq_data[0],count_mean))
```
%% Cell type:code id:6b4c2415 tags:
``` python
print('Genes after filtering: ',len(airway_genes))
print('First 3 from list: ', airway_genes[:3])
```
%% Output
Genes after filtering: 12666
First 3 from list: [('ENSG00000000003', 741.875), ('ENSG00000000419', 534.875), ('ENSG00000000457', 242)]
%% Cell type:markdown id:8a30ccc7 tags:
#### Rewriting using `pipe` and curried versions of `reduce` and `filter`
Data flow: `open` $\rightarrow$ `split` $\rightarrow$ `map` $\rightarrow$ `reduce` $\rightarrow$ `filter`
%% Cell type:code id:2403784d tags:
``` python
from toolz.curried import map as mapc
from toolz.curried import filter as filterc
from toolz.functoolz import pipe, curry
```
%% Cell type:code id:b4ba3e35 tags:
``` python
def read_data(fin):
with open(fin,'r') as fh:
for l in fh:
yield l.strip().split('\t')
def map_int_mean(raw_data):
count_mean = pipe(raw_data[1:], mapc(int), mean)
return (raw_data[0],count_mean)
def filter_mean(gene_tuple):
return gene_tuple[1] >= 50.00
```
%% Cell type:code id:72de5559 tags:
``` python
airway_genes_50 = pipe('data/airway.csv',
read_data,
mapc(map_int_mean),
filterc(filter_mean))
```
%% Cell type:code id:372e9613 tags:
``` python
len(list(airway_genes_50))
```
%% Output
12671
%% Cell type:markdown id:702565cb tags:
#### Rewriting `map_int_mean` using `compose`
%% Cell type:code id:1791bccc tags:
``` python
from toolz.functoolz import compose
```
%% Cell type:code id:1561cba8 tags:
``` python
def map_int_mean_compose(raw_data):
mean_from_str = compose(mean, mapc(int))
return (raw_data[0], mean_from_str(raw_data[1:]))
```
%% Cell type:code id:fb813580 tags:
``` python
airway_genes_50 = pipe('data/airway.csv',
read_data,
mapc(map_int_mean_compose),
filterc(filter_mean))
```
%% Cell type:code id:e7aacccd tags:
``` python
len(list(airway_genes_50))
```
%% Output
12671
%% Cell type:markdown id:7a8c580b tags:
#### Rewriting `filter_mean` as a curried version
Adding an additional input `thresh` for `filter_mean`
%% Cell type:code id:55b4cb77 tags:
``` python
@curry # <--- using curry function as a decorator
def filter_mean_thresh(gene_tuple,thresh=0.0):
return gene_tuple[1] >= thresh
```
%% Cell type:code id:fc9f6e5e tags:
``` python
airway_genes_100 = pipe('data/airway.csv',
read_data,
mapc(map_int_mean_compose),
filterc(filter_mean_thresh(thresh=100.0)),
list)
```
%% Cell type:code id:d4c65bee tags:
``` python
len(airway_genes_100)
```
%% Output
11244
%% Cell type:code id:b543592a tags:
``` python
airway_genes_100[:3]
```
%% Output
[('ENSG00000000003', 741.875),
('ENSG00000000419', 534.875),
('ENSG00000000457', 242)]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment