Commit b446dabb authored by Thomas Schwarzl's avatar Thomas Schwarzl
Browse files

annotation

parent ac01ebbe
......@@ -4,23 +4,40 @@ output: BiocStyle::html_document
author: "Thomas Schwarzl and Sudeep Sahavan"
---
This is analysis of 2C-sequencing data of Pab1 and TDH 3 in yeast.
Pab1 is a poly A binder, which binds to the 3' UTR of mRNAs.
TDH3 is a glycolytic enzyme we observed binding to short RNAs.
Protein tags:
- TDH3 was tagged with ProtA and TAP1
we do have Pab1 samples without RiboDepletion.
In addition, there is wild type control.
All of the samples have an input control and an IPed sample.
# Setup
## Libraries
Load libraries
Loading required libraries
```{r}
library(edgeR)
library(csaw)
require(tidyverse)
library(IHW)
suppressPackageStartupMessages({
library(edgeR)
library(csaw)
require(tidyverse)
library(IHW)
})
```
## Bam files
### Preparation
Either clone bam files into directory
```
......@@ -33,16 +50,18 @@ Or link bam directory to original path
ln -s bam /Volumes/hentze/ projects/2Cseq/Pilot/data/sequencing/Novoalign_tRNA_chrom/primary
```
### Loading bam files
Load files from bam directory
```{r}
bams_files_all <- list.files(path = 'bam',
bams_files_all <- list.files(path = 'bam',
pattern = '*.sorted.bam$',
full.names = TRUE)
bams_files_all
```
bams for without noRibo or rep2
bams for without noRibo or rep2 (rep 2 was excluded from the analysis)
```{r}
bams_files_selected <- bams_files_all[grep(pattern = 'noRibo|rep2',
......@@ -61,27 +80,35 @@ ip_files
```{r}
bams <- c(ip_files,input_files)
bams <- c(ip_files, input_files)
bams
```
## Parameters
We define minimum quality of alignment of reads and that we have paired-end
reads.
```{r}
params <- readParam(minq = 20,
params <- readParam(minq = 20, # minimum mapping quality
pe = 'both')
```
## Fragment Length
Calculate fragment sizes
With this, we calculate fragment sizes for each sample.
```{r}
get_names <- function(x) {
gsub(pattern="^.*\\/|\\_1.*$",replacement='',x)
}
get_fragment_length <- function(x) {
frag_list <- vector(length(x),
mode = 'list')
names(frag_list) <- gsub(pattern="^.*\\/|\\_1.*$",replacement='',x)
names(frag_list) <- get_names(x)
frag_list
for(i in c(1:length(x))) {
......@@ -89,34 +116,32 @@ get_fragment_length <- function(x) {
param = params)
}
frag_list
# Extract the mean fragment size
frag_list %>% sapply(function(x) {
ceiling(mean(x$sizes))
})
}
```
Input
Calculating fragment sizes for all samples
```{r}
frag_sizes <- get_fragment_length(bams)
frag_sizes_input <- get_fragment_length(input_files)
frag_sizes_ip <- get_fragment_length(ip_files)
fragment_sizes <- get_fragment_length(bams)
fragment_sizes_input <- fragment_sizes[get_names(input_files)]
fragment_sizes_ip <- fragment_sizes[get_names(ip_files)]
```
```{r}
fragment_sizes <- sapply(frag_sizes,function(x){ceiling(mean(x$sizes))})
fragment_sizes_input <- sapply(frag_sizes_input,function(x){ceiling(mean(x$sizes))})
fragment_sizes_ip <- sapply(frag_sizes_ip,function(x){ceiling(mean(x$sizes))})
```
# Preprocessing
## Count reads into windows
# Analysis
A helper function for counting reads into (sliding) windows.
## Count Windows
Counting reads into windows
```{r}
count_windows <- function(infiles, fragsizes,
count_windows <- function(infiles, # bam files
fragsizes, # fragment sizes
width = 50,
filter = 10,
spacing = 20,
......@@ -132,20 +157,31 @@ count_windows <- function(infiles, fragsizes,
filter = filter,
...)
}
```
counts <- list()
Counting reads into windows.
counts[["all"]] <- csaw::windowCounts(bam.files = bams,
ext = list(
frag_sizes,
NA),
param = params,
spacing = 20,
width = 50,
filter = 10)
ip_counts <- windowCounts(bam.files=ip_files,ext=list(frag_sizes[ grep(pattern="IP",names(frag_sizes))],NA),param=params,spacing=20,width=50,filter=10)
```{r}
counts <- list()
# counts[["all"]] <- csaw::windowCounts(bam.files = bams,
# ext = list(
# frag_sizes,
# NA),
# param = params,
# spacing = 20,
# width = 50,
# filter = 10)
#
#
#
# ip_counts <- windowCounts(
# bam.files = ip_files,
# ext = list(fragment_sizes_ip, NA),
# param = params,
# spacing = 20,
# width = 50,
# filter = 10)
counts[["all"]] <- count_windows(bams, fragment_sizes)
counts[["ip"]] <- count_windows(ip_files, fragment_sizes_input)
......@@ -219,12 +255,13 @@ save.image("Seq_together.Rdata")
## Testing differences
## Differentially expression analysis
### Sample
all
```{r}
samples <- factor(gsub(pattern = "(^.*\\/|\\_IP\\_rep\\d{1,}|\\_input\\_rep\\d{1,}|\\_1.*$)",
replacement = '',
......@@ -249,19 +286,25 @@ samples_ip <- factor(gsub(pattern = "(^.*\\/|\\_IP\\_rep\\d{1,}|\\_input\\_r
ip_files),
levels = c('WT', 'Pab1', 'T3_ProtA', 'T3_TAP'))
samples_ip
```
```{r}
samples_input <- factor(gsub(pattern = "(^.*\\/|\\_IP\\_rep\\d{1,}|\\_input\\_rep\\d{1,}|\\_1.*$)",
replacement = '',
ip_files),
levels = c('WT', 'Pab1', 'T3_ProtA', 'T3_TAP'))
samples_input
```
Check that ip and input have right order
```{r}
stopifnot(samples_ip == samples_input)
```
sampleDesign_ip <- model.matrix(~samples_ip)
```{r}
sampleDesign_ip <- model.matrix(~samples_ip)
sampleDesign_input <- model.matrix(~samples_input)
```
......@@ -271,13 +314,17 @@ sampleDesign_input <- model.matrix(~samples_input)
y_ip <- estimateDisp(asDGEList(counts_norm[["ip"]]),
design = sampleDesign_ip,
robust = TRUE)
```
```{r}
y_input <- estimateDisp(asDGEList(counts_norm[["input"]]),
design = sampleDesign_input,
robust = TRUE)
```
```{r}
#estimateDisp(asDGEList(input_local_pruned_list$T3_ProtA),
# sampleDesign[c(3,4,7:8),c(1,3)],
# robust=TRUE)
......@@ -291,15 +338,13 @@ summary(y_ip$trended.dispersion)
fit
```{r}
fit_ip <- glmQLFit(y_ip, sampleDesign_ip, robust = TRUE)
fit_ip <- glmQLFit(y_ip, sampleDesign_ip, robust = TRUE)
fit_input <- glmQLFit(y_input, sampleDesign_input, robust = TRUE)
summary(fit_input$var.post)
```
```{r}
save.image("Seq_together2.Rdata")
```
```{r}
......@@ -314,11 +359,10 @@ save.image("Seq_together2.Rdata")
```{r}
results_ip <- results_input <- list()
results_ip[["T3_ProtA"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 1, 0))
results_ip[["T3_ProtA"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 1, 0))
results_input[["T3_ProtA"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 1, 0))
results_ip[["T3_TAP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 0, 1))
results_input[["T3_TAP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 0, 1))
results_ip[["T3_TAP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 0, 1))
results_input[["T3_TAP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 0, 1))
head(results_ip[["T3_ProtA"]]$table)
```
......@@ -357,20 +401,37 @@ colData(counts_norm[["ip"]])
```{r}
merged_res_ip[["T3_ProtA"]]
```
```{r}
save.image("Seq_together2.Rdata")
```
## Annotation
```{r}
ANNO <- read_tsv("Saccharomyces_cerevisiae.R64-1-1.84.bed.gz",
ANNO <- read_tsv(file.path("annotation", "SaccUTR.sorted.bed"),#"Saccharomyces_cerevisiae.R64-1-1.84.bed.gz",
col_names = c("chr", "start", "end", "name", "score", "strand"),
quote = "#") %>%
dplyr::filter(!is.na(start)) %>% dplyr::filter(chr != "track")
dplyr::filter(!is.na(start)) %>% dplyr::filter(chr != "track") %>%
separate(sep = "@", col = "name", into = c("ID", "name", "biotype", "region_type", "of", "exonID"), convert = T) %>% separate(sep = ":", col = "ID", into = c("gene", "ID"))
ANNO
```
```{r}
ANNO <- ANNO %>% mutate(region_type2 = ifelse(biotype == "protein_coding" & region_type == "exon",
"pc_exon", region_type)) %>%
mutate(region_type2 = ifelse(region_type2 == "exon", biotype, region_type2))
ANNO <- ANNO %>% mutate(label = paste(name, region_type2, sep = "-"))
ANNO
```
```{r}
ANNOGR <- makeGRangesFromDataFrame(as.data.frame(ANNO),
keep.extra.columns = T,
......@@ -382,22 +443,26 @@ ANNOGR <- makeGRangesFromDataFrame(as.data.frame(ANNO),
```{r}
qr <- findOverlaps(rowRanges(counts_norm[["ip"]]), ANNOGR,
maxgap=-1L, minoverlap=0L,
type=c("any"),
select=c("first"),
ignore.strand=FALSE)
```
X <- bind_cols(ANNO[qr,], as.data.frame(rowRanges(counts_norm[["ip"]])), results_ip[["T3_ProtA"]]$table)
Y <- bind_cols(ANNO[qr,], as.data.frame(rowRanges(counts_norm[["ip"]])), results_ip[["T3_TAP"]]$table)
```{r}
X <- bind_cols(ANNO[qr,], as.data.frame(rowRanges(counts_norm[["ip"]])) %>% dplyr::rename(region_chr = seqnames, region_start = start, region_end = end, region_width = width, region_strand = strand), results_ip[["T3_ProtA"]]$table)
Y <- bind_cols(ANNO[qr,], as.data.frame(rowRanges(counts_norm[["ip"]])) %>% dplyr::rename(region_chr = seqnames, region_start = start, region_end = end, region_width = width, region_strand = strand), results_ip[["T3_TAP"]]$table)
```
```{r}
X %>% slice(1)
```
```{r}
ihwResX <- ihw(PValue ~ logCPM, data = X, alpha = 0.1)
ihwResY <- ihw(PValue ~ logCPM, data = X, alpha = 0.1)
ihwResY <- ihw(PValue ~ logCPM, data = Y, alpha = 0.1)
```
......@@ -409,22 +474,41 @@ table(X$qval < 0.05)
Y$qval <- adj_pvalues(ihwResY)
table(Y$qval < 0.05)
```
```{r}
X <- X %>% mutate(col = ifelse(qval < 0.01 & abs(logFC) > 2, region_type2, "not significant"))
Y <- Y %>% mutate(col = ifelse(qval < 0.01 & abs(logFC) > 2, region_type2, "not significant"))
```
```{r}
X %>% filter(is.na(region_type2))
```
```{r}
X %>% filter(is.na(region_type2)) %>% pull(region_strand) %>% table
```
```{r}
ggplot(X, aes(logFC, -log10(PValue), label = name, col = qval < 0.01 & abs(logFC) > 2)) + geom_point(alpha = 0.8) +
ggplot(X, aes(logFC, -log10(PValue),
label = label,
col = col)) +
geom_point(alpha = 0.8) +
geom_text(data = ( X %>% dplyr::filter(-log10(PValue) > 20) %>%
dplyr::mutate(name = ifelse(is.na(name) & grepl(seqnames, pattern = "^t", perl = T), as.character(seqnames), name))), # %>% dplyr::filter(abs(logFC) > 3 )
aes(logFC, -log10(PValue)),
col = "black",
size = 2,
alpha = 1) +
scale_color_manual(values = c("grey", "red"))
alpha = 1) #+
# scale_color_manual(values = c("grey", "red", "orange", ))
```
```{r}
X %>% filter(logFC > 2 & -log10(PValue) > 10)
```
```{r}
ggplot(Y, aes(logFC, -log10(PValue), label = name, col = qval < 0.01 & abs(logFC) > 2)) + geom_point(alpha = 0.8) +
ggplot(Y, aes(logFC, -log10(PValue), label = label, col = qval < 0.01 & abs(logFC) > 2)) + geom_point(alpha = 0.8) +
geom_text(data = ( Y %>% dplyr::filter(-log10(PValue) > 10) %>%
dplyr::mutate(name = ifelse(is.na(name) & grepl(seqnames, pattern = "^t", perl = T), as.character(seqnames), name))), # %>% dplyr::filter(abs(logFC) > 3 )
aes(logFC, -log10(PValue)),
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment