Commit 551ceb8a authored by Thomas Schwarzl's avatar Thomas Schwarzl
Browse files

added contrasts fit for 2 crossed factor model

parent b446dabb
......@@ -92,7 +92,8 @@ reads.
```{r}
params <- readParam(minq = 20, # minimum mapping quality
pe = 'both')
pe = 'both',
max.frag = 600,)
```
## Fragment Length
......@@ -147,7 +148,7 @@ count_windows <- function(infiles, # bam files
spacing = 20,
...
) {
windowCounts(bam.files = infiles,
x <- windowCounts(bam.files = infiles,
ext = list(
fragsizes,
NA),
......@@ -156,260 +157,342 @@ count_windows <- function(infiles, # bam files
width = width,
filter = filter,
...)
colData(x)[,"names"] <- x %>% colData %>% .[,"bam.files"] %>% get_names
colData(x)[,"sample"] <- colData(x)$names %>%
gsub(pattern = "(^.*\\/|\\_IP\\_rep\\d{1,}|\\_input\\_rep\\d{1,}|\\_1.*$)",
replacement = '',
bams) %>% factor(levels = c('WT',
'Pab1',
'T3_ProtA',
'T3_TAP') )
colData(x)[,"type"] <- colData(x)$names %>%
grepl(pattern = "IP") %>% ifelse("IP",
"input") %>%
factor(levels = c('IP',
'input'))
x
}
```
Counting reads into windows.
Counting reads into 100 nucleotide windows.
```{r}
counts <- list()
# counts[["all"]] <- csaw::windowCounts(bam.files = bams,
# ext = list(
# frag_sizes,
# NA),
# param = params,
# spacing = 20,
# width = 50,
# filter = 10)
#
#
#
# ip_counts <- windowCounts(
# bam.files = ip_files,
# ext = list(fragment_sizes_ip, NA),
# param = params,
# spacing = 20,
# width = 50,
# filter = 10)
counts[["all"]] <- count_windows(bams, fragment_sizes)
counts[["ip"]] <- count_windows(ip_files, fragment_sizes_input)
counts[["input"]] <- count_windows(input_files, fragment_sizes_ip)
counts[["all"]] <- count_windows(bams, fragment_sizes, width = 100)
counts[["ip"]] <- count_windows(ip_files, fragment_sizes_input, width = 100)
counts[["input"]] <- count_windows(input_files, fragment_sizes_ip, width = 100)
```
Preview counts
```{r}
background <- list()
background[["all"]] <- count_windows(bams, fragment_sizes, width = 500, bin = T)
background[["ip"]] <- count_windows(ip_files, fragment_sizes_input, width = 500, bin = T)
background[["input"]] <- count_windows(input_files, fragment_sizes_ip, width = 500, bin = T)
assay(counts[["all"]]) %>% head
```
```{r}
ip_regions <- resize(rowRanges(ip_counts),
width = 500,
fix = 'center')
counts[["all"]] %>% metadata
```
## Filter Windows
Keep counts average greater than 2
```{r}
# ip_region_counts <- regionCounts(bam.files = ip_files,
# param = params,
# ext = list(
# frag_sizes[
# grep(pattern = "IP",
# invert = FALSE,
# names(frag_sizes))],
# NA),
# regions = ip_regions)
counts_filtered <- lapply(counts, function(x) {
abundances <- aveLogCPM(asDGEList(x))
keep <- abundances > aveLogCPM(2, lib.size = mean(x$totals))
x[keep,]
})
```
## Library size normalisation
```{r}
#table(getWidths(ip_region_counts))
counts_norm <- lapply(counts_filtered, function(x) {
normFactors(x, se.out = T)
})
```
```{r}
counts_norm <- lapply(counts, function(x) {
normFactors(x)
})
save.image("Workspace_2CSeq.Rdata")
```
## Differentially expression analysis
```{r}
save.image("Seq_together.Rdata")
counts_filtered[["all"]] %>% colData
```
### Sample design
all
```{r}
#colData(ip_counts)
```
sample <- counts_filtered[["all"]]$sample
type <- counts_filtered[["all"]]$type
sampletypes <- paste(sample, type, sep = "_")
## Filter Windows
sample_design <- model.matrix(~0 + sample * type)
```{r}
#filtered_windows <- filterWindowsLocal(ip_counts, window_background)
sample_design
```
```{r eval = FALSE}
library(ExploreModelMatrix)
ExploreModelMatrix(sampleData = colData(counts_filtered[["all"]])[,c("sample", "type")],
designFormula = ~0 + sample * type)
```
## Differentially expression analysis
### Sample
all
ip
```{r}
samples <- factor(gsub(pattern = "(^.*\\/|\\_IP\\_rep\\d{1,}|\\_input\\_rep\\d{1,}|\\_1.*$)",
replacement = '',
bams),
levels = c('WT', 'Pab1', 'T3_ProtA', 'T3_TAP'))
samples
type <- factor(ifelse(grepl(pattern = "IP",
bams), "IP", "input"),
levels = c('IP', 'input'))
type
sampleDesign <- model.matrix(~samples + type + samples:type)
sampleDesign
samples_ip <- counts_filtered[["ip"]]$sample
sample_design_ip <- model.matrix(~samples_ip)
```
ip
```{r}
samples_ip <- factor(gsub(pattern = "(^.*\\/|\\_IP\\_rep\\d{1,}|\\_input\\_rep\\d{1,}|\\_1.*$)",
replacement = '',
ip_files),
levels = c('WT', 'Pab1', 'T3_ProtA', 'T3_TAP'))
samples_ip
sample_design
```
input
```{r}
samples_input <- factor(gsub(pattern = "(^.*\\/|\\_IP\\_rep\\d{1,}|\\_input\\_rep\\d{1,}|\\_1.*$)",
replacement = '',
ip_files),
levels = c('WT', 'Pab1', 'T3_ProtA', 'T3_TAP'))
samples_input
samples_input <- counts_filtered[["input"]]$sample
sample_design_input <- model.matrix(~samples_input)
```
Check that ip and input have right order
```{r}
stopifnot(samples_ip == samples_input)
```
### Estimate Dispersion
```{r}
sampleDesign_ip <- model.matrix(~samples_ip)
sampleDesign_input <- model.matrix(~samples_input)
y_all <- estimateDisp(asDGEList(counts_norm[["all"]]),
design = sample_design,
robust = TRUE)
```
```{r}
y_ip <- estimateDisp(asDGEList(counts_norm[["ip"]]),
design = sampleDesign_ip,
design = sample_design_ip,
robust = TRUE)
```
```{r}
y_input <- estimateDisp(asDGEList(counts_norm[["input"]]),
design = sampleDesign_input,
design = sample_design_input,
robust = TRUE)
```
```{r}
summary(y_all$trended.dispersion)
```
```{r}
#estimateDisp(asDGEList(input_local_pruned_list$T3_ProtA),
# sampleDesign[c(3,4,7:8),c(1,3)],
# robust=TRUE)
summary(y_input$trended.dispersion)
```
```{r}
summary(y_ip$trended.dispersion)
```
fit
### Fit
```{r}
fit_ip <- glmQLFit(y_ip, sampleDesign_ip, robust = TRUE)
fit_input <- glmQLFit(y_input, sampleDesign_input, robust = TRUE)
fit_all <- glmQLFit(y_all, sample_design, robust = TRUE)
fit_ip <- glmQLFit(y_ip, sample_design_ip, robust = TRUE)
fit_input <- glmQLFit(y_input, sample_design_input, robust = TRUE)
```
```{r}
summary(fit_input$var.post)
```
### QC plots
```{r}
plot_mds <- function(y, names) {
adj.counts <- cpm(y, log=TRUE)
for (top in c(1000)) {
plotMDS(adj.counts,
main=top,
#col=c("blue", "blue", "red", "red"),
labels=names,
top=top)
}
}
```
```{r}
plot_mds(y_all, names = counts_norm[["all"]]$names)
```
```{r}
# par(mfrow=c(1,2))
# o <- order(y$AveLogCPM)
# plot(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), type="l", lwd=2,
# ylim=c(0, 1), xlab=expression("Ave."~Log[2]~"CPM"),
# ylab=("Biological coefficient of variation"))
# plotQLDisp(fit)
plot_mds(y_ip, names = counts_norm[["ip"]]$names)
```
```{r}
results_ip <- results_input <- list()
results_ip[["T3_ProtA"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 1, 0))
results_input[["T3_ProtA"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 1, 0))
results_ip[["T3_TAP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 0, 1))
results_input[["T3_TAP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 0, 1))
plot_mds(y_input, names = counts_norm[["input"]]$names)
```
head(results_ip[["T3_ProtA"]]$table)
```{r}
plot_qc <- function(y, fit) {
par(mfrow=c(1,2))
o <- order(y$AveLogCPM)
plot(y$AveLogCPM[o], sqrt(y$trended.dispersion[o]), type="l", lwd=2,
ylim=c(0, 1), xlab=expression("Ave."~Log[2]~"CPM"),
ylab=("Biological coefficient of variation"))
plotQLDisp(fit)
}
```
```{r}
plot_qc(y_all, fit_all)
```
## Merge
```{r}
plot_qc(y_ip, fit_ip)
```
```{r}
merged_res_ip <- merged_res_input <- list()
merged_res_ip[["T3_ProtA"]] <- mergeResults(counts_norm[["ip"]],
results_ip[["T3_ProtA"]]$table,
tol=100,
merge.args=list(max.width=5000))
# merged_res_input[["T3_ProtA"]] <- mergeResults(counts_norm[["input"]],
# results_input[["T3_ProtA"]]$table,
# tol=100,
# merge.args=list(max.width=5000))
plot_qc(y_input, fit_input)
```
### Test for differential expression
![design.jpg]()
```{r}
results_ip <- list()
results_all <- list()
# results[["ip"]] <- list()
# results[["input"]] <- list()
# results[["all"]] <- list()
```
```{r}
head(sample_design_ip, 1)
```
```{r}
head(sample_design, 1)
```
```{r}
s <- sample_design %>% as.data.frame()
rownames(s) <- counts[["all"]]$names
s
```
```{r}
#samplePab1 - (samplePab1 + typeinput + samplePab1:typeinput)
colnames(sample_design) <- gsub(colnames(sample_design),
pattern = ":",
replacement = "_")
contrasts <- makeContrasts(Pab1 = samplePab1 - (samplePab1 + typeinput + samplePab1_typeinput), levels = sample_design)
```
```{r}
contrasts[,1]
```
```{r}
assay(counts_norm[["ip"]])
rowData(counts_norm[["ip"]])
results_all[["Pab1_IP_vs_Pab1_Input"]] <- contrasts[,"Pab1"]
rowRanges(counts_norm[["ip"]])
colData(counts_norm[["ip"]])
```
Those comparisons are not useful, because wild type does not have many
counts
```{r}
#results_ip[["T3_TAP_IP_vs_WT_IP"]] <- glmQLFTest(fit_ip, contrast=c(-1, 0, 0, 1))
#results_ip[["T3_ProtA_IP_vs_WT_IP"]] <- glmQLFTest(fit_ip, contrast=c(-1, 0, 1, 0))
#results_ip[["Pab1_IP_vs_WT_IP"]] <- glmQLFTest(fit_ip, contrast=c(-1, 1, 0, 0))
results_ip[["T3_ProtA_IP_vs_Pab1_IP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 1, 0))
results_ip[["T3_TAP_IP_vs_Pab1_IP"]] <- glmQLFTest(fit_ip, contrast=c(0, -1, 0, 1))
```
```{r}
merged_res_ip[["T3_ProtA"]]
# results[["T3_TAP_IP_vs_Input"]] <- glmQLFTest(fit_all, contrast=c(-1, 0, 0, 1))
# results[["T3_TAP_IP_vs_Pab1_IP"]] <- glmQLFTest(fit_all, contrast=c(0, -1, 0, 1))
results_all[["Pab1_IP_vs_Pab1_Input"]] <- glmQLFTest(fit_all,
contrast = contrasts[,"Pab1"] )
#results_all[["T3_ProtA_IP_vs_T3_ProtA_Input"]] <- glmQLFTest(fit_all,
# contrast = c(0, 0, -1, 1 , 0 , 0, 0, 0) )
#results_all[["T3_TAP_IP_vs_T3_TAP_Input"]] <- glmQLFTest(fit_all,
# contrast = c(0, 0, 0 , 0 , -1, 1, 0, 0) )
```
```{r}
save.image("Seq_together2.Rdata")
head(results_all[["Pab1_IP_vs_Pab1_Input"]]$table)
```
## Annotation
## Merge
```{r}
merges_res_ip <- lapply(results_ip, function(x) {
mergeResults(counts_norm[["ip"]],
x$table,
tol = 100,
merge.args = list(max.width = 5000))
})
merges_res_all <- lapply(results_all, function(x) {
mergeResults(counts_norm[["all"]],
x$table,
tol = 100,
merge.args = list(max.width = 5000))
})
```
```{r}
#save.image("Seq_together2.Rdata")
```
## Annotation
### Prepare Annotation
```{r}
ANNO <- read_tsv(file.path("annotation", "SaccUTR.sorted.bed"),#"Saccharomyces_cerevisiae.R64-1-1.84.bed.gz",
......@@ -420,6 +503,9 @@ ANNO <- read_tsv(file.path("annotation", "SaccUTR.sorted.bed"),#"Saccharomyces_c
ANNO
```
```{r}
ANNO <- ANNO %>% mutate(region_type2 = ifelse(biotype == "protein_coding" & region_type == "exon",
"pc_exon", region_type)) %>%
......@@ -431,54 +517,96 @@ ANNO <- ANNO %>% mutate(label = paste(name, region_type2, sep = "-"))
ANNO
```
Generate GRanges
```{r}
ANNOGR <- makeGRangesFromDataFrame(as.data.frame(ANNO),
keep.extra.columns = T,
start.field = "start",
end.field = "end",
seqnames.field = "chr",
strand.field = "strand")
start.field = "start",
end.field = "end",
seqnames.field = "chr",
strand.field = "strand")
```
```{r}
qr <- findOverlaps(rowRanges(counts_norm[["ip"]]), ANNOGR,
maxgap=-1L, minoverlap=0L,
type=c("any"),
select=c("first"),
ignore.strand=FALSE)
```
### Annotate
```{r}
X <- bind_cols(ANNO[qr,], as.data.frame(rowRanges(counts_norm[["ip"]])) %>% dplyr::rename(region_chr = seqnames, region_start = start, region_end = end, region_width = width, region_strand = strand), results_ip[["T3_ProtA"]]$table)
Y <- bind_cols(ANNO[qr,], as.data.frame(rowRanges(counts_norm[["ip"]])) %>% dplyr::rename(region_chr = seqnames, region_start = start, region_end = end, region_width = width, region_strand = strand), results_ip[["T3_TAP"]]$table)
do_annotate <- function(res, cou) {
qr <- findOverlaps(rowRanges(cou),
ANNOGR,
maxgap = -1L,
minoverlap = 0L,
type = c("any"),
select = c("first"),
ignore.strand = FALSE)
X <- bind_cols(ANNO[qr,],
as.data.frame(rowRanges(cou)) %>%
dplyr::rename(region_chr = seqnames,
region_start = start,
region_end = end,
region_width = width,
region_strand = strand),
res$table)
ihwResX <- ihw(PValue ~ logCPM, data = X, alpha = 0.1)
X$qval <- adj_pvalues(ihwResX)
X %>% mutate(sig = qval < 0.01 & abs(logFC) > 1) %>%
mutate(enr = qval < 0.01 & logFC > 1) %>%
mutate(col = ifelse(sig, region_type2, "not significant"))
#table(X$qval < 0.05)
}
```
```{r}
X %>% slice(1)
#res <- results_all[[1]]
#cou <- counts_norm[["all"]]
```
```{r}
ihwResX <- ihw(PValue ~ logCPM, data = X, alpha = 0.1)
ihwResY <- ihw(PValue ~ logCPM, data = Y, alpha = 0.1)
results_ip_anno <- lapply(results_ip, function(x) {
do_annotate(x, counts_norm[["ip"]])
})
results_all_anno <- lapply(results_all, function(x) {
do_annotate(x, counts_norm[["all"]])
})
```
## Plot
```{r}
X$qval <- adj_pvalues(ihwResX)
table(X$qval < 0.05)
Xp <- lapply(results_all_anno, function(x) {
x %>% filter(!is.na(chr))
})
```
```{r}
Y$qval <- adj_pvalues(ihwResY)
table(Y$qval < 0.05)
lapply(Xp, function(x) table(x$enr))
```
```{r}
X <- X %>% mutate(col = ifelse(qval < 0.01 & abs(logFC) > 2, region_type2, "not significant"))
Y <- Y %>% mutate(col = ifelse(qval < 0.01 & abs(logFC) > 2, region_type2, "not significant"))
Xp_table <- lapply(Xp, function(x) {
Xsum <- x %>% filter(qval < 0.01 & logFC > 2) %>%
group_by(name, biotype, region_type2) %>% summarize(count = n()) %>%
ungroup()
Xsum %>% group_by(region_type2) %>%
summarize(sum = n())
})
Xp_table
```
```{r}
X %>% filter(is.na(region_type2))
```
......@@ -487,6 +615,31 @@ X %>% filter(is.na(region_type2)) %>% pull(region_strand) %>% table
```
```{r}
ggplot(Xp$Pab1_IP_vs_Pab1_Input, aes(logFC, -log10(PValue),
label = label,
col = col)) +
geom_point(alpha = 0.5)
```
```{r}
```
```{r}
ggplot(X, aes(logFC, -log10(PValue),
......@@ -529,6 +682,10 @@ library(org.Sc.sgd.db)
```{r}
anno <- detailRanges(out.ranges, orgdb=org.Sc.sgd.db,
txdb=TxDb.Scerevisiae.UCSC.sacCer3.sgdGene, name.field = "GENENAME", key.field = "SGD")
......@@ -670,15 +827,15 @@ summary(rowSums(assay(ip_local_pruned_list$T3_ProtA)))
summary(rowSums(assay(ip_local_pruned_list$T3_TAP)))
samples <- factor(gsub(pattern="(^.*\\/|\\_IP\\_rep\\d{1,}|\\_1.*$)",replacement='',ip_files),levels=c('WT','Pab1','T3_ProtA','T3_TAP'))
samples
sampleDesign <- model.matrix(~samples)
sampleDesign
sample_design <- model.matrix(~samples)
sample_design