── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.4 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Attaching package: 'magrittr'
The following object is masked from 'package:purrr':
set_names
The following object is masked from 'package:tidyr':
extract
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:lubridate':
intersect, setdiff, union
The following objects are masked from 'package:dplyr':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind,
colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
table, tapply, union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Attaching package: 'S4Vectors'
The following objects are masked from 'package:lubridate':
second, second<-
The following objects are masked from 'package:dplyr':
first, rename
The following object is masked from 'package:tidyr':
expand
The following objects are masked from 'package:base':
expand.grid, I, unname
Loading required package: IRanges
Attaching package: 'IRanges'
The following object is masked from 'package:lubridate':
%within%
The following objects are masked from 'package:dplyr':
collapse, desc, slice
The following object is masked from 'package:purrr':
reduce
Loading required package: GenomeInfoDb
Attaching package: 'GenomicRanges'
The following object is masked from 'package:magrittr':
subtract
Loading required package: grid
Loading required package: Biostrings
Loading required package: XVector
Attaching package: 'XVector'
The following object is masked from 'package:purrr':
compact
Attaching package: 'Biostrings'
The following object is masked from 'package:grid':
pattern
The following object is masked from 'package:base':
strsplit
Attaching package: 'gridExtra'
The following object is masked from 'package:BiocGenerics':
combine
The following object is masked from 'package:dplyr':
combine
Attaching package: 'data.table'
The following object is masked from 'package:GenomicRanges':
shift
The following object is masked from 'package:IRanges':
shift
The following objects are masked from 'package:S4Vectors':
first, second
The following objects are masked from 'package:lubridate':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:dplyr':
between, first, last
The following object is masked from 'package:purrr':
transpose
Registered S3 method overwritten by 'gplots':
method from
reorder.factor gdata
ChIPseeker v1.34.1 For help: https://guangchuangyu.github.io/software/ChIPseeker
If you use ChIPseeker in published research, please cite:
Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring epigenomic datasets by ChIPseeker. Current Protocols 2022, 2(10): e585
Loading required package: graph
Attaching package: 'graph'
The following object is masked from 'package:Biostrings':
complement
The following object is masked from 'package:stringr':
boundary
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with
'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Loading required package: GO.db
Loading required package: AnnotationDbi
Attaching package: 'AnnotationDbi'
The following object is masked from 'package:dplyr':
select
Loading required package: SparseM
Attaching package: 'SparseM'
The following object is masked from 'package:base':
backsolve
groupGOTerms: GOBPTerm, GOMFTerm, GOCCTerm environments built.
Attaching package: 'topGO'
The following object is masked from 'package:grid':
depth
The following object is masked from 'package:IRanges':
members
Loading required package: GenomicFeatures
Attaching package: 'GenomicFeatures'
The following object is masked from 'package:topGO':
genes
# Load the analysis configuration and the combined haplotype test (CHT) results.
config <- load_config()
# CHT results for every antibody/time-point combination, concatenated.
# Fixed: use FALSE, not the reassignable shorthand F.
cht_full <- lapply(ab_tp_list, function(ab_tp) load_cht_results(ab_tp, remove_chr = FALSE)) %>% bind_rows()
# Drop sex chromosomes and the mitochondrial genome.
cht <- cht_full %>% filter(!TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))
# Variants with significant, strong allelic imbalance.
cht_sign <- cht %>% filter(signif_strongAI)
# Genes and their promoter regions (1 kb anchored at the gene start).
genes <- load_genes()
promoters <- resize(genes, width = 1000, fix = "start")
# Combined motif set (all TFs, peaks + alleles).
fimo <- get_full_motif_sets(cht, ab_tp_list)
# Motif matches restricted to the two alleles of each variant.
fimo_alleles <- lapply(ab_tp_list, function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)) %>% bind_rows()
# Variant IDs in chrom:pos form that overlap at least one motif.
variants_overlapping_motif <- unique(gsub("_", ":", fimo_alleles$snp_id))
# Per-condition Basenji prediction tables (AI = allelic imbalance) and the
# best-scoring variant per peak for each condition.
twi_24h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_twi.24_annot.txt")
twi_24h_best_basenji_in_peak <- take_best_Basenji_in_peak(twi_24h)
bin_68h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_bin.68_annot.txt")
bin_68h_best_basenji_in_peak <- take_best_Basenji_in_peak(bin_68h)
ctcf_68h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_ctcf.68_annot.txt")
ctcf_68h_best_basenji_in_peak <- take_best_Basenji_in_peak(ctcf_68h)
mef2_68h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_mef2.68_annot.txt")
mef2_68h_best_basenji_in_peak <- take_best_Basenji_in_peak(mef2_68h)
bin_1012h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_bin.1012_annot.txt")
bin_1012h_best_basenji_in_peak <- take_best_Basenji_in_peak(bin_1012h)
mef2_1012h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_mef2.1012_annot.txt")
mef2_1012h_best_basenji_in_peak <- take_best_Basenji_in_peak(mef2_1012h)
# Pool all conditions into one table.
all_samples <- rbind(twi_24h, bin_68h, ctcf_68h, mef2_68h, bin_1012h, mef2_1012h)
# "correct" when measured and predicted AI fall on the same side of 0.5;
# values exactly at 0.5 count as "incorrect", NA propagates (as before).
same_side <- sign(all_samples$AI - 0.5) == sign(all_samples$Basenji_AI - 0.5) &
  sign(all_samples$AI - 0.5) != 0
all_samples$correct_predict <- factor(ifelse(same_side, "correct", "incorrect"),
                                      levels = c("incorrect", "correct"))
all_samples_best_basenji_in_peak <- rbind(twi_24h_best_basenji_in_peak,
                                          bin_68h_best_basenji_in_peak,
                                          ctcf_68h_best_basenji_in_peak,
                                          mef2_68h_best_basenji_in_peak,
                                          bin_1012h_best_basenji_in_peak,
                                          mef2_1012h_best_basenji_in_peak)
same_side_best <- sign(all_samples_best_basenji_in_peak$AI - 0.5) ==
  sign(all_samples_best_basenji_in_peak$Basenji_AI - 0.5) &
  sign(all_samples_best_basenji_in_peak$AI - 0.5) != 0
all_samples_best_basenji_in_peak$correct_predict <- factor(
  ifelse(same_side_best, "correct", "incorrect"),
  levels = c("incorrect", "correct")
)
all_samples_best_basenji_in_peak_sub <- subset(all_samples_best_basenji_in_peak, significant == TRUE)
# Rank variants within each peak: rank 1 is the top Basenji call, ranks 2-5 are
# successive next-best picks excluding all earlier ones. The assigned variable
# names keep the original "2nd"/"3nd"/"4nd"/"5nd" suffix pattern.
all_samples_best_basenji_in_peak$best_variant <- 1
previously_ranked <- all_samples_best_basenji_in_peak
for (r in 2:5) {
  nxt <- take_next_best_Basenji_in_peak(all_samples, previously_ranked)
  nxt$best_variant <- r
  assign(paste0("all_samples_", r, "nd_best_basenji_in_peak"), nxt)
  previously_ranked <- rbind(previously_ranked, nxt)
}
# TRUE when the variant ID overlaps a cognate motif allele.
# Fixed: ifelse(cond, TRUE, FALSE) was redundant — %in% already returns logical.
# NOTE(review): this `motif` column is never used below; the subsets rely on the
# pre-existing `overlaps_motif` column instead — confirm which one is intended.
all_samples$motif <- all_samples$variant_ID %in% variants_overlapping_motif
# Significant variants, split by cognate-motif overlap, at each Basenji rank.
all_samples_sub_on_motif <- subset(all_samples, significant == TRUE & overlaps_motif == "overlaps_motif")
all_samples_best_basenji_in_peak_sub_on_motif <- subset(all_samples_best_basenji_in_peak, significant == TRUE & overlaps_motif == "overlaps_motif")
all_samples_2nd_best_basenji_in_peak_sub_on_motif <- subset(all_samples_2nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "overlaps_motif")
all_samples_3nd_best_basenji_in_peak_sub_on_motif <- subset(all_samples_3nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "overlaps_motif")
all_samples_4nd_best_basenji_in_peak_sub_on_motif <- subset(all_samples_4nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "overlaps_motif")
all_samples_5nd_best_basenji_in_peak_sub_on_motif <- subset(all_samples_5nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "overlaps_motif")
all_samples_sub_outside_motif <- subset(all_samples, significant == TRUE & overlaps_motif == "no_overlap")
all_samples_best_basenji_in_peak_sub_outside_motif <- subset(all_samples_best_basenji_in_peak, significant == TRUE & overlaps_motif == "no_overlap")
all_samples_2nd_best_basenji_in_peak_sub_outside_motif <- subset(all_samples_2nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "no_overlap")
all_samples_3nd_best_basenji_in_peak_sub_outside_motif <- subset(all_samples_3nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "no_overlap")
all_samples_4nd_best_basenji_in_peak_sub_outside_motif <- subset(all_samples_4nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "no_overlap")
all_samples_5nd_best_basenji_in_peak_sub_outside_motif <- subset(all_samples_5nd_best_basenji_in_peak, significant == TRUE & overlaps_motif == "no_overlap")
# Fraction of variants whose predicted AI direction matches the measured one.
# Rows where either AI is NA or exactly 0.5 are not counted as concordant,
# matching the original subset()-based filtering; 0-row input yields NaN.
frac_concordant <- function(d) {
  agree <- (d$Basenji_AI > 0.5 & d$AI > 0.5) | (d$Basenji_AI < 0.5 & d$AI < 0.5)
  sum(agree, na.rm = TRUE) / nrow(d)
}
rank_labels <- c("1st", "2nd", "3rd", "4th", "5th", "all")
# Built directly as a data.frame instead of round-tripping a character matrix
# through t()/as.data.frame()/as.numeric().
success_proportion_on_motif <- data.frame(
  best_Basenji_AI = rank_labels,
  proportion_correct = vapply(
    list(all_samples_best_basenji_in_peak_sub_on_motif,
         all_samples_2nd_best_basenji_in_peak_sub_on_motif,
         all_samples_3nd_best_basenji_in_peak_sub_on_motif,
         all_samples_4nd_best_basenji_in_peak_sub_on_motif,
         all_samples_5nd_best_basenji_in_peak_sub_on_motif,
         all_samples_sub_on_motif),
    frac_concordant, numeric(1))
)
success_proportion_outside_motif <- data.frame(
  best_Basenji_AI = rank_labels,
  proportion_correct = vapply(
    list(all_samples_best_basenji_in_peak_sub_outside_motif,
         all_samples_2nd_best_basenji_in_peak_sub_outside_motif,
         all_samples_3nd_best_basenji_in_peak_sub_outside_motif,
         all_samples_4nd_best_basenji_in_peak_sub_outside_motif,
         all_samples_5nd_best_basenji_in_peak_sub_outside_motif,
         all_samples_sub_outside_motif),
    frac_concordant, numeric(1))
)
# Null model: shuffle variant ranks within each peak n_shuffles times and record
# the fraction of correct direction predictions at each rank.
n_shuffles <- 1000
rank_labels <- c("1st", "2nd", "3rd", "4th", "5th", "all")
# Fixed: preallocate a (rank x iteration) matrix instead of growing a
# data.frame with cbind() inside the loop (quadratic copying).
background_mat <- matrix(NA_real_, nrow = length(rank_labels), ncol = n_shuffles,
                         dimnames = list(rank_labels, NULL))
# Proportion of "correct" calls; NA predictions count in the denominator only,
# matching the original nrow(subset(...)) counting.
prop_correct <- function(d) sum(d$correct_predict == "correct", na.rm = TRUE) / nrow(d)
for (i in seq_len(n_shuffles)) {
  # Assign a random rank to every variant within its peak.
  background_variant_shuff <- all_samples %>%
    select(variant_ID, peak_ID, significant, correct_predict) %>%
    group_by(peak_ID) %>%
    mutate(rank = sample(row_number())) %>%
    ungroup()
  per_rank <- vapply(1:5, function(r) {
    prop_correct(subset(background_variant_shuff, rank == r & significant == TRUE))
  }, numeric(1))
  # Last row pools all shuffled variants (significant or not), as before.
  background_mat[, i] <- c(per_rank, prop_correct(background_variant_shuff))
}
background_success_proportion_df <- as.data.frame(background_mat)
# Mean and SD of the null distribution, per rank.
background_success_proportion_summary <- data.frame(
  background_mean = rowMeans(background_success_proportion_df),
  background_std = apply(background_success_proportion_df, 1, sd, na.rm = TRUE)
)
background_success_proportion_summary$best_Basenji_AI <- rownames(background_success_proportion_summary)
# Keep ranks 1-5 only for plotting (drop the pooled "all" row).
success_proportion_on_motif <- success_proportion_on_motif[1:5, ]
success_proportion_outside_motif <- success_proportion_outside_motif[1:5, ]
background_success_proportion_summary <- background_success_proportion_summary[1:5, ]
# Proportion of correct AI-direction predictions by variant rank: on-motif
# variants in orange, off-motif in dark grey, shuffled-rank background as a
# grey line with a +/- 2 SD ribbon, and the 0.5 chance level dashed in red.
rank_mapping <- aes(x = best_Basenji_AI, y = proportion_correct, group = 1)
p <- ggplot() +
  geom_point(data = success_proportion_on_motif, mapping = rank_mapping,
             size = 3, colour = "#FFA736") +
  geom_line(data = success_proportion_on_motif, mapping = rank_mapping,
            colour = "#FFA736") +
  geom_point(data = success_proportion_outside_motif, mapping = rank_mapping,
             size = 3, colour = "grey15") +
  geom_line(data = success_proportion_outside_motif, mapping = rank_mapping,
            colour = "grey15") +
  geom_line(data = background_success_proportion_summary,
            aes(x = best_Basenji_AI, y = background_mean, group = 1),
            color = "grey60", linewidth = 1) +
  geom_ribbon(data = background_success_proportion_summary,
              aes(x = best_Basenji_AI, y = background_mean,
                  ymin = background_mean - background_std * 2,
                  ymax = background_mean + background_std * 2, group = 1),
              fill = "grey70", alpha = .2) +
  ylim(0.47, 1) +
  geom_hline(yintercept = 0.5, colour = "#C92B27", linetype = "dashed") +
  geom_text(data = success_proportion_on_motif,
            aes(x = best_Basenji_AI, y = proportion_correct,
                label = round(proportion_correct, 3)),
            colour = "grey15", fontface = 2, size = 4, vjust = -2) +
  geom_text(data = success_proportion_outside_motif,
            aes(x = best_Basenji_AI, y = proportion_correct,
                label = round(proportion_correct, 3)),
            colour = "grey15", fontface = 2, size = 4, vjust = -2) +
  labs(x = "Best variant order (Basenji AI)",
       y = "Proportion of correct predictions (AI direction)") +
  theme_bw() +
  theme(panel.grid = element_line(colour = "grey80", linewidth = 1),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 12),
        plot.title = element_text(size = 12),
        panel.grid.minor = element_line(linewidth = 0.25),
        panel.grid.major = element_line(linewidth = 0.5),
        legend.position = "none")
p
# Label helper for stat_summary(): places the group's sample size as a text
# label at a fixed height (y = 0.025) near the x axis.
# Tweak the y constant to reposition the labels.
give.n <- function(x) {
  c(y = 0.025, label = length(x))
}
# FigS7B: distribution of absolute Basenji pAI, split by cognate-motif overlap.
all_samples$overlaps_motif <- factor(all_samples$overlaps_motif, levels = c("overlaps_motif", "no_overlap"))
# Keep every variant in a peak that contains at least one significant variant.
all_samples_sign_peak <- subset(all_samples, peak_ID %in% unique(subset(all_samples, significant == TRUE)$peak_ID))
p <- ggplot(all_samples_sign_peak, aes(x = overlaps_motif, y = Basenji_abs_AI, fill = overlaps_motif)) +
  geom_violin(width = 1.15) +
  geom_boxplot(width = 0.02, outlier.shape = NA, fill = "white", alpha = 0.75) +
  # NOTE(review): ylim() removes rows outside [0, 0.01] BEFORE stats are
  # computed (hence the "Removed ... rows" warnings); use
  # coord_cartesian(ylim = c(0, 0.010)) instead if the violins/boxes should be
  # based on the full data.
  ylim(0, 0.010) +
  xlab("Variant overlaps cognate motif") +
  ylab("Basenji absolute pAI") +
  # Fixed: dropped the deprecated `fun.y` argument (ggplot2 >= 3.3 warns and
  # ignores it); `fun.data = give.n` alone supplies label text and position.
  stat_summary(fun.data = give.n, geom = "text") +
  scale_fill_manual(values = c("#FFA736", "grey70")) +
  theme_bw() +
  theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 12)) +
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12)) +
  theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5)) +
  theme(legend.position = "none")
Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
ℹ Please use the `fun` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
Warning: Removed 7496 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 7496 rows containing non-finite values (`stat_boxplot()`).
Warning: Removed 7496 rows containing non-finite values (`stat_summary()`).
Warning: `position_dodge()` requires non-overlapping x intervals
Warning: Removed 2 rows containing missing values (`geom_text()`).
# Save FigS7B (absolute pAI distribution by motif overlap).
# Fixed: paste0() with a single literal was redundant; pass the name directly.
outf <- file.path(outdir_fig_suppl, "FigS7B_Distribution_of_abs_AI_motif_overlap.pdf")
ggsave(outf, p, width = 6, height = 4)
Warning: Removed 7496 rows containing non-finite values (`stat_ydensity()`).
Warning: Removed 7496 rows containing non-finite values (`stat_boxplot()`).
Warning: Removed 7496 rows containing non-finite values (`stat_summary()`).
Warning: `position_dodge()` requires non-overlapping x intervals
Warning: Removed 2 rows containing missing values (`geom_text()`).
Wilcoxon pvalue: 0
# Significant variants at each Basenji rank (1st-5th best per peak).
all_samples_best_basenji_sub <- subset(all_samples_best_basenji_in_peak, significant == TRUE)
all_samples_2nd_best_basenji_sub <- subset(all_samples_2nd_best_basenji_in_peak, significant == TRUE)
all_samples_3nd_best_basenji_sub <- subset(all_samples_3nd_best_basenji_in_peak, significant == TRUE)
all_samples_4nd_best_basenji_sub <- subset(all_samples_4nd_best_basenji_in_peak, significant == TRUE)
all_samples_5nd_best_basenji_sub <- subset(all_samples_5nd_best_basenji_in_peak, significant == TRUE)
all_samples_rank_variants <- rbind(all_samples_best_basenji_sub,
                                   all_samples_2nd_best_basenji_sub,
                                   all_samples_3nd_best_basenji_sub,
                                   all_samples_4nd_best_basenji_sub,
                                   all_samples_5nd_best_basenji_sub)
# Per-rank fraction of variants that do / do not overlap a cognate motif,
# in long form: one row per (rank, overlap category) with its ratio.
percentages <- all_samples_rank_variants[, c("best_variant", "overlaps_motif")] %>%
  table() %>%
  data.frame() %>%
  pivot_wider(names_from = overlaps_motif, values_from = Freq) %>%
  mutate(tot = no_overlap + overlaps_motif,
         no_overlap = no_overlap / tot,
         overlaps_motif = overlaps_motif / tot) %>%
  select(best_variant, no_overlap, overlaps_motif) %>%
  pivot_longer(cols = c(no_overlap, overlaps_motif),
               names_to = "overlaps_motif", values_to = "ratio")
# Stacked bar chart of the overlap fractions per rank, labelled in percent.
p <- ggplot(percentages, aes(x = best_variant, y = ratio, fill = overlaps_motif)) +
  geom_col(position = "fill") +
  labs(x = "Best variant order (Basenji AI)",
       y = "Proportion of variants overlapping cognate motif") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(values = c("grey70", "#FFA736")) +
  theme_bw() +
  geom_text(aes(label = paste0(round(ratio * 100, 1), "%")),
            position = position_fill(vjust = 0.5), size = 5) +
  theme(panel.grid = element_line(colour = "grey80", linewidth = 1),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 12),
        plot.title = element_text(size = 12),
        panel.grid.minor = element_line(linewidth = 0.25),
        panel.grid.major = element_line(linewidth = 0.5),
        legend.position = "none")
p
# Basenji saturation-mutagenesis prediction table, one row per variant/condition.
saturation_scores_predictions = read.table("/g/furlong/project/103_Basenji/Mattia/analysis/saturation_scores/Basenji_DataTable_predictions.txt", header=TRUE)
# Recode Basenji_predict labels: "partially" -> "partial_prediction" (done first
# so its "1"/"0"-free result survives the next substitutions), then "1" ->
# "full_prediction", then "0" -> "no_prediction".
# NOTE(review): gsub() replaces substrings, so this assumes the column holds
# exactly the values "0", "1", "partially" -- confirm against the input file.
saturation_scores_predictions$Basenji_predict = gsub("0", "no_prediction", gsub("1", "full_prediction", gsub("partially", "partial_prediction", saturation_scores_predictions$Basenji_predict)))
saturation_scores_predictions$Basenji_predict = factor(saturation_scores_predictions$Basenji_predict, levels=c("full_prediction", "partial_prediction", "no_prediction"))
# Classify each variant by the motif it falls in: the TF's own motif wins over a
# cofactor motif; unpredicted variants are kept as their own category.
saturation_scores_predictions$motif_on_variant = ifelse(saturation_scores_predictions$variant_in_self_motif == 1, "self_motif", ifelse(saturation_scores_predictions$variant_in_other_motif == 1, "cofactor_motif", ifelse(saturation_scores_predictions$Basenji_predict == "no_prediction", "no_prediction", "no_motif")))
saturation_scores_predictions$motif_on_variant = factor(saturation_scores_predictions$motif_on_variant, levels=c("self_motif", "cofactor_motif","no_motif", "no_prediction"))
# Fix the condition display order (TF.timepoint).
saturation_scores_predictions$condition = factor(saturation_scores_predictions$condition, levels=c("twi.24", "ctcf.68", "mef2.68", "mef2.1012", "bin.68", "bin.1012"))
# Same classification at the prediction level, with a combined
# "self_and_cofactor" category when both motif flags are set.
saturation_scores_predictions$motifs_predictions = ifelse(saturation_scores_predictions$self_motif == 1 & saturation_scores_predictions$cofactor_motif == 1, "self_and_cofactor", ifelse(saturation_scores_predictions$self_motif == 1, "self_motif", ifelse(saturation_scores_predictions$cofactor_motif == 1, "cofactor_motif", ifelse(saturation_scores_predictions$Basenji_predict == "no_prediction", "no_prediction" ,"no_motif"))))
saturation_scores_predictions$motifs_predictions = factor(saturation_scores_predictions$motifs_predictions, levels=c("self_and_cofactor", "self_motif", "cofactor_motif", "no_motif", "no_prediction"))
# Stacked count barplot per condition, coloured by predicted motif category.
# (The console notes a fill scale is replaced: plot_counts_barplot already sets
# one and scale_fill_manual overrides it -- presumably intentional.)
p = plot_counts_barplot(saturation_scores_predictions, "condition", "motifs_predictions") +
scale_fill_manual(values = c("#339024", "#FF2341", "#FFA736", "grey70", "grey15")) +
geom_text(aes(label=counts), position = position_stack(vjust = 0.5), colour="white") +
labs(fill="predicted_motifs")
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
# Re-load the per-condition Basenji tables and rebuild the pooled objects
# (this re-executes the earlier loading section so the figures below start from
# a fresh all_samples without the extra columns added in between).
twi_24h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_twi.24_annot.txt")
twi_24h_best_basenji_in_peak <- take_best_Basenji_in_peak(twi_24h)
bin_68h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_bin.68_annot.txt")
bin_68h_best_basenji_in_peak <- take_best_Basenji_in_peak(bin_68h)
ctcf_68h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_ctcf.68_annot.txt")
ctcf_68h_best_basenji_in_peak <- take_best_Basenji_in_peak(ctcf_68h)
mef2_68h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_mef2.68_annot.txt")
mef2_68h_best_basenji_in_peak <- take_best_Basenji_in_peak(mef2_68h)
bin_1012h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_bin.1012_annot.txt")
bin_1012h_best_basenji_in_peak <- take_best_Basenji_in_peak(bin_1012h)
mef2_1012h <- read_input("/g/furlong/project/103_Basenji/Mattia/analysis/correlations/Basenji_scores_and_allelic_imbalance_mef2.1012_annot.txt")
mef2_1012h_best_basenji_in_peak <- take_best_Basenji_in_peak(mef2_1012h)
all_samples <- rbind(twi_24h, bin_68h, ctcf_68h, mef2_68h, bin_1012h, mef2_1012h)
# "correct" when measured and predicted AI fall on the same side of 0.5;
# values exactly at 0.5 count as "incorrect", NA propagates (as before).
same_direction <- sign(all_samples$AI - 0.5) == sign(all_samples$Basenji_AI - 0.5) &
  sign(all_samples$AI - 0.5) != 0
all_samples$correct_predict <- factor(ifelse(same_direction, "correct", "incorrect"),
                                      levels = c("incorrect", "correct"))
all_samples_best_basenji_in_peak <- rbind(twi_24h_best_basenji_in_peak,
                                          bin_68h_best_basenji_in_peak,
                                          ctcf_68h_best_basenji_in_peak,
                                          mef2_68h_best_basenji_in_peak,
                                          bin_1012h_best_basenji_in_peak,
                                          mef2_1012h_best_basenji_in_peak)
same_direction_best <- sign(all_samples_best_basenji_in_peak$AI - 0.5) ==
  sign(all_samples_best_basenji_in_peak$Basenji_AI - 0.5) &
  sign(all_samples_best_basenji_in_peak$AI - 0.5) != 0
all_samples_best_basenji_in_peak$correct_predict <- factor(
  ifelse(same_direction_best, "correct", "incorrect"),
  levels = c("incorrect", "correct")
)
all_samples_best_basenji_in_peak_sub <- subset(all_samples_best_basenji_in_peak, significant == TRUE)
# Rank variants within each peak (1 = top Basenji call, 2-5 = successive
# next-best picks excluding earlier ones). Variable names keep the original
# "2nd"/"3nd"/"4nd"/"5nd" suffix pattern used elsewhere in the script.
all_samples_best_basenji_in_peak$best_variant <- 1
ranked_so_far <- all_samples_best_basenji_in_peak
for (r in 2:5) {
  nxt <- take_next_best_Basenji_in_peak(all_samples, ranked_so_far)
  nxt$best_variant <- r
  assign(paste0("all_samples_", r, "nd_best_basenji_in_peak"), nxt)
  ranked_so_far <- rbind(ranked_so_far, nxt)
}
# Significant variants overall and at each Basenji rank.
all_samples_sub <- subset(all_samples, significant == TRUE)
all_samples_best_basenji_in_peak_sub <- subset(all_samples_best_basenji_in_peak, significant == TRUE)
all_samples_2nd_best_basenji_in_peak_sub <- subset(all_samples_2nd_best_basenji_in_peak, significant == TRUE)
all_samples_3nd_best_basenji_in_peak_sub <- subset(all_samples_3nd_best_basenji_in_peak, significant == TRUE)
all_samples_4nd_best_basenji_in_peak_sub <- subset(all_samples_4nd_best_basenji_in_peak, significant == TRUE)
all_samples_5nd_best_basenji_in_peak_sub <- subset(all_samples_5nd_best_basenji_in_peak, significant == TRUE)
# Proportion of variants whose predicted AI direction agrees with the measured
# one, per rank plus a pooled "all" row. Built directly as a data.frame instead
# of round-tripping a character matrix through t()/as.data.frame()/as.numeric().
# NA or exactly-0.5 values count as non-concordant, as in the original subsets.
success_proportion <- data.frame(
  best_Basenji_AI = c("1st", "2nd", "3rd", "4th", "5th", "all"),
  proportion_correct = vapply(
    list(all_samples_best_basenji_in_peak_sub,
         all_samples_2nd_best_basenji_in_peak_sub,
         all_samples_3nd_best_basenji_in_peak_sub,
         all_samples_4nd_best_basenji_in_peak_sub,
         all_samples_5nd_best_basenji_in_peak_sub,
         all_samples_sub),
    function(d) {
      agree <- (d$Basenji_AI > 0.5 & d$AI > 0.5) | (d$Basenji_AI < 0.5 & d$AI < 0.5)
      sum(agree, na.rm = TRUE) / nrow(d)
    },
    numeric(1))
)
# FigS7F: prediction correctness for strongly-predicted (|pAI| > 0.1)
# significant best-in-peak variants, stratified two ways.
all_samples_best_basenji_in_peak_sub <- subset(all_samples_best_basenji_in_peak, significant == TRUE & (Basenji_abs_AI > 0.1))
p <- plot_counts_barplot(all_samples_best_basenji_in_peak_sub, "TSS", "correct_predict")
p
outf <- file.path(outdir_fig_suppl, "FigS7F_basenji_predictions_by_TSS.pdf")
ggsave(outf, p, width = 3, height = 3)
# BUG FIX: the "by_peak_overlap" PDF was saved before `p` was recomputed, so it
# contained a second copy of the TSS plot. Build the motif-overlap plot first,
# then save it. NOTE(review): the file name says "peak_overlap" while the
# stratifier plotted is `overlaps_motif` -- confirm the intended variable.
p <- plot_counts_barplot(all_samples_best_basenji_in_peak_sub, "overlaps_motif", "correct_predict")
p
outf <- file.path(outdir_fig_suppl, "FigS7F_basenji_predictions_by_peak_overlap.pdf")
ggsave(outf, p, width = 3, height = 3)