Contents

1 Setup and data

source("../utils/utils.R")
   ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
   ✔ dplyr     1.1.2     ✔ readr     2.1.4
   ✔ forcats   1.0.0     ✔ stringr   1.5.0
   ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
   ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
   ✔ purrr     1.0.1     
   ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
   ✖ dplyr::filter() masks stats::filter()
   ✖ dplyr::lag()    masks stats::lag()
   ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
   
   Attaching package: 'magrittr'
   
   
   The following object is masked from 'package:purrr':
   
       set_names
   
   
   The following object is masked from 'package:tidyr':
   
       extract
   
   
   Loading required package: GenomicRanges
   
   Loading required package: stats4
   
   Loading required package: BiocGenerics
   
   
   Attaching package: 'BiocGenerics'
   
   
   The following objects are masked from 'package:lubridate':
   
       intersect, setdiff, union
   
   
   The following objects are masked from 'package:dplyr':
   
       combine, intersect, setdiff, union
   
   
   The following objects are masked from 'package:stats':
   
       IQR, mad, sd, var, xtabs
   
   
   The following objects are masked from 'package:base':
   
       anyDuplicated, aperm, append, as.data.frame, basename, cbind,
       colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
       get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
       match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
       Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
       table, tapply, union, unique, unsplit, which.max, which.min
   
   
   Loading required package: S4Vectors
   
   
   Attaching package: 'S4Vectors'
   
   
   The following objects are masked from 'package:lubridate':
   
       second, second<-
   
   
   The following objects are masked from 'package:dplyr':
   
       first, rename
   
   
   The following object is masked from 'package:tidyr':
   
       expand
   
   
   The following objects are masked from 'package:base':
   
       expand.grid, I, unname
   
   
   Loading required package: IRanges
   
   
   Attaching package: 'IRanges'
   
   
   The following object is masked from 'package:lubridate':
   
       %within%
   
   
   The following objects are masked from 'package:dplyr':
   
       collapse, desc, slice
   
   
   The following object is masked from 'package:purrr':
   
       reduce
   
   
   Loading required package: GenomeInfoDb
   
   
   Attaching package: 'GenomicRanges'
   
   
   The following object is masked from 'package:magrittr':
   
       subtract
   
   
   Loading required package: grid
   
   Loading required package: Biostrings
   
   Loading required package: XVector
   
   
   Attaching package: 'XVector'
   
   
   The following object is masked from 'package:purrr':
   
       compact
   
   
   
   Attaching package: 'Biostrings'
   
   
   The following object is masked from 'package:grid':
   
       pattern
   
   
   The following object is masked from 'package:base':
   
       strsplit
   
   
   
   Attaching package: 'gridExtra'
   
   
   The following object is masked from 'package:BiocGenerics':
   
       combine
   
   
   The following object is masked from 'package:dplyr':
   
       combine
   
   
   
   Attaching package: 'data.table'
   
   
   The following object is masked from 'package:GenomicRanges':
   
       shift
   
   
   The following object is masked from 'package:IRanges':
   
       shift
   
   
   The following objects are masked from 'package:S4Vectors':
   
       first, second
   
   
   The following objects are masked from 'package:lubridate':
   
       hour, isoweek, mday, minute, month, quarter, second, wday, week,
       yday, year
   
   
   The following objects are masked from 'package:dplyr':
   
       between, first, last
   
   
   The following object is masked from 'package:purrr':
   
       transpose
   
   
   
   
   Registered S3 method overwritten by 'gplots':
     method         from 
     reorder.factor gdata
   
   ChIPseeker v1.34.1  For help: https://guangchuangyu.github.io/software/ChIPseeker
   
   If you use ChIPseeker in published research, please cite:
   Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring epigenomic datasets by ChIPseeker. Current Protocols 2022, 2(10): e585
   
   Loading required package: graph
   
   
   Attaching package: 'graph'
   
   
   The following object is masked from 'package:Biostrings':
   
       complement
   
   
   The following object is masked from 'package:stringr':
   
       boundary
   
   
   Loading required package: Biobase
   
   Welcome to Bioconductor
   
       Vignettes contain introductory material; view with
       'browseVignettes()'. To cite Bioconductor, see
       'citation("Biobase")', and for packages 'citation("pkgname")'.
   
   
   Loading required package: GO.db
   
   Loading required package: AnnotationDbi
   
   
   Attaching package: 'AnnotationDbi'
   
   
   The following object is masked from 'package:dplyr':
   
       select
   
   
   Loading required package: SparseM
   
   
   Attaching package: 'SparseM'
   
   
   The following object is masked from 'package:base':
   
       backsolve
   
   
   
   groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
   
   
   Attaching package: 'topGO'
   
   
   The following object is masked from 'package:grid':
   
       depth
   
   
   The following object is masked from 'package:IRanges':
   
       members
   
   
   Loading required package: GenomicFeatures
   
   
   Attaching package: 'GenomicFeatures'
   
   
   The following object is masked from 'package:topGO':
   
       genes
# Project configuration.
# NOTE(review): ab_tp_list (used below) and outdir_fig_main (used by the figure
# code) are not defined in this document; presumably they come from utils.R or
# load_config() -- confirm.
config <- load_config()

# load CHT results: one table per entry of ab_tp_list, stacked row-wise
cht_full <- lapply(
  ab_tp_list,
  # FALSE spelled out: `F` is an ordinary variable that can be reassigned
  function(ab_tp) load_cht_results(ab_tp, remove_chr = FALSE)
) %>%
  bind_rows()
# keep autosomal variants only (drop chrX, chrY and chrM)
cht <- cht_full %>% filter(!TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))
# rows flagged by the signif_strongAI column
cht_sign <- cht %>% filter(signif_strongAI)

# genes and promoters
genes <- load_genes()
# promoters are 1000-wide ranges anchored at the start of each gene range
promoters <- resize(genes, width = 1000, fix = "start")

# combined motif set (all TFs, peaks + alleles)
fimo <- get_full_motif_sets(cht, ab_tp_list)
# only alleles
fimo_alleles <- lapply(
  ab_tp_list,
  function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)
) %>%
  bind_rows()

2 Figure 5B

# Figure 5B: distribution of per-sample Pearson correlations from the Basenji
# accuracy log, one box per dataset group.
samples_correlations <- read.table(
  "/g/furlong/project/103_Basenji/models/drosophila_l131k_augmented/acc_log.txt",
  header = TRUE
)
# Fix the order of the groups on the x axis (and of the fill colours).
samples_correlations$description <- factor(
  samples_correlations$description,
  levels = c("F1_Chip-seq", "REMAP_ChIP-seq", "DHS")
)

p <- ggplot(samples_correlations, aes(x = description, y = pearsonr, fill = description)) +
  geom_jitter(colour = "grey50") +
  # outlier.size = 0: the individual points are already drawn by geom_jitter
  geom_boxplot(alpha = 0.4, width = 0.4, outlier.size = 0, colour = "grey15") +
  ylim(0, 1) +
  xlab("") +
  ylab("Pearson r correlation") +
  scale_fill_manual(values = c("#33B250", "#0066CC", "#CC4741")) +
  theme_bw() +
  theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 12)) +
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12)) +
  theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5)) +
  # "none" is the documented value for hiding the legend (was the string "NA",
  # which is not a valid position and is not what the other figures use)
  theme(legend.position = "none")

p

outf <- file.path(outdir_fig_main, "Fig5B_pearson_correlation_Basenji.pdf")
ggsave(outf, p, width = 8, height = 6)

3 Figure 5C

# Figure 5C: fraction of significant variants whose AI direction is predicted
# correctly by Basenji, for the 1st..5th best-scoring variant of each peak,
# compared with a background obtained by randomly ranking variants within peaks.

# ---- Load Basenji scores and measured allelic imbalance per experiment ----
basenji_correlations_dir <- "/g/furlong/project/103_Basenji/Mattia/analysis/correlations"
# The order fixes the row order of the stacked tables built below.
basenji_conditions <- c("twi.24", "bin.68", "ctcf.68", "mef2.68", "bin.1012", "mef2.1012")

basenji_tables <- lapply(basenji_conditions, function(cond) {
  read_input(file.path(
    basenji_correlations_dir,
    paste0("Basenji_scores_and_allelic_imbalance_", cond, "_annot.txt")
  ))
})
# presumably the top Basenji-scoring variant of each peak -- see utils.R
basenji_best_tables <- lapply(basenji_tables, take_best_Basenji_in_peak)

# ---- Direction-of-effect helpers ----
# TRUE where measured AI and Basenji AI fall on the same side of 0.5
# (values exactly at 0.5 never agree; NA inputs give NA).
ai_direction_agrees <- function(df) {
  (df$AI > 0.5 & df$Basenji_AI > 0.5) | (df$AI < 0.5 & df$Basenji_AI < 0.5)
}

# "correct"/"incorrect" factor derived from ai_direction_agrees().
label_ai_direction <- function(df) {
  factor(
    ifelse(ai_direction_agrees(df), "correct", "incorrect"),
    levels = c("incorrect", "correct")
  )
}

# Fraction of rows of df with agreeing AI direction (NaN for an empty table).
fraction_direction_correct <- function(df) {
  sum(ai_direction_agrees(df), na.rm = TRUE) / nrow(df)
}

# ---- Stack experiments ----
all_samples <- do.call(rbind, basenji_tables)
all_samples$correct_predict <- label_ai_direction(all_samples)

all_samples_best_basenji_in_peak <- do.call(rbind, basenji_best_tables)
all_samples_best_basenji_in_peak$correct_predict <- label_ai_direction(all_samples_best_basenji_in_peak)
all_samples_best_basenji_in_peak$best_variant <- 1

# ---- 2nd..5th best variant per peak ----
# Each step hands every previously selected variant to
# take_next_best_Basenji_in_peak() together with the full table.
ranked_variants <- list(all_samples_best_basenji_in_peak)
for (k in 2:5) {
  next_best <- take_next_best_Basenji_in_peak(all_samples, do.call(rbind, ranked_variants))
  next_best$best_variant <- k
  ranked_variants[[k]] <- next_best
}

# ---- Observed proportion of correct direction calls (significant AI only) ----
all_samples_sub <- subset(all_samples, significant == TRUE)
all_samples_best_basenji_in_peak_sub <- subset(all_samples_best_basenji_in_peak, significant == TRUE)
significant_ranked <- lapply(ranked_variants, function(df) subset(df, significant == TRUE))

rank_labels <- c("1st", "2nd", "3rd", "4th", "5th", "all")
# Built directly as typed columns instead of via a character matrix, so the
# proportions are never round-tripped through strings.
success_proportion <- data.frame(
  best_Basenji_AI = rank_labels,
  proportion_correct = c(
    vapply(significant_ranked, fraction_direction_correct, numeric(1)),
    fraction_direction_correct(all_samples_sub)
  ),
  stringsAsFactors = FALSE
)

# ---- Background: random ranking of variants within each peak ----
# Fixed seed so the background band (and the saved figure) is reproducible.
set.seed(1)
n_permutations <- 1000

# dplyr:: is spelled out because the package load log shows dplyr::select
# being masked by AnnotationDbi.
# NOTE(review): grouping is by peak_ID only; if peak IDs repeat across
# experiments their variants are shuffled together -- confirm this is intended.
background_variants <- all_samples %>%
  dplyr::select(variant_ID, peak_ID, significant, correct_predict) %>%
  dplyr::group_by(peak_ID)

# Row-wise flags do not change between permutations, so compute them once.
background_is_correct <- background_variants$correct_predict == "correct"
background_is_significant <- background_variants$significant == TRUE

# Preallocated result: one row per rank label, one column per permutation.
background_proportions <- matrix(
  NA_real_,
  nrow = length(rank_labels), ncol = n_permutations,
  dimnames = list(rank_labels, NULL)
)
# The pooled "all" value does not depend on the shuffle. As before it is taken
# over all variants (not only significant ones); that row is not plotted.
background_proportions["all", ] <- sum(background_is_correct, na.rm = TRUE) / length(background_is_correct)

for (i in seq_len(n_permutations)) {
  # random rank 1..n for the n variants of each peak
  background_variant_shuff <- background_variants %>%
    dplyr::mutate(rank = sample(dplyr::row_number())) %>%
    dplyr::ungroup()

  for (k in 1:5) {
    in_rank <- which(background_variant_shuff$rank == k & background_is_significant)
    background_proportions[k, i] <- sum(background_is_correct[in_rank], na.rm = TRUE) / length(in_rank)
  }
}

# na.rm on both statistics so a permutation with no significant variant at a
# given rank (0/0) cannot blank out the mean while the sd is still reported.
background_success_proportion_summary <- data.frame(
  background_mean = rowMeans(background_proportions, na.rm = TRUE),
  background_std = apply(background_proportions, 1, sd, na.rm = TRUE)
)

success_proportion <- cbind(success_proportion, background_success_proportion_summary)
# keep ranks 1st..5th; the pooled "all" row is not shown
success_proportion <- success_proportion[1:5, ]

# ---- Plot: observed proportions (orange) vs background mean +/- 2 sd (grey) ----
p <- ggplot(success_proportion, aes(x = best_Basenji_AI, y = proportion_correct, group = 1)) +
  geom_point(colour = "orange2", size = 4) +
  geom_line(colour = "orange2", linewidth = 1) +
  geom_line(aes(y = background_mean), color = "grey40", linewidth = 1) +
  geom_ribbon(
    aes(y = background_mean, ymin = background_mean - background_std * 2, ymax = background_mean + background_std * 2),
    fill = "grey30", alpha = .2
  ) +
  ylim(0.47, 1) +
  # 0.5 reference line
  geom_hline(yintercept = 0.5, colour = "#C92B27", linetype = "dashed") +
  geom_text(
    aes(x = best_Basenji_AI, y = proportion_correct, label = round(proportion_correct, 3)),
    colour = "grey20", fontface = 2, size = 4, vjust = -2
  ) +
  xlab("Best variant order (Basenji AI)") +
  ylab("Proportion of correct predictions (AI direction)") +
  theme_bw() +
  theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 12)) +
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12)) +
  theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5)) +
  theme(legend.position = "none")

p

outf <- file.path(outdir_fig_main, "Fig5C_Variants_priority_order_performance.pdf")
ggsave(outf, p, width = 6, height = 3)

4 Figure 5D

5 Figure 5E

# Figure 5E: best variant per peak, restricted to significant AI and to
# Basenji_abs_AI above 0.1, coloured by experiment.
all_samples_best_basenji_in_peak_sub <- subset(
  all_samples_best_basenji_in_peak,
  significant == TRUE & (Basenji_abs_AI > 0.1)
)
# Fixed level order so experiments and colours pair up as listed below.
all_samples_best_basenji_in_peak_sub$condition <- factor(
  all_samples_best_basenji_in_peak_sub$condition,
  levels = c("twi.24", "ctcf.68", "mef2.68", "mef2.1012", "bin.68", "bin.1012")
)

p <- plot_correlation(
  all_samples_best_basenji_in_peak_sub,
  "All experiments significant AI best Basenji score if > 0.1 AI"
) +
  geom_point(aes(colour = condition), size = 2) +
  theme(legend.position = "right") +
  scale_colour_manual(
    values = c("#E21F26", "#397FB9", "#4EAF49", "#68C3A6", "#984F9F", "#E38CBB")
  )
p
   `geom_smooth()` using formula = 'y ~ x'

# Save Figure 5E into the main-figure output directory.
outf <- file.path(outdir_fig_main, "Fig5E_correlation_strong_guesses_significant_by_experiment.pdf")
ggsave(filename = outf, plot = p, width = 7, height = 6)
   `geom_smooth()` using formula = 'y ~ x'

6 Figure 5F

# Figure 5F: uses all_samples_best_basenji_in_peak_sub as left by the Figure 5E
# code (significant AI, Basenji_abs_AI > 0.1).
# presumably counts of correct_predict per condition -- see plot_counts_barplot()
p <- plot_counts_barplot(
  all_samples_best_basenji_in_peak_sub,
  "condition",
  "correct_predict"
)
p

outf <- file.path(outdir_fig_main, "Fig5F_counts_guesses_significant_by_experiment.pdf")
ggsave(filename = outf, plot = p, width = 7, height = 6)

7 Figure 5G

# Figure 5G: classify each variant by the motif it overlaps and count the
# classes per experiment.
saturation_scores_predictions <- read.table(
  "/g/furlong/project/103_Basenji/Mattia/analysis/saturation_scores/Basenji_DataTable_predictions.txt",
  header = TRUE
)

# Priority: own motif > other (cofactor) motif > no prediction > no motif.
saturation_scores_predictions$motif_on_variant <- with(
  saturation_scores_predictions,
  ifelse(
    variant_in_self_motif == 1,
    "self_motif",
    ifelse(
      variant_in_other_motif == 1,
      "cofactor_motif",
      ifelse(Basenji_predict == "no_prediction", "no_prediction", "no_motif")
    )
  )
)
# Level order fixes the stacking order and the colour assignment below.
saturation_scores_predictions$motif_on_variant <- factor(
  saturation_scores_predictions$motif_on_variant,
  levels = c("self_motif", "cofactor_motif", "no_motif", "no_prediction")
)
saturation_scores_predictions$condition <- factor(
  saturation_scores_predictions$condition,
  levels = c("twi.24", "ctcf.68", "mef2.68", "mef2.1012", "bin.68", "bin.1012")
)

p <- plot_counts_barplot(saturation_scores_predictions, "condition", "motif_on_variant") +
  # write the count in the middle of every stacked segment
  geom_text(aes(label = counts), position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(values = c("#FF2341", "#FFA736", "grey70", "grey15")) +
  labs(fill = "variant_on_motif")
   Scale for fill is already present.
   Adding another scale for fill, which will replace the existing scale.
# Show Figure 5G, then save it into the main-figure output directory.
p

outf <- file.path(outdir_fig_main, "Fig5G_counts_guesses_significant_by_motif.pdf")
ggsave(filename = outf, plot = p, width = 7, height = 6)

8 Figure 5H

# Counts per line for the "ctcf/68" condition (the helper prints each sample
# name as it goes, see the output below).
ll_ctcf <- get_counts_per_line("ctcf/68")
   [1] "399_399_1"
   [1] "399_399_2"
   [1] "vgn_28_1"
   [1] "vgn_28_2"
   [1] "vgn_307_1"
   [1] "vgn_307_2"
   [1] "vgn_399_1"
   [1] "vgn_399_2"
   [1] "vgn_57_1"
   [1] "vgn_57_2"
   [1] "vgn_639_1"
   [1] "vgn_639_2"
   [1] "vgn_712_1"
   [1] "vgn_712_2"
   [1] "vgn_714_1"
   [1] "vgn_714_2"
   [1] "vgn_852_1"
   [1] "vgn_852_2"
   [1] "vgn_vgn_1"
   [1] "vgn_vgn_2"
# Example variant shown in Figure 5H.
vid <- "chr2R_15733144"

# Plots for this variant, built from the ctcf/68 rows of the CHT results.
l <- plot_ai_and_read_depth_for_variant(
  filter(cht, condition == "ctcf/68"),
  ll_ctcf,
  vid
)
   Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
   ℹ Please use `linewidth` instead.
   This warning is displayed once every 8 hours.
   Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
   generated.
# Figure 5H uses the second element of the returned list.
allele_ratio_plot <- l[[2]]
print(allele_ratio_plot)

outf <- file.path(outdir_fig_main, "Fig5H_example_ctcf_allele_ratios.pdf")
ggsave(filename = outf, plot = allele_ratio_plot, width = 3, height = 3)
---
title: "Figure 5"
output:
   BiocStyle::html_document:
      toc: true
      df_print: paged
      self_contained: true
      code_download: true
      highlight: tango
#bibliography: knn_ml_intro.bib
editor_options: 
  chunk_output_type: inline
---

```{r style, echo=FALSE, results="asis"}
library("knitr")

# Session-wide R options for the whole document.
options(
  digits = 5,
  width = 80,
  bitmapType = "cairo",
  stringsAsFactors = FALSE
)

golden_ratio <- (1 + sqrt(5)) / 2

# Default chunk options: show code, no caching, render figures as PNG and PDF.
knitr::opts_chunk$set(
  echo = TRUE, tidy = FALSE, include = TRUE, cache = FALSE,
  dev = c("png", "pdf"), comment = "  ", dpi = 300
)
```

# Setup and data

```{r}
# Shared helpers and package loading, then the project configuration.
# NOTE(review): ab_tp_list (used below) and outdir_fig_main (used by the figure
# chunks) are not defined in this document; presumably they come from utils.R
# or load_config() -- confirm.
source("../utils/utils.R")
config <- load_config()

# load CHT results: one table per entry of ab_tp_list, stacked row-wise
cht_full <- lapply(
  ab_tp_list,
  # FALSE spelled out: `F` is an ordinary variable that can be reassigned
  function(ab_tp) load_cht_results(ab_tp, remove_chr = FALSE)
) %>%
  bind_rows()
# keep autosomal variants only (drop chrX, chrY and chrM)
cht <- cht_full %>% filter(!TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))
# rows flagged by the signif_strongAI column
cht_sign <- cht %>% filter(signif_strongAI)

# genes and promoters
genes <- load_genes()
# promoters are 1000-wide ranges anchored at the start of each gene range
promoters <- resize(genes, width = 1000, fix = "start")

# combined motif set (all TFs, peaks + alleles)
fimo <- get_full_motif_sets(cht, ab_tp_list)
# only alleles
fimo_alleles <- lapply(
  ab_tp_list,
  function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)
) %>%
  bind_rows()

```




# Figure 5B


```{r }
# Figure 5B: distribution of per-sample Pearson correlations from the Basenji
# accuracy log, one box per dataset group.
samples_correlations <- read.table(
  "/g/furlong/project/103_Basenji/models/drosophila_l131k_augmented/acc_log.txt",
  header = TRUE
)
# Fix the order of the groups on the x axis (and of the fill colours).
samples_correlations$description <- factor(
  samples_correlations$description,
  levels = c("F1_Chip-seq", "REMAP_ChIP-seq", "DHS")
)

p <- ggplot(samples_correlations, aes(x = description, y = pearsonr, fill = description)) +
  geom_jitter(colour = "grey50") +
  # outlier.size = 0: the individual points are already drawn by geom_jitter
  geom_boxplot(alpha = 0.4, width = 0.4, outlier.size = 0, colour = "grey15") +
  ylim(0, 1) +
  xlab("") +
  ylab("Pearson r correlation") +
  scale_fill_manual(values = c("#33B250", "#0066CC", "#CC4741")) +
  theme_bw() +
  theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 12)) +
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12)) +
  theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5)) +
  # "none" is the documented value for hiding the legend (was the string "NA",
  # which is not a valid position and is not what the other figures use)
  theme(legend.position = "none")

p
outf <- file.path(outdir_fig_main, "Fig5B_pearson_correlation_Basenji.pdf")
ggsave(outf, p, width = 8, height = 6)
```


* Median Pearson r for REMAP ChIP-seq: `r median(subset(samples_correlations, description=="REMAP_ChIP-seq")$pearsonr)`
* Median Pearson r for DHS: `r median(subset(samples_correlations, description=="DHS")$pearsonr)`
* Median Pearson r for F1 ChIP-seq: `r median(subset(samples_correlations, description=="F1_Chip-seq")$pearsonr)`




# Figure 5C

```{r }
# Figure 5C: fraction of significant variants whose AI direction is predicted
# correctly by Basenji, for the 1st..5th best-scoring variant of each peak,
# compared with a background obtained by randomly ranking variants within peaks.

# ---- Load Basenji scores and measured allelic imbalance per experiment ----
basenji_correlations_dir <- "/g/furlong/project/103_Basenji/Mattia/analysis/correlations"
# The order fixes the row order of the stacked tables built below.
basenji_conditions <- c("twi.24", "bin.68", "ctcf.68", "mef2.68", "bin.1012", "mef2.1012")

basenji_tables <- lapply(basenji_conditions, function(cond) {
  read_input(file.path(
    basenji_correlations_dir,
    paste0("Basenji_scores_and_allelic_imbalance_", cond, "_annot.txt")
  ))
})
# presumably the top Basenji-scoring variant of each peak -- see utils.R
basenji_best_tables <- lapply(basenji_tables, take_best_Basenji_in_peak)

# ---- Direction-of-effect helpers ----
# TRUE where measured AI and Basenji AI fall on the same side of 0.5
# (values exactly at 0.5 never agree; NA inputs give NA).
ai_direction_agrees <- function(df) {
  (df$AI > 0.5 & df$Basenji_AI > 0.5) | (df$AI < 0.5 & df$Basenji_AI < 0.5)
}

# "correct"/"incorrect" factor derived from ai_direction_agrees().
label_ai_direction <- function(df) {
  factor(
    ifelse(ai_direction_agrees(df), "correct", "incorrect"),
    levels = c("incorrect", "correct")
  )
}

# Fraction of rows of df with agreeing AI direction (NaN for an empty table).
fraction_direction_correct <- function(df) {
  sum(ai_direction_agrees(df), na.rm = TRUE) / nrow(df)
}

# ---- Stack experiments ----
all_samples <- do.call(rbind, basenji_tables)
all_samples$correct_predict <- label_ai_direction(all_samples)

all_samples_best_basenji_in_peak <- do.call(rbind, basenji_best_tables)
all_samples_best_basenji_in_peak$correct_predict <- label_ai_direction(all_samples_best_basenji_in_peak)
all_samples_best_basenji_in_peak$best_variant <- 1

# ---- 2nd..5th best variant per peak ----
# Each step hands every previously selected variant to
# take_next_best_Basenji_in_peak() together with the full table.
ranked_variants <- list(all_samples_best_basenji_in_peak)
for (k in 2:5) {
  next_best <- take_next_best_Basenji_in_peak(all_samples, do.call(rbind, ranked_variants))
  next_best$best_variant <- k
  ranked_variants[[k]] <- next_best
}

# ---- Observed proportion of correct direction calls (significant AI only) ----
all_samples_sub <- subset(all_samples, significant == TRUE)
all_samples_best_basenji_in_peak_sub <- subset(all_samples_best_basenji_in_peak, significant == TRUE)
significant_ranked <- lapply(ranked_variants, function(df) subset(df, significant == TRUE))

rank_labels <- c("1st", "2nd", "3rd", "4th", "5th", "all")
# Built directly as typed columns instead of via a character matrix, so the
# proportions are never round-tripped through strings.
success_proportion <- data.frame(
  best_Basenji_AI = rank_labels,
  proportion_correct = c(
    vapply(significant_ranked, fraction_direction_correct, numeric(1)),
    fraction_direction_correct(all_samples_sub)
  ),
  stringsAsFactors = FALSE
)

# ---- Background: random ranking of variants within each peak ----
# Fixed seed so the background band (and the saved figure) is reproducible.
set.seed(1)
n_permutations <- 1000

# dplyr:: is spelled out because AnnotationDbi (attached via utils.R) also
# exports a select() that masks dplyr's.
# NOTE(review): grouping is by peak_ID only; if peak IDs repeat across
# experiments their variants are shuffled together -- confirm this is intended.
background_variants <- all_samples %>%
  dplyr::select(variant_ID, peak_ID, significant, correct_predict) %>%
  dplyr::group_by(peak_ID)

# Row-wise flags do not change between permutations, so compute them once.
background_is_correct <- background_variants$correct_predict == "correct"
background_is_significant <- background_variants$significant == TRUE

# Preallocated result: one row per rank label, one column per permutation.
background_proportions <- matrix(
  NA_real_,
  nrow = length(rank_labels), ncol = n_permutations,
  dimnames = list(rank_labels, NULL)
)
# The pooled "all" value does not depend on the shuffle. As before it is taken
# over all variants (not only significant ones); that row is not plotted.
background_proportions["all", ] <- sum(background_is_correct, na.rm = TRUE) / length(background_is_correct)

for (i in seq_len(n_permutations)) {
  # random rank 1..n for the n variants of each peak
  background_variant_shuff <- background_variants %>%
    dplyr::mutate(rank = sample(dplyr::row_number())) %>%
    dplyr::ungroup()

  for (k in 1:5) {
    in_rank <- which(background_variant_shuff$rank == k & background_is_significant)
    background_proportions[k, i] <- sum(background_is_correct[in_rank], na.rm = TRUE) / length(in_rank)
  }
}

# na.rm on both statistics so a permutation with no significant variant at a
# given rank (0/0) cannot blank out the mean while the sd is still reported.
background_success_proportion_summary <- data.frame(
  background_mean = rowMeans(background_proportions, na.rm = TRUE),
  background_std = apply(background_proportions, 1, sd, na.rm = TRUE)
)

success_proportion <- cbind(success_proportion, background_success_proportion_summary)
# keep ranks 1st..5th; the pooled "all" row is not shown
success_proportion <- success_proportion[1:5, ]

# ---- Plot: observed proportions (orange) vs background mean +/- 2 sd (grey) ----
p <- ggplot(success_proportion, aes(x = best_Basenji_AI, y = proportion_correct, group = 1)) +
  geom_point(colour = "orange2", size = 4) +
  geom_line(colour = "orange2", linewidth = 1) +
  geom_line(aes(y = background_mean), color = "grey40", linewidth = 1) +
  geom_ribbon(
    aes(y = background_mean, ymin = background_mean - background_std * 2, ymax = background_mean + background_std * 2),
    fill = "grey30", alpha = .2
  ) +
  ylim(0.47, 1) +
  # 0.5 reference line
  geom_hline(yintercept = 0.5, colour = "#C92B27", linetype = "dashed") +
  geom_text(
    aes(x = best_Basenji_AI, y = proportion_correct, label = round(proportion_correct, 3)),
    colour = "grey20", fontface = 2, size = 4, vjust = -2
  ) +
  xlab("Best variant order (Basenji AI)") +
  ylab("Proportion of correct predictions (AI direction)") +
  theme_bw() +
  theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 12)) +
  theme(axis.title = element_text(size = 12), plot.title = element_text(size = 12)) +
  theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5)) +
  theme(legend.position = "none")

p
outf <- file.path(outdir_fig_main, "Fig5C_Variants_priority_order_performance.pdf")
ggsave(outf, p, width = 6, height = 3)
```


# Figure 5D

```{r plot_strong_predicted_AI, comment=NA, echo=FALSE, message=FALSE, fig.height = 7, fig.width = 8}
# Figure 5D: best variant per peak with Basenji_abs_AI above 0.1, coloured by
# whether the measured AI passes the significance thresholds.
all_samples_best_basenji_in_peak_sub <- subset(
  all_samples_best_basenji_in_peak,
  Basenji_abs_AI > 0.1
)
# TRUE when padjust < 0.01 and AI_abs > 0.1.
all_samples_best_basenji_in_peak_sub$pvalue_significant <- with(
  all_samples_best_basenji_in_peak_sub,
  (padjust < 0.01) & (AI_abs > 0.1)
)

p <- plot_correlation(
  all_samples_best_basenji_in_peak_sub,
  "All experiments best Basenji score if > 0.1 AI"
) +
  geom_point(aes(colour = pvalue_significant), size = 2) +
  scale_colour_manual(values = c("#686868", "#DF7878")) +
  guides(colour = guide_legend(title = "significant AI")) +
  theme(legend.position = "right")
p

outf <- file.path(outdir_fig_main, "Fig5D_correlation_strong_guesses_all.pdf")
ggsave(filename = outf, plot = p, width = 6, height = 6)
```

# Figure 5E

```{r}
# Figure 5E: best variant per peak, restricted to significant AI and to
# Basenji_abs_AI above 0.1, coloured by experiment.
all_samples_best_basenji_in_peak_sub <- subset(
  all_samples_best_basenji_in_peak,
  significant == TRUE & (Basenji_abs_AI > 0.1)
)
# Fixed level order so experiments and colours pair up as listed below.
all_samples_best_basenji_in_peak_sub$condition <- factor(
  all_samples_best_basenji_in_peak_sub$condition,
  levels = c("twi.24", "ctcf.68", "mef2.68", "mef2.1012", "bin.68", "bin.1012")
)

p <- plot_correlation(
  all_samples_best_basenji_in_peak_sub,
  "All experiments significant AI best Basenji score if > 0.1 AI"
) +
  geom_point(aes(colour = condition), size = 2) +
  theme(legend.position = "right") +
  scale_colour_manual(
    values = c("#E21F26", "#397FB9", "#4EAF49", "#68C3A6", "#984F9F", "#E38CBB")
  )
p

outf <- file.path(outdir_fig_main, "Fig5E_correlation_strong_guesses_significant_by_experiment.pdf")
ggsave(filename = outf, plot = p, width = 7, height = 6)
```

# Figure 5F

```{r}
# Figure 5F: uses all_samples_best_basenji_in_peak_sub as left by the Figure 5E
# chunk (significant AI, Basenji_abs_AI > 0.1).
# presumably counts of correct_predict per condition -- see plot_counts_barplot()
p <- plot_counts_barplot(
  all_samples_best_basenji_in_peak_sub,
  "condition",
  "correct_predict"
)
p

outf <- file.path(outdir_fig_main, "Fig5F_counts_guesses_significant_by_experiment.pdf")
ggsave(filename = outf, plot = p, width = 7, height = 6)
```


# Figure 5G


```{r}
# Figure 5G: classify each variant by the motif it overlaps and count the
# classes per experiment.
saturation_scores_predictions <- read.table(
  "/g/furlong/project/103_Basenji/Mattia/analysis/saturation_scores/Basenji_DataTable_predictions.txt",
  header = TRUE
)

# Priority: own motif > other (cofactor) motif > no prediction > no motif.
saturation_scores_predictions$motif_on_variant <- with(
  saturation_scores_predictions,
  ifelse(
    variant_in_self_motif == 1,
    "self_motif",
    ifelse(
      variant_in_other_motif == 1,
      "cofactor_motif",
      ifelse(Basenji_predict == "no_prediction", "no_prediction", "no_motif")
    )
  )
)
# Level order fixes the stacking order and the colour assignment below.
saturation_scores_predictions$motif_on_variant <- factor(
  saturation_scores_predictions$motif_on_variant,
  levels = c("self_motif", "cofactor_motif", "no_motif", "no_prediction")
)
saturation_scores_predictions$condition <- factor(
  saturation_scores_predictions$condition,
  levels = c("twi.24", "ctcf.68", "mef2.68", "mef2.1012", "bin.68", "bin.1012")
)

p <- plot_counts_barplot(saturation_scores_predictions, "condition", "motif_on_variant") +
  # write the count in the middle of every stacked segment
  geom_text(aes(label = counts), position = position_stack(vjust = 0.5), colour = "white") +
  scale_fill_manual(values = c("#FF2341", "#FFA736", "grey70", "grey15")) +
  labs(fill = "variant_on_motif")

p
outf <- file.path(outdir_fig_main, "Fig5G_counts_guesses_significant_by_motif.pdf")
ggsave(filename = outf, plot = p, width = 7, height = 6)
```


# Figure 5H

```{r}
# Figure 5H: example CTCF variant.
# counts per line for the "ctcf/68" condition
ll_ctcf <- get_counts_per_line("ctcf/68")

# variant shown in the figure
vid <- "chr2R_15733144"

# Plots for this variant, built from the ctcf/68 rows of the CHT results.
l <- plot_ai_and_read_depth_for_variant(
  filter(cht, condition == "ctcf/68"),
  ll_ctcf,
  vid
)

# The figure uses the second element of the returned list.
allele_ratio_plot <- l[[2]]
print(allele_ratio_plot)

outf <- file.path(outdir_fig_main, "Fig5H_example_ctcf_allele_ratios.pdf")
ggsave(filename = outf, plot = allele_ratio_plot, width = 3, height = 3)
```


