Contents

1 Setup and data

source("../utils/utils.R")
   ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
   ✔ dplyr     1.1.2     ✔ readr     2.1.4
   ✔ forcats   1.0.0     ✔ stringr   1.5.0
   ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
   ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
   ✔ purrr     1.0.1     
   ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
   ✖ dplyr::filter() masks stats::filter()
   ✖ dplyr::lag()    masks stats::lag()
   ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
   
   Attaching package: 'magrittr'
   
   
   The following object is masked from 'package:purrr':
   
       set_names
   
   
   The following object is masked from 'package:tidyr':
   
       extract
   
   
   Loading required package: GenomicRanges
   
   Loading required package: stats4
   
   Loading required package: BiocGenerics
   
   
   Attaching package: 'BiocGenerics'
   
   
   The following objects are masked from 'package:lubridate':
   
       intersect, setdiff, union
   
   
   The following objects are masked from 'package:dplyr':
   
       combine, intersect, setdiff, union
   
   
   The following objects are masked from 'package:stats':
   
       IQR, mad, sd, var, xtabs
   
   
   The following objects are masked from 'package:base':
   
       anyDuplicated, aperm, append, as.data.frame, basename, cbind,
       colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
       get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
       match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
       Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
       table, tapply, union, unique, unsplit, which.max, which.min
   
   
   Loading required package: S4Vectors
   
   
   Attaching package: 'S4Vectors'
   
   
   The following objects are masked from 'package:lubridate':
   
       second, second<-
   
   
   The following objects are masked from 'package:dplyr':
   
       first, rename
   
   
   The following object is masked from 'package:tidyr':
   
       expand
   
   
   The following objects are masked from 'package:base':
   
       expand.grid, I, unname
   
   
   Loading required package: IRanges
   
   
   Attaching package: 'IRanges'
   
   
   The following object is masked from 'package:lubridate':
   
       %within%
   
   
   The following objects are masked from 'package:dplyr':
   
       collapse, desc, slice
   
   
   The following object is masked from 'package:purrr':
   
       reduce
   
   
   Loading required package: GenomeInfoDb
   
   
   Attaching package: 'GenomicRanges'
   
   
   The following object is masked from 'package:magrittr':
   
       subtract
   
   
   Loading required package: grid
   
   Loading required package: Biostrings
   
   Loading required package: XVector
   
   
   Attaching package: 'XVector'
   
   
   The following object is masked from 'package:purrr':
   
       compact
   
   
   
   Attaching package: 'Biostrings'
   
   
   The following object is masked from 'package:grid':
   
       pattern
   
   
   The following object is masked from 'package:base':
   
       strsplit
   
   
   
   Attaching package: 'gridExtra'
   
   
   The following object is masked from 'package:BiocGenerics':
   
       combine
   
   
   The following object is masked from 'package:dplyr':
   
       combine
   
   
   
   Attaching package: 'data.table'
   
   
   The following object is masked from 'package:GenomicRanges':
   
       shift
   
   
   The following object is masked from 'package:IRanges':
   
       shift
   
   
   The following objects are masked from 'package:S4Vectors':
   
       first, second
   
   
   The following objects are masked from 'package:lubridate':
   
       hour, isoweek, mday, minute, month, quarter, second, wday, week,
       yday, year
   
   
   The following objects are masked from 'package:dplyr':
   
       between, first, last
   
   
   The following object is masked from 'package:purrr':
   
       transpose
   
   
   
   
   Registered S3 method overwritten by 'gplots':
     method         from 
     reorder.factor gdata
   
   ChIPseeker v1.34.1  For help: https://guangchuangyu.github.io/software/ChIPseeker
   
   If you use ChIPseeker in published research, please cite:
   Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring epigenomic datasets by ChIPseeker. Current Protocols 2022, 2(10): e585
   
   Loading required package: graph
   
   
   Attaching package: 'graph'
   
   
   The following object is masked from 'package:Biostrings':
   
       complement
   
   
   The following object is masked from 'package:stringr':
   
       boundary
   
   
   Loading required package: Biobase
   
   Welcome to Bioconductor
   
       Vignettes contain introductory material; view with
       'browseVignettes()'. To cite Bioconductor, see
       'citation("Biobase")', and for packages 'citation("pkgname")'.
   
   
   Loading required package: GO.db
   
   Loading required package: AnnotationDbi
   
   
   Attaching package: 'AnnotationDbi'
   
   
   The following object is masked from 'package:dplyr':
   
       select
   
   
   Loading required package: SparseM
   
   
   Attaching package: 'SparseM'
   
   
   The following object is masked from 'package:base':
   
       backsolve
   
   
   
   groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
   
   
   Attaching package: 'topGO'
   
   
   The following object is masked from 'package:grid':
   
       depth
   
   
   The following object is masked from 'package:IRanges':
   
       members
   
   
   Loading required package: GenomicFeatures
   
   
   Attaching package: 'GenomicFeatures'
   
   
   The following object is masked from 'package:topGO':
   
       genes
# Project configuration, read through the helper sourced from utils.R.
config = load_config()

# CHT results: one table per condition, stacked into a single data frame.
cht_full = bind_rows(
  lapply(
    ab_tp_list,
    function(ab_tp) load_cht_results(ab_tp, remove_chr = FALSE)
  )
)

# Drop variants located on chrX, chrY and chrM.
cht = filter(cht_full, !TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))

# Rows for which signif_strongAI is TRUE.
cht_sign = filter(cht, signif_strongAI)

# Genes, and 1000-bp ranges anchored at the start of each gene.
genes = load_genes()
promoters = resize(genes, width = 1000, fix = "start")

# Combined motif set (all TFs, peaks + alleles).
fimo = get_full_motif_sets(cht, ab_tp_list)

# Allele-only motif tables, one per condition, stacked.
fimo_alleles = bind_rows(
  lapply(
    ab_tp_list,
    function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)
  )
)

2 Figure S4A

# Variant-to-motif distances per condition (peaks without motifs are
# excluded); the condition id is added as a column before stacking.
res_df = bind_rows(
  lapply(ab_tp_list, function(ab_tp) {
    mutate(
      get_variant_distance2TFmotif(ab_tp, cht, fimo, same_peak = TRUE),
      condition = ab_tp
    )
  })
)

# Distance bins passed to cut() further down: (-1, 0] is "in motif" and the
# last bin ends at 3000.
dist_breaks = c(-1, 0, 20, 40, 60, 80, 100, 3000)
dist_labels = c(
  "in motif",
  "1-20 bp",
  "21-40 bp",
  "41-60 bp",
  "61-80 bp",
  "81-100 bp",
  ">100 bp"
)

# Empty accumulators that the per-condition loop appends to.
res_full = setNames(
  data.frame(matrix(ncol = 6, nrow = 0)),
  c("dist_bin", "n", "share", "sd", "type", "condition")
)
background_sum_all_cond = setNames(
  data.frame(matrix(ncol = 7, nrow = 0)),
  c("dist_bin", "n", "share", "share_full",  "type", "condition", "i")
)

# For every condition: bin peaks by the smallest dist2motif among their
# significant variants ("real"), and compare with 10000 resampled sets drawn
# from peaks that contain no significant variant ("background").
# Appends the per-bin summary to res_full and the per-iteration background
# summaries to background_sum_all_cond.
# NOTE(review): no set.seed() is visible in this chunk, so the resampled
# background (and anything derived from it) will differ between runs unless a
# seed is set elsewhere, e.g. in utils.R — confirm.
for(ab_tp in ab_tp_list) {
  
  # all variants
  df_sel = res_df %>% dplyr::filter(condition == ab_tp)
  # N_var and N_peak are not referenced again inside this loop
  N_var = length(unique(df_sel$snp_id)) # number of variants in peaks with motifs
  N_peak = length(unique(df_sel$peak_id)) # number of peaks with motifs
  
  # significant variants
  df_sign = df_sel %>% filter(signif_strongAI)
  N_var_sign = length(unique(df_sign$snp_id)) # number of significant variants in peaks with motifs
  N_peak_sign = length(unique(df_sign$peak_id)) # number of AI peaks with motifs

  # Per peak keep only the minimal dist2motif (unique() collapses ties), bin
  # it, and compute the share of peaks per bin. N_tot is the row count after
  # collapsing, so mean(N_tot) is just that constant.
  # `sd = sd(n)` is evaluated on the single per-bin count created just before
  # it, so it yields NA; it acts as a placeholder so the columns line up with
  # background_sum below.
  sign_sum = df_sign %>% 
    dplyr::group_by(peak_id) %>% 
    dplyr::mutate(min_dist = min(dist2motif)) %>%
    dplyr::filter(dist2motif == min_dist) %>%
    dplyr::select(peak_id, dist2motif) %>% unique() %>% ungroup() %>%
    dplyr::mutate(N_tot = n(), dist_bin = cut(dist2motif, breaks = dist_breaks, labels = dist_labels)) %>%
    dplyr::group_by(dist_bin) %>%
    dplyr::summarize(n = n(), share = n / mean(N_tot), sd = sd(n), type = "real", condition = ab_tp)
  
  
  # Background pool: variants from peaks in which no variant is significant.
  df_non_sign = df_sel %>% dplyr::group_by(peak_id) %>% dplyr::mutate(AI_peak = any(signif_strongAI)) %>% dplyr::filter(!AI_peak) %>% ungroup()
  
  # 10000 resampling iterations; each returns one per-bin summary tagged with
  # its iteration index i.
  background_sum_all = lapply(1:10000, function(i) {
    
    #print(i)
    
    # 1. select same number of peaks as in AI peaks
    # (sample() draws without replacement and errors if fewer than
    # N_peak_sign non-AI peaks are available)
    peak_ids = sample(unique(df_non_sign$peak_id), N_peak_sign)
    df_bg = df_non_sign %>% dplyr::filter(peak_id %in% peak_ids)
    
    # 2. select same number of variants as for AI peaks
    # (likewise errors if the sampled peaks hold fewer than N_var_sign variants)
    variant_ids = sample(unique(df_bg$snp_id), N_var_sign)
    df_bg %<>% filter(snp_id %in% variant_ids)
    
    # Peaks still represented after the variant subsampling; this can be
    # smaller than N_peak_sign.
    N_peak_bg = length(unique(df_bg$peak_id)) 
    
    # Same per-peak minimum / binning as for sign_sum. share uses the
    # collapsed row count as denominator, share_full uses N_peak_bg.
    bg_sum = df_bg %>% 
      dplyr::group_by(peak_id) %>% 
      dplyr::mutate(min_dist = min(dist2motif)) %>%
      dplyr::filter(dist2motif == min_dist) %>%
      dplyr::select(peak_id, dist2motif) %>% unique() %>% ungroup() %>%
      dplyr::mutate(N_tot = n(), dist_bin = cut(dist2motif, breaks = dist_breaks, labels = dist_labels)) %>%
      dplyr::group_by(dist_bin) %>%
      dplyr::summarize(n = n(), share = n / mean(N_tot), share_full = n / N_peak_bg) %>%
      dplyr::mutate(condition = ab_tp, i = i)
    
    
    bg_sum$type = "background"
    bg_sum
    
  }) %>% bind_rows()
  
  # Keep every iteration for all conditions (rbind matches columns by name).
  background_sum_all_cond = rbind(background_sum_all_cond, background_sum_all)
  
  # Collapse the iterations: mean count and mean share per bin, and the
  # standard deviation of share_full across iterations. A bin that is empty in
  # an iteration has no row for that iteration, so it does not enter the means.
  background_sum = background_sum_all %>% 
    dplyr::group_by(dist_bin) %>% 
    dplyr::summarize(n = mean(n), 
                     share = mean(share), 
                     sd = sd(share_full),
                     type = "background", 
                     condition = ab_tp)
  
  
  # "real" rows first, then "background" rows, for this condition.
  res = rbind.data.frame(sign_sum, background_sum)
  
  res_full = rbind.data.frame(res_full, res)

}


# Factor set-up for plotting: distance bins (reversed order at first),
# then a display label per condition, ordered as in ab_tp_labels.
res_full[["dist_bin"]] = factor(res_full[["dist_bin"]], levels = rev(dist_labels))
res_full[["tf_labels"]] = ab_tp_labels[res_full[["condition"]]]
res_full[["tf_labels"]] = factor(res_full[["tf_labels"]], levels = ab_tp_labels)

# Share of "real" peaks in the "in motif" bin, ordered by condition label,
# de-duplicated and rounded to two decimals.
shares_in_motif = res_full %>%
  arrange(tf_labels) %>%
  filter(dist_bin == "in motif", type == "real") %>%
  select(share) %>%
  unlist() %>%
  unique() %>%
  round(2)

# Final level order: bins from "in motif" outwards; type levels in order of
# first appearance in res_full.
res_full[["dist_bin"]] = factor(res_full[["dist_bin"]], levels = dist_labels)
res_full[["type"]] = factor(res_full[["type"]], levels = unique(res_full[["type"]]))

# One panel per condition: share of peaks per distance bin for both types,
# with a +/- 2 sd ribbon around the background.
p = ggplot(
  res_full,
  aes(x = dist_bin, y = share, color = type, group = type)
) +
  facet_wrap(~tf_labels, ncol = 3) +
  geom_ribbon(
    data = subset(res_full, type=="background"),
    aes(
      x = dist_bin,
      y = share,
      ymin = share - sd * 2,
      ymax = share + sd * 2,
      group = type
    ),
    fill = "grey70",
    colour = NA,
    alpha = .4
  ) +
  geom_point(size = 2) +
  geom_line(linewidth = 1) +
  scale_color_manual(
    values = c("grey", "darkblue"),
    name = "",
    labels = c("AI peaks with motifs", "background")
  ) +
  theme_bw() +
  ylab("Share of peaks") +
  xlab("Distance from TF motif") +
  theme(
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14, angle = 45, hjust = 1),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    legend.text = element_text(size = 16),
    legend.title = element_text(size = 16)
  )

p

# Write the figure into the supplementary-figure directory.
outf = file.path(outdir_fig_suppl, "FigS4A_dist2motif_vs_background.pdf")
ggsave(filename = outf, plot = p, width = 12, height = 6)

Empirical p-values for the test: real share is larger than background share


In motif

Condition p-value
Twi 2-4h 9.999 × 10^{-5}
CTCF 6-8h 9.999 × 10^{-5}
Mef2 6-8h 1.0001 × 10^{-4}
Mef2 10-12h 9.999 × 10^{-5}
Bin 6-8h 9.999 × 10^{-5}
Bin 10-12h 1.0001 × 10^{-4}


1-20 bp

Condition p-value
Twi 2-4h 0.0379
CTCF 6-8h 0.38196
Mef2 6-8h 0.39486
Mef2 10-12h 0.10009
Bin 6-8h 0.16348
Bin 10-12h 0.79562


21-40 bp

Condition p-value
Twi 2-4h 0.73813
CTCF 6-8h 0.9993
Mef2 6-8h 1
Mef2 10-12h 0.84762
Bin 6-8h 0.71863
Bin 10-12h 0.90831


41-60 bp

Condition p-value
Twi 2-4h 0.66383
CTCF 6-8h 0.996
Mef2 6-8h 0.73593
Mef2 10-12h 0.9998
Bin 6-8h 0.94411
Bin 10-12h 0.43036


61-80 bp

Condition p-value
Twi 2-4h 0.9999
CTCF 6-8h 1
Mef2 6-8h 0.92471
Mef2 10-12h 0.9726
Bin 6-8h 0.94691
Bin 10-12h 0.9971


81-100 bp

Condition p-value
Twi 2-4h 0.9994
CTCF 6-8h 0.9996
Mef2 6-8h 0.9989
Mef2 10-12h 0.9999
Bin 6-8h 0.9955
Bin 10-12h 0.72823


>100 bp

Condition p-value
Twi 2-4h 0.94161
CTCF 6-8h 0.97
Mef2 6-8h 0.76452
Mef2 10-12h 0.45485
Bin 6-8h 0.9952
Bin 10-12h 0.94581

3 Figure S4B - Fisher’s tests: enrichment of significant variants in motifs inside / outside peaks

# Figure S4B: for every condition, Fisher's exact test of whether significant
# (signif_strongAI) variants are over-represented among variants that hit the
# cognate TF motif, computed separately for variants inside and outside peaks.
# Result: `out_df`, one row per condition x group, holding the counts, the
# odds ratio and the p-value as properly typed columns (previously every value
# was pushed through c() and therefore stored as character).

# Number of distinct variants (snp_id values) in a table.
count_variants = function(df) {
  dplyr::n_distinct(df$snp_id)
}

# Run one test, echo it to the report, and return a one-row data frame.
#   condition, motif : identifiers stored in the output row
#   n_sign, n_ns     : number of significant / non-significant variants
#   n_sign_motif, n_ns_motif : the same counts restricted to motif variants
#   group            : label stored in the `group` column
#   title            : heading printed in front of the test output
# The matrix is deliberately named `mat` so that the printed test header
# ("data:  mat") is identical to earlier renderings of this report.
# NOTE(review): the second matrix column holds the totals (n_sign, n_ns),
# which presumably still include the motif variants of the first column; a
# conventional 2x2 table would use the complements (n_sign - n_sign_motif,
# n_ns - n_ns_motif). Left unchanged here so the reported odds ratios and
# p-values are not altered — confirm the intended design.
test_motif_enrichment = function(condition, motif, n_sign, n_ns,
                                 n_sign_motif, n_ns_motif, group, title) {
  mat = matrix(c(n_sign_motif, n_ns_motif, n_sign, n_ns), nrow = 2)
  ft = fisher.test(mat)
  print(title)
  print(ft)
  data.frame(
    condition = as.character(condition),
    motif = as.character(motif),
    N_sign = n_sign,
    N_ns = n_ns,
    N_sign_motif = n_sign_motif,
    N_ns_motif = n_ns_motif,
    N_motif = n_sign_motif + n_ns_motif,
    group = group,
    odd_ratio = as.numeric(ft$estimate),  # as.numeric() drops the "odds ratio" name
    pvalue = ft$p.value,
    row.names = NULL,
    stringsAsFactors = FALSE
  )
}

# Both tests (inside / outside peaks) for a single condition.
# The argument keeps the name `ab_tp` because the filters below refer to it
# by that name, exactly as the original loop did.
motif_enrichment_for_condition = function(ab_tp) {

  tf = TFs[ab_tp]
  print(ab_tp)
  cht_sel = cht %>% dplyr::filter(condition == ab_tp)

  # significant / n.s. variants, split at a summit distance of 250
  cht_sig = cht_sel %>% dplyr::filter(signif_strongAI)
  cht_ns = cht_sel %>% dplyr::filter(!signif_strongAI)
  N1_a = count_variants(cht_sig %>% dplyr::filter(dist2summit < 250))
  N1_b = count_variants(cht_sig %>% dplyr::filter(dist2summit >= 250))
  N2_a = count_variants(cht_ns %>% dplyr::filter(dist2summit < 250))
  N2_b = count_variants(cht_ns %>% dplyr::filter(dist2summit >= 250))

  # variants in the motifs of selected TFs
  fimo_selected = parse_motifs_in_two_alleles(ab_tp, cht_sel, radius = 15, path2_base1 = "/all_variants_alleles/FIMO/",
                                              path2_base2 = "combined_motifs/fimo.tsv",
                                              peak_radius = 250, subset_motif = FALSE, subset_cht = TRUE)

  # 1. Enrichment in cognate motifs
  print("1. Enrichment in cognate motifs")
  fimo_sel = fimo_selected %>% dplyr::filter(motif_alt_id == tf)

  # significant / n.s. variants in the cognate motif, inside and outside peaks
  fimo_sig = fimo_sel %>% dplyr::filter(signif_strongAI)
  fimo_ns = fimo_sel %>% dplyr::filter(!signif_strongAI)
  N3_a = count_variants(fimo_sig %>% dplyr::filter(in_peak))
  N3_b = count_variants(fimo_sig %>% dplyr::filter(!in_peak))
  N4_a = count_variants(fimo_ns %>% dplyr::filter(in_peak))
  N4_b = count_variants(fimo_ns %>% dplyr::filter(!in_peak))

  dplyr::bind_rows(
    test_motif_enrichment(ab_tp, tf, N1_a, N2_a, N3_a, N4_a,
                          "in peaks", "1b. Variants in peaks"),
    test_motif_enrichment(ab_tp, tf, N1_b, N2_b, N3_b, N4_b,
                          "outside peaks", "1c. Variants outside peaks")
  )
}

# Conditions are processed in the order of ab_tp_list, "in peaks" before
# "outside peaks" within each condition.
out_df = as.data.frame(
  dplyr::bind_rows(lapply(ab_tp_list, motif_enrichment_for_condition))
)
   [1] "twi/24"
   [1] "1. Enrichment in cognate motifs"
   [1] "1b. Variants in peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 2.4e-11
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    1.7026 2.5660
   sample estimates:
   odds ratio 
       2.0982 
   
   [1] "1c. Variants outside peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 0.0076
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    1.0463 1.3476
   sample estimates:
   odds ratio 
       1.1895 
   
   [1] "ctcf/68"
   [1] "1. Enrichment in cognate motifs"
   [1] "1b. Variants in peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value <2e-16
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    2.3256 3.3831
   sample estimates:
   odds ratio 
       2.8113 
   
   [1] "1c. Variants outside peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 0.92
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    0.87304 1.12213
   sample estimates:
   odds ratio 
       0.9914 
   
   [1] "mef2/68"
   [1] "1. Enrichment in cognate motifs"
   [1] "1b. Variants in peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value <2e-16
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    2.2275 3.3489
   sample estimates:
   odds ratio 
       2.7412 
   
   [1] "1c. Variants outside peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 0.78
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    0.86807 1.19931
   sample estimates:
   odds ratio 
       1.0234 
   
   [1] "mef2/1012"
   [1] "1. Enrichment in cognate motifs"
   [1] "1b. Variants in peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value <2e-16
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    2.2876 3.3341
   sample estimates:
   odds ratio 
       2.7702 
   
   [1] "1c. Variants outside peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 0.18
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    0.95347 1.25785
   sample estimates:
   odds ratio 
       1.0975 
   
   [1] "bin/68"
   [1] "1. Enrichment in cognate motifs"
   [1] "1b. Variants in peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 4.7e-12
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    2.1427 3.6830
   sample estimates:
   odds ratio 
       2.8278 
   
   [1] "1c. Variants outside peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 0.95
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    0.75705 1.24591
   sample estimates:
   odds ratio 
      0.97841 
   
   [1] "bin/1012"
   [1] "1. Enrichment in cognate motifs"
   [1] "1b. Variants in peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 2.1e-13
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    2.407 4.254
   sample estimates:
   odds ratio 
       3.2233 
   
   [1] "1c. Variants outside peaks"
   
    Fisher's Exact Test for Count Data
   
   data:  mat
   p-value = 0.044
   alternative hypothesis: true odds ratio is not equal to 1
   95 percent confidence interval:
    0.99598 1.66680
   sample estimates:
   odds ratio 
       1.2986
# Display label per row (two rows per condition, in ab_tp_labels order) and
# numeric odds ratio / p-value columns for plotting.
out_df[["label"]] = rep(ab_tp_labels, each = 2)
out_df[["label"]] = factor(out_df[["label"]], levels = ab_tp_labels)
out_df[["pvalue"]] = as.numeric(out_df[["pvalue"]])
out_df[["odd_ratio"]] = as.numeric(out_df[["odd_ratio"]])



# Odds ratio per condition, one panel per group, bars shaded by -log10(p);
# the red line marks an odds ratio of 1.
p = ggplot(out_df, aes(x = label, y = odd_ratio)) +
  facet_wrap(~group) +
  geom_hline(yintercept = 1, color = "darkred") +
  geom_bar(
    aes(fill = -log10(pvalue)),
    color = "darkblue",
    stat = "identity",
    position = "dodge",
    width = 0.5
  ) +
  #scale_fill_manual(name = "", values = c("darkblue", "darkgrey"), labels = c("AI peaks", "non-AI peaks")) +
  #geom_text(aes(label = round(r1, 2), x = label, y = odds_ratio + 0.07), data = df, size = 6) +
  ylab("Enrichment in cognate TF motifs \nFisher's Test Odds Ratio") +
  theme_bw() +
  theme(
    # one colour per x-axis label, taken from TFcols
    axis.text.x = element_text(size = 14, angle = 45, hjust = 1, colour = TFcols),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 16),
    legend.text = element_text(size = 14),
    legend.title = element_text(size = 14)
  )
   Warning: Vectorized input to `element_text()` is not officially supported.
   ℹ Results may be unexpected or may change in future versions of ggplot2.
# Show the figure in the report, then write it to the supplementary-figure
# directory.
p

outf = file.path(outdir_fig_suppl, "FigS4B_motifs_in_peaks_fisher.pdf")
ggsave(filename = outf, plot = p, width = 10, height = 6)

4 Figure S4C - Motif prediction of AI

# Relationship between AI and the REF-minus-ALT motif score difference.

# Minimum absolute score change used by the filtered summary and the plot.
score_thres = 1

# In-peak rows that have a motif score for both alleles. A row is labelled
# "concordant" when AI > 0.5 goes with a positive delta_score, or AI < 0.5
# with a negative one; rows where that test is FALSE are "discordant".
df_shared = mutate(
  filter(fimo_alleles, in_peak & !is.na(score.ref) & !is.na(score.alt)),
  delta_score = as.numeric(score.ref) - as.numeric(score.alt),
  type = ifelse(
    (AI > 0.5 & delta_score > 0) | (AI < 0.5 & delta_score < 0),
    "concordant",
    "discordant"
  )
)

# Printed overview per condition and is_indel: smallest |delta_score|, largest
# dist2summit, correlation with AI, share of concordant rows, and row count.
summarize(
  group_by(df_shared, condition, is_indel),
  min(abs(delta_score)),
  max(dist2summit),
  cor(delta_score, AI),
  share_concordant = sum(type == "concordant") / n(),
  n()
)
   `summarise()` has grouped output by 'condition'. You can override using the
   `.groups` argument.
# Summary over all conditions for variants with |delta_score| above the
# threshold and dist2summit below 250: correlation between delta_score and
# AI, and the share of concordant variants.
df_sum = df_shared %>% filter(abs(delta_score) > score_thres  & dist2summit < 250) %>% 
  summarize(min(abs(delta_score)), max(dist2summit), cor = cor(delta_score, AI), share_concordant = sum(type == "concordant") / n(), n())

# Values shown in the plot annotations (n_disc is only needed by the
# commented-out colour scale below).
cor = round(df_sum$cor, 2)
n_conc = round(df_sum$share_concordant, 2) * 100
n_disc = (1 - round(df_sum$share_concordant, 2)) * 100


# Scatter of AI against motif score change with a linear fit.
# NOTE(review): the plotted data are filtered on |delta_score| only, whereas
# the annotated statistics above additionally require dist2summit < 250;
# confirm that both describe the same set of variants.
# Line widths use `linewidth` (the `size` aesthetic for lines is deprecated
# since ggplot2 3.4.0; point size still uses `size`).
p = ggplot(df_shared %>% filter(abs(delta_score) > score_thres ), aes(x = delta_score, y = AI, color = type)) + 
  geom_point(size = 1, color = cbPalette[2]) + 
  geom_smooth(method = "lm", se = FALSE, color = "darkblue", linewidth = 0.5) +
  geom_vline(xintercept = -score_thres, color = "grey", linewidth = 0.7) +
  geom_vline(xintercept = score_thres, color = "grey", linewidth = 0.7) +
  geom_hline(yintercept = 0.5, color = "grey", linewidth = 0.7) +
  #geom_hline(yintercept = 0.4, color = "grey", linewidth = 0.7) +
  theme_bw() +
  annotate(geom = "text", x = -3.5, y = 0.95, label = paste("R=", cor)) +
  annotate(geom = "text", x = -3.5, y = 0.88, label = paste("% concordant: ", n_conc), size = 4) +
  xlab("Motif score change (REF-ALT)") +
  ylab("Allele Imbalance") +
  #scale_color_manual(values = c(cbPalette[2], "darkgrey"), labels = c(paste0("concordant, ", n_conc, "%"), paste0("discordant, ", n_disc, "%")), name = "Variant type") +
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14),
        legend.text = element_text(size=12),
        legend.title = element_text(size=12))
   Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
   ℹ Please use `linewidth` instead.
   This warning is displayed once every 8 hours.
   Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
   generated.
p
   `geom_smooth()` using formula = 'y ~ x'

# Write the figure to the supplementary-figure directory.
outf = file.path(outdir_fig_suppl, "FigS4C_motifs_AI_prediction.pdf")
ggsave(filename = outf, plot = p, width = 4, height = 4)
   `geom_smooth()` using formula = 'y ~ x'

5 Figure S4D - ProBound scores on AI and non-AI variants

# Figure S4D: ProBound scores of the REF and ALT sequence around each variant,
# compared between variants with and without significant allelic imbalance.

# Location of the ProBound score tables and the TFs they exist for.
probound_dir = "/g/furlong/project/68_F1_cisreg_ichip/analysis/ProBound"
probound_tfs = c("CTCF", "Bin", "Mef2", "Twi")

# Read the ProBound tables of one allele ("REF" or "ALT") for all TFs and
# stack them in the order of `tfs`.
# Returns a plain data frame with the variant columns, the allele-specific
# ProBound columns (suffix _REF / _ALT), the TF name, and the score divided by
# the per-TF maximum (ProBound_score_scaled_<allele>).
read_probound_scores = function(allele, tfs = probound_tfs, dir = probound_dir) {
  per_tf = lapply(tfs, function(tf) {
    path = file.path(
      dir,
      paste0("ProBound_", tf, "_", allele, "_sequence_15bp_around_variants.txt")
    )
    scores = read.table(path, header = TRUE)
    scores$TF = tf
    scores
  })
  scores = do.call(rbind, per_tf)

  score_col = paste0("ProBound_score_", allele)
  colnames(scores) = c(
    "variant_chr", "variant_pos", "REF", "ALT",
    paste0("ProBound_bed_", allele),
    paste0("ProBound_seq_", allele),
    score_col,
    "TF"
  )

  # Scale by the per-TF maximum; ave() returns the group maximum for every
  # row, so no grouped data frame is left behind.
  scores[[paste0("ProBound_score_scaled_", allele)]] =
    scores[[score_col]] / ave(scores[[score_col]], scores$TF, FUN = max)
  scores
}

ProBound_results_REF = read_probound_scores("REF")
ProBound_results_ALT = read_probound_scores("ALT")

# One row per variant x TF with both alleles side by side; every REF row is
# kept (all.x = TRUE).
ProBound_results = merge(ProBound_results_REF, ProBound_results_ALT, by = c("variant_chr", "variant_pos", "REF", "ALT", "TF"), all.x=TRUE)
ProBound_results$variant_ID = paste(ProBound_results$variant_chr, ProBound_results$variant_pos, ProBound_results$TF, sep="_")
# Larger of the two allele scores (raw, not the scaled columns).
ProBound_results$ProBound_score_max = pmax(ProBound_results$ProBound_score_ALT, ProBound_results$ProBound_score_REF)


# Join key on the CHT side.
# NOTE(review): this assumes snp_id has the form <chr>_<pos> and that `ab`
# matches the TF names used above — confirm against load_cht_results().
cht$variant_ID = paste(cht$snp_id, cht$ab, sep="_")
cht_ProBound = merge(cht, ProBound_results, by="variant_ID")
cht_ProBound$signif_strongAI = factor(cht_ProBound$signif_strongAI, levels=c(TRUE, FALSE))


# Distribution of the maximal ProBound score per condition, split by
# significance, on a log10 y axis. Rows with a non-finite value on that axis
# are dropped by ggplot2 with a warning.
# The y-axis label now describes what is plotted; the previous label
# ("-log(10) ProBound sscaled core") was garbled and did not match the data
# (raw max score on a log10 axis).
p = ggplot(cht_ProBound, aes(x=signif_strongAI, y=ProBound_score_max, fill=signif_strongAI)) + 
      facet_wrap(~condition) +
      geom_violin(fill = "darkblue", alpha = 0.3) +
      geom_boxplot(width = 0.4, outlier.size = 0.1, fill = "darkblue", alpha = 0.5) +
      scale_y_log10() + 
      stat_compare_means(method = "wilcox") +
      xlab("Significant allelic imbalance") +
      ylab("ProBound score, max(REF, ALT), log10 scale") +
      theme_bw() +
      theme(axis.text.y = element_text(size=10), axis.text.x = element_text(size=10), 
        axis.title.x = element_text(size=10), axis.title.y = element_text(size=10),
        strip.text.x = element_text(size = 10), strip.text.y = element_text(size = 10),
        legend.position="none")
outf = file.path(outdir_fig_suppl, paste0("FigS4D_ProBound_max_scores_on_AI_variants.pdf"))
ggsave(outf, p, width = 6, height = 4)
   Warning: Removed 125432 rows containing non-finite values (`stat_ydensity()`).
   Warning: Removed 125432 rows containing non-finite values (`stat_boxplot()`).
   Warning: Removed 125432 rows containing non-finite values
   (`stat_compare_means()`).
---
title: "Figure_S4"
output:
   BiocStyle::html_document:
      toc: true
      df_print: paged
      self_contained: true
      code_download: true
      highlight: tango
#bibliography: knn_ml_intro.bib
editor_options: 
  chunk_output_type: inline
---

```{r style, echo=FALSE, results="asis"}
# Report-wide knitr defaults and R session options.
library("knitr")

# Golden ratio constant.
golden_ratio <- (1 + sqrt(5)) / 2

# Chunk defaults: echo the code, no tidying, no caching, and render every
# figure both as PNG and PDF at 300 dpi.
knitr::opts_chunk$set(
  echo = TRUE,
  tidy = FALSE,
  include = TRUE,
  cache = FALSE,
  dev = c("png", "pdf"),
  comment = "  ",
  dpi = 300
)

# Session options: 5 significant digits when printing, 80-column console
# width, cairo for bitmap devices, and no implicit string-to-factor conversion.
options(
  digits = 5,
  width = 80,
  bitmapType = "cairo",
  stringsAsFactors = FALSE
)
```

# Setup and data

```{r}
# Project helper code. NOTE(review): names used below and in later chunks
# (ab_tp_list, load_cht_results, load_genes, ...) are not defined in this
# document and are presumably provided by this script -- confirm.
source("../utils/utils.R")
config <- load_config()

# CHT results: load one table per condition and stack them
cht_full <- ab_tp_list %>%
  lapply(function(ab_tp) load_cht_results(ab_tp, remove_chr = FALSE)) %>%
  bind_rows()

# drop variants on chrX, chrY and chrM
cht <- cht_full %>%
  filter(!TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))

# variants flagged by signif_strongAI
cht_sign <- cht %>%
  filter(signif_strongAI)

# genes, and 1000-bp ranges anchored at the gene start used as promoters
genes <- load_genes()
promoters <- resize(genes, width = 1000, fix = "start")

# combined motif set (all TFs, peaks + alleles)
fimo <- get_full_motif_sets(cht, ab_tp_list)

# motifs in the two alleles only, one table per condition, stacked
fimo_alleles <- ab_tp_list %>%
  lapply(function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)) %>%
  bind_rows()

```



# Figure S4A

```{r, fig.width=6, fig.height=3}

# Figure S4A: per condition, bin peaks by the distance between a TF motif and
# the closest variant in the peak, for peaks carrying significant (AI) variants
# and for a permutation background sampled from peaks without any significant
# variant.

# Seed the RNG so the sampled background, and every number derived from it,
# is identical on each render of the report.
set.seed(42)

# Number of background permutations per condition.
n_perm <- 10000

# get variants distance to motifs (excluding peaks without motifs)
res_df <- lapply(ab_tp_list, function(ab_tp) {
  get_variant_distance2TFmotif(ab_tp, cht, fimo, same_peak = TRUE) %>%
    mutate(condition = ab_tp)
}) %>%
  bind_rows()

# cut() uses right-closed intervals: (-1, 0] is "in motif", (0, 20] is "1-20 bp", ...
dist_breaks <- c(-1, 0, 20, 40, 60, 80, 100, 3000)
dist_labels <- c("in motif", "1-20 bp", "21-40 bp", "41-60 bp", "61-80 bp", "81-100 bp", ">100 bp")

# Keep, for every peak, the smallest variant-to-motif distance, bin it, and
# return one row per distance bin with the number of peaks (`n`) and their
# share of all peaks (`share`).
# `.drop = FALSE` keeps bins that contain no peak as explicit rows with n = 0.
# Without it such bins vanish from the summary, so a permutation with zero
# peaks in a bin would silently drop out of the background distribution
# (inflating the mean background share and shrinking the number of
# permutations that enter any per-bin comparison).
summarise_dist_bins <- function(df) {
  df %>%
    dplyr::group_by(peak_id) %>%
    dplyr::mutate(min_dist = min(dist2motif)) %>%
    dplyr::filter(dist2motif == min_dist) %>%
    dplyr::select(peak_id, dist2motif) %>%
    unique() %>%
    dplyr::ungroup() %>%
    dplyr::mutate(dist_bin = cut(dist2motif, breaks = dist_breaks, labels = dist_labels)) %>%
    dplyr::count(dist_bin, .drop = FALSE) %>%
    dplyr::mutate(share = n / sum(n))
}

# Per-condition results are collected in lists and stacked once after the loop.
res_list <- vector("list", length(ab_tp_list))
bg_list <- vector("list", length(ab_tp_list))

for (k in seq_along(ab_tp_list)) {

  ab_tp <- ab_tp_list[[k]]

  # all variants of this condition
  df_sel <- res_df %>% dplyr::filter(condition == ab_tp)

  # significant variants
  df_sign <- df_sel %>% dplyr::filter(signif_strongAI)
  N_var_sign <- length(unique(df_sign$snp_id))   # number of significant variants in peaks with motifs
  N_peak_sign <- length(unique(df_sign$peak_id)) # number of AI peaks with motifs

  # sd of a single count is undefined, so the real data carry sd = NA
  sign_sum <- summarise_dist_bins(df_sign) %>%
    dplyr::mutate(sd = NA_real_, type = "real", condition = ab_tp)

  # background pool: peaks without any significant variant
  df_non_sign <- df_sel %>%
    dplyr::group_by(peak_id) %>%
    dplyr::mutate(AI_peak = any(signif_strongAI)) %>%
    dplyr::filter(!AI_peak) %>%
    dplyr::ungroup()
  bg_peak_pool <- unique(df_non_sign$peak_id)

  background_sum_all <- lapply(seq_len(n_perm), function(i) {

    # 1. select same number of peaks as in AI peaks
    peak_ids <- sample(bg_peak_pool, N_peak_sign)
    df_bg <- df_non_sign %>% dplyr::filter(peak_id %in% peak_ids)

    # 2. select same number of variants as for AI peaks
    variant_ids <- sample(unique(df_bg$snp_id), N_var_sign)
    df_bg <- df_bg %>% dplyr::filter(snp_id %in% variant_ids)

    N_peak_bg <- length(unique(df_bg$peak_id))

    summarise_dist_bins(df_bg) %>%
      dplyr::mutate(share_full = n / N_peak_bg, condition = ab_tp, i = i, type = "background")

  }) %>%
    bind_rows()

  bg_list[[k]] <- background_sum_all

  # mean and spread of the background over all permutations
  background_sum <- background_sum_all %>%
    dplyr::group_by(dist_bin) %>%
    dplyr::summarize(n = mean(n),
                     share = mean(share),
                     sd = sd(share_full),
                     type = "background",
                     condition = ab_tp)

  res_list[[k]] <- dplyr::bind_rows(sign_sum, background_sum)

}

# one row per condition x permutation x distance bin
background_sum_all_cond <- dplyr::bind_rows(bg_list)
# one row per condition x type (real / background) x distance bin
res_full <- dplyr::bind_rows(res_list)

res_full$dist_bin <- factor(res_full$dist_bin, levels = dist_labels)
res_full$tf_labels <- factor(ab_tp_labels[res_full$condition], levels = ab_tp_labels)
# "real" first: the colour values and legend labels below are matched by level order
res_full$type <- factor(res_full$type, levels = c("real", "background"))

shares_in_motif <- res_full %>%
  dplyr::arrange(tf_labels) %>%
  dplyr::filter(dist_bin == "in motif", type == "real") %>%
  dplyr::select(share) %>% unlist() %>% unique() %>% round(2)

p <- ggplot(res_full, aes(x = dist_bin, y = share, color = type, group = type)) +
  facet_wrap(~tf_labels, ncol = 3) +
  # grey band: background mean +/- 2 sd
  geom_ribbon(data=subset(res_full, type=="background"), aes(x=dist_bin, y = share, ymin = share - sd * 2, ymax = share + sd * 2, group = type), fill = "grey70", colour=NA, alpha = .4) +
  geom_point(size = 2) +
  geom_line(linewidth = 1) +
  scale_color_manual(values = c("grey", "darkblue"), name = "", labels = c("AI peaks with motifs", "background")) +
  theme_bw() +
  ylab("Share of peaks") +
  xlab("Distance from TF motif") +
  theme(axis.text.y = element_text(size=14), axis.text.x = element_text(size=14, angle = 45, hjust = 1),
        axis.title.x = element_text(size=16), axis.title.y = element_text(size=16),
        legend.text=element_text(size=16), legend.title=element_text(size=16))


p
outf <- file.path(outdir_fig_suppl, paste0("FigS4A_dist2motif_vs_background.pdf"))
ggsave(outf, p, width = 12, height = 6)

```

**Empirical p-values for the test: real share is larger than background share**

<br>

**In motif**

| Condition | p-value |
| :----------- | :-----------: |
| Twi 2-4h | `r (sum(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="twi/24")$share >= subset(res_full, type=="real" & condition=="twi/24" & dist_bin=="in motif")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="twi/24")$share) + 1)` |
| CTCF 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="ctcf/68")$share >= subset(res_full, type=="real" & condition=="ctcf/68" & dist_bin=="in motif")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="ctcf/68")$share) + 1)` |
| Mef2 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="mef2/68")$share >= subset(res_full, type=="real" & condition=="mef2/68" & dist_bin=="in motif")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="mef2/68")$share) + 1)` |
| Mef2 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="mef2/1012")$share >= subset(res_full, type=="real" & condition=="mef2/1012" & dist_bin=="in motif")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="mef2/1012")$share) + 1)` |
| Bin 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="bin/68")$share >= subset(res_full, type=="real" & condition=="bin/68" & dist_bin=="in motif")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="bin/68")$share) + 1)` |
| Bin 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="bin/1012")$share >= subset(res_full, type=="real" & condition=="bin/1012" & dist_bin=="in motif")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="in motif" & condition=="bin/1012")$share) + 1)` |


<br>

**1-20 bp**

| Condition | p-value |
| :----------- | :-----------: |
| Twi 2-4h | `r (sum(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="twi/24")$share >= subset(res_full, type=="real" & condition=="twi/24" & dist_bin=="1-20 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="twi/24")$share) + 1)` |
| CTCF 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="ctcf/68")$share >= subset(res_full, type=="real" & condition=="ctcf/68" & dist_bin=="1-20 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="ctcf/68")$share) + 1)` |
| Mef2 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="mef2/68")$share >= subset(res_full, type=="real" & condition=="mef2/68" & dist_bin=="1-20 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="mef2/68")$share) + 1)` |
| Mef2 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="mef2/1012")$share >= subset(res_full, type=="real" & condition=="mef2/1012" & dist_bin=="1-20 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="mef2/1012")$share) + 1)` |
| Bin 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="bin/68")$share >= subset(res_full, type=="real" & condition=="bin/68" & dist_bin=="1-20 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="bin/68")$share) + 1)` |
| Bin 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="bin/1012")$share >= subset(res_full, type=="real" & condition=="bin/1012" & dist_bin=="1-20 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="1-20 bp" & condition=="bin/1012")$share) + 1)` |

<br>

**21-40 bp**

| Condition | p-value |
| :----------- | :-----------: |
| Twi 2-4h | `r (sum(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="twi/24")$share >= subset(res_full, type=="real" & condition=="twi/24" & dist_bin=="21-40 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="twi/24")$share) + 1)` |
| CTCF 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="ctcf/68")$share >= subset(res_full, type=="real" & condition=="ctcf/68" & dist_bin=="21-40 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="ctcf/68")$share) + 1)` |
| Mef2 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="mef2/68")$share >= subset(res_full, type=="real" & condition=="mef2/68" & dist_bin=="21-40 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="mef2/68")$share) + 1)` |
| Mef2 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="mef2/1012")$share >= subset(res_full, type=="real" & condition=="mef2/1012" & dist_bin=="21-40 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="mef2/1012")$share) + 1)` |
| Bin 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="bin/68")$share >= subset(res_full, type=="real" & condition=="bin/68" & dist_bin=="21-40 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="bin/68")$share) + 1)` |
| Bin 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="bin/1012")$share >= subset(res_full, type=="real" & condition=="bin/1012" & dist_bin=="21-40 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="21-40 bp" & condition=="bin/1012")$share) + 1)` |


<br>

**41-60 bp**

| Condition | p-value |
| :----------- | :-----------: |
| Twi 2-4h | `r (sum(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="twi/24")$share >= subset(res_full, type=="real" & condition=="twi/24" & dist_bin=="41-60 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="twi/24")$share) + 1)` |
| CTCF 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="ctcf/68")$share >= subset(res_full, type=="real" & condition=="ctcf/68" & dist_bin=="41-60 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="ctcf/68")$share) + 1)` |
| Mef2 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="mef2/68")$share >= subset(res_full, type=="real" & condition=="mef2/68" & dist_bin=="41-60 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="mef2/68")$share) + 1)` |
| Mef2 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="mef2/1012")$share >= subset(res_full, type=="real" & condition=="mef2/1012" & dist_bin=="41-60 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="mef2/1012")$share) + 1)` |
| Bin 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="bin/68")$share >= subset(res_full, type=="real" & condition=="bin/68" & dist_bin=="41-60 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="bin/68")$share) + 1)` |
| Bin 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="bin/1012")$share >= subset(res_full, type=="real" & condition=="bin/1012" & dist_bin=="41-60 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="41-60 bp" & condition=="bin/1012")$share) + 1)` |


<br>

**61-80 bp**

| Condition | p-value |
| :----------- | :-----------: |
| Twi 2-4h | `r (sum(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="twi/24")$share >= subset(res_full, type=="real" & condition=="twi/24" & dist_bin=="61-80 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="twi/24")$share) + 1)` |
| CTCF 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="ctcf/68")$share >= subset(res_full, type=="real" & condition=="ctcf/68" & dist_bin=="61-80 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="ctcf/68")$share) + 1)` |
| Mef2 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="mef2/68")$share >= subset(res_full, type=="real" & condition=="mef2/68" & dist_bin=="61-80 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="mef2/68")$share) + 1)` |
| Mef2 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="mef2/1012")$share >= subset(res_full, type=="real" & condition=="mef2/1012" & dist_bin=="61-80 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="mef2/1012")$share) + 1)` |
| Bin 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="bin/68")$share >= subset(res_full, type=="real" & condition=="bin/68" & dist_bin=="61-80 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="bin/68")$share) + 1)` |
| Bin 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="bin/1012")$share >= subset(res_full, type=="real" & condition=="bin/1012" & dist_bin=="61-80 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="61-80 bp" & condition=="bin/1012")$share) + 1)` |


<br>

**81-100 bp**

| Condition | p-value |
| :----------- | :-----------: |
| Twi 2-4h | `r (sum(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="twi/24")$share >= subset(res_full, type=="real" & condition=="twi/24" & dist_bin=="81-100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="twi/24")$share) + 1)` |
| CTCF 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="ctcf/68")$share >= subset(res_full, type=="real" & condition=="ctcf/68" & dist_bin=="81-100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="ctcf/68")$share) + 1)` |
| Mef2 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="mef2/68")$share >= subset(res_full, type=="real" & condition=="mef2/68" & dist_bin=="81-100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="mef2/68")$share) + 1)` |
| Mef2 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="mef2/1012")$share >= subset(res_full, type=="real" & condition=="mef2/1012" & dist_bin=="81-100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="mef2/1012")$share) + 1)` |
| Bin 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="bin/68")$share >= subset(res_full, type=="real" & condition=="bin/68" & dist_bin=="81-100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="bin/68")$share) + 1)` |
| Bin 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="bin/1012")$share >= subset(res_full, type=="real" & condition=="bin/1012" & dist_bin=="81-100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin=="81-100 bp" & condition=="bin/1012")$share) + 1)` |


<br>

**>100 bp**

| Condition | p-value |
| :----------- | :-----------: |
| Twi 2-4h | `r (sum(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="twi/24")$share >= subset(res_full, type=="real" & condition=="twi/24" & dist_bin==">100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="twi/24")$share) + 1)` |
| CTCF 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="ctcf/68")$share >= subset(res_full, type=="real" & condition=="ctcf/68" & dist_bin==">100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="ctcf/68")$share) + 1)` |
| Mef2 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="mef2/68")$share >= subset(res_full, type=="real" & condition=="mef2/68" & dist_bin==">100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="mef2/68")$share) + 1)` |
| Mef2 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="mef2/1012")$share >= subset(res_full, type=="real" & condition=="mef2/1012" & dist_bin==">100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="mef2/1012")$share) + 1)` |
| Bin 6-8h | `r (sum(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="bin/68")$share >= subset(res_full, type=="real" & condition=="bin/68" & dist_bin==">100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="bin/68")$share) + 1)` |
| Bin 10-12h | `r (sum(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="bin/1012")$share >= subset(res_full, type=="real" & condition=="bin/1012" & dist_bin==">100 bp")$share) + 1) / (length(subset(background_sum_all_cond, dist_bin==">100 bp" & condition=="bin/1012")$share) + 1)` |




# Figure S4B - Fisher's tests: enrichment of significant variants in motifs, inside / outside peaks

```{r}
# Figure S4B: per condition, Fisher's exact test for enrichment of significant
# variants among variants overlapping the cognate TF motif, computed separately
# for variants inside peaks and outside peaks.

# Number of distinct variants (snp_id values) in a table.
n_unique_snps <- function(df) {
  dplyr::n_distinct(df$snp_id)
}

# Run Fisher's exact test for one variant group, print it under `header`, and
# return the counts and the test result as a one-row data frame. Building typed
# rows keeps counts, odds ratio and p-value numeric instead of passing them
# through a character vector.
# NOTE(review): the second column of the 2x2 table holds the totals of all
# significant / non-significant variants, i.e. it still includes the variants
# counted in the motif column -- confirm this is the intended contrast.
test_motif_enrichment <- function(condition, motif, n_sign, n_ns,
                                  n_sign_motif, n_ns_motif, group, header) {
  mat <- matrix(c(n_sign_motif, n_ns_motif, n_sign, n_ns), nrow = 2)
  ft <- fisher.test(mat)
  print(header)
  print(ft)
  data.frame(
    condition = condition,
    motif = motif,
    N_sign = n_sign,
    N_ns = n_ns,
    N_sign_motif = n_sign_motif,
    N_ns_motif = n_ns_motif,
    N_motif = n_sign_motif + n_ns_motif,
    group = group,
    odd_ratio = unname(ft$estimate),
    pvalue = ft$p.value,
    stringsAsFactors = FALSE
  )
}

# two result rows (in peaks / outside peaks) per condition
fisher_rows <- vector("list", 2 * length(ab_tp_list))
i <- 1

for (ab_tp in ab_tp_list) {

  tf <- TFs[ab_tp]
  print(ab_tp)
  cht_sel <- cht %>% filter(condition == ab_tp)

  # significant variants - in (a) and outside (b) peaks
  N1_a <- cht_sel %>% filter(signif_strongAI & dist2summit < 250) %>% n_unique_snps()
  N1_b <- cht_sel %>% filter(signif_strongAI & dist2summit >= 250) %>% n_unique_snps()

  # n.s. variants - in (a) and outside (b) peaks
  N2_a <- cht_sel %>% filter(!signif_strongAI & dist2summit < 250) %>% n_unique_snps()
  N2_b <- cht_sel %>% filter(!signif_strongAI & dist2summit >= 250) %>% n_unique_snps()

  # variants in the motifs of selected TFs
  fimo_selected <- parse_motifs_in_two_alleles(ab_tp, cht_sel, radius = 15, path2_base1 = "/all_variants_alleles/FIMO/",
                                               path2_base2 = "combined_motifs/fimo.tsv",
                                               peak_radius = 250, subset_motif = FALSE, subset_cht = TRUE)

  # 1. Enrichment in cognate motifs
  print("1. Enrichment in cognate motifs")
  fimo_sel <- fimo_selected %>% filter(motif_alt_id == tf)

  # number of significant variants in motifs - in and outside peaks
  N3_a <- fimo_sel %>% filter(signif_strongAI & in_peak) %>% n_unique_snps()
  N3_b <- fimo_sel %>% filter(signif_strongAI & !in_peak) %>% n_unique_snps()

  # number of n.s. variants in motifs - in and outside peaks
  N4_a <- fimo_sel %>% filter(!signif_strongAI & in_peak) %>% n_unique_snps()
  N4_b <- fimo_sel %>% filter(!signif_strongAI & !in_peak) %>% n_unique_snps()

  fisher_rows[[i]] <- test_motif_enrichment(ab_tp, as.character(tf), N1_a, N2_a, N3_a, N4_a,
                                            group = "in peaks", header = "1b. Variants in peaks")
  i <- i + 1

  fisher_rows[[i]] <- test_motif_enrichment(ab_tp, as.character(tf), N1_b, N2_b, N3_b, N4_b,
                                            group = "outside peaks", header = "1c. Variants outside peaks")
  i <- i + 1

}

out_df <- dplyr::bind_rows(fisher_rows)

# Attach plot labels by condition name rather than by row position, so a row
# always gets the label of its own condition.
out_df$label <- factor(unname(ab_tp_labels[out_df$condition]), levels = ab_tp_labels)


p <- ggplot(out_df, aes(x = label, y = odd_ratio)) +
  facet_wrap(~group) +
  # odds ratio of 1 = no enrichment
  geom_hline(yintercept = 1, color = "darkred") +
  geom_bar(aes(fill = -log10(pvalue)), color = "darkblue", stat = "identity", position = "dodge", width = 0.5) +
  #scale_fill_manual(name = "", values = c("darkblue", "darkgrey"), labels = c("AI peaks", "non-AI peaks")) +
  #geom_text(aes(label = round(r1, 2), x = label, y = odds_ratio + 0.07), data = df, size = 6) +
  ylab("Enrichment in cognate TF motifs \nFisher's Test Odds Ratio") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1, colour = TFcols),
        axis.text.y = element_text(size = 12),
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 16),
        legend.text = element_text(size=14),
        legend.title = element_text(size=14))


p

outf <- file.path(outdir_fig_suppl, paste0("FigS4B_motifs_in_peaks_fisher.pdf"))
ggsave(outf, p, width = 10, height = 6)
```


# Figure S4C - Motif prediction of AI

```{r}

# Figure S4C: relation between the motif score change of a variant
# (REF score - ALT score) and its allelic imbalance (AI).

# Variants with |delta_score| at or below this threshold are left out of the
# summary statistics and the plot.
score_thres <- 1

# Variants in peaks that have a motif score for both alleles. A variant is
# "concordant" when delta_score > 0 coincides with AI > 0.5, or delta_score < 0
# with AI < 0.5; everything else is "discordant".
df_shared <- fimo_alleles %>%
  filter(in_peak & !is.na(score.ref) & !is.na(score.alt)) %>%
  mutate(delta_score = as.numeric(score.ref) - as.numeric(score.alt),
         type = ifelse((AI > 0.5 & delta_score > 0) | (AI < 0.5 & delta_score < 0), "concordant", "discordant"))

# overview per condition and variant class (printed in the report)
df_shared %>%
  group_by(condition, is_indel) %>%
  summarize(min(abs(delta_score)), max(dist2summit), cor(delta_score, AI), share_concordant = sum(type == "concordant") / n(), n())

# One subset feeds both the annotated statistics and the plotted points, so the
# R and % concordant shown in the figure describe exactly the points drawn.
# NOTE(review): the plot previously lacked the `dist2summit < 250` condition;
# if `in_peak` already implies it, the plotted points are unchanged -- confirm.
df_plot <- df_shared %>%
  filter(abs(delta_score) > score_thres & dist2summit < 250)

df_sum <- df_plot %>%
  summarize(min(abs(delta_score)), max(dist2summit), cor = cor(delta_score, AI), share_concordant = sum(type == "concordant") / n(), n())

cor <- round(df_sum$cor, 2)
n_conc <- round(df_sum$share_concordant, 2) * 100
n_disc <- (1 - round(df_sum$share_concordant, 2)) * 100


# Line widths are set with `linewidth`; `size` for lines is deprecated in the
# ggplot2 3.4.x series used by this report.
p <- ggplot(df_plot, aes(x = delta_score, y = AI, color = type)) +
  geom_point(size = 1, color = cbPalette[2]) +
  geom_smooth(method = "lm", se = FALSE, color = "darkblue", linewidth = 0.5) +
  geom_vline(xintercept = -score_thres, color = "grey", linewidth = 0.7) +
  geom_vline(xintercept = score_thres, color = "grey", linewidth = 0.7) +
  geom_hline(yintercept = 0.5, color = "grey", linewidth = 0.7) +
  #geom_hline(yintercept = 0.4, color = "grey", linewidth = 0.7) +
  theme_bw() +
  annotate(geom = "text", x = -3.5, y = 0.95, label = paste("R=", cor)) +
  annotate(geom = "text", x = -3.5, y = 0.88, label = paste("% concordant: ", n_conc), size = 4) +
  xlab("Motif score change (REF-ALT)") +
  ylab("Allele Imbalance") +
  #scale_color_manual(values = c(cbPalette[2], "darkgrey"), labels = c(paste0("concordant, ", n_conc, "%"), paste0("discordant, ", n_disc, "%")), name = "Variant type") +
  theme(axis.text=element_text(size=12),
        axis.title=element_text(size=14),
        legend.text = element_text(size=12),
        legend.title = element_text(size=12))


p
outf <- file.path(outdir_fig_suppl, paste0("FigS4C_motifs_AI_prediction.pdf"))
ggsave(outf, p, width = 4, height = 4)

```



# Figure S4D - ProBound scores on AI and non-AI variants

```{r}
# Figure S4D: ProBound scores of variants with and without significant allelic
# imbalance, per condition.

# Directory holding one ProBound score table per TF and allele.
probound_dir <- "/g/furlong/project/68_F1_cisreg_ichip/analysis/ProBound"
probound_tfs <- c("CTCF", "Bin", "Mef2", "Twi")

# Read the ProBound tables of all TFs for one allele ("REF" or "ALT"), tag each
# row with its TF, stack the tables, and give the columns allele-specific names.
read_probound_scores <- function(allele) {
  per_tf <- lapply(probound_tfs, function(tf) {
    path <- file.path(probound_dir,
                      paste0("ProBound_", tf, "_", allele, "_sequence_15bp_around_variants.txt"))
    scores <- read.table(path, header = TRUE)
    scores$TF <- tf
    scores
  })
  scores <- do.call(rbind, per_tf)
  colnames(scores) <- c("variant_chr", "variant_pos", "REF", "ALT",
                        paste0("ProBound_bed_", allele),
                        paste0("ProBound_seq_", allele),
                        paste0("ProBound_score_", allele),
                        "TF")
  scores
}

# Scores scaled to the per-TF maximum; ungroup afterwards so no grouping
# leaks into the merge below.
ProBound_results_REF <- read_probound_scores("REF") %>%
  group_by(TF) %>%
  mutate(ProBound_score_scaled_REF = ProBound_score_REF / max(ProBound_score_REF)) %>%
  ungroup()

ProBound_results_ALT <- read_probound_scores("ALT") %>%
  group_by(TF) %>%
  mutate(ProBound_score_scaled_ALT = ProBound_score_ALT / max(ProBound_score_ALT)) %>%
  ungroup()


# Pair REF and ALT scores per variant and TF, keeping every REF row.
ProBound_results <- merge(ProBound_results_REF, ProBound_results_ALT,
                          by = c("variant_chr", "variant_pos", "REF", "ALT", "TF"), all.x = TRUE)
ProBound_results$variant_ID <- paste(ProBound_results$variant_chr, ProBound_results$variant_pos, ProBound_results$TF, sep = "_")
# larger of the two allele scores (NA when either score is missing)
ProBound_results$ProBound_score_max <- pmax(ProBound_results$ProBound_score_ALT, ProBound_results$ProBound_score_REF)


# Join key: snp_id plus antibody on the CHT side, chr_pos_TF on the ProBound side.
cht$variant_ID <- paste(cht$snp_id, cht$ab, sep = "_")
cht_ProBound <- merge(cht, ProBound_results, by = "variant_ID")
cht_ProBound$signif_strongAI <- factor(cht_ProBound$signif_strongAI, levels = c(TRUE, FALSE))


# The y axis shows ProBound_score_max on a log10 scale; the axis title states
# that quantity (the previous title was garbled and named a different one).
p <- ggplot(cht_ProBound, aes(x=signif_strongAI, y=ProBound_score_max, fill=signif_strongAI)) +
      facet_wrap(~condition) +
      geom_violin(fill = "darkblue", alpha = 0.3) +
      geom_boxplot(width = 0.4, outlier.size = 0.1, fill = "darkblue", alpha = 0.5) +
      scale_y_log10() +
      stat_compare_means(method = "wilcox") +
      xlab("Significant allelic imbalance") +
      ylab("Max. ProBound score (REF, ALT), log10 scale") +
      theme_bw() +
      theme(axis.text.y = element_text(size=10), axis.text.x = element_text(size=10),
        axis.title.x = element_text(size=10), axis.title.y = element_text(size=10),
        strip.text.x = element_text(size = 10), strip.text.y = element_text(size = 10),
        legend.position="none")
outf <- file.path(outdir_fig_suppl, paste0("FigS4D_ProBound_max_scores_on_AI_variants.pdf"))
ggsave(outf, p, width = 6, height = 4)
```
