Contents

1 Setup and data

source("../utils/utils.R")
   ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
   ✔ dplyr     1.1.0     ✔ readr     2.1.4
   ✔ forcats   1.0.0     ✔ stringr   1.5.0
   ✔ ggplot2   3.4.1     ✔ tibble    3.2.0
   ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
   ✔ purrr     1.0.1     
   ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
   ✖ dplyr::filter() masks stats::filter()
   ✖ dplyr::lag()    masks stats::lag()
   ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
   
   Attaching package: 'magrittr'
   
   
   The following object is masked from 'package:purrr':
   
       set_names
   
   
   The following object is masked from 'package:tidyr':
   
       extract
   
   
   Loading required package: GenomicRanges
   
   Loading required package: stats4
   
   Loading required package: BiocGenerics
   
   
   Attaching package: 'BiocGenerics'
   
   
   The following objects are masked from 'package:lubridate':
   
       intersect, setdiff, union
   
   
   The following objects are masked from 'package:dplyr':
   
       combine, intersect, setdiff, union
   
   
   The following objects are masked from 'package:stats':
   
       IQR, mad, sd, var, xtabs
   
   
   The following objects are masked from 'package:base':
   
       anyDuplicated, aperm, append, as.data.frame, basename, cbind,
       colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
       get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
       match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
       Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
       table, tapply, union, unique, unsplit, which.max, which.min
   
   
   Loading required package: S4Vectors
   
   
   Attaching package: 'S4Vectors'
   
   
   The following objects are masked from 'package:lubridate':
   
       second, second<-
   
   
   The following objects are masked from 'package:dplyr':
   
       first, rename
   
   
   The following object is masked from 'package:tidyr':
   
       expand
   
   
   The following objects are masked from 'package:base':
   
       expand.grid, I, unname
   
   
   Loading required package: IRanges
   
   
   Attaching package: 'IRanges'
   
   
   The following object is masked from 'package:lubridate':
   
       %within%
   
   
   The following objects are masked from 'package:dplyr':
   
       collapse, desc, slice
   
   
   The following object is masked from 'package:purrr':
   
       reduce
   
   
   Loading required package: GenomeInfoDb
   
   
   Attaching package: 'GenomicRanges'
   
   
   The following object is masked from 'package:magrittr':
   
       subtract
   
   
   Loading required package: grid
   
   Loading required package: Biostrings
   
   Loading required package: XVector
   
   
   Attaching package: 'XVector'
   
   
   The following object is masked from 'package:purrr':
   
       compact
   
   
   
   Attaching package: 'Biostrings'
   
   
   The following object is masked from 'package:grid':
   
       pattern
   
   
   The following object is masked from 'package:base':
   
       strsplit
   
   
   
   Attaching package: 'gridExtra'
   
   
   The following object is masked from 'package:BiocGenerics':
   
       combine
   
   
   The following object is masked from 'package:dplyr':
   
       combine
   
   
   
   Attaching package: 'data.table'
   
   
   The following object is masked from 'package:GenomicRanges':
   
       shift
   
   
   The following object is masked from 'package:IRanges':
   
       shift
   
   
   The following objects are masked from 'package:S4Vectors':
   
       first, second
   
   
   The following objects are masked from 'package:lubridate':
   
       hour, isoweek, mday, minute, month, quarter, second, wday, week,
       yday, year
   
   
   The following objects are masked from 'package:dplyr':
   
       between, first, last
   
   
   The following object is masked from 'package:purrr':
   
       transpose
   
   
   Loading required package: AnnotationDbi
   
   Loading required package: Biobase
   
   Welcome to Bioconductor
   
       Vignettes contain introductory material; view with
       'browseVignettes()'. To cite Bioconductor, see
       'citation("Biobase")', and for packages 'citation("pkgname")'.
   
   
   
   Attaching package: 'AnnotationDbi'
   
   
   The following object is masked from 'package:dplyr':
   
       select
   
   
   
   
   
   
   Registered S3 method overwritten by 'gplots':
     method         from 
     reorder.factor gdata
   
   ChIPseeker v1.34.1  For help: https://guangchuangyu.github.io/software/ChIPseeker
   
   If you use ChIPseeker in published research, please cite:
   Qianwen Wang, Ming Li, Tianzhi Wu, Li Zhan, Lin Li, Meijun Chen, Wenqin Xie, Zijing Xie, Erqiang Hu, Shuangbin Xu, Guangchuang Yu. Exploring epigenomic datasets by ChIPseeker. Current Protocols 2022, 2(10): e585
   
   Loading required package: graph
   
   
   Attaching package: 'graph'
   
   
   The following object is masked from 'package:Biostrings':
   
       complement
   
   
   The following object is masked from 'package:stringr':
   
       boundary
   
   
   Loading required package: GO.db
   
   Loading required package: SparseM
   
   
   Attaching package: 'SparseM'
   
   
   The following object is masked from 'package:base':
   
       backsolve
   
   
   
   groupGOTerms:    GOBPTerm, GOMFTerm, GOCCTerm environments built.
   
   
   Attaching package: 'topGO'
   
   
   The following object is masked from 'package:grid':
   
       depth
   
   
   The following object is masked from 'package:IRanges':
   
       members
   
   
   Loading required package: GenomicFeatures
   
   
   Attaching package: 'GenomicFeatures'
   
   
   The following object is masked from 'package:topGO':
   
       genes
config = load_config()

# load CHT results
cht_full = lapply(ab_tp_list, function(ab_tp) load_cht_results(ab_tp, remove_chr = F)) %>% bind_rows()
cht = cht_full %>% filter(!TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))
cht_sign = cht %>% filter(signif_strongAI) 

# genes and promoters
genes = load_genes()
promoters = resize(genes, width = 1000, fix = "start")

# combined motif set (all TFs, peaks + alleles)
fimo = get_full_motif_sets(cht, ab_tp_list)
# only alleles
fimo_alleles  = lapply(ab_tp_list, function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)) %>% bind_rows() 

2 Figure S3B: Indels length

cht_sel = cht %>%
  filter(signif_strongAI) %>%
  group_by(peak_id) %>%
  #mutate(min_pval = min(P.VALUE)) %>%
  #filter(P.VALUE == min_pval) %>%
  mutate(max_indel = max(indel_length), min_indel = min(indel_length)) %>%
  filter(indel_length == max_indel) %>%
#  mutate(max_AI = max(AI_abs)) %>%
#  filter(AI_abs == max_AI) %>%
  select(condition, peak_id, indel_length, AI_abs) %>%
  unique() %>%
  mutate(bin_indel = cut(indel_length, breaks = c(0, 1, 10, Inf), labels = c("SNP", "2-10", ">10"), include.lowest = T))


cht_sel$label = factor(ab_tp_labels[cht_sel$condition], levels = ab_tp_labels)


p = ggplot(cht_sel, aes(x = bin_indel, y = AI_abs, fill = bin_indel)) + 
  geom_boxplot(width = 0.6, outlier.size = 0.1) +
  scale_fill_manual(values = cbPalette, name = "Indel length") +
  scale_y_continuous(trans = "log2", limits = c(0.1, 0.7)) +
  stat_compare_means(comparisons = list(c("SNP", ">10"), c("SNP", "2-10"))) +
  stat_compare_means() +
  xlab("") +
  ylab("Allele imbalance") +
  theme_bw() +
  facet_grid(~label) +
  stat_summary(fun.data = function(x) c(y = median(x) + 0.1, label = length(x)), geom = "text", size = 4) +
  theme(axis.text.x = element_blank(), 
        #axis.text.x = element_text(size=16, angle = 45, hjust = 1, color = TFcols), 
        axis.text.y = element_text(size=16), 
        axis.title.x = element_text(size=18), axis.title.y = element_text(size=16),
        strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14),
        legend.text=element_text(size=16), legend.title=element_text(size=16))


cht_sel %>% 
  group_by(condition) %>%
  mutate(N = n()) %>%
  group_by(condition, bin_indel) %>%
  dplyr::summarize(n = n(), share = n / mean(N), res_share = 1 - share)
   `summarise()` has grouped output by 'condition'. You can override using the
   `.groups` argument.
p

outf = file.path(outdir_fig_suppl, paste0("FigS3B_indels_AI.pdf"))
ggsave(outf, p, width = 18, height = 5)

3 Figure S3C: Indels length

cht = lapply(ab_tp_list, function(ab_tp) load_cht_results(ab_tp)) %>% bind_rows()
cht_gr = cht %>% mutate(chr = TEST.SNP.CHROM, start = TEST.SNP.POS, end = TEST.SNP.POS) %>% GRanges()

# F1 ATAC-seq results

# coordinates of ATAC peaks
atac_regions_path = "/g/furlong/project/68_F1_cisreg_ichip/data/F1_paper_multiom/ATAC_feature_location_dm6.bed"
atac_regions = import(atac_regions_path, format = "bed")

# imbalance info for ATAC peaks
atac_ai_path = "/g/furlong/project/68_F1_cisreg_ichip/data/F1_paper_multiom/ATAC_all_peaks_atac_X.txt"
atac_ai = read.delim(atac_ai_path)

# additional annotations for allele imbalance
atac_ai_sum = atac_ai %>% 
  mutate(AI = padj < 0.01 & abs(0.5 - meanprop) > 0.1) %>%
  group_by(feature, time) %>% 
  summarize(n_AI = sum(AI), 
            AI_peak = any(AI),
            mean_AI = abs(0.5 - mean(meanprop)), 
            max_AI = max(abs(0.5 - meanprop)),
            n_het_lines =n())
   `summarise()` has grouped output by 'feature'. You can override using the
   `.groups` argument.
# combine AII info with peak coordinates
atac_f1_df = merge(data.frame(atac_regions), atac_ai_sum, by.x = "name", by.y = "feature") %>% filter(!seqnames %in% c("chrX", "chrY"))
atac_ai_gr = GRanges(atac_f1_df)


# Do the analysis per TF and ATAC time-point


df = data.frame()

for (ab_tp in ab_tp_list) {
  
  tp = gsub("-", "", timepoints[ab_tp])
  
  # get TF peaks
  AI_peaks = get_peaks_from_cht(ab_tp, cht)
  names(mcols(AI_peaks)) = paste0(names(mcols(AI_peaks)), ".tf")
  
  # get ATAC peaks for selected time-point
  AI_atac = atac_ai_gr[atac_ai_gr$time == tp]
  names(mcols(AI_atac)) = paste0(names(mcols(AI_atac)), ".atac")
  
  ov = findOverlaps(AI_peaks, AI_atac)
  
  
  res = cbind.data.frame(mcols(AI_peaks[queryHits(ov)]), mcols(AI_atac[subjectHits(ov)]))
  
  print(ab_tp)
  wt = wilcox.test(mean_AI.atac ~ AI_peak.tf, res, alternative = "less")
  print(wt)
  wt = wilcox.test(max_AI.atac ~ AI_peak.tf, res, alternative = "less")
  print(wt)
  
  df = rbind.data.frame(df, res)
  
}
   `summarise()` has grouped output by 'peak_id', 'condition', 'chr', 'start'. You
   can override using the `.groups` argument.
   [1] "twi/24"
   
    Wilcoxon rank sum test with continuity correction
   
   data:  mean_AI.atac by AI_peak.tf
   W = 1408786, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   
   
    Wilcoxon rank sum test with continuity correction
   
   data:  max_AI.atac by AI_peak.tf
   W = 1317170, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   `summarise()` has grouped output by 'peak_id', 'condition', 'chr', 'start'. You
   can override using the `.groups` argument.
   [1] "ctcf/68"
   
    Wilcoxon rank sum test with continuity correction
   
   data:  mean_AI.atac by AI_peak.tf
   W = 493598, p-value = 1.2e-11
   alternative hypothesis: true location shift is less than 0
   
   
    Wilcoxon rank sum test with continuity correction
   
   data:  max_AI.atac by AI_peak.tf
   W = 444450, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   `summarise()` has grouped output by 'peak_id', 'condition', 'chr', 'start'. You
   can override using the `.groups` argument.
   [1] "mef2/68"
   
    Wilcoxon rank sum test with continuity correction
   
   data:  mean_AI.atac by AI_peak.tf
   W = 947570, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   
   
    Wilcoxon rank sum test with continuity correction
   
   data:  max_AI.atac by AI_peak.tf
   W = 879208, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   `summarise()` has grouped output by 'peak_id', 'condition', 'chr', 'start'. You
   can override using the `.groups` argument.
   [1] "mef2/1012"
   
    Wilcoxon rank sum test with continuity correction
   
   data:  mean_AI.atac by AI_peak.tf
   W = 1304888, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   
   
    Wilcoxon rank sum test with continuity correction
   
   data:  max_AI.atac by AI_peak.tf
   W = 1237730, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   `summarise()` has grouped output by 'peak_id', 'condition', 'chr', 'start'. You
   can override using the `.groups` argument.
   [1] "bin/68"
   
    Wilcoxon rank sum test with continuity correction
   
   data:  mean_AI.atac by AI_peak.tf
   W = 416654, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   
   
    Wilcoxon rank sum test with continuity correction
   
   data:  max_AI.atac by AI_peak.tf
   W = 417385, p-value <2e-16
   alternative hypothesis: true location shift is less than 0
   `summarise()` has grouped output by 'peak_id', 'condition', 'chr', 'start'. You
   can override using the `.groups` argument.
   [1] "bin/1012"
   
    Wilcoxon rank sum test with continuity correction
   
   data:  mean_AI.atac by AI_peak.tf
   W = 397406, p-value = 3.2e-07
   alternative hypothesis: true location shift is less than 0
   
   
    Wilcoxon rank sum test with continuity correction
   
   data:  max_AI.atac by AI_peak.tf
   W = 369466, p-value = 1.5e-11
   alternative hypothesis: true location shift is less than 0
df %<>% 
  mutate(label = factor(ab_tp_labels[condition.tf], levels = ab_tp_labels))

df %>% group_by(condition.tf, AI_peak.tf) %>%
  summarize(mean(mean_AI.atac, mean(max_AI.atac)))
   `summarise()` has grouped output by 'condition.tf'. You can override using the
   `.groups` argument.
p = ggplot(df, aes(x = AI_peak.tf,  y = max_AI.atac)) +
  geom_violin(fill = "darkblue", alpha = 0.3) +
  geom_boxplot(width = 0.4, outlier.size = 0.1, fill = "darkblue", alpha = 0.7) +
  facet_wrap(~ label, ncol = 3) +
  theme_bw() +
  stat_compare_means() +
  xlab("AI TF peaks (F1)") +
  ylab("Allele imbalance of ATAC peaks (F1)") +
  theme(axis.text.y = element_text(size=12), axis.text.x = element_text(size=12), 
        axis.title.x = element_text(size=16), axis.title.y = element_text(size=16),
        strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14))

print(p)

ggsave(file.path(outdir_fig_suppl, "FigS3C_AI_ATAC_by_TF.pdf"), p, width = 9, height = 6)

4 Figure S3D

# only quantified peaks for CHT
#peaks = lapply(ab_tp_list, function(ab_tp) get_peaks_from_cht(ab_tp, cht, as_granges = T))
#names(peaks) = ab_tp_list

peaks = get_consensus_peaksets_with_AI(ab_tp_list, cht, filter = F)

dhs = load_dhs() %>% GRanges()
dhs$DHScond = cut(dhs$num_conditions, breaks = c(1, 2, 17, 19), include.lowest = T)
dhs$DHS_modERN = cut(dhs$num_modERN,breaks = c(0, 9, 251), include.lowest = T)


res = lapply(c("distal", "proximal"), function(TSS_type) {

  dhs_gr = dhs %>% as.data.frame() %>% filter(TSS == TSS_type) %>% GRanges()
  
  # DHS conditions
  df = lapply(peaks, function(x) {ov = findOverlaps(x, dhs_gr);
                                  cbind.data.frame(as.data.frame(x[queryHits(ov)]), as.data.frame(dhs_gr[subjectHits(ov)])) }) %>%
            bind_rows()
  
  fts1 = lapply(ab_tp_list, function(ab_tp) {fisher_test_two_groups(df, ab_tp,
                                                                    group1 = "isAI", group1_val = c(TRUE, FALSE), 
                                                                    group2 = "DHScond", group2_val = c("[1,2]", "(17,19]"))}) %>%
            bind_rows() %>% mutate(comp = "DHS conditions\n(1-2 vs. 18-19)", labels = ab_tp_labels)
  
  # modERN TFs
  fts2 = lapply(ab_tp_list, function(ab_tp) {fisher_test_two_groups(df, ab_tp,
                                                                    group1 = "isAI", group1_val = c(TRUE, FALSE), 
                                                                    group2 = "DHS_modERN", group2_val = c("(9,251]", "[0,9]"))}) %>%
            bind_rows() %>% mutate(comp = "modERN TFs\n(31-251 vs. 0-5)", labels = ab_tp_labels)
  
  
  res = rbind.data.frame(fts1, fts2) %>% mutate(TSS_type = paste("TSS", TSS_type)) 
  
  
}) %>% bind_rows()
   New names:
   New names:
   New names:
   New names:
   New names:
   New names:
   New names:
   New names:
   New names:
   New names:
   New names:
   New names:
   • `seqnames` -> `seqnames...1`
   • `start` -> `start...2`
   • `end` -> `end...3`
   • `width` -> `width...4`
   • `strand` -> `strand...5`
   • `score` -> `score...7`
   • `seqnames` -> `seqnames...13`
   • `start` -> `start...14`
   • `end` -> `end...15`
   • `width` -> `width...16`
   • `strand` -> `strand...17`
   • `score` -> `score...21`
# ModERN TFs

df = res %>% filter(comp2 == "DHS_modERN_(9,251]_vs_[0,9]") 
df$label = ab_tp_labels
df$label = factor(df$label, levels = ab_tp_labels)

p = ggplot(df, aes(x = label, y = odds_ratio)) +
  facet_wrap(~TSS_type) +
  geom_hline(yintercept = 1, color = "darkred") +
  geom_bar(aes(fill = -log10(pval)), color = "darkblue", stat = "identity", position = "dodge", width = 0.5) +
  #scale_fill_manual(name = "", values = c("darkblue", "darkgrey"), labels = c("AI peaks", "non-AI peaks")) +
  geom_text(aes(label = round(r1, 2), x = label, y = odds_ratio + 0.05), data = df, size = 6) +
  ylab("Depletioin in ubiquitously bound regions (>10 modERN TFs) \nFisher's Test Odds Ratio") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1, colour = TFcols),
        axis.text.y = element_text(size = 12), 
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 14),
        legend.text = element_text(size=14),
        legend.title = element_text(size=14))
   Warning: Vectorized input to `element_text()` is not officially supported.
   ℹ Results may be unexpected or may change in future versions of ggplot2.
p

outf = file.path(outdir_fig_suppl, paste0("FigS3D_modERN_fisher.pdf"))
ggsave(outf, p, width = 10, height = 6)

5 Figure S3E

df = res %>% filter(comp2 == "DHScond_[1,2]_vs_(17,19]") 
df$label = ab_tp_labels
df$label = factor(df$label, levels = ab_tp_labels)



p = ggplot(df, aes(x = label, y = odds_ratio)) +
  facet_wrap(~TSS_type) +
  geom_hline(yintercept = 1, color = "darkred") +
  geom_bar(aes(fill = -log10(pval)), color = "darkblue", stat = "identity", position = "dodge", width = 0.5) +
  #scale_fill_manual(name = "", values = c("darkblue", "darkgrey"), labels = c("AI peaks", "non-AI peaks")) +
  geom_text(aes(label = round(r1, 2), x = label, y = odds_ratio + 0.07), data = df, size = 6) +
  ylab("Enrichment in condition-specific DHS \nFisher's Test Odds Ratio") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1, colour = TFcols),
        axis.text.y = element_text(size = 12), 
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 16),
        legend.text = element_text(size=14),
        legend.title = element_text(size=14))
   Warning: Vectorized input to `element_text()` is not officially supported.
   ℹ Results may be unexpected or may change in future versions of ggplot2.
p

outf = file.path(outdir_fig_suppl, paste0("FigS3E_nonubiqDHS_fisher.pdf"))
ggsave(outf, p, width = 10, height = 6)

6 Figure S3F - GO Enrichment for AI peaks

for (cond in c(ab_tp_list, list(ab_tp_list))) {
  
    test_geneID = get_test_geneID(cht, cond, 1000)
    background_geneID = get_background_geneID(cht, cond, 1000, test_geneID)

    out = create_GOdata(test_geneID, background_geneID, "BP")
    GOdata = out[[1]]
    test = out[[2]]
    n_nodes = length(attributes(GOdata)$graph@nodes)

    # Classic Fisher test
    test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
    resultFisher <- getSigGroups(GOdata, test.stat)

    # KS test
    test.stat <- new("classicScore", testStatistic = GOKSTest, name = "KS tests")
    resultKS <- getSigGroups(GOdata, test.stat)

    # Weight algorithm
    test.stat <- new("weightCount", testStatistic = GOFisherTest, name = "Fisher test", sigRatio = "ratio")
    resultWeight <- getSigGroups(GOdata, test.stat)
    
    allRes <- GenTable(GOdata, classic = resultFisher, 
           KS = resultKS, weight = resultWeight,
           orderBy = "weight", ranksOf = "classic", topNodes = n_nodes)
    allRes$Fold_Enrichment = allRes$Significant / allRes$Expected
    allRes$FDR = allRes$weight
    
    top_results = select_top_GO(allRes, 500, 10, 0.05, 10) 

    p = ggplot_GO_enrichment(top_results, nrow(as.data.frame(test)), length(as.data.frame(test)[as.data.frame(test)[,1] == 1, ]), cond)
    print(p)
    out_filename = paste0("FigS3F_GO_enrichment_AI_peaks_", gsub("/", "_", paste(cond, collapse = '_')) ,".pdf")
    ggsave(file.path(outdir_fig_suppl, out_filename), p, width = 6, height = 6)
}
   >> preparing features information...      2024-11-29 01:49:24 PM 
   >> identifying nearest features...        2024-11-29 01:49:24 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:49:24 PM 
   >> assigning genomic annotation...        2024-11-29 01:49:24 PM 
   >> adding gene annotation...          2024-11-29 01:49:29 PM
   'select()' returned 1:1 mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:49:29 PM 
   >> done...                    2024-11-29 01:49:29 PM 
   >> preparing features information...      2024-11-29 01:49:30 PM 
   >> identifying nearest features...        2024-11-29 01:49:30 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:49:30 PM 
   >> assigning genomic annotation...        2024-11-29 01:49:30 PM 
   >> adding gene annotation...          2024-11-29 01:49:31 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:49:31 PM 
   >> done...                    2024-11-29 01:49:31 PM
   'select()' returned 1:1 mapping between keys and columns
   'select()' returned 1:many mapping between keys and columns
   
   Building most specific GOs .....
    ( 3583 GO terms found. )
   
   Build GO DAG topology ..........
    ( 6174 GO terms and 13460 relations. )
   
   Annotating nodes ...............
    ( 3270 genes annotated to the GO terms. )
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 3229 nontrivial nodes
         parameters: 
             test statistic: Fisher test
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 6174 nontrivial nodes
         parameters: 
             test statistic: KS tests
             score order: increasing
   
             -- Weight Algorithm -- 
   
         The algorithm is scoring 3229 nontrivial nodes
         parameters: 
             test statistic: Fisher test : ratio
   
     Level 17:  4 nodes to be scored.
   
     Level 16:  11 nodes to be scored.
   
     Level 15:  20 nodes to be scored.
   
     Level 14:  37 nodes to be scored.
   
     Level 13:  74 nodes to be scored.
   
     Level 12:  130 nodes to be scored.
   
     Level 11:  226 nodes to be scored.
   
     Level 10:  319 nodes to be scored.
   
     Level 9:   452 nodes to be scored.
   
     Level 8:   450 nodes to be scored.
   
     Level 7:   494 nodes to be scored.
   
     Level 6:   449 nodes to be scored.
   
     Level 5:   312 nodes to be scored.
   
     Level 4:   162 nodes to be scored.
   
     Level 3:   72 nodes to be scored.
   
     Level 2:   16 nodes to be scored.
   
     Level 1:   1 nodes to be scored.
   Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
   ℹ Please use `linewidth` instead.
   This warning is displayed once every 8 hours.
   Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
   generated.
   Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
   ℹ Please use the `linewidth` argument instead.
   This warning is displayed once every 8 hours.
   Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
   generated.
   >> preparing features information...      2024-11-29 01:51:02 PM 
   >> identifying nearest features...        2024-11-29 01:51:02 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:51:02 PM 
   >> assigning genomic annotation...        2024-11-29 01:51:02 PM 
   >> adding gene annotation...          2024-11-29 01:51:03 PM
   'select()' returned 1:1 mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:51:03 PM 
   >> done...                    2024-11-29 01:51:03 PM 
   >> preparing features information...      2024-11-29 01:51:04 PM 
   >> identifying nearest features...        2024-11-29 01:51:04 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:51:04 PM 
   >> assigning genomic annotation...        2024-11-29 01:51:04 PM 
   >> adding gene annotation...          2024-11-29 01:51:05 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:51:05 PM 
   >> done...                    2024-11-29 01:51:05 PM
   'select()' returned 1:1 mapping between keys and columns
   'select()' returned 1:many mapping between keys and columns
   
   Building most specific GOs .....
    ( 3072 GO terms found. )
   
   Build GO DAG topology ..........
    ( 5653 GO terms and 12291 relations. )
   
   Annotating nodes ...............
    ( 2339 genes annotated to the GO terms. )
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 2911 nontrivial nodes
         parameters: 
             test statistic: Fisher test
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 5653 nontrivial nodes
         parameters: 
             test statistic: KS tests
             score order: increasing
   
             -- Weight Algorithm -- 
   
         The algorithm is scoring 2911 nontrivial nodes
         parameters: 
             test statistic: Fisher test : ratio
   
     Level 17:  1 nodes to be scored.
   
     Level 16:  7 nodes to be scored.
   
     Level 15:  17 nodes to be scored.
   
     Level 14:  25 nodes to be scored.
   
     Level 13:  58 nodes to be scored.
   
     Level 12:  102 nodes to be scored.
   
     Level 11:  193 nodes to be scored.
   
     Level 10:  272 nodes to be scored.
   
     Level 9:   384 nodes to be scored.
   
     Level 8:   408 nodes to be scored.
   
     Level 7:   467 nodes to be scored.
   
     Level 6:   425 nodes to be scored.
   
     Level 5:   297 nodes to be scored.
   
     Level 4:   164 nodes to be scored.
   
     Level 3:   74 nodes to be scored.
   
     Level 2:   16 nodes to be scored.
   
     Level 1:   1 nodes to be scored.

   >> preparing features information...      2024-11-29 01:52:18 PM 
   >> identifying nearest features...        2024-11-29 01:52:18 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:52:19 PM 
   >> assigning genomic annotation...        2024-11-29 01:52:19 PM 
   >> adding gene annotation...          2024-11-29 01:52:20 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:52:20 PM 
   >> done...                    2024-11-29 01:52:20 PM 
   >> preparing features information...      2024-11-29 01:52:20 PM 
   >> identifying nearest features...        2024-11-29 01:52:20 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:52:21 PM 
   >> assigning genomic annotation...        2024-11-29 01:52:21 PM 
   >> adding gene annotation...          2024-11-29 01:52:22 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:52:22 PM 
   >> done...                    2024-11-29 01:52:22 PM
   'select()' returned 1:many mapping between keys and columns
   'select()' returned 1:many mapping between keys and columns
   
   Building most specific GOs .....
    ( 3547 GO terms found. )
   
   Build GO DAG topology ..........
    ( 6132 GO terms and 13320 relations. )
   
   Annotating nodes ...............
    ( 3147 genes annotated to the GO terms. )
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 3047 nontrivial nodes
         parameters: 
             test statistic: Fisher test
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 6132 nontrivial nodes
         parameters: 
             test statistic: KS tests
             score order: increasing
   
             -- Weight Algorithm -- 
   
         The algorithm is scoring 3047 nontrivial nodes
         parameters: 
             test statistic: Fisher test : ratio
   
     Level 17:  2 nodes to be scored.
   
     Level 16:  11 nodes to be scored.
   
     Level 15:  20 nodes to be scored.
   
     Level 14:  44 nodes to be scored.
   
     Level 13:  60 nodes to be scored.
   
     Level 12:  108 nodes to be scored.
   
     Level 11:  202 nodes to be scored.
   
     Level 10:  290 nodes to be scored.
   
     Level 9:   417 nodes to be scored.
   
     Level 8:   431 nodes to be scored.
   
     Level 7:   480 nodes to be scored.
   
     Level 6:   422 nodes to be scored.
   
     Level 5:   306 nodes to be scored.
   
     Level 4:   165 nodes to be scored.
   
     Level 3:   71 nodes to be scored.
   
     Level 2:   17 nodes to be scored.
   
     Level 1:   1 nodes to be scored.

   >> preparing features information...      2024-11-29 01:53:49 PM 
   >> identifying nearest features...        2024-11-29 01:53:49 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:53:49 PM 
   >> assigning genomic annotation...        2024-11-29 01:53:49 PM 
   >> adding gene annotation...          2024-11-29 01:53:50 PM
   'select()' returned 1:1 mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:53:50 PM 
   >> done...                    2024-11-29 01:53:50 PM 
   >> preparing features information...      2024-11-29 01:53:51 PM 
   >> identifying nearest features...        2024-11-29 01:53:51 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:53:52 PM 
   >> assigning genomic annotation...        2024-11-29 01:53:52 PM 
   >> adding gene annotation...          2024-11-29 01:53:53 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:53:53 PM 
   >> done...                    2024-11-29 01:53:53 PM
   'select()' returned 1:1 mapping between keys and columns
   'select()' returned 1:many mapping between keys and columns
   
   Building most specific GOs .....
    ( 3712 GO terms found. )
   
   Build GO DAG topology ..........
    ( 6316 GO terms and 13779 relations. )
   
   Annotating nodes ...............
    ( 3568 genes annotated to the GO terms. )
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 3142 nontrivial nodes
         parameters: 
             test statistic: Fisher test
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 6316 nontrivial nodes
         parameters: 
             test statistic: KS tests
             score order: increasing
   
             -- Weight Algorithm -- 
   
         The algorithm is scoring 3142 nontrivial nodes
         parameters: 
             test statistic: Fisher test : ratio
   
     Level 18:  2 nodes to be scored.
   
     Level 17:  4 nodes to be scored.
   
     Level 16:  9 nodes to be scored.
   
     Level 15:  20 nodes to be scored.
   
     Level 14:  40 nodes to be scored.
   
     Level 13:  58 nodes to be scored.
   
     Level 12:  108 nodes to be scored.
   
     Level 11:  202 nodes to be scored.
   
     Level 10:  291 nodes to be scored.
   
     Level 9:   418 nodes to be scored.
   
     Level 8:   466 nodes to be scored.
   
     Level 7:   523 nodes to be scored.
   
     Level 6:   445 nodes to be scored.
   
     Level 5:   303 nodes to be scored.
   
     Level 4:   164 nodes to be scored.
   
     Level 3:   72 nodes to be scored.
   
     Level 2:   16 nodes to be scored.
   
     Level 1:   1 nodes to be scored.

   >> preparing features information...      2024-11-29 01:55:21 PM 
   >> identifying nearest features...        2024-11-29 01:55:21 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:55:21 PM 
   >> assigning genomic annotation...        2024-11-29 01:55:21 PM 
   >> adding gene annotation...          2024-11-29 01:55:22 PM
   'select()' returned 1:1 mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:55:22 PM 
   >> done...                    2024-11-29 01:55:22 PM 
   >> preparing features information...      2024-11-29 01:55:22 PM 
   >> identifying nearest features...        2024-11-29 01:55:22 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:55:23 PM 
   >> assigning genomic annotation...        2024-11-29 01:55:23 PM 
   >> adding gene annotation...          2024-11-29 01:55:24 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:55:24 PM 
   >> done...                    2024-11-29 01:55:24 PM
   'select()' returned 1:1 mapping between keys and columns
   'select()' returned 1:many mapping between keys and columns
   
   Building most specific GOs .....
    ( 3159 GO terms found. )
   
   Build GO DAG topology ..........
    ( 5697 GO terms and 12408 relations. )
   
   Annotating nodes ...............
    ( 2450 genes annotated to the GO terms. )
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 2594 nontrivial nodes
         parameters: 
             test statistic: Fisher test
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 5697 nontrivial nodes
         parameters: 
             test statistic: KS tests
             score order: increasing
   
             -- Weight Algorithm -- 
   
         The algorithm is scoring 2594 nontrivial nodes
         parameters: 
             test statistic: Fisher test : ratio
   
     Level 18:  1 nodes to be scored.
   
     Level 17:  2 nodes to be scored.
   
     Level 16:  5 nodes to be scored.
   
     Level 15:  17 nodes to be scored.
   
     Level 14:  35 nodes to be scored.
   
     Level 13:  55 nodes to be scored.
   
     Level 12:  87 nodes to be scored.
   
     Level 11:  153 nodes to be scored.
   
     Level 10:  240 nodes to be scored.
   
     Level 9:   341 nodes to be scored.
   
     Level 8:   369 nodes to be scored.
   
     Level 7:   409 nodes to be scored.
   
     Level 6:   374 nodes to be scored.
   
     Level 5:   275 nodes to be scored.
   
     Level 4:   150 nodes to be scored.
   
     Level 3:   66 nodes to be scored.
   
     Level 2:   14 nodes to be scored.
   
     Level 1:   1 nodes to be scored.

   >> preparing features information...      2024-11-29 01:56:38 PM 
   >> identifying nearest features...        2024-11-29 01:56:38 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:56:38 PM 
   >> assigning genomic annotation...        2024-11-29 01:56:38 PM 
   >> adding gene annotation...          2024-11-29 01:56:39 PM
   'select()' returned 1:1 mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:56:39 PM 
   >> done...                    2024-11-29 01:56:39 PM 
   >> preparing features information...      2024-11-29 01:56:39 PM 
   >> identifying nearest features...        2024-11-29 01:56:39 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:56:40 PM 
   >> assigning genomic annotation...        2024-11-29 01:56:40 PM 
   >> adding gene annotation...          2024-11-29 01:56:40 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:56:41 PM 
   >> done...                    2024-11-29 01:56:41 PM
   'select()' returned 1:1 mapping between keys and columns
   'select()' returned 1:many mapping between keys and columns
   
   Building most specific GOs .....
    ( 3202 GO terms found. )
   
   Build GO DAG topology ..........
    ( 5756 GO terms and 12564 relations. )
   
   Annotating nodes ...............
    ( 2358 genes annotated to the GO terms. )
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 2484 nontrivial nodes
         parameters: 
             test statistic: Fisher test
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 5756 nontrivial nodes
         parameters: 
             test statistic: KS tests
             score order: increasing
   
             -- Weight Algorithm -- 
   
         The algorithm is scoring 2484 nontrivial nodes
         parameters: 
             test statistic: Fisher test : ratio
   
     Level 18:  1 nodes to be scored.
   
     Level 17:  3 nodes to be scored.
   
     Level 16:  7 nodes to be scored.
   
     Level 15:  14 nodes to be scored.
   
     Level 14:  24 nodes to be scored.
   
     Level 13:  42 nodes to be scored.
   
     Level 12:  71 nodes to be scored.
   
     Level 11:  131 nodes to be scored.
   
     Level 10:  223 nodes to be scored.
   
     Level 9:   327 nodes to be scored.
   
     Level 8:   365 nodes to be scored.
   
     Level 7:   408 nodes to be scored.
   
     Level 6:   370 nodes to be scored.
   
     Level 5:   267 nodes to be scored.
   
     Level 4:   144 nodes to be scored.
   
     Level 3:   70 nodes to be scored.
   
     Level 2:   16 nodes to be scored.
   
     Level 1:   1 nodes to be scored.

   >> preparing features information...      2024-11-29 01:57:55 PM 
   >> identifying nearest features...        2024-11-29 01:57:55 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:57:55 PM 
   >> assigning genomic annotation...        2024-11-29 01:57:55 PM 
   >> adding gene annotation...          2024-11-29 01:57:56 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:57:56 PM 
   >> done...                    2024-11-29 01:57:56 PM 
   >> preparing features information...      2024-11-29 01:58:01 PM 
   >> identifying nearest features...        2024-11-29 01:58:01 PM 
   >> calculating distance from peak to TSS...   2024-11-29 01:58:02 PM 
   >> assigning genomic annotation...        2024-11-29 01:58:02 PM 
   >> adding gene annotation...          2024-11-29 01:58:04 PM
   'select()' returned 1:many mapping between keys and columns
   >> assigning chromosome lengths           2024-11-29 01:58:05 PM 
   >> done...                    2024-11-29 01:58:05 PM
   'select()' returned 1:many mapping between keys and columns
   'select()' returned 1:many mapping between keys and columns
   
   Building most specific GOs .....
    ( 4188 GO terms found. )
   
   Build GO DAG topology ..........
    ( 6851 GO terms and 14972 relations. )
   
   Annotating nodes ...............
    ( 5165 genes annotated to the GO terms. )
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 4850 nontrivial nodes
         parameters: 
             test statistic: Fisher test
   
             -- Classic Algorithm -- 
   
         the algorithm is scoring 6851 nontrivial nodes
         parameters: 
             test statistic: KS tests
             score order: increasing
   
             -- Weight Algorithm -- 
   
         The algorithm is scoring 4850 nontrivial nodes
         parameters: 
             test statistic: Fisher test : ratio
   
     Level 18:  2 nodes to be scored.
   
     Level 17:  6 nodes to be scored.
   
     Level 16:  15 nodes to be scored.
   
     Level 15:  34 nodes to be scored.
   
     Level 14:  73 nodes to be scored.
   
     Level 13:  121 nodes to be scored.
   
     Level 12:  221 nodes to be scored.
   
     Level 11:  388 nodes to be scored.
   
     Level 10:  552 nodes to be scored.
   
     Level 9:   700 nodes to be scored.
   
     Level 8:   714 nodes to be scored.
   
     Level 7:   733 nodes to be scored.
   
     Level 6:   598 nodes to be scored.
   
     Level 5:   387 nodes to be scored.
   
     Level 4:   204 nodes to be scored.
   
     Level 3:   84 nodes to be scored.
   
     Level 2:   17 nodes to be scored.
   
     Level 1:   1 nodes to be scored.

7 Figure S3G - PhyloP enrichment

phyloP = import.bw(config$data$genome$dm6$phyloP124)
peaks_cht = unique(data.frame("chr"=cht$TEST.SNP.CHROM, "start"=cht$REGION.START, "end"=cht$REGION.END, "seqinfo"=cht$peak_id, "signif"=cht$signif_strongAI, "cond"=cht$cond))
peaks_cht_GRange = GRanges(peaks_cht)
ov = findOverlaps(peaks_cht_GRange, phyloP)
cht_peaks_phylo =  cbind(as.data.frame(peaks_cht_GRange)[as.data.frame(ov)$queryHits, ], as.data.frame(phyloP)[as.data.frame(ov)$subjectHits, ])
colnames(cht_peaks_phylo) = c("peak_chr", "peaks_start", "peak_end", "peaks_width", "peak_strand", "seqinfo",  "signif", "cond", "seqnames", "start",  "end", "width", "strand", "score")

cht_peaks_phylo_mean = cht_peaks_phylo %>%
  dplyr::mutate(weighted_phylop = score * width) %>%
  dplyr::select(seqinfo, score, weighted_phylop, peaks_width, signif, cond) %>%
  dplyr::group_by(seqinfo, signif, cond) %>%
  dplyr::summarize(phylop = sum(weighted_phylop) / mean(peaks_width))
   `summarise()` has grouped output by 'seqinfo', 'signif'. You can override using
   the `.groups` argument.
cht_peaks_phylo_mean$signif = factor(gsub(TRUE, "AI peak", gsub(FALSE, "no AI", cht_peaks_phylo_mean$signif)), levels=c("AI peak", "no AI"))
cht_peaks_phylo_mean$cond = factor(cht_peaks_phylo_mean$cond, levels=c("twi/24", "ctcf/68", "mef2/68", "mef2/1012", "bin/68",  "bin/1012"))

Summary_data = cht_peaks_phylo_mean %>%  group_by(signif) %>% summarise(n=n()) 

p = ggplot(cht_peaks_phylo_mean, aes(x=signif, y=phylop, fill=signif)) +
    geom_violin() +
    geom_boxplot(width=0.1, fill="white", alpha=0.5) +
    scale_fill_manual(values = c("orange2", "grey40")) +
    stat_compare_means(comparisons = list(c("AI peak", "no AI")), label.y=5) +
    geom_text(data=Summary_data ,aes(x = signif, y = -0.6, label=n),color="grey25", fontface = 2, size = 3.5) +
    ylim(-1,6) +
    xlab("AI Peaks vs non AI peaks") +
    ylab("PhyloP average score on peak") +
    theme_bw() + 
    theme(legend.position="none") +
    theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 9)) +
    theme(axis.title = element_text(size = 9), plot.title = element_text(size=9)) +
    theme(strip.text = element_text(size=9)) +
    theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5)) 
#     facet_wrap(~cond)+

print(p)
   Warning: Removed 101 rows containing non-finite values (`stat_ydensity()`).
   Warning: Removed 101 rows containing non-finite values (`stat_boxplot()`).
   Warning: Removed 101 rows containing non-finite values (`stat_signif()`).

ggsave(file.path(outdir_fig_suppl, "FigS3G_phyloP_conservation_sign_peaks.pdf"), p, width = 6, height = 6)
   Warning: Removed 101 rows containing non-finite values (`stat_ydensity()`).
   Warning: Removed 101 rows containing non-finite values (`stat_boxplot()`).
   Warning: Removed 101 rows containing non-finite values (`stat_signif()`).
---
title: "Figure_S3"
output:
   BiocStyle::html_document:
      toc: true
      df_print: paged
      self_contained: true
      code_download: true
      highlight: tango
#bibliography: knn_ml_intro.bib
editor_options: 
  chunk_output_type: inline
---

```{r style, echo=FALSE, results="asis"}
library("knitr")
options(digits = 2, width = 80)
options(bitmapType = 'cairo')
golden_ratio <- (1 + sqrt(5)) / 2
opts_chunk$set(echo = TRUE, tidy = FALSE, include = TRUE, cache = FALSE,
               dev=c('png', 'pdf'), comment = '  ', dpi = 300)

options(stringsAsFactors = FALSE)
knitr::opts_chunk$set(cache=FALSE)
options(digits = 5)         
```

# Setup and data

```{r}
source("../utils/utils.R")
config = load_config()

# load CHT results
cht_full = lapply(ab_tp_list, function(ab_tp) load_cht_results(ab_tp, remove_chr = F)) %>% bind_rows()
cht = cht_full %>% filter(!TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))
cht_sign = cht %>% filter(signif_strongAI) 

# genes and promoters
genes = load_genes()
promoters = resize(genes, width = 1000, fix = "start")

# combined motif set (all TFs, peaks + alleles)
fimo = get_full_motif_sets(cht, ab_tp_list)
# only alleles
fimo_alleles  = lapply(ab_tp_list, function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)) %>% bind_rows() 

```

# Figure S3B: Indels length

```{r}
cht_sel = cht %>%
  filter(signif_strongAI) %>%
  group_by(peak_id) %>%
  #mutate(min_pval = min(P.VALUE)) %>%
  #filter(P.VALUE == min_pval) %>%
  mutate(max_indel = max(indel_length), min_indel = min(indel_length)) %>%
  filter(indel_length == max_indel) %>%
#  mutate(max_AI = max(AI_abs)) %>%
#  filter(AI_abs == max_AI) %>%
  select(condition, peak_id, indel_length, AI_abs) %>%
  unique() %>%
  mutate(bin_indel = cut(indel_length, breaks = c(0, 1, 10, Inf), labels = c("SNP", "2-10", ">10"), include.lowest = T))


cht_sel$label = factor(ab_tp_labels[cht_sel$condition], levels = ab_tp_labels)


p = ggplot(cht_sel, aes(x = bin_indel, y = AI_abs, fill = bin_indel)) + 
  geom_boxplot(width = 0.6, outlier.size = 0.1) +
  scale_fill_manual(values = cbPalette, name = "Indel length") +
  scale_y_continuous(trans = "log2", limits = c(0.1, 0.7)) +
  stat_compare_means(comparisons = list(c("SNP", ">10"), c("SNP", "2-10"))) +
  stat_compare_means() +
  xlab("") +
  ylab("Allele imbalance") +
  theme_bw() +
  facet_grid(~label) +
  stat_summary(fun.data = function(x) c(y = median(x) + 0.1, label = length(x)), geom = "text", size = 4) +
  theme(axis.text.x = element_blank(), 
        #axis.text.x = element_text(size=16, angle = 45, hjust = 1, color = TFcols), 
        axis.text.y = element_text(size=16), 
        axis.title.x = element_text(size=18), axis.title.y = element_text(size=16),
        strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14),
        legend.text=element_text(size=16), legend.title=element_text(size=16))


cht_sel %>% 
  group_by(condition) %>%
  mutate(N = n()) %>%
  group_by(condition, bin_indel) %>%
  dplyr::summarize(n = n(), share = n / mean(N), res_share = 1 - share)

p

outf = file.path(outdir_fig_suppl, paste0("FigS3B_indels_AI.pdf"))
ggsave(outf, p, width = 18, height = 5)


```


# Figure S3C: Indels length

```{r}

cht = lapply(ab_tp_list, function(ab_tp) load_cht_results(ab_tp)) %>% bind_rows()
cht_gr = cht %>% mutate(chr = TEST.SNP.CHROM, start = TEST.SNP.POS, end = TEST.SNP.POS) %>% GRanges()

# F1 ATAC-seq results

# coordinates of ATAC peaks
atac_regions_path = "/g/furlong/project/68_F1_cisreg_ichip/data/F1_paper_multiom/ATAC_feature_location_dm6.bed"
atac_regions = import(atac_regions_path, format = "bed")

# imbalance info for ATAC peaks
atac_ai_path = "/g/furlong/project/68_F1_cisreg_ichip/data/F1_paper_multiom/ATAC_all_peaks_atac_X.txt"
atac_ai = read.delim(atac_ai_path)

# additional annotations for allele imbalance
atac_ai_sum = atac_ai %>% 
  mutate(AI = padj < 0.01 & abs(0.5 - meanprop) > 0.1) %>%
  group_by(feature, time) %>% 
  summarize(n_AI = sum(AI), 
            AI_peak = any(AI),
            mean_AI = abs(0.5 - mean(meanprop)), 
            max_AI = max(abs(0.5 - meanprop)),
            n_het_lines =n())

# combine AII info with peak coordinates
atac_f1_df = merge(data.frame(atac_regions), atac_ai_sum, by.x = "name", by.y = "feature") %>% filter(!seqnames %in% c("chrX", "chrY"))
atac_ai_gr = GRanges(atac_f1_df)


# Do the analysis per TF and ATAC time-point


df = data.frame()

for (ab_tp in ab_tp_list) {
  
  tp = gsub("-", "", timepoints[ab_tp])
  
  # get TF peaks
  AI_peaks = get_peaks_from_cht(ab_tp, cht)
  names(mcols(AI_peaks)) = paste0(names(mcols(AI_peaks)), ".tf")
  
  # get ATAC peaks for selected time-point
  AI_atac = atac_ai_gr[atac_ai_gr$time == tp]
  names(mcols(AI_atac)) = paste0(names(mcols(AI_atac)), ".atac")
  
  ov = findOverlaps(AI_peaks, AI_atac)
  
  
  res = cbind.data.frame(mcols(AI_peaks[queryHits(ov)]), mcols(AI_atac[subjectHits(ov)]))
  
  print(ab_tp)
  wt = wilcox.test(mean_AI.atac ~ AI_peak.tf, res, alternative = "less")
  print(wt)
  wt = wilcox.test(max_AI.atac ~ AI_peak.tf, res, alternative = "less")
  print(wt)
  
  df = rbind.data.frame(df, res)
  
}

df %<>% 
  mutate(label = factor(ab_tp_labels[condition.tf], levels = ab_tp_labels))

df %>% group_by(condition.tf, AI_peak.tf) %>%
  summarize(mean(mean_AI.atac, mean(max_AI.atac)))

p = ggplot(df, aes(x = AI_peak.tf,  y = max_AI.atac)) +
  geom_violin(fill = "darkblue", alpha = 0.3) +
  geom_boxplot(width = 0.4, outlier.size = 0.1, fill = "darkblue", alpha = 0.7) +
  facet_wrap(~ label, ncol = 3) +
  theme_bw() +
  stat_compare_means() +
  xlab("AI TF peaks (F1)") +
  ylab("Allele imbalance of ATAC peaks (F1)") +
  theme(axis.text.y = element_text(size=12), axis.text.x = element_text(size=12), 
        axis.title.x = element_text(size=16), axis.title.y = element_text(size=16),
        strip.text.x = element_text(size = 14), strip.text.y = element_text(size = 14))

print(p)
ggsave(file.path(outdir_fig_suppl, "FigS3C_AI_ATAC_by_TF.pdf"), p, width = 9, height = 6)

```




# Figure S3D

```{r}

# only quantified peaks for CHT
#peaks = lapply(ab_tp_list, function(ab_tp) get_peaks_from_cht(ab_tp, cht, as_granges = T))
#names(peaks) = ab_tp_list

peaks = get_consensus_peaksets_with_AI(ab_tp_list, cht, filter = F)

dhs = load_dhs() %>% GRanges()
dhs$DHScond = cut(dhs$num_conditions, breaks = c(1, 2, 17, 19), include.lowest = T)
dhs$DHS_modERN = cut(dhs$num_modERN,breaks = c(0, 9, 251), include.lowest = T)


res = lapply(c("distal", "proximal"), function(TSS_type) {

  dhs_gr = dhs %>% as.data.frame() %>% filter(TSS == TSS_type) %>% GRanges()
  
  # DHS conditions
  df = lapply(peaks, function(x) {ov = findOverlaps(x, dhs_gr);
                                  cbind.data.frame(as.data.frame(x[queryHits(ov)]), as.data.frame(dhs_gr[subjectHits(ov)])) }) %>%
            bind_rows()
  
  fts1 = lapply(ab_tp_list, function(ab_tp) {fisher_test_two_groups(df, ab_tp,
                                                                    group1 = "isAI", group1_val = c(TRUE, FALSE), 
                                                                    group2 = "DHScond", group2_val = c("[1,2]", "(17,19]"))}) %>%
            bind_rows() %>% mutate(comp = "DHS conditions\n(1-2 vs. 18-19)", labels = ab_tp_labels)
  
  # modERN TFs
  fts2 = lapply(ab_tp_list, function(ab_tp) {fisher_test_two_groups(df, ab_tp,
                                                                    group1 = "isAI", group1_val = c(TRUE, FALSE), 
                                                                    group2 = "DHS_modERN", group2_val = c("(9,251]", "[0,9]"))}) %>%
            bind_rows() %>% mutate(comp = "modERN TFs\n(31-251 vs. 0-5)", labels = ab_tp_labels)
  
  
  res = rbind.data.frame(fts1, fts2) %>% mutate(TSS_type = paste("TSS", TSS_type)) 
  
  
}) %>% bind_rows()


# ModERN TFs

df = res %>% filter(comp2 == "DHS_modERN_(9,251]_vs_[0,9]") 
df$label = ab_tp_labels
df$label = factor(df$label, levels = ab_tp_labels)

p = ggplot(df, aes(x = label, y = odds_ratio)) +
  facet_wrap(~TSS_type) +
  geom_hline(yintercept = 1, color = "darkred") +
  geom_bar(aes(fill = -log10(pval)), color = "darkblue", stat = "identity", position = "dodge", width = 0.5) +
  #scale_fill_manual(name = "", values = c("darkblue", "darkgrey"), labels = c("AI peaks", "non-AI peaks")) +
  geom_text(aes(label = round(r1, 2), x = label, y = odds_ratio + 0.05), data = df, size = 6) +
  ylab("Depletioin in ubiquitously bound regions (>10 modERN TFs) \nFisher's Test Odds Ratio") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1, colour = TFcols),
        axis.text.y = element_text(size = 12), 
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 14),
        legend.text = element_text(size=14),
        legend.title = element_text(size=14))


p

outf = file.path(outdir_fig_suppl, paste0("FigS3D_modERN_fisher.pdf"))
ggsave(outf, p, width = 10, height = 6)

```



# Figure S3E


```{r}
df = res %>% filter(comp2 == "DHScond_[1,2]_vs_(17,19]") 
df$label = ab_tp_labels
df$label = factor(df$label, levels = ab_tp_labels)



p = ggplot(df, aes(x = label, y = odds_ratio)) +
  facet_wrap(~TSS_type) +
  geom_hline(yintercept = 1, color = "darkred") +
  geom_bar(aes(fill = -log10(pval)), color = "darkblue", stat = "identity", position = "dodge", width = 0.5) +
  #scale_fill_manual(name = "", values = c("darkblue", "darkgrey"), labels = c("AI peaks", "non-AI peaks")) +
  geom_text(aes(label = round(r1, 2), x = label, y = odds_ratio + 0.07), data = df, size = 6) +
  ylab("Enrichment in condition-specific DHS \nFisher's Test Odds Ratio") +
  theme_bw() +
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1, colour = TFcols),
        axis.text.y = element_text(size = 12), 
        axis.title.x = element_blank(),
        axis.title.y = element_text(size = 16),
        legend.text = element_text(size=14),
        legend.title = element_text(size=14))


p

outf = file.path(outdir_fig_suppl, paste0("FigS3E_nonubiqDHS_fisher.pdf"))
ggsave(outf, p, width = 10, height = 6)



```



# Figure S3F - GO Enrichment for AI peaks

```{r}
for (cond in c(ab_tp_list, list(ab_tp_list))) {
  
    test_geneID = get_test_geneID(cht, cond, 1000)
    background_geneID = get_background_geneID(cht, cond, 1000, test_geneID)

    out = create_GOdata(test_geneID, background_geneID, "BP")
    GOdata = out[[1]]
    test = out[[2]]
    n_nodes = length(attributes(GOdata)$graph@nodes)

    # Classic Fisher test
    test.stat <- new("classicCount", testStatistic = GOFisherTest, name = "Fisher test")
    resultFisher <- getSigGroups(GOdata, test.stat)

    # KS test
    test.stat <- new("classicScore", testStatistic = GOKSTest, name = "KS tests")
    resultKS <- getSigGroups(GOdata, test.stat)

    # Weight algorithm
    test.stat <- new("weightCount", testStatistic = GOFisherTest, name = "Fisher test", sigRatio = "ratio")
    resultWeight <- getSigGroups(GOdata, test.stat)
    
    allRes <- GenTable(GOdata, classic = resultFisher, 
           KS = resultKS, weight = resultWeight,
           orderBy = "weight", ranksOf = "classic", topNodes = n_nodes)
    allRes$Fold_Enrichment = allRes$Significant / allRes$Expected
    allRes$FDR = allRes$weight
    
    top_results = select_top_GO(allRes, 500, 10, 0.05, 10) 

    p = ggplot_GO_enrichment(top_results, nrow(as.data.frame(test)), length(as.data.frame(test)[as.data.frame(test)[,1] == 1, ]), cond)
    print(p)
    out_filename = paste0("FigS3F_GO_enrichment_AI_peaks_", gsub("/", "_", paste(cond, collapse = '_')) ,".pdf")
    ggsave(file.path(outdir_fig_suppl, out_filename), p, width = 6, height = 6)
}
```


# Figure S3G - PhyloP enrichment 


```{r}

phyloP = import.bw(config$data$genome$dm6$phyloP124)
peaks_cht = unique(data.frame("chr"=cht$TEST.SNP.CHROM, "start"=cht$REGION.START, "end"=cht$REGION.END, "seqinfo"=cht$peak_id, "signif"=cht$signif_strongAI, "cond"=cht$cond))
peaks_cht_GRange = GRanges(peaks_cht)
ov = findOverlaps(peaks_cht_GRange, phyloP)
cht_peaks_phylo =  cbind(as.data.frame(peaks_cht_GRange)[as.data.frame(ov)$queryHits, ], as.data.frame(phyloP)[as.data.frame(ov)$subjectHits, ])
colnames(cht_peaks_phylo) = c("peak_chr", "peaks_start", "peak_end", "peaks_width", "peak_strand", "seqinfo",  "signif", "cond", "seqnames", "start",  "end", "width", "strand", "score")

cht_peaks_phylo_mean = cht_peaks_phylo %>%
  dplyr::mutate(weighted_phylop = score * width) %>%
  dplyr::select(seqinfo, score, weighted_phylop, peaks_width, signif, cond) %>%
  dplyr::group_by(seqinfo, signif, cond) %>%
  dplyr::summarize(phylop = sum(weighted_phylop) / mean(peaks_width))
              

cht_peaks_phylo_mean$signif = factor(gsub(TRUE, "AI peak", gsub(FALSE, "no AI", cht_peaks_phylo_mean$signif)), levels=c("AI peak", "no AI"))
cht_peaks_phylo_mean$cond = factor(cht_peaks_phylo_mean$cond, levels=c("twi/24", "ctcf/68", "mef2/68", "mef2/1012", "bin/68",  "bin/1012"))

Summary_data = cht_peaks_phylo_mean %>%  group_by(signif) %>% summarise(n=n()) 

p = ggplot(cht_peaks_phylo_mean, aes(x=signif, y=phylop, fill=signif)) +
    geom_violin() +
    geom_boxplot(width=0.1, fill="white", alpha=0.5) +
    scale_fill_manual(values = c("orange2", "grey40")) +
    stat_compare_means(comparisons = list(c("AI peak", "no AI")), label.y=5) +
    geom_text(data=Summary_data ,aes(x = signif, y = -0.6, label=n),color="grey25", fontface = 2, size = 3.5) +
    ylim(-1,6) +
    xlab("AI Peaks vs non AI peaks") +
    ylab("PhyloP average score on peak") +
    theme_bw() + 
    theme(legend.position="none") +
    theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 9)) +
    theme(axis.title = element_text(size = 9), plot.title = element_text(size=9)) +
    theme(strip.text = element_text(size=9)) +
    theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5)) 
#     facet_wrap(~cond)+

print(p)
ggsave(file.path(outdir_fig_suppl, "FigS3G_phyloP_conservation_sign_peaks.pdf"), p, width = 6, height = 6)
```






