source("../utils/utils.R")
config = load_config()
# load CHT results
cht_full = lapply(ab_tp_list, function(ab_tp) load_cht_results(ab_tp, remove_chr = F)) %>% bind_rows()
cht = cht_full %>% filter(!TEST.SNP.CHROM %in% c("chrX", "chrY", "chrM"))
cht_sign = cht %>% filter(signif_strongAI)
# genes and promoters
genes = load_genes()
promoters = resize(genes, width = 1000, fix = "start")
# combined motif set (all TFs, peaks + alleles)
fimo = get_full_motif_sets(cht, ab_tp_list)
# only alleles
fimo_alleles = lapply(ab_tp_list, function(ab_tp) parse_motifs_in_two_alleles(ab_tp, cht)) %>% bind_rows()
Maximum percentage of passed MAPQ filters reads that overlap an INDEL: 28.07% minimum: 13.54%
Maximum percentage of mappability filtered reads by the new WASP pipeline: 15.52% minimum: 5.19%
vcf = read.table(paste0(config$data$VCF$vcf_dir, "/", config$data$VCF$stringent))[, c("V1", "V2", "V3", "V4", "V5")]
colnames(vcf) = c("chr", "start", "end", "REF", "ALT")
vcf$REF_len = apply(vcf,2,nchar)[,c("REF")]
vcf$ALT_len = apply(vcf,2,nchar)[,c("ALT")]
INDELs = subset(vcf, REF_len != ALT_len)
INDELs$INDEL_len = pmax(INDELs$REF_len, INDELs$ALT_len) - 1
p = ggplot(INDELs, aes(x=INDEL_len)) +
geom_histogram(bins = 26, fill="grey60") +
xlab("Length of INDELs (bp)") +
ylab("Number of INDELs") +
xlim(0,25) +
geom_vline(xintercept=1.5, linewidth=0.5, linetype="dashed", colour="grey30") +
geom_vline(xintercept=5.5, linewidth=0.5, linetype="dashed", colour="grey30") +
geom_vline(xintercept=10.5, linewidth=0.5, linetype="dashed", colour="grey30") +
annotate(x = 18, y=80000, geom = "text", label="number of INDEL by size", size=3) +
annotate(x = 18, y=70000, geom = "text", label=paste0("1bp = ", nrow(subset(INDELs, INDEL_len==1)), " (", round(nrow(subset(INDELs, INDEL_len==1))/nrow(INDELs), 3)*100, "%)"), size=2.5) +
annotate(x = 18, y=60000, geom = "text", label=paste0("2-5bp = ", nrow(subset(INDELs, INDEL_len>=2 & INDEL_len<=5)), " (", round(nrow(subset(INDELs, INDEL_len>=2 & INDEL_len<=5))/nrow(INDELs), 3)*100, "%)"), size=2.5) +
annotate(x = 18, y=50000, geom = "text", label=paste0("6-10bp = ", nrow(subset(INDELs, INDEL_len>=6 & INDEL_len<=10)), " (", round(nrow(subset(INDELs, INDEL_len>=6 & INDEL_len<=10))/nrow(INDELs), 3)*100, "%)"), size=2.5) +
annotate(x = 18, y=40000, geom = "text", label=paste0(">10bp = ", nrow(subset(INDELs, INDEL_len>10)), " (", round(nrow(subset(INDELs, INDEL_len>10))/nrow(INDELs), 3)*100, "%)"), size=2.5) +
theme_bw() +
theme(legend.position="none") +
theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 9)) +
theme(axis.title = element_text(size = 9), plot.title = element_text(size=9)) +
theme(strip.text = element_text(size=9)) +
theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5))
p
filtered_variants = unique(cht[, c("TEST.SNP.ID", "TEST.SNP.REF.ALLELE", "TEST.SNP.ALT.ALLELE")])
colnames(filtered_variants) = c("variant_ID", "REF", "ALT")
filtered_variants$REF_len = apply(filtered_variants,2,nchar)[,c("REF")]
filtered_variants$ALT_len = apply(filtered_variants,2,nchar)[,c("ALT")]
filtered_INDELs = subset(filtered_variants, REF_len != ALT_len)
filtered_INDELs$INDEL_len = pmax(filtered_INDELs$REF_len, filtered_INDELs$ALT_len) - 1
p = ggplot(filtered_INDELs, aes(x=INDEL_len)) +
geom_histogram(bins = 26, fill="grey60") +
xlab("Length of INDELs (bp)") +
ylab("Number of INDELs") +
xlim(0,25) +
geom_vline(xintercept=1.5, linewidth=0.5, linetype="dashed", colour="grey30") +
geom_vline(xintercept=5.5, linewidth=0.5, linetype="dashed", colour="grey30") +
geom_vline(xintercept=10.5, linewidth=0.5, linetype="dashed", colour="grey30") +
annotate(x = 18, y=20000, geom = "text", label="number of INDEL by size", size=3) +
annotate(x = 18, y=18000, geom = "text", label=paste0("1bp = ", nrow(subset(filtered_INDELs, INDEL_len==1)), " (", round(nrow(subset(filtered_INDELs, INDEL_len==1))/nrow(filtered_INDELs), 3)*100, "%)"), size=2.5) +
annotate(x = 18, y=16000, geom = "text", label=paste0("2-5bp = ", nrow(subset(filtered_INDELs, INDEL_len>=2 & INDEL_len<=5)), " (", round(nrow(subset(filtered_INDELs, INDEL_len>=2 & INDEL_len<=5))/nrow(filtered_INDELs), 3)*100, "%)"), size=2.5) +
annotate(x = 18, y=14000, geom = "text", label=paste0("6-10bp = ", nrow(subset(filtered_INDELs, INDEL_len>=6 & INDEL_len<=10)), " (", round(nrow(subset(filtered_INDELs, INDEL_len>=6 & INDEL_len<=10))/nrow(filtered_INDELs), 3)*100, "%)"), size=2.5) +
annotate(x = 18, y=12000, geom = "text", label=paste0(">10bp = ", nrow(subset(filtered_INDELs, INDEL_len>10)), " (", round(nrow(subset(filtered_INDELs, INDEL_len>10))/nrow(filtered_INDELs), 3)*100, "%)"), size=2.5) +
theme_bw() +
theme(legend.position="none") +
theme(panel.grid = element_line(colour = "grey80", linewidth = 1), axis.text = element_text(size = 9)) +
theme(axis.title = element_text(size = 9), plot.title = element_text(size=9)) +
theme(strip.text = element_text(size=9)) +
theme(panel.grid.minor = element_line(linewidth = 0.25), panel.grid.major = element_line(linewidth = 0.5))
p