edited consensus peak script to exclude control samples

a5896961 · Rim Moussa · 1e28aa2d · a5896961 · a5896961 · a5896961
Commit a5896961 authored 3 years ago by Rim Moussa
--- a/src/Snakemake/dev/R/._consensusPeaks.R
+++ b/src/Snakemake/dev/R/._consensusPeaks.R
--- a/src/Snakemake/dev/R/annotatePeaks.R
+++ b/src/Snakemake/dev/R/annotatePeaks.R
+start.time  <-  Sys.time()
+
+source(paste0(snakemake@config$additionalInput$scriptsDir, "/functions.R"))
+
+########################################################################
+# SAVE SNAKEMAKE S4 OBJECT THAT IS PASSED ALONG FOR DEBUGGING PURPOSES #
+########################################################################
+
+# Use the following line to load the Snakemake object to manually rerun this script (e.g., for debugging purposes)
+# Replace {outputFolder} correspondingly.
+# snakemake = readRDS("{outputFolder}/LOGS_AND_BENCHMARKS/1.annotatePeaks.R.rds")
+createDebugFile(snakemake)
+
+
+initFunctionsScript(packagesReq = NULL, minRVersion = "3.1.0", warningsLevel = 1, disableScientificNotation = TRUE)
+#checkAndLoadPackages(c("tidyverse", "futile.logger", "DiffBind", "checkmate", "stats", "stringr", "ChIPseeker"), verbose = FALSE)
+checkAndLoadPackages(c("futile.logger", "checkmate", "stats", "ChIPseeker"), verbose = FALSE)
+
+assertClass(snakemake, "Snakemake")
+assertDirectoryExists(snakemake@config$additionalInput$scriptsDir)
+
+###################
+#### PARAMETERS ###
+###################
+
+par.l = list()
+par.l$allPeakTypes = c("broadPeak", "gappedPeak", "narrowPeak", "stringent", "non-stringent")
+par.l$verbose = TRUE
+par.l$log_minlevel = "INFO"
+
+#####################
+# VERIFY PARAMETERS #
+#####################
+
+## INPUT ##
+assertList(snakemake@input, min.len = 1)
+assertSubset(names(snakemake@input), c("", "individual_stringent", "individual_nonStringent", "individual_encode",
+                                       "consensus_stringent", "consensus_nonStringent", "consensus_encode"))
+fileTypes = c("", "individual_stringent", "individual_nonStringent", "individual_encode",
+              "consensus_stringent", "consensus_nonStringent", "consensus_encode")
+par.l$input_peaks = snakemake@input 
+
+## OUTPUT ##
+assertList(snakemake@output, min.len = 1)
+assertSubset(names(snakemake@output), c("", "annotation_individual_stringent", "annotation_individual_nonStringent", "annotation_individual_encode",
+                                       "annotation_consensus_stringent", "annotation_consensus_nonStringent", "annotation_consensus_encode"))
+
+par.l$output = snakemake@output
+
+## CONFIG ##
+assertList(snakemake@config, min.len = 1)
+
+## PARAMS ##
+assertList(snakemake@params, min.len = 3)
+
+par.l$tabulateOutput = as.logical(snakemake@params$tabulateOutput)
+par.l$tssRegion = as.integer(unlist(strsplit(snakemake@params$tssRegion, ",")))
+par.l$assemblyVersion = snakemake@params$assemblyVersion
+
+## LOG ##
+assertList(snakemake@log, min.len = 1)
+par.l$file_log = snakemake@log[[1]]
+
+######################
+# FINAL PREPARATIONS #
+######################
+startLogger(par.l$file_log, par.l$log_minlevel, removeOldLog = TRUE)
+printParametersLog(par.l)
+
+allDirs = unique(dirname(unlist(par.l$output)))
+testExistanceAndCreateDirectoriesRecursively(allDirs)
+
+
+###################
+# Annotate peaks #
+##################
+
+
+for (file_grp in unique(names(par.l$input_peaks))){
+#   
+#   print("~~~~~~~~")
+#   print(file_grp)
+  if (file_grp == "") {next}
+  
+  # for (inFiles in par.l$input_peaks[file_grp]){
+  #   if (is.null(inFiles)) {next}
+    
+#    for (file in inFiles[[1]]){
+  for (file in unlist(par.l$input_peaks[file_grp])){
+    
+    futile.logger::flog.info(paste0("Peak annotation for the file: ", file))
+    
+    if (readLines(file, n=1) == "# No consensus peaks found"){next}
+    
+        peaks.annotated = suppressMessages(ChIPseeker::annotatePeak(
+          file,
+          tssRegion = par.l$tssRegion,
+          TxDb = .getGenomeObject(par.l$assemblyVersion, type = "txbd"),
+          level = "gene", 
+          assignGenomicAnnotation = TRUE,  # the default
+          genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
+                                        "Downstream", "Intergenic"),  # the default
+          annoDb = .getGenomeObject(par.l$assemblyVersion, type = "packageName"), # optional, if provided, extra columns including SYMBOL, GENENAME, ENSEMBL/ENTREZID will be added
+          sameStrand = FALSE, ignoreOverlap = FALSE, ignoreUpstream = FALSE, ignoreDownstream = FALSE, overlap = "TSS", verbose = TRUE
+            
+        ))
+        outIMG = grep(pattern = paste0(gsub(basename(file), pattern = "\\.bed\\.gz", replacement = ""), ".pdf"), 
+                      x = par.l$output[paste0("annotation_", file_grp)][[1]], value = T)
+        outTbl = grep(pattern = paste0(gsub(basename(file), pattern = "\\.bed\\.gz", replacement = ""), ".csv"),
+                      x = par.l$output[paste0("annotation_", file_grp)][[1]], value = T)
+        
+        pdf(outIMG)
+        print(plotAnnoPie(peaks.annotated, main = "Peak Annotation"))
+        print(plotDistToTSS(peaks.annotated))
+        dev.off()
+        
+        if (par.l$tabulateOutput){
+          write.csv(as.data.frame(peaks.annotated), file = outTbl, row.names = FALSE)
+        }
+#    }
+  }
+  
+}
+
+.printExecutionTime(start.time)
+
+flog.info("Session info: ", sessionInfo(), capture = TRUE)
+
--- a/src/Snakemake/dev/R/consensusPeaks.R
+++ b/src/Snakemake/dev/R/consensusPeaks.R
@@ -43,9 +43,12 @@ assertClass(snakemake, "Snakemake")
 assertList(snakemake@input, min.len = 1)
 assertSubset(names(snakemake@input), c("", "stringent", "nonStringent", "encode"))

-par.l$files_input_stringent = snakemake@input$stringent
-par.l$files_input_nonStringent = snakemake@input$nonStringent
-par.l$files_input_ENCODE = snakemake@input$encode
+par.l$files_input_stringent = setdiff(snakemake@input$stringent, 
+                                      grep(pattern = paste(snakemake@params$controlFiles, collapse = "|"), x = snakemake@input$stringent, value = T))
+par.l$files_input_nonStringent = setdiff(snakemake@input$nonStringent,
+                                         grep(pattern = paste(snakemake@input$nonStringent, collapse = "|"), x = snakemake@input$stringent, value = T))
+par.l$files_input_ENCODE = setdiff(snakemake@input$encode,
+                                   grep(pattern = paste(snakemake@params$controlFiles, collapse = "|"), x = snakemake@input$encode, value = T))
 #par.l$file_input_sampleFile = snakemake@input$sampleFile

 ## OUTPUT ##
@@ -159,6 +162,8 @@ for (GCBiasStrCur in par.l$GCBiasStr) {
  

        sampleMetaData.df = tibble(Peaks = filesInput, PeakCaller = "narrow")
+        # here exclude the control files based on sample name grep
+        # but how will the merged replicated be handled? maybe add an extra parameter for the merged control replicates


        assertIntegerish(max(par.l$minOverlap), upper = nrow(sampleMetaData.df), lower = 1)
@@ -235,12 +240,14 @@ for (GCBiasStrCur in par.l$GCBiasStr) {
            } else {
                write_lines("# No consensus peaks found", outputCur)
            }
-            
+          #break  
        }
        
        
-        
+        #break
    }
+  
+  #break
 }