From f399a0e5820b5963107cafecb937d53ace074e05 Mon Sep 17 00:00:00 2001
From: Christian Arnold <christian.arnold@embl.de>
Date: Thu, 25 Nov 2021 17:08:46 +0100
Subject: [PATCH] Added examples/dev/input to Git

---
 example/dev/input/config.yaml            | 240 +++++++++++
 example/dev/input/execSnakemake.sh       |   1 +
 example/dev/input/params.sh              | 123 ++++++
 example/dev/input/runSnakefile.sh        |  26 ++
 example/dev/input/runSnakemakeWrapper.sh | 509 +++++++++++++++++++++++
 example/dev/input/samples.csv            |   4 +
 6 files changed, 903 insertions(+)
 create mode 100644 example/dev/input/config.yaml
 create mode 100644 example/dev/input/execSnakemake.sh
 create mode 100644 example/dev/input/params.sh
 create mode 100755 example/dev/input/runSnakefile.sh
 create mode 100644 example/dev/input/runSnakemakeWrapper.sh
 create mode 100644 example/dev/input/samples.csv

diff --git a/example/dev/input/config.yaml b/example/dev/input/config.yaml
new file mode 100644
index 0000000..d3d4436
--- /dev/null
+++ b/example/dev/input/config.yaml
@@ -0,0 +1,240 @@
################################################
################################################
# CONFIGURATION FILE FOR THE ATAC-SEQ PIPELINE #
################################################
################################################

# This format allows comments and is therefore easier to work with than the JSON format we used before.
# Quotation marks are optional for strings. Make sure to put ": " (that is, colon space) as the separator.


##################
# SECTION output #
##################
output:

  # STRING. Absolute path to the output directory. Will be created if not yet present.
  outdir: "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/output"

  # BOOLEAN. "true" or "false". Default "true". Should the pipeline use MACS2 to call peaks? If yes, peaks will be called in 3 different flavors (ENCODE, stringent, non-stringent). See the section "par_peakCalling" in the configuration file for details.
  doPeakCalling: true

  # BOOLEAN. "true" or "false". Default "false". Also do peak calling according to ENCODE guidelines (as of 2018), in addition to the "stringent" and "non-stringent" flavors (so far, we have rarely used the ENCODE peaks).
  encodePeaks: false

  # BOOLEAN. "true" or "false". Default "false". Should the pipeline, in addition to treating all input files separately, also merge all replicate files and run all downstream analyses on the merged files? If set to true, for each individual specified in the sample table, all samples belonging to this individual ID will be merged into one file after post-processing. This only makes sense if you have more than 1 replicate per individual.
  alsoMergeReplicates: true

  # BOOLEAN. "true" or "false". Default "false". Should peaks be annotated to find out which genomic regions they overlap with? Still being tested, set to false if it causes errors.
  annotatePeaks: true

  # BOOLEAN. "true" or "false". Default "false". Should GC bias be assessed and corrected for the final BAM files, in addition to the non-corrected ones? If set to true, the GC bias will be assessed and corrected using deepTools. Additionally, all downstream steps of the pipeline will be run for both the GC-corrected and the original files (including peak calling, PCA, coverage, etc.). This greatly increases the running time of the pipeline and essentially doubles the number of files. We recommend setting this to false unless the GC bias is important for downstream analyses.
  # !!! Important: Leave at false for now, I have to fix an issue in the Snakefile that currently causes an error !!!
  correctGCBias: false
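  # For orientation, GC correction in deepTools is a two-step process. A minimal sketch, assuming
  # paired-end data and placeholder file names (the effective genome size shown is the hg19 value
  # used further below in this config):
  #   computeGCBias -b sample.final.bam --effectiveGenomeSize 2750000000 -g ucsc.hg19.2bit --GCbiasFrequenciesFile sample_freq.txt
  #   correctGCBias -b sample.final.bam --effectiveGenomeSize 2750000000 -g ucsc.hg19.2bit --GCbiasFrequenciesFile sample_freq.txt -o sample.gcCorrected.bam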
  # BOOLEAN. "true" or "false". Default "false". Should base qualities be recalibrated using GATK to detect and correct systematic errors in base quality scores? This step can be time-consuming and needs the following other parameters: GATK_jar, knownSNPs, knownIndels. I recommend turning this off for now.
  doBaseRecalibration: false

  # BOOLEAN. "true" or "false". Default "true". Should the pipeline produce coverage files and diagnostic plots? If set to true, coverage files for the final BAM files in bigwig and bedgraph.gz format will be produced, as well as a coverage plot using deepTools.
  generateCoverageFiles: true

  # BOOLEAN. "true" or "false". Default "true". Not implemented yet.
  doIDR: false


###################
# SECTION general #
###################
general:

  # INTEGER. Default 12. Maximum number of cores per rule. For local computation, the minimum of this value and the --cores parameter defines the number of CPUs per rule, while in a cluster setting, the minimum of this value and the number of cores on the node the job runs on is used.
  maxCoresPerRule: 12

###################
# SECTION samples #
###################
samples:

  # STRING. No default. Absolute path to the sample summary file. See section 2.4 for details.
  summaryFile: "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/samples.csv"

  # BOOLEAN. "true" or "false". Default "true". Paired-end data? Single-end ATAC-seq data is not yet supported by this pipeline. If set to "false", the Snakemake pipeline will abort at the beginning.
  pairedEnd: true

  # STRING. "ATACseq" or "ChIPseq". Only data-type-specific steps are executed. Currently, if set to ChIP-seq, ATAC-seq-specific steps like RSS adjustment are not executed.
  dataType: "ATACseq"
  #dataType: "ChIPseq"

###########################
# SECTION additionalInput #
###########################
additionalInput:

  # STRING. Absolute path to the scripts directory. Either "/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/R" or "/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/dev/R"
  scriptsDir: "/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/dev/R"

  # STRING. Absolute path to the adapters file for Trimmomatic in FASTA format. Default "/g/scb2/zaugg/zaugg_shared/Programs/Trimmomatic-0.33/adapters/NexteraPE-PE.fa". There is usually no need to change this unless this adapter file is not suited for your experiment.
  trimmomatic_adapters: "/g/scb2/zaugg/zaugg_shared/Programs/Trimmomatic-0.33/adapters/NexteraPE-PE.fa"

  # STRING. Absolute path to a BED file that contains the genomic regions that should be filtered from the peaks. The default depends on the genome assembly, see the templates for details. Only needed if doPeakCalling is set to true.
  blacklistRegions: "/g/scb2/zaugg/zaugg_shared/annotations/hg19/blacklisted/hg19-blacklist.v2.bed"

  # STRING. Absolute path to a database of known polymorphic sites (SNPs and indels, respectively). This is only needed if (1) doBaseRecalibration is set to true and (2) the genome is either hg19 or hg38; it is ignored otherwise. The default depends on the genome assembly, see the templates for details. Supported formats from GATK: BCF2, BEAGLE, BED, BEDTABLE, EXAMPLEBINARY, GELITEXT, RAWHAPMAP, REFSEQ, SAMPILEUP, SAMREAD, TABLE, VCF, VCF3.
  knownSNPs: "/g/scb2/zaugg/zaugg_shared/annotations/hg19/GATK_bundle/dbsnp_138.hg19.vcf.gz"
  knownIndels: "/g/scb2/zaugg/zaugg_shared/annotations/hg19/GATK_bundle/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz"
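  # For orientation, base recalibration with the GATK3-style jar configured in the "executables"
  # section further below would look roughly like this (a sketch with placeholder file names, not
  # the literal pipeline command):
  #   java -jar GenomeAnalysisTK.jar -T BaseRecalibrator -R ucsc.hg19.onlyRefChr.fasta -I sample.bam \
  #     -knownSites dbsnp_138.hg19.vcf.gz -knownSites Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz -o recal.table
  #   java -jar GenomeAnalysisTK.jar -T PrintReads -R ucsc.hg19.onlyRefChr.fasta -I sample.bam -BQSR recal.table -o sample.recal.bam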
  # STRING. The default depends on the genome assembly, see the templates for details. Absolute path to the reference genome in FASTA and 2bit format, respectively, both of which have to correspond to the same genome assembly version as used for the alignment, as well as to the database of polymorphic sites (knownSNPs and knownIndels, if applicable).
  refGenome_fasta: "/g/scb2/zaugg/zaugg_shared/annotations/hg19/GATK_bundle/ucsc.hg19.onlyRefChr.fasta"
  refGenome_2bit: "/g/scb2/zaugg/zaugg_shared/annotations/hg19/GATK_bundle/ucsc.hg19.2bit"

  # STRING. The default depends on the genome assembly, see the templates for details. Absolute path to a genome annotation file in GTF format.
  annotationGTF: "/g/scb2/zaugg/zaugg_shared/annotations/hg19/Gencode_v19/gencode.v19.annotation.gtf"


#######################
# SECTION executables #
#######################

# If run with Singularity, this section can be ignored

executables:

  # STRING. Default "java". Name of the executable for Java. The Java version must be at least 1.8! Only needed if doBaseRecalibration is set to true.
  java: "java"

  # STRING. Default "/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/tools/GenomeAnalysisTK.jar". Absolute path to a JAR file for the GATK suite. Only needed if doBaseRecalibration is set to true.
  GATK_jar: "/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/tools/GenomeAnalysisTK.jar"

  # STRING. Default "/g/scb2/zaugg/zaugg_shared/Programs/Picardtools/picardOld.jar". Absolute path to a JAR file for the Picard suite.
  PICARD_jar: "/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/tools/picardOld.jar"

########################
# SECTION par_trimming #
########################
par_trimming:

  # STRING. Default "1:30:4:5:true". ILLUMINACLIP value. See the Trimmomatic manual.
  trimmomatic_ILLUMINACLIP: "1:30:4:1:true"

  # INTEGER. Default 3. TRAILING value. See the Trimmomatic manual.
  trimmomatic_trailing: 3

  # INTEGER. Default 20. MINLEN value. See the Trimmomatic manual.
  trimmomatic_minlen: 20

  # STRING. Default "phred33". Phred type. See the Trimmomatic manual. The "-" is added automatically by the Snakemake pipeline.
  #trimmomatic_phredType: "phred33"

#####################
# SECTION par_align #
#####################
par_align:

  # STRING. Default "--very-sensitive". Sensitivity. Leave empty for the default sensitivity. See the bowtie2 manual.
  bowtie2_sensitivity: "--very-sensitive"

  # INTEGER. Default 2000. Value for parameter X. See the bowtie2 manual.
  bowtie2_maxFragmentLength: 2000

  # STRING. Default "/g/scb/zaugg/zaugg_shared/annotations/hg19/referenceGenome/Bowtie2/hg19". Absolute path to the reference genome plus the prefix of the file with the index. In the default case, we use those files within /g/scb/zaugg/zaugg_shared/annotations/hg19/referenceGenome/Bowtie2 that start with hg19. See the bowtie2 manual for more details.
  bowtie2_refGenome: "/g/scb2/zaugg/zaugg_shared/annotations/hg19/referenceGenome/Bowtie2/hg19"

  # STRING. Default "hg19". Reference genome assembly version. Must match the one used by the alignment program.
  assemblyVersion: "hg19"
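  # For orientation, the options above roughly translate into a bowtie2 call like the following
  # (a sketch with placeholder FASTQ names, not the literal pipeline command):
  #   bowtie2 --very-sensitive -X 2000 \
  #     -x /g/scb2/zaugg/zaugg_shared/annotations/hg19/referenceGenome/Bowtie2/hg19 \
  #     -1 sample_1.fastq.gz -2 sample_2.fastq.gz | samtools sort -o sample.bam -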
#########################
# SECTION par_postalign #
#########################
par_postalign:

  # INTEGER. Default 10. Minimum MAPQ score. Reads with a lower MAPQ score will be removed during the post-alignment processing.
  minMAPQscore: 10

  # STRING. Default "LENIENT". Value of VALIDATION_STRINGENCY for SortSam (Picard tools). See the manual for details.
  ValidationStringencySortSam: "LENIENT"

  # STRING. Default "SILENT". Value of VALIDATION_STRINGENCY for MarkDuplicates (Picard tools). See the manual for details.
  ValidationStringencyMarkDuplicates: "SILENT"

  # STRING. Default "ID". Used for filtering reads. Relates to the one-letter abbreviations for CIGAR strings, such as I for insertion and D for deletion. Specify here all the one-letter abbreviations that, if present in the CIGAR string of a read, should cause the read to be filtered. "ID" keeps a read only if its CIGAR string contains neither "I" nor "D" (e.g., only M).
  CIGAR: "ID"

  # INTEGER. Default 4 and -5, respectively. Adjustment of the read start positions on the forward and reverse strand; should be a positive or negative number, respectively. See the Buenrostro paper for details. FOR DATA TYPE ATAC-SEQ ONLY
  adjustRSS_forward: 4
  adjustRSS_reverse: 5

#######################
# SECTION par_scripts #
#######################

# NOTE: There is usually no need to modify the parameters here.

par_scripts:

  # INTEGER. Default 4000. The region size in bp that specifies what is considered within a TSS. The STATS script runs a TSS enrichment test to check whether ATAC-seq reads are primarily located within annotated TSS as opposed to outside of TSS regions. A value of 4000 means the region from -2 kb up to +2 kb around annotated TSS.
  STATS_script_withinThr: 4000

  # INTEGER. Default 1000. The size of the region adjacent to the within-TSS region that is considered outside of a TSS. A value of 1000 therefore denotes the 1 kb region up- and downstream of the within-TSS region (from -3 to -2 kb upstream and from +2 to +3 kb downstream of annotated TSS).
  STATS_script_outsideThr: 1000

  # STRING. Default "protein_coding". Gene type to keep / run the analyses for. Allowed are gene types as specified by GENCODE. The default is "protein_coding".
  STATS_script_geneTypesToKeep: "protein_coding"

  # INTEGER. Default 600. Fragment length cutoff. All reads with a fragment length less than this value will be filtered for the purpose of this script.
  FL_distr_script_cutoff: 600



###########################
# SECTION par_peakCalling #
###########################
par_peakCalling:

  # STRING. Default "--nolambda --nomodel". Peak calling model for non-stringent peak calling.
  modelNonStringent: "--nolambda --nomodel"

  # STRING. Default "--nomodel". Peak calling model for stringent peak calling.
  modelStringent: "--nomodel"

  # NUMERIC [0, 1]. Default 0.01. Minimum q-value threshold for stringent peak calling.
  modelStringent_minQValue: 0.01

  # NUMERIC [0, 1]. Default 0.1. Minimum q-value threshold for non-stringent peak calling.
  modelNonStringent_minQValue: 0.1

  # INTEGER. Default 10000. Value for the slocal parameter for non-stringent peak calling.
  modelNonStringent_slocal: 10000

  # The next 3 options are only relevant if encodePeaks=true and ignored otherwise.

  # NUMERIC [0, 1]. Default 0.1. p-value threshold for ENCODE peak calling.
  Encode_pValThreshold: 0.1

  # STRING. Default "--nomodel --shift -75 --extsize 150 --broad --keep-dup all". Model for ENCODE peak calling (broad and gapped mode).
  Encode_modelBroadAndGapped: "--nomodel --shift -75 --extsize 150 --broad --keep-dup all"

  # STRING. Default "--nomodel --shift -75 --extsize 150 -B --SPMR --keep-dup all --call-summits". Model for ENCODE peak calling (narrow mode).
  Encode_modelNarrow: "--nomodel --shift -75 --extsize 150 -B --SPMR --keep-dup all --call-summits"
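  # For orientation, the non-stringent settings above roughly assemble into a MACS2 call like the
  # following (a sketch with placeholder file names; -f BAMPE and -g hs assume paired-end human data):
  #   macs2 callpeak -t sample.final.bam -f BAMPE -g hs --nolambda --nomodel \
  #     -q 0.1 --slocal 10000 -n sample.nonStringent --outdir peaks/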
Default "--nomodel --shift -75 --extsize 150 -B --SPMR --keep-dup all –call-summits". Model for Encode peak calling (narrow mode) + Encode_modelNarrow: "--nomodel --shift -75 --extsize 150 -B --SPMR --keep-dup all --call-summits" + + +######################### +# SECTION par_deepTools # +######################### +par_deepTools: + + # INTEGER. The default depends on the genome assembly, see the templates for details. Length of the “mappable†genome in bp as defined by deepTools (see https://deeptools.readthedocs.io/en/develop/content/feature/effectiveGenomeSize.html). + #effectiveGenomeSize: 2750000000 + readLength: "" + + # STRING. Default “normalizeTo1xâ€. Either "normalizeTo1x NUMBER" OR "normalizeUsingRPKM" (note the missing (!) leading "--"), where NUMBER denotes the number of base pairs that are mappable. Beware of the mapping dependence on the read length for this number: The reported numbers on the websites are for 30bp reads, and we now have much longer reads usually. See Koehler et al. (2011) for numbers1. + bamCoverage_normalizationCoverage: "RPGC" + + # INTEGER. Default 10. Size of the bins, in bases + bamCoverage_binSize: 10 + + # BOOLEAN. Default true. If the data is pair ended a read will by default be extended to match its pair. + bamCoverage_extendReads: true + + # STRING. Default "--extendReads --centerReads". Additional options that are supported by bamCoverage. Note that the "--" or "-" has to be present here. + bamCoverage_otherOptions: "--centerReads" diff --git a/example/dev/input/execSnakemake.sh b/example/dev/input/execSnakemake.sh new file mode 100644 index 0000000..ccb5b2a --- /dev/null +++ b/example/dev/input/execSnakemake.sh @@ -0,0 +1 @@ +snakemake -s /g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/Snakefile_dev --use-conda --conda-prefix /g/scb2/zaugg/zaugg_shared/Programs/Snakemake/conda --nolock --configfile config.json --latency-wait 30 --dryrun --quiet --verbose --reason --printshellcmds --forceall --rerun-incomplete --timestamp --cores 16 diff --git a/example/dev/input/params.sh b/example/dev/input/params.sh new file mode 100644 index 0000000..e9046f1 --- /dev/null +++ b/example/dev/input/params.sh @@ -0,0 +1,123 @@ +############################### +# TEMPLATE FOR PARAMS.SH FILE # +# ALWAYS UP TO DATE # +############################### + +# GENERAL NOTE: All paths can be relative as well as absolute. Usually, relative paths should be prefered + +################# +# INPUT OPTIONS # +################# + +# Config file to use +configFile="config.yaml" + +# Snakefile to use +snakefile="/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/dev/Snakefile" + +####################### +# PERFORMANCE OPTIONS # +####################### + +# Use a dry run for testing purposes? +dryRun=true + +# Number of cores to use. For local execution, the minimum of this the real number of cores and this number is taken. +# When in cluster mode, the maximum number of CPUs per rule. See the separate cluster specification +nCores=4 + +# Disable locking of the .snakemake directory. Make to to NEVER run more than 1 Snakemake analysis for the same folder in parallel. Usually, there is a check for that, this switch disables it for performance reasons +nolock=true + +# Shadow directory. 
diff --git a/example/dev/input/execSnakemake.sh b/example/dev/input/execSnakemake.sh
new file mode 100644
index 0000000..ccb5b2a
--- /dev/null
+++ b/example/dev/input/execSnakemake.sh
@@ -0,0 +1 @@
snakemake -s /g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/Snakefile_dev --use-conda --conda-prefix /g/scb2/zaugg/zaugg_shared/Programs/Snakemake/conda --nolock --configfile config.json --latency-wait 30 --dryrun --quiet --verbose --reason --printshellcmds --forceall --rerun-incomplete --timestamp --cores 16
diff --git a/example/dev/input/params.sh b/example/dev/input/params.sh
new file mode 100644
index 0000000..e9046f1
--- /dev/null
+++ b/example/dev/input/params.sh
@@ -0,0 +1,123 @@
###############################
# TEMPLATE FOR PARAMS.SH FILE #
# ALWAYS UP TO DATE           #
###############################

# GENERAL NOTE: All paths can be relative as well as absolute. Usually, relative paths should be preferred.

#################
# INPUT OPTIONS #
#################

# Config file to use
configFile="config.yaml"

# Snakefile to use
snakefile="/g/scb2/zaugg/carnold/Projects/AtacSeq/src/Snakemake/dev/Snakefile"

#######################
# PERFORMANCE OPTIONS #
#######################

# Use a dry run for testing purposes?
dryRun=true

# Number of cores to use. For local execution, the minimum of the real number of cores and this number is taken.
# When in cluster mode, the maximum number of CPUs per rule. See the separate cluster specification.
nCores=4

# Disable locking of the .snakemake directory. Make sure to NEVER run more than 1 Snakemake analysis for the same folder in parallel. Usually, there is a check for that; this switch disables it for performance reasons.
nolock=true

# Shadow directory. See the Snakemake help.
shadowDir=""

# Enable various additional output of Snakemake, such as verbose messages and printing shell commands
useVerbose=false

#################################
# CONDA AND SINGULARITY OPTIONS #
#################################
useConda=false

# Directory in which all conda environments are stored. As we use them repeatedly, a designated directory to store them in is useful. Defaults to "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/conda" unless specified otherwise
condaDir="/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/conda"

# Should Singularity be used?
useSingularity=true

# Singularity prefix. Only relevant if useSingularity=true. Defaults to "/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/singularity"
singularityPrefix="/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/singularity"

# Additional arguments for Singularity. Only relevant if useSingularity=true
singularityArgs="--bind /g/scb,/g/scb2,/scratch"


###################
# CLUSTER OPTIONS #
###################

submitToCluster=false

# Use SLURM mode? This can currently always be set to true and is redundant, as only SLURM is supported.
useSLURM=true

# Cluster configuration file
clusterConfig="cluster.json"

# Maximum number of simultaneous jobs
maxJobsCluster=500

# Maximum number of times a failing job should be re-executed. For SLURM, 0 is usually fine, as it is reliable enough.
maxRestartsPerJob=0

################
# RULE OPTIONS #
################

# Default: "", which means use all rules defined in the Snakefile.
# If you want to use only a particular set of rules, specify them here by their rule name and separate them by spaces
allowedRules=""

# Start from only a specific rule? If yes, name the rule here; it has to correspond to the rule name in the Snakefile
runSpecificRule=""

# Run also all downstream rules when runSpecificRule is specified? If set to false, ONLY the specified rule will be run; otherwise, all downstream rules are also triggered
runAlsoDownstreamRules=false

# Should rules marked as incomplete be rerun automatically? Usually, this is a good idea.
rerunIncomplete=true

# Should the WHOLE pipeline be rerun? Use with care, setting this to true will rerun everything
forceRerunAll=true

# Should Snakemake abort after the first error or continue running?
abortAfterFirstError=false


#################
# FILES OPTIONS #
#################

# Use --notemp for development purposes: ignore temp() declarations. This is useful when running only a part of the workflow,
# since temp() would lead to deletion of files that are probably still needed by other parts of the workflow.
ignoreTemp=false

# Only touch output files. Useful when you manually copied data and want to ensure that rules are not run unnecessarily because of timestamp reasons.
touchOutputFiles=false


#######################
# DEVELOPMENT OPTIONS #
#######################

# Ask Christian
ignoreZeroSizedFiles=true

runCustomCommand=""

# Only execute the last part: calling Snakemake
skipSummaryAndDAG=false

# You can usually leave this untouched unless you also suffer from the "bug" that dot has a problem producing PDF files. Then change to "svg" or "png"
workflowGraphFileType="pdf"
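# Typical usage (assuming this file sits next to config.yaml and, if submitting to the cluster,
# cluster.json): adjust the values above, then run the accompanying runSnakefile.sh, which sources
# this file and hands over to the wrapper script, e.g.:
#   ./runSnakefile.sh 2>&1 | tee snakemake_run.log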
Then change to "svg" pr "png" +workflowGraphFileType="pdf" diff --git a/example/dev/input/runSnakefile.sh b/example/dev/input/runSnakefile.sh new file mode 100755 index 0000000..dc1f7ba --- /dev/null +++ b/example/dev/input/runSnakefile.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# Cluster-specific settings, customized for the EMBL SLURM cluster. +# Important: !! Modifications here are only needed if NEITHER conda NOR Singularity is used (see below) !! +# !! Do not modify unless you know what you do!! +# !! Ignore the lines with comments below unless you have a different setup, ask Christian for advice + +# Unload all modules and load the necessary ones for the pipeline +# One exception might be R, which might have to be loaded depending on whether it is preloaded or not in the .profile or .bashrc +# module purge +# module load GCCcore ncurses BEDTools SAMtools R-bundle-Bioconductor/3.5-foss-2016b-R-3.4.0 Autoconf FastQC Trimmomatic snakemake MACS2 deepTools + +######################## +# PATHS AND PARAMETERS # +######################## + +# All parameters and paths are defined here: +. "./params.sh" + + + + +######################## +# RUN AUTOMATED SCRIPT # +######################## +. "/g/scb/zaugg/zaugg_shared/scripts/Christian/src/Snakemake/runSnakemakeWrapper.sh" diff --git a/example/dev/input/runSnakemakeWrapper.sh b/example/dev/input/runSnakemakeWrapper.sh new file mode 100644 index 0000000..897c4a7 --- /dev/null +++ b/example/dev/input/runSnakemakeWrapper.sh @@ -0,0 +1,509 @@ + +# parameters that are currently fixed +customJobStatusScript=false + +####################### +# AUTOMATIC FROM HERE # +####################### + +if [ ! -n "$configFile" ]; then + outputDir="output" +else + # grep output directory automatically + outputDir=`grep -P "\\"outdir\\"\s*:(.+)" $configFile | cut -d":" -f2 | sed 's/[\"]//g' | sed -e 's/^[ \t]*//' | sed -e 's/[,]*//g'` + + if [ ! -n "$outputDir" ]; then + echo "Error: Could not find \"outdir\" parameter in the config file $configFile to automatically set the output directory correctly." + exit 1 + fi + +fi + + +if [ ! -n "$skipPDFWorkflow" ] ; then + skipPDFWorkflow=false +fi + +if [ ! -n "$skipSummaryAndDAG" ] ; then + skipSummaryAndDAG=true +fi + +if [ ! -n "$mailType" ] ; then + mailType="None" +fi + +if [ ! -n "$ignoreZeroSizedFiles" ] ; then + ignoreZeroSizedFiles=true +fi + +if [ ! -n "$rerunIncomplete" ] ; then + rerunIncomplete=true +fi + +if [ ! -n "$nolock" ] ; then + nolock=true +fi + +if [ ! -n "$useSLURM" ] ; then + useSLURM=false +fi + + +if [ ! -n "$condaDir" ] ; then + condaDir="/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/conda" +fi + + +now="$(date +'%Y-%m-%d_%H-%M-%S')" + +# Create a subdirectory where the stuff goes +logDirBasename="LOGS_AND_BENCHMARKS" +reportsDirBasename="LOGS_AND_BENCHMARKS" +outputDirLog="${outputDir}/$logDirBasename" +outputDirReports="${outputDir}/$reportsDirBasename" +inputDirBasename="0.Input/$now" +inputDir="${outputDir}/$inputDirBasename" +logParameters="${inputDir}/SnakemakeParams.log" +fileDAG="$outputDirReports/workflow.dag" +workflowGraphPDF="$outputDirReports/workflow.pdf" +workflowGraphSVG="$outputDirReports/workflow.svg" +stats="$outputDirLog/snakemake.stats" +stats2="$outputDirLog/snakemake.summaryDetailed" + + +if [ ! -d "$outputDirLog" ]; then + mkdir -p $outputDirLog + if [ $? -ne 0 ] ; then + echo "Error: Could not create directory $outputDirLog." + #exit 1 + fi +fi + + +if [ ! -d "$outputDirReports" ]; then + mkdir -p $outputDirReports + if [ $? 
if [ ! -n "$skipPDFWorkflow" ] ; then
    skipPDFWorkflow=false
fi

if [ ! -n "$skipSummaryAndDAG" ] ; then
    skipSummaryAndDAG=true
fi

if [ ! -n "$mailType" ] ; then
    mailType="None"
fi

if [ ! -n "$ignoreZeroSizedFiles" ] ; then
    ignoreZeroSizedFiles=true
fi

if [ ! -n "$rerunIncomplete" ] ; then
    rerunIncomplete=true
fi

if [ ! -n "$nolock" ] ; then
    nolock=true
fi

if [ ! -n "$useSLURM" ] ; then
    useSLURM=false
fi


if [ ! -n "$condaDir" ] ; then
    condaDir="/g/scb2/zaugg/zaugg_shared/Programs/Snakemake/conda"
fi


now="$(date +'%Y-%m-%d_%H-%M-%S')"

# Create the subdirectories where the logs, reports and input copies go
logDirBasename="LOGS_AND_BENCHMARKS"
reportsDirBasename="LOGS_AND_BENCHMARKS"
outputDirLog="${outputDir}/$logDirBasename"
outputDirReports="${outputDir}/$reportsDirBasename"
inputDirBasename="0.Input/$now"
inputDir="${outputDir}/$inputDirBasename"
logParameters="${inputDir}/SnakemakeParams.log"
fileDAG="$outputDirReports/workflow.dag"
workflowGraphPDF="$outputDirReports/workflow.pdf"
workflowGraphSVG="$outputDirReports/workflow.svg"
stats="$outputDirLog/snakemake.stats"
stats2="$outputDirLog/snakemake.summaryDetailed"


if [ ! -d "$outputDirLog" ]; then
    mkdir -p $outputDirLog
    if [ $? -ne 0 ] ; then
        echo "Error: Could not create directory $outputDirLog."
        #exit 1
    fi
fi


if [ ! -d "$outputDirReports" ]; then
    mkdir -p $outputDirReports
    if [ $? -ne 0 ] ; then
        echo "Error: Could not create directory $outputDirReports."
        #exit 1
    fi
fi

if [ ! -d "$inputDir" ]; then
    mkdir -p $inputDir
    if [ $? -ne 0 ] ; then
        echo "Error: Could not create directory $inputDir."
        #exit 1
    fi
fi



echo "Automated parameter report, generated $now" | tee $logParameters
echo "" | tee -a $logParameters

echo "##############" | tee -a $logParameters
echo "# PARAMETERS #" | tee -a $logParameters
echo "##############" | tee -a $logParameters

echo " FILES AND DIRECTORIES" | tee -a $logParameters
echo "  configFile            = $configFile" | tee -a $logParameters
echo "  snakefile             = $snakefile" | tee -a $logParameters
echo "  outputDir             = $outputDir" | tee -a $logParameters
echo "  ignoreZeroSizedFiles  = $ignoreZeroSizedFiles" | tee -a $logParameters
echo "  rerunIncomplete       = $rerunIncomplete" | tee -a $logParameters

echo " PERFORMANCE OPTIONS" | tee -a $logParameters
echo "  nCores                = $nCores" | tee -a $logParameters
echo "  nolock                = $nolock" | tee -a $logParameters
echo "  skipSummaryAndDAG     = $skipSummaryAndDAG" | tee -a $logParameters

echo " DEVELOPMENT OPTIONS" | tee -a $logParameters
echo "  useVerbose            = $useVerbose" | tee -a $logParameters
echo "  dryRun                = $dryRun" | tee -a $logParameters
echo "  abortAfterFirstError  = $abortAfterFirstError" | tee -a $logParameters
echo "  ignoreTemp            = $ignoreTemp" | tee -a $logParameters
echo "  touchOutputFiles      = $touchOutputFiles" | tee -a $logParameters
echo "  runCustomCommand      = $runCustomCommand" | tee -a $logParameters
echo "  workflowGraphFileType = $workflowGraphFileType" | tee -a $logParameters

echo " CONDA OPTIONS" | tee -a $logParameters
echo "  useConda              = $useConda" | tee -a $logParameters
echo "  conda-prefix          = $condaDir" | tee -a $logParameters

echo " CLUSTER OPTIONS" | tee -a $logParameters
echo "  submitToCluster       = $submitToCluster" | tee -a $logParameters
echo "  useSLURM              = $useSLURM" | tee -a $logParameters
echo "  clusterConfig         = $clusterConfig" | tee -a $logParameters
echo "  maxJobsCluster        = $maxJobsCluster" | tee -a $logParameters
echo "  maxNoRestartJobsUponFailure = $maxRestartsPerJob" | tee -a $logParameters
echo "  mailType              = $mailType" | tee -a $logParameters
echo "  customJobStatusScript = $customJobStatusScript" | tee -a $logParameters

echo " RULE OPTIONS" | tee -a $logParameters
echo "  forceRerunAll         = $forceRerunAll" | tee -a $logParameters
echo "  allowedRules          = $allowedRules" | tee -a $logParameters
echo "  runSpecificRule       = $runSpecificRule" | tee -a $logParameters
echo "  runAlsoDownstreamRules = $runAlsoDownstreamRules" | tee -a $logParameters


# Copy the configuration files etc. to the input folder
if [ -n "$configFile" ]; then
    cp $configFile $inputDir
fi

cp $snakefile $inputDir

if [ -n "$clusterConfig" ]; then
    cp $clusterConfig $inputDir
fi

configfileDirective=""
verboseDirective=""
tempDirective=""
printShellDirective=""
clusterDirective=""
forceRerunDirective=""
dryRunDirective=""
touchDirective=""
allowedRulesDirective=""
runSpecificRuleDirective=""
maxRestartDirective=""
condaDirective=""
keepGoingDirective="--keep-going"
nolockDirective=""
rerunIncompleteDirective=""
printShellCommands=false

if [ "$ignoreZeroSizedFiles" = false ] ; then

    # Check for 0-sized output files and abort if some are present
    echo "

Check for zero-sized files (excluding files in $outputDirLog)...."

    command="find $outputDir -type f -size 0 ! -path '*$logDirBasename*' ! -path '*/.snakemake/*'"
-path '*/.snakemake/*'" + + echo "Execute command \"$command\"" + + + nEmptyFiles=$(eval "$command | wc -l") + if [ $nEmptyFiles -gt 0 ] ; then + echo -e "\nWARNING\nThe following $nEmptyFiles zero-sized files have been found:" + emptyFiles=$(eval "$command") + echo $emptyFiles + echo "Check them carefully and delete them to avoid potential issues" + echo "Use '$command -delete' to delete them" + exit 1 + fi + +else + echo "\n\nZero sized files have been chosen to ignore. Use this option if you are sure that all zero sized files are supposed to be zero. It might be that during the Snakemake run something went wrong." +fi + + +if [ ! -f "$snakefile" ]; then + echo "Error: Snakefile $snakefile not found." + exit 1 +fi + +if [ "$touchOutputFiles" = true ] ; then + touchDirective="--touch" +fi + + +if [ -n "$configFile" ]; then + configfileDirective="--configfile $configFile" +fi + +if [ "$nolock" = true ] ; then + nolockDirective="--nolock" +fi + +if [ "$useConda" = true ] ; then + condaDirective="--use-conda --conda-prefix $condaDir" +fi + +if [ "$abortAfterFirstError" = true ] ; then + keepGoingDirective="" +fi + + + + +if [ "$useVerbose" = true ] ; then + verboseDirective="--verbose --reason " + printShellCommands=true +fi + +if [ "$dryRun" = true ] ; then + dryRunDirective="--dryrun" + dryRunDirective="--dryrun --quiet" + +fi + + +if [ -n "$allowedRules" ] ; then + allowedRulesDirective="--allowed-rules $allowedRules" + +fi + + +if [ -n "$runSpecificRule" ] ; then + runSpecificRuleDirective="--forcerun $runSpecificRule" + if [ "$runAlsoDownstreamRules" = false ] ; then + runSpecificRuleDirective="--forcerun $runSpecificRule --until $runSpecificRule" + allowedRulesDirective="" # For now, reset this as the combination does not seem to work properly + fi + +fi + +if [ "$ignoreTemp" = true ] ; then + tempDirective="--notemp" +fi + +if [ "$rerunIncomplete" = true ] ; then + rerunIncompleteDirective="--rerun-incomplete" +fi + +if [ "$printShellCommands" = true ] ; then + printShellDirective="--printshellcmds" +fi + +if [ "$submitToCluster" = false ] ; then + if [ $nCores -gt 8 ] ; then + echo "Many CPUs specified for local computation, make sure this value is ok." + + fi +fi + +if [ $maxRestartsPerJob -gt 0 ] ; then + if [ "$submitToCluster" = true ] ; then + maxRestartDirective="--restart-times $maxRestartsPerJob" + fi +fi + + +if [ "$submitToCluster" = true ] ; then + + if [ ! -f "$clusterConfig" ]; then + echo "Error: File $clusterConfig not found." + exit 1 + fi + + # if [ $maxJobsCluster -gt 500 ] ; then + # echo "Warning: maxJobsCluster=$maxJobsCluster too high! Adjust maxJobsCluster to 500..." + # maxJobsCluster=500 + # fi + + # Check if output and error directory can be written to + + outputFile=`grep -P "\\"output\\"\s*:(.+)" $clusterConfig | cut -d":" -f2 | sed 's/[\"]//g' | sed -e 's/^[ \t]*//' | sed -e 's/[,]*//g'` + errorFile=`grep -P "\\"error\\"\s*:(.+)" $clusterConfig | cut -d":" -f2 | sed 's/[\"]//g' | sed -e 's/^[ \t]*//' | sed -e 's/[,]*//g'` + + outputDir=$(dirname $outputFile) + errorDir=$(dirname $errorFile) + + if [ ! -d "$outputDir" ]; then + echo "Error: The specified \"output\" directory \"$outputDir\" in \"$clusterConfig\" is invalid and does not exist." + exit 1 + fi + + if [ ! -d "$errorDir" ]; then + echo "Error: The specified \"error\" directory \"$errorDir\" in \"$clusterConfig\" is invalid and does not exist." 
if [ "$ignoreTemp" = true ] ; then
    tempDirective="--notemp"
fi

if [ "$rerunIncomplete" = true ] ; then
    rerunIncompleteDirective="--rerun-incomplete"
fi

if [ "$printShellCommands" = true ] ; then
    printShellDirective="--printshellcmds"
fi

if [ "$submitToCluster" = false ] ; then
    if [ $nCores -gt 8 ] ; then
        echo "Many CPUs specified for local computation, make sure this value is ok."
    fi
fi

if [ $maxRestartsPerJob -gt 0 ] ; then
    if [ "$submitToCluster" = true ] ; then
        maxRestartDirective="--restart-times $maxRestartsPerJob"
    fi
fi


if [ "$submitToCluster" = true ] ; then

    if [ ! -f "$clusterConfig" ]; then
        echo "Error: File $clusterConfig not found."
        exit 1
    fi

    # if [ $maxJobsCluster -gt 500 ] ; then
    #     echo "Warning: maxJobsCluster=$maxJobsCluster too high! Adjust maxJobsCluster to 500..."
    #     maxJobsCluster=500
    # fi

    # Check whether the output and error directories can be written to

    outputFile=`grep -P "\"output\"\s*:(.+)" $clusterConfig | cut -d":" -f2 | sed 's/[\"]//g' | sed -e 's/^[ \t]*//' | sed -e 's/[,]*//g'`
    errorFile=`grep -P "\"error\"\s*:(.+)" $clusterConfig | cut -d":" -f2 | sed 's/[\"]//g' | sed -e 's/^[ \t]*//' | sed -e 's/[,]*//g'`

    outputDir=$(dirname $outputFile)
    errorDir=$(dirname $errorFile)

    if [ ! -d "$outputDir" ]; then
        echo "Error: The specified \"output\" directory \"$outputDir\" in \"$clusterConfig\" is invalid and does not exist."
        exit 1
    fi

    if [ ! -d "$errorDir" ]; then
        echo "Error: The specified \"error\" directory \"$errorDir\" in \"$clusterConfig\" is invalid and does not exist."
        exit 1
    fi

    # Check HOSTNAME
    hostname=`hostname`

    queueDirective=""
    nodesDirective=""
    nameDirective=""
    groupDirective=""
    nCPUsDirective=""
    memoryDirective=""
    maxTimeDirective=""
    outputDirective=""
    errorDirective=""
    qosDirective=""
    excludeDirective=""

    generalMessage="Check the validity of the cluster file on the cluster system (SLURM or LSF) you want to run it on."


    if [ "$hostname" = "login.cluster.embl.de" ] || [ "$hostname" = "spinoza.embl.de" ] ; then

        echo "Using the SLURM cluster because the hostname is \"$hostname\""


        nHits=$(grep -c queueSLURM $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            queueDirective="-p {cluster.queueSLURM}"
        fi

        nHits=$(grep -c nodes $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            nodesDirective="-C {cluster.nodes}"
        fi

        nHits=$(grep -c name $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            nameDirective="-J {cluster.name}"
        fi

        nHits=$(grep -c group $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            groupDirective="-A {cluster.group}"
        fi

        nHits=$(grep -c nCPUs $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            nCPUsDirective="--cpus-per-task {cluster.nCPUs}"
        fi


        nHits=$(grep -c threads $clusterConfig)
        if [ "$nHits" -eq "0" ]; then
            echo "Could not find \"threads\" in file \"$clusterConfig\". Use nCPUs: \"{threads}\". $generalMessage"
            exit 1
        fi


        nHits=$(grep -c memory $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            memoryDirective="--mem {cluster.memory}"
        fi

        nHits=$(grep -c maxTime $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            maxTimeDirective="--time {cluster.maxTime}"
        fi

        nHits=$(grep -c output $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            outputDirective="-o \"{cluster.output}\""
        fi


        nHits=$(grep -c exclude $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            excludeDirective="--exclude {cluster.exclude}"
        fi


        nHits=$(grep -c error $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            errorDirective="-e \"{cluster.error}\""
        fi

        nHits=$(grep -c qos $clusterConfig)
        if [ "$nHits" -gt "0" ]; then
            qosDirective="--qos={cluster.qos}"
        fi

        clusterSpecifics="--cluster \" sbatch $queueDirective $nameDirective $groupDirective $nodesDirective $nCPUsDirective $memoryDirective $maxTimeDirective $outputDirective $errorDirective $qosDirective $excludeDirective --mail-type=$mailType --parsable \""

        #clusterSpecifics=" --drmaa \" -p {cluster.queueSLURM} -J {cluster.name} -A {cluster.group} -N {cluster.nNodes} -n {cluster.nCores} --mem {cluster.memory} -o \"{cluster.output}\" -e \"{cluster.error}\" --mail-type NONE \""


    else

        echo "Would use bsub because the hostname is not \"login.cluster.embl.de\""
        echo "Exiting, LSF/bsub is not supported anymore"
        exit 1

        clusterSpecifics="--cluster \" bsub -q {cluster.queue} -J {cluster.name} -n {cluster.nCPUs} -R \"{cluster.resources}\" -M {cluster.memory} -o \"{cluster.output}\" -e \"{cluster.error}\" \""

        nHits=$(grep -c queue $clusterConfig)
        if [ "$nHits" -eq "0" ]; then
            echo "Could not find \"queue\" parameter in file \"$clusterConfig\". $generalMessage"
            exit 1
        fi


        nHits=$(grep -c resources $clusterConfig)
        if [ "$nHits" -eq "0" ]; then
            echo "Could not find \"resources\" parameter in file \"$clusterConfig\". $generalMessage"
            exit 1
        fi

    fi
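    # For orientation, a minimal cluster.json that satisfies the checks above might look like this
    # (illustrative values; "__default__" is Snakemake's standard fallback key for --cluster-config):
    #   {
    #       "__default__": {
    #           "queueSLURM": "htc",
    #           "name":       "{rule}.{wildcards}",
    #           "group":      "zaugg",
    #           "nCPUs":      "{threads}",
    #           "memory":     "8G",
    #           "maxTime":    "24:00:00",
    #           "output":     "/path/to/logs/%x_%j.out",
    #           "error":      "/path/to/logs/%x_%j.err"
    #       }
    #   }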
$generalMessage"; + exit 1; + fi + + fi + + clusterStatusDirective="" + if [ "$customJobStatusScript" = true ] ; then + clusterStatusDirective="--cluster-status /g/scb/zaugg/zaugg_shared/scripts/Christian/src/Snakemake/SLURM_jobStatus.py" + fi + + clusterDirective="--jobs $maxJobsCluster --cluster-config $clusterConfig $clusterSpecifics --local-cores 1 $clusterStatusDirective" + + # nCores=16 + +fi + +if [ "$forceRerunAll" = true ] ; then + forceRerunDirective="--forceall" +fi + + +echo "" | tee -a $logParameters +echo "####################################" | tee -a $logParameters +echo "# Execute the following command(s) #" | tee -a $logParameters +echo "####################################" | tee -a $logParameters + +if [ -n "$runCustomCommand" ] ; then + echo "Run custom command only:" + commandFull="$runCustomCommand" + echo "$commandFull" +else + + # Run 1: Detailed summary about what files will be generated + command1="snakemake -s $snakefile $configfileDirective $forceRerunDirective $runSpecificRuleDirective $tempDirective --detailed-summary >$stats2" + + #Run 2: Produce a workflow graph + command2="snakemake -s $snakefile --configfile $configFile --forceall --dag > $fileDAG" + + if [ "$skipPDFWorkflow" = true ] ; then + command2a="echo \"skipping creation of PDF workflow graph\"" + else + command2a="dot $fileDAG -Tpdf > $workflowGraphPDF" + fi + + # Also do a SVG in addition for easier edits + command2b="dot $fileDAG -Tsvg > $workflowGraphSVG" + + # Run 3: Main run: Execute the pipeline + command3="snakemake -s $snakefile $condaDirective $nolockDirective $configfileDirective --latency-wait 30 $dryRunDirective $touchDirective $allowedRulesDirective $tempDirective $runSpecificRuleDirective $verboseDirective $printShellDirective $forceRerunDirective $rerunIncompleteDirective --timestamp --cores $nCores $maxRestartDirective $keepGoingDirective --stats $stats $clusterDirective" + + + #commandFull="$command2 && $command2a && $command2b" + + if [ "$skipSummaryAndDAG" = true ] ; then + + commandFull="$command3" + + else + + commandFull="$command1 && $command2 && $command2a && $command2b && $command3" + + echo "$command1" | tee -a $logParameters + echo "#######################" | tee -a $logParameters + echo "$command2" | tee -a $logParameters + echo "#######################" | tee -a $logParameters + echo "$command2a" | tee -a $logParameters + echo "#######################" | tee -a $logParameters + echo "$command2b" | tee -a $logParameters + echo "#######################" | tee -a $logParameters + fi + + echo "$command3" | tee -a $logParameters + echo "############################################" | tee -a $logParameters + +fi + + +eval $commandFull diff --git a/example/dev/input/samples.csv b/example/dev/input/samples.csv new file mode 100644 index 0000000..525e4dd --- /dev/null +++ b/example/dev/input/samples.csv @@ -0,0 +1,4 @@ +individual sampleName path_inputForward path_inputReverse Flowcell_ID lane_ID Technology Library_ID replicate_No +"test1" "test1_rep1" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep1_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" +"test1" "test1_rep2" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test1_rep2_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" +"test2" "test2_rep1" 
"/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_1.fastq.gz" "/g/scb2/zaugg/carnold/Projects/AtacSeq/example/dev/input/data/test2_rep1_2.fastq.gz" "NA" "NA" "ILLUMINA" "default" "1" -- GitLab