# Normalization applied to the RNA-Seq counts and to the ATAC-Seq peak counts.
# Allowed values: "quantile", "DESeq_sizeFactor", "none".
#   - RNA-Seq default:  "quantile"
#   - ATAC-Seq default: "DESeq_sizeFactor" (a "regular" DESeq size factor normalization)
# With "none", you have to normalize the counts yourself beforehand. Make sure the
# chosen normalization is not sensitive to outliers: RNA-Seq outlier counts, for
# example, can impact the classification, which is why quantile normalization
# worked well for the datasets used so far.
normMethodRNA    <- "quantile"
normMethod_peaks <- "DESeq_sizeFactor"

# NOTE(review): presumably the names of the ID columns in the peak and RNA input tables - confirm
idColumn_peaks <- "peakID"
idColumn_RNA   <- "ENSEMBL"
# Root output folder (a trailing slash on the resulting path is optional)
# NOTE(review): paste0() joins without a separator, so rootDir itself must already end with "/" - confirm where rootDir is defined
dir_output <- paste0(rootDir, "output/")

# NOTE(review): presumably the number of CPU cores used for parallel steps; it is passed as nCores to addConnections_peak_gene() - confirm
nCores <- 2
# Arbitrary list with information and metadata that is stored within the GRN object
# Should the pipeline be run for only a subset of TFs or for all of them? The special keyword "all" uses every TF found in the HOCOMOCO folder; to consider only a subset, give the TF names as a character vector built with c()
TFs <- "all"

# NOTE(review): presumably an upper limit on the number of TFs that are processed - confirm.
# Alternative setting: nTFMax <- NULL (presumably "no limit" - confirm).
# Only one assignment is kept here; an earlier NULL assignment directly followed by 50 was a dead store.
nTFMax <- 50
# Base directory of the folder with the TFBS predictions.
# The TFBS predictions are expected as *.bed files as well as a translation table with the name translationTable.csv
# We provide all files here: https://www.embl.de/download/zaugg/GRN/hg19_hg38_mm10_PWMScan.zip (7.5 GB)
# Make sure they are in the same genome assembly as the ATAC data
# BED file with TAD domains. If no TAD domains are available, leave it empty ("") or set it to NULL
file_input_TADs <- ""
# How peaks are overlapped with genes; either "TSS" or "full":
#   "full": any overlap of the extended peak with the gene counts, regardless of where in the gene it occurs
#   "TSS":  only overlap of the extended peak with the TSS of the gene (assumed to be at the 5' position) counts
# This parameter did not exist before 09.09.20; until then, "full" was effectively the default.
overlapTypeGene <- "TSS"
# Neighborhood size in bp, applied both upstream and downstream of each peak, within which
# genes are searched for and associated/correlated with the peak.
# Only relevant when no TAD domains are provided; with TADs, this parameter can be ignored.
# The default value is 250000; a smaller value is used here to decrease the running time.
promoterRange <- 10000
# Add peak-gene connections to the GRN object and store the updated object back in GRN.
# Uses the overlap type, neighborhood size and core count configured above, with
# correlation method "pearson", diagnostic plots enabled, and forceRerun=TRUE.
# The TAD file setting (file_input_TADs) is forwarded instead of a hard-coded NULL:
# an empty string or NULL means "no TADs" and is passed on as NULL, so the call is
# unchanged for the configuration above, while a configured TAD file is no longer ignored.
GRN <- addConnections_peak_gene(
  GRN,
  overlapTypeGene = overlapTypeGene,
  corMethod = "pearson",
  promoterRange = promoterRange,
  file_TADs = if (is.null(file_input_TADs) || identical(file_input_TADs, "")) NULL else file_input_TADs,
  nCores = nCores,
  plotDiagnosticPlots = TRUE,
  forceRerun = TRUE
)
######################
# FILTER CONNECTIONS #
######################
# Save a filtered GRN with genes with loose thresholds. The user can then read it back in and filter more stringently.
# Which gene types to keep for peak-gene correlations when connecting TF-peaks and peak-genes?
# Set to "all" to keep all gene types, or a subset of gene types as defined by Gencode. Default is c("protein_coding", "lincRNA")
# Normalization methods for RNA and ATAC. The default for RNA-Seq is quantile normalization, for ATAC-Seq a "regular" DESeq size factor normalization
# Possible values are: "quantile", "DESeq_sizeFactor", and "none". If "none" is used, you have to normalize counts beforehand on your own. However, make
# sure that the normalization is not sensitive to outliers. The classification can be impacted by RNA-Seq outlier counts, for example, which is why we found quantile normalization to work well for the datasets we worked with so far.
# Should the pipeline be run for only a subset of TFs or for all of them? The special keyword "all" uses every TF found in the HOCOMOCO folder; to consider only a subset, give the TF names as a character vector built with c()
TFs <- "all"

# NOTE(review): presumably an upper limit on the number of TFs that are processed - confirm.
# Alternative setting: nTFMax <- NULL (presumably "no limit" - confirm).
# Only one assignment is kept here; an earlier NULL assignment directly followed by 50 was a dead store.
nTFMax <- 50
# Base directory of the folder with the TFBS predictions.
# The TFBS predictions are expected as *.bed files as well as a translation table with the name translationTable.csv
# We provide all files here: https://www.embl.de/download/zaugg/GRN/hg19_hg38_mm10_PWMScan.zip (7.5 GB)
# Make sure they are in the same genome assembly as the ATAC data
# BED file with TAD domains. If no TAD domains are available, leave it empty ("") or set it to NULL
file_input_TADs <- ""
# How peaks are overlapped with genes; either "TSS" or "full":
#   "full": any overlap of the extended peak with the gene counts, regardless of where in the gene it occurs
#   "TSS":  only overlap of the extended peak with the TSS of the gene (assumed to be at the 5' position) counts
# This parameter did not exist before 09.09.20; until then, "full" was effectively the default.
overlapTypeGene <- "TSS"
# Neighborhood size in bp, applied both upstream and downstream of each peak, within which
# genes are searched for and associated/correlated with the peak.
# Only relevant when no TAD domains are provided; with TADs, this parameter can be ignored.
# The default value is 250000; a smaller value is used here to decrease the running time.
promoterRange <- 10000
# Add peak-gene connections to the GRN object and store the updated object back in GRN.
# Uses the overlap type, neighborhood size and core count configured above, with
# correlation method "pearson", diagnostic plots enabled, and forceRerun=TRUE.
# The TAD file setting (file_input_TADs) is forwarded instead of a hard-coded NULL:
# an empty string or NULL means "no TADs" and is passed on as NULL, so the call is
# unchanged for the configuration above, while a configured TAD file is no longer ignored.
GRN <- addConnections_peak_gene(
  GRN,
  overlapTypeGene = overlapTypeGene,
  corMethod = "pearson",
  promoterRange = promoterRange,
  file_TADs = if (is.null(file_input_TADs) || identical(file_input_TADs, "")) NULL else file_input_TADs,
  nCores = nCores,
  plotDiagnosticPlots = TRUE,
  forceRerun = TRUE
)
######################
# FILTER CONNECTIONS #
######################
# Save a filtered GRN with genes with loose thresholds. The user can then read it back in and filter more stringently.
# Which gene types to keep for peak-gene correlations when connecting TF-peaks and peak-genes?
# Set to "all" to keep all gene types, or a subset of gene types as defined by Gencode. Default is c("protein_coding", "lincRNA")