Commit f39e6ace authored by Christian Arnold's avatar Christian Arnold
Browse files

Cleaned repo

parent 528c777f
Pipeline #28888 passed with stage
in 12 seconds
"sample_id" "assigned" "assigned_frac" "atac_date" "clone" "condition" "diff_start" "donor" "EB_formation" "macrophage_diff_days" "medium_changes" "mt_frac" "percent_duplication" "received_as" "sex" "short_long_ratio"
"babk_D" 5507093 0.210921230312979 "2015-12-04" 2 "IFNg_SL1344" "2015-10-12" "babk" "2015-10-09" 56 10 0.456376650921142 0.214115 "alive/frozen" "female" 3.93397533197864
"bima_D" 23275756 0.677443730848284 "2014-12-12" 1 "IFNg_SL1344" "2014-11-07" "bima" "2014-11-04" 38 6 0.117519740693119 0.079756 "alive" "male" 2.9130719846891
"cicb_D" 19751751 0.579721263614574 "2015-04-24" 3 "IFNg_SL1344" "2015-03-30" "cicb" "2015-03-27" 28 6 0.155462480193979 0.066931 "frozen" "male" 1.78650919136278
"coyi_D" 6733642 0.31185057682225 "2015-11-05" 3 "IFNg_SL1344" "2015-09-30" "coyi" "2015-09-27" 39 7 0.385297241068573 0.1026 "frozen" "female" 2.53084184878867
"diku_D" 7010213 0.195160647410325 "2015-11-13" 1 "IFNg_SL1344" "2015-10-15" "diku" "2015-10-12" 32 6 0.193993747097176 0.06893 "frozen" "female" 3.76757978572267
"eipl_D" 16923025 0.519579168206653 "2015-08-04" 1 "IFNg_SL1344" "2015-06-30" "eipl" "2015-06-27" 38 7 0.13988117725185 0.06944 "frozen" "female" 1.93265133137896
"eiwy_D" 9807860 0.4044462839792 "2015-12-02" 1 "IFNg_SL1344" "2015-10-23" "eiwy" "2015-10-20" 43 7 0.302453957654079 0.090776 "frozen" "female" 1.75613407276057
"eofe_D" 25687477 0.645688915687569 "2014-12-12" 1 "IFNg_SL1344" "2014-11-01" "eofe" "2014-10-29" 44 8 0.142640514763618 0.080655 "alive" "female" 2.63757415618575
"fafq_D" 4600004 0.414696506879133 "2015-10-14" 1 "IFNg_SL1344" "2015-09-16" "fafq" "2015-09-13" 31 5 0.169076649746682 0.087027 "frozen" "male" 1.92922628617947
"febc_D" 31712153 0.430018180628596 "2015-08-04" 2 "IFNg_SL1344" "2015-07-06" "febc" "2015-07-03" 32 6 0.400115313238623 0.106825 "frozen" "female" 1.45678159498552
"fikt_D" 12120655 0.43249826902993 "2015-11-05" 3 "IFNg_SL1344" "2015-10-03" "fikt" "2015-09-30" 36 5 0.301717390025625 0.090902 "frozen" "female" 1.68462488275924
"guss_D" 13876015 0.433140013544195 "2015-11-18" 1 "IFNg_SL1344" "2015-10-16" "guss" "2015-10-13" 36 5 0.356285810649081 0.080022 "frozen" "female" 2.03502060233313
"hayt_D" 20369487 0.628709157220539 "2015-10-09" 1 "IFNg_SL1344" "2015-09-12" "hayt" "2015-09-09" 30 6 0.309818415540119 0.102584 "frozen" "male" 1.92932077333801
"hehd_D" 3964874 0.192481936100666 "2015-11-05" 2 "IFNg_SL1344" "2015-09-30" "hehd" "2015-09-27" 39 8 0.248352910302544 0.097714 "frozen" "female" 3.58283704670194
"heja_D" 17033933 0.533817355964094 "2015-08-12" 1 "IFNg_SL1344" "2015-06-29" "heja" "2015-06-26" 47 9 0.114173433891483 0.080099 "frozen" "male" 1.39981868677139
"hiaf_D" 7907908 0.219613703665739 "2015-07-22" 2 "IFNg_SL1344" "2015-06-18" "hiaf" "2015-06-15" 37 7 0.087407665795477 0.045106 "frozen" "male" 3.71903790659248
"iill_D" 12728643 0.50291828405585 "2015-11-05" 1 "IFNg_SL1344" "2015-09-27" "iill" "2015-09-24" 42 7 0.276029290754818 0.08604 "frozen" "male" 1.89858291866774
"kuxp_D" 12493577 0.438787848094188 "2015-07-22" 1 "IFNg_SL1344" "2015-06-21" "kuxp" "2015-06-18" 34 6 0.188645298218458 0.068606 "frozen" "male" 1.68808634956033
"nukw_D" 17850501 0.645243508607759 "2015-10-27" 1 "IFNg_SL1344" "2015-09-23" "nukw" "2015-09-20" 46 7 0.234492649167015 0.104232 "frozen" "female" 1.26760229695547
"oapg_D" 13706648 0.478080924747425 "2015-07-31" 5 "IFNg_SL1344" "2015-06-21" "oapg" "2015-06-18" 43 7 0.211976949581975 0.064611 "frozen" "female" 1.81448916921816
"oevr_D" 10181246 0.387041137247721 "2015-10-09" 3 "IFNg_SL1344" "2015-08-30" "oevr" "2015-08-27" 48 8 0.171515043883457 0.102247 "frozen" "male" 2.29579995766913
"pamv_D" 12568782 0.530024876051982 "2015-07-31" 3 "IFNg_SL1344" "2015-07-01" "pamv" "2015-06-28" 33 6 0.226667604999457 0.083218 "frozen" "male" 3.37640181895908
"pelm_D" 12885093 0.460903123941983 "2015-08-12" 3 "IFNg_SL1344" "2015-07-03" "pelm" "2015-06-30" 43 8 0.141307760306748 0.056814 "frozen" "female" 1.7075403478253
"podx_D" 15095740 0.498507670978346 "2015-07-22" 1 "IFNg_SL1344" "2015-06-26" "podx" "2015-06-23" 29 5 0.278886251713605 0.076971 "frozen" "female" 2.50667597769246
"qolg_D" 14636966 0.508384278972431 "2015-11-13" 3 "IFNg_SL1344" "2015-10-10" "qolg" "2015-10-07" 29 6 0.345044067138392 0.079032 "frozen" "male" 1.67190803527389
"sojd_D" 9616378 0.359051864041036 "2015-10-14" 3 "IFNg_SL1344" "2015-09-11" "sojd" "2015-09-08" 36 7 0.218259508709228 0.076186 "frozen" "female" 3.22064049478649
"vass_D" 21993598 0.572955943483222 "2015-04-24" 1 "IFNg_SL1344" "2015-03-29" "vass" "2015-03-26" 29 7 0.193210532921721 0.07022 "frozen" "female" 1.69032752731357
"xugn_D" 17505168 0.532266309510938 "2015-07-22" 1 "IFNg_SL1344" "2015-06-11" "xugn" "2015-06-08" 44 8 0.128462362140362 0.07249 "frozen" "male" 3.48786721342933
"zaui_D" 7297175 0.250887074437376 "2015-11-18" 3 "IFNg_SL1344" "2015-10-11" "zaui" "2015-10-08" 41 9 0.219835807676744 0.073612 "frozen" "female" 3.6017018666313
"uaqe_D" 10294277 0.356799693383731 "2015-10-27" 1 "IFNg_SL1344" "2015-10-01" "uaqe" "2015-09-28" "NA" 7 0.218277844876963 0.090727 "frozen" "female" 4.59939570834911
"qaqx_D" 9562711 0.355390033860629 "2015-11-18" 1 "IFNg_SL1344" "2015-10-01" "qaqx" "2015-09-28" "NA" 11 0.30661352138567 0.072576 "frozen" "female" 2.78684999474948
startTime <- Sys.time()
library(devtools)
devtools::install_gitlab("grp-zaugg/grn", subdir = "src/GRNdev", host = "git.embl.de", force = TRUE)
library(GRNdev)
rootDir = "/g/scb2/zaugg/carnold/Projects/GRN_pipeline/example/dev/"
# Genome assembly shortcut. Either hg19, hg38 or mm10. Both ATAC and RNA must have the same genome assembly
genomeAssembly = "hg38"
#############
# READ DATA #
#############
# Raw counts ATAC for the consensus peaks. All rows will be used for the pipeline, so this should constitute your consensus peaks.
# No row filtering is necessary, 0 rows will be filtered in the pipeline
# One of the columns must be an ID column called "peakID", in the usual format "chr:start-end". All other columns are the counts for the samples
file_input_ATAC = paste0(rootDir, "input/countsATAC.75k.tsv.gz")
# Raw counts RNA
# No row filtering is necessary, 0 rows will be filtered in the pipeline
# One of the columns must be an ID column called "ENSEMBL", denoting ENSEMBL IDs.
file_input_RNA = paste0(rootDir, "input/countsRNA.sampled.tsv.gz")
countsRNA.df = read_tsv(file_input_RNA, col_types = cols())
countsATAC.df = read_tsv(file_input_ATAC, col_types = cols())
file_metadata = paste0(rootDir, "input/metadata.sampled.tsv")
metadata.all = read_tsv(file_metadata)
# Base directory of the folder with the TFBS predictions.
# The TFBS predictions are expected as *.bed files as well as a translation table with the name translationTable.csv
# Make sure they are in the same genome assembly as the ATAC data
motifFolder = "/g/scb2/zaugg/zaugg_shared/annotations/TFBS/hg38/PWMScan_HOCOMOCOv11"
# Normalization methods for RNA and ATAC. The default for RNA-Seq is quantile normalization, for ATAC-Seq a "regular" DESeq size factor normalization
# Possible values are: "quantile", "DESeq_sizeFactor", and "none". If "none" is used, you have to normalize counts beforehand on your own. However, make
# sure that the normalization is not sensitive to outliers. The classification can be impacted by RNA-Seq outlier counts, for example, which is why we found quantile normalization to work well for the datasets we worked with so far.
normMethodRNA = "quantile"
normMethod_peaks = "DESeq_sizeFactor"
idColumn_peaks = "peakID"
idColumn_RNA = "ENSEMBL"
# Root output folder (it does not matter whether you put a trailing slash or not)
dir_output = paste0(rootDir, "output/")
nCores = 2
# Arbitrary list with information and metadata that is stored within the GRN object
objectMetadata.l = list(name = paste0("Macrophages_infected_primed"),
file_atac = file_input_ATAC,
file_rna = file_input_RNA,
genomeAssembly = genomeAssembly,
file_metadata = file_metadata
)
#######################
# AUTOMATIC FROM HERE #
#######################
if (!dir.exists(dir_output)) dir.create(dir_output)
library(checkmate)
assertDirectoryExists(rootDir)
assertList(objectMetadata.l)
assertDirectory(dir_output)
assertChoice(genomeAssembly, c("mm10","hg19","hg38"))
assertDataFrame(countsATAC.df, min.rows = 1000, min.cols = 4)
assertDataFrame(countsRNA.df, min.rows = 1000, min.cols = 4)
#########################
# INITIALIZE GRN OBJECT #
#########################
GRN = initializeGRN(objectMetadata = objectMetadata.l,
outputFolder = dir_output,
genomeAssembly = genomeAssembly)
GRN = addData(GRN,
countsATAC.df, normalization_peaks = normMethod_peaks, idColumn_peaks = idColumn_peaks,
countsRNA.df, normalization_rna = normMethodRNA, idColumn_RNA = idColumn_RNA,
sampleMetadata = metadata.all,
forceRerun = TRUE)
######################################
# PCA plots and metadata correlation #
######################################
GRN = plotPCA_all(GRN, forceRerun = TRUE)
###########################################
# Parse TFBS folder and Gene - TF mapping #
###########################################
# Should the pipeline be run for only a subset of TFs or all? The special keyword "all" will use all TF that are found in the HOCOMOCO folder; however, if only a subset should be considered, specify the subset here with c() and the TF names, as shown below
TFs = "all"
nTFMax = NULL
nTFMax = 50
# Base directory of the folder with the TFBS predictions.
# The TFBS predictions are expected as *.bed files as well as a translation table with the name translationTable.csv
# We provide all files here: https://www.embl.de/download/zaugg/GRN/hg19_hg38_mm10_PWMScan.zip (7.5 GB)
# Make sure they are in the same genome assembly as the ATAC data
hocomocoVersion = if_else(genomeAssembly == "hg38", "v11", "v10")
motifFolder = paste0("/g/scb2/zaugg/zaugg_shared/annotations/TFBS/", genomeAssembly, "/PWMScan_HOCOMOCO", hocomocoVersion)
GRN = addTFBS(GRN, motifFolder = motifFolder, TFs = TFs, nTFMax = nTFMax, filesTFBSPattern = "_TFBS", fileEnding = ".bed", forceRerun = TRUE)
#####################
# PEAKS TFBS OVERLAP #
######################
GRN = overlapPeaksAndTFBS(GRN, nCores = nCores, forceRerun = TRUE)
####################
# FILTER ATAC DATA #
####################
# Chromosomes to include for ATAC peaks and peak-gene associations. This should be a vector of chromosome names
chrToKeep_peaks = c(paste0("chr", 1:22), "chrX", "chrY")
GRN = filterData(GRN, minNormalizedMeanATAC = 5, minNormalizedMeanRNA = 1, chrToKeep_peaks = chrToKeep_peaks, maxPeakSize = 10000, forceRerun = TRUE)
#######################
# TF-PEAK CONNECTIONS #
#######################
useGCCorrection = FALSE
GRN = addConnections_TF_peak(GRN, add_TFActivity = TRUE, plotDiagnosticPlots = TRUE, remove_negCor_TFActivity = TRUE,
corMethod = "pearson",
useGCCorrection = useGCCorrection, percBackground_size = 75, percBackground_resample = TRUE,
forceRerun = TRUE)
##########################
# RNA-SEQ CLASSIFICATION #
##########################
# Optional step: Runs automatically for both expression and TF activitys
GRN = AR_classification_wrapper(GRN, significanceThreshold_Wilcoxon = 0.05,
plot_minNoTFBS_heatmap = 100, plotDiagnosticPlots = TRUE,
deleteIntermediateData = TRUE,
forceRerun = TRUE)
# Intermediate save
saveRDS(GRN, paste0(dir_output, "/GRN.rds"))
#########################
# PEAK_GENE CONNECTIONS #
#########################
# BED file with TAD domains. If no TSD domains are available, leave empty or set to NULL
file_input_TADs = ""
# Type of overlap for gene: Either "TSS" or "full". If "full", any extended peak-gene overlap is taken, regardless of where in the gene it occurs
# If set to "TSS", only overlap of extended peaks with the TSS of the gene (assumed to be at the 5' position) is considered
# Until 09.09.20, "full" was being used by default, this parameter did not exist before
overlapTypeGene = "TSS"
# Only relevant when no TAD domains are provided; if TADs are provided, this parameter can be ignored.
# Specifies the neighborhood size in bp (for both upstream and downstream of the peak) for peaks to find genes in vicinity and associate/correlate genes with peaks
# Default value 250000, here set to a smaller value to decrease running time
promoterRange = 10000
GRN = addConnections_peak_gene(GRN,
overlapTypeGene = overlapTypeGene,
corMethod = "pearson",
promoterRange = promoterRange, file_TADs = NULL,
nCores = nCores, plotDiagnosticPlots = TRUE,
forceRerun = TRUE)
######################
# FILTER CONNECTIONS #
######################
# Save a filtered GRN with genes with loose thresholds. The user can hen read it back it and filter more stringently.
# Which gene types to keep for peak-gene correlations when connecting TF-peaks and peak-genes?
# Set to "all" to keep all gene types, or a subset of gene types as defined by Gencode. Default is c("protein_coding", "lincRNA")
geneTypes_keep = c("protein_coding", "lincRNA")
TF_peak_FDR_selectViaCorBins = TRUE
for (TF_peak.connectionTypeCur in c("expression", "TFActivity")) {
TF_peak_fdr_cur = 0.3
peak_gene_fdr_cur = 0.3
GRN = filterGRNAndConnectGenes(GRN, TF_peak.fdr.threshold = TF_peak_fdr_cur, TF_peak.connectionTypes = TF_peak.connectionTypeCur,
peak_gene.p_raw.threshold = NULL,
peak_gene.fdr.threshold = peak_gene_fdr_cur,
gene.types = geneTypes_keep,
allowMissingTFs = FALSE, allowMissingGenes = TRUE, add_TF_gene_correlation = TRUE,
peak_gene.r_range = c(0,1), TF_peak_FDR_selectViaCorBins = TF_peak_FDR_selectViaCorBins
)
############################
# ADD TF-GENE CORRELATIONS #
############################
# Optional
GRN = add_TF_gene_correlation(GRN, corMethod = "pearson", addRobustRegression = FALSE, nCores = 1, forceRerun = TRUE)
# Once the filter function has been called, they can be retrieved with a helper function
GRN_connections.all = getGRNConnections(GRN, permutation = 0, type = "all.filtered", include_TF_gene_correlations = TRUE)
filenameCur = paste0(GRN@config$directories$outputRoot,
"/GRN_TF-PeakFDR", TF_peak_fdr_cur,
"_", TF_peak.connectionTypeCur,
"_peakGeneFDR", peak_gene_fdr_cur, "_allowMissingGenes_perm0.tsv.gz")
write_tsv(GRN_connections.all, filenameCur)
TF_peak_fdr_cur = 0.2
peak_gene_fdr_cur = 0.1
for (allowMissingGenesCur in c(TRUE, FALSE)) {
GRN = filterGRNAndConnectGenes(GRN, TF_peak.fdr.threshold = TF_peak_fdr_cur, TF_peak.connectionTypes = TF_peak.connectionTypeCur,
peak_gene.p_raw.threshold = NULL,
peak_gene.fdr.threshold = peak_gene_fdr_cur,
gene.types = geneTypes_keep,
allowMissingTFs = FALSE, allowMissingGenes = allowMissingGenesCur,
peak_gene.r_range = c(0,1))
# Once the filter function has been called, they can be retrieved with a helper function
GRN_connections.all = getGRNConnections(GRN, permutation = 0, type = "all.filtered", include_TF_gene_correlations = TRUE)
suffix = if_else(allowMissingGenesCur, "_allowMissingGenes", "_notAllowMissingGenes")
filenameCur = paste0(GRN@config$directories$outputRoot,
"GRN_TF-PeakFDR", TF_peak_fdr_cur,
"_", TF_peak.connectionTypeCur,
"_peakGeneFDR", peak_gene_fdr_cur, suffix, "_perm0.tsv.gz")
write_tsv(GRN_connections.all, filenameCur)
}
}
#################
# VISUALIZE GRN #
#################
# TODO: Not working at he moment, has to be fixed
# Default metadata visualization
# visualizeGRN(GRN, perm = 0, file = NULL,
# title = if_else(!onlyCompleteCases, "All peaks", "Only peaks with gene connections"),
# maxRowsToPlot = 500,
# useDefaultMetadata = TRUE)
####################################
# WRAPPER FOR AUTOMATED STATISTICS #
####################################
GRN = generateStatsSummary(GRN,
TF_peak.fdr = c(0.001, 0.01, 0.05, 0.1, 0.2), TF_peak.connectionTypes = "all",
peak_gene.p_raw = NULL,
peak_gene.fdr = c(0.001, 0.01, 0.05, 0.1, 0.2),
peak_gene.r_range = c(0,1),
allowMissingGenes = c(FALSE, TRUE),
allowMissingTFs = c(FALSE),
gene.types = geneTypes_keep)
#######################
# ADDITIONAL QC PLOTS #
#######################
GRN = plot_stats_connectionSummary(GRN, type = "heatmap", forceRerun = TRUE)
GRN = plot_stats_connectionSummary(GRN, type = "boxplot", forceRerun = TRUE)
GRN = deleteIntermediateData(GRN)
saveRDS(GRN, paste0(dir_output, "/GRN.rds"))
flog.info("Session info: ", sessionInfo(), capture = TRUE)
endTime <- Sys.time()
futile.logger::flog.info(paste0("Finished successfully. Execution time: ", round(endTime - startTime, 1), " ", units(endTime - startTime)))
"sample_id" "assigned" "assigned_frac" "atac_date" "clone" "condition" "diff_start" "donor" "EB_formation" "macrophage_diff_days" "medium_changes" "mt_frac" "percent_duplication" "received_as" "sex" "short_long_ratio"
"babk_D" 5507093 0.210921230312979 "2015-12-04" 2 "IFNg_SL1344" "2015-10-12" "babk" "2015-10-09" 56 10 0.456376650921142 0.214115 "alive/frozen" "female" 3.93397533197864
"bima_D" 23275756 0.677443730848284 "2014-12-12" 1 "IFNg_SL1344" "2014-11-07" "bima" "2014-11-04" 38 6 0.117519740693119 0.079756 "alive" "male" 2.9130719846891
"cicb_D" 19751751 0.579721263614574 "2015-04-24" 3 "IFNg_SL1344" "2015-03-30" "cicb" "2015-03-27" 28 6 0.155462480193979 0.066931 "frozen" "male" 1.78650919136278
"coyi_D" 6733642 0.31185057682225 "2015-11-05" 3 "IFNg_SL1344" "2015-09-30" "coyi" "2015-09-27" 39 7 0.385297241068573 0.1026 "frozen" "female" 2.53084184878867
"diku_D" 7010213 0.195160647410325 "2015-11-13" 1 "IFNg_SL1344" "2015-10-15" "diku" "2015-10-12" 32 6 0.193993747097176 0.06893 "frozen" "female" 3.76757978572267
"eipl_D" 16923025 0.519579168206653 "2015-08-04" 1 "IFNg_SL1344" "2015-06-30" "eipl" "2015-06-27" 38 7 0.13988117725185 0.06944 "frozen" "female" 1.93265133137896
"eiwy_D" 9807860 0.4044462839792 "2015-12-02" 1 "IFNg_SL1344" "2015-10-23" "eiwy" "2015-10-20" 43 7 0.302453957654079 0.090776 "frozen" "female" 1.75613407276057
"eofe_D" 25687477 0.645688915687569 "2014-12-12" 1 "IFNg_SL1344" "2014-11-01" "eofe" "2014-10-29" 44 8 0.142640514763618 0.080655 "alive" "female" 2.63757415618575
"fafq_D" 4600004 0.414696506879133 "2015-10-14" 1 "IFNg_SL1344" "2015-09-16" "fafq" "2015-09-13" 31 5 0.169076649746682 0.087027 "frozen" "male" 1.92922628617947
"febc_D" 31712153 0.430018180628596 "2015-08-04" 2 "IFNg_SL1344" "2015-07-06" "febc" "2015-07-03" 32 6 0.400115313238623 0.106825 "frozen" "female" 1.45678159498552
"fikt_D" 12120655 0.43249826902993 "2015-11-05" 3 "IFNg_SL1344" "2015-10-03" "fikt" "2015-09-30" 36 5 0.301717390025625 0.090902 "frozen" "female" 1.68462488275924
"guss_D" 13876015 0.433140013544195 "2015-11-18" 1 "IFNg_SL1344" "2015-10-16" "guss" "2015-10-13" 36 5 0.356285810649081 0.080022 "frozen" "female" 2.03502060233313
"hayt_D" 20369487 0.628709157220539 "2015-10-09" 1 "IFNg_SL1344" "2015-09-12" "hayt" "2015-09-09" 30 6 0.309818415540119 0.102584 "frozen" "male" 1.92932077333801
"hehd_D" 3964874 0.192481936100666 "2015-11-05" 2 "IFNg_SL1344" "2015-09-30" "hehd" "2015-09-27" 39 8 0.248352910302544 0.097714 "frozen" "female" 3.58283704670194
"heja_D" 17033933 0.533817355964094 "2015-08-12" 1 "IFNg_SL1344" "2015-06-29" "heja" "2015-06-26" 47 9 0.114173433891483 0.080099 "frozen" "male" 1.39981868677139
"hiaf_D" 7907908 0.219613703665739 "2015-07-22" 2 "IFNg_SL1344" "2015-06-18" "hiaf" "2015-06-15" 37 7 0.087407665795477 0.045106 "frozen" "male" 3.71903790659248
"iill_D" 12728643 0.50291828405585 "2015-11-05" 1 "IFNg_SL1344" "2015-09-27" "iill" "2015-09-24" 42 7 0.276029290754818 0.08604 "frozen" "male" 1.89858291866774
"kuxp_D" 12493577 0.438787848094188 "2015-07-22" 1 "IFNg_SL1344" "2015-06-21" "kuxp" "2015-06-18" 34 6 0.188645298218458 0.068606 "frozen" "male" 1.68808634956033
"nukw_D" 17850501 0.645243508607759 "2015-10-27" 1 "IFNg_SL1344" "2015-09-23" "nukw" "2015-09-20" 46 7 0.234492649167015 0.104232 "frozen" "female" 1.26760229695547
"oapg_D" 13706648 0.478080924747425 "2015-07-31" 5 "IFNg_SL1344" "2015-06-21" "oapg" "2015-06-18" 43 7 0.211976949581975 0.064611 "frozen" "female" 1.81448916921816
"oevr_D" 10181246 0.387041137247721 "2015-10-09" 3 "IFNg_SL1344" "2015-08-30" "oevr" "2015-08-27" 48 8 0.171515043883457 0.102247 "frozen" "male" 2.29579995766913
"pamv_D" 12568782 0.530024876051982 "2015-07-31" 3 "IFNg_SL1344" "2015-07-01" "pamv" "2015-06-28" 33 6 0.226667604999457 0.083218 "frozen" "male" 3.37640181895908
"pelm_D" 12885093 0.460903123941983 "2015-08-12" 3 "IFNg_SL1344" "2015-07-03" "pelm" "2015-06-30" 43 8 0.141307760306748 0.056814 "frozen" "female" 1.7075403478253
"podx_D" 15095740 0.498507670978346 "2015-07-22" 1 "IFNg_SL1344" "2015-06-26" "podx" "2015-06-23" 29 5 0.278886251713605 0.076971 "frozen" "female" 2.50667597769246
"qolg_D" 14636966 0.508384278972431 "2015-11-13" 3 "IFNg_SL1344" "2015-10-10" "qolg" "2015-10-07" 29 6 0.345044067138392 0.079032 "frozen" "male" 1.67190803527389
"sojd_D" 9616378 0.359051864041036 "2015-10-14" 3 "IFNg_SL1344" "2015-09-11" "sojd" "2015-09-08" 36 7 0.218259508709228 0.076186 "frozen" "female" 3.22064049478649
"vass_D" 21993598 0.572955943483222 "2015-04-24" 1 "IFNg_SL1344" "2015-03-29" "vass" "2015-03-26" 29 7 0.193210532921721 0.07022 "frozen" "female" 1.69032752731357
"xugn_D" 17505168 0.532266309510938 "2015-07-22" 1 "IFNg_SL1344" "2015-06-11" "xugn" "2015-06-08" 44 8 0.128462362140362 0.07249 "frozen" "male" 3.48786721342933
"zaui_D" 7297175 0.250887074437376 "2015-11-18" 3 "IFNg_SL1344" "2015-10-11" "zaui" "2015-10-08" 41 9 0.219835807676744 0.073612 "frozen" "female" 3.6017018666313
"uaqe_D" 10294277 0.356799693383731 "2015-10-27" 1 "IFNg_SL1344" "2015-10-01" "uaqe" "2015-09-28" "NA" 7 0.218277844876963 0.090727 "frozen" "female" 4.59939570834911
"qaqx_D" 9562711 0.355390033860629 "2015-11-18" 1 "IFNg_SL1344" "2015-10-01" "qaqx" "2015-09-28" "NA" 11 0.30661352138567 0.072576 "frozen" "female" 2.78684999474948
startTime <- Sys.time()
library(devtools)
devtools::install_gitlab("grp-zaugg/grn", subdir = "src/GRN", host = "git.embl.de")
library(GRN)
rootDir = "/g/scb2/zaugg/carnold/Projects/GRN_pipeline/example/stable/"
# Genome assembly shortcut. Either hg19, hg38 or mm10. Both ATAC and RNA must have the same genome assembly
genomeAssembly = "hg38"
#############
# READ DATA #
#############
# Raw counts ATAC for the consensus peaks. All rows will be used for the pipeline, so this should constitute your consensus peaks.
# No row filtering is necessary, 0 rows will be filtered in the pipeline
# One of the columns must be an ID column called "peakID", in the usual format "chr:start-end". All other columns are the counts for the samples
file_input_ATAC = paste0(rootDir, "input/countsATAC.75k.tsv.gz")
# Raw counts RNA
# No row filtering is necessary, 0 rows will be filtered in the pipeline
# One of the columns must be an ID column called "ENSEMBL", denoting ENSEMBL IDs.
file_input_RNA = paste0(rootDir, "input/countsRNA.sampled.tsv.gz")
countsRNA.df = read_tsv(file_input_RNA, col_types = cols())
countsATAC.df = read_tsv(file_input_ATAC, col_types = cols())
# Normalization methods for RNA and ATAC. The default for RNA-Seq is quantile normalization, for ATAC-Seq a "regular" DESeq size factor normalization
# Possible values are: "quantile", "DESeq_sizeFactor", and "none". If "none" is used, you have to normalize counts beforehand on your own. However, make
# sure that the normalization is not sensitive to outliers. The classification can be impacted by RNA-Seq outlier counts, for example, which is why we found quantile normalization to work well for the datasets we worked with so far.
normMethodRNA = "quantile"
normMethodATAC = "DESeq_sizeFactor"
idColumn_ATAC = "peakID"
idColumn_RNA = "ENSEMBL"
file_metadata = paste0(rootDir, "input/metadata.sampled.tsv")
metadata.all = read_tsv(file_metadata)
# Root output folder (it does not matter whether you put a trailing slash or not)
dir_output = paste0(rootDir, "output/")
nCores = 2
# Arbitrary list with information and metadata that is stored within the GRN object
objectMetadata.l = list(name = paste0("Macrophages_infected_primed"),
file_atac = file_input_ATAC,
file_rna = file_input_RNA,
genomeAssembly = genomeAssembly,
file_metadata = file_metadata
)
#######################
# AUTOMATIC FROM HERE #
#######################
library(checkmate)
assertDirectoryExists(rootDir)
assertList(objectMetadata.l)
assertDirectory(dir_output)
assertChoice(genomeAssembly, c("mm9","mm10","hg19","hg38"))
assertDataFrame(countsATAC.df, min.rows = 1000, min.cols = 4)
assertDataFrame(countsRNA.df, min.rows = 1000, min.cols = 4)
#########################
# INITIALIZE GRN OBJECT #
#########################
GRN = initializeGRN(objectMetadata = objectMetadata.l,
outputFolder = dir_output,
genomeAssembly = genomeAssembly)
GRN = addData(GRN,
countsATAC.df, normalization_atac = normMethodATAC, idColumn_ATAC = idColumn_ATAC,
countsRNA.df, normalization_rna = normMethodRNA, idColumn_RNA = idColumn_RNA,
metadata = metadata.all,
forceRerun = TRUE)
######################################
# PCA plots and metadata correlation #
######################################
plotPCA_all(GRN, type = c("RNA", "ATAC"), outputFolder = NULL, forceRerun = TRUE)
###########################################
# Parse TFBS folder and Gene - TF mapping #
###########################################
# Should the pipeline be run for only a subset of TFs or all? The special keyword "all" will use all TF that are found in the HOCOMOCO folder; however, if only a subset should be considered, specify the subset here with c() and the TF names, as shown below
TFs = "all"
nTFMax = NULL
nTFMax = 50
# Base directory of the folder with the TFBS predictions.
# The TFBS predictions are expected as *.bed files as well as a translation table with the name translationTable.csv
# We provide all files here: https://www.embl.de/download/zaugg/GRN/hg19_hg38_mm10_PWMScan.zip (7.5 GB)
# Make sure they are in the same genome assembly as the ATAC data
hocomocoVersion = if_else(genomeAssembly == "hg38", "v11", "v10")
motifFolder = paste0("/g/scb2/zaugg/zaugg_shared/annotations/TFBS/", genomeAssembly, "/PWMScan_HOCOMOCO", hocomocoVersion)
GRN = addTFBS(GRN, motifFolder = motifFolder, TFs = TFs, nTFMax = nTFMax, filesTFBSPattern = "_TFBS", fileEnding = ".bed", forceRerun = TRUE)
#####################
# PEAKS TFBS OVERLAP #
######################
GRN = overlapPeaksAndTFBS(GRN, nCores = nCores, forceRerun = TRUE)
####################
# FILTER ATAC DATA #
####################
# Chromosomes to include for ATAC peaks and peak-gene associations. This should be a vector of chromosome names
chrToKeep_peaks = c(paste0("chr", 1:22), "chrX", "chrY")
GRN = filterData(GRN, minNormalizedMeanATAC = 5, minNormalizedMeanRNA = 1, chrToKeep_peaks = chrToKeep_peaks, maxPeakSize = 10000, forceRerun = TRUE)
#######################
# TF-PEAK CONNECTIONS #
#######################
useGCCorrection = FALSE
GRN = addConnections_TF_peak(GRN, add_TFActivity = TRUE, plotDiagnosticPlots = TRUE, remove_negCor_TFActivity = TRUE,
corMethod = "pearson",
useGCCorrection = useGCCorrection, percBackground_size = 75, percBackground_resample = TRUE,
forceRerun = TRUE)
##########################
# RNA-SEQ CLASSIFICATION #
##########################
# Optional step: Runs automatically for both expression and TF activitys
GRN = AR_classification_wrapper(GRN, significanceThreshold_Wilcoxon = 0.05,
plot_minNoTFBS_heatmap = 100, plotDiagnosticPlots = TRUE,
deleteIntermediateData = TRUE,
forceRerun = TRUE)
# Intermediate save
saveRDS(GRN, paste0(dir_output, "/GRN.rds"))
#########################
# PEAK_GENE CONNECTIONS #
#########################
# BED file with TAD domains. If no TSD domains are available, leave empty or set to NULL
file_input_TADs = ""
# Type of overlap for gene: Either "TSS" or "full". If "full", any extended peak-gene overlap is taken, regardless of where in the gene it occurs
# If set to "TSS", only overlap of extended peaks with the TSS of the gene (assumed to be at the 5' position) is considered
# Until 09.09.20, "full" was being used by default, this parameter did not exist before
overlapTypeGene = "TSS"
# Only relevant when no TAD domains are provided; if TADs are provided, this parameter can be ignored.
# Specifies the neighborhood size in bp (for both upstream and downstream of the peak) for peaks to find genes in vicinity and associate/correlate genes with peaks
# Default value 250000, here set to a smaller value to decrease running time
promoterRange = 10000
GRN = addConnections_peak_gene(GRN,
overlapTypeGene = overlapTypeGene,
corMethod = "pearson",
promoterRange = promoterRange, file_TADs = NULL,
nCores = nCores, plotDiagnosticPlots = TRUE,
forceRerun = TRUE)
######################
# FILTER CONNECTIONS #
######################
# Save a filtered GRN with genes with loose thresholds. The user can hen read it back it and filter more stringently.
# Which gene types to keep for peak-gene correlations when connecting TF-peaks and peak-genes?
# Set to "all" to keep all gene types, or a subset of gene types as defined by Gencode. Default is c("protein_coding", "lincRNA")
geneTypes_keep = c("protein_coding", "lincRNA")
TF_peak_FDR_selectViaCorBins = TRUE
for (TF_peak.connectionTypeCur in c("expression", "TFActivity")) {
TF_peak_fdr_cur = 0.3
peak_gene_fdr_cur = 0.3
GRN = filterGRNAndConnectGenes(GRN, TF_peak.fdr.threshold = TF_peak_fdr_cur, TF_peak.connectionTypes = TF_peak.connectionTypeCur,
peak_gene.p_raw.threshold = NULL,
peak_gene.fdr.threshold = peak_gene_fdr_cur,
gene.types = geneTypes_keep,
allowMissingTFs = FALSE, allowMissingGenes = TRUE, add_TF_gene_correlation = TRUE,
peak_gene.r_range = c(0,1), TF_peak_FDR_selectViaCorBins = TF_peak_FDR_selectViaCorBins
)
# Once the filter function has been called, they can be retrieved with a helper function
GRN_connections.all = getGRNConnections(GRN, permutation = 0, type = "all.filtered", include_TF_gene_correlations = TRUE)
filenameCur = paste0(GRN@config$directories$outputRoot,
"/GRN_TF-PeakFDR", TF_peak_fdr_cur,
"_", TF_peak.connectionTypeCur,
"_peakGeneFDR", peak_gene_fdr_cur, "_allowMissingGenes_perm0.tsv.gz")
write_tsv(GRN_connections.all, filenameCur)
TF_peak_fdr_cur = 0.2
peak_gene_fdr_cur = 0.1
for (allowMissingGenesCur in c(TRUE, FALSE)) {
GRN = filterGRNAndConnectGenes(GRN, TF_peak.fdr.threshold = TF_peak_fdr_cur, TF_peak.connectionTypes = TF_peak.connectionTypeCur,
peak_gene.p_raw.threshold = NULL,
peak_gene.fdr.threshold = peak_gene_fdr_cur,
gene.types = geneTypes_keep,
allowMissingTFs = FALSE, allowMissingGenes = allowMissingGenesCur, add_TF_gene_correlation = TRUE,
peak_gene.r_range = c(0,1))
# Once the filter function has been called, they can be retrieved with a helper function
GRN_connections.all = getGRNConnections(GRN, permutation = 0, type = "all.filtered", include_TF_gene_correlations = TRUE)
suffix = if_else(allowMissingGenesCur, "_allowMissingGenes", "_notAllowMissingGenes")