Skip to content
Snippets Groups Projects
Verified Commit 6aaa3720 authored by Renato Alves's avatar Renato Alves :seedling:
Browse files

Add solution to exercise 5 for day 3

parent b66e163a
No related branches found
No related tags found
No related merge requests found
# PathfindR tutorial and exercises
# 10.2020
# v.0.1
# Matt Rogon EMBL CBNA
install.packages("pathfindR")
library(pathfindR)
openVignette(pathfindR)
# Let's explore the package and functions
# https://github.com/egeulgen/pathfindR
# paper
# https://www.frontiersin.org/articles/10.3389/fgene.2019.00858/full
# website
# https://egeulgen.github.io/pathfindR/
# load sample data into a dataframe
input_df <- pathfindR.data::RA_input
# have a look at the formatting
View(input_df)
# all you need are 3 columns to execute the pathfindR main wrapper
output_df <- run_pathfindR(input_df)
# visualise the output_df results
#--------------------------------------------------------------------------------------
# Exercise 1:
# Load the following file:
# pathfinder_input_exercise_clusters.tsv
#
# prepare the dataframe for pathfindeR
# you will need 3 columns:
# Gene name
# logFC
# adj P Val
# Run pathfindR wrapper function on this dataset
# visualise
#--------------------------------------------------------------------------------------
# Exercise 2
# Create a custom script that will execute pathfindR with the following settings:
# Benjamini-Hochberg FDR
# adjusted p_val_threshold 0.01
# gene sets: GO-BP
# minimum gene set size 5, maximum 100
# PIN protein interaction reference network: IntAct
#
# generate a fuzzy-clustered result network of terms
# term gene heatmap
# term gene graph
# upset plot
#--------------------------------------------------------------------------------------
# Exercise 3
# Use the original file pathfinder_input_exercise_clusters.txt and compare clusters 1 and 2 with pathfindR
# using combine_pathfindR_results function
# produce visualisations
#--------------------------------------------------------------------------------------
# load the prepared file (columns must be numeric)
pathfinder_input_exercise_noClusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_noClusters.txt")
pathfinder_input_exercise_noClusters$adj.P.Val <- as.numeric(pathfinder_input_exercise_noClusters$adj.P.Val)
pathfinder_input_exercise_noClusters$logFC <- as.numeric(pathfinder_input_exercise_noClusters$logFC)
# make sure rows are complete
pathfinder_input_exercise_noClusters <- pathfinder_input_exercise_noClusters[complete.cases(pathfinder_input_exercise_noClusters), ]
# run it
output_noClust <- run_pathfindR(pathfinder_input_exercise_noClusters)
# define threshold
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, p_val_threshold = 0.01)
# define gene sets
# output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, gene_sets = "GO-MF")
## Including more terms for enrichment analysis
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters,
gene_sets = "GO-MF",
min_gset_size = 5,
max_gset_size = 500)
# By default, run_pathfindR() adjusts the enrichment p values via the “bonferroni” method
# and filters the enriched terms by adjusted-p value < 0.05.
# To change this adjustment method and the threshold, set adj_method and enrichment_threshold,
# respectively, here we have benjamini hochberg fdr with 0.01 thresholding:
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters,
adj_method = "fdr",
enrichment_threshold = 0.01)
# Protein-protein Interaction Network and Background
# For the active subnetwork search process, a protein-protein interaction network (PIN) is used. run_pathfindR() maps the input genes onto this PIN and identifies active subnetworks which are then be used for enrichment analyses. To change the default PIN (“Biogrid”), use the pin_name_path argument:
# The pin_name_path argument can be one of “Biogrid”, “STRING”, “GeneMania”, “IntAct”, “KEGG”, “mmu_STRING” or it can be the path to a custom PIN file provided by the user.
# NOTE: the PIN is also used for generating the background genes (in this case, all unique genes in the PIN) during hypergeometric-distribution-based tests in enrichment analyses. Therefore, a large PIN will generally result in better results.
# to use an external PIN of your choice
# output_df <- run_pathfindR(input_df, pin_name_path = "/path/to/myPIN.sif")
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, pin_name_path = "IntAct")
# plot the results with various graphs
term_gene_heatmap(result_df = output_df, genes_df = pathfinder_input_exercise_noClusters)
term_gene_graph(result_df = output_df, use_description = TRUE)
UpSet_plot(result_df = output_df, genes_df = RA_input)
# lets cluster the output enriched terms - fuzzy clustering
RA_clustered_fuzzy <- cluster_enriched_terms(output_df, method = "fuzzy")
#This function first calculates the pairwise kappa statistics between the terms in the data frame obtained from run_pathfindR(). It then clusters the terms and partitions them into relevant clusters. As can be seen in the diagram, there are two options for clustering: (1) hierarchical clustering and (2) fuzzy clustering.
# https://github.com/egeulgen/pathfindR/wiki/Clustering%20Documentation
# run Comparison of 2 pathfindR Results
The function combine_pathfindR_results() allows combination of two pathfindR active-subnetwork-oriented enrichment analysis results for investigating common and distinct terms between the groups. Below is an example for comparing results using two different rheumatoid arthritis-related data sets(RA_output and RA_comparison_output).
combined_df <- combine_pathfindR_results(result_A = RA_output,
result_B = RA_comparison_output,
plot_common = FALSE)
pathfinder_input_exercise_clusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_clusters.txt")
non human
https://cran.r-project.org/web/packages/pathfindR/vignettes/non_hs_analysis.html
# Running pathfindR manually to control the environment and statistics
# https://egeulgen.github.io/pathfindR/articles/manual_execution.html
#---------------------------------------------------------------------------------------
#1
#Process input data
# After checking that the data frame complies with the requirements,
# input_processing() filters the input so that genes with p values larger than
# p_val_threshold are excluded.
# Next, gene symbols that are not in the PIN are identified and excluded.
# For human genes, if aliases of these missing gene symbols are found in the PIN,
# these symbols are converted to the corresponding aliases (controlled by the argument
# convert2alias). This step is performed to best map the input data onto the PIN.
#---------------------------------------------------------------------------------------
RA_processed <- input_processing(input = RA_input, # the input: in this case, differential expression results
p_val_threshold = 0.05, # p value threshold to filter significant genes
pin_name_path = "Biogrid", # the name of the PIN to use for active subnetwork search
convert2alias = TRUE) # boolean indicating whether or not to convert missing symbols to alias symbols in the PIN
#---------------------------------------------------------------------------------------
# 2
# Obtain Gene Set Data necessary for enrichment analyses using fetch_gene_set():
# using "BioCarta" as our gene sets for enrichment
# available gene sets are
# “KEGG”, “Reactome”, “BioCarta”, “GO-All”, “GO-BP”, “GO-CC” and “GO-MF”.
# If the user prefers to use another gene set source, the gene_sets argument should be
# set to "Custom" and the custom gene sets (list) and the custom gene set descriptions
# (named vector) should be supplied via the arguments custom_genes and custom_descriptions,
# respectively. See ?fetch_gene_set for more details.
#---------------------------------------------------------------------------------------
biocarta_list <- fetch_gene_set(gene_sets = "BioCarta",
min_gset_size = 10,
max_gset_size = 300)
biocarta_gsets <- biocarta_list[[1]]
biocarta_descriptions <- biocarta_list[[2]]
#---------------------------------------------------------------------------------------
# 3. Active Subnetwork Search and Enrichment Analyses
# As outlined in the vignette Introduction to pathfindR, run_pathfindR() initially identifies and filters active subnetworks, then performs enrichment analyses on these subnetworks and summarize the results.
# To perform these steps manually, we utilize the function active_snw_search() for identifying and filtering active subnetworks and the function enrichment_analyses() for obtaining enriched terms using these subnetworks. Because the active subnetwork search algorithms are stochastic, we suggest iterating these subnetwork identification and enrichment steps multiple times (especially for “SA”)1:
#---------------------------------------------------------------------------------------
Active Subnetwork Search Method
# Currently, there are three algorithms implemented in pathfindR for active subnetwork search: Greedy Algorithm (default, based on Ideker et al. 2), Simulated Annealing Algorithm (based on Ideker et al. 3) and Genetic Algorithm (based on Ozisik et al. 4). For a detailed discussion on which algorithm to use see this wiki entry
# for simulated annealing:
#output_df <- run_pathfindR(input_df, search_method = "SA")
# for genetic algorithm:
# output_df <- run_pathfindR(input_df, search_method = "GA")
#
n_iter <- 10 ## number of iterations
combined_res <- NULL ## to store the result of each iteration
for (i in 1:n_iter) {
###### Active Subnetwork Search
snws_file <- paste0("active_snws_", i) # Name of output file
active_snws <- active_snw_search(input_for_search = RA_processed,
pin_name_path = "Biogrid",
snws_file = snws_file,
score_quan_thr = 0.8, # you may tweak these arguments for optimal filtering of subnetworks
sig_gene_thr = 0.02, # you may tweak these arguments for optimal filtering of subnetworks
search_method = "GR")
###### Enrichment Analyses
current_res <- enrichment_analyses(snws = active_snws,
sig_genes_vec = RA_processed$GENE,
pin_name_path = "Biogrid",
genes_by_term = biocarta_gsets,
term_descriptions = biocarta_descriptions,
adj_method = "bonferroni",
enrichment_threshold = 0.05,
list_active_snw_genes = TRUE) # listing the non-input active snw genes in output
###### Combine results via `rbind`
combined_res <- rbind(combined_res, current_res)
}
custom_result <- run_pathfindR(input_df,
gene_sets = "Custom",
custom_genes = custom_genes,
custom_descriptions = custom_descriptions,
max_gset_size = Inf, # DO NOT LIMIT GENE SET SIZE
iterations = 1, # for demo, setting number of iterations to 1
output_dir = "misc/v1.4/CREB_MYC")
# since we are using pathfindR we will not need pcsf, however have a look at it
# source("http://bioconductor.org/biocLite.R")
#biocLite("topGO")
#install.packages("devtools", dependencies=TRUE)
#devtools::install_github("IOR-Bioinformatics/PCSF", repos=BiocInstaller::biocinstallRepos(),
dependencies=TRUE, type="source", force=TRUE)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment