Exercise_5_pathfindR_tutorial.txt

# PathfindR tutorial and exercises
# 10.2020
# v.0.1
# Matt Rogon EMBL CBNA

install.packages("pathfindR")
library(pathfindR)

openVignette(pathfindR)
# Let's explore the package and functions
# https://github.com/egeulgen/pathfindR
# paper
# https://www.frontiersin.org/articles/10.3389/fgene.2019.00858/full
# website
# https://egeulgen.github.io/pathfindR/


# load sample data into a dataframe
input_df <- pathfindR.data::RA_input


# have a look at the formatting
View(input_df)

# all you need are 3 columns to execute the pathfindR main wrapper
output_df <- run_pathfindR(input_df)

# visualise the output_df results


#--------------------------------------------------------------------------------------
# Exercise 1:
# Load the following file: 
# pathfinder_input_exercise_clusters.tsv
#
# prepare the dataframe for pathfindeR
# you will need 3 columns:
# Gene name
# logFC
# adj P Val
# Run pathfindR wrapper function on this dataset
# visualise

#--------------------------------------------------------------------------------------

# Exercise 2
# Create a custom script that will execute pathfindR with the following settings:
# Benjamini-Hochberg FDR
# adjusted p_val_threshold 0.01
# gene sets: GO-BP
# minimum gene set size 5, maximum 100
# PIN protein interaction reference network: IntAct
# 
# generate a fuzzy-clustered result network of terms
# term gene heatmap
# term gene graph
# upset plot

#--------------------------------------------------------------------------------------

# Exercise 3
# Use the original file pathfinder_input_exercise_clusters.txt and compare clusters 1 and 2 with pathfindR
# using combine_pathfindR_results function
# produce visualisations

#--------------------------------------------------------------------------------------


# load the prepared file (columns must be numeric)
pathfinder_input_exercise_noClusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_noClusters.txt")
pathfinder_input_exercise_noClusters$adj.P.Val <- as.numeric(pathfinder_input_exercise_noClusters$adj.P.Val)
pathfinder_input_exercise_noClusters$logFC <- as.numeric(pathfinder_input_exercise_noClusters$logFC)

  
# make sure rows are complete
pathfinder_input_exercise_noClusters <- pathfinder_input_exercise_noClusters[complete.cases(pathfinder_input_exercise_noClusters), ]

# run it
output_noClust <- run_pathfindR(pathfinder_input_exercise_noClusters)

# define threshold
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, p_val_threshold = 0.01)

# define gene sets
# output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, gene_sets = "GO-MF")

## Including more terms for enrichment analysis
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, 
                           gene_sets = "GO-MF",
                           min_gset_size = 5,
                           max_gset_size = 500)

# By default, run_pathfindR() adjusts the enrichment p values via the “bonferroni” method 
# and filters the enriched terms by adjusted-p value < 0.05. 
# To change this adjustment method and the threshold, set adj_method and enrichment_threshold,
# respectively, here we have benjamini hochberg fdr with 0.01 thresholding:
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, 
                           adj_method = "fdr",
                           enrichment_threshold = 0.01)


# Protein-protein Interaction Network and Background
# For the active subnetwork search process, a protein-protein interaction network (PIN) is used. run_pathfindR() maps the input genes onto this PIN and identifies active subnetworks which are then be used for enrichment analyses. To change the default PIN (“Biogrid”), use the pin_name_path argument:
# The pin_name_path argument can be one of “Biogrid”, “STRING”, “GeneMania”, “IntAct”, “KEGG”, “mmu_STRING” or it can be the path to a custom PIN file provided by the user.
# NOTE: the PIN is also used for generating the background genes (in this case, all unique genes in the PIN) during hypergeometric-distribution-based tests in enrichment analyses. Therefore, a large PIN will generally result in better results.
# to use an external PIN of your choice
# output_df <- run_pathfindR(input_df, pin_name_path = "/path/to/myPIN.sif")

output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, pin_name_path = "IntAct")


# plot the results with various graphs
term_gene_heatmap(result_df = output_df, genes_df = pathfinder_input_exercise_noClusters)
term_gene_graph(result_df = output_df, use_description = TRUE)
UpSet_plot(result_df = output_df, genes_df = RA_input)


# lets cluster the output enriched terms - fuzzy clustering 
RA_clustered_fuzzy <- cluster_enriched_terms(output_df, method = "fuzzy")

#This function first calculates the pairwise kappa statistics between the terms in the data frame obtained from run_pathfindR(). It then clusters the terms and partitions them into relevant clusters. As can be seen in the diagram, there are two options for clustering: (1) hierarchical clustering and (2) fuzzy clustering.
# https://github.com/egeulgen/pathfindR/wiki/Clustering%20Documentation


# run Comparison of 2 pathfindR Results
The function combine_pathfindR_results() allows combination of two pathfindR active-subnetwork-oriented enrichment analysis results for investigating common and distinct terms between the groups. Below is an example for comparing results using two different rheumatoid arthritis-related data sets(RA_output and RA_comparison_output).

combined_df <- combine_pathfindR_results(result_A = RA_output, 
                                         result_B = RA_comparison_output, 
                                         plot_common = FALSE)
                                         
                                         
pathfinder_input_exercise_clusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_clusters.txt")


non human
https://cran.r-project.org/web/packages/pathfindR/vignettes/non_hs_analysis.html


# Running pathfindR manually to control the environment and statistics

# https://egeulgen.github.io/pathfindR/articles/manual_execution.html

#---------------------------------------------------------------------------------------
#1
#Process input data
# After checking that the data frame complies with the requirements, 
# input_processing() filters the input so that genes with p values larger than 
# p_val_threshold are excluded. 
# Next, gene symbols that are not in the PIN are identified and excluded. 
# For human genes, if aliases of these missing gene symbols are found in the PIN, 
# these symbols are converted to the corresponding aliases (controlled by the argument 
# convert2alias). This step is performed to best map the input data onto the PIN.
#---------------------------------------------------------------------------------------

RA_processed <- input_processing(input = RA_input, # the input: in this case, differential expression results
                                 p_val_threshold = 0.05, # p value threshold to filter significant genes
                                 pin_name_path  = "Biogrid", # the name of the PIN to use for active subnetwork search
                                 convert2alias = TRUE) # boolean indicating whether or not to convert missing symbols to alias symbols in the PIN


#---------------------------------------------------------------------------------------
# 2
# Obtain Gene Set Data necessary for enrichment analyses using fetch_gene_set():
# using "BioCarta" as our gene sets for enrichment
# available gene sets are 
# “KEGG”, “Reactome”, “BioCarta”, “GO-All”, “GO-BP”, “GO-CC” and “GO-MF”. 
# If the user prefers to use another gene set source, the gene_sets argument should be
# set to "Custom" and the custom gene sets (list) and the custom gene set descriptions 
# (named vector) should be supplied via the arguments custom_genes and custom_descriptions, 
# respectively. See ?fetch_gene_set for more details.
#---------------------------------------------------------------------------------------

biocarta_list <- fetch_gene_set(gene_sets = "BioCarta",
                                min_gset_size = 10,
                                max_gset_size = 300)
biocarta_gsets <- biocarta_list[[1]]
biocarta_descriptions <- biocarta_list[[2]]


#---------------------------------------------------------------------------------------
# 3. Active Subnetwork Search and Enrichment Analyses
# As outlined in the vignette Introduction to pathfindR, run_pathfindR() initially identifies and filters active subnetworks, then performs enrichment analyses on these subnetworks and summarize the results.
# To perform these steps manually, we utilize the function active_snw_search() for identifying and filtering active subnetworks and the function enrichment_analyses() for obtaining enriched terms using these subnetworks. Because the active subnetwork search algorithms are stochastic, we suggest iterating these subnetwork identification and enrichment steps multiple times (especially for “SA”)1:
#---------------------------------------------------------------------------------------


Active Subnetwork Search Method

# Currently, there are three algorithms implemented in pathfindR for active subnetwork search: Greedy Algorithm (default, based on Ideker et al. 2), Simulated Annealing Algorithm (based on Ideker et al. 3) and Genetic Algorithm (based on Ozisik et al. 4). For a detailed discussion on which algorithm to use see this wiki entry
# for simulated annealing:
#output_df <- run_pathfindR(input_df, search_method = "SA")
# for genetic algorithm:
# output_df <- run_pathfindR(input_df, search_method = "GA")
#


n_iter <- 10 ## number of iterations
combined_res <- NULL ## to store the result of each iteration

for (i in 1:n_iter) {
  
  ###### Active Subnetwork Search
  snws_file <- paste0("active_snws_", i) # Name of output file
  active_snws <- active_snw_search(input_for_search = RA_processed, 
                                   pin_name_path = "Biogrid", 
                                   snws_file = snws_file,
                                   score_quan_thr = 0.8, # you may tweak these arguments for optimal filtering of subnetworks
                                   sig_gene_thr = 0.02, # you may tweak these arguments for optimal filtering of subnetworks
                                   search_method = "GR")
  
  ###### Enrichment Analyses
  current_res <- enrichment_analyses(snws = active_snws,
                                     sig_genes_vec = RA_processed$GENE,
                                     pin_name_path = "Biogrid", 
                                     genes_by_term = biocarta_gsets,
                                     term_descriptions = biocarta_descriptions,
                                     adj_method = "bonferroni",
                                     enrichment_threshold = 0.05,
                                     list_active_snw_genes = TRUE) # listing the non-input active snw genes in output
  
  ###### Combine results via `rbind`
  combined_res <- rbind(combined_res, current_res)
}


custom_result <- run_pathfindR(input_df,
                               gene_sets = "Custom",
                               custom_genes = custom_genes,
                               custom_descriptions = custom_descriptions,
                               max_gset_size = Inf, # DO NOT LIMIT GENE SET SIZE
                               iterations = 1, # for demo, setting number of iterations to 1
                               output_dir = "misc/v1.4/CREB_MYC")


# since we are using pathfindR we will not need pcsf, however have a look at it
# source("http://bioconductor.org/biocLite.R")
#biocLite("topGO")
#install.packages("devtools", dependencies=TRUE)
#devtools::install_github("IOR-Bioinformatics/PCSF", repos=BiocInstaller::biocinstallRepos(),
                         dependencies=TRUE, type="source", force=TRUE)