# PathfindR tutorial and exercises # 10.2020 # v.0.1 # Matt Rogon EMBL CBNA install.packages("pathfindR") library(pathfindR) openVignette(pathfindR) # Let's explore the package and functions # https://github.com/egeulgen/pathfindR # paper # https://www.frontiersin.org/articles/10.3389/fgene.2019.00858/full # website # https://egeulgen.github.io/pathfindR/ # load sample data into a dataframe input_df <- pathfindR.data::RA_input # have a look at the formatting View(input_df) # all you need are 3 columns to execute the pathfindR main wrapper output_df <- run_pathfindR(input_df) # visualise the output_df results #-------------------------------------------------------------------------------------- # Exercise 1: # Load the following file: # pathfinder_input_exercise_clusters.tsv # # prepare the dataframe for pathfindeR # you will need 3 columns: # Gene name # logFC # adj P Val # Run pathfindR wrapper function on this dataset # visualise #-------------------------------------------------------------------------------------- # Exercise 2 # Create a custom script that will execute pathfindR with the following settings: # Benjamini-Hochberg FDR # adjusted p_val_threshold 0.01 # gene sets: GO-BP # minimum gene set size 5, maximum 100 # PIN protein interaction reference network: IntAct # # generate a fuzzy-clustered result network of terms # term gene heatmap # term gene graph # upset plot #-------------------------------------------------------------------------------------- # Exercise 3 # Use the original file pathfinder_input_exercise_clusters.txt and compare clusters 1 and 2 with pathfindR # using combine_pathfindR_results function # produce visualisations #-------------------------------------------------------------------------------------- # load the prepared file (columns must be numeric) pathfinder_input_exercise_noClusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_noClusters.txt") pathfinder_input_exercise_noClusters$adj.P.Val <- as.numeric(pathfinder_input_exercise_noClusters$adj.P.Val) pathfinder_input_exercise_noClusters$logFC <- as.numeric(pathfinder_input_exercise_noClusters$logFC) # make sure rows are complete pathfinder_input_exercise_noClusters <- pathfinder_input_exercise_noClusters[complete.cases(pathfinder_input_exercise_noClusters), ] # run it output_noClust <- run_pathfindR(pathfinder_input_exercise_noClusters) # define threshold output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, p_val_threshold = 0.01) # define gene sets # output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, gene_sets = "GO-MF") ## Including more terms for enrichment analysis output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, gene_sets = "GO-MF", min_gset_size = 5, max_gset_size = 500) # By default, run_pathfindR() adjusts the enrichment p values via the “bonferroni” method # and filters the enriched terms by adjusted-p value < 0.05. # To change this adjustment method and the threshold, set adj_method and enrichment_threshold, # respectively, here we have benjamini hochberg fdr with 0.01 thresholding: output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, adj_method = "fdr", enrichment_threshold = 0.01) # Protein-protein Interaction Network and Background # For the active subnetwork search process, a protein-protein interaction network (PIN) is used. run_pathfindR() maps the input genes onto this PIN and identifies active subnetworks which are then be used for enrichment analyses. To change the default PIN (“Biogrid”), use the pin_name_path argument: # The pin_name_path argument can be one of “Biogrid”, “STRING”, “GeneMania”, “IntAct”, “KEGG”, “mmu_STRING” or it can be the path to a custom PIN file provided by the user. # NOTE: the PIN is also used for generating the background genes (in this case, all unique genes in the PIN) during hypergeometric-distribution-based tests in enrichment analyses. Therefore, a large PIN will generally result in better results. # to use an external PIN of your choice # output_df <- run_pathfindR(input_df, pin_name_path = "/path/to/myPIN.sif") output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, pin_name_path = "IntAct") # plot the results with various graphs term_gene_heatmap(result_df = output_df, genes_df = pathfinder_input_exercise_noClusters) term_gene_graph(result_df = output_df, use_description = TRUE) UpSet_plot(result_df = output_df, genes_df = RA_input) # lets cluster the output enriched terms - fuzzy clustering RA_clustered_fuzzy <- cluster_enriched_terms(output_df, method = "fuzzy") #This function first calculates the pairwise kappa statistics between the terms in the data frame obtained from run_pathfindR(). It then clusters the terms and partitions them into relevant clusters. As can be seen in the diagram, there are two options for clustering: (1) hierarchical clustering and (2) fuzzy clustering. # https://github.com/egeulgen/pathfindR/wiki/Clustering%20Documentation # run Comparison of 2 pathfindR Results The function combine_pathfindR_results() allows combination of two pathfindR active-subnetwork-oriented enrichment analysis results for investigating common and distinct terms between the groups. Below is an example for comparing results using two different rheumatoid arthritis-related data sets(RA_output and RA_comparison_output). combined_df <- combine_pathfindR_results(result_A = RA_output, result_B = RA_comparison_output, plot_common = FALSE) pathfinder_input_exercise_clusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_clusters.txt") non human https://cran.r-project.org/web/packages/pathfindR/vignettes/non_hs_analysis.html # Running pathfindR manually to control the environment and statistics # https://egeulgen.github.io/pathfindR/articles/manual_execution.html #--------------------------------------------------------------------------------------- #1 #Process input data # After checking that the data frame complies with the requirements, # input_processing() filters the input so that genes with p values larger than # p_val_threshold are excluded. # Next, gene symbols that are not in the PIN are identified and excluded. # For human genes, if aliases of these missing gene symbols are found in the PIN, # these symbols are converted to the corresponding aliases (controlled by the argument # convert2alias). This step is performed to best map the input data onto the PIN. #--------------------------------------------------------------------------------------- RA_processed <- input_processing(input = RA_input, # the input: in this case, differential expression results p_val_threshold = 0.05, # p value threshold to filter significant genes pin_name_path = "Biogrid", # the name of the PIN to use for active subnetwork search convert2alias = TRUE) # boolean indicating whether or not to convert missing symbols to alias symbols in the PIN #--------------------------------------------------------------------------------------- # 2 # Obtain Gene Set Data necessary for enrichment analyses using fetch_gene_set(): # using "BioCarta" as our gene sets for enrichment # available gene sets are # “KEGG”, “Reactome”, “BioCarta”, “GO-All”, “GO-BP”, “GO-CC” and “GO-MF”. # If the user prefers to use another gene set source, the gene_sets argument should be # set to "Custom" and the custom gene sets (list) and the custom gene set descriptions # (named vector) should be supplied via the arguments custom_genes and custom_descriptions, # respectively. See ?fetch_gene_set for more details. #--------------------------------------------------------------------------------------- biocarta_list <- fetch_gene_set(gene_sets = "BioCarta", min_gset_size = 10, max_gset_size = 300) biocarta_gsets <- biocarta_list[[1]] biocarta_descriptions <- biocarta_list[[2]] #--------------------------------------------------------------------------------------- # 3. Active Subnetwork Search and Enrichment Analyses # As outlined in the vignette Introduction to pathfindR, run_pathfindR() initially identifies and filters active subnetworks, then performs enrichment analyses on these subnetworks and summarize the results. # To perform these steps manually, we utilize the function active_snw_search() for identifying and filtering active subnetworks and the function enrichment_analyses() for obtaining enriched terms using these subnetworks. Because the active subnetwork search algorithms are stochastic, we suggest iterating these subnetwork identification and enrichment steps multiple times (especially for “SA”)1: #--------------------------------------------------------------------------------------- Active Subnetwork Search Method # Currently, there are three algorithms implemented in pathfindR for active subnetwork search: Greedy Algorithm (default, based on Ideker et al. 2), Simulated Annealing Algorithm (based on Ideker et al. 3) and Genetic Algorithm (based on Ozisik et al. 4). For a detailed discussion on which algorithm to use see this wiki entry # for simulated annealing: #output_df <- run_pathfindR(input_df, search_method = "SA") # for genetic algorithm: # output_df <- run_pathfindR(input_df, search_method = "GA") # n_iter <- 10 ## number of iterations combined_res <- NULL ## to store the result of each iteration for (i in 1:n_iter) { ###### Active Subnetwork Search snws_file <- paste0("active_snws_", i) # Name of output file active_snws <- active_snw_search(input_for_search = RA_processed, pin_name_path = "Biogrid", snws_file = snws_file, score_quan_thr = 0.8, # you may tweak these arguments for optimal filtering of subnetworks sig_gene_thr = 0.02, # you may tweak these arguments for optimal filtering of subnetworks search_method = "GR") ###### Enrichment Analyses current_res <- enrichment_analyses(snws = active_snws, sig_genes_vec = RA_processed$GENE, pin_name_path = "Biogrid", genes_by_term = biocarta_gsets, term_descriptions = biocarta_descriptions, adj_method = "bonferroni", enrichment_threshold = 0.05, list_active_snw_genes = TRUE) # listing the non-input active snw genes in output ###### Combine results via `rbind` combined_res <- rbind(combined_res, current_res) } custom_result <- run_pathfindR(input_df, gene_sets = "Custom", custom_genes = custom_genes, custom_descriptions = custom_descriptions, max_gset_size = Inf, # DO NOT LIMIT GENE SET SIZE iterations = 1, # for demo, setting number of iterations to 1 output_dir = "misc/v1.4/CREB_MYC") # since we are using pathfindR we will not need pcsf, however have a look at it # source("http://bioconductor.org/biocLite.R") #biocLite("topGO") #install.packages("devtools", dependencies=TRUE) #devtools::install_github("IOR-Bioinformatics/PCSF", repos=BiocInstaller::biocinstallRepos(), dependencies=TRUE, type="source", force=TRUE)