Add solution to exercise 5 for day 3

6aaa3720 · Renato Alves · b66e163a · 6aaa3720
Verified Commit 6aaa3720 authored 4 years ago by Renato Alves
--- a/Day_3/Solutions/Exercise_5_pathfindR_tutorial.txt
+++ b/Day_3/Solutions/Exercise_5_pathfindR_tutorial.txt
+# PathfindR tutorial and exercises
+# 10.2020
+# v.0.1
+# Matt Rogon EMBL CBNA
+
+install.packages("pathfindR")
+library(pathfindR)
+
+openVignette(pathfindR)
+# Let's explore the package and functions
+# https://github.com/egeulgen/pathfindR
+# paper
+# https://www.frontiersin.org/articles/10.3389/fgene.2019.00858/full
+# website
+# https://egeulgen.github.io/pathfindR/
+
+
+# load sample data into a dataframe
+input_df <- pathfindR.data::RA_input
+
+
+# have a look at the formatting
+View(input_df)
+
+# all you need are 3 columns to execute the pathfindR main wrapper
+output_df <- run_pathfindR(input_df)
+
+# visualise the output_df results
+
+
+
+#--------------------------------------------------------------------------------------
+# Exercise 1:
+# Load the following file: 
+# pathfinder_input_exercise_clusters.tsv
+#
+# prepare the dataframe for pathfindeR
+# you will need 3 columns:
+# Gene name
+# logFC
+# adj P Val
+# Run pathfindR wrapper function on this dataset
+# visualise
+
+#--------------------------------------------------------------------------------------
+
+# Exercise 2
+# Create a custom script that will execute pathfindR with the following settings:
+# Benjamini-Hochberg FDR
+# adjusted p_val_threshold 0.01
+# gene sets: GO-BP
+# minimum gene set size 5, maximum 100
+# PIN protein interaction reference network: IntAct
+# 
+# generate a fuzzy-clustered result network of terms
+# term gene heatmap
+# term gene graph
+# upset plot
+
+#--------------------------------------------------------------------------------------
+
+# Exercise 3
+# Use the original file pathfinder_input_exercise_clusters.txt and compare clusters 1 and 2 with pathfindR
+# using combine_pathfindR_results function
+# produce visualisations
+
+#--------------------------------------------------------------------------------------
+
+
+
+
+
+
+
+
+
+
+# load the prepared file (columns must be numeric)
+pathfinder_input_exercise_noClusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_noClusters.txt")
+pathfinder_input_exercise_noClusters$adj.P.Val <- as.numeric(pathfinder_input_exercise_noClusters$adj.P.Val)
+pathfinder_input_exercise_noClusters$logFC <- as.numeric(pathfinder_input_exercise_noClusters$logFC)
+
+  
+# make sure rows are complete
+pathfinder_input_exercise_noClusters <- pathfinder_input_exercise_noClusters[complete.cases(pathfinder_input_exercise_noClusters), ]
+
+# run it
+output_noClust <- run_pathfindR(pathfinder_input_exercise_noClusters)
+
+# define threshold
+output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, p_val_threshold = 0.01)
+
+# define gene sets
+# output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, gene_sets = "GO-MF")
+
+## Including more terms for enrichment analysis
+output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, 
+                           gene_sets = "GO-MF",
+                           min_gset_size = 5,
+                           max_gset_size = 500)
+
+# By default, run_pathfindR() adjusts the enrichment p values via the “bonferroni” method 
+# and filters the enriched terms by adjusted-p value < 0.05. 
+# To change this adjustment method and the threshold, set adj_method and enrichment_threshold,
+# respectively, here we have benjamini hochberg fdr with 0.01 thresholding:
+output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, 
+                           adj_method = "fdr",
+                           enrichment_threshold = 0.01)
+
+
+# Protein-protein Interaction Network and Background
+# For the active subnetwork search process, a protein-protein interaction network (PIN) is used. run_pathfindR() maps the input genes onto this PIN and identifies active subnetworks which are then be used for enrichment analyses. To change the default PIN (“Biogrid”), use the pin_name_path argument:
+# The pin_name_path argument can be one of “Biogrid”, “STRING”, “GeneMania”, “IntAct”, “KEGG”, “mmu_STRING” or it can be the path to a custom PIN file provided by the user.
+# NOTE: the PIN is also used for generating the background genes (in this case, all unique genes in the PIN) during hypergeometric-distribution-based tests in enrichment analyses. Therefore, a large PIN will generally result in better results.
+# to use an external PIN of your choice
+# output_df <- run_pathfindR(input_df, pin_name_path = "/path/to/myPIN.sif")
+
+output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, pin_name_path = "IntAct")
+
+
+# plot the results with various graphs
+term_gene_heatmap(result_df = output_df, genes_df = pathfinder_input_exercise_noClusters)
+term_gene_graph(result_df = output_df, use_description = TRUE)
+UpSet_plot(result_df = output_df, genes_df = RA_input)
+
+
+# lets cluster the output enriched terms - fuzzy clustering 
+RA_clustered_fuzzy <- cluster_enriched_terms(output_df, method = "fuzzy")
+
+#This function first calculates the pairwise kappa statistics between the terms in the data frame obtained from run_pathfindR(). It then clusters the terms and partitions them into relevant clusters. As can be seen in the diagram, there are two options for clustering: (1) hierarchical clustering and (2) fuzzy clustering.
+# https://github.com/egeulgen/pathfindR/wiki/Clustering%20Documentation
+
+
+
+# run Comparison of 2 pathfindR Results
+The function combine_pathfindR_results() allows combination of two pathfindR active-subnetwork-oriented enrichment analysis results for investigating common and distinct terms between the groups. Below is an example for comparing results using two different rheumatoid arthritis-related data sets(RA_output and RA_comparison_output).
+
+combined_df <- combine_pathfindR_results(result_A = RA_output, 
+                                         result_B = RA_comparison_output, 
+                                         plot_common = FALSE)
+                                         
+                                         
+                 
+                 
+
+
+pathfinder_input_exercise_clusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_clusters.txt")
+
+
+                        
+
+
+non human
+https://cran.r-project.org/web/packages/pathfindR/vignettes/non_hs_analysis.html
+
+
+
+
+# Running pathfindR manually to control the environment and statistics
+
+# https://egeulgen.github.io/pathfindR/articles/manual_execution.html
+
+#---------------------------------------------------------------------------------------
+#1
+#Process input data
+# After checking that the data frame complies with the requirements, 
+# input_processing() filters the input so that genes with p values larger than 
+# p_val_threshold are excluded. 
+# Next, gene symbols that are not in the PIN are identified and excluded. 
+# For human genes, if aliases of these missing gene symbols are found in the PIN, 
+# these symbols are converted to the corresponding aliases (controlled by the argument 
+# convert2alias). This step is performed to best map the input data onto the PIN.
+#---------------------------------------------------------------------------------------
+
+RA_processed <- input_processing(input = RA_input, # the input: in this case, differential expression results
+                                 p_val_threshold = 0.05, # p value threshold to filter significant genes
+                                 pin_name_path  = "Biogrid", # the name of the PIN to use for active subnetwork search
+                                 convert2alias = TRUE) # boolean indicating whether or not to convert missing symbols to alias symbols in the PIN
+
+
+
+
+#---------------------------------------------------------------------------------------
+# 2
+# Obtain Gene Set Data necessary for enrichment analyses using fetch_gene_set():
+# using "BioCarta" as our gene sets for enrichment
+# available gene sets are 
+# “KEGG”, “Reactome”, “BioCarta”, “GO-All”, “GO-BP”, “GO-CC” and “GO-MF”. 
+# If the user prefers to use another gene set source, the gene_sets argument should be
+# set to "Custom" and the custom gene sets (list) and the custom gene set descriptions 
+# (named vector) should be supplied via the arguments custom_genes and custom_descriptions, 
+# respectively. See ?fetch_gene_set for more details.
+#---------------------------------------------------------------------------------------
+
+biocarta_list <- fetch_gene_set(gene_sets = "BioCarta",
+                                min_gset_size = 10,
+                                max_gset_size = 300)
+biocarta_gsets <- biocarta_list[[1]]
+biocarta_descriptions <- biocarta_list[[2]]
+
+
+#---------------------------------------------------------------------------------------
+# 3. Active Subnetwork Search and Enrichment Analyses
+# As outlined in the vignette Introduction to pathfindR, run_pathfindR() initially identifies and filters active subnetworks, then performs enrichment analyses on these subnetworks and summarize the results.
+# To perform these steps manually, we utilize the function active_snw_search() for identifying and filtering active subnetworks and the function enrichment_analyses() for obtaining enriched terms using these subnetworks. Because the active subnetwork search algorithms are stochastic, we suggest iterating these subnetwork identification and enrichment steps multiple times (especially for “SA”)1:
+#---------------------------------------------------------------------------------------
+
+
+Active Subnetwork Search Method
+
+# Currently, there are three algorithms implemented in pathfindR for active subnetwork search: Greedy Algorithm (default, based on Ideker et al. 2), Simulated Annealing Algorithm (based on Ideker et al. 3) and Genetic Algorithm (based on Ozisik et al. 4). For a detailed discussion on which algorithm to use see this wiki entry
+# for simulated annealing:
+#output_df <- run_pathfindR(input_df, search_method = "SA")
+# for genetic algorithm:
+# output_df <- run_pathfindR(input_df, search_method = "GA")
+#
+
+
+n_iter <- 10 ## number of iterations
+combined_res <- NULL ## to store the result of each iteration
+
+for (i in 1:n_iter) {
+  
+  ###### Active Subnetwork Search
+  snws_file <- paste0("active_snws_", i) # Name of output file
+  active_snws <- active_snw_search(input_for_search = RA_processed, 
+                                   pin_name_path = "Biogrid", 
+                                   snws_file = snws_file,
+                                   score_quan_thr = 0.8, # you may tweak these arguments for optimal filtering of subnetworks
+                                   sig_gene_thr = 0.02, # you may tweak these arguments for optimal filtering of subnetworks
+                                   search_method = "GR")
+  
+  ###### Enrichment Analyses
+  current_res <- enrichment_analyses(snws = active_snws,
+                                     sig_genes_vec = RA_processed$GENE,
+                                     pin_name_path = "Biogrid", 
+                                     genes_by_term = biocarta_gsets,
+                                     term_descriptions = biocarta_descriptions,
+                                     adj_method = "bonferroni",
+                                     enrichment_threshold = 0.05,
+                                     list_active_snw_genes = TRUE) # listing the non-input active snw genes in output
+  
+  ###### Combine results via `rbind`
+  combined_res <- rbind(combined_res, current_res)
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+custom_result <- run_pathfindR(input_df,
+                               gene_sets = "Custom",
+                               custom_genes = custom_genes,
+                               custom_descriptions = custom_descriptions,
+                               max_gset_size = Inf, # DO NOT LIMIT GENE SET SIZE
+                               iterations = 1, # for demo, setting number of iterations to 1
+                               output_dir = "misc/v1.4/CREB_MYC")
+
+
+
+
+
+
+
+# since we are using pathfindR we will not need pcsf, however have a look at it
+# source("http://bioconductor.org/biocLite.R")
+#biocLite("topGO")
+#install.packages("devtools", dependencies=TRUE)
+#devtools::install_github("IOR-Bioinformatics/PCSF", repos=BiocInstaller::biocinstallRepos(),
+                         dependencies=TRUE, type="source", force=TRUE)