Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
# PathfindR tutorial and exercises
# 10.2020
# v.0.1
# Matt Rogon EMBL CBNA
install.packages("pathfindR")
library(pathfindR)
openVignette(pathfindR)
# Let's explore the package and functions
# https://github.com/egeulgen/pathfindR
# paper
# https://www.frontiersin.org/articles/10.3389/fgene.2019.00858/full
# website
# https://egeulgen.github.io/pathfindR/
# load sample data into a dataframe
input_df <- pathfindR.data::RA_input
# have a look at the formatting
View(input_df)
# all you need are 3 columns to execute the pathfindR main wrapper
output_df <- run_pathfindR(input_df)
# visualise the output_df results
#--------------------------------------------------------------------------------------
# Exercise 1:
# Load the following file:
# pathfinder_input_exercise_clusters.tsv
#
# prepare the dataframe for pathfindeR
# you will need 3 columns:
# Gene name
# logFC
# adj P Val
# Run pathfindR wrapper function on this dataset
# visualise
#--------------------------------------------------------------------------------------
# Exercise 2
# Create a custom script that will execute pathfindR with the following settings:
# Benjamini-Hochberg FDR
# adjusted p_val_threshold 0.01
# gene sets: GO-BP
# minimum gene set size 5, maximum 100
# PIN protein interaction reference network: IntAct
#
# generate a fuzzy-clustered result network of terms
# term gene heatmap
# term gene graph
# upset plot
#--------------------------------------------------------------------------------------
# Exercise 3
# Use the original file pathfinder_input_exercise_clusters.txt and compare clusters 1 and 2 with pathfindR
# using combine_pathfindR_results function
# produce visualisations
#--------------------------------------------------------------------------------------
# load the prepared file (columns must be numeric)
pathfinder_input_exercise_noClusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_noClusters.txt")
pathfinder_input_exercise_noClusters$adj.P.Val <- as.numeric(pathfinder_input_exercise_noClusters$adj.P.Val)
pathfinder_input_exercise_noClusters$logFC <- as.numeric(pathfinder_input_exercise_noClusters$logFC)
# make sure rows are complete
pathfinder_input_exercise_noClusters <- pathfinder_input_exercise_noClusters[complete.cases(pathfinder_input_exercise_noClusters), ]
# run it
output_noClust <- run_pathfindR(pathfinder_input_exercise_noClusters)
# define threshold
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, p_val_threshold = 0.01)
# define gene sets
# output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, gene_sets = "GO-MF")
## Including more terms for enrichment analysis
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters,
gene_sets = "GO-MF",
min_gset_size = 5,
max_gset_size = 500)
# By default, run_pathfindR() adjusts the enrichment p values via the “bonferroni” method
# and filters the enriched terms by adjusted-p value < 0.05.
# To change this adjustment method and the threshold, set adj_method and enrichment_threshold,
# respectively, here we have benjamini hochberg fdr with 0.01 thresholding:
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters,
adj_method = "fdr",
enrichment_threshold = 0.01)
# Protein-protein Interaction Network and Background
# For the active subnetwork search process, a protein-protein interaction network (PIN) is used. run_pathfindR() maps the input genes onto this PIN and identifies active subnetworks which are then be used for enrichment analyses. To change the default PIN (“Biogrid”), use the pin_name_path argument:
# The pin_name_path argument can be one of “Biogrid”, “STRING”, “GeneMania”, “IntAct”, “KEGG”, “mmu_STRING” or it can be the path to a custom PIN file provided by the user.
# NOTE: the PIN is also used for generating the background genes (in this case, all unique genes in the PIN) during hypergeometric-distribution-based tests in enrichment analyses. Therefore, a large PIN will generally result in better results.
# to use an external PIN of your choice
# output_df <- run_pathfindR(input_df, pin_name_path = "/path/to/myPIN.sif")
output_df <- run_pathfindR(pathfinder_input_exercise_noClusters, pin_name_path = "IntAct")
# plot the results with various graphs
term_gene_heatmap(result_df = output_df, genes_df = pathfinder_input_exercise_noClusters)
term_gene_graph(result_df = output_df, use_description = TRUE)
UpSet_plot(result_df = output_df, genes_df = RA_input)
# lets cluster the output enriched terms - fuzzy clustering
RA_clustered_fuzzy <- cluster_enriched_terms(output_df, method = "fuzzy")
#This function first calculates the pairwise kappa statistics between the terms in the data frame obtained from run_pathfindR(). It then clusters the terms and partitions them into relevant clusters. As can be seen in the diagram, there are two options for clustering: (1) hierarchical clustering and (2) fuzzy clustering.
# https://github.com/egeulgen/pathfindR/wiki/Clustering%20Documentation
# run Comparison of 2 pathfindR Results
The function combine_pathfindR_results() allows combination of two pathfindR active-subnetwork-oriented enrichment analysis results for investigating common and distinct terms between the groups. Below is an example for comparing results using two different rheumatoid arthritis-related data sets(RA_output and RA_comparison_output).
combined_df <- combine_pathfindR_results(result_A = RA_output,
result_B = RA_comparison_output,
plot_common = FALSE)
pathfinder_input_exercise_clusters <- read.delim("/Users/rogon/Work/01. Teaching/CBNA Courses/2020 CBNA-DeNBI Enrichment course - my materials/Part 2 - R-code session ClusterProfiler, ReactomePA, pathfindR/pathfindR/exercise/pathfinder_input_exercise_clusters.txt")
non human
https://cran.r-project.org/web/packages/pathfindR/vignettes/non_hs_analysis.html
# Running pathfindR manually to control the environment and statistics
# https://egeulgen.github.io/pathfindR/articles/manual_execution.html
#---------------------------------------------------------------------------------------
#1
#Process input data
# After checking that the data frame complies with the requirements,
# input_processing() filters the input so that genes with p values larger than
# p_val_threshold are excluded.
# Next, gene symbols that are not in the PIN are identified and excluded.
# For human genes, if aliases of these missing gene symbols are found in the PIN,
# these symbols are converted to the corresponding aliases (controlled by the argument
# convert2alias). This step is performed to best map the input data onto the PIN.
#---------------------------------------------------------------------------------------
RA_processed <- input_processing(input = RA_input, # the input: in this case, differential expression results
p_val_threshold = 0.05, # p value threshold to filter significant genes
pin_name_path = "Biogrid", # the name of the PIN to use for active subnetwork search
convert2alias = TRUE) # boolean indicating whether or not to convert missing symbols to alias symbols in the PIN
#---------------------------------------------------------------------------------------
# 2
# Obtain Gene Set Data necessary for enrichment analyses using fetch_gene_set():
# using "BioCarta" as our gene sets for enrichment
# available gene sets are
# “KEGG”, “Reactome”, “BioCarta”, “GO-All”, “GO-BP”, “GO-CC” and “GO-MF”.
# If the user prefers to use another gene set source, the gene_sets argument should be
# set to "Custom" and the custom gene sets (list) and the custom gene set descriptions
# (named vector) should be supplied via the arguments custom_genes and custom_descriptions,
# respectively. See ?fetch_gene_set for more details.
#---------------------------------------------------------------------------------------
biocarta_list <- fetch_gene_set(gene_sets = "BioCarta",
min_gset_size = 10,
max_gset_size = 300)
biocarta_gsets <- biocarta_list[[1]]
biocarta_descriptions <- biocarta_list[[2]]
#---------------------------------------------------------------------------------------
# 3. Active Subnetwork Search and Enrichment Analyses
# As outlined in the vignette Introduction to pathfindR, run_pathfindR() initially identifies and filters active subnetworks, then performs enrichment analyses on these subnetworks and summarize the results.
# To perform these steps manually, we utilize the function active_snw_search() for identifying and filtering active subnetworks and the function enrichment_analyses() for obtaining enriched terms using these subnetworks. Because the active subnetwork search algorithms are stochastic, we suggest iterating these subnetwork identification and enrichment steps multiple times (especially for “SA”)1:
#---------------------------------------------------------------------------------------
Active Subnetwork Search Method
# Currently, there are three algorithms implemented in pathfindR for active subnetwork search: Greedy Algorithm (default, based on Ideker et al. 2), Simulated Annealing Algorithm (based on Ideker et al. 3) and Genetic Algorithm (based on Ozisik et al. 4). For a detailed discussion on which algorithm to use see this wiki entry
# for simulated annealing:
#output_df <- run_pathfindR(input_df, search_method = "SA")
# for genetic algorithm:
# output_df <- run_pathfindR(input_df, search_method = "GA")
#
n_iter <- 10 ## number of iterations
combined_res <- NULL ## to store the result of each iteration
for (i in 1:n_iter) {
###### Active Subnetwork Search
snws_file <- paste0("active_snws_", i) # Name of output file
active_snws <- active_snw_search(input_for_search = RA_processed,
pin_name_path = "Biogrid",
snws_file = snws_file,
score_quan_thr = 0.8, # you may tweak these arguments for optimal filtering of subnetworks
sig_gene_thr = 0.02, # you may tweak these arguments for optimal filtering of subnetworks
search_method = "GR")
###### Enrichment Analyses
current_res <- enrichment_analyses(snws = active_snws,
sig_genes_vec = RA_processed$GENE,
pin_name_path = "Biogrid",
genes_by_term = biocarta_gsets,
term_descriptions = biocarta_descriptions,
adj_method = "bonferroni",
enrichment_threshold = 0.05,
list_active_snw_genes = TRUE) # listing the non-input active snw genes in output
###### Combine results via `rbind`
combined_res <- rbind(combined_res, current_res)
}
custom_result <- run_pathfindR(input_df,
gene_sets = "Custom",
custom_genes = custom_genes,
custom_descriptions = custom_descriptions,
max_gset_size = Inf, # DO NOT LIMIT GENE SET SIZE
iterations = 1, # for demo, setting number of iterations to 1
output_dir = "misc/v1.4/CREB_MYC")
# since we are using pathfindR we will not need pcsf, however have a look at it
# source("http://bioconductor.org/biocLite.R")
#biocLite("topGO")
#install.packages("devtools", dependencies=TRUE)
#devtools::install_github("IOR-Bioinformatics/PCSF", repos=BiocInstaller::biocinstallRepos(),
dependencies=TRUE, type="source", force=TRUE)