topGO: Bioconductor, Paper
topGO (with enrichment_barplot function): GitHub
Demo Dataset: E-MTAB-8411 from The clock gene Bmal1 inhibits macrophage motility, phagocytosis, and impairs defense against pneumonia. PNAS. 2020;117(3):1543-1551.
License: GPL-3.0
Rcd /ngs/GO-Enrichment-Analysis-Demo
R
library("topGO")
library("org.Mm.eg.db")
library("ggplot2")If you have downloaded the DESeq2_DEG.txt file with wget:
data <- data.table::fread("DESeq2_DEG.txt")
data$GeneID <- substr(data$GeneID, 1, 18)If you like to donwload the file in R now:
data <- data.table::fread("https://raw.githubusercontent.com/ycl6/GO-Enrichment-Analysis-Demo/master/DESeq2_DEG.txt")
data$GeneID <- substr(data$GeneID, 1, 18)data## GeneID GeneSymbol log2fc pvalue padj
## 1: ENSMUSG00000000001 Gnai3 0.08804493 0.1925609 0.6146732
## 2: ENSMUSG00000000003 Pbsn NA NA NA
## 3: ENSMUSG00000000028 Cdc45 -0.32106635 0.1401127 0.5331437
## 4: ENSMUSG00000000031 H19 -1.20339889 0.7161464 NA
## 5: ENSMUSG00000000037 Scml2 -0.57746426 0.2979159 NA
## ---
## 55381: ENSMUSG00000118636 AC117663.3 NA NA NA
## 55382: ENSMUSG00000118637 AL772212.1 NA NA NA
## 55383: ENSMUSG00000118638 AL805980.1 NA NA NA
## 55384: ENSMUSG00000118639 AL590997.4 NA NA NA
## 55385: ENSMUSG00000118640 AC167036.2 0.06415390 0.9208414 NA
up.idx <- which(data$padj < 0.05 & data$log2fc > 0) # FDR < 0.05 and logFC > 0
dn.idx <- which(data$padj < 0.05 & data$log2fc < 0) # FDR < 0.05 and logFC < 0dim(data)## [1] 55385 5
length(up.idx)## [1] 383
length(dn.idx)## [1] 429
all.genes <- data$GeneSymbol
up.genes <- data[up.idx,]$GeneSymbol
dn.genes <- data[dn.idx,]$GeneSymbolhead(up.genes, 10)## [1] "Axin2" "Hnrnpd" "Kcnn3" "Mapk7" "Agpat3" "Sema6b" "Efnb2" "Il16" "Ltbp1"
## [10] "Rgs19"
head(dn.genes, 10)## [1] "Cox5a" "Pdgfb" "Itga5" "Cd52" "Dnmt3l" "Tubb6" "Ell2" "Ifrd1" "Stk38l"
## [10] "Ubl3"
Alternatively, if you only have Ensembl gene ID
all.genes <- data$GeneID
up.genes <- data[up.idx,]$GeneID
dn.genes <- data[dn.idx,]$GeneIDontology <- "BP"The default algorithm used by the topGO package is weight01, it is a mixture between the elim and the weight algorithms. Possible choices includes:
algorithm <- "weight01"For tests based on gene counts:
statistic = "fisher")For tests based on gene scores or gene ranks:
statistic = "ks")statistic = "t")For tests based on gene expression:
statistic = "globaltest")statistic <- "fisher"outTitle <- paste0("topGO_GO-", ontology, "_ORA_", algorithm,"_", statistic)
outTitle## [1] "topGO_GO-BP_ORA_weight01_fisher"
We use all the annotated genes (all.genes) as the “gene universe” or geneList and the differentially expressed genes (up.genes and dn.genes) as our genes of interest. So, we will define the geneList as 1 if a gene is differentially expressed, 0 otherwise
upList <- factor(as.integer(all.genes %in% up.genes))
names(upList) <- all.genes
head(upList, 30)## Gnai3 Pbsn Cdc45 H19 Scml2 Apoh Narf Cav2 Klf6 Scmh1
## 0 0 0 0 0 0 0 0 0 0
## Cox5a Tbx2 Tbx4 Zfy2 Ngfr Wnt3 Wnt9a Fer Xpo6 Tfe3
## 0 0 0 0 0 0 0 0 0 0
## Axin2 Brat1 Gna12 Slc22a18 Itgb2l Igsf5 Pih1d2 Dlat Sdhd Fgf23
## 1 0 0 0 0 0 0 0 0 0
## Levels: 0 1
table(upList)## upList
## 0 1
## 55002 383
dnList <- factor(as.integer(all.genes %in% dn.genes))
names(dnList) <- all.genes
head(dnList, 30)## Gnai3 Pbsn Cdc45 H19 Scml2 Apoh Narf Cav2 Klf6 Scmh1
## 0 0 0 0 0 0 0 0 0 0
## Cox5a Tbx2 Tbx4 Zfy2 Ngfr Wnt3 Wnt9a Fer Xpo6 Tfe3
## 1 0 0 0 0 0 0 0 0 0
## Axin2 Brat1 Gna12 Slc22a18 Itgb2l Igsf5 Pih1d2 Dlat Sdhd Fgf23
## 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
table(dnList)## dnList
## 0 1
## 54956 429
Here, our geneList is named with gene symbol, hence we use ID = "SYMBOL". If geneList is named with Ensembl gene ID, you need to use ID = "ENSEMBL"
upGOdata <- new("topGOdata", ontology = ontology, allGenes = upList,geneSel = function(x)(x == 1),
nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")##
## Building most specific GOs .....
## ( 12355 GO terms found. )
##
## Build GO DAG topology ..........
## ( 16019 GO terms and 37457 relations. )
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Annotating nodes ...............
## ( 22036 genes annotated to the GO terms. )
dnGOdata <- new("topGOdata", ontology = ontology, allGenes = dnList,geneSel = function(x)(x == 1),
nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")##
## Building most specific GOs .....
## ( 12355 GO terms found. )
##
## Build GO DAG topology ..........
## ( 16019 GO terms and 37457 relations. )
##
## Annotating nodes ...............
## ( 22036 genes annotated to the GO terms. )
upRes <- runTest(upGOdata, algorithm = algorithm, statistic = statistic)##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 4279 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 17: 1 nodes to be scored (0 eliminated genes)
##
## Level 16: 8 nodes to be scored (0 eliminated genes)
##
## Level 15: 19 nodes to be scored (27 eliminated genes)
##
## Level 14: 54 nodes to be scored (230 eliminated genes)
##
## Level 13: 82 nodes to be scored (665 eliminated genes)
##
## Level 12: 171 nodes to be scored (1720 eliminated genes)
##
## Level 11: 284 nodes to be scored (4086 eliminated genes)
##
## Level 10: 458 nodes to be scored (6265 eliminated genes)
##
## Level 9: 614 nodes to be scored (8328 eliminated genes)
##
## Level 8: 671 nodes to be scored (10432 eliminated genes)
##
## Level 7: 694 nodes to be scored (12484 eliminated genes)
##
## Level 6: 554 nodes to be scored (14219 eliminated genes)
##
## Level 5: 362 nodes to be scored (16402 eliminated genes)
##
## Level 4: 196 nodes to be scored (17206 eliminated genes)
##
## Level 3: 90 nodes to be scored (17560 eliminated genes)
##
## Level 2: 20 nodes to be scored (17916 eliminated genes)
##
## Level 1: 1 nodes to be scored (18066 eliminated genes)
upRes##
## Description:
## Ontology: BP
## 'weight01' algorithm with the 'fisher' test
## 6821 GO terms scored: 63 terms with p < 0.01
## Annotation data:
## Annotated genes: 22072
## Significant genes: 342
## Min. no. of genes annotated to a GO: 10
## Nontrivial nodes: 4279
dnRes <- runTest(dnGOdata, algorithm = algorithm, statistic = statistic)##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 4461 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 17: 6 nodes to be scored (0 eliminated genes)
##
## Level 16: 12 nodes to be scored (0 eliminated genes)
##
## Level 15: 23 nodes to be scored (132 eliminated genes)
##
## Level 14: 69 nodes to be scored (279 eliminated genes)
##
## Level 13: 108 nodes to be scored (688 eliminated genes)
##
## Level 12: 183 nodes to be scored (1806 eliminated genes)
##
## Level 11: 308 nodes to be scored (4281 eliminated genes)
##
## Level 10: 485 nodes to be scored (6336 eliminated genes)
##
## Level 9: 628 nodes to be scored (8521 eliminated genes)
##
## Level 8: 697 nodes to be scored (10716 eliminated genes)
##
## Level 7: 672 nodes to be scored (12720 eliminated genes)
##
## Level 6: 573 nodes to be scored (14384 eliminated genes)
##
## Level 5: 383 nodes to be scored (15349 eliminated genes)
##
## Level 4: 201 nodes to be scored (17225 eliminated genes)
##
## Level 3: 93 nodes to be scored (17570 eliminated genes)
##
## Level 2: 19 nodes to be scored (17789 eliminated genes)
##
## Level 1: 1 nodes to be scored (18077 eliminated genes)
dnRes##
## Description:
## Ontology: BP
## 'weight01' algorithm with the 'fisher' test
## 6821 GO terms scored: 90 terms with p < 0.01
## Annotation data:
## Annotated genes: 22072
## Significant genes: 389
## Min. no. of genes annotated to a GO: 10
## Nontrivial nodes: 4461
png(paste0(outTitle, "_up.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(upGOdata, upRes, showTerms = 20, numChar = 50, orderBy = "Scores",
title = paste0("GO-", ontology," ORA of up-regulated genes"))
invisible(dev.off())“topGO_GO-BP_ORA_weight01_fisher_up.png”
png(paste0(outTitle, "_dn.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(dnGOdata, dnRes, showTerms = 20, numChar = 50, orderBy = "Scores",
title = paste0("GO-", ontology," ORA of down-regulated genes"))
invisible(dev.off())“topGO_GO-BP_ORA_weight01_fisher_dn.png”
Here, we retrieve test results of top 20 GO terms, you can use topNodes = length(usedGO(GOdata_Obj)) to retrieve results of all available GO terms
up.tab <- GenTable(upGOdata, Pval = upRes, topNodes = 20)
up.tab## GO.ID Term Annotated Significant Expected
## 1 GO:0090022 regulation of neutrophil chemotaxis 34 5 0.53
## 2 GO:0042754 negative regulation of circadian rhythm 18 4 0.28
## 3 GO:0030574 collagen catabolic process 37 5 0.57
## 4 GO:0120255 olefinic compound biosynthetic process 34 5 0.53
## 5 GO:0007015 actin filament organization 433 19 6.71
## 6 GO:0050766 positive regulation of phagocytosis 82 8 1.27
## 7 GO:0097278 complement-dependent cytotoxicity 11 3 0.17
## 8 GO:0071732 cellular response to nitric oxide 11 3 0.17
## 9 GO:0010718 positive regulation of epithelial to mes... 47 5 0.73
## 10 GO:0071380 cellular response to prostaglandin E sti... 13 3 0.20
## 11 GO:0031668 cellular response to extracellular stimu... 214 8 3.32
## 12 GO:0035023 regulation of Rho protein signal transdu... 138 9 2.14
## 13 GO:0072659 protein localization to plasma membrane 294 11 4.56
## 14 GO:0009081 branched-chain amino acid metabolic proc... 20 4 0.31
## 15 GO:0043299 leukocyte degranulation 72 4 1.12
## 16 GO:0021932 hindbrain radial glia guided cell migrat... 15 3 0.23
## 17 GO:0070371 ERK1 and ERK2 cascade 323 13 5.00
## 18 GO:0002820 negative regulation of adaptive immune r... 47 3 0.73
## 19 GO:0002526 acute inflammatory response 107 5 1.66
## 20 GO:0016045 detection of bacterium 18 3 0.28
## Pval
## 1 0.00012
## 2 0.00015
## 3 0.00025
## 4 0.00039
## 5 0.00042
## 6 0.00054
## 7 0.00055
## 8 0.00055
## 9 0.00078
## 10 0.00094
## 11 0.00098
## 12 0.00129
## 13 0.00134
## 14 0.00139
## 15 0.00140
## 16 0.00146
## 17 0.00204
## 18 0.00232
## 19 0.00252
## 20 0.00253
dn.tab <- GenTable(dnGOdata, Pval = dnRes, topNodes = 20)
dn.tab## GO.ID Term Annotated Significant Expected
## 1 GO:0051092 positive regulation of NF-kappaB transcr... 147 12 2.59
## 2 GO:0002755 MyD88-dependent toll-like receptor signa... 20 5 0.35
## 3 GO:0150079 negative regulation of neuroinflammatory... 12 4 0.21
## 4 GO:0006750 glutathione biosynthetic process 12 4 0.21
## 5 GO:1902041 regulation of extrinsic apoptotic signal... 49 8 0.86
## 6 GO:0006954 inflammatory response 719 39 12.67
## 7 GO:0045944 positive regulation of transcription by ... 1251 46 22.05
## 8 GO:0043031 negative regulation of macrophage activa... 15 4 0.26
## 9 GO:1903978 regulation of microglial cell activation 16 4 0.28
## 10 GO:0032946 positive regulation of mononuclear cell ... 140 5 2.47
## 11 GO:0071277 cellular response to calcium ion 77 7 1.36
## 12 GO:0034143 regulation of toll-like receptor 4 signa... 20 4 0.35
## 13 GO:2000489 regulation of hepatic stellate cell acti... 10 3 0.18
## 14 GO:0030168 platelet activation 83 8 1.46
## 15 GO:0031643 positive regulation of myelination 24 4 0.42
## 16 GO:0031954 positive regulation of protein autophosp... 24 4 0.42
## 17 GO:0034101 erythrocyte homeostasis 144 8 2.54
## 18 GO:0007035 vacuolar acidification 21 4 0.37
## 19 GO:0034135 regulation of toll-like receptor 2 signa... 11 3 0.19
## 20 GO:0045454 cell redox homeostasis 63 6 1.11
## Pval
## 1 0.000012
## 2 0.000021
## 3 0.000042
## 4 0.000042
## 5 0.000051
## 6 0.00010
## 7 0.00011
## 8 0.00011
## 9 0.00015
## 10 0.00031
## 11 0.00042
## 12 0.00059
## 13 0.00059
## 14 0.00063
## 15 0.00076
## 16 0.00076
## 17 0.00079
## 18 0.00080
## 19 0.00081
## 20 0.00084
# Update table with full GO term name
up.tab$Term <- sapply(up.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
up.tab$Term## [1] "regulation of neutrophil chemotaxis"
## [2] "negative regulation of circadian rhythm"
## [3] "collagen catabolic process"
## [4] "olefinic compound biosynthetic process"
## [5] "actin filament organization"
## [6] "positive regulation of phagocytosis"
## [7] "complement-dependent cytotoxicity"
## [8] "cellular response to nitric oxide"
## [9] "positive regulation of epithelial to mesenchymal transition"
## [10] "cellular response to prostaglandin E stimulus"
## [11] "cellular response to extracellular stimulus"
## [12] "regulation of Rho protein signal transduction"
## [13] "protein localization to plasma membrane"
## [14] "branched-chain amino acid metabolic process"
## [15] "leukocyte degranulation"
## [16] "hindbrain radial glia guided cell migration"
## [17] "ERK1 and ERK2 cascade"
## [18] "negative regulation of adaptive immune response"
## [19] "acute inflammatory response"
## [20] "detection of bacterium"
dn.tab$Term <- sapply(dn.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
dn.tab$Term## [1] "positive regulation of NF-kappaB transcription factor activity"
## [2] "MyD88-dependent toll-like receptor signaling pathway"
## [3] "negative regulation of neuroinflammatory response"
## [4] "glutathione biosynthetic process"
## [5] "regulation of extrinsic apoptotic signaling pathway via death domain receptors"
## [6] "inflammatory response"
## [7] "positive regulation of transcription by RNA polymerase II"
## [8] "negative regulation of macrophage activation"
## [9] "regulation of microglial cell activation"
## [10] "positive regulation of mononuclear cell proliferation"
## [11] "cellular response to calcium ion"
## [12] "regulation of toll-like receptor 4 signaling pathway"
## [13] "regulation of hepatic stellate cell activation"
## [14] "platelet activation"
## [15] "positive regulation of myelination"
## [16] "positive regulation of protein autophosphorylation"
## [17] "erythrocyte homeostasis"
## [18] "vacuolar acidification"
## [19] "regulation of toll-like receptor 2 signaling pathway"
## [20] "cell redox homeostasis"
The GenTable function only provide the significant gene count for each significant GO term, here we will add the gene symbols to the result table
# Obtain the list of significant genes
up.sigGenes <- sigGenes(upGOdata)
dn.sigGenes <- sigGenes(dnGOdata)
# Retrieve gene symbols for each GO from the test result
up.AnnoList <- lapply(up.tab$"GO.ID",
function(x) as.character(unlist(genesInTerm(object = upGOdata, whichGO = x))))
dn.AnnoList <- lapply(dn.tab$"GO.ID",
function(x) as.character(unlist(genesInTerm(object = dnGOdata, whichGO = x))))
up.SigList <- lapply(up.AnnoList, function(x) intersect(x, up.sigGenes))
dn.SigList <- lapply(dn.AnnoList, function(x) intersect(x, dn.sigGenes))
# Coerce gene list to a comma-separated vector
up.tab$Genes <- sapply(up.SigList, paste, collapse = ",")
dn.tab$Genes <- sapply(dn.SigList, paste, collapse = ",")cbind(head(up.tab$Genes, 5))## [,1]
## [1,] "Bst1,C5ar2,Lbp,Nod2,Ripor2"
## [2,] "Cry1,Cry2,Per2,Ptger4"
## [3,] "Ctsl,Itgb1,Mmp27,Mmp9,Prtn3"
## [4,] "Alox15,Alox5,Bmp2,Prkg1,Scp2"
## [5,] "Add3,Alox15,Arhgap12,Bcar1,Cald1,Cdc42ep3,Coro1a,Dpysl3,Efs,Elmo2,Enah,Gba2,Kank2,Myo1d,Myo6,Ptger4,Ssh2,Tjp1,Trf"
cbind(head(dn.tab$Genes, 5))## [,1]
## [1,] "Cat,Ikbkg,Irak2,Lrrfip2,Malt1,Mtpn,Prkcb,Rab7b,Ticam1,Tlr2,Traf1,Trim8"
## [2,] "Irak2,Lrrfip2,Tlr1,Tlr2,Tnip1"
## [3,] "Cst7,Nr1d1,Syt11,Tnfrsf1b"
## [4,] "Gclc,Gclm,Nfe2l2,Slc7a11"
## [5,] "Atf3,Bcl2l1,Fem1b,Lgals3,Serpine1,Skil,Tmbim1,Tmc8"
write.table(up.tab, file = paste0(outTitle, "_up.txt"), sep = "\t", quote = F,
row.names = F, col.names = T)
write.table(dn.tab, file = paste0(outTitle, "_dn.txt"), sep = "\t", quote = F,
row.names = F, col.names = T)sessionInfo()## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/r4/lib/libopenblasp-r0.3.12.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8
## [4] LC_COLLATE=en_GB.UTF-8 LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods
## [9] base
##
## other attached packages:
## [1] SparseM_1.81 ggplot2_3.3.3 org.Mm.eg.db_3.12.0 AnnotationDbi_1.52.0
## [5] IRanges_2.24.1 S4Vectors_0.28.1 Biobase_2.50.0 topGO_2.41.0
## [9] graph_1.68.0 BiocGenerics_0.36.0 knitr_1.31
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 lattice_0.20-41 GO.db_3.12.1 png_0.1-7
## [5] assertthat_0.2.1 digest_0.6.27 utf8_1.1.4 R6_2.5.0
## [9] RSQLite_2.2.3 evaluate_0.14 highr_0.8 pillar_1.5.0
## [13] rlang_0.4.10 data.table_1.14.0 jquerylib_0.1.3 blob_1.2.1
## [17] rmarkdown_2.7 labeling_0.4.2 stringr_1.4.0 bit_4.0.4
## [21] munsell_0.5.0 compiler_4.0.3 xfun_0.21 pkgconfig_2.0.3
## [25] htmltools_0.5.1.1 tidyselect_1.1.0 tibble_3.1.0 matrixStats_0.58.0
## [29] fansi_0.4.2 crayon_1.4.1 dplyr_1.0.4 withr_2.4.1
## [33] grid_4.0.3 jsonlite_1.7.2 gtable_0.3.0 lifecycle_1.0.0
## [37] DBI_1.1.1 magrittr_2.0.1 scales_1.1.1 stringi_1.5.3
## [41] cachem_1.0.4 farver_2.1.0 bslib_0.2.4 ellipsis_0.3.1
## [45] vctrs_0.3.6 generics_0.1.0 tools_4.0.3 bit64_4.0.5
## [49] glue_1.4.2 purrr_0.3.4 fastmap_1.1.0 yaml_2.2.1
## [53] colorspace_2.0-0 memoise_2.0.0 sass_0.3.1