topGO: Bioconductor, Paper
topGO (with enrichment_barplot
function): GitHub
Demo Dataset: E-MTAB-8411 from The clock gene Bmal1 inhibits macrophage motility, phagocytosis, and impairs defense against pneumonia. PNAS. 2020;117(3):1543-1551.
License: GPL-3.0
R
cd /ngs/GO-Enrichment-Analysis-Demo
R
library("topGO")
library("org.Mm.eg.db")
library("ggplot2")
If you have downloaded the DESeq2_DEG.txt
file with wget
:
data.table::fread("DESeq2_DEG.txt")
data <-$GeneID <- substr(data$GeneID, 1, 18) data
If you like to donwload the file in R
now:
data.table::fread("https://raw.githubusercontent.com/ycl6/GO-Enrichment-Analysis-Demo/master/DESeq2_DEG.txt")
data <-$GeneID <- substr(data$GeneID, 1, 18) data
data
## GeneID GeneSymbol log2fc pvalue padj
## 1: ENSMUSG00000000001 Gnai3 0.08804493 0.1925609 0.6146732
## 2: ENSMUSG00000000003 Pbsn NA NA NA
## 3: ENSMUSG00000000028 Cdc45 -0.32106635 0.1401127 0.5331437
## 4: ENSMUSG00000000031 H19 -1.20339889 0.7161464 NA
## 5: ENSMUSG00000000037 Scml2 -0.57746426 0.2979159 NA
## ---
## 55381: ENSMUSG00000118636 AC117663.3 NA NA NA
## 55382: ENSMUSG00000118637 AL772212.1 NA NA NA
## 55383: ENSMUSG00000118638 AL805980.1 NA NA NA
## 55384: ENSMUSG00000118639 AL590997.4 NA NA NA
## 55385: ENSMUSG00000118640 AC167036.2 0.06415390 0.9208414 NA
which(data$padj < 0.05 & data$log2fc > 0) # FDR < 0.05 and logFC > 0
up.idx <- which(data$padj < 0.05 & data$log2fc < 0) # FDR < 0.05 and logFC < 0 dn.idx <-
dim(data)
## [1] 55385 5
length(up.idx)
## [1] 383
length(dn.idx)
## [1] 429
data$GeneSymbol
all.genes <- data[up.idx,]$GeneSymbol
up.genes <- data[dn.idx,]$GeneSymbol dn.genes <-
head(up.genes, 10)
## [1] "Axin2" "Hnrnpd" "Kcnn3" "Mapk7" "Agpat3" "Sema6b" "Efnb2" "Il16" "Ltbp1"
## [10] "Rgs19"
head(dn.genes, 10)
## [1] "Cox5a" "Pdgfb" "Itga5" "Cd52" "Dnmt3l" "Tubb6" "Ell2" "Ifrd1" "Stk38l"
## [10] "Ubl3"
Alternatively, if you only have Ensembl gene ID
data$GeneID
all.genes <- data[up.idx,]$GeneID
up.genes <- data[dn.idx,]$GeneID dn.genes <-
"BP" ontology <-
The default algorithm used by the topGO package is weight01
, it is a mixture between the elim
and the weight
algorithms. Possible choices includes:
"weight01" algorithm <-
For tests based on gene counts:
statistic = "fisher"
)For tests based on gene scores or gene ranks:
statistic = "ks"
)statistic = "t"
)For tests based on gene expression:
statistic = "globaltest"
) "fisher" statistic <-
paste0("topGO_GO-", ontology, "_ORA_", algorithm,"_", statistic)
outTitle <- outTitle
## [1] "topGO_GO-BP_ORA_weight01_fisher"
We use all the annotated genes (all.genes
) as the “gene universe” or geneList
and the differentially expressed genes (up.genes
and dn.genes
) as our genes of interest. So, we will define the geneList
as 1 if a gene is differentially expressed, 0 otherwise
factor(as.integer(all.genes %in% up.genes))
upList <-names(upList) <- all.genes
head(upList, 30)
## Gnai3 Pbsn Cdc45 H19 Scml2 Apoh Narf Cav2 Klf6 Scmh1
## 0 0 0 0 0 0 0 0 0 0
## Cox5a Tbx2 Tbx4 Zfy2 Ngfr Wnt3 Wnt9a Fer Xpo6 Tfe3
## 0 0 0 0 0 0 0 0 0 0
## Axin2 Brat1 Gna12 Slc22a18 Itgb2l Igsf5 Pih1d2 Dlat Sdhd Fgf23
## 1 0 0 0 0 0 0 0 0 0
## Levels: 0 1
table(upList)
## upList
## 0 1
## 55002 383
factor(as.integer(all.genes %in% dn.genes))
dnList <-names(dnList) <- all.genes
head(dnList, 30)
## Gnai3 Pbsn Cdc45 H19 Scml2 Apoh Narf Cav2 Klf6 Scmh1
## 0 0 0 0 0 0 0 0 0 0
## Cox5a Tbx2 Tbx4 Zfy2 Ngfr Wnt3 Wnt9a Fer Xpo6 Tfe3
## 1 0 0 0 0 0 0 0 0 0
## Axin2 Brat1 Gna12 Slc22a18 Itgb2l Igsf5 Pih1d2 Dlat Sdhd Fgf23
## 0 0 0 0 0 0 0 0 0 0
## Levels: 0 1
table(dnList)
## dnList
## 0 1
## 54956 429
Here, our geneList
is named with gene symbol, hence we use ID = "SYMBOL"
. If geneList
is named with Ensembl gene ID, you need to use ID = "ENSEMBL"
new("topGOdata", ontology = ontology, allGenes = upList,geneSel = function(x)(x == 1),
upGOdata <-nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")
##
## Building most specific GOs .....
## ( 12355 GO terms found. )
##
## Build GO DAG topology ..........
## ( 16019 GO terms and 37457 relations. )
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
##
## Annotating nodes ...............
## ( 22036 genes annotated to the GO terms. )
new("topGOdata", ontology = ontology, allGenes = dnList,geneSel = function(x)(x == 1),
dnGOdata <-nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")
##
## Building most specific GOs .....
## ( 12355 GO terms found. )
##
## Build GO DAG topology ..........
## ( 16019 GO terms and 37457 relations. )
##
## Annotating nodes ...............
## ( 22036 genes annotated to the GO terms. )
runTest(upGOdata, algorithm = algorithm, statistic = statistic) upRes <-
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 4279 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 17: 1 nodes to be scored (0 eliminated genes)
##
## Level 16: 8 nodes to be scored (0 eliminated genes)
##
## Level 15: 19 nodes to be scored (27 eliminated genes)
##
## Level 14: 54 nodes to be scored (230 eliminated genes)
##
## Level 13: 82 nodes to be scored (665 eliminated genes)
##
## Level 12: 171 nodes to be scored (1720 eliminated genes)
##
## Level 11: 284 nodes to be scored (4086 eliminated genes)
##
## Level 10: 458 nodes to be scored (6265 eliminated genes)
##
## Level 9: 614 nodes to be scored (8328 eliminated genes)
##
## Level 8: 671 nodes to be scored (10432 eliminated genes)
##
## Level 7: 694 nodes to be scored (12484 eliminated genes)
##
## Level 6: 554 nodes to be scored (14219 eliminated genes)
##
## Level 5: 362 nodes to be scored (16402 eliminated genes)
##
## Level 4: 196 nodes to be scored (17206 eliminated genes)
##
## Level 3: 90 nodes to be scored (17560 eliminated genes)
##
## Level 2: 20 nodes to be scored (17916 eliminated genes)
##
## Level 1: 1 nodes to be scored (18066 eliminated genes)
upRes
##
## Description:
## Ontology: BP
## 'weight01' algorithm with the 'fisher' test
## 6821 GO terms scored: 63 terms with p < 0.01
## Annotation data:
## Annotated genes: 22072
## Significant genes: 342
## Min. no. of genes annotated to a GO: 10
## Nontrivial nodes: 4279
runTest(dnGOdata, algorithm = algorithm, statistic = statistic) dnRes <-
##
## -- Weight01 Algorithm --
##
## the algorithm is scoring 4461 nontrivial nodes
## parameters:
## test statistic: fisher
##
## Level 17: 6 nodes to be scored (0 eliminated genes)
##
## Level 16: 12 nodes to be scored (0 eliminated genes)
##
## Level 15: 23 nodes to be scored (132 eliminated genes)
##
## Level 14: 69 nodes to be scored (279 eliminated genes)
##
## Level 13: 108 nodes to be scored (688 eliminated genes)
##
## Level 12: 183 nodes to be scored (1806 eliminated genes)
##
## Level 11: 308 nodes to be scored (4281 eliminated genes)
##
## Level 10: 485 nodes to be scored (6336 eliminated genes)
##
## Level 9: 628 nodes to be scored (8521 eliminated genes)
##
## Level 8: 697 nodes to be scored (10716 eliminated genes)
##
## Level 7: 672 nodes to be scored (12720 eliminated genes)
##
## Level 6: 573 nodes to be scored (14384 eliminated genes)
##
## Level 5: 383 nodes to be scored (15349 eliminated genes)
##
## Level 4: 201 nodes to be scored (17225 eliminated genes)
##
## Level 3: 93 nodes to be scored (17570 eliminated genes)
##
## Level 2: 19 nodes to be scored (17789 eliminated genes)
##
## Level 1: 1 nodes to be scored (18077 eliminated genes)
dnRes
##
## Description:
## Ontology: BP
## 'weight01' algorithm with the 'fisher' test
## 6821 GO terms scored: 90 terms with p < 0.01
## Annotation data:
## Annotated genes: 22072
## Significant genes: 389
## Min. no. of genes annotated to a GO: 10
## Nontrivial nodes: 4461
png(paste0(outTitle, "_up.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(upGOdata, upRes, showTerms = 20, numChar = 50, orderBy = "Scores",
title = paste0("GO-", ontology," ORA of up-regulated genes"))
invisible(dev.off())
png(paste0(outTitle, "_dn.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(dnGOdata, dnRes, showTerms = 20, numChar = 50, orderBy = "Scores",
title = paste0("GO-", ontology," ORA of down-regulated genes"))
invisible(dev.off())
Here, we retrieve test results of top 20 GO terms, you can use topNodes = length(usedGO(GOdata_Obj))
to retrieve results of all available GO terms
GenTable(upGOdata, Pval = upRes, topNodes = 20)
up.tab <- up.tab
## GO.ID Term Annotated Significant Expected
## 1 GO:0090022 regulation of neutrophil chemotaxis 34 5 0.53
## 2 GO:0042754 negative regulation of circadian rhythm 18 4 0.28
## 3 GO:0030574 collagen catabolic process 37 5 0.57
## 4 GO:0120255 olefinic compound biosynthetic process 34 5 0.53
## 5 GO:0007015 actin filament organization 433 19 6.71
## 6 GO:0050766 positive regulation of phagocytosis 82 8 1.27
## 7 GO:0097278 complement-dependent cytotoxicity 11 3 0.17
## 8 GO:0071732 cellular response to nitric oxide 11 3 0.17
## 9 GO:0010718 positive regulation of epithelial to mes... 47 5 0.73
## 10 GO:0071380 cellular response to prostaglandin E sti... 13 3 0.20
## 11 GO:0031668 cellular response to extracellular stimu... 214 8 3.32
## 12 GO:0035023 regulation of Rho protein signal transdu... 138 9 2.14
## 13 GO:0072659 protein localization to plasma membrane 294 11 4.56
## 14 GO:0009081 branched-chain amino acid metabolic proc... 20 4 0.31
## 15 GO:0043299 leukocyte degranulation 72 4 1.12
## 16 GO:0021932 hindbrain radial glia guided cell migrat... 15 3 0.23
## 17 GO:0070371 ERK1 and ERK2 cascade 323 13 5.00
## 18 GO:0002820 negative regulation of adaptive immune r... 47 3 0.73
## 19 GO:0002526 acute inflammatory response 107 5 1.66
## 20 GO:0016045 detection of bacterium 18 3 0.28
## Pval
## 1 0.00012
## 2 0.00015
## 3 0.00025
## 4 0.00039
## 5 0.00042
## 6 0.00054
## 7 0.00055
## 8 0.00055
## 9 0.00078
## 10 0.00094
## 11 0.00098
## 12 0.00129
## 13 0.00134
## 14 0.00139
## 15 0.00140
## 16 0.00146
## 17 0.00204
## 18 0.00232
## 19 0.00252
## 20 0.00253
GenTable(dnGOdata, Pval = dnRes, topNodes = 20)
dn.tab <- dn.tab
## GO.ID Term Annotated Significant Expected
## 1 GO:0051092 positive regulation of NF-kappaB transcr... 147 12 2.59
## 2 GO:0002755 MyD88-dependent toll-like receptor signa... 20 5 0.35
## 3 GO:0150079 negative regulation of neuroinflammatory... 12 4 0.21
## 4 GO:0006750 glutathione biosynthetic process 12 4 0.21
## 5 GO:1902041 regulation of extrinsic apoptotic signal... 49 8 0.86
## 6 GO:0006954 inflammatory response 719 39 12.67
## 7 GO:0045944 positive regulation of transcription by ... 1251 46 22.05
## 8 GO:0043031 negative regulation of macrophage activa... 15 4 0.26
## 9 GO:1903978 regulation of microglial cell activation 16 4 0.28
## 10 GO:0032946 positive regulation of mononuclear cell ... 140 5 2.47
## 11 GO:0071277 cellular response to calcium ion 77 7 1.36
## 12 GO:0034143 regulation of toll-like receptor 4 signa... 20 4 0.35
## 13 GO:2000489 regulation of hepatic stellate cell acti... 10 3 0.18
## 14 GO:0030168 platelet activation 83 8 1.46
## 15 GO:0031643 positive regulation of myelination 24 4 0.42
## 16 GO:0031954 positive regulation of protein autophosp... 24 4 0.42
## 17 GO:0034101 erythrocyte homeostasis 144 8 2.54
## 18 GO:0007035 vacuolar acidification 21 4 0.37
## 19 GO:0034135 regulation of toll-like receptor 2 signa... 11 3 0.19
## 20 GO:0045454 cell redox homeostasis 63 6 1.11
## Pval
## 1 0.000012
## 2 0.000021
## 3 0.000042
## 4 0.000042
## 5 0.000051
## 6 0.00010
## 7 0.00011
## 8 0.00011
## 9 0.00015
## 10 0.00031
## 11 0.00042
## 12 0.00059
## 13 0.00059
## 14 0.00063
## 15 0.00076
## 16 0.00076
## 17 0.00079
## 18 0.00080
## 19 0.00081
## 20 0.00084
# Update table with full GO term name
$Term <- sapply(up.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
up.tab$Term up.tab
## [1] "regulation of neutrophil chemotaxis"
## [2] "negative regulation of circadian rhythm"
## [3] "collagen catabolic process"
## [4] "olefinic compound biosynthetic process"
## [5] "actin filament organization"
## [6] "positive regulation of phagocytosis"
## [7] "complement-dependent cytotoxicity"
## [8] "cellular response to nitric oxide"
## [9] "positive regulation of epithelial to mesenchymal transition"
## [10] "cellular response to prostaglandin E stimulus"
## [11] "cellular response to extracellular stimulus"
## [12] "regulation of Rho protein signal transduction"
## [13] "protein localization to plasma membrane"
## [14] "branched-chain amino acid metabolic process"
## [15] "leukocyte degranulation"
## [16] "hindbrain radial glia guided cell migration"
## [17] "ERK1 and ERK2 cascade"
## [18] "negative regulation of adaptive immune response"
## [19] "acute inflammatory response"
## [20] "detection of bacterium"
$Term <- sapply(dn.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
dn.tab$Term dn.tab
## [1] "positive regulation of NF-kappaB transcription factor activity"
## [2] "MyD88-dependent toll-like receptor signaling pathway"
## [3] "negative regulation of neuroinflammatory response"
## [4] "glutathione biosynthetic process"
## [5] "regulation of extrinsic apoptotic signaling pathway via death domain receptors"
## [6] "inflammatory response"
## [7] "positive regulation of transcription by RNA polymerase II"
## [8] "negative regulation of macrophage activation"
## [9] "regulation of microglial cell activation"
## [10] "positive regulation of mononuclear cell proliferation"
## [11] "cellular response to calcium ion"
## [12] "regulation of toll-like receptor 4 signaling pathway"
## [13] "regulation of hepatic stellate cell activation"
## [14] "platelet activation"
## [15] "positive regulation of myelination"
## [16] "positive regulation of protein autophosphorylation"
## [17] "erythrocyte homeostasis"
## [18] "vacuolar acidification"
## [19] "regulation of toll-like receptor 2 signaling pathway"
## [20] "cell redox homeostasis"
The GenTable
function only provide the significant gene count for each significant GO term, here we will add the gene symbols to the result table
# Obtain the list of significant genes
sigGenes(upGOdata)
up.sigGenes <- sigGenes(dnGOdata)
dn.sigGenes <-
# Retrieve gene symbols for each GO from the test result
lapply(up.tab$"GO.ID",
up.AnnoList <-function(x) as.character(unlist(genesInTerm(object = upGOdata, whichGO = x))))
lapply(dn.tab$"GO.ID",
dn.AnnoList <-function(x) as.character(unlist(genesInTerm(object = dnGOdata, whichGO = x))))
lapply(up.AnnoList, function(x) intersect(x, up.sigGenes))
up.SigList <- lapply(dn.AnnoList, function(x) intersect(x, dn.sigGenes))
dn.SigList <-
# Coerce gene list to a comma-separated vector
$Genes <- sapply(up.SigList, paste, collapse = ",")
up.tab$Genes <- sapply(dn.SigList, paste, collapse = ",") dn.tab
cbind(head(up.tab$Genes, 5))
## [,1]
## [1,] "Bst1,C5ar2,Lbp,Nod2,Ripor2"
## [2,] "Cry1,Cry2,Per2,Ptger4"
## [3,] "Ctsl,Itgb1,Mmp27,Mmp9,Prtn3"
## [4,] "Alox15,Alox5,Bmp2,Prkg1,Scp2"
## [5,] "Add3,Alox15,Arhgap12,Bcar1,Cald1,Cdc42ep3,Coro1a,Dpysl3,Efs,Elmo2,Enah,Gba2,Kank2,Myo1d,Myo6,Ptger4,Ssh2,Tjp1,Trf"
cbind(head(dn.tab$Genes, 5))
## [,1]
## [1,] "Cat,Ikbkg,Irak2,Lrrfip2,Malt1,Mtpn,Prkcb,Rab7b,Ticam1,Tlr2,Traf1,Trim8"
## [2,] "Irak2,Lrrfip2,Tlr1,Tlr2,Tnip1"
## [3,] "Cst7,Nr1d1,Syt11,Tnfrsf1b"
## [4,] "Gclc,Gclm,Nfe2l2,Slc7a11"
## [5,] "Atf3,Bcl2l1,Fem1b,Lgals3,Serpine1,Skil,Tmbim1,Tmc8"
write.table(up.tab, file = paste0(outTitle, "_up.txt"), sep = "\t", quote = F,
row.names = F, col.names = T)
write.table(dn.tab, file = paste0(outTitle, "_dn.txt"), sep = "\t", quote = F,
row.names = F, col.names = T)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/r4/lib/libopenblasp-r0.3.12.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C LC_TIME=en_GB.UTF-8
## [4] LC_COLLATE=en_GB.UTF-8 LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods
## [9] base
##
## other attached packages:
## [1] SparseM_1.81 ggplot2_3.3.3 org.Mm.eg.db_3.12.0 AnnotationDbi_1.52.0
## [5] IRanges_2.24.1 S4Vectors_0.28.1 Biobase_2.50.0 topGO_2.41.0
## [9] graph_1.68.0 BiocGenerics_0.36.0 knitr_1.31
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 lattice_0.20-41 GO.db_3.12.1 png_0.1-7
## [5] assertthat_0.2.1 digest_0.6.27 utf8_1.1.4 R6_2.5.0
## [9] RSQLite_2.2.3 evaluate_0.14 highr_0.8 pillar_1.5.0
## [13] rlang_0.4.10 data.table_1.14.0 jquerylib_0.1.3 blob_1.2.1
## [17] rmarkdown_2.7 labeling_0.4.2 stringr_1.4.0 bit_4.0.4
## [21] munsell_0.5.0 compiler_4.0.3 xfun_0.21 pkgconfig_2.0.3
## [25] htmltools_0.5.1.1 tidyselect_1.1.0 tibble_3.1.0 matrixStats_0.58.0
## [29] fansi_0.4.2 crayon_1.4.1 dplyr_1.0.4 withr_2.4.1
## [33] grid_4.0.3 jsonlite_1.7.2 gtable_0.3.0 lifecycle_1.0.0
## [37] DBI_1.1.1 magrittr_2.0.1 scales_1.1.1 stringi_1.5.3
## [41] cachem_1.0.4 farver_2.1.0 bslib_0.2.4 ellipsis_0.3.1
## [45] vctrs_0.3.6 generics_0.1.0 tools_4.0.3 bit64_4.0.5
## [49] glue_1.4.2 purrr_0.3.4 fastmap_1.1.0 yaml_2.2.1
## [53] colorspace_2.0-0 memoise_2.0.0 sass_0.3.1