topGO: Bioconductor, Paper

topGO (with enrichment_barplot function): GitHub

Demo Dataset: E-MTAB-8411 from The clock gene Bmal1 inhibits macrophage motility, phagocytosis, and impairs defense against pneumonia. PNAS. 2020;117(3):1543-1551.

License: GPL-3.0

Start R

cd /ngs/GO-Enrichment-Analysis-Demo

R

Load package and set path

library("topGO")
library("org.Mm.eg.db")
library("ggplot2")

Load data

If you have downloaded the DESeq2_DEG.txt file with wget:

data <- data.table::fread("DESeq2_DEG.txt")
data$GeneID <- substr(data$GeneID, 1, 18)

If you like to donwload the file in R now:

data <- data.table::fread("https://raw.githubusercontent.com/ycl6/GO-Enrichment-Analysis-Demo/master/DESeq2_DEG.txt")
data$GeneID <- substr(data$GeneID, 1, 18)
data
##                    GeneID GeneSymbol      log2fc    pvalue      padj
##     1: ENSMUSG00000000001      Gnai3  0.08804493 0.1925609 0.6146732
##     2: ENSMUSG00000000003       Pbsn          NA        NA        NA
##     3: ENSMUSG00000000028      Cdc45 -0.32106635 0.1401127 0.5331437
##     4: ENSMUSG00000000031        H19 -1.20339889 0.7161464        NA
##     5: ENSMUSG00000000037      Scml2 -0.57746426 0.2979159        NA
##    ---                                                              
## 55381: ENSMUSG00000118636 AC117663.3          NA        NA        NA
## 55382: ENSMUSG00000118637 AL772212.1          NA        NA        NA
## 55383: ENSMUSG00000118638 AL805980.1          NA        NA        NA
## 55384: ENSMUSG00000118639 AL590997.4          NA        NA        NA
## 55385: ENSMUSG00000118640 AC167036.2  0.06415390 0.9208414        NA

Define significance threshold

up.idx <- which(data$padj < 0.05 & data$log2fc > 0) # FDR < 0.05 and logFC > 0
dn.idx <- which(data$padj < 0.05 & data$log2fc < 0) # FDR < 0.05 and logFC < 0
dim(data)
## [1] 55385     5
length(up.idx)
## [1] 383
length(dn.idx)
## [1] 429

Define significant genes

all.genes <- data$GeneSymbol
up.genes <- data[up.idx,]$GeneSymbol
dn.genes <- data[dn.idx,]$GeneSymbol
head(up.genes, 10)
##  [1] "Axin2"  "Hnrnpd" "Kcnn3"  "Mapk7"  "Agpat3" "Sema6b" "Efnb2"  "Il16"   "Ltbp1" 
## [10] "Rgs19"
head(dn.genes, 10)
##  [1] "Cox5a"  "Pdgfb"  "Itga5"  "Cd52"   "Dnmt3l" "Tubb6"  "Ell2"   "Ifrd1"  "Stk38l"
## [10] "Ubl3"

Alternatively, if you only have Ensembl gene ID

all.genes <- data$GeneID
up.genes <- data[up.idx,]$GeneID
dn.genes <- data[dn.idx,]$GeneID

Decide the sub-ontology to test

  • BP: Biological Process
  • CC: Cellular Component
  • MF: Molecular Function
ontology <- "BP"

Decide test algorithm

The default algorithm used by the topGO package is weight01, it is a mixture between the elim and the weight algorithms. Possible choices includes:

  • classic
  • elim
  • weight
  • weight01
  • lea
  • parentchild
algorithm <- "weight01"

Define the statistical test used

For tests based on gene counts:

  • Fischer’s exact test (statistic = "fisher")

For tests based on gene scores or gene ranks:

  • Kolmogorov-Smirnov test (statistic = "ks")
  • t-test (statistic = "t")

For tests based on gene expression:

  • globaltest (statistic = "globaltest")
statistic <- "fisher"

Set outfile prefix

outTitle <- paste0("topGO_GO-", ontology, "_ORA_", algorithm,"_", statistic)
outTitle
## [1] "topGO_GO-BP_ORA_weight01_fisher"

Prepare input data

We use all the annotated genes (all.genes) as the “gene universe” or geneList and the differentially expressed genes (up.genes and dn.genes) as our genes of interest. So, we will define the geneList as 1 if a gene is differentially expressed, 0 otherwise

upList <- factor(as.integer(all.genes %in% up.genes))
names(upList) <- all.genes

head(upList, 30)
##    Gnai3     Pbsn    Cdc45      H19    Scml2     Apoh     Narf     Cav2     Klf6    Scmh1 
##        0        0        0        0        0        0        0        0        0        0 
##    Cox5a     Tbx2     Tbx4     Zfy2     Ngfr     Wnt3    Wnt9a      Fer     Xpo6     Tfe3 
##        0        0        0        0        0        0        0        0        0        0 
##    Axin2    Brat1    Gna12 Slc22a18   Itgb2l    Igsf5   Pih1d2     Dlat     Sdhd    Fgf23 
##        1        0        0        0        0        0        0        0        0        0 
## Levels: 0 1
table(upList)
## upList
##     0     1 
## 55002   383
dnList <- factor(as.integer(all.genes %in% dn.genes))
names(dnList) <- all.genes

head(dnList, 30)
##    Gnai3     Pbsn    Cdc45      H19    Scml2     Apoh     Narf     Cav2     Klf6    Scmh1 
##        0        0        0        0        0        0        0        0        0        0 
##    Cox5a     Tbx2     Tbx4     Zfy2     Ngfr     Wnt3    Wnt9a      Fer     Xpo6     Tfe3 
##        1        0        0        0        0        0        0        0        0        0 
##    Axin2    Brat1    Gna12 Slc22a18   Itgb2l    Igsf5   Pih1d2     Dlat     Sdhd    Fgf23 
##        0        0        0        0        0        0        0        0        0        0 
## Levels: 0 1
table(dnList)
## dnList
##     0     1 
## 54956   429

Create topGOdata object

Here, our geneList is named with gene symbol, hence we use ID = "SYMBOL". If geneList is named with Ensembl gene ID, you need to use ID = "ENSEMBL"

upGOdata <- new("topGOdata", ontology = ontology, allGenes = upList,geneSel = function(x)(x == 1), 
        nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")
## 
## Building most specific GOs .....
##  ( 12355 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 16019 GO terms and 37457 relations. )
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## 
## Annotating nodes ...............
##  ( 22036 genes annotated to the GO terms. )
dnGOdata <- new("topGOdata", ontology = ontology, allGenes = dnList,geneSel = function(x)(x == 1), 
        nodeSize = 10, annot = annFUN.org, mapping = "org.Mm.eg.db", ID = "SYMBOL")
## 
## Building most specific GOs .....
##  ( 12355 GO terms found. )
## 
## Build GO DAG topology ..........
##  ( 16019 GO terms and 37457 relations. )
## 
## Annotating nodes ...............
##  ( 22036 genes annotated to the GO terms. )

Test for enrichment

upRes <- runTest(upGOdata, algorithm = algorithm, statistic = statistic)
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 4279 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 17:  1 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  8 nodes to be scored    (0 eliminated genes)
## 
##   Level 15:  19 nodes to be scored   (27 eliminated genes)
## 
##   Level 14:  54 nodes to be scored   (230 eliminated genes)
## 
##   Level 13:  82 nodes to be scored   (665 eliminated genes)
## 
##   Level 12:  171 nodes to be scored  (1720 eliminated genes)
## 
##   Level 11:  284 nodes to be scored  (4086 eliminated genes)
## 
##   Level 10:  458 nodes to be scored  (6265 eliminated genes)
## 
##   Level 9:   614 nodes to be scored  (8328 eliminated genes)
## 
##   Level 8:   671 nodes to be scored  (10432 eliminated genes)
## 
##   Level 7:   694 nodes to be scored  (12484 eliminated genes)
## 
##   Level 6:   554 nodes to be scored  (14219 eliminated genes)
## 
##   Level 5:   362 nodes to be scored  (16402 eliminated genes)
## 
##   Level 4:   196 nodes to be scored  (17206 eliminated genes)
## 
##   Level 3:   90 nodes to be scored   (17560 eliminated genes)
## 
##   Level 2:   20 nodes to be scored   (17916 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (18066 eliminated genes)
upRes
## 
## Description:  
## Ontology: BP 
## 'weight01' algorithm with the 'fisher' test
## 6821 GO terms scored: 63 terms with p < 0.01
## Annotation data:
##     Annotated genes: 22072 
##     Significant genes: 342 
##     Min. no. of genes annotated to a GO: 10 
##     Nontrivial nodes: 4279
dnRes <- runTest(dnGOdata, algorithm = algorithm, statistic = statistic)
## 
##           -- Weight01 Algorithm -- 
## 
##       the algorithm is scoring 4461 nontrivial nodes
##       parameters: 
##           test statistic: fisher
## 
##   Level 17:  6 nodes to be scored    (0 eliminated genes)
## 
##   Level 16:  12 nodes to be scored   (0 eliminated genes)
## 
##   Level 15:  23 nodes to be scored   (132 eliminated genes)
## 
##   Level 14:  69 nodes to be scored   (279 eliminated genes)
## 
##   Level 13:  108 nodes to be scored  (688 eliminated genes)
## 
##   Level 12:  183 nodes to be scored  (1806 eliminated genes)
## 
##   Level 11:  308 nodes to be scored  (4281 eliminated genes)
## 
##   Level 10:  485 nodes to be scored  (6336 eliminated genes)
## 
##   Level 9:   628 nodes to be scored  (8521 eliminated genes)
## 
##   Level 8:   697 nodes to be scored  (10716 eliminated genes)
## 
##   Level 7:   672 nodes to be scored  (12720 eliminated genes)
## 
##   Level 6:   573 nodes to be scored  (14384 eliminated genes)
## 
##   Level 5:   383 nodes to be scored  (15349 eliminated genes)
## 
##   Level 4:   201 nodes to be scored  (17225 eliminated genes)
## 
##   Level 3:   93 nodes to be scored   (17570 eliminated genes)
## 
##   Level 2:   19 nodes to be scored   (17789 eliminated genes)
## 
##   Level 1:   1 nodes to be scored    (18077 eliminated genes)
dnRes
## 
## Description:  
## Ontology: BP 
## 'weight01' algorithm with the 'fisher' test
## 6821 GO terms scored: 90 terms with p < 0.01
## Annotation data:
##     Annotated genes: 22072 
##     Significant genes: 389 
##     Min. no. of genes annotated to a GO: 10 
##     Nontrivial nodes: 4461

Plot enrichment

Up-regulated genes

png(paste0(outTitle, "_up.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(upGOdata, upRes, showTerms = 20, numChar = 50, orderBy = "Scores", 
           title = paste0("GO-", ontology," ORA of up-regulated genes"))
invisible(dev.off())
"topGO_GO-BP_ORA_weight01_fisher_up.png"

“topGO_GO-BP_ORA_weight01_fisher_up.png”

Down-regulated genes

png(paste0(outTitle, "_dn.png"), width = 8, height = 6, units = "in", res = 300)
enrichment_barplot(dnGOdata, dnRes, showTerms = 20, numChar = 50, orderBy = "Scores",
                   title = paste0("GO-", ontology," ORA of down-regulated genes"))
invisible(dev.off())
"topGO_GO-BP_ORA_weight01_fisher_dn.png"

“topGO_GO-BP_ORA_weight01_fisher_dn.png”

Create result table

Here, we retrieve test results of top 20 GO terms, you can use topNodes = length(usedGO(GOdata_Obj)) to retrieve results of all available GO terms

up.tab <- GenTable(upGOdata, Pval = upRes, topNodes = 20)
up.tab
##         GO.ID                                        Term Annotated Significant Expected
## 1  GO:0090022         regulation of neutrophil chemotaxis        34           5     0.53
## 2  GO:0042754     negative regulation of circadian rhythm        18           4     0.28
## 3  GO:0030574                  collagen catabolic process        37           5     0.57
## 4  GO:0120255      olefinic compound biosynthetic process        34           5     0.53
## 5  GO:0007015                 actin filament organization       433          19     6.71
## 6  GO:0050766         positive regulation of phagocytosis        82           8     1.27
## 7  GO:0097278           complement-dependent cytotoxicity        11           3     0.17
## 8  GO:0071732           cellular response to nitric oxide        11           3     0.17
## 9  GO:0010718 positive regulation of epithelial to mes...        47           5     0.73
## 10 GO:0071380 cellular response to prostaglandin E sti...        13           3     0.20
## 11 GO:0031668 cellular response to extracellular stimu...       214           8     3.32
## 12 GO:0035023 regulation of Rho protein signal transdu...       138           9     2.14
## 13 GO:0072659     protein localization to plasma membrane       294          11     4.56
## 14 GO:0009081 branched-chain amino acid metabolic proc...        20           4     0.31
## 15 GO:0043299                     leukocyte degranulation        72           4     1.12
## 16 GO:0021932 hindbrain radial glia guided cell migrat...        15           3     0.23
## 17 GO:0070371                       ERK1 and ERK2 cascade       323          13     5.00
## 18 GO:0002820 negative regulation of adaptive immune r...        47           3     0.73
## 19 GO:0002526                 acute inflammatory response       107           5     1.66
## 20 GO:0016045                      detection of bacterium        18           3     0.28
##       Pval
## 1  0.00012
## 2  0.00015
## 3  0.00025
## 4  0.00039
## 5  0.00042
## 6  0.00054
## 7  0.00055
## 8  0.00055
## 9  0.00078
## 10 0.00094
## 11 0.00098
## 12 0.00129
## 13 0.00134
## 14 0.00139
## 15 0.00140
## 16 0.00146
## 17 0.00204
## 18 0.00232
## 19 0.00252
## 20 0.00253
dn.tab <- GenTable(dnGOdata, Pval = dnRes, topNodes = 20)
dn.tab
##         GO.ID                                        Term Annotated Significant Expected
## 1  GO:0051092 positive regulation of NF-kappaB transcr...       147          12     2.59
## 2  GO:0002755 MyD88-dependent toll-like receptor signa...        20           5     0.35
## 3  GO:0150079 negative regulation of neuroinflammatory...        12           4     0.21
## 4  GO:0006750            glutathione biosynthetic process        12           4     0.21
## 5  GO:1902041 regulation of extrinsic apoptotic signal...        49           8     0.86
## 6  GO:0006954                       inflammatory response       719          39    12.67
## 7  GO:0045944 positive regulation of transcription by ...      1251          46    22.05
## 8  GO:0043031 negative regulation of macrophage activa...        15           4     0.26
## 9  GO:1903978    regulation of microglial cell activation        16           4     0.28
## 10 GO:0032946 positive regulation of mononuclear cell ...       140           5     2.47
## 11 GO:0071277            cellular response to calcium ion        77           7     1.36
## 12 GO:0034143 regulation of toll-like receptor 4 signa...        20           4     0.35
## 13 GO:2000489 regulation of hepatic stellate cell acti...        10           3     0.18
## 14 GO:0030168                         platelet activation        83           8     1.46
## 15 GO:0031643          positive regulation of myelination        24           4     0.42
## 16 GO:0031954 positive regulation of protein autophosp...        24           4     0.42
## 17 GO:0034101                     erythrocyte homeostasis       144           8     2.54
## 18 GO:0007035                      vacuolar acidification        21           4     0.37
## 19 GO:0034135 regulation of toll-like receptor 2 signa...        11           3     0.19
## 20 GO:0045454                      cell redox homeostasis        63           6     1.11
##        Pval
## 1  0.000012
## 2  0.000021
## 3  0.000042
## 4  0.000042
## 5  0.000051
## 6   0.00010
## 7   0.00011
## 8   0.00011
## 9   0.00015
## 10  0.00031
## 11  0.00042
## 12  0.00059
## 13  0.00059
## 14  0.00063
## 15  0.00076
## 16  0.00076
## 17  0.00079
## 18  0.00080
## 19  0.00081
## 20  0.00084
# Update table with full GO term name
up.tab$Term <- sapply(up.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
up.tab$Term
##  [1] "regulation of neutrophil chemotaxis"                        
##  [2] "negative regulation of circadian rhythm"                    
##  [3] "collagen catabolic process"                                 
##  [4] "olefinic compound biosynthetic process"                     
##  [5] "actin filament organization"                                
##  [6] "positive regulation of phagocytosis"                        
##  [7] "complement-dependent cytotoxicity"                          
##  [8] "cellular response to nitric oxide"                          
##  [9] "positive regulation of epithelial to mesenchymal transition"
## [10] "cellular response to prostaglandin E stimulus"              
## [11] "cellular response to extracellular stimulus"                
## [12] "regulation of Rho protein signal transduction"              
## [13] "protein localization to plasma membrane"                    
## [14] "branched-chain amino acid metabolic process"                
## [15] "leukocyte degranulation"                                    
## [16] "hindbrain radial glia guided cell migration"                
## [17] "ERK1 and ERK2 cascade"                                      
## [18] "negative regulation of adaptive immune response"            
## [19] "acute inflammatory response"                                
## [20] "detection of bacterium"
dn.tab$Term <- sapply(dn.tab$"GO.ID", function(go) Term(GO.db::GOTERM[[go]]))
dn.tab$Term
##  [1] "positive regulation of NF-kappaB transcription factor activity"                
##  [2] "MyD88-dependent toll-like receptor signaling pathway"                          
##  [3] "negative regulation of neuroinflammatory response"                             
##  [4] "glutathione biosynthetic process"                                              
##  [5] "regulation of extrinsic apoptotic signaling pathway via death domain receptors"
##  [6] "inflammatory response"                                                         
##  [7] "positive regulation of transcription by RNA polymerase II"                     
##  [8] "negative regulation of macrophage activation"                                  
##  [9] "regulation of microglial cell activation"                                      
## [10] "positive regulation of mononuclear cell proliferation"                         
## [11] "cellular response to calcium ion"                                              
## [12] "regulation of toll-like receptor 4 signaling pathway"                          
## [13] "regulation of hepatic stellate cell activation"                                
## [14] "platelet activation"                                                           
## [15] "positive regulation of myelination"                                            
## [16] "positive regulation of protein autophosphorylation"                            
## [17] "erythrocyte homeostasis"                                                       
## [18] "vacuolar acidification"                                                        
## [19] "regulation of toll-like receptor 2 signaling pathway"                          
## [20] "cell redox homeostasis"

Add gene symbol to result table

The GenTable function only provide the significant gene count for each significant GO term, here we will add the gene symbols to the result table

# Obtain the list of significant genes
up.sigGenes <- sigGenes(upGOdata)
dn.sigGenes <- sigGenes(dnGOdata)

# Retrieve gene symbols for each GO from the test result
up.AnnoList <- lapply(up.tab$"GO.ID", 
              function(x) as.character(unlist(genesInTerm(object = upGOdata, whichGO = x))))
dn.AnnoList <- lapply(dn.tab$"GO.ID", 
              function(x) as.character(unlist(genesInTerm(object = dnGOdata, whichGO = x))))

up.SigList <- lapply(up.AnnoList, function(x) intersect(x, up.sigGenes))
dn.SigList <- lapply(dn.AnnoList, function(x) intersect(x, dn.sigGenes))

# Coerce gene list to a comma-separated vector
up.tab$Genes <- sapply(up.SigList, paste, collapse = ",")
dn.tab$Genes <- sapply(dn.SigList, paste, collapse = ",")
cbind(head(up.tab$Genes, 5))
##      [,1]                                                                                                               
## [1,] "Bst1,C5ar2,Lbp,Nod2,Ripor2"                                                                                       
## [2,] "Cry1,Cry2,Per2,Ptger4"                                                                                            
## [3,] "Ctsl,Itgb1,Mmp27,Mmp9,Prtn3"                                                                                      
## [4,] "Alox15,Alox5,Bmp2,Prkg1,Scp2"                                                                                     
## [5,] "Add3,Alox15,Arhgap12,Bcar1,Cald1,Cdc42ep3,Coro1a,Dpysl3,Efs,Elmo2,Enah,Gba2,Kank2,Myo1d,Myo6,Ptger4,Ssh2,Tjp1,Trf"
cbind(head(dn.tab$Genes, 5))
##      [,1]                                                                    
## [1,] "Cat,Ikbkg,Irak2,Lrrfip2,Malt1,Mtpn,Prkcb,Rab7b,Ticam1,Tlr2,Traf1,Trim8"
## [2,] "Irak2,Lrrfip2,Tlr1,Tlr2,Tnip1"                                         
## [3,] "Cst7,Nr1d1,Syt11,Tnfrsf1b"                                             
## [4,] "Gclc,Gclm,Nfe2l2,Slc7a11"                                              
## [5,] "Atf3,Bcl2l1,Fem1b,Lgals3,Serpine1,Skil,Tmbim1,Tmc8"

Output results to files

write.table(up.tab, file = paste0(outTitle, "_up.txt"), sep = "\t", quote = F, 
        row.names = F, col.names = T)

write.table(dn.tab, file = paste0(outTitle, "_dn.txt"), sep = "\t", quote = F, 
        row.names = F, col.names = T)

Session information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/r4/lib/libopenblasp-r0.3.12.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=en_GB.UTF-8     LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods  
## [9] base     
## 
## other attached packages:
##  [1] SparseM_1.81         ggplot2_3.3.3        org.Mm.eg.db_3.12.0  AnnotationDbi_1.52.0
##  [5] IRanges_2.24.1       S4Vectors_0.28.1     Biobase_2.50.0       topGO_2.41.0        
##  [9] graph_1.68.0         BiocGenerics_0.36.0  knitr_1.31          
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.6         lattice_0.20-41    GO.db_3.12.1       png_0.1-7         
##  [5] assertthat_0.2.1   digest_0.6.27      utf8_1.1.4         R6_2.5.0          
##  [9] RSQLite_2.2.3      evaluate_0.14      highr_0.8          pillar_1.5.0      
## [13] rlang_0.4.10       data.table_1.14.0  jquerylib_0.1.3    blob_1.2.1        
## [17] rmarkdown_2.7      labeling_0.4.2     stringr_1.4.0      bit_4.0.4         
## [21] munsell_0.5.0      compiler_4.0.3     xfun_0.21          pkgconfig_2.0.3   
## [25] htmltools_0.5.1.1  tidyselect_1.1.0   tibble_3.1.0       matrixStats_0.58.0
## [29] fansi_0.4.2        crayon_1.4.1       dplyr_1.0.4        withr_2.4.1       
## [33] grid_4.0.3         jsonlite_1.7.2     gtable_0.3.0       lifecycle_1.0.0   
## [37] DBI_1.1.1          magrittr_2.0.1     scales_1.1.1       stringi_1.5.3     
## [41] cachem_1.0.4       farver_2.1.0       bslib_0.2.4        ellipsis_0.3.1    
## [45] vctrs_0.3.6        generics_0.1.0     tools_4.0.3        bit64_4.0.5       
## [49] glue_1.4.2         purrr_0.3.4        fastmap_1.1.0      yaml_2.2.1        
## [53] colorspace_2.0-0   memoise_2.0.0      sass_0.3.1