16S-rDNA-V3-V4 repository: https://github.com/ycl6/16S-rDNA-V3-V4

phyloseq: GitHub, Documentation, Paper

GUniFrac: Paper

Demo Dataset: PRJEB27564 from Gut Microbiota in Parkinson’s Disease: Temporal Stability and Relations to Disease Progression. EBioMedicine. 2019;44:691-707

License: GPL-3.0


Workflow (Continues from Part 1)

Start R

cd /ngs/16S-Demo/PRJEB27564

R

Load package and set path

library("data.table")
library("phyloseq")
library("DESeq2")
library("ggplot2")
library("ggbeeswarm")
library("ggrepel")
library("vegan")
library("tidyverse")

Set the full-path to additional scripts

Note: Remember to change the PATH to the script accordingly

source("/path-to-script/taxa_summary.R", local = TRUE)

Load saved workspace from Part 1

load("1_dada2_tutorial.RData")

Check phyloseq object

ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7714 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 7714 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7714 tips and 7712 internal nodes ]

Filter Taxa

Subset ps using subset_taxa

Here, we limit the taxa to those from Bacteria, Phylum and Class in not unassigned, and not belonging to the Mitochondria Family

ps0 = subset_taxa(ps, Kingdom == "Bacteria" & !is.na(Phylum) & !is.na(Class) & 
          Family != "Mitochondria")

Create a total counts data.table

tdt = data.table(setDT(as.data.frame(tax_table(ps0))), 
         TotalCounts = taxa_sums(ps0), SV = taxa_names(ps0))

tdt
##        Kingdom     Phylum      Class           Order          Family           Genus
##    1: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    2: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    3: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    4: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##    5: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##   ---                                                                               
## 6698: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6699: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6700: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6701: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 6702: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##       Species TotalCounts     SV
##    1:    <NA>         187 SV2272
##    2:    <NA>          76 SV3398
##    3:    <NA>          13 SV5815
##    4:    <NA>        4873 SV0306
##    5:    <NA>          80 SV3349
##   ---                           
## 6698:    <NA>        4273 SV0343
## 6699:    <NA>          28 SV4887
## 6700:    <NA>          36 SV4481
## 6701:    <NA>          19 SV5326
## 6702:    <NA>      456881 SV0001
ggplot(tdt, aes(TotalCounts)) + geom_histogram(bins = 50) + theme_bw() + 
    ggtitle("Histogram of Total Counts")

tdt[(TotalCounts == 1), .N]     # singletons
## [1] 4
tdt[(TotalCounts == 2), .N]     # doubletons
## [1] 243
# taxa cumulative sum
taxcumsum = tdt[, .N, by = TotalCounts]
setkey(taxcumsum, TotalCounts)
taxcumsum[, CumSum := cumsum(N)]

# Plot the cumulative sum of ASVs against the total counts
pCumSum = ggplot(taxcumsum, aes(TotalCounts, CumSum)) + geom_point() + theme_bw() + 
    xlab("Filtering Threshold") + ylab("ASV Filtered")

png("images/FilterTaxa-taxa-abundance.png", width = 8, height = 8, units = "in", res = 300)
gridExtra::grid.arrange(pCumSum, pCumSum + xlim(0, 500), 
            pCumSum + xlim(0, 100), pCumSum + xlim(0, 50), nrow = 2, 
            top = "ASVs that would be filtered vs. minimum taxa counts threshold")
invisible(dev.off())
"images/FilterTaxa-taxa-abundance.png"

“images/FilterTaxa-taxa-abundance.png”

Create a prevalence data.table

mdt = fast_melt(ps0)

mdt
##          Kingdom     Phylum      Class           Order          Family           Genus
##      1: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      2: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      3: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      4: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##      5: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##     ---                                                                               
## 844448: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 844449: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 844450: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 844451: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
## 844452: Bacteria Firmicutes Clostridia Oscillospirales Ruminococcaceae Subdoligranulum
##         Species TaxaID SampleID count RelativeAbundance
##      1:    <NA> SV0001   C0005B  4573        0.08854167
##      2:    <NA> SV0001   C0009B   910        0.01713231
##      3:    <NA> SV0001   C0019B  2059        0.02681269
##      4:    <NA> SV0001   C0020B  1630        0.03978424
##      5:    <NA> SV0001   C0024B 25660        0.24607536
##     ---                                                
## 844448:    <NA> SV7714   P0120F     0        0.00000000
## 844449:    <NA> SV7714   P0045F     0        0.00000000
## 844450:    <NA> SV7714   P0045B     0        0.00000000
## 844451:    <NA> SV7714   P0083F     0        0.00000000
## 844452:    <NA> SV7714   P0083B     0        0.00000000
prevdt = mdt[, list(Prevalence = sum(count > 0), TotalCounts = sum(count), 
            MaxCounts = max(count)), by = TaxaID]

prevdt
##       TaxaID Prevalence TotalCounts MaxCounts
##    1: SV0001        125      456881     25660
##    2: SV0002         95      325950     20183
##    3: SV0003        109      310049     18095
##    4: SV0004        117      295543     17405
##    5: SV0005        111      283096     19687
##   ---                                        
## 6698: SV7706          1           2         2
## 6699: SV7709          1           1         1
## 6700: SV7710          1           1         1
## 6701: SV7713          1           1         1
## 6702: SV7714          1           1         1
ggplot(prevdt, aes(Prevalence)) + geom_histogram(bins = 50) + theme_bw() +
    ggtitle("Histogram of Taxa Prevalence")

prevdt[(Prevalence == 1), .N]   # singletons
## [1] 3077
prevdt[(Prevalence == 2), .N]   # doubletons
## [1] 1215
ggplot(prevdt, aes(MaxCounts)) + geom_histogram(bins = 100) + xlim(0, 500) + theme_bw() + 
    ggtitle("Histogram of Maximum TotalCounts")

table(prevdt$MaxCounts)[1:50]
## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20  21  22 
##   4 256 249 202 192 169 172 126 135 117 109 109 100  92  89  87  76  84  59  62  76  66 
##  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44 
##  49  48  44  56  49  57  49  44  50  44  52  35  47  41  43  39  41  43  29  33  35  35 
##  45  46  47  48  49  50 
##  29  39  30  36  32  27
prevdt[(MaxCounts == 1)]    # singletons
##    TaxaID Prevalence TotalCounts MaxCounts
## 1: SV7709          1           1         1
## 2: SV7710          1           1         1
## 3: SV7713          1           1         1
## 4: SV7714          1           1         1
# taxa cumulative sum
prevcumsum = prevdt[, .N, by = Prevalence]
setkey(prevcumsum, Prevalence)
prevcumsum[, CumSum := cumsum(N)]

# Plot the cumulative sum of ASVs against the prevalence
pPrevCumSum = ggplot(prevcumsum, aes(Prevalence, CumSum)) + geom_point(size = 2, alpha = 0.5) + 
    theme_bw() + xlab("Filtering Threshold") + ylab("ASVs Filtered") + 
    ggtitle("ASVs that would be filtered vs. minimum sample count threshold")

png("images/FilterTaxa-taxa-prevalence.png", width = 8, height = 8, units = "in", res = 300)
pPrevCumSum
invisible(dev.off())
"images/FilterTaxa-taxa-prevalence.png"

“images/FilterTaxa-taxa-prevalence.png”

Prevalence vs. Total Count Scatter plot

png("images/FilterTaxa-Prevalence-TotalCounts.png", width = 8, height = 8, units = "in", res = 300)
ggplot(prevdt, aes(Prevalence, TotalCounts)) + geom_point(size = 2, alpha = 0.5) + 
    scale_y_log10() + theme_bw() + xlab("Prevalence [No. Samples]") + ylab("TotalCounts [Taxa]")
invisible(dev.off())
"images/FilterTaxa-Prevalence-TotalCounts.png"

“images/FilterTaxa-Prevalence-TotalCounts.png”

Colored by phylum

addPhylum = unique(copy(mdt[, list(TaxaID, Phylum)]))

# Join by TaxaID
setkey(prevdt, TaxaID)
setkey(addPhylum, TaxaID)
prevdt = addPhylum[prevdt]
setkey(prevdt, Phylum)

png("images/FilterTaxa-Prevalence-TotalCounts-Phylum.png", width = 8, height = 8, 
    units = "in", res = 300)
ggplot(prevdt, aes(Prevalence, TotalCounts, color = Phylum)) + 
    geom_point(size = 1, alpha = 0.5) + scale_y_log10() + theme_bw() + 
    facet_wrap(~Phylum, nrow = 4) + theme(legend.position="none") + 
    xlab("Prevalence [No. Samples]") + ylab("Total Abundance")
invisible(dev.off())
"images/FilterTaxa-Prevalence-TotalCounts-Phylum.png"

“images/FilterTaxa-Prevalence-TotalCounts-Phylum.png”

Define filter threshold

Remember to review the images/FilterTaxa-*.png plots to select the best paramters

prevalenceThreshold = 5
abundanceThreshold = 10
maxThreshold = 5

Perform taxa filtering

keepTaxa = prevdt[(Prevalence > prevalenceThreshold & 
           TotalCounts > abundanceThreshold & 
           MaxCounts > maxThreshold), TaxaID]

ps1 = prune_taxa(keepTaxa, ps0)

phy_tree(ps1) = ape::root(phy_tree(ps1), sample(taxa_names(ps1), 1), resolve.root = TRUE)

ps1
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1450 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 1450 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1450 tips and 1449 internal nodes ]

Create output files

Create FASTA file

dada2::uniquesToFasta(df[rownames(df) %in% taxa_names(ps1),], "outfiles/expr.asv.fasta", 
              ids = df[rownames(df) %in% taxa_names(ps1),]$id)

Create BIOM file

biomformat::write_biom(biomformat::make_biom(data = t(as.matrix(otu_table(ps1)))), 
               "outfiles/expr.biom")

Create ASV and Taxonomy tables

write.table(as.data.table(otu_table(ps1), keep.rownames = T), file = "outfiles/expr.otu_table.txt", 
        sep = "\t", quote = F, row.names = F, col.names = T)
write.table(as.data.table(tax_table(ps1), keep.rownames = T), file = "outfiles/expr.tax_table.txt", 
        sep = "\t", quote = F, row.names = F, col.names = T)

Create raw abundance tables

write.table(ps1 %>% psmelt() %>% arrange(OTU) %>% rename(ASV = OTU) %>% 
        select(ASV, Kingdom, Phylum, Class, Order, Family, Genus, Species, Sample_ID, Abundance) %>% 
        spread(Sample_ID, Abundance),
        file = "outfiles/expr.abundance.all.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Phylum") %>% psmelt() %>% 
        select(Phylum, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance),
        file = "outfiles/expr.abundance.abphy.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Class") %>% psmelt() %>% 
        select(Class, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance),
        file = "outfiles/expr.abundance.abcls.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Family") %>% psmelt() %>%
        select(Family, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance),
        file = "outfiles/expr.abundance.abfam.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Genus") %>% psmelt() %>%
        select(Genus, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance),
        file = "outfiles/expr.abundance.abgen.txt", sep = "\t", 
        quote = F, row.names = F, col.names = T)

Create relative abundance tables

Use the transform_sample_counts function to transforms the sample counts of a taxa abundance matrix according to the provided function, in the case the fractions of the whole sum

write.table(ps1 %>% transform_sample_counts(function(x) {x/sum(x)} ) %>% psmelt() %>% 
        arrange(OTU) %>% rename(ASV = OTU) %>% 
        select(ASV, Kingdom, Phylum, Class, Order, Family, Genus, Species, Sample_ID, Abundance) %>%
        spread(Sample_ID, Abundance), 
    file = "outfiles/expr.relative_abundance.all.txt", 
    sep = "\t", quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Phylum") %>% 
        transform_sample_counts(function(x) {x/sum(x)} ) %>% psmelt() %>%
        select(Phylum, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance), 
    file = "outfiles/expr.relative_abundance.abphy.txt", 
    sep = "\t", quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Class") %>%
        transform_sample_counts(function(x) {x/sum(x)} ) %>% psmelt() %>% 
        select(Class, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance), 
    file = "outfiles/expr.relative_abundance.abcls.txt", 
    sep = "\t", quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Family") %>%
        transform_sample_counts(function(x) {x/sum(x)} ) %>% psmelt() %>% 
        select(Family, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance), 
    file = "outfiles/expr.relative_abundance.abfam.txt", 
    sep = "\t", quote = F, row.names = F, col.names = T)
write.table(ps1 %>% tax_glom(taxrank = "Genus") %>% 
        transform_sample_counts(function(x) {x/sum(x)} ) %>% psmelt() %>% 
        select(Genus, Sample_ID, Abundance) %>% spread(Sample_ID, Abundance), 
    file = "outfiles/expr.relative_abundance.abgen.txt", 
    sep = "\t", quote = F, row.names = F, col.names = T)

Plot abundance

# Transform to proportions (relative abundances)
ps1.rp = transform_sample_counts(ps1, function(OTU) OTU/sum(OTU))

# Top N taxa
N = 200
topN = names(sort(taxa_sums(ps1), decreasing=TRUE))[1:N]
ps1.topN = transform_sample_counts(ps1, function(OTU) OTU/sum(OTU))
ps1.topN = prune_taxa(topN, ps1.topN)

ps1.topN
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 200 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 7 sample variables ]
## tax_table()   Taxonomy Table:    [ 200 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 200 tips and 199 internal nodes ]

At Phylum level

# All taxa
ptaxa1 = plot_bar(ps1.rp, x = "Sample_ID", fill = "Phylum", 
          title = paste(ntaxa(ps1.rp), "Taxa (Phylum)")) + 
    geom_bar(stat = "identity", size = 0.1, color = "black") + 
    facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) + 
    scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
    theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7), 
          axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1), 
          axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative abundance")

# Top 200 taxa
ptaxa2 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Phylum", 
          title = paste("Top",N, "Taxa (Phylum)")) + 
    geom_bar(stat = "identity", size = 0.1, color = "black") + 
    facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) + 
    scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
    theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7), 
          axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1), 
          axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_phylum.png", width = 10, height = 8, units = "in", res = 300)
gridExtra::grid.arrange(ptaxa1, ptaxa2, nrow = 2)
invisible(dev.off())
"images/plot_bar_phylum.png"

“images/plot_bar_phylum.png”

At Class level

ptaxa3 = plot_bar(ps1.rp, x = "Sample_ID", fill = "Class", 
          title = paste(ntaxa(ps1.rp), "Taxa (Class)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) +
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative abundance")

ptaxa4 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Class", 
          title = paste("Top",N, "Taxa (Class)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) +
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_class.png", width = 10, height = 8, units = "in", res = 300)
gridExtra::grid.arrange(ptaxa3, ptaxa4, nrow = 2)
invisible(dev.off())
"images/plot_bar_class.png"

“images/plot_bar_class.png”

At Family level

ptaxa5 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Family", 
          title = paste("Top",N, "Taxa (Family)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 1)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_family.png", width = 10, height = 8, units = "in", res = 300)
ptaxa5
invisible(dev.off())
"images/plot_bar_family.png"

“images/plot_bar_family.png”

At Genus level

ptaxa6 = plot_bar(ps1.topN, x = "Sample_ID", fill = "Genus", 
          title = paste("Top",N, "Taxa (Genus)")) +
        geom_bar(stat = "identity", size = 0.1, color = "black") +
        facet_wrap(~Group2, scales = "free_x", nrow = 1) + guides(fill = guide_legend(ncol = 2)) +
        scale_y_continuous(breaks = seq(0, 1, 0.1)) + 
        theme(legend.title = element_text(size = 8), legend.text = element_text(size = 7),
              axis.text.x = element_text(size = 5, angle = 90, vjust = 0.5, hjust = 1),
              axis.text.y = element_text(size = 12)) + xlab("Sample") + ylab("Relative Abundance")

png("images/plot_bar_genus.png", width = 10, height = 8, units = "in", res = 300)
ptaxa6
invisible(dev.off())
"images/plot_bar_genus.png"

“images/plot_bar_genus.png”

Alpha diversity

Explore alpha diversity using the plot_richness function

# Select alpha-diversity measures
divIdx = c("Observed", "Chao1", "Shannon", "Simpson")

png("images/plot_richness.png", width = 6, height = 5, units = "in", res = 300)
plot_richness(ps1, x = "Group2", measures = divIdx, color = "Group2", nrow = 1) + 
    geom_point(size = 0.8) + theme_bw() + 
    theme(legend.position = "none", 
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
    labs(x = "Group", y = "Alpha Diversity Measure")
invisible(dev.off())
"images/plot_richness.png"

“images/plot_richness.png”

Create a long-format data.frame and plot with ggplot function

ad = estimate_richness(ps1, measures = divIdx)
ad = merge(data.frame(sample_data(ps1)), ad, by = "row.names")
ad = ad %>% select(Sample_ID, Group2, all_of(divIdx)) %>% 
    gather(key = "alpha", value = "measure", -c(Sample_ID, Group2))

png("images/plot_richness_boxplot.png", width = 6, height = 5, units = "in", res = 300)
ggplot(ad, aes(Group2, measure, color = Group2)) + 
    geom_boxplot(outlier.shape = NA, size = 0.8, width = 0.8) + 
    geom_quasirandom(size = 0.8, color = "black") + theme_bw() + 
    facet_wrap(~ alpha, scales = "free_y", nrow = 1) + 
    theme(legend.position = "none", 
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) + 
    labs(x = "Group", y = "Alpha Diversity Measure")
invisible(dev.off())
"images/plot_richness_boxplot.png"

“images/plot_richness_boxplot.png”

Beta diversity

Perform various flavors of unifrac measurements using the GUniFrac package

unifracs = GUniFrac::GUniFrac(otu_table(ps1), phy_tree(ps1), alpha = c(0, 0.5, 1))$unifracs

Perform ordination

# Calculate geometric means prior to estimate size factors
gm_mean = function(x, na.rm = TRUE){
    exp(sum(log(x[x > 0]), na.rm = na.rm) / length(x))
}

# Perform variance stabilizing transformation
ps1.vsd = ps1
dds = phyloseq_to_deseq2(ps1, ~ Group2)
## converting counts to integer mode
geoMeans = apply(counts(dds), 1, gm_mean)
dds = estimateSizeFactors(dds, geoMeans = geoMeans)
dds = estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
abund.vsd = getVarianceStabilizedData(dds)
abund.vsd[abund.vsd < 0] = 0    # set negative values to 0
otu_table(ps1.vsd) = otu_table(abund.vsd, taxa_are_rows = TRUE)

# Create distance objects
dist_un = as.dist(unifracs[, , "d_UW"])       # Unweighted UniFrac
attr(dist_un, "method") = "Unweighted UniFrac"

dist_wu = as.dist(unifracs[, , "d_1"])        # Weighted UniFrac
attr(dist_wu, "method") = "Weighted UniFrac"

dist_vu = as.dist(unifracs[, , "d_VAW"])      # Variance-adjusted-weighted UniFrac
attr(dist_vu, "method") = "Variance-adjusted-weighted UniFrac"

dist_gu = as.dist(unifracs[, , "d_0.5"])      # GUniFrac with alpha 0.5
attr(dist_gu, "method") = "GUniFrac with alpha 0.5"

dist_bc = phyloseq::distance(ps1.vsd, "bray") # Bray-Curtis

# Perform ordination
ord_un = ordinate(ps1, method = "PCoA", distance = dist_un)
ord_wu = ordinate(ps1, method = "PCoA", distance = dist_wu)
ord_vu = ordinate(ps1, method = "PCoA", distance = dist_vu)
ord_gu = ordinate(ps1, method = "PCoA", distance = dist_gu)
set.seed(12345)
ord_bc = ordinate(ps1, method = "NMDS", distance = dist_bc)
## Run 0 stress 0.164095 
## Run 1 stress 0.164095 
## ... Procrustes: rmse 0.0003787895  max resid 0.003675405 
## ... Similar to previous best
## Run 2 stress 0.4123708 
## Run 3 stress 0.1640947 
## ... New best solution
## ... Procrustes: rmse 0.0002492135  max resid 0.002473993 
## ... Similar to previous best
## Run 4 stress 0.1640948 
## ... Procrustes: rmse 0.0001457638  max resid 0.001053662 
## ... Similar to previous best
## Run 5 stress 0.1657887 
## Run 6 stress 0.1640946 
## ... New best solution
## ... Procrustes: rmse 0.00008415684  max resid 0.0008498087 
## ... Similar to previous best
## Run 7 stress 0.164095 
## ... Procrustes: rmse 0.0002501875  max resid 0.002166513 
## ... Similar to previous best
## Run 8 stress 0.1640946 
## ... New best solution
## ... Procrustes: rmse 0.0001519537  max resid 0.001130737 
## ... Similar to previous best
## Run 9 stress 0.1657897 
## Run 10 stress 0.1640946 
## ... Procrustes: rmse 0.0001009962  max resid 0.0009432627 
## ... Similar to previous best
## Run 11 stress 0.165789 
## Run 12 stress 0.1640952 
## ... Procrustes: rmse 0.0003352119  max resid 0.002984929 
## ... Similar to previous best
## Run 13 stress 0.1640948 
## ... Procrustes: rmse 0.0002358344  max resid 0.001624603 
## ... Similar to previous best
## Run 14 stress 0.1640947 
## ... Procrustes: rmse 0.00006278543  max resid 0.0003796508 
## ... Similar to previous best
## Run 15 stress 0.1907991 
## Run 16 stress 0.1657896 
## Run 17 stress 0.1657889 
## Run 18 stress 0.1657893 
## Run 19 stress 0.1640951 
## ... Procrustes: rmse 0.0003231099  max resid 0.002735841 
## ... Similar to previous best
## Run 20 stress 0.1657883 
## *** Solution reached

Output to PDF

pdf("images/plot_ordination.pdf", width = 6, height = 5, pointsize = 12)
plot_ordination(ps1, ord_un, type = "samples", color = "Group2", 
        title = "PCoA on unweighted UniFrac") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_wu, type = "samples", color = "Group2", 
        title = "PCoA on weighted UniFrac") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_vu, type = "samples", color = "Group2", 
        title = "PCoA on Variance adjusted weighted UniFrac") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_gu, type = "samples", color = "Group2", 
        title = "PCoA on GUniFrac with alpha 0.5") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")

plot_ordination(ps1, ord_bc, type = "samples", color = "Group2", 
        title = "NMDS on Bray-Curtis distance (VST)") + 
        theme_bw() + coord_fixed(ratio = 1) + 
        stat_ellipse(aes(group = Group2), type = "t", size = 0.5, linetype = "dashed")
invisible(dev.off())

Example: PCoA on weighted UniFrac

Multivariate analyses

PERMANOVA

Permutational multivariate analysis of variance (PERMANOVA) test for differences between independent groups using the adonis function

set.seed(12345)
adonis(dist_un ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_un ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## Group2      3    0.3596 0.11988  1.1438 0.02736 0.1499
## Residuals 122   12.7869 0.10481         0.97264       
## Total     125   13.1465                 1.00000
set.seed(12345)
adonis(dist_wu ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_wu ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs  MeanSqs F.Model      R2 Pr(>F)
## Group2      3    0.1706 0.056865  1.2852 0.03063 0.1459
## Residuals 122    5.3982 0.044248         0.96937       
## Total     125    5.5688                  1.00000
set.seed(12345)
adonis(dist_vu ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_vu ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs    MeanSqs  F.Model       R2 Pr(>F)
## Group2      3   -0.0181 -0.0060325 -0.46755 -0.01163 0.9071
## Residuals 122    1.5741  0.0129022           1.01163       
## Total     125    1.5560                      1.00000
set.seed(12345)
adonis(dist_gu ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_gu ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## Group2      3    0.3953 0.13176  1.1215 0.02684 0.2068
## Residuals 122   14.3330 0.11748         0.97316       
## Total     125   14.7283                 1.00000
set.seed(12345)
adonis(dist_bc ~ Group2, data.frame(sample_data(ps1)), permutations = 1000)
## 
## Call:
## adonis(formula = dist_bc ~ Group2, data = data.frame(sample_data(ps1)),      permutations = 1000) 
## 
## Permutation: free
## Number of permutations: 1000
## 
## Terms added sequentially (first to last)
## 
##            Df SumsOfSqs MeanSqs F.Model      R2 Pr(>F)
## Group2      3    0.6691 0.22304  1.1532 0.02758 0.1239
## Residuals 122   23.5960 0.19341         0.97242       
## Total     125   24.2652                 1.00000

Multivariate dispersion

Test for homogeneity condition among groups using the betadisper function

disp1 = betadisper(dist_un, group = data.frame(sample_data(ps1))$Group2)
set.seed(12345)
permutest(disp1, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df   Sum Sq    Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.001941 0.00064713 0.3975   1000 0.7493
## Residuals 122 0.198631 0.00162812
disp2 = betadisper(dist_wu, group = data.frame(sample_data(ps1))$Group2) 
set.seed(12345)
permutest(disp2, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.01620 0.0053986 1.7576   1000 0.1548
## Residuals 122 0.37473 0.0030715
disp3 = betadisper(dist_vu, group = data.frame(sample_data(ps1))$Group2)
set.seed(12345)
permutest(disp3, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df   Sum Sq    Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.000999 0.00033284 0.3408   1000 0.8232
## Residuals 122 0.119158 0.00097671
disp4 = betadisper(dist_gu, group = data.frame(sample_data(ps1))$Group2)
set.seed(12345)
permutest(disp4, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df   Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.006557 0.0021856 0.8989   1000 0.4555
## Residuals 122 0.296638 0.0024315
disp5 = betadisper(dist_bc, group = data.frame(sample_data(ps1))$Group2)
set.seed(12345)
permutest(disp5, permutations = 1000)
## 
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 1000
## 
## Response: Distances
##            Df  Sum Sq   Mean Sq      F N.Perm Pr(>F)
## Groups      3 0.01731 0.0057686 1.6737   1000 0.1898
## Residuals 122 0.42049 0.0034467

Example: Dispersion of unweighted UniFrac

plot(disp3, hull = FALSE, ellipse = TRUE)

Save current workspace

save.image(file = "2_phyloseq_tutorial.RData")

Session information

sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3_4.10.0/envs/r4-16S/lib/libopenblasp-r0.3.15.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=en_GB.UTF-8     LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets  methods  
## [9] base     
## 
## other attached packages:
##  [1] forcats_0.5.1               stringr_1.4.0               dplyr_1.0.7                
##  [4] purrr_0.3.4                 readr_1.4.0                 tidyr_1.1.3                
##  [7] tibble_3.1.2                tidyverse_1.3.1             vegan_2.5-7                
## [10] lattice_0.20-44             permute_0.9-5               ggrepel_0.9.1              
## [13] ggbeeswarm_0.6.0            ggplot2_3.3.4               DESeq2_1.32.0              
## [16] SummarizedExperiment_1.22.0 Biobase_2.52.0              MatrixGenerics_1.4.0       
## [19] matrixStats_0.59.0          GenomicRanges_1.44.0        GenomeInfoDb_1.28.0        
## [22] IRanges_2.26.0              S4Vectors_0.30.0            BiocGenerics_0.38.0        
## [25] phyloseq_1.36.0             data.table_1.14.0           knitr_1.33                 
## 
## loaded via a namespace (and not attached):
##   [1] colorspace_2.0-2         hwriter_1.3.2            ellipsis_0.3.2          
##   [4] XVector_0.32.0           fs_1.5.0                 rstudioapi_0.13         
##   [7] farver_2.1.0             bit64_4.0.5              AnnotationDbi_1.54.0    
##  [10] fansi_0.4.2              lubridate_1.7.10         xml2_1.3.2              
##  [13] codetools_0.2-18         splines_4.1.0            cachem_1.0.5            
##  [16] dada2_1.20.0             geneplotter_1.70.0       ade4_1.7-17             
##  [19] jsonlite_1.7.2           Rsamtools_2.8.0          broom_0.7.8             
##  [22] annotate_1.70.0          cluster_2.1.2            dbplyr_2.1.1            
##  [25] png_0.1-7                compiler_4.1.0           httr_1.4.2              
##  [28] backports_1.2.1          assertthat_0.2.1         Matrix_1.3-4            
##  [31] fastmap_1.1.0            cli_2.5.0                htmltools_0.5.1.1       
##  [34] tools_4.1.0              igraph_1.2.6             gtable_0.3.0            
##  [37] glue_1.4.2               GenomeInfoDbData_1.2.6   reshape2_1.4.4          
##  [40] ShortRead_1.50.0         Rcpp_1.0.6               cellranger_1.1.0        
##  [43] vctrs_0.3.8              Biostrings_2.60.0        rhdf5filters_1.4.0      
##  [46] multtest_2.48.0          ape_5.5                  nlme_3.1-152            
##  [49] iterators_1.0.13         xfun_0.24                ps_1.6.0                
##  [52] rvest_1.0.0              lifecycle_1.0.0          XML_3.99-0.6            
##  [55] zlibbioc_1.38.0          MASS_7.3-54              scales_1.1.1            
##  [58] hms_1.1.0                biomformat_1.20.0        rhdf5_2.36.0            
##  [61] RColorBrewer_1.1-2       yaml_2.2.1               gridExtra_2.3           
##  [64] memoise_2.0.0            latticeExtra_0.6-29      stringi_1.6.2           
##  [67] RSQLite_2.2.5            highr_0.9                genefilter_1.74.0       
##  [70] foreach_1.5.1            BiocParallel_1.26.0      rlang_0.4.11            
##  [73] pkgconfig_2.0.3          bitops_1.0-7             evaluate_0.14           
##  [76] Rhdf5lib_1.14.0          labeling_0.4.2           GenomicAlignments_1.28.0
##  [79] bit_4.0.4                tidyselect_1.1.1         plyr_1.8.6              
##  [82] magrittr_2.0.1           R6_2.5.0                 generics_0.1.0          
##  [85] DelayedArray_0.18.0      DBI_1.1.1                pillar_1.6.1            
##  [88] haven_2.4.1              withr_2.4.2              mgcv_1.8-36             
##  [91] survival_3.2-11          KEGGREST_1.32.0          RCurl_1.98-1.3          
##  [94] modelr_0.1.8             crayon_1.4.1             utf8_1.2.1              
##  [97] rmarkdown_2.9            jpeg_0.1-8.1             locfit_1.5-9.4          
## [100] grid_4.1.0               readxl_1.3.1             blob_1.2.1              
## [103] reprex_2.0.0             digest_0.6.27            xtable_1.8-4            
## [106] RcppParallel_5.1.4       munsell_0.5.0            beeswarm_0.4.0          
## [109] vipor_0.4.5