scRUtils 0.1.0
scRUtils
provides various utilities for visualising and functional analysis of RNA-seq data,
particularly single-cell dataset. It evolved from a collection of helper functions that were
used in our in-house scRNA-seq processing workflow.
The documentation of this package is divided into 5 sections:
This vignette (#5) will show how the demo datasets in this package were created. The codes were not evaluated to minimise the time needed to build the vignette.
phases_assignments
from cyclone
resultsA mock reference (ref
) with 1000 cells and the expression of 2000 genes was created using
mockSCE()
from the scuttle package, and its gene expression data was used to
train a classifier for cell cycle phase using sandbag()
from the scran package.
A second mock object (phases
) with 200 cells was created and the cells were classified into
their cell cycle phases using cyclone()
from the scran package. The results were
saved as a RData file.
library(scuttle)
library(scran)
set.seed(10010)
ref <- mockSCE(ncells = 1000)
# Constructing a classifier:
is.G1 <- which(ref$Cell_Cycle %in% c("G1", "G0"))
is.S <- which(ref$Cell_Cycle == "S")
is.G2M <- which(ref$Cell_Cycle == "G2M")
out <- sandbag(ref, list(G1 = is.G1, S = is.S, G2M = is.G2M))
set.seed(10010)
phases <- mockSCE(ncells = 200)
# Run cyclone
phases_assignments <- cyclone(phases, out)
save(phases_assignments, file = "phases_assignments.rda", compress = "xz")
dbl_results
from findDoubletClusters
resultsA SingleCellExperiment
object with 3 clusters of cells and the expression of 200 genes was
created using mockDoubletSCE()
from the scDblFinder package. Using the same
package, the doublet-ness of each cluster was computed using findDoubletClusters()
. The
results were saved as a RData file.
library(SingleCellExperiment)
library(scDblFinder)
set.seed(10010)
dbl <- mockDoubletSCE(ncells = c(100, 300, 250), ngenes = 200)
# Run findDoubletClusters
dbl_results <- findDoubletClusters(counts(dbl), dbl$cluster)
save(dbl_results, file = "dbl_results.rda", compress = "xz")
SingleCellExperiment
objectA mock cell type reference (ref
) with 20 samples and the expression of 1000 genes was
created using the SingleR package, which was used in the next section to annotate
cell types. A second mock object (sce
) with 250 cells was created using .mockTestData()
and
.mockRefData()
. The genes were given mocked up symbols.
library(SingleR)
library(scater)
library(scran)
# Generate 1000 random gene symbols
set.seed(10010)
symbols <- paste0(do.call(paste0, replicate(3, sample(LETTERS, 1000, TRUE), FALSE)),
do.call(paste0, replicate(2, sample(1:9, 1000, TRUE), FALSE)))
# table(table(symbols) == 1) # check uniqueness
# Create mock reference for SingleR
set.seed(10010)
ref <- .mockRefData()
colnames(ref) <- paste0("SAMPLE_", seq_len(ncol(ref)))
rownames(ref) <- symbols
ref <- logNormCounts(ref)
ref$label <- paste("Type", as.numeric(factor(ref$label)))
# Create mock SingleCellExperiment
set.seed(10010)
sce <- as(.mockTestData(.mockRefData(), ncells = 250), "SingleCellExperiment")
colnames(sce) <- paste0("CELL_", seq_len(ncol(sce)))
rownames(sce) <- symbols
Then, the sce
object is processed as follow:
addPerCellQC()
and addPerFeatureQC
.logNormCounts()
.getTopHVGs()
based on the variance modelling statistics
from modelGeneVar()
.SingleR()
with mock reference (ref
).findMarkers()
.The resulting object was saved as a RData file.
sce <- addPerCellQC(sce)
sce <- addPerFeatureQC(sce)
sce <- logNormCounts(sce)
dec <- modelGeneVar(sce)
hvg <- getTopHVGs(dec, n = 200, var.threshold = 0)
metadata(sce)[['modelGeneVar']] <- dec
metadata(sce)[['HVG']] <- hvg
set.seed(10010)
sce <- runPCA(sce)
set.seed(10010)
sce <- runTSNE(sce)
set.seed(10010)
sce <- runUMAP(sce)
# Run SingleR
pred <- SingleR(sce, ref, labels = ref$label)
sce$CellType <- pred$pruned.labels
metadata(sce)[['SingleR']] <- pred
# Run findMarkers
out1 <- findMarkers(sce, groups = sce$label, pval.type = "any")
metadata(sce)[['findMarkers1']] <- out1
out2 <- findMarkers(sce, groups = sce$label, pval.type = "all")
metadata(sce)[['findMarkers2']] <- out2
out3 <- findMarkers(sce, groups = sce$label, pval.type = "any", min.prop = 0.5)
metadata(sce)[['findMarkers3']] <- out3
out4 <- findMarkers(sce, groups = sce$label, test.type = "wilcox")
metadata(sce)[['findMarkers4']] <- out4
save(sce, file = "sce.rda", compress = "xz")
pasilla
RNA-seq datasetThe pasilla package containing the Drosophila melanogaster RNA-seq with RNAi knockdown experiment of Pasilla was used as the demo dataset for generating results following differential expression (DE) analysis by edgeR and DESeq2.
library(pasilla)
# Get data from the pasilla package
pasCts <- system.file("extdata", "pasilla_gene_counts.tsv",
package = "pasilla", mustWork = TRUE)
pasAnno <- system.file("extdata", "pasilla_sample_annotation.csv",
package = "pasilla", mustWork = TRUE)
cts <- as.matrix(read.csv(pasCts, sep = "\t", row.names = "gene_id"))
coldata <- read.csv(pasAnno, row.names = 1)
The pasilla
RNA-seq data was processed to harmonise the samples in the provided count and
annotation data files. Additional gene annotation were obtained from BioMart using the
biomaRt package, genes that have annotations were retained for DE analysis.
library(biomaRt)
# Prepare sample annotations
coldata <- coldata[,c("condition","type")]
coldata$condition <- factor(coldata$condition)
coldata$condition <- relevel(coldata$condition, ref = "untreated")
coldata$type <- factor(coldata$type)
levels(coldata$type) <- sub("-", "_", levels(coldata$type))
rownames(coldata) <- sub("fb", "", rownames(coldata))
cts <- cts[, rownames(coldata)]
# Use BioMart for gene annotations
ensembl <- useEnsembl(biomart = "genes", dataset = "dmelanogaster_gene_ensembl")
geneAnno <- getBM(attributes = c("chromosome_name","start_position",
"end_position","ensembl_gene_id",
"external_gene_name","strand","gene_biotype"),
filters = "ensembl_gene_id", values = rownames(cts), mart = ensembl)
# Keep genes with annotation
cts <- cts[geneAnno$ensembl_gene_id,]
all(rownames(cts) == geneAnno$ensembl_gene_id)
DESeqResults
object res_deseq2
The DESeq2 package was used to perform DE analysis on the pasilla
dataset,
and the results were saved as a RData file.
library(DESeq2)
# Object construction
dds <- DESeqDataSetFromMatrix(countData = cts, colData = coldata,
design = ~ type + condition)
dds <- DESeq(dds)
res_deseq2 <- results(dds)
res_deseq2@listData <- c(as.list(geneAnno), res_deseq2@listData)
res_deseq2@elementMetadata <- rbind(DataFrame(type = "annotation",
description = colnames(geneAnno)),
res_deseq2@elementMetadata)
save(res_deseq2, file = "res_deseq2.rda", compress = "xz")
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/jupyterlab/lib/libopenblasp-r0.3.20.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] BiocStyle_2.22.0
##
## loaded via a namespace (and not attached):
## [1] bookdown_0.26 digest_0.6.29 R6_2.5.1
## [4] jsonlite_1.8.0 magrittr_2.0.3 evaluate_0.15
## [7] stringi_1.7.6 rlang_1.0.2 cli_3.2.0
## [10] jquerylib_0.1.4 bslib_0.3.1 rmarkdown_2.13
## [13] tools_4.1.3 stringr_1.4.0 xfun_0.30
## [16] yaml_2.3.5 fastmap_1.1.0 compiler_4.1.3
## [19] BiocManager_1.30.16 htmltools_0.5.2 knitr_1.38
## [22] sass_0.4.1