The scRNAseq
package provides convenient access to several datasets. See the package Bioconductor page for more information.
# install BiocManager package if not installed yet.
# BiocManager is the package installer for Bioconductor software.
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
# install scRNAseq package if not yet installed.
if(!"scRNAseq" %in% installed.packages()[,1]) BiocManager::install("scRNAseq")
# Code below might ask you to create an ExperimentHub directory.
# Type 'yes' and hit Enter, to allow this.
suppressPackageStartupMessages(library(scRNAseq))
sce <- MacoskoRetinaData()
## snapshotDate(): 2019-10-22
## see ?scRNAseq and browseVignettes('scRNAseq') for documentation
## loading from cache
## see ?scRNAseq and browseVignettes('scRNAseq') for documentation
## loading from cache
SingleCellExperiment
objectsce
## class: SingleCellExperiment
## dim: 24658 49300
## metadata(0):
## assays(1): counts
## rownames(24658): KITL TMTC3 ... 1110059M19RIK GM20861
## rowData names(0):
## colnames(49300): r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA ... p1_TAACGCGCTCCT
## p1_ATTCTTGTTCTT
## colData names(2): cell.id cluster
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
SingleCellExperiment
objectPlease see Figure 4.1 in OSCA for an overview of a SingleCellExperiment
object.
# Data: assays
assays(sce)
## List of length 1
## names(1): counts
assays(sce)$counts[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA r1_GCGCAACTGCTC r1_GATTGGGAGGCA
## KITL . . 1 .
## TMTC3 3 . . .
## CEP290 1 3 . 2
## 4930430F08RIK 2 1 2 .
## 1700017N19RIK . . . .
## r1_CCTCCTAGTTGG
## KITL .
## TMTC3 2
## CEP290 1
## 4930430F08RIK 1
## 1700017N19RIK .
# Feature metadata: rowData
rowData(sce) # empty for now
## DataFrame with 24658 rows and 0 columns
# Cell metadata: colData
colData(sce)
## DataFrame with 49300 rows and 2 columns
## cell.id cluster
## <character> <integer>
## r1_GGCCGCAGTCCG r1_GGCCGCAGTCCG 2
## r1_CTTGTGCGGGAA r1_CTTGTGCGGGAA 2
## r1_GCGCAACTGCTC r1_GCGCAACTGCTC 2
## r1_GATTGGGAGGCA r1_GATTGGGAGGCA 2
## r1_CCTCCTAGTTGG r1_CCTCCTAGTTGG NA
## ... ... ...
## p1_TCAAAAGCCGGG p1_TCAAAAGCCGGG 24
## p1_ATTAAGTTCCAA p1_ATTAAGTTCCAA 34
## p1_CTGTCTGAGACC p1_CTGTCTGAGACC 2
## p1_TAACGCGCTCCT p1_TAACGCGCTCCT 24
## p1_ATTCTTGTTCTT p1_ATTCTTGTTCTT 24
# Reduced dimensions: reducedDims
reducedDims(sce) # empty for now
## List of length 0
## names(0):
SingleCellExperiment
objectsceNew <- SingleCellExperiment(assays = list(counts = assays(sce)$counts))
sceNew
## class: SingleCellExperiment
## dim: 24658 49300
## metadata(0):
## assays(1): counts
## rownames(24658): KITL TMTC3 ... 1110059M19RIK GM20861
## rowData names(0):
## colnames(49300): r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA ... p1_TAACGCGCTCCT
## p1_ATTCTTGTTCTT
## colData names(0):
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
rm(sceNew)
SingleCellExperiment
objectfakeGeneNames <- paste0("gene", 1:nrow(sce))
rowData(sce)$fakeName <- fakeGeneNames
head(rowData(sce))
## DataFrame with 6 rows and 1 column
## fakeName
## <character>
## KITL gene1
## TMTC3 gene2
## CEP290 gene3
## 4930430F08RIK gene4
## 1700017N19RIK gene5
## MGAT4C gene6
# Remove again by setting to NULL
rowData(sce)$fakeName <- NULL
assays(sce)$logCounts <- log1p(assays(sce)$counts)
assays(sce)
## List of length 2
## names(2): counts logCounts
assays(sce)$logCounts[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
## r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA r1_GCGCAACTGCTC r1_GATTGGGAGGCA
## KITL . . 0.6931472 .
## TMTC3 1.3862944 . . .
## CEP290 0.6931472 1.3862944 . 1.098612
## 4930430F08RIK 1.0986123 0.6931472 1.0986123 .
## 1700017N19RIK . . . .
## r1_CCTCCTAGTTGG
## KITL .
## TMTC3 1.0986123
## CEP290 0.6931472
## 4930430F08RIK 0.6931472
## 1700017N19RIK .
assays(sce)$logCounts <- NULL
keep <- rowSums(assays(sce)$counts > 0) > 10
table(keep)
## keep
## FALSE TRUE
## 6771 17887
sce <- sce[keep,]
library(scater)
## Loading required package: ggplot2
is.mito <- grepl("^MT-", rownames(sce))
sum(is.mito) # 31 mitochondrial genes
## [1] 28
sce <- addPerCellQC(sce, subsets=list(Mito=is.mito))
# the QC variables have now been added to the colData of our SCE object.
colData(sce)
## DataFrame with 49300 rows and 12 columns
## cell.id cluster sum detected percent_top_50
## <character> <integer> <numeric> <integer> <numeric>
## r1_GGCCGCAGTCCG r1_GGCCGCAGTCCG 2 37478 7235 11.2866214846043
## r1_CTTGTGCGGGAA r1_CTTGTGCGGGAA 2 32034 6921 12.2494849222701
## r1_GCGCAACTGCTC r1_GCGCAACTGCTC 2 28140 6390 12.9459843638948
## r1_GATTGGGAGGCA r1_GATTGGGAGGCA 2 20352 5727 12.3968160377358
## r1_CCTCCTAGTTGG r1_CCTCCTAGTTGG NA 19550 5769 12.690537084399
## ... ... ... ... ... ...
## p1_TCAAAAGCCGGG p1_TCAAAAGCCGGG 24 817 537 30.3549571603427
## p1_ATTAAGTTCCAA p1_ATTAAGTTCCAA 34 817 574 29.1309669522644
## p1_CTGTCTGAGACC p1_CTGTCTGAGACC 2 816 636 20.9558823529412
## p1_TAACGCGCTCCT p1_TAACGCGCTCCT 24 816 488 35.5392156862745
## p1_ATTCTTGTTCTT p1_ATTCTTGTTCTT 24 816 484 38.1127450980392
## percent_top_100 percent_top_200 percent_top_500
## <numeric> <numeric> <numeric>
## r1_GGCCGCAGTCCG 16.8418805699344 24.8465766583062 39.2123379049042
## r1_CTTGTGCGGGAA 18.2337516388837 26.1690703627396 40.6630455141412
## r1_GCGCAACTGCTC 19.1862117981521 27.4520255863539 42.2992181947406
## r1_GATTGGGAGGCA 17.8115172955975 25.5896226415094 40.4530267295598
## r1_CCTCCTAGTTGG 18.6240409207161 26.6700767263427 41.2736572890026
## ... ... ... ...
## p1_TCAAAAGCCGGG 42.5948592411261 58.7515299877601 95.4712362301102
## p1_ATTAAGTTCCAA 41.3708690330477 54.2227662178703 90.9424724602203
## p1_CTGTCTGAGACC 33.2107843137255 46.5686274509804 83.3333333333333
## p1_TAACGCGCTCCT 47.7941176470588 64.7058823529412 100
## p1_ATTCTTGTTCTT 50.3676470588235 65.1960784313726 100
## subsets_Mito_sum subsets_Mito_detected subsets_Mito_percent
## <numeric> <integer> <numeric>
## r1_GGCCGCAGTCCG 427 14 1.13933507657826
## r1_CTTGTGCGGGAA 503 15 1.57020665542861
## r1_GCGCAACTGCTC 460 13 1.63468372423596
## r1_GATTGGGAGGCA 326 11 1.60180817610063
## r1_CCTCCTAGTTGG 264 9 1.35038363171356
## ... ... ... ...
## p1_TCAAAAGCCGGG 13 4 1.59118727050184
## p1_ATTAAGTTCCAA 10 5 1.22399020807834
## p1_CTGTCTGAGACC 24 7 2.94117647058824
## p1_TAACGCGCTCCT 27 5 3.30882352941176
## p1_ATTCTTGTTCTT 16 4 1.96078431372549
## total
## <numeric>
## r1_GGCCGCAGTCCG 37478
## r1_CTTGTGCGGGAA 32034
## r1_GCGCAACTGCTC 28140
## r1_GATTGGGAGGCA 20352
## r1_CCTCCTAGTTGG 19550
## ... ...
## p1_TCAAAAGCCGGG 817
## p1_ATTAAGTTCCAA 817
## p1_CTGTCTGAGACC 816
## p1_TAACGCGCTCCT 816
## p1_ATTCTTGTTCTT 816
High-quality cells should have many features expressed, and a low contribution of mitochondrial genes. Here, we see that several cells have a very low number of expressed genes, and where most of the molecules are derived from mitochondrial genes. This indicates likely damaged cells, presumably because of loss of cytoplasmic RNA from perforated cells, so we’d want to remove these for the downstream analysis.
# Number of genes vs library size
plotColData(sce, x = "sum", y="detected", colour_by="cluster")
# Mitochondrial genes
plotColData(sce, x = "detected", y="subsets_Mito_percent")
# check outliers using adaptive threshold
outlierResults <- quickPerCellQC(sce,
percent_subsets = c("subsets_Mito_percent"))
# cells to remove and why
colSums(as.matrix(outlierResults))
## low_lib_size low_n_features high_subsets_Mito_percent
## 0 13 3410
## discard
## 3423
# add to metadata
colData(sce)$discard <- outlierResults$discard
# visualize cells to be removed
plotColData(sce, x = "detected", y="subsets_Mito_percent", colour_by = "discard")
# remove cells
sce <- sce[, !colData(sce)$discard]
For normalization, the size factors \(s_i\) computed here are simply scaled library sizes: \[ N_i = \sum_g Y_{gi} \] \[ s_i = N_i / \bar{N}_i \]
sce <- logNormCounts(sce)
# note we also returned log counts: see the additional logcounts assay.
sce
## class: SingleCellExperiment
## dim: 17887 45877
## metadata(0):
## assays(2): counts logcounts
## rownames(17887): KITL TMTC3 ... GM16012 GM21464
## rowData names(0):
## colnames(45877): r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA ... p1_TAACGCGCTCCT
## p1_ATTCTTGTTCTT
## colData names(13): cell.id cluster ... total discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
# you can extract size factors using
sf <- librarySizeFactors(sce)
mean(sf) # equal to 1 due to scaling.
## [1] 1
plot(x= log(colSums(assays(sce)$counts)),
y=sf)
library(scran)
dec <- modelGeneVar(sce)
fitRetina <- metadata(dec)
plot(fitRetina$mean, fitRetina$var,
xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(fitRetina$trend(x), col="dodgerblue", add=TRUE, lwd=2)
# get 10% highly variable genes
hvg <- getTopHVGs(dec, prop=0.1)
head(hvg)
## [1] "RHO" "CALM1" "MEG3" "GNGT1" "SAG" "RPGRIP1"
# plot these
plot(fitRetina$mean, fitRetina$var,
col = c("orange", "darkseagreen3")[(names(fitRetina$mean) %in% hvg)+1],
xlab="Mean of log-expression",
ylab="Variance of log-expression")
curve(fitRetina$trend(x), col="dodgerblue", add=TRUE, lwd=2)
legend("topleft",
legend = c("Selected", "Not selected"),
col = c("darkseagreen3", "orange"),
pch = 16,
bty='n')
Note that, below, we color the cells using the known, true cell type label as defined in the metadata. In reality, we don’t know this yet at this stage.
set.seed(1234)
sce <- runPCA(sce, ncomponents=30, subset_row=hvg)
plotPCA(sce)
sce <- runUMAP(sce, dimred = 'PCA', external_neighbors=TRUE)
plotUMAP(sce)
# Build a shared nearest-neighbor graph from PCA space
g <- buildSNNGraph(sce, use.dimred = 'PCA')
# Louvain clustering on the SNN graph, and add to sce
colData(sce)$label <- factor(igraph::cluster_louvain(g)$membership)
# Visualization.
plotUMAP(sce, colour_by="label")