Import data

The scRNAseq package provides convenient access to several datasets. See the package Bioconductor page for more information.

# install BiocManager package if not installed yet.
# BiocManager is the package installer for Bioconductor software.
if (!requireNamespace("BiocManager", quietly = TRUE))
    install.packages("BiocManager")

# install scRNAseq package if not yet installed.
if(!"scRNAseq" %in% installed.packages()[,1]) BiocManager::install("scRNAseq")

# Code below might ask you to create an ExperimentHub directory. 
# Type 'yes' and hit Enter, to allow this.
suppressPackageStartupMessages(library(scRNAseq))
sce <- MacoskoRetinaData()

## snapshotDate(): 2019-10-22

## see ?scRNAseq and browseVignettes('scRNAseq') for documentation

## loading from cache

## see ?scRNAseq and browseVignettes('scRNAseq') for documentation

## loading from cache

A `SingleCellExperiment` object

sce

## class: SingleCellExperiment 
## dim: 24658 49300 
## metadata(0):
## assays(1): counts
## rownames(24658): KITL TMTC3 ... 1110059M19RIK GM20861
## rowData names(0):
## colnames(49300): r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA ... p1_TAACGCGCTCCT
##   p1_ATTCTTGTTCTT
## colData names(2): cell.id cluster
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):

Accessing data from a `SingleCellExperiment` object

Please see Figure 4.1 in OSCA for an overview of a SingleCellExperiment object.

# Data: assays
assays(sce)

## List of length 1
## names(1): counts

assays(sce)$counts[1:5, 1:5]

## 5 x 5 sparse Matrix of class "dgCMatrix"
##               r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA r1_GCGCAACTGCTC r1_GATTGGGAGGCA
## KITL                        .               .               1               .
## TMTC3                       3               .               .               .
## CEP290                      1               3               .               2
## 4930430F08RIK               2               1               2               .
## 1700017N19RIK               .               .               .               .
##               r1_CCTCCTAGTTGG
## KITL                        .
## TMTC3                       2
## CEP290                      1
## 4930430F08RIK               1
## 1700017N19RIK               .

# Feature metadata: rowData
rowData(sce) # empty for now

## DataFrame with 24658 rows and 0 columns

# Cell metadata: colData
colData(sce)

## DataFrame with 49300 rows and 2 columns
##                         cell.id   cluster
##                     <character> <integer>
## r1_GGCCGCAGTCCG r1_GGCCGCAGTCCG         2
## r1_CTTGTGCGGGAA r1_CTTGTGCGGGAA         2
## r1_GCGCAACTGCTC r1_GCGCAACTGCTC         2
## r1_GATTGGGAGGCA r1_GATTGGGAGGCA         2
## r1_CCTCCTAGTTGG r1_CCTCCTAGTTGG        NA
## ...                         ...       ...
## p1_TCAAAAGCCGGG p1_TCAAAAGCCGGG        24
## p1_ATTAAGTTCCAA p1_ATTAAGTTCCAA        34
## p1_CTGTCTGAGACC p1_CTGTCTGAGACC         2
## p1_TAACGCGCTCCT p1_TAACGCGCTCCT        24
## p1_ATTCTTGTTCTT p1_ATTCTTGTTCTT        24

# Reduced dimensions: reducedDims
reducedDims(sce) # empty for now

## List of length 0
## names(0):

Creating a new `SingleCellExperiment` object

sceNew <- SingleCellExperiment(assays = list(counts = assays(sce)$counts))
sceNew

## class: SingleCellExperiment 
## dim: 24658 49300 
## metadata(0):
## assays(1): counts
## rownames(24658): KITL TMTC3 ... 1110059M19RIK GM20861
## rowData names(0):
## colnames(49300): r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA ... p1_TAACGCGCTCCT
##   p1_ATTCTTGTTCTT
## colData names(0):
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):

rm(sceNew)

Storing (meta)data in a `SingleCellExperiment` object

fakeGeneNames <- paste0("gene", 1:nrow(sce))
rowData(sce)$fakeName <- fakeGeneNames
head(rowData(sce))

## DataFrame with 6 rows and 1 column
##                  fakeName
##               <character>
## KITL                gene1
## TMTC3               gene2
## CEP290              gene3
## 4930430F08RIK       gene4
## 1700017N19RIK       gene5
## MGAT4C              gene6

# Remove again by setting to NULL
rowData(sce)$fakeName <- NULL

assays(sce)$logCounts <- log1p(assays(sce)$counts)
assays(sce)

## List of length 2
## names(2): counts logCounts

assays(sce)$logCounts[1:5, 1:5]

## 5 x 5 sparse Matrix of class "dgCMatrix"
##               r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA r1_GCGCAACTGCTC r1_GATTGGGAGGCA
## KITL                .               .               0.6931472        .       
## TMTC3               1.3862944       .               .                .       
## CEP290              0.6931472       1.3862944       .                1.098612
## 4930430F08RIK       1.0986123       0.6931472       1.0986123        .       
## 1700017N19RIK       .               .               .                .       
##               r1_CCTCCTAGTTGG
## KITL                .        
## TMTC3               1.0986123
## CEP290              0.6931472
## 4930430F08RIK       0.6931472
## 1700017N19RIK       .

assays(sce)$logCounts <- NULL

Filtering non-informative genes

keep <- rowSums(assays(sce)$counts > 0) > 10
table(keep)

## keep
## FALSE  TRUE 
##  6771 17887

sce <- sce[keep,]

Quality control

Calculate QC variables

library(scater)

## Loading required package: ggplot2

is.mito <- grepl("^MT-", rownames(sce))
sum(is.mito) # 31 mitochondrial genes

## [1] 28

sce <- addPerCellQC(sce, subsets=list(Mito=is.mito))
# the QC variables have now been added to the colData of our SCE object.
colData(sce)

## DataFrame with 49300 rows and 12 columns
##                         cell.id   cluster       sum  detected   percent_top_50
##                     <character> <integer> <numeric> <integer>        <numeric>
## r1_GGCCGCAGTCCG r1_GGCCGCAGTCCG         2     37478      7235 11.2866214846043
## r1_CTTGTGCGGGAA r1_CTTGTGCGGGAA         2     32034      6921 12.2494849222701
## r1_GCGCAACTGCTC r1_GCGCAACTGCTC         2     28140      6390 12.9459843638948
## r1_GATTGGGAGGCA r1_GATTGGGAGGCA         2     20352      5727 12.3968160377358
## r1_CCTCCTAGTTGG r1_CCTCCTAGTTGG        NA     19550      5769  12.690537084399
## ...                         ...       ...       ...       ...              ...
## p1_TCAAAAGCCGGG p1_TCAAAAGCCGGG        24       817       537 30.3549571603427
## p1_ATTAAGTTCCAA p1_ATTAAGTTCCAA        34       817       574 29.1309669522644
## p1_CTGTCTGAGACC p1_CTGTCTGAGACC         2       816       636 20.9558823529412
## p1_TAACGCGCTCCT p1_TAACGCGCTCCT        24       816       488 35.5392156862745
## p1_ATTCTTGTTCTT p1_ATTCTTGTTCTT        24       816       484 38.1127450980392
##                  percent_top_100  percent_top_200  percent_top_500
##                        <numeric>        <numeric>        <numeric>
## r1_GGCCGCAGTCCG 16.8418805699344 24.8465766583062 39.2123379049042
## r1_CTTGTGCGGGAA 18.2337516388837 26.1690703627396 40.6630455141412
## r1_GCGCAACTGCTC 19.1862117981521 27.4520255863539 42.2992181947406
## r1_GATTGGGAGGCA 17.8115172955975 25.5896226415094 40.4530267295598
## r1_CCTCCTAGTTGG 18.6240409207161 26.6700767263427 41.2736572890026
## ...                          ...              ...              ...
## p1_TCAAAAGCCGGG 42.5948592411261 58.7515299877601 95.4712362301102
## p1_ATTAAGTTCCAA 41.3708690330477 54.2227662178703 90.9424724602203
## p1_CTGTCTGAGACC 33.2107843137255 46.5686274509804 83.3333333333333
## p1_TAACGCGCTCCT 47.7941176470588 64.7058823529412              100
## p1_ATTCTTGTTCTT 50.3676470588235 65.1960784313726              100
##                 subsets_Mito_sum subsets_Mito_detected subsets_Mito_percent
##                        <numeric>             <integer>            <numeric>
## r1_GGCCGCAGTCCG              427                    14     1.13933507657826
## r1_CTTGTGCGGGAA              503                    15     1.57020665542861
## r1_GCGCAACTGCTC              460                    13     1.63468372423596
## r1_GATTGGGAGGCA              326                    11     1.60180817610063
## r1_CCTCCTAGTTGG              264                     9     1.35038363171356
## ...                          ...                   ...                  ...
## p1_TCAAAAGCCGGG               13                     4     1.59118727050184
## p1_ATTAAGTTCCAA               10                     5     1.22399020807834
## p1_CTGTCTGAGACC               24                     7     2.94117647058824
## p1_TAACGCGCTCCT               27                     5     3.30882352941176
## p1_ATTCTTGTTCTT               16                     4     1.96078431372549
##                     total
##                 <numeric>
## r1_GGCCGCAGTCCG     37478
## r1_CTTGTGCGGGAA     32034
## r1_GCGCAACTGCTC     28140
## r1_GATTGGGAGGCA     20352
## r1_CCTCCTAGTTGG     19550
## ...                   ...
## p1_TCAAAAGCCGGG       817
## p1_ATTAAGTTCCAA       817
## p1_CTGTCTGAGACC       816
## p1_TAACGCGCTCCT       816
## p1_ATTCTTGTTCTT       816

EDA

High-quality cells should have many features expressed, and a low contribution of mitochondrial genes. Here, we see that several cells have a very low number of expressed genes, and where most of the molecules are derived from mitochondrial genes. This indicates likely damaged cells, presumably because of loss of cytoplasmic RNA from perforated cells, so we’d want to remove these for the downstream analysis.

# Number of genes vs library size
plotColData(sce, x = "sum", y="detected", colour_by="cluster")

# Mitochondrial genes
plotColData(sce, x = "detected", y="subsets_Mito_percent")

QC using adaptive thresholds

# check outliers using adaptive threshold
outlierResults <- quickPerCellQC(sce,
                                 percent_subsets = c("subsets_Mito_percent"))
# cells to remove and why
colSums(as.matrix(outlierResults))

##              low_lib_size            low_n_features high_subsets_Mito_percent 
##                         0                        13                      3410 
##                   discard 
##                      3423

# add to metadata
colData(sce)$discard <- outlierResults$discard

# visualize cells to be removed
plotColData(sce, x = "detected", y="subsets_Mito_percent", colour_by = "discard")

# remove cells
sce <- sce[, !colData(sce)$discard]

Normalization

For normalization, the size factors \(s_i\) computed here are simply scaled library sizes: \[ N_i = \sum_g Y_{gi} \] \[ s_i = N_i / \bar{N}_i \]

sce <- logNormCounts(sce)

# note we also returned log counts: see the additional logcounts assay.
sce

## class: SingleCellExperiment 
## dim: 17887 45877 
## metadata(0):
## assays(2): counts logcounts
## rownames(17887): KITL TMTC3 ... GM16012 GM21464
## rowData names(0):
## colnames(45877): r1_GGCCGCAGTCCG r1_CTTGTGCGGGAA ... p1_TAACGCGCTCCT
##   p1_ATTCTTGTTCTT
## colData names(13): cell.id cluster ... total discard
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):

# you can extract size factors using
sf <- librarySizeFactors(sce)
mean(sf) # equal to 1 due to scaling.

## [1] 1

plot(x= log(colSums(assays(sce)$counts)), 
     y=sf)

Feature selection

library(scran)
dec <- modelGeneVar(sce)
fitRetina <- metadata(dec)
plot(fitRetina$mean, fitRetina$var, 
     xlab="Mean of log-expression",
    ylab="Variance of log-expression")
curve(fitRetina$trend(x), col="dodgerblue", add=TRUE, lwd=2)

# get 10% highly variable genes
hvg <- getTopHVGs(dec, prop=0.1)
head(hvg)

## [1] "RHO"     "CALM1"   "MEG3"    "GNGT1"   "SAG"     "RPGRIP1"

# plot these 
plot(fitRetina$mean, fitRetina$var, 
     col = c("orange", "darkseagreen3")[(names(fitRetina$mean) %in% hvg)+1],
     xlab="Mean of log-expression",
    ylab="Variance of log-expression")
curve(fitRetina$trend(x), col="dodgerblue", add=TRUE, lwd=2)
legend("topleft", 
       legend = c("Selected", "Not selected"), 
       col = c("darkseagreen3", "orange"),
       pch = 16,
       bty='n')

Dimensionality reduction

Note that, below, we color the cells using the known, true cell type label as defined in the metadata. In reality, we don’t know this yet at this stage.

set.seed(1234)
sce <- runPCA(sce, ncomponents=30, subset_row=hvg)
plotPCA(sce)

sce <- runUMAP(sce, dimred = 'PCA', external_neighbors=TRUE)
plotUMAP(sce)

Clustering

# Build a shared nearest-neighbor graph from PCA space
g <- buildSNNGraph(sce, use.dimred = 'PCA')
# Louvain clustering on the SNN graph, and add to sce
colData(sce)$label <- factor(igraph::cluster_louvain(g)$membership)

# Visualization.
plotUMAP(sce, colour_by="label")

scRNA-seq workflow: Macosko et al.

Koen Van den Berge

11/16/2020

Import data

A `SingleCellExperiment` object

Accessing data from a `SingleCellExperiment` object

Creating a new `SingleCellExperiment` object

Storing (meta)data in a `SingleCellExperiment` object

Filtering non-informative genes

Quality control

Calculate QC variables

EDA

QC using adaptive thresholds

Normalization

Feature selection

Dimensionality reduction

Clustering

scRNA-seq workflow: Macosko et al.

Koen Van den Berge

11/16/2020

Import data

A SingleCellExperiment object

Accessing data from a SingleCellExperiment object

Creating a new SingleCellExperiment object

Storing (meta)data in a SingleCellExperiment object

Filtering non-informative genes

Quality control

Calculate QC variables

EDA

QC using adaptive thresholds

Normalization

Feature selection

Dimensionality reduction

Clustering

A `SingleCellExperiment` object

Accessing data from a `SingleCellExperiment` object

Creating a new `SingleCellExperiment` object

Storing (meta)data in a `SingleCellExperiment` object