## Loading required package: S4Vectors

## Loading required package: stats4

## Loading required package: BiocGenerics

## Loading required package: parallel

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, append,
##     as.data.frame, basename, cbind, colnames, dirname, do.call,
##     duplicated, eval, evalq, get, grep, grepl, intersect,
##     is.unsorted, lapply, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, rank, rbind, rownames, sapply,
##     setdiff, sort, table, tapply, union, unique, unsplit, which,
##     which.max, which.min

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:base':
## 
##     expand.grid

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: GenomeInfoDb

## Loading required package: SummarizedExperiment

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## Loading required package: DelayedArray

## Loading required package: matrixStats

## 
## Attaching package: 'matrixStats'

## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians

## Loading required package: BiocParallel

## 
## Attaching package: 'DelayedArray'

## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

## The following objects are masked from 'package:base':
## 
##     aperm, apply, rowsum

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:IRanges':
## 
##     space

## The following object is masked from 'package:S4Vectors':
## 
##     space

## The following object is masked from 'package:stats':
## 
##     lowess

1 Introduction

Researchers assessed the effect of spinal nerve ligation (SNL) on the transcriptome of rats. In this experiment, transcriptome profiling occurred at two weeks and two months after treatment, for both the SNL group and a control group. Two biological replicates are used for every treatment - time combination. The researchers are interested in early and late effects and in genes for which the effect changes over time.

# Location of the serialized ExpressionSet for the Hammer experiment.
file <- "http://bowtie-bio.sourceforge.net/recount/ExpressionSets/hammer_eset.RData"

# Keep a handle on the connection so it can be closed once the objects are
# loaded; load(url(...)) would leave the connection to the garbage collector.
con <- url(file)
load(con)
close(con)

# Print the loaded ExpressionSet.
hammer.eset

## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 29516 features, 8 samples 
##   element names: exprs 
## protocolData: none
## phenoData
##   sampleNames: SRX020102 SRX020103 ... SRX020098-101 (8 total)
##   varLabels: sample.id num.tech.reps ... Time (5 total)
##   varMetadata: labelDescription
## featureData
##   featureNames: ENSRNOG00000000001 ENSRNOG00000000007 ...
##     ENSRNOG00000045521 (29516 total)
##   fvarLabels: gene
##   fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:

# Sample annotation (one row per sample) stored in the ExpressionSet.
Biobase::pData(object = hammer.eset)

##                   sample.id num.tech.reps protocol         strain     Time
## SRX020102         SRX020102             1  control Sprague Dawley 2 months
## SRX020103         SRX020103             2  control Sprague Dawley 2 months
## SRX020104         SRX020104             1   L5 SNL Sprague Dawley 2 months
## SRX020105         SRX020105             2   L5 SNL Sprague Dawley  2months
## SRX020091-3     SRX020091-3             1  control Sprague Dawley  2 weeks
## SRX020088-90   SRX020088-90             2  control Sprague Dawley  2 weeks
## SRX020094-7     SRX020094-7             1   L5 SNL Sprague Dawley  2 weeks
## SRX020098-101 SRX020098-101             2   L5 SNL Sprague Dawley  2 weeks

library(tidyverse)

## -- Attaching packages ------------------------------ tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.2
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x dplyr::collapse()   masks IRanges::collapse()
## x dplyr::combine()    masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count()      masks matrixStats::count()
## x dplyr::desc()       masks IRanges::desc()
## x tidyr::expand()     masks S4Vectors::expand()
## x dplyr::filter()     masks stats::filter()
## x dplyr::first()      masks S4Vectors::first()
## x dplyr::lag()        masks stats::lag()
## x purrr::reduce()     masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename()     masks S4Vectors::rename()
## x purrr::simplify()   masks DelayedArray::simplify()
## x dplyr::slice()      masks IRanges::slice()

# Sample annotation again, written with the pipe now that tidyverse is attached.
hammer.eset %>% pData

##                   sample.id num.tech.reps protocol         strain     Time
## SRX020102         SRX020102             1  control Sprague Dawley 2 months
## SRX020103         SRX020103             2  control Sprague Dawley 2 months
## SRX020104         SRX020104             1   L5 SNL Sprague Dawley 2 months
## SRX020105         SRX020105             2   L5 SNL Sprague Dawley  2months
## SRX020091-3     SRX020091-3             1  control Sprague Dawley  2 weeks
## SRX020088-90   SRX020088-90             2  control Sprague Dawley  2 weeks
## SRX020094-7     SRX020094-7             1   L5 SNL Sprague Dawley  2 weeks
## SRX020098-101 SRX020098-101             2   L5 SNL Sprague Dawley  2 weeks

# First rows of the count matrix (genes in rows, samples in columns).
head(exprs(hammer.eset))

##                    SRX020102 SRX020103 SRX020104 SRX020105 SRX020091-3
## ENSRNOG00000000001         2         4        18        24           7
## ENSRNOG00000000007         4         1         3         1           5
## ENSRNOG00000000008         0         1         4         2           0
## ENSRNOG00000000009         0         0         0         0           0
## ENSRNOG00000000010        19        10        19        13          50
## ENSRNOG00000000012         7         5         1         0          31
##                    SRX020088-90 SRX020094-7 SRX020098-101
## ENSRNOG00000000001            4          93            77
## ENSRNOG00000000007            4           9             4
## ENSRNOG00000000008            5           2             6
## ENSRNOG00000000009            0           0             0
## ENSRNOG00000000010           57          45            58
## ENSRNOG00000000012           26          12             9

2 Design

# Recode the sample annotation into the two factors used in the design formula.
sample_info <- pData(hammer.eset)

# time: derived from the Time column rather than from the row order, so the
# labels stay correct if the samples are reordered. Matching on "month" also
# catches the "2months" entry that lacks a space.
time_raw <- trimws(as.character(sample_info$Time))
is_late <- grepl("month", time_raw, fixed = TRUE)
is_early <- grepl("week", time_raw, fixed = TRUE)
stopifnot(all(xor(is_late, is_early)))
pData(hammer.eset)$time <- factor(
  ifelse(is_late, "2m", "2w"),
  levels = c("2w", "2m")  # 2 weeks is the reference level
)

# protocol: map the original labels explicitly instead of renaming the levels
# by position, which depends on how the factor levels happen to be ordered.
pData(hammer.eset)$protocol <- factor(
  trimws(as.character(sample_info$protocol)),
  levels = c("control", "L5 SNL"),
  labels = c("c", "snl")  # control is the reference level
)
# Fail early if a sample carries an unexpected protocol label.
stopifnot(!anyNA(pData(hammer.eset)$protocol))

3 Setup DESEQ2 object

# Assemble the DESeqDataSet from the raw counts and the sample annotation.
# The design contains both main effects and the time x protocol interaction.
count_matrix <- exprs(hammer.eset)
sample_table <- pData(hammer.eset)
ds_matrix <- DESeqDataSetFromMatrix(
  countData = count_matrix,
  colData = sample_table,
  design = ~ time * protocol
)

4 Data exploration

With DESeq2 we can first do a variance stabilizing transformation before we make a principal component plot.

# Variance stabilizing transformation of the counts, followed by a PCA plot
# in which the samples are grouped by the protocol and time annotations.
vsd <- DESeq2::vst(object = ds_matrix)
pca_groups <- c("protocol", "time")
plotPCA(vsd, intgroup = pca_groups)

5 DE-analysis

As we have already specified an experimental design when we created the DESeqDataSet, we can run the differential expression pipeline on the raw counts with a single call to the function DESeq. We can also plot the estimated dispersions.

# Run the differential expression pipeline on the raw counts; the fitted
# object replaces the unfitted one.
ds_matrix <- DESeq2::DESeq(object = ds_matrix)

## estimating size factors

## estimating dispersions

## gene-wise dispersion estimates

## mean-dispersion relationship

## final dispersion estimates

## fitting model and testing

# Plot the estimated dispersions of the fitted model.
DESeq2::plotDispEsts(object = ds_matrix)

5.1 Tests

The researchers are interested in an effect of the treatment at the early time point, the late timepoint and an interaction.

The following model is used at the gene-level

\[ \left\{ \begin{array}{lcl} y_{ig} &\sim& NB(\mu_{ig},\phi_g)\\ E[y_{ig}\vert \mathbf{x}_{ig}]&=&\mu_{ig}\\ log(\mu_{ig})&=&\eta_{ig}\\ \eta_{ig}&=&\beta_0 + \beta_{snl} x_{snl,i} + \beta_{t2m}x_{t2m,i} + \beta_\text{snl,t2m} x_{snl,i}x_{t2m,i} + \log N_i \end{array}\right. \]

with \(x_{snl,i}\) a dummy variable that is 1 if a rat had the spinal nerve ligation and 0 otherwise, \(x_{t2m,i}\) a dummy variable that is 1 if the rat was sacrificed after 2 months and 0 otherwise, and \(\log{N}_i\) a normalisation offset to correct for sequencing depth. Note that \(\beta_{snl}\) is the main effect for spinal nerve ligation and corresponds to the average log fold change between treated and control rats after two weeks. The interaction \(\beta_\text{snl,t2m}\) can be interpreted as the difference in the average log FC between treated and control rats at the late versus the early timepoint. The researchers are also interested in a third contrast: the effect of the treatment at the late time point.

\[ \log \text{FC}^\text{2 months}_\text{snl-c}= \beta_{snl}+\beta_{snl,t2m}\]

Below we implement the contrasts related to each of these research questions. Is the gene DE at the early, the late timepoint and does the average log FC due to the treatment change over time?

# Contrast matrix: one row per research question, one column per model
# coefficient. Coefficients are addressed by name rather than by position so
# that a change in coefficient order or naming raises an error instead of
# silently testing the wrong parameter.
coef_names <- resultsNames(ds_matrix)
L <- matrix(
  0,
  nrow = 3,
  ncol = length(coef_names),
  dimnames = list(c("early", "late", "interaction"), coef_names)
)
# early: SNL effect at 2 weeks = main effect of protocol
L["early", "protocol_snl_vs_c"] <- 1
# late: SNL effect at 2 months = main effect + interaction
L["late", c("protocol_snl_vs_c", "time2m.protocolsnl")] <- 1
# interaction: change of the SNL effect between 2 weeks and 2 months
L["interaction", "time2m.protocolsnl"] <- 1
L

##             Intercept time_2m_vs_2w protocol_snl_vs_c time2m.protocolsnl
## early               0             0                 1                  0
## late                0             0                 1                  1
## interaction         0             0                 0                  1

# Test every row of the contrast matrix L and collect the results in a list
# named after the contrasts ("early", "late", "interaction").
# lapply() always returns a list, whereas apply() only does so through its
# simplification rules. DESeq2::results() is called with its namespace because
# the list created here is itself named `results`.
results <- lapply(
  setNames(rownames(L), rownames(L)),
  function(contrast_name) {
    DESeq2::results(ds_matrix, contrast = L[contrast_name, ])
  }
)
head(results$early)

## log2 fold change (MLE): 0,0,+1,0 
## Wald test p-value: 0,0,+1,0 
## DataFrame with 6 rows and 6 columns
##                            baseMean     log2FoldChange             lfcSE
##                           <numeric>          <numeric>         <numeric>
## ENSRNOG00000000001 21.3037898353568   3.44433228426384  0.62864804051679
## ENSRNOG00000000007 3.54819286782193 0.0225496208590792   1.2108645710959
## ENSRNOG00000000008 2.51440015548349  0.176945720176101  1.59470653296445
## ENSRNOG00000000009                0                 NA                NA
## ENSRNOG00000000010 28.3401935116933 -0.558733024021648 0.433262109976245
## ENSRNOG00000000012 8.63308167355526  -1.94638538642785 0.754138707830786
##                                  stat               pvalue
##                             <numeric>            <numeric>
## ENSRNOG00000000001   5.47895175404091 4.27852995436882e-08
## ENSRNOG00000000007 0.0186227439445772    0.985142058937118
## ENSRNOG00000000008  0.110958170997877    0.911649516551866
## ENSRNOG00000000009                 NA                   NA
## ENSRNOG00000000010  -1.28959586161892    0.197191013340913
## ENSRNOG00000000012  -2.58093818314465  0.00985322235423953
##                                   padj
##                              <numeric>
## ENSRNOG00000000001 3.8784823936471e-07
## ENSRNOG00000000007   0.991760472321361
## ENSRNOG00000000008   0.950077568329443
## ENSRNOG00000000009                  NA
## ENSRNOG00000000010   0.342468699894264
## ENSRNOG00000000012  0.0292929035542801

The first column, baseMean, is just the average of the normalized count values (counts divided by size factors), taken over all samples in the DESeqDataSet. The remaining five columns refer to a specific contrast.

The column log2FoldChange is the effect size estimate. This value is reported on a logarithmic scale to base 2: for example, a log2 fold change of 1.5 means that the gene's expression is increased by a multiplicative factor of \(2^{1.5} \approx 2.82\).

Of course, this estimate has an uncertainty associated with it, which is available in the column lfcSE, the standard error estimate for the log2 fold change estimate. The result of a hypothesis test for the contrast is also provided: it is reported as a p value in the column pvalue.

DESeq2 uses the Benjamini-Hochberg (BH) False Discovery Rate adjustment (Benjamini and Hochberg 1995) as implemented in the base R p.adjust function to correct for multiple testing. These values, called the BH-adjusted p values, are given in the column padj of the object returned by the results function of DESeq2.

Sometimes a subset of the p values in results will be NA (“not available”). This is DESeq2’s way of reporting that all counts for this gene were zero, and hence no test was applied. In addition, p values can be assigned NA if the gene was excluded from analysis because it contained an extreme count outlier. For more information, see the outlier detection section of the DESeq2 vignette.

Note that if you want to test one specific parameter, you can also provide the name of the parameter. E.g. “resultsNames(ds_matrix)[3]” is protocol_snl_vs_c, the main effect for SNL vs C, i.e. the log2FC at the early timepoint. By default, the results function assesses the null hypothesis that the parameter associated with the last column of the design matrix equals 0 using a Wald test. Here, this is the treatment x time interaction.

# Results for the default coefficient (no contrast or name supplied).
head(DESeq2::results(object = ds_matrix))

## log2 fold change (MLE): time2m.protocolsnl 
## Wald test p-value: time2m.protocolsnl 
## DataFrame with 6 rows and 6 columns
##                            baseMean    log2FoldChange             lfcSE
##                           <numeric>         <numeric>         <numeric>
## ENSRNOG00000000001 21.3037898353568 -0.75719777316548 0.991741895620587
## ENSRNOG00000000007 3.54819286782193 -0.41297647947652  1.86695727630851
## ENSRNOG00000000008 2.51440015548349  2.28363872199791  2.59728493012289
## ENSRNOG00000000009                0                NA                NA
## ENSRNOG00000000010 28.3401935116933 0.608655687969142  0.68799651552782
## ENSRNOG00000000012 8.63308167355526  -1.7397010895344  1.79913607426771
##                                  stat            pvalue      padj
##                             <numeric>         <numeric> <numeric>
## ENSRNOG00000000001 -0.763502859472988 0.445163557651901        NA
## ENSRNOG00000000007 -0.221202962015868 0.824934403029993        NA
## ENSRNOG00000000008  0.879240739247605 0.379270759719744        NA
## ENSRNOG00000000009                 NA                NA        NA
## ENSRNOG00000000010   0.88467844564909 0.376330084072618        NA
## ENSRNOG00000000012 -0.966964708460141 0.333561676405458        NA

# Results for a single coefficient selected by name: the third entry of
# resultsNames(ds_matrix).
third_coef <- resultsNames(ds_matrix)[3]
head(DESeq2::results(ds_matrix, name = third_coef))

## log2 fold change (MLE): protocol snl vs c 
## Wald test p-value: protocol snl vs c 
## DataFrame with 6 rows and 6 columns
##                            baseMean     log2FoldChange             lfcSE
##                           <numeric>          <numeric>         <numeric>
## ENSRNOG00000000001 21.3037898353568   3.44433228426384  0.62864804051679
## ENSRNOG00000000007 3.54819286782193 0.0225496208590792   1.2108645710959
## ENSRNOG00000000008 2.51440015548349  0.176945720176101  1.59470653296445
## ENSRNOG00000000009                0                 NA                NA
## ENSRNOG00000000010 28.3401935116933 -0.558733024021648 0.433262109976245
## ENSRNOG00000000012 8.63308167355526  -1.94638538642785 0.754138707830786
##                                  stat               pvalue
##                             <numeric>            <numeric>
## ENSRNOG00000000001   5.47895175404091 4.27852995436882e-08
## ENSRNOG00000000007 0.0186227439445772    0.985142058937118
## ENSRNOG00000000008  0.110958170997877    0.911649516551866
## ENSRNOG00000000009                 NA                   NA
## ENSRNOG00000000010  -1.28959586161892    0.197191013340913
## ENSRNOG00000000012  -2.58093818314465  0.00985322235423953
##                                   padj
##                              <numeric>
## ENSRNOG00000000001 3.8784823936471e-07
## ENSRNOG00000000007   0.991760472321361
## ENSRNOG00000000008   0.950077568329443
## ENSRNOG00000000009                  NA
## ENSRNOG00000000010   0.342468699894264
## ENSRNOG00000000012  0.0292929035542801

5.2 Evaluate Results

5.2.1 Results early

# Overview of the test results for the early contrast.
summary(results[["early"]])

## 
## out of 18635 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3224, 17%
## LFC < 0 (down)     : 3271, 18%
## outliers [1]       : 0, 0%
## low counts [2]     : 3152, 17%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Histogram of the raw p-values for the early contrast.
hist(results$early$pvalue, xlab = "p-value")

# Volcano plot: effect size against -log10 p-value, with points coloured by
# whether the adjusted p-value is below 0.05.
volcanoEarly <- ggplot(
  as.data.frame(results$early),
  aes(x = log2FoldChange, y = -log10(pvalue), color = padj < 0.05)
) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  ggtitle(paste("contrast", "early"))
print(volcanoEarly)

## Warning: Removed 14033 rows containing missing values (geom_point).

# MA plot for the early contrast.
DESeq2::plotMA(results$early)

# Heatmap of the variance-stabilized values for the 30 genes with the
# smallest adjusted p-value (order() places NA values last).
top_early <- head(order(results$early$padj), n = 30)
mat <- assay(vsd)[top_early, ]
pheatmap(mat)

5.2.2 Results late

# Overview of the test results for the late contrast.
summary(results[["late"]])

## 
## out of 18635 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 3110, 17%
## LFC < 0 (down)     : 3282, 18%
## outliers [1]       : 0, 0%
## low counts [2]     : 3502, 19%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Histogram of the raw p-values for the late contrast.
hist(results$late$pvalue, xlab = "p-value")

# Volcano plot: effect size against -log10 p-value, with points coloured by
# whether the adjusted p-value is below 0.05.
volcanoLate <- ggplot(
  as.data.frame(results$late),
  aes(x = log2FoldChange, y = -log10(pvalue), color = padj < 0.05)
) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  ggtitle(paste("contrast", "late"))
print(volcanoLate)

## Warning: Removed 14383 rows containing missing values (geom_point).

# MA plot for the late contrast.
DESeq2::plotMA(results$late)

# Heatmap of the variance-stabilized values for the 30 genes with the
# smallest adjusted p-value (order() places NA values last).
top_late <- head(order(results$late$padj), n = 30)
mat <- assay(vsd)[top_late, ]
pheatmap(mat)

5.2.3 Results interaction

# Overview of the test results for the interaction contrast.
summary(results[["interaction"]])

## 
## out of 18635 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 11, 0.059%
## LFC < 0 (down)     : 16, 0.086%
## outliers [1]       : 0, 0%
## low counts [2]     : 12606, 68%
## (mean count < 263)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

# Histogram of the raw p-values for the interaction contrast.
hist(results$interaction$pvalue, xlab = "p-value")

# Volcano plot: effect size against -log10 p-value, with points coloured by
# whether the adjusted p-value is below 0.05.
volcanoInter <- ggplot(
  as.data.frame(results$interaction),
  aes(x = log2FoldChange, y = -log10(pvalue), color = padj < 0.05)
) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  ggtitle(paste("contrast", "interaction"))
print(volcanoInter)

## Warning: Removed 23487 rows containing missing values (geom_point).

# MA plot for the interaction contrast.
plotMA(results$interaction)

# Heatmap of the variance-stabilized values for all genes with an adjusted
# p-value below 0.05 for the interaction (order() places NA values last).
n_sig_interaction <- sum(results$interaction$padj < 0.05, na.rm = TRUE)
if (n_sig_interaction >= 2) {
  top_interaction <- head(order(results$interaction$padj), n_sig_interaction)
  # drop = FALSE keeps the subset a matrix whatever the number of rows.
  mat <- assay(vsd)[top_interaction, , drop = FALSE]
  pheatmap(mat)
} else {
  # Row clustering needs at least two genes, so skip the heatmap instead of
  # failing on an empty or single-row selection.
  message(
    "Fewer than two genes with padj < 0.05 for the interaction; ",
    "heatmap skipped."
  )
}

5.2.4 Plot count data of individual genes

DESeq2 allows a straightforward way of plotting the raw or normalised counts for a gene.

# Normalised, untransformed counts of a single gene, grouped by protocol and
# time.
gene_of_interest <- "ENSRNOG00000002419"
plotCounts(
  ds_matrix,
  gene = gene_of_interest,
  intgroup = c("protocol", "time"),
  normalized = TRUE,
  transform = FALSE
)

Hammer Dataset with DESeq2

21 November 2019

Contents