## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## Filter, Find, Map, Position, Reduce, anyDuplicated, append,
## as.data.frame, basename, cbind, colnames, dirname, do.call,
## duplicated, eval, evalq, get, grep, grepl, intersect,
## is.unsorted, lapply, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, rank, rbind, rownames, sapply,
## setdiff, sort, table, tapply, union, unique, unsplit, which,
## which.max, which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply, rowsum
##
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
##
## space
## The following object is masked from 'package:S4Vectors':
##
## space
## The following object is masked from 'package:stats':
##
## lowess
Researchers assessed the effect of spinal nerve ligation (SNL) on the transcriptome of rats. In this experiment, transcriptome profiling occurred at two weeks and two months after treatment, for both the SNL group and a control group. Two biological replicates are used for every treatment - time combination. The researchers are interested in early and late effects and in genes for which the effect changes over time.
# Location of the serialized ExpressionSet (.RData) on the ReCount site.
file <- "http://bowtie-bio.sourceforge.net/recount/ExpressionSets/hammer_eset.RData"
# load() restores the objects saved in the file into the workspace; the
# ExpressionSet used below is `hammer.eset`.
load(url(file))
# Print a summary of the ExpressionSet.
hammer.eset
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 29516 features, 8 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SRX020102 SRX020103 ... SRX020098-101 (8 total)
## varLabels: sample.id num.tech.reps ... Time (5 total)
## varMetadata: labelDescription
## featureData
## featureNames: ENSRNOG00000000001 ENSRNOG00000000007 ...
## ENSRNOG00000045521 (29516 total)
## fvarLabels: gene
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation:
# Show the sample annotation (one row per sample) stored in the ExpressionSet.
Biobase::pData(hammer.eset)
## sample.id num.tech.reps protocol strain Time
## SRX020102 SRX020102 1 control Sprague Dawley 2 months
## SRX020103 SRX020103 2 control Sprague Dawley 2 months
## SRX020104 SRX020104 1 L5 SNL Sprague Dawley 2 months
## SRX020105 SRX020105 2 L5 SNL Sprague Dawley 2months
## SRX020091-3 SRX020091-3 1 control Sprague Dawley 2 weeks
## SRX020088-90 SRX020088-90 2 control Sprague Dawley 2 weeks
## SRX020094-7 SRX020094-7 1 L5 SNL Sprague Dawley 2 weeks
## SRX020098-101 SRX020098-101 2 L5 SNL Sprague Dawley 2 weeks
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.2
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## x dplyr::collapse() masks IRanges::collapse()
## x dplyr::combine() masks Biobase::combine(), BiocGenerics::combine()
## x dplyr::count() masks matrixStats::count()
## x dplyr::desc() masks IRanges::desc()
## x tidyr::expand() masks S4Vectors::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks S4Vectors::first()
## x dplyr::lag() masks stats::lag()
## x purrr::reduce() masks GenomicRanges::reduce(), IRanges::reduce()
## x dplyr::rename() masks S4Vectors::rename()
## x purrr::simplify() masks DelayedArray::simplify()
## x dplyr::slice() masks IRanges::slice()
# Sample annotation again: `protocol` and `Time` describe the experimental design.
Biobase::pData(hammer.eset)
## sample.id num.tech.reps protocol strain Time
## SRX020102 SRX020102 1 control Sprague Dawley 2 months
## SRX020103 SRX020103 2 control Sprague Dawley 2 months
## SRX020104 SRX020104 1 L5 SNL Sprague Dawley 2 months
## SRX020105 SRX020105 2 L5 SNL Sprague Dawley 2months
## SRX020091-3 SRX020091-3 1 control Sprague Dawley 2 weeks
## SRX020088-90 SRX020088-90 2 control Sprague Dawley 2 weeks
## SRX020094-7 SRX020094-7 1 L5 SNL Sprague Dawley 2 weeks
## SRX020098-101 SRX020098-101 2 L5 SNL Sprague Dawley 2 weeks
# First rows of the count matrix (genes in rows, samples in columns).
head(exprs(hammer.eset))
## SRX020102 SRX020103 SRX020104 SRX020105 SRX020091-3
## ENSRNOG00000000001 2 4 18 24 7
## ENSRNOG00000000007 4 1 3 1 5
## ENSRNOG00000000008 0 1 4 2 0
## ENSRNOG00000000009 0 0 0 0 0
## ENSRNOG00000000010 19 10 19 13 50
## ENSRNOG00000000012 7 5 1 0 31
## SRX020088-90 SRX020094-7 SRX020098-101
## ENSRNOG00000000001 4 93 77
## ENSRNOG00000000007 4 9 4
## ENSRNOG00000000008 5 2 6
## ENSRNOG00000000009 0 0 0
## ENSRNOG00000000010 57 45 58
## ENSRNOG00000000012 26 12 9
# Recode the design variables from the annotation values themselves rather than
# from the row order of pData() or the order of existing factor levels, so a
# reordered sample table cannot silently mislabel samples.
# `Time` is spelled inconsistently ("2 months" / "2months"), so only the unit is
# matched. "2w" is the reference level.
pData(hammer.eset)$time <- factor(
  ifelse(grepl("month", pData(hammer.eset)$Time), "2m", "2w"),
  levels = c("2w", "2m")
)
# Case-insensitive match keeps this idempotent when the chunk is re-run on the
# already recoded "c"/"snl" labels. Control ("c") is the reference level.
pData(hammer.eset)$protocol <- factor(
  ifelse(grepl("snl", pData(hammer.eset)$protocol, ignore.case = TRUE), "snl", "c"),
  levels = c("c", "snl")
)
# Build the DESeq2 data set: raw counts, sample annotation, and a design with
# main effects for time and protocol plus their interaction.
ds_matrix <- DESeqDataSetFromMatrix(
  countData = exprs(hammer.eset),
  colData = pData(hammer.eset),
  design = ~ time * protocol
)
With DESeq2 we can first do a variance stabilizing transformation before we make a principal component plot.
# Variance-stabilised counts; used below only for visualisation (PCA, heatmaps).
vsd <- DESeq2::vst(object = ds_matrix)
# PCA of the samples, labelled by the protocol x time combination.
plotPCA(
  vsd,
  intgroup = c("protocol", "time")
)
As we have already specified an experimental design when we created the DESeqDataSet, we can run the differential expression pipeline on the raw counts with a single call to the function DESeq. We can also plot the estimated dispersions.
# Fit the model for every gene: size factors, dispersions, then fitting and
# testing (each step is reported as a message).
ds_matrix <- DESeq2::DESeq(object = ds_matrix)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# Plot the estimated dispersions of the fitted data set.
DESeq2::plotDispEsts(ds_matrix)
The researchers are interested in an effect of the treatment at the early time point, the late timepoint and an interaction.
The following model is used at the gene-level
\[ \left\{ \begin{array}{lcl} y_{ig} &\sim& NB(\mu_{ig},\phi_g)\\ E[y_{ig}\vert \mathbf{x}_{ig}]&=&\mu_{ig}\\ log(\mu_{ig})&=&\eta_{ig}\\ \eta_{ig}&=&\beta_0 + \beta_{snl} x_{snl,i} + \beta_{t2m}x_{t2m,i} + \beta_\text{snl,t2m} x_{snl,i}x_{t2m,i} + \log N_i \end{array}\right. \]
with \(x_{snl,i}\) a dummy variable that is 1 if a rat had the spinal nerve ligation and is 0 otherwise, \(x_{t2m,i}\) a dummy variable that is 1 if the rat was sacrificed after 2 months and 0 otherwise, and \(\log{N}_i\) a normalisation offset to correct for sequencing depth. Note that \(\beta_{snl}\) is the main effect for spinal nerve ligation, and corresponds to the average log fold change between treated and control rats after two weeks. The interaction \(\beta_\text{snl,t2m}\) can be interpreted as the difference in the average log FC between treated and control rats at the late and at the early timepoint. The researchers are also interested in a third contrast: the effect of the treatment at the late time point.
\[ \log \text{FC}^\text{2 months}_\text{snl-c}= \beta_{snl}+\beta_{snl,t2m}\]
Below we implement the contrasts related to each of these research questions: is the gene DE at the early timepoint, is it DE at the late timepoint, and does the average log FC due to the treatment change over time?
# Contrast matrix: one row per research question, one column per model
# coefficient (in the order given by resultsNames()).
L <- matrix(
  0,
  nrow = 3,
  ncol = length(resultsNames(ds_matrix)),
  dimnames = list(
    c("early", "late", "interaction"),
    resultsNames(ds_matrix)
  )
)
# Coefficients are addressed by name, not by position: if the design or the
# reference levels change, this fails with "subscript out of bounds" instead of
# silently testing the wrong parameter.
# early: SNL effect at 2 weeks (main effect of protocol).
L["early", "protocol_snl_vs_c"] <- 1
# late: SNL effect at 2 months = main effect + interaction.
L["late", c("protocol_snl_vs_c", "time2m.protocolsnl")] <- 1
# interaction: change in the SNL effect between 2 months and 2 weeks.
L["interaction", "time2m.protocolsnl"] <- 1
L
## Intercept time_2m_vs_2w protocol_snl_vs_c time2m.protocolsnl
## early 0 0 1 0
## late 0 0 1 1
## interaction 0 0 0 1
# Test every contrast (row of L) on the fitted model. The result is a list
# named after the rows of L ("early", "late", "interaction").
# Note: this list is itself called `results`; calls to the DESeq2 function of
# the same name still resolve to the function.
results <- lapply(
  setNames(nm = rownames(L)),
  function(contrast_name) {
    DESeq2::results(ds_matrix, contrast = L[contrast_name, ])
  }
)
head(results[["early"]])
## log2 fold change (MLE): 0,0,+1,0
## Wald test p-value: 0,0,+1,0
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSRNOG00000000001 21.3037898353568 3.44433228426384 0.62864804051679
## ENSRNOG00000000007 3.54819286782193 0.0225496208590792 1.2108645710959
## ENSRNOG00000000008 2.51440015548349 0.176945720176101 1.59470653296445
## ENSRNOG00000000009 0 NA NA
## ENSRNOG00000000010 28.3401935116933 -0.558733024021648 0.433262109976245
## ENSRNOG00000000012 8.63308167355526 -1.94638538642785 0.754138707830786
## stat pvalue
## <numeric> <numeric>
## ENSRNOG00000000001 5.47895175404091 4.27852995436882e-08
## ENSRNOG00000000007 0.0186227439445772 0.985142058937118
## ENSRNOG00000000008 0.110958170997877 0.911649516551866
## ENSRNOG00000000009 NA NA
## ENSRNOG00000000010 -1.28959586161892 0.197191013340913
## ENSRNOG00000000012 -2.58093818314465 0.00985322235423953
## padj
## <numeric>
## ENSRNOG00000000001 3.8784823936471e-07
## ENSRNOG00000000007 0.991760472321361
## ENSRNOG00000000008 0.950077568329443
## ENSRNOG00000000009 NA
## ENSRNOG00000000010 0.342468699894264
## ENSRNOG00000000012 0.0292929035542801
The first column, baseMean, is just the average of the normalized count values (counts divided by size factors), taken over all samples in the DESeqDataSet. The remaining five columns refer to a specific contrast.
The column log2FoldChange is the effect size estimate. This value is reported on a logarithmic scale to base 2: for example, a log2 fold change of 1.5 means that the gene's expression is increased by a multiplicative factor of \(2^{1.5} \approx 2.83\).
Of course, this estimate has an uncertainty associated with it, which is available in the column lfcSE, the standard error estimate for the log2 fold change estimate. The result of a hypothesis test for the contrast is also provided: it is reported as a p value in the column pvalue.
DESeq2 uses the Benjamini-Hochberg (BH) False Discovery Rate adjustment (Benjamini and Hochberg 1995) as implemented in the base R p.adjust function to correct for multiple testing. These values, called the BH-adjusted p values, are given in the column padj of the object returned by the results function of DESeq2.
Sometimes a subset of the p values in results will be NA ("not available"). This is DESeq2's way of reporting that all counts for this gene were zero, and hence no test was applied. In addition, p values can be assigned NA if the gene was excluded from analysis because it contained an extreme count outlier. Finally, the adjusted p value (padj) can be NA while the p value is not: this happens for genes that were removed by independent filtering because of a low mean count. For more information, see the outlier detection and independent filtering sections of the DESeq2 vignette.
Note that if you want to test one specific parameter you can also provide the name of the parameter. E.g. "resultsNames(ds_matrix)[3]" is protocol_snl_vs_c, the main effect for SNL vs C, i.e. the log2 FC at the early timepoint. By default the results function assesses the null hypothesis that the parameter associated with the last column of the design matrix equals 0 using a Wald test. Here, this is the treatment x time interaction.
# Default call: no contrast or name given. The function is namespace-qualified
# because a list named `results` also exists in the workspace.
head(DESeq2::results(ds_matrix))
## log2 fold change (MLE): time2m.protocolsnl
## Wald test p-value: time2m.protocolsnl
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSRNOG00000000001 21.3037898353568 -0.75719777316548 0.991741895620587
## ENSRNOG00000000007 3.54819286782193 -0.41297647947652 1.86695727630851
## ENSRNOG00000000008 2.51440015548349 2.28363872199791 2.59728493012289
## ENSRNOG00000000009 0 NA NA
## ENSRNOG00000000010 28.3401935116933 0.608655687969142 0.68799651552782
## ENSRNOG00000000012 8.63308167355526 -1.7397010895344 1.79913607426771
## stat pvalue padj
## <numeric> <numeric> <numeric>
## ENSRNOG00000000001 -0.763502859472988 0.445163557651901 NA
## ENSRNOG00000000007 -0.221202962015868 0.824934403029993 NA
## ENSRNOG00000000008 0.879240739247605 0.379270759719744 NA
## ENSRNOG00000000009 NA NA NA
## ENSRNOG00000000010 0.88467844564909 0.376330084072618 NA
## ENSRNOG00000000012 -0.966964708460141 0.333561676405458 NA
# Test a single coefficient selected by name: the third entry of resultsNames().
head(
  DESeq2::results(
    ds_matrix,
    name = resultsNames(ds_matrix)[3]
  )
)
## log2 fold change (MLE): protocol snl vs c
## Wald test p-value: protocol snl vs c
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE
## <numeric> <numeric> <numeric>
## ENSRNOG00000000001 21.3037898353568 3.44433228426384 0.62864804051679
## ENSRNOG00000000007 3.54819286782193 0.0225496208590792 1.2108645710959
## ENSRNOG00000000008 2.51440015548349 0.176945720176101 1.59470653296445
## ENSRNOG00000000009 0 NA NA
## ENSRNOG00000000010 28.3401935116933 -0.558733024021648 0.433262109976245
## ENSRNOG00000000012 8.63308167355526 -1.94638538642785 0.754138707830786
## stat pvalue
## <numeric> <numeric>
## ENSRNOG00000000001 5.47895175404091 4.27852995436882e-08
## ENSRNOG00000000007 0.0186227439445772 0.985142058937118
## ENSRNOG00000000008 0.110958170997877 0.911649516551866
## ENSRNOG00000000009 NA NA
## ENSRNOG00000000010 -1.28959586161892 0.197191013340913
## ENSRNOG00000000012 -2.58093818314465 0.00985322235423953
## padj
## <numeric>
## ENSRNOG00000000001 3.8784823936471e-07
## ENSRNOG00000000007 0.991760472321361
## ENSRNOG00000000008 0.950077568329443
## ENSRNOG00000000009 NA
## ENSRNOG00000000010 0.342468699894264
## ENSRNOG00000000012 0.0292929035542801
# Counts of up- and down-regulated genes for the early contrast.
summary(results[["early"]])
##
## out of 18635 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3224, 17%
## LFC < 0 (down) : 3271, 18%
## outliers [1] : 0, 0%
## low counts [2] : 3152, 17%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Distribution of the raw p-values for the early contrast.
hist(results$early$pvalue, xlab = "p-value")
# Volcano plot: effect size against significance; genes with padj < 0.05 in red.
volcanoEarly <- ggplot(
  data = as.data.frame(results$early),
  mapping = aes(
    x = log2FoldChange,
    y = -log10(pvalue),
    color = padj < 0.05
  )
) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  ggtitle(paste("contrast", "early"))
print(volcanoEarly)
## Warning: Removed 14033 rows containing missing values (geom_point).
# MA plot for the early contrast.
plotMA(results$early)
# Heatmap of the variance-stabilised counts for the 30 genes with the smallest
# adjusted p-values.
top_early <- head(order(results$early$padj), 30)
mat <- assay(vsd)[top_early, ]
pheatmap(mat)
# Counts of up- and down-regulated genes for the late contrast.
summary(results[["late"]])
##
## out of 18635 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3110, 17%
## LFC < 0 (down) : 3282, 18%
## outliers [1] : 0, 0%
## low counts [2] : 3502, 19%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Distribution of the raw p-values for the late contrast.
hist(results$late$pvalue, xlab = "p-value")
# Volcano plot: effect size against significance; genes with padj < 0.05 in red.
volcanoLate <- ggplot(
  data = as.data.frame(results$late),
  mapping = aes(
    x = log2FoldChange,
    y = -log10(pvalue),
    color = padj < 0.05
  )
) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  ggtitle(paste("contrast", "late"))
print(volcanoLate)
## Warning: Removed 14383 rows containing missing values (geom_point).
# MA plot for the late contrast.
plotMA(results$late)
# Heatmap of the variance-stabilised counts for the 30 genes with the smallest
# adjusted p-values.
top_late <- head(order(results$late$padj), 30)
mat <- assay(vsd)[top_late, ]
pheatmap(mat)
# Counts of up- and down-regulated genes for the interaction contrast.
summary(results[["interaction"]])
##
## out of 18635 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 11, 0.059%
## LFC < 0 (down) : 16, 0.086%
## outliers [1] : 0, 0%
## low counts [2] : 12606, 68%
## (mean count < 263)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Distribution of the raw p-values for the interaction contrast.
hist(results$interaction$pvalue, xlab = "p-value")
# Volcano plot: effect size against significance; genes with padj < 0.05 in red.
volcanoInter <- ggplot(
  data = as.data.frame(results$interaction),
  mapping = aes(
    x = log2FoldChange,
    y = -log10(pvalue),
    color = padj < 0.05
  )
) +
  geom_point() +
  scale_color_manual(values = c("black", "red")) +
  ggtitle(paste("contrast", "interaction"))
print(volcanoInter)
## Warning: Removed 23487 rows containing missing values (geom_point).
# MA plot for the interaction contrast.
plotMA(results$interaction)
# Heatmap of all genes with a significant interaction (padj < 0.05).
# order() puts NA padj values last, so the first n_sig indices are exactly the
# significant genes.
n_sig <- sum(results$interaction$padj < 0.05, na.rm = TRUE)
# drop = FALSE keeps `mat` a matrix even when only one gene is selected.
mat <- assay(vsd)[head(order(results$interaction$padj), n_sig), , drop = FALSE]
if (nrow(mat) >= 2) {
  pheatmap(mat)
} else {
  # Row clustering needs at least two genes; skip the plot instead of erroring.
  message(
    "Fewer than two genes with padj < 0.05 for the interaction; ",
    "heatmap skipped."
  )
}
DESeq2 allows a straightforward way of plotting the raw or normalised counts for a gene.
# Normalised, untransformed counts of a single gene, grouped by protocol and time.
plotCounts(
  ds_matrix,
  gene = "ENSRNOG00000002419",
  intgroup = c("protocol", "time"),
  normalized = TRUE,
  transform = FALSE
)