library(tidyverse)
## ── Attaching packages ───────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.5 ✔ purrr 0.3.4
## ✔ tibble 3.1.6 ✔ dplyr 1.0.8
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ──────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
\[\sigma^2= \sigma^2_{bio}+\sigma^2_\text{lab} +\sigma^2_\text{extraction} + \sigma^2_\text{run} + \ldots\]
https://www.nature.com/articles/nmeth.3005.pdf
Oneway anova is a special case of a completely randomized design:
In a block design the experimental units are blocks which are sampled at random of the population of all possible blocks.
Duguet et al. (2017) MCP 16(8):1416-1432. doi: 10.1074/mcp.m116.062745
We focus on one protein
<- read_tsv("https://raw.githubusercontent.com/statOmics/PSLSData/main/mouseP16045.txt") mouse
## Rows: 14 Columns: 3
## ── Column specification ──────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): celltype, mouse
## dbl (1): intensity
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
mouse
%>%
mouse ggplot(aes(x = mouse, y = intensity, col = celltype)) +
geom_point()
%>%
mouse ggplot(aes(x = celltype, y = intensity)) +
geom_point() +
geom_line(aes(group = mouse)) +
geom_point(aes(col = celltype))
The plots show evidence for - an upregulation of the protein expression in regulatory T-cells and - a considerable variability in expression from animal to animal!
This is a paired design, which is the most simple form of randomized complete block design.
In the introduction to statistical inference we would have analyzed the data by differencing.
<- mouse %>%
mouseWide spread(celltype, intensity) %>%
mutate(delta = Treg - Tcon)
mouseWide
%>%
mouseWide ggplot(aes(x = "", y = delta)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter()
<- mouseWide %>%
deltaSum summarize(
mean = mean(delta, na.rm = TRUE),
sd = sd(delta, na.rm = TRUE),
n = n()
%>%
) mutate(se = sd / sqrt(n))
deltaSum
Note, that the intensity data are not independent because we measured the expression in regulatory and ordinary T-cells for every animal
cor(mouseWide[, c("Tcon", "Treg")])
## Tcon Treg
## Tcon 1.00000 0.93874
## Treg 0.93874 1.00000
var(mouseWide[, c("Tcon", "Treg")])
## Tcon Treg
## Tcon 0.6101531 0.7245316
## Treg 0.7245316 0.9763042
var(mouseWide[, c("Tcon", "Treg")]) %>%
diag() %>%
sqrt()
## Tcon Treg
## 0.7811230 0.9880811
There is indeed a large correlation between the expression of the protein in conventional and regulatory T-cells.
Standard deviation of difference?
\[ \begin{array}{lcl} \text{sd}_{x_r - x_c} &=& \sqrt{1^2\hat \sigma_r^2 + (-1)^2 \hat \sigma_c^2 + 2 \times 1 \times -1 \times \hat\sigma_{r,c}}\\ &=&\sqrt{\hat \sigma_r^2 + \hat \sigma_c^2 - 2 \times \hat\sigma_{r,c}} \end{array} \]
<- (c(-1, 1) %*% var(mouseWide[, c("Tcon", "Treg")]) %*% c(-1, 1)) %>%
sdDelta2 sqrt()
sdDelta2
## [,1]
## [1,] 0.3706672
<- sdDelta2 / sqrt(deltaSum$n)
seDeltaBar seDeltaBar
## [,1]
## [1,] 0.140099
deltaSum
t.test(delta~1, mouseWide)
##
## One Sample t-test
##
## data: delta
## t = 8.5858, df = 6, p-value = 0.0001372
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.8600472 1.5456671
## sample estimates:
## mean of x
## 1.202857
We can also analyse the design using a linear model, i.e. with
Because we have measured the two cell types in every mouse, we can thus estimate the average log2-intensity of the protein in the T-cells for each mouse.
<- lm(intensity ~ celltype + mouse, mouse)
lmRCB plot(lmRCB, which = c(1, 2, 3))
If you have doubts on that your data violates the assumptions you can always simulate data from a model with similar effects as yours but where are distributional assumptions hold and compare the residual plots.
<- model.matrix(intensity ~ celltype + mouse, mouse)
design <- sqrt(car::Anova(lmRCB, type = "III")["mouse", "Sum Sq"] / car::Anova(lmRCB, type = "III")["mouse", "Df"])
sigmaMouse <- lmRCB$coefficients
betas <- mouse$mouse %>%
nMouse unique() %>%
length()
par(mfrow = c(3, 3))
for (i in 1:9)
{<- rnorm(nMouse, sd = sigmaMouse)
mouseEffect <- mouseEffect[-1] - mouseEffect[1]
betasMouse -c(1:2)] <- betasMouse
betas[<- design %*% betas + rnorm(nrow(design), sd = sigma(lmRCB))
ysim plot(lm(ysim ~ -1 + design), which = 1)
}
The deviations seen in our plot are in line with what those observed in simulations under the model assumptions. Hence, we can use the model for statistical inference.
<- car::Anova(lmRCB, type = "III")
anovaRCB summary(lmRCB)
##
## Call:
## lm(formula = intensity ~ celltype + mouse, data = mouse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2356 -0.1387 0.0000 0.1387 0.2356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.3671 0.1981 47.277 6.00e-09 ***
## celltypeTreg 1.2029 0.1401 8.586 0.000137 ***
## mousem2 0.5055 0.2621 1.929 0.102036
## mousem3 1.0255 0.2621 3.913 0.007869 **
## mousem4 0.8545 0.2621 3.260 0.017245 *
## mousem5 0.7880 0.2621 3.006 0.023809 *
## mousem6 2.1055 0.2621 8.033 0.000199 ***
## mousem7 2.4475 0.2621 9.338 8.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2621 on 6 degrees of freedom
## Multiple R-squared: 0.9717, Adjusted R-squared: 0.9388
## F-statistic: 29.47 on 7 and 6 DF, p-value: 0.000309
t.test(delta ~1 , mouseWide)
##
## One Sample t-test
##
## data: delta
## t = 8.5858, df = 6, p-value = 0.0001372
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.8600472 1.5456671
## sample estimates:
## mean of x
## 1.202857
anovaRCB
Notice that
the estimate, se, t-test statistic and p-value for the celltype effect of interest is the same as in the paired t-test!
the anova analysis shows that we model the total variability in the protein expression in T-cells using variability due to the cell type (CT), the variability from mouse to mouse (M) and residual variability (R) within mouse that we cannot explain. Indeed,
\[ \begin{array}{lcl} \text{SSTot} &=& \text{SSCT} + \text{SSM} + \text{SSE}\\ 14.6 &=& 5.1 + 9.1 + 0.4 \end{array} \]
So the celltype and the mouse effect explain respectively \[ \begin{array}{ll} \frac{\text{SSCT}}{\text{SSTot}}\times 100&\frac{\text{SSM}}{\text{SSTot}}\times 100\\\\ 34.7& 62.4\\ \end{array} \]
percent of the variability in the protein expression values and \[ \frac{\text{SSE}}{\text{SSTot}} \times 100 = 2.8 \] percent cannot be explained.
Note, that
Note, that the RCB also has to sacrifice a number of degrees of freedom to estimate the mouse effect, here 6 DF.
Hence, the power gain of the RCB is a trade-off between the variability that can be explained by the block effect and the loss in DF.
If you remember the equation of variance covariance matrix of the parameter estimators \[ \hat{\boldsymbol{\Sigma}}^2_{\hat{\boldsymbol{\beta}}} =\left(\mathbf{X}^T\mathbf{X}\right)^{-1}\hat\sigma^2 \]
you can see that the randomized complete block will have an impact on
\(\rightarrow\) We were able to isolate the variance in expression between animals/blocks from our analysis!
\(\rightarrow\) This reduces the variance of the residuals and leads to a power gain if the variability between mice/blocks is large.
Also note that,
\[ \hat\sigma^2 = \frac{\text{SSE}}{n-p} = \frac{SSTot - SSM - SSCT}{n-p} \]
Further, the degrees of freedom affect the t-distribution that is used for inference, which has broader tails if the residual degrees of freedom \(n-p\) are getting smaller.
In this section we will subset the original data in two experiments:
set.seed(859)
<- mouse %>%
mRcb pull(mouse) %>%
unique() %>%
sample(size = 3)
<- mouse %>% filter(mouse %in% mRcb)
rcbSmall rcbSmall
<- mouse %>%
mCrd pull(mouse) %>%
unique() %>%
sample(size = 6)
<-
crdSmall bind_rows(
%>%
mouse filter(mouse %in% mCrd[1:3]) %>%
filter(celltype == "Tcon"),
%>%
mouse filter(mouse %in% mCrd[-(1:3)]) %>%
filter(celltype == "Treg")
) crdSmall
So in both experiments we need to do six mass spectrometry runs.
<- lm(intensity ~ celltype, crdSmall)
lmCRDSmall summary(lmCRDSmall)
##
## Call:
## lm(formula = intensity ~ celltype, data = crdSmall)
##
## Residuals:
## 1 2 3 4 5 6
## -0.507 -0.201 0.708 -1.189 -0.275 1.464
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.5290 0.6077 17.326 6.51e-05 ***
## celltypeTreg 1.1300 0.8594 1.315 0.259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.053 on 4 degrees of freedom
## Multiple R-squared: 0.3018, Adjusted R-squared: 0.1272
## F-statistic: 1.729 on 1 and 4 DF, p-value: 0.2589
anova(lmCRDSmall)
<- lm(intensity ~ celltype + mouse, rcbSmall)
lmRCBSmall anova(lmRCBSmall)
summary(lmRCBSmall)
##
## Call:
## lm(formula = intensity ~ celltype + mouse, data = rcbSmall)
##
## Residuals:
## 1 2 3 4 5 6
## 0.06433 0.12633 -0.19067 -0.06433 -0.12633 0.19067
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.9577 0.1940 51.329 0.000379 ***
## celltypeTreg 1.0327 0.1940 5.323 0.033527 *
## mousem3 0.5200 0.2376 2.189 0.160094
## mousem7 1.9420 0.2376 8.173 0.014641 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2376 on 2 degrees of freedom
## Multiple R-squared: 0.9804, Adjusted R-squared: 0.951
## F-statistic: 33.32 on 3 and 2 DF, p-value: 0.02928
Note, that
A poor data analysis who forgets about the blocking will be back at square one:
<- lm(intensity ~ celltype, rcbSmall)
wrongRbc anova(wrongRbc)
summary(wrongRbc)
##
## Call:
## lm(formula = intensity ~ celltype, data = rcbSmall)
##
## Residuals:
## 1 2 3 4 5 6
## -0.7563 -0.1743 0.9307 -0.8850 -0.4270 1.3120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.7783 0.5885 18.316 5.23e-05 ***
## celltypeTreg 1.0327 0.8322 1.241 0.282
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.019 on 4 degrees of freedom
## Multiple R-squared: 0.2779, Adjusted R-squared: 0.09743
## F-statistic: 1.54 on 1 and 4 DF, p-value: 0.2825
So, the block to block variability is then absorbed in the variance estimator of the residual.
Of course, we are never allowed to analyse an RCB with a model for a CRD (without block factor) because blocking imposes a randomization restriction: we randomize the treatment within block.
<- sum(car::Anova(lmRCB, type = "III")[c("mouse", "Residuals"), "Sum Sq"]) / sum(car::Anova(lmRCB, type = "III")[c("mouse", "Residuals"), "Df"])
varBetweenPlusWithin
<- 0.05
alpha <- 20000
nSim <- 0
b0 <- sqrt(varBetweenPlusWithin)
sd <- c(3, 7)
ns <- lmRCB$coefficients["celltypeTreg"]
deltas
<- limma::makeContrasts("celltypeTreg", levels = c("(Intercept)", "celltypeTreg")) L
## Warning in limma::makeContrasts("celltypeTreg", levels =
## c("(Intercept)", : Renaming (Intercept) to Intercept
<- matrix(NA, nrow = length(ns) * length(deltas), ncol = 3) %>% as.data.frame()
powerFast names(powerFast) <- c("b1", "n", "power")
<- 0
i for (n in ns)
{<- n2 <- n
n1
### Simulation
<- data.frame(celltype = rep(c("Tcon", "Treg"), c(n1, n2)) %>% as.factor())
predictorData <- model.matrix(~celltype, predictorData)
design
for (b1 in deltas)
{<- rnorm(nrow(predictorData) * nSim, sd = sd)
ySim dim(ySim) <- c(nrow(predictorData), nSim)
<- ySim + c(design %*% c(b0, b1))
ySim <- t(ySim)
ySim
### Fitting
<- limma::lmFit(ySim, design)
fitAll
### Inference
<- c(t(L) %*% fitAll$cov.coefficients %*% L)
varUnscaled <- fitAll$coefficients %*% L
contrasts <- varUnscaled^.5 * fitAll$sigma
seContrasts <- contrasts / seContrasts
tstats <- pt(abs(tstats), fitAll$df.residual, lower.tail = FALSE) * 2
pvals
<- i + 1
i <- c(b1, n, mean(pvals < alpha))
powerFast[i, ]
}
} powerFast
Because we have a simple 2 group comparison we can also calculate the power using the power.t.test
function.
power.t.test(n = 3, delta = lmRCB$coefficients["celltypeTreg"], sd = sqrt(varBetweenPlusWithin))
##
## Two-sample t test power calculation
##
## n = 3
## delta = 1.202857
## sd = 0.8906339
## sig.level = 0.05
## power = 0.2477638
## alternative = two.sided
##
## NOTE: n is number in *each* group
power.t.test(n = 7, delta = lmRCB$coefficients["celltypeTreg"], sd = sqrt(varBetweenPlusWithin))
##
## Two-sample t test power calculation
##
## n = 7
## delta = 1.202857
## sd = 0.8906339
## sig.level = 0.05
## power = 0.6411438
## alternative = two.sided
##
## NOTE: n is number in *each* group
<- 0.05
alpha <- 20000
nSim <- 0
b0 <- sigma(lmRCB)
sd <- sqrt(car::Anova(lmRCB)["mouse", "Sum Sq"] / car::Anova(lmRCB)["mouse", "Df"])
sdMouse <- c(3, 7)
ns <- lmRCB$coefficients["celltypeTreg"]
deltas
<- matrix(NA, nrow = length(ns) * length(deltas), ncol = 3) %>% as.data.frame()
powerFastBlocking names(powerFastBlocking) <- c("b1", "n", "power")
<- 0
i for (n in ns)
{
### Simulation
<- data.frame(celltype = rep(c("Tcon", "Treg"), each = n) %>% as.factor(), mouse = paste0("m", rep(1:n, 2)))
predictorData <- model.matrix(~ celltype + mouse, predictorData)
design <- limma::makeContrasts("celltypeTreg", levels = colnames(design))
L
for (b1 in deltas)
{<- rnorm(nrow(predictorData) * nSim, sd = sd)
ySim dim(ySim) <- c(nrow(predictorData), nSim)
<- rnorm(n, sd = sdMouse)
mouseEffect <- mouseEffect[-1] - mouseEffect[1]
betasMouse <- ySim + c(design %*% c(b0, b1, betasMouse))
ySim <- t(ySim)
ySim
### Fitting
<- limma::lmFit(ySim, design)
fitAll
### Inference
<- c(t(L) %*% fitAll$cov.coefficients %*% L)
varUnscaled <- fitAll$coefficients %*% L
contrasts <- varUnscaled^.5 * fitAll$sigma
seContrasts <- contrasts / seContrasts
tstats <- pt(abs(tstats), fitAll$df.residual, lower.tail = FALSE) * 2
pvals
<- i + 1
i <- c(b1, n, mean(pvals < alpha))
powerFastBlocking[i, ]
} }
## Warning in limma::makeContrasts("celltypeTreg", levels =
## colnames(design)): Renaming (Intercept) to Intercept
## Warning in limma::makeContrasts("celltypeTreg", levels =
## colnames(design)): Renaming (Intercept) to Intercept
powerFastBlocking
Note, that the power is indeed much larger for the randomized complete block design. Both for the designs with 6 and 14 mass spectrometer runs.
Because we have an RCB with a block size of 2 (paired design) we can also calculate the power using the power.t.test function
with type = "one.sample"
and sd
equal to the standard deviation of the difference.
power.t.test(n = 3, delta = mean(mouseWide$delta), sd = sd(mouseWide$delta))
##
## Two-sample t test power calculation
##
## n = 3
## delta = 1.202857
## sd = 0.3706672
## sig.level = 0.05
## power = 0.8389961
## alternative = two.sided
##
## NOTE: n is number in *each* group
power.t.test(n = 7, delta = mean(mouseWide$delta), sd = sd(mouseWide$delta))
##
## Two-sample t test power calculation
##
## n = 7
## delta = 1.202857
## sd = 0.3706672
## sig.level = 0.05
## power = 0.999826
## alternative = two.sided
##
## NOTE: n is number in *each* group
Note, that the power is slightly different because for the power.t.test function we conditioned on the mice from our study. While in the simulation study we generated data for new mice by simulating the mouse effect from a normal distribution.
We will vary the block effect explains \[ \frac{\sigma^2_\text{between}}{\sigma^2_\text{between}+\sigma^2_\text{within}}=1-\frac{\sigma^2_\text{within}}{\sigma^2_\text{between}+\sigma^2_\text{within}} \] So in our example that is the ratio between the variability between the mice and the sum of the variability between and within the mice. Note, that the within mouse variability was the variance of the errors of the RCB. The ratio for our experiment equals
<- sum(car::Anova(lmRCB, type = "III")[c("mouse", "Residuals"), "Sum Sq"]) / sum(car::Anova(lmRCB, type = "III")[c("mouse", "Residuals"), "Df"])
varBetweenPlusWithin <- car::Anova(lmRCB)["Residuals", "Sum Sq"] / car::Anova(lmRCB)["Residuals", "Df"]
varWithin varBetweenPlusWithin
## [1] 0.7932287
varWithin
## [1] 0.06869707
1 - varWithin / varBetweenPlusWithin
## [1] 0.9133956
<- 0.05
alpha <- 20000
nSim <- 0
b0 <- sum(car::Anova(lmRCB, type = "III")[c("mouse", "Residuals"), "Sum Sq"]) / sum(car::Anova(lmRCB, type = "III")[c("mouse", "Residuals"), "Df"])
varBetweenPlusWithin
<- c(3, 7)
ns <- lmRCB$coefficients["celltypeTreg"]
deltas
<- seq(0, .95, .05)
fracVars
<- matrix(NA, nrow = length(ns) * length(fracVars), ncol = 3) %>% as.data.frame()
powerFastBlockingLow names(powerFastBlockingLow) <- c("fracVars", "n", "power")
<- 0
i
for (n in ns)
{
### Simulation
<- data.frame(celltype = rep(c("Tcon", "Treg"), each = n) %>% as.factor(), mouse = paste0("m", rep(1:n, 2)))
predictorData <- model.matrix(~ celltype + mouse, predictorData)
design <- limma::makeContrasts("celltypeTreg", levels = colnames(design))
L for (fracVar in fracVars)
{<- sqrt(varBetweenPlusWithin * (1 - fracVar))
sd <- sqrt(varBetweenPlusWithin * fracVar)
sdMouse for (b1 in deltas)
{<- rnorm(nrow(predictorData) * nSim, sd = sd)
ySim dim(ySim) <- c(nrow(predictorData), nSim)
<- rnorm(n, sd = sdMouse)
mouseEffect <- mouseEffect[-1] - mouseEffect[1]
betasMouse <- ySim + c(design %*% c(b0, b1, betasMouse))
ySim <- t(ySim)
ySim
### Fitting
<- limma::lmFit(ySim, design)
fitAll
### Inference
<- c(t(L) %*% fitAll$cov.coefficients %*% L)
varUnscaled <- fitAll$coefficients %*% L
contrasts <- varUnscaled^.5 * fitAll$sigma
seContrasts <- contrasts / seContrasts
tstats <- pt(abs(tstats), fitAll$df.residual, lower.tail = FALSE) * 2
pvals
<- i + 1
i <- c(fracVar, n, mean(pvals < alpha))
powerFastBlockingLow[i, ]
}
} }
## Warning in limma::makeContrasts("celltypeTreg", levels =
## colnames(design)): Renaming (Intercept) to Intercept
## Warning in limma::makeContrasts("celltypeTreg", levels =
## colnames(design)): Renaming (Intercept) to Intercept
powerFastBlockingLow
<- function(n) {
gg_color_hue <- seq(15, 375, length = n + 1)
hues hcl(h = hues, l = 65, c = 100)[1:n]
}<- gg_color_hue(2)
cols
%>%
powerFastBlockingLow as.data.frame() %>%
mutate(n = as.factor(n)) %>%
ggplot(aes(fracVars, power, group = n, color = n)) +
geom_line() +
geom_hline(yintercept = powerFast %>% filter(n == 3) %>% pull(power), color = cols[1]) +
annotate("text",
label = "CRD (n=3)",
x = 0.05, y = powerFast %>% filter(n == 3) %>% pull(power) + .02, size = 3, colour = cols[1]
+
) geom_hline(yintercept = powerFast %>% filter(n == 7) %>% pull(power), color = cols[2]) +
annotate("text",
label = "CRD (n=7)",
x = 0.05, y = powerFast %>% filter(n == 7) %>% pull(power) + .02, size = 3, colour = cols[2]
+
) xlab(expression(~ sigma[between]^2 / (sigma[between]^2 + sigma[within]^2))) +
geom_vline(xintercept = 1 - varWithin / varBetweenPlusWithin) +
xlim(0, 1)
So if the variance that is explained by the block effect is small you will loose power as compared to the analysis with a CRD design. Indeed, then
As soon as the block effect explains a large part of the variability it is very beneficial to use a randomized complete block design!
Note, that the same number of mass spectrometry runs have to be done for both the RCB and CRD design. However, for the RCB we only need half of the mice.
The production of penicillin corn steep liquor is used. Corn steep liquor is produced in blends and there is a considerable variability between the blends. Suppose that
How would you assign the methods to the blends.
data(penicillin, package = "faraway")
table(penicillin$blend, penicillin$treat)
##
## A B C D
## Blend1 1 1 1 1
## Blend2 1 1 1 1
## Blend3 1 1 1 1
## Blend4 1 1 1 1
## Blend5 1 1 1 1
head(penicillin)
matrix(penicillin$yield, nrow = 5, ncol = 4, byrow = TRUE, dimnames = list(levels(penicillin$blend), levels(penicillin$treat)))
## A B C D
## Blend1 89 88 97 94
## Blend2 84 77 92 79
## Blend3 81 87 87 85
## Blend4 87 92 89 84
## Blend5 79 81 80 88
%>%
penicillin ggplot(aes(x = blend, y = yield, group = treat, color = treat)) +
geom_line() +
geom_point()
%>%
penicillin ggplot(aes(x = treat, y = yield, group = blend, color = blend)) +
geom_line() +
geom_point()
We analyse the yield using
<- lm(yield ~ treat + blend, data = penicillin)
lmPen plot(lmPen)
::Anova(lmPen, type = "III") car
We conclude that the effect of the treatment on the penicillin yield is not significant at the 5% level of significance (p = 0.34.
We also observe that there is a large effect of the blend on the yield. Blend explains about 47.1% of the variability in the penicillin yield.
A study on the facultative pathogen Francisella tularensis was conceived by Ramond et al. (2015) [12].
F. tularensis enters the cells of its host by phagocytosis.
The authors showed that F. tularensis is arginine deficient and imports arginine from the host cell via an arginine transporter, ArgP, in order to efficiently escape from the phagosome and reach the cytosolic compartment, where it can actively multiply.
In their study, they compared the proteome of wild type F. tularensis (WT) to ArgP-gene deleted F. tularensis (knock-out, D8).
The sample for each bio-rep was run in technical triplicate on the mass-spectrometer.
We will use data for the protein 50S ribosomal protein L5 A0Q4J5
<- read_tsv("https://raw.githubusercontent.com/statOmics/PSLSData/main/francisellaA0Q4J5.txt") franc
## Rows: 18 Columns: 3
## ── Column specification ──────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): genotype, biorep
## dbl (1): intensityLog2
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
franc
%>%
franc ggplot(aes(biorep, intensityLog2, color = genotype)) +
geom_point()
\(\rightarrow\) Pseudo-replication, randomisation to bio-repeat and each bio-repeat measured in technical triplicate. \(\rightarrow\) If we would analyse the data using a linear model based on each measured intensity, we would act as if we had sampled 18 bio-repeats. \(\rightarrow\) Effect of interest has to be assessed between bio-repeats. So block analysis not possible!
If the same number of pseudo-replicates/technical replicates are available for each bio-repeat:
<- lm(intensityLog2 ~ -1 + biorep, franc)
lmBiorep lmBiorep
##
## Call:
## lm(formula = intensityLog2 ~ -1 + biorep, data = franc)
##
## Coefficients:
## biorepD8_n3 biorepD8_n4 biorepD8_n5 biorepWT_n3 biorepWT_n4
## 27.25 27.43 27.39 27.64 27.54
## biorepWT_n5
## 27.57
<- data.frame(genotype = rep(c("D8", "WT"), each = 3) %>% as.factor() %>% relevel("WT"), intensityLog2 = lmBiorep$coef)
francSum francSum
<- lm(intensityLog2 ~ genotype, francSum)
lmSum summary(lmSum)
##
## Call:
## lm(formula = intensityLog2 ~ genotype, data = francSum)
##
## Residuals:
## biorepD8_n3 biorepD8_n4 biorepD8_n5 biorepWT_n3 biorepWT_n4
## -0.10541 0.07010 0.03531 0.05566 -0.04531
## biorepWT_n5
## -0.01034
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.58266 0.04333 636.581 3.65e-11 ***
## genotypeD8 -0.22439 0.06128 -3.662 0.0215 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07505 on 4 degrees of freedom
## Multiple R-squared: 0.7702, Adjusted R-squared: 0.7128
## F-statistic: 13.41 on 1 and 4 DF, p-value: 0.02154
<- lm(intensityLog2 ~ genotype, franc)
lmWrong summary(lmWrong)
##
## Call:
## lm(formula = intensityLog2 ~ genotype, data = franc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21943 -0.04181 -0.01914 0.06537 0.17792
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.35826 0.03457 791.48 < 2e-16 ***
## genotypeWT 0.22439 0.04888 4.59 0.000302 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1037 on 16 degrees of freedom
## Multiple R-squared: 0.5684, Adjusted R-squared: 0.5414
## F-statistic: 21.07 on 1 and 16 DF, p-value: 0.0003017
Note, that the analysis where we ignore that we have multiple technical repeats for each bio-repeat returns results that are much more significant because we act as if we have much more independent observations.
<- sigma(lmBiorep)
sigmaWithin <- sigma(lmSum)
sigmaBetween <- model.matrix(~ -1 + biorep, franc)
xBiorep <- model.matrix(~genotype, franc)
xWrong
set.seed(2523)
<- 1000
nSim <- matrix(NA, nSim, 4) %>% as.data.frame()
resWrong names(resWrong) <- c("Estimate", "Std. Error", "t value", "pvalue")
<- resWrong
resCorrect <- franc$genotype
genotype <- francSum$genotype
genotypeSum <- franc$biorep
biorep
for (i in 1:nSim)
{<- rnorm(ncol(xBiorep), sd = sigmaBetween)
biorepSim <- xBiorep %*% biorepSim + rnorm(nrow(xBiorep), sd = sigmaWithin)
ySim <- lm(ySim ~ biorep)$coefficient
ySum <- summary(lm(ySim ~ genotype))$coefficient[2, ]
resWrong[i, ] <- summary(lm(ySum ~ genotypeSum))$coefficient[2, ]
resCorrect[i, ]
}mean(resCorrect$pvalue < 0.05)
## [1] 0.042
mean(resWrong$pvalue < 0.05)
## [1] 0.143
qplot(resCorrect$pvalue, geom = "histogram", boundary = c(0, 1)) +
stat_bin(breaks = seq(0, 1, .1)) +
xlab("pvalue") +
ggtitle("Correct analysis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Computation failed in `stat_bin()`:
## 'from' must be of length 1
qplot(resWrong$pvalue, geom = "histogram", boundary = c(0, 1)) +
stat_bin(breaks = seq(0, 1, .1)) +
xlab("pvalue") +
ggtitle("Wrong analysis")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Computation failed in `stat_bin()`:
## 'from' must be of length 1
So we observe that the analysis that does not account for pseudo-replication is too liberal!
The analysis that first summarizes over the technical repeats leads to correct p-values and correct type I error control!
What to do when you have an unequal number of technical repeats: more advanced methods are required
Mixed models can also be used when inference between and within blocks is needed, e.g. split-plot designs.
But, mixed models are beyond the scope of the lecture series.