library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Histologic grade in breast cancer provides clinically important prognostic information. Researchers examined whether histologic grade was associated with gene expression profiles of breast cancers and whether such profiles could be used to improve histologic grading. In this tutorial we will assess the association between histologic grade and the expression of the KPNA2 gene that is known to be associated with poor BC prognosis. The patients, however, do not only differ in the histologic grade, but also on their lymph node status. The lymph nodes were not affected (0) or chirugically removed (1).
<- read.table("https://raw.githubusercontent.com/statOmics/SGA21/master/data/kpna2.txt",header=TRUE)
kpna2 kpna2
Because histolic grade and lymph node status are both categorical variables, we model them both as factors.
$grade <- ...
kpna2$node <- ... kpna2
%>%
kpna2 ggplot(aes(x=node:grade,y=gene,fill=node:grade)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter()
As discussed in a previous exercise, it seems that there is both an effect of histologic grade and lymph node status on the gene expression. There also seems to be a different effect of lymph node status on the gene expression for the different histologic grades.
As we saw before, we can model this with a model that contains both histologic grade, lymph node status and the interaction term between both. When checking the linear model assumptions, we see that the variance is not equal. Therefore we model the gene expression with a log2-transformation, which makes that all the assumptions of the linear model are satisfied.
#Model with main effects for histological grade and node and grade x node interaction
Check if the interaction term is significant:
library(car)
As we are dealing with a factorial design, we can calculate the mean gene expression for each group by the following parameter summations.
::VisualizeDesign(kpna2,~grade*node)$plotlist ExploreModelMatrix
The researchers want to know the power for testing following hypotheses (remark that we will have to adjust for multiple testing):
\[H_0: \log_2{FC}_{g3n0-g1n0} = \beta_{g3} = 0\]
\[H_0: \log_2{FC}_{g3n1-g1n1} = \beta_{g3} + \beta_{g3n1} = 0\]
\[H_0: \log_2{FC}_{g1n1-g1n0} = \beta_{n1} = 0\]
\[H_0: \log_2{FC}_{g3n1-g3n0} = \beta_{n1} + \beta_{g3n1} = 0\]
\[H_0: \log_2{FC}_{g3n1-g1n1} - \log_2{FC}_{g3n0-g1n0} = \beta_{g3n1} = 0\] which is an equivalent hypotheses with \[H_0: \log_2{FC}_{g3n1-g3n0} - \log_2{FC}_{g1n1-g1n0} = \beta_{g3n1} = 0\]
We can test this using multcomp, which controls for multiple testing.
library(multcomp)
We get a significant p-value for the first, second, third and fifth hypothesis. The fourth hypothesis is not significant at the overall 5% significance level.
Function to simulate data similar to that of our experiment under our model assumptions.
<- function(form, data, betas, sd, contrasts, alpha = .05, nSim = 10000, adjust = "bonferroni")
simFastMultipleContrasts
{<- rnorm(nrow(data)*nSim,sd=sd)
ySim dim(ySim) <-c(nrow(data),nSim)
<- model.matrix(form, data)
design <- ySim + c(design %*%betas)
ySim <- t(ySim)
ySim
### Fitting
<- limma::lmFit(ySim,design)
fitAll
if(class(contrasts) == "glht"){
<- t(contrasts$linfct)
contrasts
}
### Inference
<- t(contrasts)%*%fitAll$cov.coefficients%*%contrasts
varUnscaled <- fitAll$coefficients %*%contrasts
contrasts <- matrix(diag(varUnscaled)^.5,nrow=nSim,ncol=5,byrow = TRUE)*fitAll$sigma
seContrasts <- contrasts/seContrasts
tstats <- pt(abs(tstats),fitAll$df.residual,lower.tail = FALSE)*2
pvals <- t(apply(pvals, 1, p.adjust, method = adjust))
pvals
return(colMeans(pvals < alpha))
}
<- simFastMultipleContrasts(form = ...,
power1 data = ...,
betas = ...,
sd = ...,
contrasts = ...,
alpha = ...,
nSim = ...)
power1
We observe large powers for all contrasts, except for contrast nodeg3, which has a small effect size.
<- matrix(NA,nrow=9, ncol=6)
powers colnames(powers) <- c("n",colnames(contrasts))
1] <- 2:10
powers[,
# Zorg hier voor 1 designpunt (of observatie) per groep. In de for-loop gaan we deze designpunten herhalen voor het aantal observaties.
<- data.frame(grade = ...,
dataAllComb node = ...)
for (i in 1:nrow(powers))
{ <- data.frame(grade = rep(dataAllComb$grade, powers[i,1]),
predData node = rep(dataAllComb$node, powers[i,1]))
-1] <- simFastMultipleContrasts(form = ...,
powers[i,data = predData,
betas = ...,
sd = ...,
contrasts = ...,
alpha = ...,
nSim = ...)
} powers
%>%
powers %>%
as.data.frame gather(contrast, power, -n) %>%
ggplot(aes(n,power,color=contrast)) +
geom_line()
<- simFastMultipleContrasts(form = ...,
power3 data = ...,
betas = ...,
sd = ...,
contrasts = ...,
alpha = ...,
nSim = ...)
power3