# Load the tidyverse for data wrangling and plotting.
library(tidyverse)

# Read the KPNA2 expression data: one gene measured in breast cancer
# patients, with histological grade, lymph node status, tumor size and age.
gene <- read.table(
  "https://raw.githubusercontent.com/statOmics/SGA2020/data/gse2990BreastcancerOneGene.txt",
  header = TRUE
)
head(gene)
## sample_name grade node size age gene
## 28 OXFT_2221 3 1 5.5 76 367.8179
## 29 OXFT_209 3 1 2.5 66 590.3576
## 30 OXFT_1769 1 1 3.5 86 346.6583
## 31 OXFT_928 1 0 1.1 47 118.6996
## 32 OXFT_2093 1 1 2.2 74 519.4489
## 33 OXFT_1770 1 1 1.7 69 258.4455
We will transform the variables grade and node into factors.
# Grade and node are categorical, so encode them as factors
# (otherwise lm()/ggplot() would treat them as numeric).
gene$grade <- as.factor(gene$grade)
gene$node <- as.factor(gene$node)
gene
# Per-grade summary statistics of the gene expression:
# mean, standard deviation, sample size, and standard error of the mean.
geneSum <- gene %>%
  group_by(grade) %>%
  summarize(
    mean = mean(gene),
    sd = sd(gene),
    n = length(gene)
  ) %>%
  mutate(se = sd / sqrt(n))
geneSum
## # A tibble: 2 x 5
## grade mean sd n se
## <fct> <dbl> <dbl> <int> <dbl>
## 1 1 264. 117. 19 26.7
## 2 3 606. 267. 17 64.9
# Boxplot of expression by grade with the raw data points overlaid.
# Outliers are suppressed in the boxplot so they are not drawn twice.
gene %>%
  ggplot(aes(x = grade, y = gene)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point()
We can also save the plots as objects for later use!
# Save the plots as objects so they can be reused/arranged later.
p1 <- gene %>%
  ggplot(aes(x = grade, y = gene)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point()

# QQ-plot of expression for grade 1 patients.
p2 <- gene %>%
  filter(grade == 1) %>%
  ggplot(aes(sample = gene)) +
  geom_qq() +
  geom_qq_line()

# QQ-plot of expression for grade 3 patients.
# NOTE(review): the source filtered grade == 1 here as well, which made p3 a
# duplicate of p2; grade == 3 matches the later side-by-side comparison.
p3 <- gene %>%
  filter(grade == 3) %>%
  ggplot(aes(sample = gene)) +
  geom_qq() +
  geom_qq_line()

p1
p2
p3
Researchers want to assess the association of the histological grade with KPNA2 gene expression.
# Effect size: difference in mean expression (grade 3 - grade 1),
# with its standard error = sqrt(sum of squared group SEs).
effectSize <- tibble(
  delta = geneSum$mean[2] - geneSum$mean[1],
  seDelta = geneSum %>%
    pull(se) %>%
    .^2 %>%
    sum() %>%
    sqrt()
)
effectSize
## # A tibble: 1 x 2
## delta seDelta
## <dbl> <dbl>
## 1 342. 70.2
In general we start from the alternative hypothesis \(H_A\): we want to show an association.
Gene expression of grade 1 and grade 3 patients is on average different
But, we will assess it by falsifying the opposite:
The average KPNA2 gene expression of grade 1 and grade 3 patients is equal
How likely is it to observe an equal or more extreme association than the one observed in the sample when the null hypothesis is true?
When we make assumptions about the distribution of our test statistic we can quantify this probability: p-value.
If the p-value is below a significance threshold \(\alpha\) we reject the null hypothesis
We control the probability on a false positive result at the \(\alpha\)-level (type I error)
# Arrange multiple ggplot objects in one figure.
library(gridExtra)
# Boxplot with raw data, then the two QQ-plots side by side.
p1
grid.arrange(p2,p3,ncol=2)
# Welch two-sample t-test (unequal variances) of expression by grade.
t.test(gene~grade,data=gene)
##
## Welch Two Sample t-test
##
## data: gene by grade
## t = -4.8806, df = 21.352, p-value = 7.61e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -488.1734 -196.6587
## sample estimates:
## mean in group 1 mean in group 3
## 263.5516 605.9677
# Compute the t-statistic and a two-sided p-value by hand, using the
# Welch degrees of freedom (21.352) reported by t.test() above.
effectSize <- effectSize %>%
  mutate(t.stat = delta / seDelta) %>%
  mutate(p.value = pt(-abs(t.stat), 21.352) * 2)
effectSize
## # A tibble: 1 x 4
## delta seDelta t.stat p.value
## <dbl> <dbl> <dbl> <dbl>
## 1 342. 70.2 4.88 0.0000761
\[ \log_2(B) - \log_2(A) = \log_2 \frac{B}{A} = \log_2 FC_{\frac{B}{A}} \]
# Add a log2-transformed expression column; differences on this scale
# are log2 fold changes.
gene <- gene %>%
  mutate(lgene = log2(gene))
# Same three plots as before, now on the log2 scale.
p1 <- gene %>%
  ggplot(aes(x = grade, y = lgene)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point()

# QQ-plot of log2 expression for grade 1 patients.
p2 <- gene %>%
  filter(grade == 1) %>%
  ggplot(aes(sample = lgene)) +
  geom_qq() +
  geom_qq_line()

# QQ-plot of log2 expression for grade 3 patients.
# NOTE(review): the source filtered grade == 1 again (duplicate of p2);
# grade == 3 is what grid.arrange(p2, p3) below is meant to compare.
p3 <- gene %>%
  filter(grade == 3) %>%
  ggplot(aes(sample = lgene)) +
  geom_qq() +
  geom_qq_line()

p1
grid.arrange(p2, p3, ncol = 2)
# Welch t-test on the log2-transformed expression; keep the result
# so we can extract the estimates and confidence interval later.
logtest <- t.test(lgene ~ grade, data = gene)
logtest
##
## Welch Two Sample t-test
##
## data: lgene by grade
## t = -6.0508, df = 33.962, p-value = 7.432e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.6236927 -0.8072052
## sample estimates:
## mean in group 1 mean in group 3
## 7.912963 9.128412
# Log2 fold change: difference of the group means on the log2 scale.
log2FC <- logtest$estimate[2] - logtest$estimate[1]
log2FC
## mean in group 3
##        1.215449
names(log2FC) <- "g3-g1"
# Back-transform to a fold change on the original expression scale.
2^log2FC
## g3-g1
## 2.32213
There is an extremely significant association of the histological grade with the gene expression in tumor tissue. On average, the gene expression for the grade 3 patients is 2.3221304 times higher than the gene expression in grade 1 patients (95% CI [1.75, 3.08], \(p \ll 0.001\)).
The patients also differ in their lymph node status. Hence, we have a two-factorial design: grade x lymph node status!
Solution??
How can we integrate multiple factors and continuous covariates in a linear model?
\[ y_i= \beta_0 + \beta_1 x_{i,1} + \beta_2 x_{i,2} + \beta_{12}x_{i,1}x_{i,2}+\epsilon_i, \] with
# Fit the two-factor linear model with interaction:
# gene ~ grade + node + grade:node (treatment coding).
lm1 <- lm(gene ~ grade * node, data = gene)
summary(lm1)
##
## Call:
## lm(formula = gene ~ grade * node, data = gene)
##
## Residuals:
## Min 1Q Median 3Q Max
## -356.85 -91.98 -31.47 53.00 612.73
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.60 60.00 3.460 0.00155 **
## grade3 434.21 84.85 5.117 1.41e-05 ***
## node1 132.88 92.46 1.437 0.16040
## grade3:node1 -234.43 136.92 -1.712 0.09655 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 199 on 32 degrees of freedom
## Multiple R-squared: 0.4809, Adjusted R-squared: 0.4322
## F-statistic: 9.881 on 3 and 32 DF, p-value: 9.181e-05
# Residual diagnostic plots for the fitted linear model.
plot(lm1)

# Visualise the design matrix / parameterisation of the interaction model.
library(ExploreModelMatrix)
explMx <- VisualizeDesign(gene, designFormula = ~ grade * node)
explMx$plotlist
## [[1]]
You can also explore the model matrix interactively:
# Launch the interactive (shiny) app to explore the design matrix.
ExploreModelMatrix(gene,designFormula = ~grade*node)
\[ \begin{array}{ccc} \frac{\partial RSS}{\partial \mathbf{\beta}}&=&\mathbf{0}\\\\ \frac{(\mathbf{Y}-\mathbf{X\beta})^T(\mathbf{Y}-\mathbf{X\beta})}{\partial \mathbf{\beta}}&=&\mathbf{0}\\\\ -2\mathbf{X}^T(\mathbf{Y}-\mathbf{X\beta})&=&\mathbf{0}\\\\ \mathbf{X}^T\mathbf{X\beta}&=&\mathbf{X}^T\mathbf{Y}\\\\ \hat{\mathbf{\beta}}&=&(\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{Y} \end{array} \]
Toy Example: fit without intercept
# Toy example: n = 3 observations, p = 2 predictors, no intercept.
set.seed(4)
x1 <- c(2, 0, 0)
x2 <- c(0, 2, 0)
# Response: average of x1 and x2 plus Gaussian noise with mean 2.
y <- x1 * 0.5 + x2 * 0.5 + rnorm(3, 2)
# Least squares fit without intercept (y ~ -1 + ...).
fit <- lm(y ~ -1 + x1 + x2)
# Predict fitted values on a regular (x1, x2) grid, for drawing the
# fitted plane in the 3D plot below.
x1pred <- seq(-1, 3, length.out = 10)
x2pred <- seq(-1, 3, length.out = 10)
xy <- expand.grid(x1 = x1pred, x2 = x2pred)
# NOTE(review): predict() on the 10x10 grid with interval = "prediction"
# yields 100 x 3 values, which are recycled into a 30 x 30 matrix here —
# confirm the intended grid/matrix dimensions against the original analysis.
ypred <- matrix(
  nrow = 30, ncol = 30,
  data = predict(fit, newdata = data.frame(xy), interval = "prediction")
)
# 3D scatterplot of the toy data.
library(plot3D)

# Viewing angles (theta = azimuth, phi = colatitude).
th <- 20
ph <- 5
# NOTE(review): ph is not passed to scatter3D below — presumably
# phi = ph was intended; confirm against the original figure.
scatter3D(x1,
          x2,
          y, pch = 16,
          col = "darkblue",
          cex = 1,
          theta = th,
          ticktype = "detailed",
          xlab = "x1",
          ylab = "x2",
          zlab = "y",
          colvar = FALSE,
          bty = "g",
          xlim = c(-1, 3),
          ylim = c(-1, 3),
          zlim = c(-2, 4))
# Dashed droplines connecting each observed point to its fitted value,
# i.e. its projection onto the fitted plane.
for (i in seq_along(x1)) {
  lines3D(
    x = rep(x1[i], 2),
    y = rep(x2[i], 2),
    z = c(y[i], fit$fitted[i]),
    col = "darkblue",
    add = TRUE,
    lty = 2
  )
}
# Height of the fitted (no-intercept) plane over every grid point:
# z = beta1 * x1 + beta2 * x2.
z.pred3D <- outer(
  x1pred,
  x2pred,
  function(x1, x2) {
    fit$coef[1] * x1 + fit$coef[2] * x2
  }
)
# Grid coordinates in matrix form, as surf3D expects.
x.pred3D <- outer(x1pred, x2pred, function(x, y) x)
y.pred3D <- outer(x1pred, x2pred, function(x, y) y)
# Draw the fitted regression plane over the prediction grid
# (facets = NA: transparent surface showing only the grid lines).
surf3D(
x.pred3D,
y.pred3D,
z.pred3D,col="blue",
facets=NA,
add=TRUE)
We can also interpret the fit as the projection of the \(n\times 1\) vector \(\mathbf{Y}\) on the column space of the matrix \(\mathbf{X}\).
So each column in \(\mathbf{X}\) is also an \(n\times 1\) vector.
For the toy example n=3 and p=2. So the column space of X is a plane in the three dimensional space.
\[ \hat{\mathbf{Y}} = \mathbf{X} (\mathbf{X}^T\mathbf{X})^{-1} \mathbf{X}^T \mathbf{Y} \]
# Column-space view: plot the columns of X (vectors X1, X2 in R^3).
arrows3D(0,0,0,x1[1],x1[2],x1[3],xlim=c(0,5),ylim=c(0,5),zlim=c(0,5),bty = "g",theta=th,col=2,xlab="row 1",ylab="row 2",zlab="row 3")
text3D(x1[1],x1[2],x1[3],labels="X1",col=2,add=TRUE)
arrows3D(0,0,0,x2[1],x2[2],x2[3],add=TRUE,col=2)
text3D(x2[1],x2[2],x2[3],labels="X2",col=2,add=TRUE)
# Same figure, now adding the response vector Y.
arrows3D(0,0,0,x1[1],x1[2],x1[3],xlim=c(0,5),ylim=c(0,5),zlim=c(0,5),bty = "g",theta=th,col=2,xlab="row 1",ylab="row 2",zlab="row 3")
text3D(x1[1],x1[2],x1[3],labels="X1",col=2,add=TRUE)
arrows3D(0,0,0,x2[1],x2[2],x2[3],add=TRUE,col=2)
text3D(x2[1],x2[2],x2[3],labels="X2",col=2,add=TRUE)
arrows3D(0,0,0,y[1],y[2],y[3],add=TRUE,col="darkblue")
text3D(y[1],y[2],y[3],labels="Y",col="darkblue",add=TRUE)
# Same figure, now adding the fitted vector (projection of Y onto the
# column space of X) and the residual segment from Y to the fit.
arrows3D(0,0,0,x1[1],x1[2],x1[3],xlim=c(0,5),ylim=c(0,5),zlim=c(0,5),bty = "g",theta=th,col=2,xlab="row 1",ylab="row 2",zlab="row 3")
text3D(x1[1],x1[2],x1[3],labels="X1",col=2,add=TRUE)
arrows3D(0,0,0,x2[1],x2[2],x2[3],add=TRUE,col=2)
text3D(x2[1],x2[2],x2[3],labels="X2",col=2,add=TRUE)
arrows3D(0,0,0,y[1],y[2],y[3],add=TRUE,col="darkblue")
text3D(y[1],y[2],y[3],labels="Y",col="darkblue",add=TRUE)
arrows3D(0,0,0,fit$fitted[1],fit$fitted[2],fit$fitted[3],add=TRUE,col="darkblue")
segments3D(y[1],y[2],y[3],fit$fitted[1],fit$fitted[2],fit$fitted[3],add=TRUE,lty=2,col="darkblue")
text3D(fit$fitted[1],fit$fitted[2],fit$fitted[3],labels="fit",col="darkblue",add=TRUE)
\[ \begin{array}{ccl} \hat{\boldsymbol{\Sigma}}_{\hat{\boldsymbol{\beta}}} &=&\text{var}\left[(\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{Y}\right]\\\\ &=&(\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\text{var}\left[\mathbf{Y}\right]\mathbf{X}(\mathbf{X}^T\mathbf{X})^{-1}\\\\ &=&(\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T(\mathbf{I}\sigma^2)\mathbf{X}(\mathbf{X}^T\mathbf{X})^{-1}\\\\ &=&(\mathbf{X}^T\mathbf{X})^{-1}\mathbf{X}^T\mathbf{X}(\mathbf{X}^T\mathbf{X})^{-1}\sigma^2\\\\ &=&(\mathbf{X}^T\mathbf{X})^{-1}\sigma^2 \end{array} \]
Hypothesis often involve linear combinations of the model parameters!
e.g.
\(H_0: \log_2{FC}_{g3n1-g1n1} = \beta_{g3} + \beta_{g3:n1} = 0\) \(\rightarrow\) “grade3+grade3:node1 = 0”
Let \[ \boldsymbol{\beta} = \left[ \begin{array}{c} \beta_{0}\\ \beta_{g3}\\ \beta_{n1}\\ \beta_{g3:n1} \end{array} \right]\]
we can write that contrast using a contrast matrix: \[ \mathbf{L}=\left[\begin{array}{c}0\\1\\0\\1\end{array}\right] \rightarrow \mathbf{L}^T\boldsymbol{\beta} \]
Then the variance becomes: \[ \text{var}_{\mathbf{L}^T\boldsymbol{\hat\beta}}= \mathbf{L}^T \boldsymbol{\Sigma}_{\boldsymbol{\hat\beta}}\mathbf{L} \]
Study the solution of the exercise to understand the analysis in R https://gtpb.github.io/PSLS20/pages/08-multipleRegression/08-multipleRegression_KPNA2.html
Calculate
Tip: details on the implementation can be found in the book of Faraway (chapter 2). https://people.bath.ac.uk/jjf23/book/
# Design matrix of the two-factor interaction model (treatment coding);
# transposed so each column of the printout is one patient.
X <- model.matrix(~ grade * node, data = gene)
t(X)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## (Intercept) 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## grade3 1 1 0 0 0 0 0 1 0 1 0 1 0 1 1 0 1 0 0 0 0 1 0
## node1 1 1 1 0 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1
## grade3:node1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0
## 24 25 26 27 28 29 30 31 32 33 34 35 36
## (Intercept) 1 1 1 1 1 1 1 1 1 1 1 1 1
## grade3 1 1 0 1 1 1 1 0 1 0 0 1 0
## node1 0 0 1 0 1 0 0 1 0 0 1 1 0
## grade3:node1 0 0 0 0 1 0 0 0 0 0 0 1 0
## attr(,"assign")
## [1] 0 1 2 3
## attr(,"contrasts")
## attr(,"contrasts")$grade
## [1] "contr.treatment"
##
## attr(,"contrasts")$node
## [1] "contr.treatment"
# X'X, the cross-product matrix from the normal equations.
crossprod(X)
## (Intercept) grade3 node1 grade3:node1
## (Intercept) 36 17 14 6
## grade3 17 17 6 6
## node1 14 6 14 6
## grade3:node1 6 6 6 6
\[ df = n-p\]
# Full summary of the interaction model fitted above (lm1).
summary(lm1)
##
## Call:
## lm(formula = gene ~ grade * node, data = gene)
##
## Residuals:
## Min 1Q Median 3Q Max
## -356.85 -91.98 -31.47 53.00 612.73
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 207.60 60.00 3.460 0.00155 **
## grade3 434.21 84.85 5.117 1.41e-05 ***
## node1 132.88 92.46 1.437 0.16040
## grade3:node1 -234.43 136.92 -1.712 0.09655 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 199 on 32 degrees of freedom
## Multiple R-squared: 0.4809, Adjusted R-squared: 0.4322
## F-statistic: 9.881 on 3 and 32 DF, p-value: 9.181e-05
# Residual degrees of freedom: number of observations minus number of
# estimated model parameters (n - p).
dfRes <- nrow(X) - ncol(X)
dfRes
## [1] 32
\[ \hat \sigma^2 = \frac{\sum\limits_{i=1}^n\epsilon_i^2}{n-p} \]
Invert matrix: use function solve(.)
Diagonal elements of a matrix: use function diag(.)
# X'X again, via crossprod().
crossprod(X)
## (Intercept) grade3 node1 grade3:node1
## (Intercept) 36 17 14 6
## grade3 17 17 6 6
## node1 14 6 14 6
## grade3:node1 6 6 6 6
# Diagonal of X'X: per-column sums of squares of the design matrix.
diag(crossprod(X))
## (Intercept) grade3 node1 grade3:node1
## 36 17 14 6