Estimation in the means model 1. Load the packages
library(tidyverse)
library(Matrix)
library(MASS)
t<-3; # the number of treatment levels
r<-4; # the number of replicates
n<-t*r # total number of experimental units
levels <- c("level 1", "level 2", "level 3");
fact <- rep(levels, each = r) %>% factor()
# fact <- gl(t,r,labels=levels) # alternative code
crd <- tibble( treatment = fact )
Z <- model.matrix(~ fact-1)
Z
## factlevel 1 factlevel 2 factlevel 3
## 1 1 0 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 0 1 0
## 6 0 1 0
## 7 0 1 0
## 8 0 1 0
## 9 0 0 1
## 10 0 0 1
## 11 0 0 1
## 12 0 0 1
## attr(,"assign")
## [1] 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$fact
## [1] "contr.treatment"
rankMatrix(Z)[1]
## [1] 3
# alternative code
# qr(Z)$rank
set.seed(13579)
mu<-500 # reference value
tau<-c(50, 0, -30) # treatment effects = differences wrt mu
sd<-10 # overall sd
means <- mu+tau %>% rep(each = r) # vector of means
y<-rnorm(n,mean = means, sd=sd)
crd$response<-y
y
## [1] 537.6528 537.4717 547.4522 534.7335 510.9711 524.8874 507.7948
## [8] 501.8838 459.7355 467.4329 477.4605 474.7122
ggplot(crd,aes(treatment,response))+geom_point()
5. Find the least squares estimate.
S<-t(Z)%*%Z
mu.hat <- solve(t(Z)%*%Z)%*%t(Z)%*%y
mu.hat
## [,1]
## factlevel 1 539.3276
## factlevel 2 511.3843
## factlevel 3 469.8353
by_group <- group_by(crd, treatment)
means.crd<-summarize(by_group, means = mean(response))
glimpse(means.crd)
## Observations: 3
## Variables: 2
## $ treatment <fct> level 1, level 2, level 3
## $ means <dbl> 539.3276, 511.3843, 469.8353
mod.crd.means<-lm(response~treatment-1, data=crd)
summary(mod.crd.means)
##
## Call:
## lm(formula = response ~ treatment - 1, data = crd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.100 -3.841 -1.765 5.564 13.503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## treatmentlevel 1 539.328 3.975 135.7 3.26e-16 ***
## treatmentlevel 2 511.384 3.975 128.7 5.26e-16 ***
## treatmentlevel 3 469.835 3.975 118.2 1.13e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.949 on 9 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9998
## F-statistic: 1.631e+04 on 3 and 9 DF, p-value: < 2.2e-16
mu.hat[2] - mu.hat[1]
## [1] -27.94328
mu.hat[3] - mu.hat[1]
## [1] -69.49227
mu.hat[3] - mu.hat[2]
## [1] -41.54899
Estimation in the treatment effects model 9. Construct the design matrix X
X <- cbind(1, as.matrix(Z))
colnames(X)<-c("reference", "effect 1", "effect 2", "effect 3")
X
## reference effect 1 effect 2 effect 3
## 1 1 1 0 0
## 2 1 1 0 0
## 3 1 1 0 0
## 4 1 1 0 0
## 5 1 0 1 0
## 6 1 0 1 0
## 7 1 0 1 0
## 8 1 0 1 0
## 9 1 0 0 1
## 10 1 0 0 1
## 11 1 0 0 1
## 12 1 0 0 1
rankMatrix(X)[1]
## [1] 3
# solve(t(X)%*%X)
# Because the problem with the normal equations is that X.T*T is singular and so can't be inverted.
# The first column of X is the sum of the rest of the columns
# X and X.T*X have the same column rank.
ginv1<-ginv(t(X)%*%X)
ginv
## function (X, tol = sqrt(.Machine$double.eps))
## {
## if (length(dim(X)) > 2L || !(is.numeric(X) || is.complex(X)))
## stop("'X' must be a numeric or complex matrix")
## if (!is.matrix(X))
## X <- as.matrix(X)
## Xsvd <- svd(X)
## if (is.complex(X))
## Xsvd$u <- Conj(Xsvd$u)
## Positive <- Xsvd$d > max(tol * Xsvd$d[1L], 0)
## if (all(Positive))
## Xsvd$v %*% (1/Xsvd$d * t(Xsvd$u))
## else if (!any(Positive))
## array(0, dim(X)[2L:1L])
## else Xsvd$v[, Positive, drop = FALSE] %*% ((1/Xsvd$d[Positive]) *
## t(Xsvd$u[, Positive, drop = FALSE]))
## }
## <bytecode: 0x7f95416600a8>
## <environment: namespace:MASS>
ginv2<-rbind(0,cbind(0,solve(t(Z)%*%Z)))
ginv2
## factlevel 1 factlevel 2 factlevel 3
## 0 0.00 0.00 0.00
## factlevel 1 0 0.25 0.00 0.00
## factlevel 2 0 0.00 0.25 0.00
## factlevel 3 0 0.00 0.00 0.25
Find a solution to the normal equations
Find two solutions to the normal equations based on the two types of generalized inverses desscribed above. Verify numerically that they actually solve the normal equations.
beta1 <- ginv1%*%t(X)%*%y
beta1
## [,1]
## [1,] 380.13679
## [2,] 159.19078
## [3,] 131.24750
## [4,] 89.69851
beta2 <- ginv2%*%t(X)%*%y
beta2
## [,1]
## 0.0000
## factlevel 1 539.3276
## factlevel 2 511.3843
## factlevel 3 469.8353
# now we verify that the above equations solve the normal equations
t(X)%*%X%*%beta1-t(X)%*%y
## [,1]
## reference 1.818989e-12
## effect 1 9.094947e-13
## effect 2 6.821210e-13
## effect 3 9.094947e-13
t(X)%*%X%*%beta1-t(X)%*%y
## [,1]
## reference 1.818989e-12
## effect 1 9.094947e-13
## effect 2 6.821210e-13
## effect 3 9.094947e-13
beta1[3] - beta1[2]
## [1] -27.94328
beta1[4] - beta1[2]
## [1] -69.49227
beta1[4] - beta1[3]
## [1] -41.54899
beta2[3] - beta2[2]
## [1] -27.94328
beta2[4] - beta2[2]
## [1] -69.49227
beta2[4] - beta2[3]
## [1] -41.54899
beta1[1] + beta1[2]
## [1] 539.3276
beta1[1] + beta1[3]
## [1] 511.3843
beta1[1] + beta1[3]
## [1] 511.3843
beta2[1] + beta2[2]
## [1] 539.3276
beta2[1] + beta2[3]
## [1] 511.3843
beta2[1] + beta2[4]
## [1] 469.8353
mod.crd.treat<-lm(response~treatment, data = crd)
summary(mod.crd.treat)
##
## Call:
## lm(formula = response ~ treatment, data = crd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.100 -3.841 -1.765 5.564 13.503
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 539.328 3.975 135.692 3.26e-16 ***
## treatmentlevel 2 -27.943 5.621 -4.971 0.000769 ***
## treatmentlevel 3 -69.492 5.621 -12.363 5.97e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.949 on 9 degrees of freedom
## Multiple R-squared: 0.9451, Adjusted R-squared: 0.9328
## F-statistic: 77.4 on 2 and 9 DF, p-value: 2.137e-06