env <- new.env()
load("../data.rda", envir = env)
Ignored due to lack of pedagogical benefit. See Chapter 16 for details.
df <- get("CH17TA02", envir = env)
names(df) <- c("y", "x1", "x2")
fit <- lm(y ~ factor(x1), df)
The last two steps demonstrate how to use the information to extract some specific values, but of course this information is already displayed in the ANOVA table, as the p-value tells us the result of the hypothesis of the F-test.
addmargins(xtabs(y ~ x2 + x1, df), 1, mean)
## x1
## x2 1 2 3 4
## 1 43.90 89.80 68.40 36.20
## 2 39.00 87.10 69.30 45.20
## 3 46.70 92.70 68.50 40.70
## 4 43.80 90.60 66.40 40.50
## 5 44.20 87.70 70.00 39.30
## 6 47.70 92.40 68.10 40.30
## 7 43.60 86.10 70.60 43.20
## 8 38.90 88.10 65.20 38.70
## 9 43.60 90.80 63.80 40.90
## 10 40.00 89.10 69.20 39.70
## mean 43.14 89.44 67.95 40.47
cat("Ybar.. = ", mean(df$y))
## Ybar.. = 60.25
anova(fit)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x1) 3 15953 5318 866 <2e-16 ***
## Residuals 36 221 6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit)[1, 3] / anova(fit)[2,3] # F* = MSTR / MSE = 866.12
## [1] 866.1
qf(0.95, 3, 36) # F(0.95, 3, 36) = 2.87 < F*
## [1] 2.866
df <- get("CH16TA01", envir = env)
names(df) <- c("y", "x1", "x2")
I don't know of a line plot function, but a makeshift way to generate a similar graph is to use stripchart (graphics).
with(df, stripchart(tapply(y, x1, mean), pch = 20, ylim = c(1, 2), xlim = c(12, 30), xlab = "Cases Sold"))
with(df, text(tapply(y, x1, mean), rep(1, 4), labels = seq(4), pos = 3))
# The factor means to be plotted
means <- with(df, tapply(y, x1, mean))
# The radius portion of the 95% confidence interval for each factor
diff <- with(df, qt(0.975, 15) * sqrt(10.55 / tapply(y, x1, length)))
par(mfrow = c(1, 2))
barplot(means, xlab = "Design", ylab = "Cases Sold")
title("(a) Bar Graph")
plot(means, type = "o", pch = 19, ylim = c(0, 30), xlab = "Design", ylab = "Cases Sold")
abline(h = mean(df$y), lty = 2, col = "gray40", lwd = 2)
title(main = "(b) Main Effects Plot")
bar <- barplot(means, xlab = "Design", ylab = "Cases Sold", ylim = c(0, 32))
arrows(bar, means+diff, bar, means-diff, angle = 90, code = 3)
title("(a) Bar-Interval Graph")
plot(means, pch = 19, ylim = c(0, 30), xlim = c(.5, 4.5), xlab = "Design", ylab = "Cases Sold")
arrows(seq(4), means+diff, seq(4), means-diff, angle = 90, code = 3)
abline(h = mean(df$y), lty = 2, col = "gray40", lwd = 2)
title("(b) Interval Plot")
means <- with(df, tapply(y, x1, mean)) # Defined from earlier
D = means['3'] - means['4'] # (17.11)
se = sqrt(10.55 * (1/4 + 1/5)) # (17.14)
c("lwr" = D, "upr" = D) + # (17.16) +/- "t(0.975, 15) x se"
c(-qt(0.975,15) * se, qt(0.975, 15) * se)
## lwr.3 upr.3
## -12.344 -3.056
cat('t* =', D / se) # (17.18)
## t* = -3.534
lengths <- with(df, tapply(y, x1, length))
means <- with(df, tapply(y, x1, mean))
cont <- c(1/2, 1/2, -1/2, -1/2) # contrasts
L <- sum(cont * means) # (17.20)
se <- sqrt(10.55 * sum(cont^2 / lengths)) # (17.22)
c('lwr' = L, 'upr' = L) + # (17.24)
c(-qt(.975, 15)*se, qt(.975, 15)*se)
## lwr upr
## -12.541 -6.159
cat('t* =', L / se) # (17.26)
## t* = -6.245
df <- get("CH17TA02", envir = env)
names(df) <- c("y", "x1", "x2")
The TukeyHSD method (stats) performs a test for pairwise differences and contains a method for plotting those differences. It is not the same as this figure, but it is just as informative. Note that TukeyHSD requires an aov object. It will not take an lm object.
Here TukeyHSD presents the appropriate p-values. If you want to test on the q* distribution, you can look into qtukey. Moreover, note the differences presented. The order matters (for directionality), but the comparisons result in the same conclusions.
This same approach can be used for “Example 2–Unequal Sample Sizes” (p 750). Assuming the Kenton Food data were loaded, the following should suffice. It will be presented in the Scheffe example below.
TukeyHSD(aov(y ~ factor(x1), df), conf.level = .90)
fit <- aov(y ~ factor(x1), df)
TukeyHSD(fit)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = y ~ factor(x1), data = df)
##
## $`factor(x1)`
## diff lwr upr p adj
## 2-1 46.30 43.316 49.2845 0.0000
## 3-1 24.81 21.826 27.7945 0.0000
## 4-1 -2.67 -5.654 0.3145 0.0933
## 3-2 -21.49 -24.474 -18.5055 0.0000
## 4-2 -48.97 -51.954 -45.9855 0.0000
## 4-3 -27.48 -30.464 -24.4955 0.0000
plot(TukeyHSD(fit)) # Diff 4-1 contains 0
df <- get("CH16TA01", envir = env)
names(df) <- c("y", "x1", "x2")
As far as we know, R has no methods for this multiple comparison procedure. Therefore, we've created a function that takes a contrasts matrix
Scheffe <- function(formula, data, cont, conf.level = 0.90, MSE)
{
f <- function(contrast, mean) {contrast %*% mean} # (17.41)
h <- function(contrast, n, MSE) {sqrt(MSE * sum(contrast^2 / n))} # (17.42)
df <- model.frame(formula, data)
y <- df[, 1]
x <- df[, 2]
r <- nrow(cont)
means <- tapply(y, x, mean)
L <- apply(cont, 2, f, mean = means)
se <- apply(cont, 2, h, n = tapply(y, x, length), MSE = MSE)
S <- sqrt((r-1) * qf(conf.level, r-1, nrow(df)-r)) # (17.43a)
L + S*cbind('lwr' = -se, 'upr' = se) # (17.43)
}
cont <- matrix(c(0.5, 0.5, -0.5, -0.5,
0.5, -0.5, 0.5, -0.5,
1, -1, 0, 0,
0, 0, 1, -1), nrow = 4)
Scheffe(y ~ x1, df, cont = cont, conf.level = 0.90, MSE = 10.55)
## lwr upr
## [1,] -13.442 -5.2579
## [2,] -7.342 0.8421
## [3,] -4.414 6.8143
## [4,] -13.655 -1.7451
TukeyHSD(aov(y ~ factor(x1), df), conf.level = 0.90) # For Example 2--Unequal Sample Sizes (p 750)
## Tukey multiple comparisons of means
## 90% family-wise confidence level
##
## Fit: aov(formula = y ~ factor(x1), data = df)
##
## $`factor(x1)`
## diff lwr upr p adj
## 2-1 -1.2 -6.341 3.941 0.9353
## 3-1 4.9 -0.553 10.353 0.1549
## 4-1 12.6 7.459 17.741 0.0001
## 3-2 6.1 0.647 11.553 0.0583
## 4-2 13.8 8.659 18.941 0.0000
## 4-3 7.7 2.247 13.153 0.0142
Bonferroni <- function(formula, data, cont, conf.level = 0.975, MSE) {
f <- function(contrast, mean) {contrast %*% mean} # (17.41)
h <- function(contrast, n, MSE) {sqrt(MSE * sum(contrast^2 / n))} # (17.42)
df <- model.frame(formula, data)
y <- df[, 1]
x <- df[, 2]
r <- nrow(cont)
g <- ncol(cont)
means <- tapply(y, x, mean)
L <- apply(cont, 2, f, mean = means)
se <- apply(cont, 2, h, n = tapply(y, x, length), MSE = MSE)
B <- qt(1 - (1 - conf.level)/(2*g), nrow(df)-r) # (17.46a)
L + B*cbind('lwr' = -se, 'upr' = se) # (17.46)
}
# Define contrast matrix
Bonferroni(y ~ x1, df, cont[, 1:2], conf.level = 0.975, MSE = 10.55)
## lwr upr
## [1,] -13.597 -5.1027
## [2,] -7.497 0.9973
Scheffe(y ~ x1, df, cont = cont[, 1:2], conf.level = 0.90, MSE = 10.55) # Compare with Scheffe
## lwr upr
## [1,] -13.442 -5.2579
## [2,] -7.342 0.8421
Bonferroni(y ~ x1, df, cont, conf.level = 0.975, MSE = 10.55) # Compare to Scheffe example above
## lwr upr
## [1,] -14.107 -4.5930
## [2,] -8.007 1.5070
## [3,] -5.327 7.7266
## [4,] -14.623 -0.7775
Scheffe(y ~ x1, df, cont = cont, conf.level = 0.90, MSE = 10.55) # From previous example
## lwr upr
## [1,] -13.442 -5.2579
## [2,] -7.342 0.8421
## [3,] -4.414 6.8143
## [4,] -13.655 -1.7451
We cannot find any ANOM methods in R, but since the computations are straightforward, both the values of interest and graphic could be designed from the following code.
r = 4
u = sum(means) / r # (17.48a)
n = with(df, tapply(y, x1, length)) # lengths ni
s = vector("numeric", r)
for(i in seq(r))
s[i] = (10.55/n[i]) * ((r-1)/r)^2 + (10.55/r^2) * sum(1/n[-i]) # (17.49)
plot(x = seq(r), means, pch = 20, ylim = c(10, 30), xlab = "Levels of Design", ylab = "Mean", xaxt = 'n')
axis(1, seq(4))
segments(seq(4), u, seq(4), means)
lines(seq(1, 4.5, 0.5), rep(u+s, each = 2), type = "S")
lines(seq(1, 4.5, 0.5), rep(u-s, each = 2), type = "S")
abline(h = u)
df <- get("CH17TA04", envir = env)
names(df) <- c("Y", "X1", "X2")
df <- transform(df, X1 = factor(X1, labels = c(' 6 hours', ' 8 hours', '10 hours', '12 hours')))
To get homogeneous subsets, one could use multcompTs (multcompView), and feed it the respective p-value matrix from TukeyHSD. This results in a matrix of {-1, 0, 1} values, where -1 indicates they are significantly different, 1 is the base (comparing a level to itself), and 0 is no significant difference. This may be more useful in locating those differences in large number of cases. However, since it is entirely based on the p-values of TukeyHSD, one could also (1) extract the non-significant p-values or plot them to maybe visually inspect those that include 0 in their confidence intervals.
fit <- aov(Y ~ X1, df)
addmargins(xtabs(Y ~ X1 + X2, df, sparse = TRUE), 2, list(list('Mean' = mean, 'SD' = sd)))
## 1 2 3 4 5 6 7 Mean SD
## 6 hours 40 39 39 36 42 43 41 40.00 2.309
## 8 hours 53 48 49 50 51 50 48 49.86 1.773
## 10 hours 53 58 56 59 53 59 58 56.57 2.637
## 12 hours 63 62 59 61 62 62 61 61.43 1.272
model.tables(fit, "means") # Alternative way to see table of means
## Tables of means
## Grand mean
##
## 51.96
##
## X1
## X1
## 6 hours 8 hours 10 hours 12 hours
## 40.00 49.86 56.57 61.43
summary(fit) # ANOVA summary
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 3 1809 603 141 2.2e-15 ***
## Residuals 24 102 4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary.lm(fit) # Coefficients are mean difference from "6 hours" mean
##
## Call:
## aov(formula = Y ~ X1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.000 -1.000 0.143 1.429 3.143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.00 0.78 51.26 < 2e-16 ***
## X1 8 hours 9.86 1.10 8.93 4.2e-09 ***
## X110 hours 16.57 1.10 15.02 1.1e-13 ***
## X112 hours 21.43 1.10 19.42 3.5e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.06 on 24 degrees of freedom
## Multiple R-squared: 0.946, Adjusted R-squared: 0.94
## F-statistic: 141 on 3 and 24 DF, p-value: 2.17e-15
TukeyHSD(fit) # 1st 3 'diff' match lm model coefficients
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Y ~ X1, data = df)
##
## $X1
## diff lwr upr p adj
## 8 hours- 6 hours 9.857 6.813 12.901 0.000
## 10 hours- 6 hours 16.571 13.527 19.616 0.000
## 12 hours- 6 hours 21.429 18.384 24.473 0.000
## 10 hours- 8 hours 6.714 3.670 9.758 0.000
## 12 hours- 8 hours 11.571 8.527 14.616 0.000
## 12 hours-10 hours 4.857 1.813 7.901 0.001
plot(TukeyHSD(fit))
cat("q* =", qtukey(0.95, 4, 24))
## q* = 3.901
diff(with(df, tapply(Y, X1, mean))) # Diminishing returns to training (p 764)
## 8 hours 10 hours 12 hours
## 9.857 6.714 4.857
Since we're treating X1 as a factor, we manually specify it as a vector below. The mean can been taken as this vector mean (= 9).
x1 <- rep(seq(6, 12, 2), each = 7) # numeric encoding of X1
xx <- seq(min(x1), max(x1), len = 200) # For curve plotting below
df <- transform(df, x = c(scale(x1, TRUE, FALSE))) # Center X1
df <- transform(df, X = x1) # Store the numeric encoding
(df <- transform(df, xsq = x*x)) # TABLE 17.5
## Y X1 X2 x X xsq
## 1 40 6 hours 1 -3 6 9
## 2 39 6 hours 2 -3 6 9
## 3 39 6 hours 3 -3 6 9
## 4 36 6 hours 4 -3 6 9
## 5 42 6 hours 5 -3 6 9
## 6 43 6 hours 6 -3 6 9
## 7 41 6 hours 7 -3 6 9
## 8 53 8 hours 1 -1 8 1
## 9 48 8 hours 2 -1 8 1
## 10 49 8 hours 3 -1 8 1
## 11 50 8 hours 4 -1 8 1
## 12 51 8 hours 5 -1 8 1
## 13 50 8 hours 6 -1 8 1
## 14 48 8 hours 7 -1 8 1
## 15 53 10 hours 1 1 10 1
## 16 58 10 hours 2 1 10 1
## 17 56 10 hours 3 1 10 1
## 18 59 10 hours 4 1 10 1
## 19 53 10 hours 5 1 10 1
## 20 59 10 hours 6 1 10 1
## 21 58 10 hours 7 1 10 1
## 22 63 12 hours 1 3 12 9
## 23 62 12 hours 2 3 12 9
## 24 59 12 hours 3 3 12 9
## 25 61 12 hours 4 3 12 9
## 26 62 12 hours 5 3 12 9
## 27 62 12 hours 6 3 12 9
## 28 61 12 hours 7 3 12 9
plot(df$Y ~ x1, pch = 20, ylim = c(30, 65), xlab = "Hours of Training", ylab = "Number of Acceptable Units")
lines(xx, coef(lm(Y ~ X + I(X^2), df)) %*% rbind(1, xx, xx^2)) # Uses (17.51)
Recall that R anova will output a breakdown of the Regression SS. Simply aggregating the components appropriately produces the book results.
1764.35 + 43.75 = 1808.1
Divide that by the degrees of freedom (2) will match the Regression MS.
fit <- lm(Y ~ x + xsq, df) # (17.52)
anova(fit) # (a) Regression Model (17.51)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## x 1 1764 1764 428.8 <2e-16 ***
## xsq 1 44 44 10.6 0.0032 **
## Residuals 25 103 4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(Y ~ X1, df)) # (b) ANOVA (17.50)
## Analysis of Variance Table
##
## Response: Y
## Df Sum Sq Mean Sq F value Pr(>F)
## X1 3 1809 603 141 2.2e-15 ***
## Residuals 24 102 4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fit, lm(Y ~ X1, df)) # (c) ANOVA for Lack of Fit
## Analysis of Variance Table
##
## Model 1: Y ~ x + xsq
## Model 2: Y ~ X1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 25 103
## 2 24 102 1 0.579 0.14 0.72