env <- new.env()
load("../data.rda", envir = env)
df <- get("CH17TA02", envir = env)
names(df) <- c("y", "x1", "x2")
fit <- lm(y ~ factor(x1)-1, df)
xtabs(resid(fit) ~ x2 + x1, df)
## x1
## x2 1 2 3 4
## 1 0.76 0.36 0.45 -4.27
## 2 -4.14 -2.34 1.35 4.73
## 3 3.56 3.26 0.55 0.23
## 4 0.66 1.16 -1.55 0.03
## 5 1.06 -1.74 2.05 -1.17
## 6 4.56 2.96 0.15 -0.17
## 7 0.46 -3.34 2.65 2.73
## 8 -4.24 -1.34 -2.75 -1.77
## 9 0.46 1.36 -4.15 0.43
## 10 -3.14 -0.34 1.25 -0.77
plot(resid(fit) ~ fitted(fit), ylab = "Residual", xlab = expression(hat(Y)), pch = 19)
abline(0,0)
title(expression(paste("(a) Residual against ", hat(Y))))
stripchart(split(resid(fit), df$x1), method = "stack", pch = 19)
abline(h = seq(2, 4)-0.1)
title("(b) Aligned Residual Dot Plot")
qqnorm(resid(fit), xlab = "Exp Val", ylab = "Residual", pch = 19, main = "")
qqline(resid(fit))
title("(c) Normal Probability Plot")
Omitted; No data
df <- get("CH18TA02", envir = env)
names(df) <- c("y", "x1", "x2")
R has a function for the H distribution used in this example. It comes from maxFratio (SuppDists). The df and r (k) are reversed in its arguments from those used in the book. Note that (18.10) just is the variance (var) function applied to each factor level, as used below.
library(SuppDists)
tab <- xtabs(y ~ x2 + x1, df)
round(addmargins(tab, 1, FUN = list(list(mean, median, s = var))), 3)
## x1
## x2 1 2 3 4 5
## 1 14.870 18.430 16.950 8.590 11.550
## 2 16.810 18.760 12.280 10.900 13.360
## 3 15.830 20.120 12.000 8.600 13.640
## 4 15.470 19.110 13.180 10.130 12.160
## 5 13.600 19.810 14.990 10.280 11.620
## 6 14.760 18.430 15.760 9.980 12.390
## 7 17.400 17.160 19.350 9.410 12.050
## 8 14.620 16.400 15.520 10.040 11.950
## mean 15.420 18.527 15.004 9.741 12.340
## median 15.170 18.595 15.255 10.010 12.105
## s 1.531 1.570 6.183 0.667 0.592
vars <- with(df, tapply(y, x1, var)) # (18.10)
cat("H* = ", max(vars) / min(vars)) # (18.8)
## H* = 10.44
qmaxFratio(0.95, 7, 5) # (18.9)
## [1] 9.697
We round the y values so they actually stack at integer points. Otherwise, they're recognized as different numeric values and do not stack as desired.
stripchart(round(y) ~ x1, df, pch = 19, method = "stack", xlim = c(0, 30), xlab = "Pull Strength", ylab = "Type")
abline(h = seq(2,5) - 0.1)
R does contain a function for the Brown-Forsythe test called hov (HH). Since this test is not nearly as simple as the Hartley test, which is one of its prime advantages, making use of hov is a time saver. The BF computations serve no pedagogical value at this time, so they will be omitted.
library(HH)
hov(y ~ factor(x1), df)
##
## hov: Brown-Forsyth
##
## data: y
## F = 2.936, df:factor(x1) = 4, df:Residuals = 35, p-value = 0.03414
## alternative hypothesis: variances are not identical
data.frame(with(df, tapply(y, x1, function(x) list(abs(x - median(x))))))
## X1 X2 X3 X4 X5
## 1 0.30 0.165 1.695 1.42 0.555
## 2 1.64 0.165 2.975 0.89 1.255
## 3 0.66 1.525 3.255 1.41 1.535
## 4 0.30 0.515 2.075 0.12 0.055
## 5 1.57 1.215 0.265 0.27 0.485
## 6 0.41 0.165 0.505 0.03 0.285
## 7 2.23 1.435 4.095 0.60 0.055
## 8 0.55 2.195 0.265 0.03 0.155
w = 1 / vars # Factor level variances (18.10) defined earlier
w = rep(w, each = 8) # (18.14) -- Repeat each weight by # of x2 factor levels
fit <- lm(y ~ x1 -1, transform(df, x1 = factor(x1)), weights = w) # (18.17)
data.frame(
'i' = df$x1,
'j' = df$x2,
'Y' = df$y,
model.matrix(fit),
'Weights' = w,
'Reduced.Model' = 1)
## i j Y x11 x12 x13 x14 x15 Weights Reduced.Model
## 1 1 1 14.87 1 0 0 0 0 0.6534 1
## 2 1 2 16.81 1 0 0 0 0 0.6534 1
## 3 1 3 15.83 1 0 0 0 0 0.6534 1
## 4 1 4 15.47 1 0 0 0 0 0.6534 1
## 5 1 5 13.60 1 0 0 0 0 0.6534 1
## 6 1 6 14.76 1 0 0 0 0 0.6534 1
## 7 1 7 17.40 1 0 0 0 0 0.6534 1
## 8 1 8 14.62 1 0 0 0 0 0.6534 1
## 9 2 1 18.43 0 1 0 0 0 0.6370 1
## 10 2 2 18.76 0 1 0 0 0 0.6370 1
## 11 2 3 20.12 0 1 0 0 0 0.6370 1
## 12 2 4 19.11 0 1 0 0 0 0.6370 1
## 13 2 5 19.81 0 1 0 0 0 0.6370 1
## 14 2 6 18.43 0 1 0 0 0 0.6370 1
## 15 2 7 17.16 0 1 0 0 0 0.6370 1
## 16 2 8 16.40 0 1 0 0 0 0.6370 1
## 17 3 1 16.95 0 0 1 0 0 0.1617 1
## 18 3 2 12.28 0 0 1 0 0 0.1617 1
## 19 3 3 12.00 0 0 1 0 0 0.1617 1
## 20 3 4 13.18 0 0 1 0 0 0.1617 1
## 21 3 5 14.99 0 0 1 0 0 0.1617 1
## 22 3 6 15.76 0 0 1 0 0 0.1617 1
## 23 3 7 19.35 0 0 1 0 0 0.1617 1
## 24 3 8 15.52 0 0 1 0 0 0.1617 1
## 25 4 1 8.59 0 0 0 1 0 1.4996 1
## 26 4 2 10.90 0 0 0 1 0 1.4996 1
## 27 4 3 8.60 0 0 0 1 0 1.4996 1
## 28 4 4 10.13 0 0 0 1 0 1.4996 1
## 29 4 5 10.28 0 0 0 1 0 1.4996 1
## 30 4 6 9.98 0 0 0 1 0 1.4996 1
## 31 4 7 9.41 0 0 0 1 0 1.4996 1
## 32 4 8 10.04 0 0 0 1 0 1.4996 1
## 33 5 1 11.55 0 0 0 0 1 1.6892 1
## 34 5 2 13.36 0 0 0 0 1 1.6892 1
## 35 5 3 13.64 0 0 0 0 1 1.6892 1
## 36 5 4 12.16 0 0 0 0 1 1.6892 1
## 37 5 5 11.62 0 0 0 0 1 1.6892 1
## 38 5 6 12.39 0 0 0 0 1 1.6892 1
## 39 5 7 12.05 0 0 0 0 1 1.6892 1
## 40 5 8 11.95 0 0 0 0 1 1.6892 1
summary(fit)
##
## Call:
## lm(formula = y ~ x1 - 1, data = transform(df, x1 = factor(x1)),
## weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -1.6980 -0.6683 0.0174 0.5220 1.7478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x11 15.420 0.437 35.2 <2e-16 ***
## x12 18.528 0.443 41.8 <2e-16 ***
## x13 15.004 0.879 17.1 <2e-16 ***
## x14 9.741 0.289 33.7 <2e-16 ***
## x15 12.340 0.272 45.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1 on 35 degrees of freedom
## Multiple R-squared: 0.995, Adjusted R-squared: 0.994
## F-statistic: 1.3e+03 on 5 and 35 DF, p-value: <2e-16
anova(fit)
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## x1 5 6479 1296 1296 <2e-16 ***
## Residuals 35 35 1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(y ~ 1, df, weights = w)) # Reduced Model (18.19)
##
## Call:
## lm(formula = y ~ 1, data = df, weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -5.249 -1.311 0.922 2.656 5.781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.876 0.498 25.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.03 on 39 degrees of freedom
anova(lm(y ~ 1, df, weights = w))
## Analysis of Variance Table
##
## Response: y
## Df Sum Sq Mean Sq F value Pr(>F)
## Residuals 39 359 9.21
anova(lm(y ~ 1, df, weights = w), fit) # Confirm that x1 is significant to keep
## Analysis of Variance Table
##
## Model 1: y ~ 1
## Model 2: y ~ x1 - 1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 39 359
## 2 35 35 4 324 81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- get("CH18TA05", envir = env)
names(df) <- c("y", "x1", "x2")
addmargins(xtabs(y ~ x2 + x1, df), 1, list(list(mean, var)))
## x1
## x2 1 2 3
## 1 4.41 8.24 106.19
## 2 100.65 81.16 33.83
## 3 14.45 7.35 78.88
## 4 47.13 12.29 342.81
## 5 85.21 1.61 44.33
## mean 50.37 22.13 121.21
## var 1788.74 1103.45 16167.45
addmargins(xtabs(rank(y) ~ x2 + x1, df), 1, list(list(mean, var)))
## x1
## x2 1 2 3
## 1 2.0 4.0 14.0
## 2 13.0 11.0 7.0
## 3 6.0 3.0 10.0
## 4 9.0 5.0 15.0
## 5 12.0 1.0 8.0
## mean 8.4 4.8 10.8
## var 20.3 14.2 12.7
cbind(
'(1)' = with(df, tapply(y, x1, function(x) var(x) / mean(x))),
'(2)' = with(df, tapply(y, x1, function(x) sd(x) / mean(x))),
'(3)' = with(df, tapply(y, x1, function(x) sd(x) / mean(x)^2)))
## (1) (2) (3)
## 1 35.51 0.8397 0.016670
## 2 49.86 1.5011 0.067829
## 3 133.39 1.0490 0.008655
See Chapter 3 for details on manual calculations. Instead, boxcox (MASS) will be used.
We do not show the Bonferroni corrected confidence intervals. Instead, we're just going to see if there exists any pairwise differences. To this end pairwise.t.test with the p.adj = 'bonf' parameter set will suffice. It produces a pairwise matrix of p-values. In this case, “3-2” has a p-value of 5% indicating their is a statistically significant difference between the two means. We also provide a TukeyHSD result for comparison. Review the methods to check this appropriateness.
library(MASS)
fit <- lm(y ~ factor(x1) - 1, df)
boxcox(fit)
with(df, tapply(log(y), x1, var)) # More stable variances (p 792)
## 1 2 3
## 1.7420 1.9736 0.8181
with(df, tapply(log(y), x1, mean)) # Transformed means (p 793)
## 1 2 3
## 3.413 2.297 4.437
par(mfrow = c(1, 2), pch = 16)
qqnorm(resid(fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(fit))
title("(a) Original Data")
fit <- update(fit, log(.) ~ .)
qqnorm(resid(fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(fit))
title("(b) Transformed Data")
## Some statistics (p 793)
anova(lm(log(y) ~ 1, df), fit) # anova(reduced, full)
## Analysis of Variance Table
##
## Model 1: log(y) ~ 1
## Model 2: log(y) ~ factor(x1) - 1
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 14 29.6
## 2 12 18.1 2 11.4 3.79 0.053 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qf(0.90, 2, 12) # Compare with above F*
## [1] 2.807
summary(fit) # Authors typo mean(location2) = 2.792
##
## Call:
## lm(formula = log(y) ~ factor(x1) - 1, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9290 -0.6936 -0.0687 0.7362 2.0994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## factor(x1)1 3.41 0.55 6.21 4.5e-05 ***
## factor(x1)2 2.30 0.55 4.18 0.0013 **
## factor(x1)3 4.44 0.55 8.07 3.4e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.23 on 12 degrees of freedom
## Multiple R-squared: 0.91, Adjusted R-squared: 0.887
## F-statistic: 40.4 on 3 and 12 DF, p-value: 1.51e-06
with(df, pairwise.t.test(log(y), x1, p.adj = "bonf", conf.level = 0.9))
##
## Pairwise comparisons using t tests with pooled SD
##
## data: log(y) and x1
##
## 1 2
## 2 0.530 -
## 3 0.637 0.053
##
## P value adjustment method: bonferroni
TukeyHSD(aov(log(y) ~ factor(x1), df), conf.level = 0.90)
## Tukey multiple comparisons of means
## 90% family-wise confidence level
##
## Fit: aov(formula = log(y) ~ factor(x1), data = df)
##
## $`factor(x1)`
## diff lwr upr p adj
## 2-1 -1.116 -2.8772 0.6456 0.3549
## 3-1 1.024 -0.7376 2.7852 0.4131
## 3-2 2.140 0.3782 3.9011 0.0431
We provide the F-value in the ANOVA output for this nonparametric ranked response model. Per the author's comment (p 796), we also include the Kruskal-Wallis test using kruskal.test (stats). The results are very close in both cases. We manually calculate the multiple pairwise comparisons.
fit <- lm(rank(y) ~ factor(x1), df)
addmargins(xtabs(rank(y) ~ x2 + x1, df), 1, list(list(mean, var))) # Same results as earlier
## x1
## x2 1 2 3
## 1 2.0 4.0 14.0
## 2 13.0 11.0 7.0
## 3 6.0 3.0 10.0
## 4 9.0 5.0 15.0
## 5 12.0 1.0 8.0
## mean 8.4 4.8 10.8
## var 20.3 14.2 12.7
anova(fit) # Compare F-value with value below
## Analysis of Variance Table
##
## Response: rank(y)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(x1) 2 91.2 45.6 2.9 0.094 .
## Residuals 12 188.8 15.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("F* =", qf(0.90, 2, 12))
## F* = 2.807
kruskal.test(y ~ x1, df) # Compare the chi-squared value with value below
##
## Kruskal-Wallis rank sum test
##
## data: y by x1
## Kruskal-Wallis chi-squared = 4.56, df = 2, p-value = 0.1023
cat("chi* =", qchisq(0.90, 2))
## chi* = 4.605
# Multiple Pairwise Comparisons
B = qnorm(1 - 0.1 / 6) * sqrt((15 * (15 + 1)) / 12 * (1/5 + 1/5))
m = with(df, tapply(rank(y), x1, mean))
round(rbind(
"1-2" = c('diff' = m[[1]] - m[[2]], (m[1] - m[2]) + c('lwr' = -B, 'upr' = B)),
"3-2" = c('diff' = m[[3]] - m[[2]], (m[3] - m[2]) + c('lwr' = -B, 'upr' = B)),
"3-1" = c('diff' = m[[3]] - m[[1]], (m[3] - m[1]) + c('lwr' = -B, 'upr' = B))), 1)
## diff lwr upr
## 1-2 3.6 -2.4 9.6
## 3-2 6.0 0.0 12.0
## 3-1 2.4 -3.6 8.4
df <- get("CH18TA07", envir = env)
names(df) <- c("y", "x")
df <- transform(df, x = factor(x, labels = c("Low", "Medium", "High")))
Since the number of observations per treatment vary, we cannot present this in tabular form in R. Instead, we simply split them and show the separate treatment vectors of the response.
with(df, split(y, x))
## $Low
## [1] 44 551 127 1 297 46 60 65 12 1350 730 47 994 26
##
## $Medium
## [1] 15 280 1024 253 66 29 161 624 39 51 68 836 51
##
## $High
## [1] 3 136 65 25 64 322 23 54 63 50 10 48
library(HH)
library(MASS)
fit <- lm(y ~ x - 1, df)
stripchart(y ~ x, df, method = "jitter", pch = 19, xlab = "Survival Time")
title("(a) Dot Plots of Survival Times")
stripchart(rstudent(fit) ~ x, df, method = "jitter", pch = 19, xlab = "Studentized Residual")
title("(b) Dot Plots of Studentized Residuals")
qqnorm(rstudent(fit), pch = 19, xlab = "Expected Value", ylab = "Studentized Residual", main = "")
qqline(rstudent(fit))
title("(c) Normal Probability Plot\n of Studentized Residuals")
hov(y ~ x, df) # (p 799) test for constancy of variance
##
## hov: Brown-Forsyth
##
## data: y
## F = 1.911, df:x = 2, df:Residuals = 36, p-value = 0.1627
## alternative hypothesis: variances are not identical
boxcox(fit)
fit <- lm(log(y) ~ x, df)
summary(fit)
##
## Call:
## lm(formula = log(y) ~ x, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.502 -0.791 0.147 1.013 2.706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.502 0.417 10.78 8e-13 ***
## xMedium 0.295 0.602 0.49 0.63
## xHigh -0.778 0.614 -1.27 0.21
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.56 on 36 degrees of freedom
## Multiple R-squared: 0.0797, Adjusted R-squared: 0.0286
## F-statistic: 1.56 on 2 and 36 DF, p-value: 0.224
stripchart(rstudent(fit) ~ x, df, method = "jitter", pch = 19, xlab = "Studentized Residual")
title("(a) Dot Plots of Studentized Residuals")
qqnorm(rstudent(fit), pch = 19, xlab = "Expected Value", ylab = "Studentized Residual", main = "")
qqline(rstudent(fit))
title("(b) Normal Probability Plot\n of Studentized Residuals")