env <- new.env()
load("../data.rda", envir = env)
df <- get("CH11TA01", envir = env)
names(df) <- c("X", "Y")
fit <- lm(Y ~ X, df)
w <- fitted(lm(abs(resid(fit)) ~ X, df))^(-2)
fit <- update(fit, weights = w)
cbind(
"Subject" = seq(nrow(df)),
"Age" = df$X,
"DBP" = df$Y,
"e" = round(resid(lm(Y ~ X, df)), 2),
"|e|" = round(abs(resid(lm(Y ~ X, df))), 2),
"s" = round(sqrt(1 / w), 4),
"w" = round(fit$weights, 5))
## Subject Age DBP e |e| s w
## 1 1 27 73 1.18 1.18 3.801 0.06921
## 2 2 21 66 -2.34 2.34 2.612 0.14656
## 3 3 22 63 -5.92 5.92 2.810 0.12662
## 4 4 24 75 4.92 4.92 3.207 0.09725
## 5 5 25 71 0.34 0.34 3.405 0.08626
## 6 6 23 70 0.50 0.50 3.009 0.11049
## 7 7 20 65 -2.76 2.76 2.414 0.17161
## 8 8 20 70 2.24 2.24 2.414 0.17161
## 9 9 29 79 6.02 6.02 4.197 0.05676
## 10 10 24 72 1.92 1.92 3.207 0.09725
## 11 11 25 68 -2.66 2.66 3.405 0.08626
## 12 12 28 67 -5.40 5.40 3.999 0.06252
## 13 13 26 79 7.76 7.76 3.603 0.07703
## 14 14 38 91 12.80 12.80 5.981 0.02795
## 15 15 32 76 1.28 1.28 4.792 0.04355
## 16 16 33 69 -6.30 6.30 4.990 0.04016
## 17 17 31 66 -8.14 8.14 4.594 0.04739
## 18 18 34 73 -2.88 2.88 5.188 0.03715
## 19 19 37 78 0.38 0.38 5.783 0.02990
## 20 20 38 87 8.80 8.80 5.981 0.02795
## 21 21 33 76 0.70 0.70 4.990 0.04016
## 22 22 35 79 2.54 2.54 5.387 0.03446
## 23 23 30 73 -0.56 0.56 4.396 0.05175
## 24 24 31 80 5.86 5.86 4.594 0.04739
## 25 25 37 68 -9.62 9.62 5.783 0.02990
## 26 26 39 75 -3.78 3.78 6.179 0.02619
## 27 27 46 89 6.16 6.16 7.566 0.01747
## 28 28 49 101 16.42 16.42 8.161 0.01501
## 29 29 40 70 -9.36 9.36 6.377 0.02459
## 30 30 42 72 -8.52 8.52 6.774 0.02179
## 31 31 43 80 -1.10 1.10 6.972 0.02057
## 32 32 46 83 0.16 0.16 7.566 0.01747
## 33 33 43 75 -6.10 6.10 6.972 0.02057
## 34 34 44 71 -10.68 10.68 7.170 0.01945
## 35 35 46 80 -2.84 2.84 7.566 0.01747
## 36 36 47 96 12.58 12.58 7.765 0.01659
## 37 37 45 92 9.74 9.74 7.368 0.01842
## 38 38 49 80 -4.58 4.58 8.161 0.01501
## 39 39 48 70 -14.00 14.00 7.963 0.01577
## 40 40 40 90 10.64 10.64 6.377 0.02459
## 41 41 42 85 4.48 4.48 6.774 0.02179
## 42 42 55 76 -12.06 12.06 9.350 0.01144
## 43 43 54 71 -16.48 16.48 9.152 0.01194
## 44 44 57 99 9.78 9.78 9.746 0.01053
## 45 45 52 86 -0.32 0.32 8.755 0.01304
## 46 46 53 79 -7.90 7.90 8.954 0.01247
## 47 47 56 92 3.36 3.36 9.548 0.01097
## 48 48 52 85 -1.32 1.32 8.755 0.01304
## 49 49 50 71 -14.16 14.16 8.359 0.01431
## 50 50 59 90 -0.38 0.38 10.143 0.00972
## 51 51 50 91 5.84 5.84 8.359 0.01431
## 52 52 52 100 13.68 13.68 8.755 0.01304
## 53 53 58 80 -9.80 9.80 9.944 0.01011
## 54 54 57 109 19.78 19.78 9.746 0.01053
fit <- lm(Y ~ X, df)
par(mfrow = c(1, 3), pch = 19)
plot(Y ~ X, df, xlab = "Age", ylab = "Blood Pressure")
title("(a) Scatter Plot")
abline(fit)
plot(resid(fit) ~ X, df, xlab = "Age", ylab = "Residual")
title("(b) Residual Plot Against X")
abline(0, 0)
plot(abs(resid(fit)) ~ X, df, xlab = "Age", ylab = "Absolute Residual")
title("(c) Absolute Residual Plot Against X")
abline(lm(abs(resid(fit)) ~ X, df))
df <- get("CH07TA01", envir = env)
names(df) <- c("x1", "x2", "x3", "y")
fit <- lm(y ~ x1 + x2 + x3, df)
R has the lm.ridge function (MASS), but it produces different results. I have also tried methods from other packages, such as genridge and ElemStatLearn, but the results were much the same. A quick lm.ridge call is shown below for reference.
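As a minimal sketch (my own illustration, not the book's approach), lm.ridge can be fit at a few of the same penalty values; its coefficients are returned on lm.ridge's own scaling, so they will not line up with the correlation-transformation values computed below.
library(MASS)
# Coefficient matrix, one row per value of lambda, for comparison only
round(coef(lm.ridge(y ~ x1 + x2 + x3, df, lambda = c(0, 0.02, 0.5))), 4)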
c <- c(0, .002, .004, .006, .008, .01, .02, .03, .04, .05, .1, .5, 1)
fun1 <- function(c, rxx, rxy) {solve(rxx + c * diag(ncol(rxx)), rxy)} # (11.34)
fun2 <- function(c, rxx)
{
inv <- solve(rxx + c * diag(ncol(rxx)))
diag(inv %*% rxx %*% inv) # VIF values (11.36)
}
data.frame(lambda = c,
t(sapply(c, fun1, rxx = cor(df)[-4, -4], rxy = cor(df)[-4, 4])),
VIF = t(sapply(c, fun2, rxx = cor(df)[-4, -4])))
## lambda x1 x2 x3 VIF.x1 VIF.x2 VIF.x3
## 1 0.000 4.2637 -2.92870 -1.561417 708.8429 564.3434 104.6060
## 2 0.002 1.4407 -0.41129 -0.481273 50.5592 40.4483 8.2797
## 3 0.004 1.0063 -0.02484 -0.314866 16.9816 13.7247 3.3628
## 4 0.006 0.8300 0.13142 -0.247163 8.5033 6.9764 2.1185
## 5 0.008 0.7343 0.21576 -0.210302 5.1472 4.3046 1.6238
## 6 0.010 0.6742 0.26841 -0.187032 3.4855 2.9813 1.3770
## 7 0.020 0.5463 0.37740 -0.136872 1.1026 1.0805 1.0105
## 8 0.030 0.5004 0.41341 -0.118078 0.6257 0.6969 0.9235
## 9 0.040 0.4760 0.43024 -0.107583 0.4528 0.5553 0.8814
## 10 0.050 0.4605 0.43924 -0.100508 0.3705 0.4859 0.8531
## 11 0.100 0.4234 0.44896 -0.081246 0.2515 0.3735 0.7614
## 12 0.500 0.3377 0.37906 -0.029500 0.1540 0.2137 0.4033
## 13 1.000 0.2798 0.31007 -0.005939 0.1071 0.1358 0.2273
One could also plot the object returned by lm.ridge (MASS), although that plot lacks the logarithmic scale on x. Simply use something like plot(lm.ridge(y ~ ., df, lambda = seq(0, 1, by = 0.001))). Otherwise, you can take the returned coefficients for each value of lambda and use matplot and matlines to recreate the plot with a log scale on x, as sketched below.
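A rough sketch along those lines (the lm.ridge coefficients are on a different scale than the correlation-transformation values, so this trace will not match the manual plot that follows exactly):
library(MASS)
rr <- lm.ridge(y ~ ., df, lambda = seq(0.001, 1, by = 0.001))
plot(rr) # built-in ridge trace, linear x-axis
matplot(rr$lambda, t(rr$coef), type = "l", log = "x",
        xlab = "c", ylab = "Ridge coefficient") # manual trace on a log x-axis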
lambda <- seq(0.001, 1, by = 0.001)
f <- function(c, x) {solve(cor(x)[-4, -4] + c * diag(3), cor(x)[-4, 4])}
tab <- data.frame(c = seq(0.001, 1, by = 0.001),
t(sapply(lambda, f, x = df)))
par(lwd = 2)
plot(x1 ~ c, tab, type = "l", log = "x", xaxt = "n", lty = 5,
xlim = c(0.001, 1), ylim = c(-1, 3), xlab = "", ylab = "")
axis(1, c(0.001, 0.01, .10, 1.00))
lines(x2 ~ c, tab, lty = 3)
lines(x3 ~ c, tab, lty = 1)
abline(0,0)
abline(v = 0.02, lwd = 1) # Authors' choice of c
text(0.02, -1, pos=4, labels = "0.02")
df <- get("CH11TA04", envir = env)
names(df) <- c("State", "Y", "X1", "X2", "X3", "X4", "X5")
The authors confuse the variables of interest: they begin by regressing on X2 (HOMELIB) but then discuss X3 (READING). I will keep X2 as the variable of interest.
Note that R has a robust regression function, rlm (MASS). It supports both the Huber (psi = psi.huber) and bisquare (psi = psi.bisquare) weighting functions; the latter was at one time provided by the lqs package. For more see
http://cran.r-project.org/doc/contrib/Fox-Companion/appendix-robust-regression.pdf
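As a minimal first-order sketch of those two weighting functions (my own illustration, not the quadratic model fit later in this section):
library(MASS)
rlm(Y ~ X2, df, psi = psi.huber)    # Huber weights (the default)
rlm(Y ~ X2, df, psi = psi.bisquare) # Tukey bisquare weights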
You can also use hccm (car), which lets you extract a heteroscedasticity-adjusted variance-covariance matrix for the coefficients directly by a number of methods (e.g., Huber-White HC0 or Efron's HC3). You would do something like
robust <- sqrt(diag(hccm(fit, type = "hc3"))) # Robust standard errors
coef(fit) / robust                            # Robust-SE-based t statistics
Examples of this can be seen around the first hour in Dr. Justin Esarey's online lecture at http://www.youtube.com/watch?v=kBlUStQcnJU
library(MASS)
par(mfrow = c(3, 2), pch = 19)
xlab <- expression(X[2])
# Linear Regression Fit
fit <- lm(Y ~ X2, df)
outliers <- subset(df, State %in% c("Guam", "D.C.", "Virgin_Islands"))
plot(Y ~ X2, df, xlab = xlab, ylab = "Y") # (a)
lines(loess.smooth(df$X2, df$Y), lty = 2)
title("(a) Lowess and Linear Regression Fits")
abline(fit)
with(outliers, text(Y ~ X2, labels = State, pos = 4))
plot(resid(fit) ~ X2, df, xlab = xlab, ylab = "Residual") # (b)
title("(b) Residuals from Linear Regression")
abline(0, 0)
# OLS Quadratic Fit
fit <- lm(Y ~ x + xsq, transform(df,
x = scale(X2, T, F),
xsq = scale(X2, T, F)^2))
x <- data.frame(X2 = df$X2, Y = fitted(fit))
plot(Y ~ X2, df, xlab = expression(X[2]), ylab = "Y") # (c)
title("(c) OLS Quadratic Fit")
lines(Y ~ X2, x[order(x$X2), ])
plot(cooks.distance(fit), xlab = "Index", ylab = "D", type = "o") # (d)
title("(d) Cook's Distance--OLS Quadratic Fit")
# Robust Quadratic Fit
fit <- rlm(formula(fit), model.frame(fit))
x <- data.frame(X2 = df$X2, Y = fitted(fit))
plot(Y ~ X2, df, xlab = expression(X[2]), ylab = "Y") # (e)
title("(e) Robust Quadratic Fit")
lines(Y ~ X2, x[order(x$X2), ])
plot(fit$w, xlab = "Index", ylab = "W", type = "o") # (f)
title("(f) Robust Weights")
with(df, cbind("State" = State, # cbind coerces the State factor to its integer codes
"MATHPROF" = Y,
"PARENTS" = X1,
"HOMELIB" = X2,
"READING" = X3,
"TVWATCH" = X4,
"ABSENCES" = X5))
## State MATHPROF PARENTS HOMELIB READING TVWATCH ABSENCES
## [1,] 1 252 75 78 34 18 18
## [2,] 2 259 75 73 41 12 26
## [3,] 3 256 77 77 28 20 23
## [4,] 4 256 78 68 42 11 28
## [5,] 5 267 78 85 38 9 25
## [6,] 6 270 79 86 43 12 22
## [7,] 8 261 75 83 32 18 28
## [8,] 7 231 47 76 24 33 37
## [9,] 9 255 75 73 31 19 27
## [10,] 10 258 73 80 36 17 22
## [11,] 11 231 81 64 32 20 28
## [12,] 12 251 78 69 36 23 26
## [13,] 13 272 84 84 48 7 21
## [14,] 14 260 78 82 43 14 21
## [15,] 15 267 81 84 37 11 23
## [16,] 16 278 83 88 43 8 20
## [17,] 17 256 79 78 36 14 23
## [18,] 18 246 73 76 36 19 27
## [19,] 19 260 75 83 34 19 27
## [20,] 20 264 77 84 31 14 25
## [21,] 21 276 83 88 36 7 20
## [22,] 22 280 83 88 44 6 21
## [23,] 23 276 85 88 42 9 19
## [24,] 24 273 83 88 40 7 22
## [25,] 25 269 79 84 41 13 23
## [26,] 26 256 77 72 40 11 27
## [27,] 27 261 76 79 35 17 29
## [28,] 28 250 74 78 37 21 25
## [29,] 29 281 85 90 41 6 14
## [30,] 30 264 79 84 36 11 22
## [31,] 31 263 78 78 37 14 22
## [32,] 32 271 81 82 41 9 31
## [33,] 33 266 80 86 34 10 24
## [34,] 34 260 78 80 38 12 28
## [35,] 35 258 77 70 34 15 18
## [36,] 37 218 63 76 23 27 22
## [37,] 36 264 78 82 33 16 24
## [38,] 38 256 82 80 36 16 25
## [39,] 39 274 81 86 38 8 21
## [40,] 40 272 85 86 43 7 23
This is displayed purely for pedagogical reasons. The final results are similar to those from rlm in the example above. Just compare the last column (w7) below with rlm(formula(fit), model.frame(fit))$w; I make that comparison after the table.
fit <- lm(Y ~ x + xsq, transform(df,
x = scale(X2, T, F),
xsq = scale(X2, T, F)^2))
IRLS <- function(model, i = 1)
{
e <- resid(model)
MAD <- median(abs(e - median(e))) / 0.6745 # (11.46)
u <- e / MAD # (11.47)
w <- apply(data.frame(u), 1, function(x) if (abs(x) <= 1.345) 1 else 1.345/abs(x)) # (11.44)
model <- update(model, weights = w)
if (i > 1) return(IRLS(model, i-1)) else return(model) # Recursive return definition
}
tab <- cbind(
"e0" = resid(fit),
"u0" = resid(fit) / (median(abs(resid(fit) - median(resid(fit)))) / 0.6745),
"w1" = IRLS(fit, 1)$weights,
"e1" = resid( IRLS(fit, 1) ),
"w2" = IRLS(fit, 2)$weights,
"e2" = resid( IRLS(fit, 2) ),
"w7" = IRLS(fit, 7)$weights,
"e7" = resid( IRLS(fit, 7) ))
round(tab, 4)
## e0 u0 w1 e1 w2 e2 w7 e7
## 1 -2.4109 -0.5164 1.0000 -3.7542 1.0000 -4.0354 1.0000 -4.1269
## 2 10.5724 2.2647 0.5939 8.4297 0.7151 7.4848 0.8601 6.7698
## 3 3.0454 0.6523 1.0000 1.5411 1.0000 1.1559 1.0000 0.9731
## 4 10.3104 2.2085 0.6090 7.3822 0.8166 5.4138 1.0000 3.6583
## 5 -1.2395 -0.2655 1.0000 -1.4402 1.0000 -1.3970 1.0000 -1.3160
## 6 -0.7342 -0.1573 1.0000 -0.7695 1.0000 -0.7376 1.0000 -0.6986
## 7 -2.6394 -0.5654 1.0000 -3.1694 1.0000 -3.1469 1.0000 -3.0318
## 8 -20.6282 -4.4187 0.3044 -22.2929 0.2704 -22.7964 0.2526 -23.0873
## 9 6.5724 1.4078 0.9554 4.4297 1.0000 3.4848 1.0000 2.7698
## 10 0.2871 0.0615 1.0000 -0.7325 1.0000 -0.8490 1.0000 -0.8079
## 11 -14.8358 -3.1779 0.4232 -18.3824 0.3279 -21.4287 0.2402 -24.3167
## 12 5.0224 1.0758 1.0000 2.2502 1.0000 0.5153 1.0000 -0.9988
## 13 6.1255 1.3121 1.0000 5.7598 1.0000 5.7999 0.9875 5.9063
## 14 -1.5341 -0.3286 1.0000 -2.2278 1.0000 -2.2373 1.0000 -2.1301
## 15 1.1255 0.2411 1.0000 0.7598 1.0000 0.7999 1.0000 0.9063
## 16 1.8868 0.4042 1.0000 2.1841 1.0000 2.1504 1.0000 2.0553
## 17 1.5891 0.3404 1.0000 0.2458 1.0000 -0.0354 1.0000 -0.1269
## 18 -5.6282 -1.2056 1.0000 -7.2929 0.8266 -7.7964 0.7215 -8.0873
## 19 -3.6394 -0.7796 1.0000 -4.1694 1.0000 -4.1469 1.0000 -4.0318
## 20 -1.8745 -0.4015 1.0000 -2.2402 1.0000 -2.2001 1.0000 -2.0937
## 21 -0.1132 -0.0242 1.0000 0.1841 1.0000 0.1504 1.0000 0.0553
## 22 3.8868 0.8326 1.0000 4.1841 1.0000 4.1504 1.0000 4.0553
## 23 -0.1132 -0.0242 1.0000 0.1841 1.0000 0.1504 1.0000 0.0553
## 24 -3.1132 -0.6669 1.0000 -2.8159 1.0000 -2.8496 1.0000 -2.9447
## 25 3.1255 0.6695 1.0000 2.7598 1.0000 2.7999 1.0000 2.9063
## 26 8.3796 1.7950 0.7493 6.0787 0.9917 4.9579 1.0000 4.0681
## 27 5.0030 1.0717 1.0000 3.8213 1.0000 3.6296 1.0000 3.6128
## 28 -4.4109 -0.9448 1.0000 -5.7542 1.0000 -6.0354 0.9521 -6.1269
## 29 -1.0114 -0.2166 1.0000 -0.3793 1.0000 -0.5362 1.0000 -0.8322
## 30 -1.8745 -0.4015 1.0000 -2.2402 1.0000 -2.2001 1.0000 -2.0937
## 31 8.5891 1.8398 0.7310 7.2458 0.8320 6.9646 0.8482 6.8731
## 32 9.4659 2.0276 0.6633 8.7722 0.6872 8.7627 0.6575 8.8699
## 33 -4.7342 -1.0141 1.0000 -4.7695 1.0000 -4.7376 1.0000 -4.6986
## 34 2.2871 0.4899 1.0000 1.2675 1.0000 1.1510 1.0000 1.1921
## 35 11.6046 2.4858 0.5411 8.9889 0.6707 7.4732 0.9403 6.1839
## 36 -33.6282 -7.2033 0.1867 -35.2929 0.1708 -35.7964 0.1616 -36.0873
## 37 2.4659 0.5282 1.0000 1.7722 1.0000 1.7627 1.0000 1.8699
## 38 -1.7129 -0.3669 1.0000 -2.7325 1.0000 -2.8490 1.0000 -2.8079
## 39 3.2658 0.6995 1.0000 3.2305 1.0000 3.2624 1.0000 3.3014
## 40 1.2658 0.2711 1.0000 1.2305 1.0000 1.2624 1.0000 1.3014
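As promised, the fully iterated weights can be checked against rlm's Huber weights on the same model frame. The match is close but not exact, since rlm uses its own scale estimate and convergence criteria (a sketch, not the book's procedure):
library(MASS)
round(cbind(w7 = IRLS(fit, 7)$weights,
            rlm = rlm(formula(fit), model.frame(fit))$w), 4)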
The pairs function could produce these results, but I want to demonstrate the scatterplotMatrix function (car), which has the shorter alias spm for convenience. Also, the authors' plot is not right for TVWATCH: if you plot it against any of the other variables, you do not get the panels shown in their scatterplot matrix. The rest of the scatterplots look fine, and so does the correlation table.
library(car)
cor(df[-1])
## Y X1 X2 X3 X4 X5
## Y 1.0000 0.7414 0.7451 0.7166 -0.8735 -0.4803
## X1 0.7414 1.0000 0.3945 0.6930 -0.8312 -0.5653
## X2 0.7451 0.3945 1.0000 0.3769 -0.5936 -0.4426
## X3 0.7166 0.6930 0.3769 1.0000 -0.7919 -0.3567
## X4 -0.8735 -0.8312 -0.5936 -0.7919 1.0000 0.5117
## X5 -0.4803 -0.5653 -0.4426 -0.3567 0.5117 1.0000
spm(df[-1], span = 0.75, diagonal = "none", reg.line = FALSE, spread = FALSE,
var.labels = c("MATHPROF", "PARENTS", "HOMELIB",
"READING", "TVWATCH", "ABSENCES"))
fit <- lm(Y ~ X1 + X2 + X3 + X4 + X5, df)
data.frame("State" = levels(df$State),
"h" = round(hatvalues(fit), 2),
"t" = round(rstudent(fit), 2),
"D" = round(cooks.distance(fit), 2))
## State h t D
## 1 Alabama 0.16 -0.05 0.00
## 2 Arizona 0.19 0.40 0.01
## 3 Arkansas 0.16 1.41 0.06
## 4 California 0.29 0.10 0.00
## 5 Colorado 0.10 -0.57 0.01
## 6 Connecticut 0.12 0.03 0.00
## 7 D.C. 0.12 0.64 0.01
## 8 Delaware 0.69 1.41 0.72
## 9 Florida 0.09 1.47 0.03
## 10 Georgia 0.08 0.48 0.00
## 11 Guam 0.34 -2.83 0.57
## 12 Hawaii 0.28 1.68 0.17
## 13 Idaho 0.16 -0.81 0.02
## 14 Illinois 0.15 -0.85 0.02
## 15 Indiana 0.05 -0.15 0.00
## 16 Iowa 0.09 0.36 0.00
## 17 Kentucky 0.04 -0.56 0.00
## 18 Louisiana 0.06 -1.05 0.01
## 19 Maryland 0.12 0.51 0.01
## 20 Michigan 0.12 0.43 0.00
## 21 Minnesota 0.14 0.31 0.00
## 22 Montana 0.09 0.32 0.00
## 23 Nebraska 0.10 0.09 0.00
## 24 New_Hampshire 0.08 -0.64 0.01
## 25 New_Jersey 0.07 0.44 0.00
## 26 New_Mexico 0.19 -0.35 0.01
## 27 New_York 0.08 0.82 0.01
## 28 North_Carolina 0.15 -0.36 0.00
## 29 North_Dakota 0.19 0.53 0.01
## 30 Ohio 0.06 -0.47 0.00
## 31 Oklahoma 0.04 0.84 0.01
## 32 Oregon 0.21 0.06 0.00
## 33 Pennsylvania 0.13 -0.62 0.01
## 34 Rhode_Island 0.08 -0.71 0.01
## 35 Texas 0.30 2.25 0.33
## 36 Virginia 0.32 -5.21 1.21
## 37 Virgin_Islands 0.06 0.90 0.01
## 38 West_Virginia 0.13 -0.91 0.02
## 39 Wisconsin 0.08 0.39 0.00
## 40 Wyoming 0.08 -0.91 0.01
The author indicates that the residual plots present no strong indication of nonconstancy of the error variance for the states aside from the outliers.
par(mfrow = c(3, 2))
for (n in seq(5))
{
x <- model.frame(fit)[, n+1]
predictor <- paste("X", n, sep = "")
plot(resid(fit) ~ x, ylab = "Residual", main = predictor)
abline(0, 0)
}
In an earlier chapter I used regsubsets (leaps) to produce the possible submodels and their selection criteria, displaying the top models according to the nbest parameter. Another approach is to use bestglm (bestglm). regsubsets will produce output similar to that found in FIGURE 11.7, but bestglm offers some flexibility of its own. See more in the document:
http://cran.r-project.org/web/packages/bestglm/vignettes/bestglm.pdf
library(bestglm)
bestglm(model.frame(fit)[c(2:6, 1)], IC = "AIC")$Subsets # Author's choice is 3 on each search
## (Intercept) X1 X2 X3 X4 X5 logLikelihood AIC
## 0 TRUE FALSE FALSE FALSE FALSE FALSE -102.69 205.4
## 1 TRUE FALSE FALSE FALSE TRUE FALSE -73.90 149.8
## 2 TRUE FALSE TRUE FALSE TRUE FALSE -65.76 135.5
## 3 TRUE FALSE TRUE TRUE TRUE FALSE -64.55 135.1
## 4* TRUE TRUE TRUE TRUE TRUE FALSE -63.53 135.1
## 5 TRUE TRUE TRUE TRUE TRUE TRUE -63.22 136.4
bestglm(model.frame(fit)[c(2:6, 1)], IC = "BIC")$Subsets
## (Intercept) X1 X2 X3 X4 X5 logLikelihood BIC
## 0 TRUE FALSE FALSE FALSE FALSE FALSE -102.69 205.4
## 1 TRUE FALSE FALSE FALSE TRUE FALSE -73.90 151.5
## 2* TRUE FALSE TRUE FALSE TRUE FALSE -65.76 138.9
## 3 TRUE FALSE TRUE TRUE TRUE FALSE -64.55 140.2
## 4 TRUE TRUE TRUE TRUE TRUE FALSE -63.53 141.8
## 5 TRUE TRUE TRUE TRUE TRUE TRUE -63.22 144.9
summary(rlm(Y ~ X2 + X3 + X4, df)) # (11.53)
##
## Call: rlm(formula = Y ~ X2 + X3 + X4, data = df)
## Residuals:
## Min 1Q Median 3Q Max
## -22.40 -3.00 0.56 2.68 9.41
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 207.681 17.696 11.736
## X2 0.797 0.140 5.698
## X3 0.161 0.221 0.728
## X4 -1.169 0.223 -5.241
##
## Residual standard error: 4.34 on 36 degrees of freedom
summary(lm (Y ~ X2 + X3 + X4, df)) # (11.54)
##
## Call:
## lm(formula = Y ~ X2 + X3 + X4, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.926 -2.948 0.256 2.461 9.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 199.611 21.529 9.27 4.5e-11 ***
## X2 0.780 0.170 4.59 5.3e-05 ***
## X3 0.401 0.269 1.49 0.14423
## X4 -1.156 0.271 -4.26 0.00014 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.29 on 36 degrees of freedom
## Multiple R-squared: 0.851, Adjusted R-squared: 0.839
## F-statistic: 68.8 on 3 and 36 DF, p-value: 5.65e-15
with(summary(regsubsets(formula(fit), df, nbest=2)),
cbind(which[, -1], rss, rsq, adjr2, cp, bic))
## X1 X2 X3 X4 X5 rss rsq adjr2 cp bic
## 1 0 0 0 1 0 1609.4 0.7630 0.7567 21.993 -50.20
## 1 0 1 0 0 0 3020.6 0.5551 0.5434 72.842 -25.02
## 2 0 1 0 1 0 1071.3 0.8422 0.8337 4.604 -62.79
## 2 1 1 0 0 0 1410.5 0.7923 0.7810 16.826 -51.79
## 3 0 1 1 1 0 1008.9 0.8514 0.8390 4.354 -61.51
## 3 1 1 0 1 0 1013.6 0.8507 0.8383 4.524 -61.32
## 4 1 1 1 1 0 958.3 0.8589 0.8427 4.532 -59.88
## 4 1 1 0 1 1 993.7 0.8536 0.8369 5.808 -58.42
## 5 1 1 1 1 1 943.6 0.8610 0.8406 6.000 -56.81
df <- get("CH11TA07", envir = env)
names(df) <- c("X1", "X2", "Y")
fit <- lm(Y ~ X1 + X2, df) # (6.1)
lo <- function(model, newdata = data.frame(X1 = 30, X2 = 3), q = 0.5)
{
x <- transform(model.frame(model),
d = sqrt(scale(X1, newdata$X1, sd(X1))^2 + scale(X2, newdata$X2, sd(X2))^2)) # scaled distance from each case to newdata
dq <- with(x, sort(d)[nrow(x) * q]) # distance to the farthest case within the span q
x <- transform(x, w = ifelse(d > dq, 0, (1-(d/dq)^3)^3)) # tricube weights, zero outside the neighborhood
with(x, update(model, weights = w))
}
transform(df,
d = sqrt(scale(X1, 30, sd(X1))^2 + scale(X2, 3, sd(X2))^2),
w = lo(fit)$weights)
## X1 X2 Y d w
## 1 66.29 7 240 3.01254 0.00000
## 2 40.96 5 73 1.14306 0.29974
## 3 73.00 10 311 4.21154 0.00000
## 4 45.01 6 136 1.65285 0.00000
## 5 57.20 4 183 1.89606 0.00000
## 6 26.85 5 13 0.89378 0.59668
## 7 38.12 4 35 0.70141 0.78781
## 8 35.84 6 61 1.36080 0.08631
## 9 65.80 9 319 3.56056 0.00000
## 10 37.41 5 30 1.00292 0.46835
## 11 54.38 2 148 1.70985 0.00000
## 12 46.19 7 116 2.05401 0.00000
## 13 46.13 4 71 1.17728 0.26047
## 14 30.37 3 10 0.02483 0.99999
## 15 39.06 5 89 1.06353 0.39479
## 16 79.38 1 316 3.46092 0.00000
## 17 52.77 8 154 2.66338 0.00000
## 18 55.92 6 164 2.18782 0.00000
predict(lo(fit), data.frame(X1 = 30, X2 = 3)) # 4.664607
## 1
## 4.665
# Compare
fit <- loess(Y ~ X1 + X2, df, span = 0.5, degree = 1)
predict(fit, data.frame(X1 = 30, X2 = 3)) # returns 4.2894 with normalize = FALSE
## 1
## 9.824
Loess fits can vary considerably with small changes to the tuning parameters. While I cannot fit a model that exactly reproduces what the book shows, I demonstrate how to make contour plots, extend that to a 3D surface, and construct the associated conditioning plots manually. There is also a built-in coplot function, sketched briefly below.
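A minimal coplot sketch on these data (the number of slices and the smoothing panel are arbitrary choices of mine):
# Y against X1, conditioned on overlapping slices of X2
coplot(Y ~ X1 | X2, data = df, number = 3, panel = panel.smooth)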
The contour function operates as a 2D analog of persp, so the same approach used in the lm3d function defined in chapter 6 for FIGURE 6.7 can be used here: simply feed it the three dimensional sequences, as below. To that end, lm3d has been modified to invisibly return those three sequences (as a list), which are then passed to contour.
lm3d <- function(model, res, ...) {
ivs <- labels(terms(model))
x <- model.frame(model)[, ivs[1]] # 1st independent variable
y <- model.frame(model)[, ivs[2]] # 2nd independent variable
xr <- seq(min(x), max(x), len = res) # equidistant sequence from range of 1st iv
yr <- seq(min(y), max(y), len = res) # equidistant sequence from range of 2nd iv
# Create a list object of these sequences to be turned into a grid
newdata <- list(xr, yr)
names(newdata) <- ivs
newdata <- do.call('expand.grid', newdata)
zr <- matrix(predict(model, newdata), res, res)
persp(xr, yr, zr, ...)
invisible(list(x = xr, y = yr, z = zr)) # included in this example for `contour`
}
fit <- loess(Y ~ X1 + X2, df, span = 0.5, degree = 1)
lm3d(fit, res = 30, theta = 270, phi = 0, ticktype = "detailed", expand = 2/3, shade = 0.5)
# Store the dimensional sequences for the plots below
df <- lm3d(fit, res = 30, theta = 300, phi = 30, ticktype = "detailed", expand = 2/3, shade = 0.5)
with(df, contour(x, y, z))
par(mfrow = c(1, 3))
plot(df$x, predict(fit, data.frame(X1=df$x, X2=3)), type='l', ylim=c(0, 360), ylab = 'y', xlab = 'X1')
title("X2 = 3")
plot(df$x, predict(fit, data.frame(X1=df$x, X2=6)), type='l', ylim=c(0, 360), ylab = 'y', xlab = 'X1')
title("X2 = 6")
# Very different scale than book
plot(df$x, predict(fit, data.frame(X1=df$x, X2=9)), type='l', ylim=c(-100, 360), ylab = 'y', xlab = 'X1')
title("X2 = 9")
df <- get("CH11TA08", envir = env)
names(df) <-c("Y", "X")
The cut function will easily factor Age into the regions (R) of interest. Since these are left-closed and right-open, the appropriate parameters are specified below. The book says not to include Age = 25 (making the last region right-open as well); that does not seem right, and cut does not allow it anyway. The mean score is not much different either way.
df <- transform(df,
R = cut(X, c(8, 9, 10, 13, 14, 25), include.lowest = TRUE, right = FALSE))
with(df, tapply(Y, R, mean))
## [8,9) [9,10) [10,13) [13,14) [14,25]
## 3.550 8.133 13.675 16.950 22.269
While the above demonstrates an approach to forming the bins of a regression tree, the authors do not show much about how to construct the trees themselves. Therefore, I will simply use R facilities to reproduce the authors' output as closely as possible. For more on data mining and regression trees, I suggest the authors' own recommendation (reference 11.15), Hastie et al., The Elements of Statistical Learning.
R has a regression tree function, rpart (rpart). It takes a regression formula or object and generates a tree. To manage the parameters of the tree, you manipulate the control argument using the rpart function rpart.control. The selection of parameters in this case was trial-and-error. Since there were at least two Y values in each bin, I set minsplit = 2. The cp value defaults to 0.01, which results in splitting up the [14, 25] bin; it turns out that cp = 0.013 maintains the authors' results. The object returned by rpart can be used much like an lm object, as demonstrated below.
library(rpart)
fit <- rpart(Y ~ X, df, control = rpart.control(minsplit = 2, cp = 0.013))
# FIGURE 11.9a
plot(Y ~ X, df, xlab = "Age", ylab = "Steroid Level")
lines(predict(fit, data.frame(X = sort(X))) ~ sort(X), df, type = 's') # Step plot
# FIGURE 11.9b
plot(resid(fit) ~ predict(fit), xlab = "Predicted", ylab = "Residual")
# FIGURE 11.9c
plot(fit)
text(fit, use.n = TRUE, xpd = TRUE)
Growing the tree amounts to manipulating the parameters of rpart. Therefore, I will construct a few different models and plot them as step functions like the one above. To facilitate that, I will write a convenience wrapper that plots the results; all extra parameters go straight to rpart.
rplot <- function(formula, data, ...)
{
df <- model.frame(formula, data)
model <- rpart(formula, df, ...) # pass the supplied formula straight through to rpart
plot(formula, df, xlab = "Age", ylab = "Steroid Level")
lines(predict(model, data.frame(X = sort(X))) ~ sort(X), df, type = 's')
}
par(mfrow = c(2, 2))
rplot(Y ~ X, df, minsplit = 2, cp = 0.095)
rplot(Y ~ X, df, minsplit = 2, cp = 0.04)
rplot(Y ~ X, df, minsplit = 2, cp = 0.03)
rplot(Y ~ X, df, minsplit = 2, cp = 0.015)
df <- get("APPENC04", envir = env)
names(df) <- c("id", "gpa", "rank", "score", "year")
The lm3d function is generic enough to even handle regression tree models. The only requirement is that the rpart parameter model = TRUE be set so that model.frame will actually return the data frame that comprises this model.
library(rpart)
set.seed(38383) # I want more control over what model gets generated here.
indices <- sample(nrow(df), 400)
df.test <- df[indices, ]
df.valid <- df[-indices, ]
fit <- rpart(gpa ~ score + rank, df.test, cp = 0.015, model = TRUE)
mspr <- function(model, validation)
{
x <- model.frame(formula(model), validation)
x <- sum((x[, 1] - predict(model, x[-1]))^2) / nrow(x) # (9.20)
return(x)
}
# MSPR over a range of cp values (single sweep)
x <- NULL
for (cp in seq(0.01, 0.04, by = 0.001))
x <- append(x, mspr(rpart(gpa ~ rank + score, df.test, cp = cp), df.valid))
# MSPR over a grid of maximum tree depths and cp values
x <- NULL
for (n in seq(30)) {
for (cp in seq(0.01, 0.03, by = 0.001)) {
x <- append(x, mspr(rpart(gpa ~ rank + score, df.test, maxdepth = n, cp = cp), df.valid))
}
}
x <- data.frame(depth = rep(seq(30), each = 21), cp = seq(0.01, 0.03, by = 0.001), mspr = x)
# FIGURE 11.12 (a)
lm3d(fit, 30, theta = 300, phi = 30, ticktype = "detailed", expand = 2/3, shade = 0.5, xlab = "Score", ylab = "Rank", zlab = "GPA")
# FIGURE 11.12 (b)
lm3d(fit, 30, theta = 35, phi = 30, ticktype = "detailed", expand = 2/3, shade = 0.5, xlab = "Score", ylab = "Rank", zlab = "GPA")
# FIGURE 11.12 (c)
plot(fit, compress = TRUE, uniform = TRUE)
text(fit, use.n = TRUE, xpd = TRUE, digits = 4)
title("(c)")
# FIGURE 11.12 (d)
plot(predict(fit), resid(fit), xlab = "Predicted GPA", ylab = "Residual")
title("(d)")
df <- get("CH11TA09", envir = env)
names(df) <- c("X", "Y")
fit <- lm(Y ~ X, df)
Columns (5) and (6) will differ from the book's since column (5) is a random sample, with replacement, from the residuals of the original model. This single illustrative trial is not part of the 1,000 bootstrap samples that follow; it is shown for completeness.
estar <- sample(resid(fit), size = nrow(df), replace = TRUE)
cbind(df,
'Yh' = fitted(fit),
'ei' = resid(fit),
'estar' = estar,
'ystar' = fitted(fit) + estar)
## X Y Yh ei estar ystar
## 1 80 399 348.0 51.0180 38.8261 386.8
## 2 30 121 169.5 -48.4719 5.3160 174.8
## 3 50 221 240.9 -19.8760 -48.4719 192.4
## 4 90 376 383.7 -7.6840 -66.3861 317.3
## 5 70 361 312.3 48.7200 -52.5780 259.7
## 6 60 224 276.6 -52.5780 51.0180 327.6
## 7 120 546 490.8 55.2099 42.5281 533.3
## 8 80 352 348.0 4.0180 51.0180 399.0
## 9 100 353 419.4 -66.3861 -20.0881 399.3
## 10 50 157 240.9 -83.8760 -7.6840 233.2
## 11 40 160 205.2 -45.1739 0.6139 205.8
## 12 70 252 312.3 -60.2800 -66.3861 245.9
## 13 90 389 383.7 5.3160 10.7200 394.4
## 14 20 113 133.8 -20.7699 -20.0881 113.7
## 15 110 435 455.1 -20.0881 -20.0881 435.0
## 16 100 420 419.4 0.6139 -52.5780 366.8
## 17 30 212 169.5 42.5281 -48.4719 121.0
## 18 50 268 240.9 27.1240 0.6139 241.5
## 19 90 377 383.7 -6.6840 4.0180 387.7
## 20 110 421 455.1 -34.0881 42.5281 497.6
## 21 30 273 169.5 103.5281 -60.2800 109.2
## 22 90 468 383.7 84.3160 -45.1739 338.5
## 23 40 244 205.2 38.8261 -20.7699 184.4
## 24 80 342 348.0 -5.9820 51.0180 399.0
## 25 70 323 312.3 10.7200 4.0180 316.3
bootstrap <- function(model, times = 1000, alpha = 0.05)
{
b <- coef(model)[[2]]
n <- nrow(model.frame(model))
coefs <- vector(mode = "numeric", length = times)
for (i in seq(times))
{
estar <- sample(resid(model), size = n, replace = TRUE) # resample residuals (fixed X)
fstar <- fitted(model) + estar # bootstrap responses Y*
bootmodel <- lm(fstar ~ ., data = model.frame(model)[-1]) # regress Y* on the predictors only (drop the original response)
coefs[i] <- coef(bootmodel)[[2]]
}
p <- quantile(coefs, probs = c(alpha/2, 1-alpha/2))
statistics <- cbind(
"mean" = mean(coefs),
"sd" = sd(coefs),
"b(a/2)" = p[[1]],
"b(1-a/2)" = p[[2]])
confint <- cbind(
'd1' = b - p[[1]],
'd2' = p[[2]] - b,
'lwr' = 2*b - p[[2]],
'upr' = 2*b - p[[1]])
return (list(coefs = coefs, statistics = statistics, confint = confint))
}
z <- bootstrap(fit)
z$statistics
## mean sd b(a/2) b(1-a/2)
## [1,] -0.001328 0.1993 -0.3981 0.3745
z$confint
## d1 d2 lwr upr
## [1,] 3.968 -3.196 6.766 7.539
hist(z$coefs, breaks = 40, freq = F, main = "", xlab = "Bootstrap b1*")
df <- get("CH11TA10", envir = env)
names(df) <- c("X", "Y")
fit <- lm(Y ~ X, df)
xstar <- sample(df$X, size = nrow(df), replace = TRUE)
ystar <- sample(df$Y, size = nrow(df), replace = TRUE)
estar <- abs(resid(lm(ystar ~ xstar) ))
cbind(df,
'X*' = xstar,
'Y*' = ystar,
'e*' = round( resid( lm(ystar ~ xstar) ), 2),
's*' = round( fitted( lm(estar ~ xstar) ), 2),
'w*' = round( 1 / (fitted( lm(estar ~ xstar) ))^2, 4))
## X Y X* Y* e* s* w*
## 1 27 73 33 101 21.38 8.58 0.0136
## 2 21 66 43 73 -3.82 7.44 0.0181
## 3 22 63 57 92 19.11 5.84 0.0293
## 4 24 75 49 71 -4.14 6.75 0.0219
## 5 25 71 57 66 -6.89 5.84 0.0293
## 6 23 70 48 90 14.58 6.87 0.0212
## 7 20 65 47 63 -12.70 6.98 0.0205
## 8 20 70 45 71 -5.26 7.21 0.0192
## 9 29 79 50 71 -3.86 6.64 0.0227
## 10 24 72 43 92 15.18 7.44 0.0181
## 11 25 68 30 72 -8.46 8.92 0.0126
## 12 28 67 24 83 0.86 9.60 0.0108
## 13 26 79 42 80 2.90 7.55 0.0175
## 14 38 91 56 73 -0.17 5.95 0.0282
## 15 32 76 46 68 -7.98 7.09 0.0199
## 16 33 69 25 101 19.14 9.49 0.0111
## 17 31 66 48 73 -2.42 6.87 0.0212
## 18 34 73 50 80 5.14 6.64 0.0227
## 19 37 78 31 65 -15.18 8.80 0.0129
## 20 38 87 50 76 1.14 6.64 0.0227
## 21 33 76 22 90 7.29 9.83 0.0103
## 22 35 79 34 85 5.66 8.46 0.0140
## 23 30 73 25 79 -2.86 9.49 0.0111
## 24 31 80 38 65 -13.22 8.01 0.0156
## 25 37 68 52 71 -3.30 6.41 0.0243
## 26 39 75 42 73 -4.10 7.55 0.0175
## 27 46 89 22 92 9.29 9.83 0.0103
## 28 49 101 52 70 -4.30 6.41 0.0243
## 29 40 70 52 70 -4.30 6.41 0.0243
## 30 42 72 52 85 10.70 6.41 0.0243
## 31 43 80 52 69 -5.30 6.41 0.0243
## 32 46 83 27 90 8.70 9.26 0.0117
## 33 43 75 29 89 8.26 9.03 0.0123
## 34 44 71 57 79 6.11 5.84 0.0293
## 35 46 80 54 83 9.27 6.18 0.0262
## 36 47 96 20 70 -13.27 10.06 0.0099
## 37 45 92 34 91 11.66 8.46 0.0140
## 38 49 80 42 80 2.90 7.55 0.0175
## 39 48 70 59 70 -2.33 5.61 0.0318
## 40 40 90 26 91 9.42 9.37 0.0114
## 41 42 85 27 92 10.70 9.26 0.0117
## 42 55 76 27 79 -2.30 9.26 0.0117
## 43 54 71 43 72 -4.82 7.44 0.0181
## 44 57 99 40 70 -7.66 7.78 0.0165
## 45 52 86 50 79 4.14 6.64 0.0227
## 46 53 79 26 65 -16.58 9.37 0.0114
## 47 56 92 20 73 -10.27 10.06 0.0099
## 48 52 85 46 66 -9.98 7.09 0.0199
## 49 50 71 24 72 -10.14 9.60 0.0108
## 50 59 90 58 67 -5.61 5.72 0.0305
## 51 50 91 50 70 -4.86 6.64 0.0227
## 52 52 100 40 83 5.34 7.78 0.0165
## 53 58 80 43 68 -8.82 7.44 0.0181
## 54 57 109 21 79 -3.99 9.95 0.0101
I first code an algorithm similar to the one used in the previous example, and then show the R approach using boot (boot). Using boot requires defining a statistic function; here we define boot.coef, which extracts the coefficient of interest just as the other functions do. The boot function then calculates the relevant information, and boot.ci generates confidence intervals by a number of methods.
library(boot)
bootstrap <- function(model, times = 1000, alpha = 0.05)
{
b <- coef(model)[[2]]
n <- nrow(model.frame(model))
coefs <- vector(mode = "numeric", length = times)
for(i in seq(times))
{
indices <- sample(1:n, size = n, replace = TRUE)
xstar <- model.frame(model)[indices, 2]
ystar <- model.frame(model)[indices, 1]
mod <- lm(ystar ~ xstar)
mod <- update(mod, weights = fitted(lm(abs(resid(mod)) ~ xstar))^(-2))
coefs[i] <- coef(mod)[[2]]
}
p <- quantile(coefs, probs = c(alpha/2, 1-alpha/2))
statistics <- cbind(
"mean" = mean(coefs),
"sd" = sd(coefs),
"b(a/2)" = p[[1]],
"b(1-a/2)" = p[[2]])
confint <- cbind(
'd1' = b - p[[1]],
'd2' = p[[2]] - b,
'lwr' = 2*b - p[[2]],
'upr' = 2*b - p[[1]])
return (list(coefs = coefs, statistics = statistics, confint = confint))
}
boot.coef <- function(data, indices)
{
x <- data[indices, 2]
y <- data[indices, 1]
mod <- lm(y ~ x)
mod <- update(mod, weights = fitted(lm(abs(resid(mod)) ~ x))^(-2))
coef(mod)[[2]]
}
z <- bootstrap(fit)
z$statistics
## mean sd b(a/2) b(1-a/2)
## [1,] 0.5963 0.08503 0.4294 0.7613
z$confint
## d1 d2 lwr upr
## [1,] 0.1507 0.1813 0.3987 0.7307
hist(z$coefs, breaks = 40, freq = F, main = "", xlab = "Bootstrap b1*")
# Using the boot package
z <- boot(data = model.frame(fit), statistic = boot.coef, R = 1000)
print(z)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = model.frame(fit), statistic = boot.coef, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.5963 -0.001853 0.08567
boot.ci(z)
## Warning: bootstrap variances needed for studentized intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = z)
##
## Intervals :
## Level Normal Basic
## 95% ( 0.4303, 0.7661 ) ( 0.4309, 0.7672 )
##
## Level Percentile BCa
## 95% ( 0.4255, 0.7618 ) ( 0.4315, 0.7675 )
## Calculations and Intervals on Original Scale
hist(z$t, breaks = 40, freq = F, main = "", xlab = "Bootstrap b1*")
df <- get("CH11TA11", envir = env)
names(df) <- c("AADT", "CTYPOP", "LANES", "WIDTH",
"CONTROL", "CLASS", "TRUCK", "LOCALE")
df <- transform(df,
CONTROL = factor(CONTROL),
CLASS = factor(CLASS),
TRUCK = factor(TRUCK),
LOCALE = factor(LOCALE))
fit <- lm(AADT ~ ., df)
{
print(head(df))
cat("...\n")
print(tail(df))
}
## AADT CTYPOP LANES WIDTH CONTROL CLASS TRUCK LOCALE
## 1 1616 13404 2 52 2 2 5 1
## 2 1329 52314 2 60 2 2 5 1
## 3 3933 30982 2 57 2 4 5 2
## 4 3786 25207 2 64 2 4 5 2
## 5 465 20594 2 40 2 2 5 1
## 6 794 11507 2 44 2 2 5 1
## ...
## AADT CTYPOP LANES WIDTH CONTROL CLASS TRUCK LOCALE
## 116 5576 194279 2 40 2 4 5 2
## 117 13723 941411 2 44 2 4 5 2
## 118 21535 941411 4 60 2 4 5 3
## 119 14905 459784 4 68 2 4 5 2
## 120 15408 459784 2 40 2 4 5 3
## 121 1266 43784 2 44 2 4 5 2
library(car)
spm(df, diagonal = "none", span = 0.75, reg.line = F, by.groups = TRUE)
Since the qualitative variables were read in as factors, the regsubsets function will automatically split them into dummy variables. However, vif treats each factor as a single term (reporting generalized VIFs), giving results that differ from what the book shows, though they are still indicative; a per-dummy calculation is sketched after the vif output below.
library(leaps)
library(car)
# Some Mentioned Diagnostics
plot(resid(fit) ~ fitted(fit), xlab = "Fitted", ylab = "Residual", pch = 19)
vif(fit)
## GVIF Df GVIF^(1/(2*Df))
## CTYPOP 1.767 1 1.329
## LANES 2.751 1 1.658
## WIDTH 1.473 1 1.214
## CONTROL 24.549 1 4.955
## CLASS 492.832 3 2.810
## TRUCK 7.561 4 1.288
## LOCALE 24.127 2 2.216
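# The book appears to report per-dummy VIFs rather than the generalized VIFs
# above (my reading); as a rough check, those can be computed directly from the
# dummy-expanded model matrix.
X <- model.matrix(fit)[, -1] # drop the intercept column
round(diag(solve(cor(X))), 1) # VIF_k = kth diagonal of the inverse correlation matrix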
max(cooks.distance(fit), na.rm = TRUE)
## [1] 0.2076
# Best Subsets Analysis
best <- regsubsets(formula(fit), df, force.in = c(1, 2), nbest=5, nvmax = 7)
best$xnames <- paste("X", 0:13, sep = "")
with(summary(best), cbind(
which[,-c(1:3)],
Cp = round(cp, 4),
Rsq = round(rsq, 4),
AdjRsq = round(adjr2, 4)))
## X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 Cp Rsq AdjRsq
## 3 0 0 0 1 0 0 0 0 0 0 0 5.231 0.8045 0.7995
## 3 0 1 0 0 0 0 0 0 0 0 0 37.390 0.7514 0.7450
## 3 0 0 0 0 1 0 0 0 0 0 0 47.353 0.7349 0.7281
## 3 0 0 0 0 0 0 0 0 0 0 1 56.308 0.7201 0.7129
## 3 0 0 0 0 0 0 0 0 0 1 0 65.732 0.7045 0.6969
## 4 0 0 0 1 0 0 0 0 0 1 0 3.699 0.8104 0.8038
## 4 0 0 1 1 0 0 0 0 0 0 0 5.659 0.8071 0.8005
## 4 0 0 0 1 1 0 0 0 0 0 0 5.954 0.8066 0.8000
## 4 0 0 0 1 0 0 0 1 0 0 0 6.156 0.8063 0.7996
## 4 0 0 0 1 0 0 0 0 0 0 1 6.234 0.8062 0.7995
## 5 1 0 0 1 0 0 0 0 0 1 0 4.552 0.8123 0.8041
## 5 0 0 0 1 0 0 0 1 0 1 0 4.592 0.8122 0.8040
## 5 0 0 0 1 0 1 0 0 0 1 0 4.681 0.8120 0.8039
## 5 0 0 0 1 0 0 0 0 1 1 0 5.225 0.8111 0.8029
## 5 0 0 0 1 1 0 0 0 0 1 0 5.565 0.8106 0.8023
## 6 0 1 1 1 1 0 0 0 0 0 0 5.022 0.8148 0.8050
## 6 1 0 0 1 0 0 0 1 0 1 0 5.154 0.8146 0.8048
## 6 0 0 0 1 0 0 0 1 1 1 0 5.202 0.8145 0.8047
## 6 0 0 0 1 0 1 0 1 0 1 0 5.827 0.8135 0.8036
## 6 1 0 0 1 0 1 0 0 0 1 0 5.903 0.8133 0.8035
## 7 0 1 1 1 1 0 0 0 0 1 0 5.761 0.8169 0.8055
## 7 1 1 1 1 1 0 0 0 0 0 0 5.800 0.8168 0.8055
## 7 0 1 1 1 1 1 0 0 0 0 0 6.017 0.8165 0.8051
## 7 0 1 1 1 1 0 0 1 0 0 0 6.018 0.8164 0.8051
## 7 1 0 0 1 0 0 0 1 1 1 0 6.442 0.8157 0.8043
To collapse CLASS2 and CLASS4 into one group we need only change the levels of the CLASS factor. In other words, we replace the level value for CLASS2 with that of CLASS4, as shown below.
The names for the pool of variables will go as follows:
intercept X0 CTYPOP X1 LANES X2
CONTROL1 X3 CLASS1 X4 CLASS3 X5
CTYPOP2 X6 LANES2 X7 CTYPOP:LANES X8
CTYPOP:CONTROL1 X9 CTYPOP:CLASS1 X10 CTYPOP:CLASS3 X11
LANES:CONTROL1 X12 LANES:CLASS1 X13 LANES:CLASS3 X14
CONTROL1:CLASS1 X15 CONTROL1:CLASS3 X16
levels(df$CLASS)[2] <- "4"
df <- transform(df,
CTYPOP2 = scale(CTYPOP, T, F)^2,
LANES2 = scale(LANES, T, F)^2)
pool <- AADT ~ (CTYPOP + LANES + CONTROL + CLASS)^2 + CTYPOP2 + LANES2
best <- regsubsets(pool, df, force.in = c(1, 2), nbest = 5)
## Warning: 4 linear dependencies found
## Reordering variables and trying again:
best$xnames <- paste("X", 0:16, sep = "")
with(summary(best), cbind(
which[,-c(1:3)],
Cp = round(cp, 4),
Rsq = round(rsq, 4),
AdjRsq = round(adjr2, 4)))
## X3 X4 X5 X6 X7 X8 X9 X13 X10 X11 X14 X12 X15 X16 Cp Rsq AdjRsq
## 3 0 0 0 0 0 0 1 0 0 0 0 0 0 0 65.554 0.8956 0.8929
## 3 0 0 0 0 0 0 0 0 1 0 0 0 0 0 74.711 0.8902 0.8874
## 3 0 0 0 0 0 0 0 1 0 0 0 0 0 0 79.860 0.8872 0.8843
## 3 0 0 0 0 0 0 0 0 0 0 0 1 0 0 159.749 0.8405 0.8364
## 3 0 0 0 0 0 1 0 0 0 0 0 0 0 0 194.316 0.8203 0.8157
## 4 0 0 0 0 1 0 1 0 0 0 0 0 0 0 19.029 0.9240 0.9213
## 4 0 0 0 0 1 0 0 0 1 0 0 0 0 0 30.072 0.9175 0.9147
## 4 0 0 0 0 1 0 0 1 0 0 0 0 0 0 33.726 0.9154 0.9125
## 4 0 0 0 0 0 1 0 0 1 0 0 0 0 0 41.339 0.9109 0.9079
## 4 0 0 0 0 0 1 0 1 0 0 0 0 0 0 41.940 0.9106 0.9075
## 5 0 1 0 0 0 0 0 0 1 1 0 0 0 0 8.139 0.9315 0.9285
## 5 0 0 1 0 0 0 1 0 0 0 0 1 0 0 9.479 0.9307 0.9277
## 5 0 0 0 0 1 0 1 0 1 0 0 0 0 0 10.787 0.9300 0.9269
## 5 0 1 0 0 0 0 0 1 0 1 0 0 0 0 11.023 0.9298 0.9268
## 5 0 0 0 0 1 0 1 1 0 0 0 0 0 0 12.593 0.9289 0.9258
## 6 0 1 0 0 0 1 0 0 1 1 0 0 0 0 4.572 0.9348 0.9313
## 6 0 0 0 0 1 1 1 0 1 0 0 0 0 0 4.679 0.9347 0.9313
## 6 0 0 0 0 1 1 1 1 0 0 0 0 0 0 5.636 0.9341 0.9307
## 6 0 1 0 0 0 1 0 1 0 1 0 0 0 0 6.924 0.9334 0.9299
## 6 0 0 1 0 0 1 1 0 0 0 0 1 0 0 6.973 0.9334 0.9298
## 7 0 1 0 1 0 1 0 0 1 1 0 0 0 0 1.071 0.9380 0.9341
## 7 0 0 0 1 1 1 1 0 1 0 0 0 0 0 1.708 0.9376 0.9337
## 7 0 0 0 1 1 1 1 1 0 0 0 0 0 0 2.051 0.9374 0.9335
## 7 0 1 0 1 0 1 0 1 0 1 0 0 0 0 2.229 0.9373 0.9334
## 7 1 0 0 1 0 1 1 0 0 0 1 0 0 0 4.061 0.9362 0.9323
## 8 0 1 0 1 1 1 0 0 1 1 0 0 0 0 2.312 0.9384 0.9340
## 8 0 1 0 1 0 1 1 0 1 1 0 0 0 0 2.454 0.9383 0.9339
## 8 0 1 0 1 0 1 0 0 1 1 0 0 1 0 2.482 0.9383 0.9339
## 8 0 1 0 1 0 1 0 0 1 0 1 0 1 0 2.482 0.9383 0.9339
## 8 0 0 0 1 0 1 0 0 1 1 1 0 1 0 2.482 0.9383 0.9339
## 9 0 1 1 1 0 1 0 0 1 1 0 1 0 0 3.511 0.9389 0.9339
## 9 0 1 1 1 1 1 0 0 1 1 0 0 0 0 3.822 0.9387 0.9337
## 9 0 1 1 1 0 1 1 0 1 1 0 0 0 0 3.827 0.9387 0.9337
## 9 0 1 1 1 0 1 0 1 1 1 0 0 0 0 3.927 0.9386 0.9337
## 9 0 1 0 1 1 1 0 0 1 1 0 1 0 0 4.018 0.9386 0.9336
par(mfrow = c(1, 2), pch = 19)
fit <- lm(AADT ~ CTYPOP + LANES + CONTROL + CLASS, df)
plot(rstudent(fit) ~ fitted(fit), xlab = "Fitted Values", ylab = "Studentized Residual")
title("(a) First-Order OLS Model")
fit <- lm(AADT ~ CTYPOP + LANES + LANES2 + CONTROL + CTYPOP:CONTROL, df)
plot(rstudent(fit) ~ fitted(fit), xlab = "Fitted Values", ylab = "Studentized Residual")
title("(b) Second-order OLS Model")
The weights can be extracted using the method from TABLE 11.1 above; the standard deviation function is simply modified as the authors detail. The R procedure follows theirs, repeated four times in all, but the summaries show that the results (particularly for the CONTROL term) do not come out the same at all (e.g., the book's CONTROL1 coefficient is negative).
fit <- lm(AADT ~ CTYPOP + LANES + LANES2 + CONTROL + CTYPOP:CONTROL, df)
summary(fit) # Compare before iterations
##
## Call:
## lm(formula = AADT ~ CTYPOP + LANES + LANES2 + CONTROL + CTYPOP:CONTROL,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32514 -2958 -705 2532 36775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.74e+04 4.51e+03 -3.86 0.00019 ***
## CTYPOP 8.08e-02 4.47e-03 18.08 < 2e-16 ***
## LANES 6.73e+03 9.47e+02 7.10 1.1e-10 ***
## LANES2 2.35e+03 3.67e+02 6.40 3.5e-09 ***
## CONTROL2 3.03e+03 3.16e+03 0.96 0.34007
## CTYPOP:CONTROL2 -6.93e-02 5.42e-03 -12.79 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8480 on 115 degrees of freedom
## Multiple R-squared: 0.925, Adjusted R-squared: 0.921
## F-statistic: 282 on 5 and 115 DF, p-value: <2e-16
w <- fitted(lm(abs(resid(fit)) ~ CTYPOP + LANES, df))^(-2)
fit <- update(fit, weights = w) # 1st iteration
w <- fitted(lm(abs(resid(fit)) ~ CTYPOP + LANES, df))^(-2)
fit <- update(fit, weights = w) # 2nd iteration
w <- fitted(lm(abs(resid(fit)) ~ CTYPOP + LANES, df))^(-2)
fit <- update(fit, weights = w) # 3rd iteration
w <- fitted(lm(abs(resid(fit)) ~ CTYPOP + LANES, df))^(-2)
fit <- update(fit, weights = w) # 4th iteration
summary(fit) # Compare after iterations
##
## Call:
## lm(formula = AADT ~ CTYPOP + LANES + LANES2 + CONTROL + CTYPOP:CONTROL,
## data = df, weights = w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.809 -0.903 -0.248 0.667 6.965
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.35e+04 4.64e+03 -2.92 0.0042 **
## CTYPOP 7.83e-02 7.85e-03 9.98 < 2e-16 ***
## LANES 6.16e+03 9.30e+02 6.62 1.2e-09 ***
## LANES2 2.24e+03 7.52e+02 2.99 0.0035 **
## CONTROL2 2.19e+02 3.36e+03 0.06 0.9483
## CTYPOP:CONTROL2 -6.37e-02 8.42e-03 -7.57 1.0e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.4 on 115 degrees of freedom
## Multiple R-squared: 0.803, Adjusted R-squared: 0.794
## F-statistic: 93.6 on 5 and 115 DF, p-value: <2e-16
par(mfrow = c(1, 2), pch = 19)
plot(rstudent(fit) ~ fitted(fit), xlab = "Fitted", ylab = "Student. Residual")
title("(a) Residual Plot against Fitted")
qqnorm(rstudent(fit), xlab = "Expected", ylab = "Student. Residual", main = "")
qqline(rstudent(fit))
title("(b) Normal Probability Plot")