env <- new.env()
load("../data.rda", envir = env)
df <- get("CH09TA01", envir = env)
names(df) <- c(paste("X", 1:8, sep = ""), "Y", "lnY")
data.names <- c("Bloodclot", "Progindex", "Enzyme", "Liver",
"Age", "Gender", "Alc.Mod", "Alc.Heavy",
"Survival", "LnSurvival")
{
print(head(df))
cat("... \n")
print(tail(df))
}
## X1 X2 X3 X4 X5 X6 X7 X8 Y lnY
## 1 6.7 62 81 2.59 50 0 1 0 695 6.544
## 2 5.1 59 66 1.70 39 0 0 0 403 5.999
## 3 7.4 57 83 2.16 55 0 0 0 710 6.565
## 4 6.5 73 41 2.01 48 0 0 0 349 5.854
## 5 7.8 65 115 4.30 45 0 0 1 2343 7.759
## 6 5.8 38 72 1.42 65 1 1 0 348 5.852
## ...
## X1 X2 X3 X4 X5 X6 X7 X8 Y lnY
## 49 5.1 67 77 2.86 66 1 0 0 581 6.365
## 50 3.9 82 103 4.55 50 0 1 0 1078 6.983
## 51 6.6 77 46 1.95 50 0 1 0 405 6.005
## 52 6.4 85 40 1.21 58 0 0 1 579 6.361
## 53 6.4 59 85 2.33 63 0 1 0 550 6.310
## 54 8.8 78 72 3.20 56 0 0 0 651 6.478
Included here are the stem-and-leaf plots for each of the predictor variables chosen for use at this time (i.e., the first four). Also included are the scatterplot matrix and correlation matrix.
Since I find the boxplot more visually telling than the stem-and-leaf plot, I have also included a 2x2 graphics output of the boxplots for each of the predictor variables.
with(df,
{
stem(X1, 2)
stem(X2, 4)
stem(X3, 4)
stem(X4)
cor(df[1:4])
})
##
## The decimal point is at the |
##
## 2 | 6
## 3 | 244
## 3 | 67779
## 4 | 3
## 4 | 588
## 5 | 0112222344
## 5 | 678888888
## 6 | 003344
## 6 | 5556777
## 7 | 344
## 7 | 78
## 8 |
## 8 | 788
## 9 |
## 9 |
## 10 |
## 10 |
## 11 | 2
##
##
## The decimal point is at the |
##
## 8 | 0
## 10 |
## 12 |
## 14 |
## 16 |
## 18 |
## 20 |
## 22 |
## 24 |
## 26 | 0
## 28 | 0
## 30 |
## 32 |
## 34 |
## 36 |
## 38 | 0
## 40 | 0
## 42 |
## 44 | 0
## 46 | 0
## 48 | 0
## 50 | 000
## 52 | 000
## 54 | 0
## 56 | 000
## 58 | 00000
## 60 | 00
## 62 | 00
## 64 | 00
## 66 | 000
## 68 | 00
## 70 |
## 72 | 000
## 74 | 00
## 76 | 00000
## 78 | 0
## 80 |
## 82 | 000
## 84 | 000
## 86 | 00
## 88 |
## 90 |
## 92 |
## 94 |
## 96 | 0
##
##
## The decimal point is at the |
##
## 22 | 0
## 24 |
## 26 |
## 28 | 0
## 30 |
## 32 |
## 34 |
## 36 |
## 38 |
## 40 | 000
## 42 | 0
## 44 |
## 46 | 0
## 48 |
## 50 |
## 52 | 0
## 54 |
## 56 | 0
## 58 | 0
## 60 |
## 62 | 0
## 64 | 0
## 66 | 00
## 68 | 00
## 70 | 0
## 72 | 00000
## 74 | 0
## 76 | 0000
## 78 |
## 80 | 00
## 82 | 00
## 84 | 00
## 86 | 0000
## 88 | 000
## 90 | 0
## 92 | 000
## 94 | 0
## 96 |
## 98 | 00
## 100 | 00
## 102 | 0
## 104 |
## 106 | 0
## 108 |
## 110 |
## 112 |
## 114 | 00
## 116 |
## 118 | 0
##
##
## The decimal point is at the |
##
## 0 | 7
## 1 | 12345678899
## 2 | 0011234455556666679999
## 3 | 000012344556
## 4 | 001136
## 5 | 6
## 6 | 4
## X1 X2 X3 X4
## X1 1.00000 0.09012 -0.14963 0.5024
## X2 0.09012 1.00000 -0.02361 0.3690
## X3 -0.14963 -0.02361 1.00000 0.4164
## X4 0.50242 0.36903 0.41642 1.0000
pairs(df[1:4], labels = data.names[1:4])
with(df,
{
par(mfrow = c(2, 2))
boxplot(X1, main = "Bloodclot")
boxplot(X2, main = "Progindex")
boxplot(X3, main = "Enzyme" )
boxplot(X4, main = "Liver" )
})
par(mfrow = c(2, 2))
fit <- lm(Y ~ X1 + X2 + X3 + X4, df)
plot(fitted(fit), resid(fit), xlab = "Predicted value", ylab = "Residual")
title("(a) Residual Plot for Y")
abline(0, 0)
qqnorm(resid(fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(fit))
title("(b) Normal Plot for Y")
fit <- lm(lnY ~ X1 + X2 + X3 + X4, df)
plot(fitted(fit), resid(fit), xlab = "Predicted value", ylab = "Residual")
title("(c) Residual Plot for lnY")
abline(0, 0)
qqnorm(resid(fit), xlab = "Expected Value", ylab = "Residual", main = "")
qqline(resid(fit))
title("(d) Normal Prot for lnY")
cor(model.frame(fit))
## lnY X1 X2 X3 X4
## lnY 1.0000 0.24619 0.46994 0.65389 0.6493
## X1 0.2462 1.00000 0.09012 -0.14963 0.5024
## X2 0.4699 0.09012 1.00000 -0.02361 0.3690
## X3 0.6539 -0.14963 -0.02361 1.00000 0.4164
## X4 0.6493 0.50242 0.36903 0.41642 1.0000
pairs(model.frame(fit), labels = data.names[c(10, 1:4)], pch = 18, col = "grey50")
R provides many ways to access the relevant selection criteria values. There is also the regsubsets function in the “leaps” package, which calculates most of these values for all the subsets you specify. However, if you want to obtain the values for every combination of predictors yourself, you could use an algorithm similar to the following:
models <- list()
for (i in seq(p)) models[[i]] <- t(combn(1:p, i))  # all size-i predictor subsets
models <- lapply(models, function(set)
    apply(set, 1, function(row) lm(y ~ ., data[row])))  # fit every subset model
unlist(lapply(models, function(set) lapply(set, sumfun)))  # one criterion value per model
The requirements are that p gives the number of predictors to combine, the data is arranged so that its first columns are the predictors, y names the response variable (use transform on the full model's frame), and sumfun names your own or some other defined function that operates on lm objects. In this case, there are ready-made functions for AIC (stats), PRESS (MPV), and others. Since the models object is a list of lists covering the possible combinations of p predictors, unlist returns the raw criterion values as a single vector. You can then cbind them into a new data frame to summarize the results. Note: for BIC results, set the AIC k value to log(n).
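To make these requirements concrete, here is a minimal runnable sketch on the surgical data, assuming the df loaded above, the first four predictors, lnY as the response, and AIC as sumfun; the size bookkeeping in the final cbind is my own addition:
p <- 4
data <- df[1:4]   # predictors occupy the first columns
y <- df$lnY       # response; found by the y ~ . formulas through scoping
sumfun <- AIC     # or function(m) AIC(m, k = log(nrow(df))) for BIC
models <- list()
for (i in seq(p)) models[[i]] <- t(combn(1:p, i))
models <- lapply(models, function(set)
    apply(set, 1, function(row) lm(y ~ ., data[row])))
crit <- unlist(lapply(models, function(set) lapply(set, sumfun)))
cbind.data.frame(p = rep(seq(p), choose(p, seq(p))), AIC = crit)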
Since regsubsets provides most of the criteria in TABLE 9.2, I will simply make use of that output. I will also demonstrate some plotting with regsubsets. Because this procedure will be repeated a few times, I will write a “wrapper” function to generate the table of interest.
The plot method for regsubsets produces a grid of squares, where a shaded square means that variable is included in the model. See the help page for plot.regsubsets for details. This approach can be useful when the corresponding table is large.
library(leaps)
best <- function(model, ...)
{
subsets <- regsubsets(formula(model), model.frame(model), ...)
subsets <- with(summary(subsets),
cbind(p = as.numeric(rownames(which)), which, rss, rsq, adjr2, cp, bic))
return(subsets)
}
round(best(fit, nbest = 4), 4)
## p (Intercept) X1 X2 X3 X4 rss rsq adjr2 cp bic
## 1 1 1 0 0 1 0 7.332 0.4276 0.4166 66.49 -22.146
## 1 1 1 0 0 0 1 7.409 0.4215 0.4104 67.71 -21.581
## 1 1 1 0 1 0 0 9.979 0.2208 0.2059 108.56 -5.498
## 1 1 1 1 0 0 0 12.031 0.0606 0.0425 141.16 4.602
## 2 2 1 0 1 1 0 4.312 0.6633 0.6501 20.52 -46.814
## 2 2 1 0 0 1 1 5.130 0.5995 0.5838 33.50 -37.443
## 2 2 1 1 0 1 0 5.781 0.5486 0.5309 43.85 -30.989
## 2 2 1 0 1 0 1 6.622 0.4830 0.4627 57.21 -23.654
## 3 3 1 1 1 1 0 3.108 0.7573 0.7427 3.39 -60.502
## 3 3 1 0 1 1 1 3.614 0.7178 0.7009 11.42 -52.365
## 3 3 1 1 0 1 1 4.968 0.6121 0.5889 32.93 -35.186
## 3 3 1 1 1 0 1 6.570 0.4870 0.4562 58.39 -20.089
## 4 4 1 1 1 1 1 3.084 0.7592 0.7396 5.00 -56.942
subsets <- regsubsets(formula(fit), model.frame(fit), nbest = 4)
plot(subsets)
plot(subsets, scale = "Cp")
plot(subsets, scale = "adjr2")
plot(subsets, scale = "r2")
In the plotting code below, the range 1:4 is hard-coded into the lines calls. This could be automated with something like seq(unique(x[, 'p'])); see the sketch after the second set of panels below.
x <- best(fit, nbest = 6)
par(mfrow = c(2, 2), pch = 19)
plot(rsq ~ p, x, xlab = "(a)", ylab = "Rsq", col = "gray50")
lines(1:4, tapply(x[, "rsq"], x[, "p"], max), lwd = 2)
plot(adjr2 ~ p, x, xlab = "(b)", ylab = "Adj Rsq", col = "gray50")
lines(1:4, tapply(x[, "adjr2"], x[, "p"], max), lwd = 2)
plot(cp ~ p, x, xlab = "(c)", ylab = "Cp", col = "gray50")
lines(1:4, tapply(x[, "cp"], x[, "p"], min), lwd = 2)
plot(bic ~ p, x, xlab = "(d)", ylab = "BIC", col = "gray50")
lines(1:4, tapply(x[, "bic"], x[, "p"], min), lwd = 2)
fit <- lm(lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, df)
x <- best(fit, nbest = 20)
par(mfrow = c(2, 2), pch = 19)
plot(rsq ~ p, x, xlab = "(a)", ylab = "Rsq", col = "gray50")
lines(1:8, tapply(x[, "rsq"], x[, "p"], max), lwd = 2)
plot(adjr2 ~ p, x, xlab = "(b)", ylab = "Adj Rsq", col = "gray50")
lines(1:8, tapply(x[, "adjr2"], x[, "p"], max), lwd = 2)
plot(cp ~ p, x, xlab = "(c)", ylab = "Cp", col = "gray50")
lines(1:8, tapply(x[, "cp"], x[, "p"], min), lwd = 2)
plot(bic ~ p, x, xlab = "(d)", ylab = "BIC", col = "gray50")
lines(1:8, tapply(x[, "bic"], x[, "p"], min), lwd = 2)
round(best(fit, nbest = 2), 4) # Fundamentally same info as in TABLE 9.3
## p (Intercept) X1 X2 X3 X4 X5 X6 X7 X8 rss rsq adjr2 cp bic
## 1 1 1 0 0 1 0 0 0 0 0 7.332 0.4276 0.4166 117.409 -22.15
## 1 1 1 0 0 0 1 0 0 0 0 7.409 0.4215 0.4104 119.171 -21.58
## 2 2 1 0 1 1 0 0 0 0 0 4.312 0.6633 0.6501 50.472 -46.81
## 2 2 1 0 0 1 1 0 0 0 0 5.130 0.5995 0.5838 69.132 -37.44
## 3 3 1 0 1 1 0 0 0 0 1 2.843 0.7780 0.7647 18.915 -65.33
## 3 3 1 1 1 1 0 0 0 0 0 3.108 0.7573 0.7427 24.980 -60.50
## 4 4 1 1 1 1 0 0 0 0 1 2.179 0.8299 0.8160 5.751 -75.70
## 4 4 1 0 1 1 1 0 0 0 1 2.377 0.8144 0.7993 10.267 -71.01
## 5 5 1 1 1 1 0 0 1 0 1 2.082 0.8374 0.8205 5.541 -74.17
## 5 5 1 1 1 1 0 1 0 0 1 2.103 0.8358 0.8187 6.018 -73.63
## 6 6 1 1 1 1 0 1 1 0 1 2.005 0.8434 0.8234 5.787 -72.21
## 6 6 1 1 1 1 0 0 1 1 1 2.060 0.8392 0.8187 7.029 -70.76
## 7 7 1 1 1 1 0 1 1 1 1 1.972 0.8460 0.8226 7.029 -69.12
## 7 7 1 1 1 1 1 1 1 0 1 2.003 0.8436 0.8198 7.735 -68.28
## 8 8 1 1 1 1 1 1 1 1 1 1.971 0.8461 0.8188 9.000 -65.17
R comes with a number of stepwise functions. The most basic of these is step (stats). There is also stepAIC (MASS), which handles a wider range of object classes, and stepwise (Rcmdr), which appears to offer an easier interface for using BIC instead of AIC. As above, you need to set the k parameter appropriately to use BIC in your stepwise regression. Additionally, there is mle.stepwise (wle). We will focus on the functions that ship with R here.
If you want to run the process yourself, the intuitive functions add1 and drop1 provide an easy way to do so. They report RSS and AIC for making comparisons at each step. For repeated use, the x parameter can be passed a model matrix, but no checks are done on its validity. Read their help files for details.
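As a minimal sketch of one manual pass in each direction (assuming the df loaded above):
null <- lm(lnY ~ 1, df)   # intercept-only starting point
add1(null, scope = ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8)  # candidate additions
full <- lm(lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, df)
drop1(full)   # candidate deletions from the full model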
BIC <- log(nrow(model.frame(fit)))  # k = log(n) makes step use the BIC penalty
fit <- lm(lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, df) # Full Model
step(fit, direction = "forward") # AIC = -160.77 Drop None
## Start: AIC=-160.8
## lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8
##
## Call:
## lm(formula = lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = df)
##
## Coefficients:
## (Intercept) X1 X2 X3 X4
## 4.05052 0.06851 0.01345 0.01495 0.00802
## X5 X6 X7 X8
## -0.00357 0.08421 0.05786 0.38838
step(fit, dir = "forward", k = BIC) # BIC = -142.87 Drop None
## Start: AIC=-142.9
## lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8
##
## Call:
## lm(formula = lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8, data = df)
##
## Coefficients:
## (Intercept) X1 X2 X3 X4
## 4.05052 0.06851 0.01345 0.01495 0.00802
## X5 X6 X7 X8
## -0.00357 0.08421 0.05786 0.38838
step(fit, dir = "backward") # AIC = -163.83 Drop X4 and X7
## Start: AIC=-160.8
## lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8
##
## Df Sum of Sq RSS AIC
## - X4 1 0.001 1.97 -163
## - X7 1 0.032 2.00 -162
## - X5 1 0.074 2.04 -161
## <none> 1.97 -161
## - X6 1 0.084 2.05 -160
## - X1 1 0.318 2.29 -155
## - X8 1 0.846 2.82 -144
## - X2 1 2.090 4.06 -124
## - X3 1 2.991 4.96 -113
##
## Step: AIC=-162.7
## lnY ~ X1 + X2 + X3 + X5 + X6 + X7 + X8
##
## Df Sum of Sq RSS AIC
## - X7 1 0.03 2.01 -163.8
## <none> 1.97 -162.7
## - X5 1 0.09 2.06 -162.4
## - X6 1 0.10 2.07 -162.1
## - X1 1 0.63 2.60 -149.8
## - X8 1 0.84 2.82 -145.5
## - X2 1 2.67 4.65 -118.5
## - X3 1 5.10 7.07 -95.8
##
## Step: AIC=-163.8
## lnY ~ X1 + X2 + X3 + X5 + X6 + X8
##
## Df Sum of Sq RSS AIC
## <none> 2.01 -163.8
## - X5 1 0.08 2.08 -163.8
## - X6 1 0.10 2.10 -163.3
## - X1 1 0.63 2.63 -151.1
## - X8 1 0.90 2.91 -145.8
## - X2 1 2.76 4.77 -119.1
## - X3 1 5.08 7.09 -97.7
##
## Call:
## lm(formula = lnY ~ X1 + X2 + X3 + X5 + X6 + X8, data = df)
##
## Coefficients:
## (Intercept) X1 X2 X3 X5
## 4.05397 0.07152 0.01376 0.01512 -0.00345
## X6 X8
## 0.08732 0.35090
step(fit, dir = "backward", k = BIC) # BIC = -153.41 Drop X4 through X7 -- This model matches the book's search
## Start: AIC=-142.9
## lnY ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8
##
## Df Sum of Sq RSS AIC
## - X4 1 0.001 1.97 -147
## - X7 1 0.032 2.00 -146
## - X5 1 0.074 2.04 -145
## - X6 1 0.084 2.05 -145
## <none> 1.97 -143
## - X1 1 0.318 2.29 -139
## - X8 1 0.846 2.82 -128
## - X2 1 2.090 4.06 -108
## - X3 1 2.991 4.96 -97
##
## Step: AIC=-146.8
## lnY ~ X1 + X2 + X3 + X5 + X6 + X7 + X8
##
## Df Sum of Sq RSS AIC
## - X7 1 0.03 2.01 -149.9
## - X5 1 0.09 2.06 -148.5
## - X6 1 0.10 2.07 -148.2
## <none> 1.97 -146.8
## - X1 1 0.63 2.60 -135.9
## - X8 1 0.84 2.82 -131.6
## - X2 1 2.67 4.65 -104.5
## - X3 1 5.10 7.07 -81.9
##
## Step: AIC=-149.9
## lnY ~ X1 + X2 + X3 + X5 + X6 + X8
##
## Df Sum of Sq RSS AIC
## - X5 1 0.08 2.08 -151.9
## - X6 1 0.10 2.10 -151.3
## <none> 2.01 -149.9
## - X1 1 0.63 2.63 -139.2
## - X8 1 0.90 2.91 -133.9
## - X2 1 2.76 4.77 -107.1
## - X3 1 5.08 7.09 -85.7
##
## Step: AIC=-151.9
## lnY ~ X1 + X2 + X3 + X6 + X8
##
## Df Sum of Sq RSS AIC
## - X6 1 0.10 2.18 -153.4
## <none> 2.08 -151.9
## - X1 1 0.62 2.71 -141.7
## - X8 1 0.97 3.06 -135.1
## - X2 1 2.83 4.91 -109.5
## - X3 1 5.08 7.16 -89.2
##
## Step: AIC=-153.4
## lnY ~ X1 + X2 + X3 + X8
##
## Df Sum of Sq RSS AIC
## <none> 2.18 -153.4
## - X1 1 0.66 2.84 -143.0
## - X8 1 0.93 3.11 -138.2
## - X2 1 2.99 5.17 -110.8
## - X3 1 5.45 7.63 -89.7
##
## Call:
## lm(formula = lnY ~ X1 + X2 + X3 + X8, data = df)
##
## Coefficients:
## (Intercept) X1 X2 X3 X8
## 3.8524 0.0733 0.0142 0.0155 0.3530
# Import the training and validation data sets, respectively
df <- do.call("rbind.data.frame",
list(get("CH09TA01", envir = env), get("CH09TA05", envir = env)))
names(df) <- c(paste("X", seq(8), sep = ""), "Y", "lnY")
# Add a factor indicating which data set each observation belongs to
df <- transform(df, class = gl(2, 54, labels = c("training", "validation")))
{
print(head(df))
cat("... \n")
print(tail(df))
}
## X1 X2 X3 X4 X5 X6 X7 X8 Y lnY class
## 1 6.7 62 81 2.59 50 0 1 0 695 6.544 training
## 2 5.1 59 66 1.70 39 0 0 0 403 5.999 training
## 3 7.4 57 83 2.16 55 0 0 0 710 6.565 training
## 4 6.5 73 41 2.01 48 0 0 0 349 5.854 training
## 5 7.8 65 115 4.30 45 0 0 1 2343 7.759 training
## 6 5.8 38 72 1.42 65 1 1 0 348 5.852 training
## ...
## X1 X2 X3 X4 X5 X6 X7 X8 Y lnY class
## 103 10.4 62 85 4.65 50 0 1 0 1041 6.948 validation
## 104 5.8 70 64 2.52 49 0 1 0 589 6.378 validation
## 105 5.4 64 81 1.36 62 0 1 0 599 6.395 validation
## 106 6.9 90 33 2.78 48 1 0 0 655 6.485 validation
## 107 7.9 45 55 2.46 43 0 1 0 377 5.932 validation
## 108 4.5 68 60 2.07 59 0 0 0 642 6.465 validation
Instead of a table, this process creates a list object containing the results for each of the model fits on each of the data sets.
library(MPV) # For PRESS wrapper function
newsummary <- function(model)
{
list('coefs' = round(t(summary(model)$coef[, 1:2]), 4),
'criteria' = cbind('SSE' = anova(model)["Residuals", "Sum Sq"],
'PRESS' = PRESS(model),
'MSE' = anova(model)["Residuals", "Mean Sq"],
'Rsq' = summary(model)$adj.r.squared))
}
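As an aside, PRESS does not require a package: it is the sum of squared deleted residuals, which for lm fits can be computed directly from the hat values. A minimal sketch, equivalent to MPV's PRESS:
press <- function(model) sum((resid(model) / (1 - hatvalues(model)))^2)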
newsummary(lm(lnY ~ X1 + X2 + X3 + X8, df, class == "training"))
## $coefs
## (Intercept) X1 X2 X3 X8
## Estimate 3.8524 0.0733 0.0142 0.0155 0.3530
## Std. Error 0.1927 0.0190 0.0017 0.0014 0.0772
##
## $criteria
## SSE PRESS MSE Rsq
## [1,] 2.179 2.738 0.04447 0.816
newsummary(lm(lnY ~ X1 + X2 + X3 + X8, df, class == "validation"))
## $coefs
## (Intercept) X1 X2 X3 X8
## Estimate 3.6350 0.0958 0.0164 0.0156 0.1860
## Std. Error 0.2894 0.0319 0.0023 0.0020 0.0964
##
## $criteria
## SSE PRESS MSE Rsq
## [1,] 3.795 4.522 0.07745 0.6824
newsummary(lm(lnY ~ X1 + X2 + X3 + X6 + X8, df, class == "training"))
## $coefs
## (Intercept) X1 X2 X3 X6 X8
## Estimate 3.8671 0.0712 0.0139 0.0151 0.0869 0.3627
## Std. Error 0.1906 0.0188 0.0017 0.0014 0.0582 0.0765
##
## $criteria
## SSE PRESS MSE Rsq
## [1,] 2.082 2.783 0.04338 0.8205
newsummary(lm(lnY ~ X1 + X2 + X3 + X6 + X8, df, class == "validation"))
## $coefs
## (Intercept) X1 X2 X3 X6 X8
## Estimate 3.6143 0.0999 0.0159 0.0154 0.0731 0.1886
## Std. Error 0.2907 0.0323 0.0024 0.0020 0.0792 0.0966
##
## $criteria
## SSE PRESS MSE Rsq
## [1,] 3.729 4.654 0.07768 0.6815
newsummary(lm(lnY ~ X1 + X2 + X3 + X5 + X6 + X8, df, class == "training"))
## $coefs
## (Intercept) X1 X2 X3 X5 X6 X8
## Estimate 4.0540 0.0715 0.0138 0.0151 -0.0035 0.0873 0.3509
## Std. Error 0.2348 0.0186 0.0017 0.0014 0.0026 0.0577 0.0764
##
## $criteria
## SSE PRESS MSE Rsq
## [1,] 2.005 2.772 0.04266 0.8234
newsummary(lm(lnY ~ X1 + X2 + X3 + X5 + X6 + X8, df, class == "validation"))
## $coefs
## (Intercept) X1 X2 X3 X5 X6 X8
## Estimate 3.4699 0.0987 0.0162 0.0156 0.0025 0.0727 0.1931
## Std. Error 0.3468 0.0325 0.0024 0.0021 0.0033 0.0795 0.0972
##
## $criteria
## SSE PRESS MSE Rsq
## [1,] 3.682 4.898 0.07834 0.6787