We explore the use of the validation set approach in order to estimate the test error rates that result from fitting various linear models on the Auto data set.
library(ISLR)
set.seed(1)
train = sample(392, 196)
lm.fit = lm(mpg ~ horsepower, data = Auto, subset = train)
We now use the predict() function to estimate the response for all 392 observations, and we use the mean() function to calculate the MSE of the 196 observations in the validation set. Note that the -train index below selects only the observations that are not in the training set.
attach(Auto)
# MSE
mean((mpg - predict(lm.fit, Auto))[-train]^2)
## [1] 26.14
# 2nd order model
lm.fit2 = lm(mpg ~ poly(horsepower, 2), data = Auto, subset = train)
mean((mpg - predict(lm.fit2, Auto))[-train]^2)
## [1] 19.82
# 3rd order model
lm.fit3 = lm(mpg ~ poly(horsepower, 3), data = Auto, subset = train)
mean((mpg - predict(lm.fit3, Auto))[-train]^2)
## [1] 19.78
# Load the boot library to use cv.glm function
library(boot)
glm.fit = glm(mpg ~ horsepower, data = Auto)
cv.err = cv.glm(Auto, glm.fit)
cv.err$delta
## [1] 24.23 24.23
cv.error = rep(0, 5)
for (i in 1:5) {
glm.fit = glm(mpg ~ poly(horsepower, i), data = Auto)
cv.error[i] = cv.glm(Auto, glm.fit)$delta[1]
}
cv.error
## [1] 24.23 19.25 19.33 19.42 19.03
set.seed(17)
cv.error.10 = rep(0, 10)
for (i in 1:10) {
glm.fit = glm(mpg ~ poly(horsepower, i), data = Auto)
cv.error.10[i] = cv.glm(Auto, glm.fit, K = 10)$delta[1]
}
cv.error.10
## [1] 24.21 19.19 19.31 19.34 18.88 19.02 18.90 19.71 18.95 19.50
One of the great advantages of the bootstrap approach is that it can be applied in almost all situations. No complicated mathematical calculations are required. Performing a bootstrap analysis in R entails only two steps. First, we must create a function that computes the statistic of interest. Second, we use the boot() function, which is part of the boot library, to perform the bootstrap by repeatedly sampling observations from the data set with replacement.
# A function to estimate alpha
alpha.fn = function(data, index) {
X = data$X[index]
Y = data$Y[index]
return((var(Y) - cov(X, Y))/(var(X) + var(Y) - 2 * cov(X, Y)))
}
set.seed(1)
alpha.fn(Portfolio, sample(100, 100, replace = T))
## [1] 0.5964
# OR produce R = 1, 000 bootstrap estimates for alpha
boot(Portfolio, alpha.fn, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Portfolio, statistic = alpha.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.5758 -7.315e-05 0.08862
# boot.fn
boot.fn = function(data, index) return(coef(lm(mpg ~ horsepower, data = data,
subset = index)))
# Compute the standard error
boot(Auto, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 39.9359 0.0126153 0.87127
## t2* -0.1578 -0.0002692 0.00754
# Compare to the linear model using lm
summary(lm(mpg ~ horsepower, data = Auto))$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.9359 0.717499 55.66 1.220e-187
## horsepower -0.1578 0.006446 -24.49 7.032e-81
# 2nd order model
boot.fn = function(data, index) coefficients(lm(mpg ~ horsepower + I(horsepower^2),
data = data, subset = index))
set.seed(1)
boot(Auto, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Auto, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 56.900100 6.098e-03 2.0944856
## t2* -0.466190 -1.777e-04 0.0334124
## t3* 0.001231 1.324e-06 0.0001208