Question 6. In this exercise, you will further
analyze the Wage data set considered throughout this chapter.
library(ISLR2)
attach(Wage)
(a) Perform polynomial regression to predict wage
using age. Use cross-validation to select the optimal degree d for the
polynomial. What degree was chosen, and how does this compare to the
results of hypothesis testing using ANOVA? Make a plot of the resulting
polynomial fit to the data.
library(boot)
set.seed(15)
cv.error = rep(0, 5)
# 10-fold CV error for polynomial degrees 1 through 5
for (i in 1:5) {
  glm.fit = glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] = cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.error
[1] 1676.192 1599.786 1596.256 1593.755 1595.106
plot(cv.error, type = "b", xlab = "Degree", ylab = "Test MSE", lwd = 2)
points(which.min(cv.error), cv.error[which.min(cv.error)], col = "blue", pch = "X", cex = 2)

- Based on cross-validation, the optimal degree is 4. The ANOVA hypothesis tests lead to a similar conclusion: a degree-4 polynomial is sufficient, and higher-degree terms are not justified (see the sketch below).
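For reference, the ANOVA comparison can be reproduced by fitting nested polynomial models and testing them sequentially; a minimal sketch (the fit.1 through fit.5 names are just illustrative):
# Sequential F-tests: a small p-value for degree d means that degree d
# improves significantly on degree d - 1
fit.1 <- lm(wage ~ poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)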
age.range <- range(age)
age.grid <- seq(from = age.range[1], to = age.range[2])
# Fit the selected degree-4 polynomial and overlay it on the data
lm.fit <- lm(wage ~ poly(age, 4), data = Wage)
pred1 <- predict(lm.fit, newdata = list(age = age.grid))
plot(wage ~ age, data = Wage, xlim = age.range, cex = .5, col = "darkgrey", xlab = "Age", ylab = "Wage")
lines(age.grid, pred1, col = "blue", lwd = 2)
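Approximate 95% confidence bands can be overlaid by requesting standard errors from predict(); a small sketch (pred.se and se.bands are illustrative names):
# Pointwise fit +/- 2 standard errors along the age grid
pred.se <- predict(lm.fit, newdata = list(age = age.grid), se.fit = TRUE)
se.bands <- cbind(pred.se$fit + 2 * pred.se$se.fit,
                  pred.se$fit - 2 * pred.se$se.fit)
matlines(age.grid, se.bands, col = "blue", lwd = 1, lty = 3)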

(b) Fit a step function to predict wage using age,
and perform cross-validation to choose the optimal number of cuts. Make
a plot of the fit obtained.
set.seed(2)
cv.errors = rep(NA, 10)
# 10-fold CV error for step functions with 2 to 10 cuts
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.errors[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.errors
[1] NA 1734.448 1681.623 1635.869 1632.864 1621.643 1613.148 1601.850 1610.852 1603.607
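Plotting these errors makes the minimum easier to spot (a quick sketch):
# CV error versus number of cuts, with the minimum highlighted
plot(2:10, cv.errors[-1], type = "b", xlab = "Number of cuts", ylab = "Test MSE")
points(which.min(cv.errors), min(cv.errors, na.rm = TRUE), col = "blue", pch = "X", cex = 2)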
- The lowest cross-validation error occurs at 8 cuts, so 8 is the optimal number of cuts.
# Refit the 8-cut step function on the full data and overlay the fitted steps
plot(wage ~ age, data = Wage, col = "darkgrey", xlab = "Age", ylab = "Wage")
fit = glm(wage ~ cut(age, 8), data = Wage)
preds = predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "blue", lwd = 2)
Question 10. This question relates to the College
data set.
library(leaps)
attach(College)
(a) Split the data into a training set and a test
set. Using out-of-state tuition as the response and the other variables
as the predictors, perform forward stepwise selection on the training
set in order to identify a satisfactory model that uses just a subset of
the predictors.
set.seed(5)
# Randomly assign about 70% of the rows to the training set
train = sample(nrow(College), 0.7 * nrow(College))
train.college = College[train, ]
test.college = College[-train, ]
forward.subset = regsubsets(Outstate ~ ., data = train.college, nvmax = ncol(College)-1, method = "forward")
model.summary = summary(forward.subset)
# Plot a selection metric against subset size and mark the smallest subset
# whose metric falls within one standard error of the best value
plot.metric = function(metric, yaxis_label, reverse = FALSE) {
  plot(metric, xlab = "# of Variables", ylab = yaxis_label, xaxt = "n", type = "l")
  axis(side = 1, at = 1:length(metric))
  if (reverse) {
    # Metrics to maximize (e.g. Adjusted R2)
    metric_1se = max(metric) - (sd(metric) / sqrt(length(metric)))
    min_subset = which(metric > metric_1se)
  } else {
    # Metrics to minimize (e.g. Cp, BIC)
    metric_1se = min(metric) + (sd(metric) / sqrt(length(metric)))
    min_subset = which(metric < metric_1se)
  }
  abline(h = metric_1se, col = "red", lty = 2)
  abline(v = min_subset[1], col = "blue", lty = 2)
}
par(mfrow=c(1, 3))
plot.metric(model.summary$cp, "Cp")
plot.metric(model.summary$bic, "BIC")
plot.metric(model.summary$adjr2, "Adjusted R2", reverse = TRUE)
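As a sanity check, the subset sizes that optimize each criterion outright (before applying the one-standard-error cutoff) can be printed directly:
# Subset size with the best raw value of each criterion
which.min(model.summary$cp)
which.min(model.summary$bic)
which.max(model.summary$adjr2)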
- Based on the one-standard-error cutoff in the plots above, Cp and Adjusted R2 point to a 7-variable subset, while BIC points to 6 variables; we proceed with the more parsimonious 6-variable model.
coef(forward.subset, 6)
(b) Fit a GAM on the training data, using
out-of-state tuition as the response and the features selected in the
previous step as the predictors. Plot the results, and explain your
findings.
library(gam)
# GAM with a smoothing spline (df = 2) on each quantitative predictor
gam.model = gam(Outstate ~ Private + s(Room.Board, df=2) + s(perc.alumni, df=2) +
                  s(PhD, df=2) + s(Expend, df=2) + s(Grad.Rate, df=2), data=train.college)
par(mfrow=c(2, 3))
plot(gam.model, se=TRUE, col="blue")
(c) Evaluate the model obtained on the test set, and
explain the results obtained.
# Helper functions for test-set evaluation
calc_mse = function(y, y_hat) {
  return(mean((y - y_hat)^2))
}
calc_rmse = function(y, y_hat) {
  return(sqrt(calc_mse(y, y_hat)))
}
calc_r2 = function(y, y_hat) {
  y_bar = mean(y)
  rss = sum((y - y_hat)^2)
  tss = sum((y - y_bar)^2)
  return(1 - (rss / tss))
}
gam.pred = predict(gam.model, test.college)
gam.rmse = calc_rmse(test.college$Outstate, gam.pred)
cat("RMSE:", gam.rmse, "\n")
gam_r2 = calc_r2(test.college$Outstate, gam.pred)
cat(" R^2:", gam_r2, "\n")
- Using the GAM with these 6 predictors, we obtain a test RMSE of about 1984 and an R^2 of about 0.76, so the model explains roughly three quarters of the variance in out-of-state tuition on the held-out data.
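To put these numbers in context, the GAM could be compared against an ordinary least-squares fit with the same six predictors; a sketch (lm.baseline and lm.pred are illustrative names), where a noticeably lower GAM RMSE would suggest that the non-linear terms help:
# Linear baseline with the same predictors, evaluated on the test set
lm.baseline <- lm(Outstate ~ Private + Room.Board + perc.alumni + PhD + Expend + Grad.Rate,
                  data = train.college)
lm.pred <- predict(lm.baseline, test.college)
cat("Linear RMSE:", calc_rmse(test.college$Outstate, lm.pred), "\n")
cat("Linear R^2: ", calc_r2(test.college$Outstate, lm.pred), "\n")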
(d) For which variables, if any, is there evidence
of a non-linear relationship with the response?
summary(gam.model)
- The "Anova for Nonparametric Effects" table in the summary indicates
compelling evidence of a non-linear relationship between the response
variable and both Expend and PhD, and moderate evidence of a non-linear
relationship between the response variable and both Room.Board and
Grad.Rate.
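As a further check, a GAM that enters a given predictor linearly can be compared against the fitted model with an F-test; a sketch for Expend (gam.expend.lin is an illustrative name):
# Same GAM but with Expend entered linearly; compare to the smooth version
gam.expend.lin <- gam(Outstate ~ Private + s(Room.Board, df=2) + s(perc.alumni, df=2) +
                        s(PhD, df=2) + Expend + s(Grad.Rate, df=2), data = train.college)
anova(gam.expend.lin, gam.model, test = "F")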