In this exercise, you will further analyze the Wage data set considered throughout this chapter.
library(ISLR2)
attach(College)
library(leaps)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(boot)
set.seed(1)
attach(Wage)
set.seed(42)
# 10-fold CV error for polynomial fits of degree 1 through 5
cv.error.5 <- rep(0, 5)
for (i in 1:5) {
  lm.fit <- glm(wage ~ poly(age, i, raw = TRUE), data = Wage)
  cv.error.5[i] <- cv.glm(Wage, lm.fit, K = 10)$delta[1]
}
cv.error.5
## [1] 1676.334 1601.952 1597.313 1594.688 1595.061
Looking at the cross-validation errors for the five models, the minimum occurs at degree d = 4, although the errors for degrees 3 through 5 are very close.
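The chosen degree can also be read off programmatically; a small sketch, assuming the cv.error.5 vector computed above:
# degree with the smallest 10-fold CV error (degree 4 for the errors printed above)
which.min(cv.error.5)
# visualize how the CV error changes with the polynomial degree
plot(1:5, cv.error.5, type = "b", xlab = "Degree", ylab = "10-fold CV error")
We can also compare the candidate degrees more formally with ANOVA: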
fit.1 <- lm(wage ~ age, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA results are consistent with this choice: the quadratic and cubic terms are highly significant, the quartic term is borderline (p ≈ 0.051), and the fifth-degree term is not significant (p ≈ 0.37). A degree of 3 or 4 is therefore reasonable, in line with the cross-validation choice of d = 4. Below are the coefficients of the degree-4 fit.
coef(summary(fit.4))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 111.70361 0.7287409 153.283015 0.000000e+00
## poly(age, 4)1 447.06785 39.9147851 11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3 125.52169 39.9147851 3.144742 1.678622e-03
## poly(age, 4)4 -77.91118 39.9147851 -1.951938 5.103865e-02
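As a consistency check: because poly() uses orthogonal polynomials, the squared t-statistic of the quartic coefficient in fit.4 equals the F-statistic for the fit.3 versus fit.4 comparison in the ANOVA table (about 3.81). A one-line sketch, assuming fit.4 from above:
# squared t-statistic of the degree-4 term; matches the ANOVA F-value of ~3.81
coef(summary(fit.4))["poly(age, 4)4", "t value"]^2
Next we plot the degree-4 fit with approximate 95% confidence bands.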
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit.4, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Degree-4 Polynomial", outer = TRUE)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)
Wage_clean <- na.omit(Wage)
set.seed(42)
# 10-fold CV error for step functions with 2 through 10 intervals
cv.error.10 <- rep(NA, 9)
for (i in 2:10) {
  Wage_clean$age.cut <- cut(Wage_clean$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage_clean)
  cv.error.10[i - 1] <- cv.glm(Wage_clean, glm.fit, K = 10)$delta[1]
}
cv.error.10
## [1] 1733.565 1684.082 1637.465 1632.337 1623.836 1610.572 1602.163 1612.770
## [9] 1606.141
Looking at the cross-validation results for the step function, the smallest error is the seventh entry of cv.error.10, which corresponds to 8 intervals since the loop starts at 2 intervals; 7 intervals performs almost as well, and we fit a 7-interval step function below.
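A small sketch, assuming the cv.error.10 vector from above, of recovering the best interval count directly:
# entry i of cv.error.10 corresponds to cut(age, i + 1), so add 1 to get the interval count
which.min(cv.error.10) + 1
plot(2:10, cv.error.10, type = "b", xlab = "Number of intervals", ylab = "10-fold CV error")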
fit.7 <- lm(wage ~ cut(age, 7), data = Wage)
coef(summary(fit.7))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.06402 2.405222 33.287586 5.517532e-207
## cut(age, 7)(26.9,35.7] 24.51568 2.892501 8.475600 3.617008e-17
## cut(age, 7)(35.7,44.6] 38.59253 2.792477 13.820180 3.695152e-42
## cut(age, 7)(44.6,53.4] 38.05482 2.812402 13.531077 1.556835e-40
## cut(age, 7)(53.4,62.3] 39.03969 3.082094 12.666611 7.411125e-36
## cut(age, 7)(62.3,71.1] 31.03930 4.884196 6.355048 2.400708e-10
## cut(age, 7)(71.1,80.1] 15.99445 9.075719 1.762334 7.811488e-02
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit.7, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("7-Cut", outer = TRUE)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)
This question relates to the College data set.

### (a)

Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(1)
train <- sample(length(Outstate), length(Outstate) / 2)
test <- -train
college_train <- College[train, ]
college_test <- College[test, ]
# perform forward stepwise selection on the training set
fit <- regsubsets(Outstate ~ ., data = college_train, nvmax = 17, method = "forward")
fit_summary <- summary(fit)
par(mfrow = c(1, 3))
plot(fit_summary$cp, xlab = "Number of variables", ylab = "Mallow's Cp", type = "l")
min_cp <- min(fit_summary$cp)
std_cp <- sd(fit_summary$cp)
abline(h = min_cp + 0.2 * std_cp, col = "red", lty = 2)
abline(h = min_cp - 0.2 * std_cp, col = "red", lty = 2)
plot(fit_summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
min_bic <- min(fit_summary$bic)
std_bic <- sd(fit_summary$bic)
abline(h = min_bic + 0.2 * std_bic, col = "red", lty = 2)
abline(h = min_bic - 0.2 * std_bic, col = "red", lty = 2)
plot(fit_summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R-Squared", type = "l")
# adjusted R-squared is maximized, so the reference band is drawn around its maximum
max_adjr2 <- max(fit_summary$adjr2)
std_adjr2 <- sd(fit_summary$adjr2)
abline(h = max_adjr2 + 0.2 * std_adjr2, col = "red", lty = 2)
abline(h = max_adjr2 - 0.2 * std_adjr2, col = "red", lty = 2)
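The model sizes favoured by each criterion can also be read off directly; a minimal sketch, assuming the fit_summary object from above:
which.min(fit_summary$cp)     # number of variables minimizing Mallow's Cp
which.min(fit_summary$bic)    # number of variables minimizing BIC
which.max(fit_summary$adjr2)  # number of variables maximizing adjusted R-squared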
# coefficients of the six-variable model chosen by forward selection on the training set
coeffs <- coef(fit, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "Terminal" "perc.alumni"
## [6] "Expend" "Grad.Rate"
### (b)

Next we fit a GAM on the training data, using out-of-state tuition as the response and the selected features as predictors (PhD is used here in place of Terminal).

gam1 <- gam(Outstate ~ Private + s(Room.Board, k = 2) + s(PhD, k = 2) + s(perc.alumni, k = 2) + s(Expend, k = 5) + s(Grad.Rate, k = 2), data = college_train)
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
par(mfrow=c(2,3))
plot(gam1, se=T, col="lightblue")
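The warnings during fitting appear because k = 2 is below the minimum basis dimension for a thin-plate regression spline, so mgcv raises it automatically. The adequacy of the basis dimensions can be inspected with gam.check(); a minimal sketch, assuming the gam1 fit from above:
# residual diagnostics plus a k-index test of whether each smooth's basis dimension is adequate
par(mfrow = c(2, 2))
gam.check(gam1)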
### (c)

We evaluate the GAM on the test set, comparing its RMSE against a linear model fit to the forward-selection variables.

set.seed(1)
lm_fit <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate, data = college_train)
lm_preds <- predict(lm_fit, college_test)
gam_preds <- predict(gam1, college_test)
# test-set RMSE for each model
err_lm <- sqrt(mean((college_test$Outstate - lm_preds)^2))
err_gam <- sqrt(mean((college_test$Outstate - gam_preds)^2))
err_lm
## [1] 1960.831
err_gam
## [1] 1840.276
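The GAM achieves a lower test RMSE (about 1840) than the linear model (about 1961). As an additional summary, the test-set R-squared of each model can be computed from these errors; a small sketch, assuming the objects above:
# fraction of test-set variance explained by each model
sst <- mean((college_test$Outstate - mean(college_test$Outstate))^2)
1 - err_lm^2 / sst   # linear model
1 - err_gam^2 / sst  # GAM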
### (d)

Finally, we examine the GAM summary to see which variables show evidence of a non-linear relationship with the response.

summary(gam1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Outstate ~ Private + s(Room.Board, k = 2) + s(PhD, k = 2) + s(perc.alumni,
## k = 2) + s(Expend, k = 5) + s(Grad.Rate, k = 2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8791.7 228.7 38.436 < 2e-16 ***
## PrivateYes 2336.7 284.5 8.213 3.46e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Room.Board) 1.375 1.604 34.60 < 2e-16 ***
## s(PhD) 1.000 1.000 1.82 0.178
## s(perc.alumni) 1.000 1.000 21.00 6.69e-06 ***
## s(Expend) 3.564 3.892 37.24 < 2e-16 ***
## s(Grad.Rate) 1.000 1.000 22.63 3.02e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.788 Deviance explained = 79.3%
## GCV = 3.9264e+06 Scale est. = 3.8258e+06 n = 388
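From the summary, s(Expend) has an effective degrees of freedom well above 1 (about 3.6), which is evidence of a non-linear relationship between instructional expenditure and out-of-state tuition; the remaining smooth terms have edf at or near 1 and are essentially linear. The smooth-term table can also be extracted programmatically; a small sketch, assuming gam1 from above:
# edf, reference df, F, and p-value for each smooth term; edf near 1 suggests an essentially linear effect
summary(gam1)$s.table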