Question 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

library(ISLR2)
attach(College) # College is used in Question 10 below
library(leaps)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(boot)
set.seed(1)
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
attach(Wage)
set.seed(42)
cv.error.5 <- rep(0, 5)
for (i in 1:5) {
  lm.fit <- glm(wage ~ poly(age, i, raw = TRUE), data = Wage)
  cv.error.5[i] <- cv.glm(Wage, lm.fit, K = 10)$delta[1] 
}
cv.error.5
## [1] 1676.334 1601.952 1597.313 1594.688 1595.061

Comparing the ten-fold CV errors across degrees 1 through 5, the minimum (1594.688) occurs at degree d = 4, with degree 5 essentially tied.
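
We can confirm the choice programmatically (a minimal check using the cv.error.5 vector computed above; which.min() returns the index of the smallest error, which here coincides with the polynomial degree):

which.min(cv.error.5) # index = degree, since degrees 1:5 fill positions 1:5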

fit.1 <- lm(wage ~ age, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage) 
fit.3 <- lm(wage ~ poly(age, 3), data = Wage) 
fit.4 <- lm(wage ~ poly(age, 4), data = Wage) 
fit.5 <- lm(wage ~ poly(age, 5), data = Wage) 
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA comparisons tell a consistent story: the quadratic and cubic terms are clearly significant, the quartic term is borderline (p ≈ 0.051), and the quintic term adds nothing (p ≈ 0.37). Hypothesis testing therefore points to a degree of 3 or 4, consistent with the cross-validation choice of d = 4. (Note that the CV loop used raw polynomials while these fits use orthogonal ones; the two parameterizations give identical fitted values, so the comparison is fair.)
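
As a quick sanity check on that last point (a sketch, reusing fit.4 from above; the raw and orthogonal bases yield different coefficients but the same fitted values):

# raw vs. orthogonal polynomial basis: identical fitted values
all.equal(fitted(lm(wage ~ poly(age, 4, raw = TRUE), data = Wage)), fitted(fit.4))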

coef(summary(fit.4)) 
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit.4, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Degree-4 Polynomial", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
Wage_clean <- na.omit(Wage)
set.seed(42)
cv.error.10 <- rep(NA, 9) # one entry for each of 2 to 10 cuts
for (i in 2:10) {
  Wage_clean$age.cut <- cut(Wage_clean$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage_clean)
  cv.error.10[i - 1] <- cv.glm(Wage_clean, glm.fit, K = 10)$delta[1]
}
cv.error.10
## [1] 1733.565 1684.082 1637.465 1632.337 1623.836 1610.572 1602.163 1612.770
## [9] 1606.141

Entry i of cv.error.10 holds the error for i + 1 cuts, so the minimum error (1602.163, entry 7) is actually obtained at 8 cuts, with 7 cuts (1610.572) a close second; see the check below. Since the error curve is essentially flat from 7 cuts onward, I fit the slightly simpler 7-cut step function.
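
A minimal check of that indexing (using the cv.error.10 vector from above):

which.min(cv.error.10) + 1 # add 1 because entry 1 corresponds to 2 cuts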

fit.7 <- lm(wage ~ cut(age, 7), data = Wage)
coef(summary(fit.7)) 
##                        Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)            80.06402   2.405222 33.287586 5.517532e-207
## cut(age, 7)(26.9,35.7] 24.51568   2.892501  8.475600  3.617008e-17
## cut(age, 7)(35.7,44.6] 38.59253   2.792477 13.820180  3.695152e-42
## cut(age, 7)(44.6,53.4] 38.05482   2.812402 13.531077  1.556835e-40
## cut(age, 7)(53.4,62.3] 39.03969   3.082094 12.666611  7.411125e-36
## cut(age, 7)(62.3,71.1] 31.03930   4.884196  6.355048  2.400708e-10
## cut(age, 7)(71.1,80.1] 15.99445   9.075719  1.762334  7.811488e-02
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit.7, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey")
title("7-Cut", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

Question 10

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

set.seed(1)
train <- sample(length(Outstate), length(Outstate) / 2)
test <- -train

college_train <- College[train, ]
college_test <- College[test, ]

# perform forward stepwise selection on the training set
fit <- regsubsets(Outstate ~ ., data = college_train, nvmax = 17, method = "forward")
fit_summary <- summary(fit)

par(mfrow = c(1, 3))
plot(fit_summary$cp, xlab = "Number of variables", ylab = "Mallow's Cp", type = "l")
min_cp <- min(fit_summary$cp)
std_cp <- sd(fit_summary$cp)
abline(h = min_cp + 0.2 * std_cp, col = "red", lty = 2)
abline(h = min_cp - 0.2 * std_cp, col = "red", lty = 2)

plot(fit_summary$bic, xlab = "Number of variables", ylab = "BIC", type = "l")
min_bic <- min(fit_summary$bic)
std_bic <- sd(fit_summary$bic)
abline(h = min_bic + 0.2 * std_bic, col = "red", lty = 2)
abline(h = min_bic - 0.2 * std_bic, col = "red", lty = 2)

plot(fit_summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R-Squared", type = "l")
# adjusted R-squared is maximized, not minimized, so band around the maximum
max_adjr2 <- max(fit_summary$adjr2)
std_adjr2 <- sd(fit_summary$adjr2)
abline(h = max_adjr2 + 0.2 * std_adjr2, col = "red", lty = 2)
abline(h = max_adjr2 - 0.2 * std_adjr2, col = "red", lty = 2)

# coefficients of the 6-variable model chosen on the training set
coeffs <- coef(fit, id = 6)
names(coeffs)
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "Terminal"    "perc.alumni"
## [6] "Expend"      "Grad.Rate"
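
The plots and the chosen size can be cross-checked directly (a minimal check using fit_summary from above; the three criteria need not agree on a single size, which is why the ±0.2 sd bands are drawn):

# model size favored by each criterion
which.min(fit_summary$cp)
which.min(fit_summary$bic)
which.max(fit_summary$adjr2)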

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

Note that forward selection chose Terminal, while the GAM below uses PhD instead; the two are closely correlated measures of faculty credentials, so the substitution has little practical effect.

# small basis dimensions (k) keep each smooth simple; mgcv raises any k that is
# below the minimum it needs, which produces the warnings below
gam1 <- gam(Outstate ~ Private + s(Room.Board, k = 2) + s(PhD, k = 2) +
              s(perc.alumni, k = 2) + s(Expend, k = 5) + s(Grad.Rate, k = 2),
            data = college_train)
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
## Warning in smooth.construct.tp.smooth.spec(object, dk$data, dk$knots): basis dimension, k, increased to minimum possible
par(mfrow=c(2,3))
plot(gam1, se=T, col="lightblue")
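
The partial-effect plots show roughly linear, increasing relationships for Room.Board, perc.alumni, and Grad.Rate, an essentially flat effect for PhD, and pronounced curvature for Expend, where tuition climbs steeply with expenditure before levelling off; this matches the effective degrees of freedom reported in the summary in part (d).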

(c) Evaluate the model obtained on the test set, and explain the results obtained.

set.seed(1)
lm_fit <- lm(Outstate ~ Private + Room.Board + Terminal + perc.alumni + Expend + Grad.Rate,
             data = college_train)

lm_preds <- predict(lm_fit, college_test)
gam_preds <- predict(gam1, college_test)

# test RMSE for each model
err_lm <- sqrt(mean((college_test$Outstate - lm_preds)^2))
err_gam <- sqrt(mean((college_test$Outstate - gam_preds)^2))

err_lm
## [1] 1960.831
err_gam 
## [1] 1840.276
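
The GAM attains a test RMSE of about 1840 versus 1961 for the linear model, a reduction of roughly 6%. Allowing non-linear effects, chiefly for Expend, therefore gives a modest but real improvement in out-of-sample accuracy.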

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam1)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Outstate ~ Private + s(Room.Board, k = 2) + s(PhD, k = 2) + s(perc.alumni, 
##     k = 2) + s(Expend, k = 5) + s(Grad.Rate, k = 2)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8791.7      228.7  38.436  < 2e-16 ***
## PrivateYes    2336.7      284.5   8.213 3.46e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                  edf Ref.df     F  p-value    
## s(Room.Board)  1.375  1.604 34.60  < 2e-16 ***
## s(PhD)         1.000  1.000  1.82    0.178    
## s(perc.alumni) 1.000  1.000 21.00 6.69e-06 ***
## s(Expend)      3.564  3.892 37.24  < 2e-16 ***
## s(Grad.Rate)   1.000  1.000 22.63 3.02e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.788   Deviance explained = 79.3%
## GCV = 3.9264e+06  Scale est. = 3.8258e+06  n = 388
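
From the approximate significance table, s(Expend) has edf ≈ 3.56 with a highly significant p-value, which is strong evidence of a non-linear relationship with out-of-state tuition. s(Room.Board) shows mild curvature (edf ≈ 1.4), while perc.alumni, Grad.Rate, and PhD all have edf = 1, i.e. effectively linear fits (and PhD is not significant at all).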