library(ISLR2)
library(MASS)
library(class)
library(caret)
library(tidyverse)
library(boot)
library(leaps)
library(gam)

6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

attach(Wage)

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

fit=lm(wage~poly(age,4, raw=TRUE),data=Wage)
coef(summary(fit))
##                                Estimate   Std. Error   t value     Pr(>|t|)
## (Intercept)               -1.841542e+02 6.004038e+01 -3.067172 0.0021802539
## poly(age, 4, raw = TRUE)1  2.124552e+01 5.886748e+00  3.609042 0.0003123618
## poly(age, 4, raw = TRUE)2 -5.638593e-01 2.061083e-01 -2.735743 0.0062606446
## poly(age, 4, raw = TRUE)3  6.810688e-03 3.065931e-03  2.221409 0.0263977518
## poly(age, 4, raw = TRUE)4 -3.203830e-05 1.641359e-05 -1.951938 0.0510386498

Using raw=TRUE lets us see the raw (untransformed) coefficients of the degree-4 polynomial, rather than the orthogonal-polynomial parameterization that poly() uses by default.
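
The parameterization changes only the coefficients, not the fitted curve; a quick check (a minimal sketch reusing the fit above):

# Raw and orthogonal polynomials yield identical fitted values;
# only the coefficient parameterization differs
fit.orth=lm(wage~poly(age,4),data=Wage)
all.equal(as.numeric(fitted(fit)), as.numeric(fitted(fit.orth)))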

agelims=range(age)
range(age)
## [1] 18 80
age.grid=seq(from=agelims[1], to=agelims[2])
preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit, preds$fit-2*preds$se.fit)
plot(age,wage,xlim=agelims, cex=.5, col='darkgrey')
title('Degree-4 Polynomial')
lines(age.grid,preds$fit,lwd=2,col='darkblue')
matlines(age.grid,se.bands,lwd=1,col='lightblue', lty=3)

fit.1=lm(wage~age, data=Wage)
fit.2=lm(wage~poly(age, 2), data=Wage)
fit.3=lm(wage~poly(age, 3), data=Wage)
fit.4=lm(wage~poly(age, 4), data=Wage)
fit.5=lm(wage~poly(age, 5), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The first p-value (< 2.2e-16) rejects the null hypothesis that the linear model is sufficient, so the quadratic term is needed. The cubic term is also significant (p = 0.0017). The quartic term is borderline, with a p-value almost exactly 0.05, while the quintic term is clearly not significant (p = 0.37). A degree-3 or degree-4 polynomial therefore appears sufficient; we proceed with degree 4.

coef(summary(fit.5))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01

Using coef() on the degree-5 fit, we can confirm the choice of a degree-4 polynomial: because poly() generates orthogonal polynomials, the t-test p-values here match the ANOVA p-values exactly.
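
In fact, for these nested orthogonal-polynomial models each ANOVA F-statistic is the square of the corresponding t-statistic; for the quartic term:

# The squared t-statistic reproduces the ANOVA F value for the quartic term
(-1.9518743)^2  # ~ 3.8098, matching the F column above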

Although the results should be very similar, I will now redo the selection using cv.glm from the boot package to compute the cross-validation error, as the problem requests. Rather than fitting each degree individually and saving separate objects, a for loop checks them all in sequence.

set.seed(1)
poly.mse=c()
for(degree in 1:7){
  poly.fit=glm(wage~poly(age,degree,raw=TRUE),data=Wage)
  # cv.glm's first argument is the data frame, the second the fitted glm
  mse=cv.glm(data=Wage, glmfit=poly.fit, K=10)$delta[1]
  poly.mse=c(poly.mse,mse)
}

plot(poly.mse,xlab='Degree of Polynomial',ylab='Cross Validation Error',type='l')

The cross-validation error essentially levels off after degree 4, which agrees with the ANOVA and t-test results above.
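
We can also mark the degree with the lowest estimated test MSE directly on the plot (a small addition to the code above):

# Highlight the degree minimizing the estimated CV error
d.min=which.min(poly.mse)
points(d.min, poly.mse[d.min], pch=20, cex=2, col='red')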

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

all.cvs = rep(NA, 10)
for (i in 2:10) {
  Wage$age.cut = cut(Wage$age, i)
  lm.fit = glm(wage~age.cut, data=Wage)
  # delta[2] is the bias-corrected CV estimate (part (a) used the raw delta[1])
  all.cvs[i] = cv.glm(Wage, lm.fit, K=10)$delta[2]
}
plot(2:10, all.cvs[-1], xlab="Number of cuts", ylab="CV error", type="l", pch=20, lwd=2)

x=which.min(all.cvs)
points(x,all.cvs[x],pch=20,cex=2,col='red')

Checking 2 through 10 cuts, we can see that the minimum CV error occurs at 8 cuts.

lm.fit = glm(wage~cut(age, 8), data=Wage)
agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.pred = predict(lm.fit, data.frame(age=age.grid))
plot(wage~age, data=Wage, col="darkgrey")
lines(age.grid, lm.pred, col="darkblue", lwd=2)

Finally, we retrain on the full data using the 8 cuts chosen above and plot the resulting step function.
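
If we want to see where the 8 intervals actually fall, we can inspect the factor levels produced by cut() (a quick sketch):

# Breakpoints of the 8 age intervals used by the step function
levels(cut(Wage$age, 8))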

10

This question relates to the College data set.

detach(Wage)
attach(College)

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

First we’ll split the data 70/30.

set.seed(1)
train=sample(1:dim(College)[1], round(dim(College)[1]*.7))
test=-train
College.train = College[train, ]
College.test = College[-train, ]
nrow(College.train) / nrow(College)
## [1] 0.7001287
nrow(College.test) / nrow(College)
## [1] 0.2998713

And now to perform forward stepwise selection of the predictors.

forward=regsubsets(Outstate~.,data=College.train,method='forward',nvmax=17)

plot(summary(forward)$rss/nrow(College.train),type='l',xlab='Number of Predictors',ylab='Train MSE',xaxt='n')
axis(side=1,at=seq(1,17,2),labels=seq(1,17,2))

Looking at the top 6 predictors:

which(summary(forward)$which[6,-1])
##  PrivateYes  Room.Board         PhD perc.alumni      Expend   Grad.Rate 
##           1           9          12          15          16          17

The right number of predictors appears to be around 6 or 7; after that the curve levels off, with only small additional gains between roughly 10 and 12 predictors. Other analyses of this problem place the absolute minimum at 14 to 17 predictors, which is consistent with the graph above, but the simplest model with training error near the bottom uses 6 predictors, so we proceed with those.
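
As a cross-check, we could compare model sizes with a penalized criterion instead of raw training error; summary() on a regsubsets object reports BIC directly, so a minimal sketch is:

# BIC penalizes extra predictors, unlike training RSS
fwd.summary=summary(forward)
which.min(fwd.summary$bic)
plot(fwd.summary$bic,type='b',xlab='Number of Predictors',ylab='BIC')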

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gam.fit=gam(Outstate~Private+s(Room.Board)+s(PhD)+s(perc.alumni)+s(Expend)+s(Grad.Rate),data=College[train,])

par(mfrow = c(2, 3))
plot(gam.fit,se=T,col='blue')

Using the gam package, we fit smoothing splines to each of the quantitative predictors; Private is qualitative, so it enters as a linear (dummy-variable) term. The partial plots suggest that out-of-state tuition rises with each predictor, with Expend in particular showing a markedly non-linear, leveling-off pattern.

(c) Evaluate the model obtained on the test set, and explain the results obtained.

gam.pred = predict(gam.fit, College.test)
gam.err = mean((College.test$Outstate - gam.pred)^2)
gam.err
## [1] 3170227
gam.tss = mean((College.test$Outstate - mean(College.test$Outstate))^2)
test.r2 = 1 - gam.err/gam.tss
test.r2
## [1] 0.7712156

For the GAM, the test MSE is 3,170,227 and the test \(R^{2}\) is 0.7712, so the model explains about 77% of the variance in out-of-state tuition on held-out data.
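
For context, we could compare against an ordinary least squares fit on the same six predictors (a sketch using the College.train/College.test split from above); the GAM would typically achieve a lower test MSE if the non-linear terms matter:

# Baseline: plain linear model on the same predictors
ols.fit=lm(Outstate~Private+Room.Board+PhD+perc.alumni+Expend+Grad.Rate, data=College.train)
ols.pred=predict(ols.fit, College.test)
mean((College.test$Outstate - ols.pred)^2)  # compare with gam.err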

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate), data = College[train, ])
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7511.91 -1172.59    38.71  1267.44  7827.85 
## 
## (Dispersion Parameter for gaussian family taken to be 3617918)
## 
##     Null Deviance: 9263945675 on 543 degrees of freedom
## Residual Deviance: 1888551843 on 521.9997 degrees of freedom
## AIC: 9782.515 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private          1 2402224634 2402224634 663.980 < 2.2e-16 ***
## s(Room.Board)    1 1721929895 1721929895 475.945 < 2.2e-16 ***
## s(PhD)           1  620590394  620590394 171.532 < 2.2e-16 ***
## s(perc.alumni)   1  456743095  456743095 126.245 < 2.2e-16 ***
## s(Expend)        1  746743434  746743434 206.401 < 2.2e-16 ***
## s(Grad.Rate)     1   99786036   99786036  27.581 2.197e-07 ***
## Residuals      522 1888551843    3617918                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df Npar F   Pr(F)    
## (Intercept)                              
## Private                                  
## s(Room.Board)        3  2.680 0.04629 *  
## s(PhD)               3  1.179 0.31718    
## s(perc.alumni)       3  0.937 0.42233    
## s(Expend)            3 32.503 < 2e-16 ***
## s(Grad.Rate)         3  1.995 0.11380    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA for Nonparametric Effects shows strong evidence of a non-linear relationship for Expend (p < 2e-16) and weaker evidence for Room.Board (p = 0.046). For PhD, perc.alumni, and Grad.Rate the non-parametric terms are not significant, so a linear effect appears sufficient for those variables; all five are highly significant in the ANOVA for Parametric Effects, meaning they matter, just not necessarily non-linearly.
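
We can double-check the Expend finding by refitting the GAM with a linear Expend term and comparing the two models with an F-test (a sketch; anova() on gam objects accepts test='F'):

# Refit with Expend entering linearly, then compare to the smooth fit
gam.lin=gam(Outstate~Private+s(Room.Board)+s(PhD)+s(perc.alumni)+Expend+s(Grad.Rate), data=College[train,])
anova(gam.lin, gam.fit, test='F')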

detach(College)