Problem 6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

library(ISLR)
library(boot)
attach(Wage)
set.seed(15)
cv.error <- rep(0,5)

for (i in 1:5){
glm.fit <- glm(wage ~ poly(age,i),data=Wage)
cv.error[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error
## [1] 1676.192 1599.786 1596.256 1593.755 1595.106
plot(cv.error, type="b", xlab="Degree", ylab="Test MSE")
points(which.min(cv.error), cv.error[4], col="red", pch=20, cex=2)

The optimal degree for the polynomial chosen by cross-validation is 4.
Comparing the results of hypothesis testing using ANOVA.

fit_1 = lm(wage ~ age, data=Wage)
fit_2 = lm(wage ~ poly(age, 2), data=Wage) 
fit_3 = lm(wage ~ poly(age, 3), data=Wage) 
fit_4 = lm(wage ~ poly(age, 4), data=Wage) 
fit_5 = lm(wage ~ poly(age, 5), data=Wage) 
anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value indicates that either a cubic or quadratic fit provide a reasonable fit to the data.

age_lim = range(age)
age_grid = seq(from=age_lim[1], to=age_lim[2])
preds = predict(fit_4, newdata=list(age=age_grid),se=TRUE)
se_bands = cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)

plot(age, wage, xlim=age_lim, cex=.5, col="darkgrey")
lines(age_grid, preds$fit, lwd=2, col="blue")
matlines(age_grid, se_bands, lwd=1, col="red", lty=3)

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.

set.seed(2)

cv.errors = rep(NA, 10)

for(i in 2:10){
  Wage$age.cut = cut(Wage$age,i)
  glm.fit = glm(wage ~ age.cut, data=Wage)
  cv.errors[i] = cv.glm(Wage, glm.fit, K=10)$delta[1]
}

cv.errors
##  [1]       NA 1734.448 1681.623 1635.869 1632.864 1621.643 1613.148 1601.850
##  [9] 1610.852 1603.607
plot(2:10, cv.errors[-1], type="b", xlab="Number of cuts", ylab="Test MSE")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col="red", pch=20, cex=2)

The optimal number of cuts is 8.

fit_step = glm(wage ~ cut(age, 8), data=Wage)
preds = predict(fit_step, data.frame(age = age_grid))
plot(age, wage, col="darkgray")
lines(age_grid, preds, col="darkred", lwd=2)

detach(Wage)

Problem 10. This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

attach(College)
set.seed(5)
test_sample = sample(1:nrow(College), nrow(College)/4)
train = College[-test_sample, ]
test = College[test_sample, ]
library(leaps)
fwd = regsubsets(Outstate ~ ., data=train, nvmax=17, method='forward')
fwd_sum = summary(fwd)
par(mfrow=c(2,2))
plot(fwd_sum$cp ,xlab="Number of Variables ", ylab="Cp",
type="b")
points(which.min(fwd_sum$cp), fwd_sum$cp[which.min(fwd_sum$cp)], col="red", cex=2, pch=20)

plot(fwd_sum$bic ,xlab="Number of Variables ", 
ylab="BIC",type="b")
points(which.min(fwd_sum$bic), fwd_sum$bic[which.min(fwd_sum$bic)], col="red", cex=2, pch=20)

plot(fwd_sum$adjr2 ,xlab="Number of Variables ", 
ylab="Adjusted R^2^",type="b")
points(which.max(fwd_sum$adjr2), fwd_sum$adjr2[which.max(fwd_sum$adjr2)], col="red", cex=2, pch=20)

which.min(fwd_sum$cp)
## [1] 13
which.min(fwd_sum$bic)
## [1] 11
which.max(fwd_sum$adjr2)
## [1] 14
test_matrix = model.matrix(Outstate~., data=test)

val.errors = rep(NA,17)
for(i in 1:17){
 coefi = coef(fwd,id=i)
 pred = test_matrix[,names(coefi)]%*%coefi
 val.errors[i] = mean((test$Outstate-pred)^2) 
}

which.min(val.errors)
## [1] 6
plot(val.errors, type='b')
points(which.min(val.errors), val.errors[6], col='red', pch=20, cex=2)

The best model selected using the forward stepwise method is the one that has 6 variables. To get the accurate estimate of the variable and coefficients, need to perform foward subset selection on the entire data set.

fwd_full = regsubsets(Outstate ~ ., data=College, nvmax=17, method='forward')
coef(fwd_full, 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3553.2345268  2768.6347025     0.9679086    35.5283359    48.4221031 
##        Expend     Grad.Rate 
##     0.2210255    29.7119093

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

library(gam)
library(splines)
gam_fit = gam(Outstate ~ Private + s(Room.Board, 3) + s(Terminal, 3) + s(perc.alumni, 3) + s(Expend, 3) + s(Grad.Rate, 3), data=train)

par(mfrow=c(2,3))
plot(gam_fit, se=TRUE, col="blue")

(c) Evaluate the model obtained on the test set, and explain the results obtained.

preds = predict(gam_fit, newdata = test)
error = mean((test$Outstate-preds)^2)
val.errors[6]-error
## [1] 388542.3

With 6 variables GAM model does better than simple linear variable.
(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam_fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 3) + s(Terminal, 
##     3) + s(perc.alumni, 3) + s(Expend, 3) + s(Grad.Rate, 3), 
##     data = train)
## Deviance Residuals:
##       Min        1Q    Median        3Q       Max 
## -7067.838 -1138.361     4.964  1256.279  8163.609 
## 
## (Dispersion Parameter for gaussian family taken to be 3646032)
## 
##     Null Deviance: 9466747723 on 582 degrees of freedom
## Residual Deviance: 2063655284 on 566.0003 degrees of freedom
## AIC: 10481.86 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 2545651515 2545651515 698.198 < 2.2e-16 ***
## s(Room.Board, 3)    1 1826513428 1826513428 500.959 < 2.2e-16 ***
## s(Terminal, 3)      1  507722090  507722090 139.253 < 2.2e-16 ***
## s(perc.alumni, 3)   1  341093757  341093757  93.552 < 2.2e-16 ***
## s(Expend, 3)        1  802720318  802720318 220.163 < 2.2e-16 ***
## s(Grad.Rate, 3)     1  141392931  141392931  38.780 9.253e-10 ***
## Residuals         566 2063655284    3646032                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df Npar F   Pr(F)    
## (Intercept)                                 
## Private                                     
## s(Room.Board, 3)        2  2.506 0.08252 .  
## s(Terminal, 3)          2  1.620 0.19880    
## s(perc.alumni, 3)       2  0.997 0.36953    
## s(Expend, 3)            2 45.483 < 2e-16 ***
## s(Grad.Rate, 3)         2  3.191 0.04188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The model suggests a strong non-linear relationship between “Outstate” and “Expend”, and a moderate non-linear relationship between “Outstate” and “Grad.rate”