Chapter 7 (page 297): Exercises 6 and 10

6) In this exercise, you will further analyze the Wage data set considered throughout this chapter.

library(ISLR)
library(boot)
library(leaps)
library(gam)
attach(Wage)
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
set.seed(1)

# 10-fold cross-validation error for polynomial degrees 1 through 6
crossValError = rep(NA, 6)
for (i in 1:6) {
  poly.fit = glm(wage~poly(age, i), data=Wage)
  crossValError[i] = cv.glm(Wage, poly.fit, K=10)$delta[1]
}
crossValError
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061
plot(crossValError, type='o', col = 'blue')

which.min(crossValError)
## [1] 5

The degree-5 polynomial has the lowest cross-validation error, but the plot shows that the difference between degrees 4 and 5 is negligible. We therefore use a degree-4 polynomial.
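To make "negligible" concrete, the relative improvement from degree 4 to degree 5 can be computed directly from the errors above; it is only about 0.04%.

(crossValError[4] - crossValError[5]) / crossValError[4]  # roughly 0.0004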

fit.4=lm(wage~poly(age,4),data=Wage)
coef(summary(fit.4))
##                 Estimate Std. Error    t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287409 153.283015 0.000000e+00
## poly(age, 4)1  447.06785 39.9147851  11.200558 1.484604e-28
## poly(age, 4)2 -478.31581 39.9147851 -11.983424 2.355831e-32
## poly(age, 4)3  125.52169 39.9147851   3.144742 1.678622e-03
## poly(age, 4)4  -77.91118 39.9147851  -1.951938 5.103865e-02
fit.1=lm(wage~age,data=Wage)
fit.2=lm(wage~poly(age,2),data=Wage)
fit.3=lm(wage~poly(age,3),data=Wage)
fit.4=lm(wage~poly(age,4),data=Wage)
fit.5=lm(wage~poly(age,5),data=Wage)
fit.6=lm(wage~poly(age,6),data=Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6636 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8936  0.001675 ** 
## 4   2995 4771604  1      6070   3.8117  0.050989 .  
## 5   2994 4770322  1      1283   0.8054  0.369565    
## 6   2993 4766389  1      3932   2.4692  0.116201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on the ANOVA table, the cubic term is clearly significant (p = 0.0017) and the quartic term is borderline (p = 0.051), while the degree-5 and degree-6 terms add nothing. A cubic or quartic polynomial is therefore a reasonable choice, which agrees with the cross-validation results.
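As a side check: because poly() builds orthogonal polynomials, squaring the t-statistics from the degree-4 fit approximately reproduces the F-statistics in the ANOVA table, e.g. (-1.951938)^2 = 3.81, close to the F value of 3.8117 for model 4 (they differ slightly because the ANOVA uses the residual variance of the largest model as its error estimate).

coef(summary(fit.4))[, "t value"]^2  # compare with the F column in the ANOVA table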

agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
lm.fit = lm(wage~poly(age, 4), data=Wage)
lm.pred = predict(lm.fit, newdata=data.frame(age=age.grid))
plot(Wage$age, Wage$wage, xlim=agelims, cex=.5, col="darkgrey")
lines(age.grid, lm.pred, col="blue", lwd=2)
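As an optional sketch, pointwise ±2 standard-error bands can be added in the same way as for the step function in part (b) below; lm.pred.se and poly.bands are names introduced here for illustration.

lm.pred.se = predict(lm.fit, newdata=data.frame(age=age.grid), se=TRUE)
poly.bands = lm.pred.se$fit + cbind(2*lm.pred.se$se.fit, -2*lm.pred.se$se.fit)
matlines(age.grid, poly.bands, lwd=1, col="blue", lty=3)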

(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
step.mse = rep(NA, 9)
for (i in 2:10) {
  # Build a data frame whose 'age' column is the factor cut(age, i),
  # so that cv.glm can refit the step function on each fold
  fit.step = model.frame(wage~cut(age, i), data=Wage)
  names(fit.step) = c('wage', 'age')

  step.fit = glm(wage~age, data=fit.step)
  step.mse[i-1] = cv.glm(fit.step, step.fit, K=10)$delta[1]
}

plot(2:10, step.mse, xlab='Number of Cuts', ylab='Cross-Validation Error', type='l', col='blue')

which.min(step.mse)
## [1] 7

Since the loop starts at i = 2, index 7 of step.mse corresponds to 8 cuts, which is the number used below.

agelims = range(Wage$age)
age.grid = seq(from=agelims[1], to=agelims[2])
fit.step = glm(wage~cut(age,8), data=Wage)
pred.step = predict(fit.step, newdata=list(age=age.grid), se=TRUE)
se.bands = pred.step$fit + cbind(2*pred.step$se.fit, -2*pred.step$se.fit)
plot(Wage$age,Wage$wage, xlim=agelims, cex=0.5, col="darkgrey")
lines(age.grid, pred.step$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

10) This question relates to the College data set.

attach(College)
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(1)
train=sample(1:nrow(College), nrow(College)*.75)
train.college=College[train,]
test=(-train)
test.college=College[test,]
outstate.test=College$Outstate[test]
regfit.best=regsubsets(Outstate~.,data=College[train,],nvmax=17)
test.mat=model.matrix(Outstate~.,data=College[test,])
val.errors=rep(NA,17)
for(i in 1:17){
   coefi=coef(regfit.best,id=i)
   pred=test.mat[,names(coefi)]%*%coefi
   val.errors[i]=mean((College$Outstate[test]-pred)^2)
}
val.errors
##  [1] 7369252 6221067 4950665 3993584 3757305 3529917 3572471 3655013 3419373
## [10] 3257888 3356971 3322114 3301218 3276352 3296237 3313388 3303868
which.min(val.errors)
## [1] 10
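As an aside, the manual model.matrix multiplication in the loop above can be wrapped in a reusable predict method for regsubsets objects, along the lines of the ISLR Chapter 6 lab; this is just a convenience sketch.

predict.regsubsets = function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])  # recover the formula from the fitted object
  mat = model.matrix(form, newdata)    # build the design matrix for the new data
  coefi = coef(object, id=id)          # coefficients of the id-predictor model
  mat[, names(coefi)] %*% coefi
}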
summary(regfit.best)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College[train, ], nvmax = 17)
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: exhaustive
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    "*"       " "       " "        
## 9  ( 1 )  "*"        "*"  "*"    " "    "*"       " "       "*"        
## 10  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 11  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 12  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      " " "*"      " "      
## 9  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 10  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
## 11  ( 1 ) " "         "*"        " "   "*"      " " "*"      " "      
## 12  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  "*"         " "    " "      
## 3  ( 1 )  "*"         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    " "      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"

The model with 10 predictors attains the lowest validation-set MSE. It predicts Outstate using Private, Apps, Accept, Top10perc, F.Undergrad, Room.Board, Terminal, perc.alumni, Expend, and Grad.Rate.
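The coefficient estimates for this 10-variable model can be extracted directly (output omitted here):

coef(regfit.best, id=10)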

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
gam.fit = gam(Outstate~Private+s(Apps,4)+s(Accept,4)+s(Top10perc,4)+s(F.Undergrad,4)+s(Room.Board,4)+s(Terminal,4)+s(perc.alumni,4)+s(Expend,4)+s(Grad.Rate,4), data = College[train,])
plot.Gam(gam.fit, se=TRUE, col="red")

Most of the predictors appear to have a fairly linear relationship with Outstate; Expend is the most clearly non-linear. Still, there is some curvature in the plots for several of the other variables, so a GAM may provide a better fit than a linear model.

(c) Evaluate the model obtained on the test set, and explain the results obtained.
pred.gam = predict(gam.fit, test.college)
MSE.gam = mean((outstate.test - pred.gam)^2)
MSE.gam
## [1] 2984282
val.errors[10]
## [1] 3257888
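The relative improvement of the GAM over the linear model can be computed directly from these two numbers:

(val.errors[10] - MSE.gam) / val.errors[10]  # about 0.084, i.e. an 8.4% reduction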

The GAM achieves a test MSE of 2984282, compared with 3257888 for the best linear model from part (a), a reduction of roughly 8%. Based on this, the GAM appears to be the better fit.

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

Expend shows the clearest evidence of a non-linear relationship with Outstate. Top10perc and Room.Board may also have slightly non-linear relationships with the response.
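This can be checked more formally: calling summary() on the fitted GAM prints an "Anova for Nonparametric Effects" table, where a small p-value for a smooth term indicates evidence of non-linearity (output omitted here).

summary(gam.fit)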