6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

fit=lm(wage~poly(age, 5), data=Wage)
coef(summary(fit))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01
fit.1 = lm(wage~age, data=Wage)
fit.2 = lm(wage~poly(age, 2), data=Wage)
fit.3 = lm(wage~poly(age, 3), data=Wage)
fit.4 = lm(wage~poly(age, 4), data=Wage)
fit.5 = lm(wage~poly(age, 5), data=Wage)

anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
attach(Wage)
agelims=range(Wage$age)
age.grid=seq(from=agelims[1],to=agelims[2])

preds=predict(fit,newdata=list(age=age.grid),se=TRUE)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)


plot(age,wage,xlim=agelims,cex=.5,col="darkgrey")
title("Degree 5 Polynomial")
lines(age.grid,preds$fit,lwd=2,col="red")
matlines(age.grid,se.bands,lwd =1,col="blue",lty =3)

detach(Wage)

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.

library(boot)
cv.error.10 = rep(NA,9)
for (i in 2:10) {
  Wage$age.cut = cut(Wage$age,i)
  step.fit=glm(wage~age.cut,data=Wage)
  cv.error.10[i-1]=cv.glm(Wage,step.fit,K=10)$delta[1] # [1]: Std [2]: Bias corrected.
}
which.min(cv.error.10)+1# index starts at 2 so add 1 to get count of true cuts
## [1] 8
cv.error.10
## [1] 1734.648 1683.086 1636.543 1630.488 1623.839 1611.798 1600.291 1608.664
## [9] 1603.674
step.fit = glm(wage~cut(age,8), data=Wage)
preds2=predict(step.fit,newdata=list(age=age.grid), se=T)
se.bands2=cbind(preds2$fit+2*preds2$se.fit,preds2$fit-2*preds2$se.fit)
plot(Wage$age,Wage$wage,xlim=agelims,cex=.5,col="darkgrey")
title("8 cut step Function")
lines(age.grid,preds2$fit,lwd=2,col="red")
matlines(age.grid,se.bands2,lwd =1,col="blue",lty =3)

10. This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

train = sample(length(College$Outstate), length(College$Outstate)/2)
College.train = College[train, ]
College.test = College[-train, ]
regfit.fwd = regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")
fit.summary = summary(regfit.fwd)
which.min(fit.summary$cp) 
## [1] 11
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College.train, nvmax = 17, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  "*"    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  "*"    "*"    " "       " "       " "        
## 10  ( 1 ) "*"        "*"  "*"    "*"    " "       " "       " "        
## 11  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 12  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 13  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 14  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       " "        
## 15  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 6  ( 1 )  " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 )  " "         "*"        " "   " "      " " "*"      "*"      
## 8  ( 1 )  " "         "*"        " "   " "      " " "*"      "*"      
## 9  ( 1 )  " "         "*"        " "   " "      " " "*"      "*"      
## 10  ( 1 ) " "         "*"        " "   " "      " " "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   " "      " " "*"      "*"      
## 12  ( 1 ) " "         "*"        " "   "*"      " " "*"      "*"      
## 13  ( 1 ) "*"         "*"        " "   "*"      " " "*"      "*"      
## 14  ( 1 ) "*"         "*"        "*"   "*"      " " "*"      "*"      
## 15  ( 1 ) "*"         "*"        "*"   "*"      " " "*"      "*"      
## 16  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"

Model with 13 variables is best fitted model.

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gam.fit = gam(Outstate~Private+
               s(Room.Board,4)+
               s(PhD,4)+
               s(perc.alumni,2)+
               s(Expend,4)+
               s(Grad.Rate,5)+
               s(Apps,4)+
               s(Accept,4)+
               s(Enroll,2)+
               s(Top10perc,4)+
               s(F.Undergrad,5)+
               s(Personal,2)+
               s(S.F.Ratio,4)
              
              ,data=College.train)
par(mfrow=c(4,3))
plot(gam.fit, col="blue", se=T)

(c) Evaluate the model obtained on the test set, and explain the results obtained.

gam.pred = predict(gam.fit, College.test)
gam.err = mean((College.test$Outstate - gam.pred)^2)
gam.err
## [1] 4105545

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam.fit)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 4) + s(PhD, 
##     4) + s(perc.alumni, 2) + s(Expend, 4) + s(Grad.Rate, 5) + 
##     s(Apps, 4) + s(Accept, 4) + s(Enroll, 2) + s(Top10perc, 4) + 
##     s(F.Undergrad, 5) + s(Personal, 2) + s(S.F.Ratio, 4), data = College.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -5507.53 -1101.28    31.83  1007.49  7333.42 
## 
## (Dispersion Parameter for gaussian family taken to be 3174103)
## 
##     Null Deviance: 6364591940 on 387 degrees of freedom
## Residual Deviance: 1085540457 on 341.9991 degrees of freedom
## AIC: 6954.701 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private             1 1686196996 1686196996 531.2357 < 2.2e-16 ***
## s(Room.Board, 4)    1 1343943478 1343943478 423.4089 < 2.2e-16 ***
## s(PhD, 4)           1  417275830  417275830 131.4626 < 2.2e-16 ***
## s(perc.alumni, 2)   1  273988960  273988960  86.3201 < 2.2e-16 ***
## s(Expend, 4)        1  420642744  420642744 132.5233 < 2.2e-16 ***
## s(Grad.Rate, 5)     1   27908264   27908264   8.7925 0.0032372 ** 
## s(Apps, 4)          1    2612154    2612154   0.8230 0.3649554    
## s(Accept, 4)        1   13407989   13407989   4.2242 0.0406108 *  
## s(Enroll, 2)        1   43495086   43495086  13.7031 0.0002495 ***
## s(Top10perc, 4)     1   21635330   21635330   6.8162 0.0094310 ** 
## s(F.Undergrad, 5)   1     773504     773504   0.2437 0.6218689    
## s(Personal, 2)      1    5424145    5424145   1.7089 0.1920096    
## s(S.F.Ratio, 4)     1    2091687    2091687   0.6590 0.4174830    
## Residuals         342 1085540457    3174103                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Room.Board, 4)        3  1.2595 0.2881914    
## s(PhD, 4)               3  2.1636 0.0920847 .  
## s(perc.alumni, 2)       1  0.8996 0.3435640    
## s(Expend, 4)            3 14.0742 1.135e-08 ***
## s(Grad.Rate, 5)         4  1.8965 0.1106130    
## s(Apps, 4)              3  2.5769 0.0536815 .  
## s(Accept, 4)            3  5.7835 0.0007237 ***
## s(Enroll, 2)            1  0.2299 0.6319111    
## s(Top10perc, 4)         3  3.5244 0.0152543 *  
## s(F.Undergrad, 5)       4  3.4834 0.0083270 ** 
## s(Personal, 2)          1  3.2295 0.0731986 .  
## s(S.F.Ratio, 4)         3  2.4202 0.0659357 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Under Anova for Parametric Effects these variables are non significant: Apps, Top10perc, F.Undergrad, S.F.Ratio