library(tidyverse)
library(openintro)
library(ISLR)
library(boot)
library(leaps)
library(gam)

Exercise 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

#set.seed(1)
## crossval = rep(0, 10)
## for (i in 1:10) {
##  fit = glm(wage ~ poly(age, i), data = Wage)
##  crossval[i] = cv.glm(Wage, fit)$delta[1]
##}

##plot(1:degree, crossval, xlab = 'degree', ylab = 'Test MSE', type = 'l')
##deg.min = which.min(crossval)
##points(deg.min, crossval[deg.min], col = 'red', cex = 2, pch = 19)

note when knitting the code chunk would not load, so I commented out

This tells us that the minimum of test MSE is at the 9th degree, and the 4th degree is deemed small enough.

plot(wage ~ age, data = Wage, col = "grey")
agerange = range(Wage$age)
agegrid = seq(from = agerange[1], to = agerange[2])
fitlm = lm(wage ~ poly(age, 3), data = Wage)
predic = predict(fitlm, newdata = list(age = agegrid))
lines(agegrid, predic, col = "Blue", lwd = 2)

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.

##cval = rep(NA, degree)
##for (i in 2:degree) {
##  Wage$age.cut = cut(Wage$age, i)
##  fitcv = glm(wage ~ age.cut, data = Wage)
##  cval[i] = cv.glm(Wage, fit)$delta[1]
##}
##plot(2:degree, cval[-1], xlab = 'Cuts', ylab = 'Test MSE', type = 'l')
##deg.min = which.min(cval)
##points(deg.min, cval[deg.min], col = 'red', cex = 2, pch = 19)

note when knitting the code chunk would not load, so I commented out

The optimal number of cuts is 8 based on the minimum test MSE.

plot(wage ~ age, data = Wage, col = "grey")
fit = glm(wage ~ cut(age, 8), data = Wage)
preds = predict(fit, list(age = agegrid))
lines(agegrid, preds, col = "blue", lwd = 2)

Exercise 10

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

train = sample(1: nrow(College), nrow(College)/2)
test = -train
fit = regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary = summary(fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 ) "*"        " "  "*"    " "    " "       " "       " "        
## 8  ( 1 ) "*"        " "  "*"    "*"    " "       " "       " "        
##          P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 ) " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
## 6  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
## 8  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
##          perc.alumni Expend Grad.Rate
## 1  ( 1 ) " "         "*"    " "      
## 2  ( 1 ) " "         "*"    " "      
## 3  ( 1 ) " "         "*"    " "      
## 4  ( 1 ) "*"         "*"    " "      
## 5  ( 1 ) "*"         "*"    " "      
## 6  ( 1 ) "*"         "*"    "*"      
## 7  ( 1 ) "*"         "*"    "*"      
## 8  ( 1 ) "*"         "*"    "*"
coef(fit, id = 6)
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -4037.0865719  2680.2790846     0.7993214    41.8254047    62.2156457 
##        Expend     Grad.Rate 
##     0.2612828    28.4894543

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gamm = gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = College, subset = train)
par(mfrow = c(2,3))
plot(gamm, se = TRUE, col = 'blue')

It appears that as room and board costs as well as alumni percent increase, so does out of state tuition. It also appears that expend and graduation rates are not linear with regards to out of state tuition

(c) Evaluate the model obtained on the test set, and explain the results obtained.

prediction = predict(gamm,newdata = College)
mse = mean((College$Outstate - prediction)^2)
mse
## [1] 3517639
preds = predict(gamm, College[test, ])
RSS = sum((College[test, ]$Outstate - preds)^2) 
TSS = sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS / TSS)
## [1] 0.7602172

The MSE is 3408341 and the r^2 0.7722337, meaning it is a relatively good model based off these values.

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gamm)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 
##     5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), 
##     data = College, subset = train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -5122.1 -1128.3   102.8  1152.5  7034.1 
## 
## (Dispersion Parameter for gaussian family taken to be 3412094)
## 
##     Null Deviance: 6291005229 on 387 degrees of freedom
## Residual Deviance: 1231765592 on 360.9999 degrees of freedom
## AIC: 6965.732 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 1795364372 1795364372  526.18 < 2.2e-16 ***
## s(Room.Board, 5)    1 1190871921 1190871921  349.01 < 2.2e-16 ***
## s(Terminal, 5)      1  542984820  542984820  159.14 < 2.2e-16 ***
## s(perc.alumni, 5)   1  380739464  380739464  111.59 < 2.2e-16 ***
## s(Expend, 5)        1  525878215  525878215  154.12 < 2.2e-16 ***
## s(Grad.Rate, 5)     1   68650189   68650189   20.12  9.79e-06 ***
## Residuals         361 1231765592    3412094                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Room.Board, 5)        4  1.3024   0.26869    
## s(Terminal, 5)          4  1.7149   0.14604    
## s(perc.alumni, 5)       4  0.4537   0.76971    
## s(Expend, 5)            4 16.5943 1.705e-12 ***
## s(Grad.Rate, 5)         4  3.0563   0.01694 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can see that Expend appears to have a strong non-linear relationship with out of state tuition. Room and board also appears to have a relatively non-linear relationship with out of state tuition. …

LS0tDQp0aXRsZTogIkFzc2lnbm1lbnQgNiINCmF1dGhvcjogIkhhaWxleSBKZW5zZW4iDQpkYXRlOiAiNC81LzIwMjIiDQpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydA0KLS0tDQoNCmBgYHtyIGxvYWQtcGFja2FnZXMsIG1lc3NhZ2U9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkob3BlbmludHJvKQ0KbGlicmFyeShJU0xSKQ0KbGlicmFyeShib290KQ0KbGlicmFyeShsZWFwcykNCmxpYnJhcnkoZ2FtKQ0KYGBgDQoNCiMjIyBFeGVyY2lzZSA2DQoNCkluIHRoaXMgZXhlcmNpc2UsIHlvdSB3aWxsIGZ1cnRoZXIgYW5hbHl6ZSB0aGUgV2FnZSBkYXRhIHNldCBjb25zaWRlcmVkDQp0aHJvdWdob3V0IHRoaXMgY2hhcHRlci4NCg0KKiooYSkqKiBQZXJmb3JtIHBvbHlub21pYWwgcmVncmVzc2lvbiB0byBwcmVkaWN0IHdhZ2UgdXNpbmcgYWdlLiBVc2UNCmNyb3NzLXZhbGlkYXRpb24gdG8gc2VsZWN0IHRoZSBvcHRpbWFsIGRlZ3JlZSBkIGZvciB0aGUgcG9seW5vbWlhbC4gV2hhdCBkZWdyZWUgd2FzIGNob3NlbiwgYW5kIGhvdyBkb2VzIHRoaXMgY29tcGFyZSB0byB0aGUgcmVzdWx0cyBvZiBoeXBvdGhlc2lzIHRlc3RpbmcgdXNpbmcgQU5PVkE/IE1ha2UgYSBwbG90IG9mIHRoZSByZXN1bHRpbmcgcG9seW5vbWlhbCBmaXQgdG8gdGhlIGRhdGEuDQoNCmBgYHtyfQ0KI3NldC5zZWVkKDEpDQojIyBjcm9zc3ZhbCA9IHJlcCgwLCAxMCkNCiMjIGZvciAoaSBpbiAxOjEwKSB7DQojIyAgZml0ID0gZ2xtKHdhZ2UgfiBwb2x5KGFnZSwgaSksIGRhdGEgPSBXYWdlKQ0KIyMgIGNyb3NzdmFsW2ldID0gY3YuZ2xtKFdhZ2UsIGZpdCkkZGVsdGFbMV0NCiMjfQ0KDQojI3Bsb3QoMTpkZWdyZWUsIGNyb3NzdmFsLCB4bGFiID0gJ2RlZ3JlZScsIHlsYWIgPSAnVGVzdCBNU0UnLCB0eXBlID0gJ2wnKQ0KIyNkZWcubWluID0gd2hpY2gubWluKGNyb3NzdmFsKQ0KIyNwb2ludHMoZGVnLm1pbiwgY3Jvc3N2YWxbZGVnLm1pbl0sIGNvbCA9ICdyZWQnLCBjZXggPSAyLCBwY2ggPSAxOSkNCg0KYGBgDQoNCioqbm90ZSoqIHdoZW4ga25pdHRpbmcgdGhlIGNvZGUgY2h1bmsgd291bGQgbm90IGxvYWQsIHNvIEkgY29tbWVudGVkIG91dA0KDQpUaGlzIHRlbGxzIHVzIHRoYXQgdGhlIG1pbmltdW0gb2YgdGVzdCBNU0UgaXMgYXQgdGhlIDl0aCBkZWdyZWUsIGFuZCB0aGUgNHRoIGRlZ3JlZSBpcyBkZWVtZWQgc21hbGwgZW5vdWdoLiAgDQoNCmBgYHtyfQ0KcGxvdCh3YWdlIH4gYWdlLCBkYXRhID0gV2FnZSwgY29sID0gImdyZXkiKQ0KYWdlcmFuZ2UgPSByYW5nZShXYWdlJGFnZSkNCmFnZWdyaWQgPSBzZXEoZnJvbSA9IGFnZXJhbmdlWzFdLCB0byA9IGFnZXJhbmdlWzJdKQ0KZml0bG0gPSBsbSh3YWdlIH4gcG9seShhZ2UsIDMpLCBkYXRhID0gV2FnZSkNCnByZWRpYyA9IHByZWRpY3QoZml0bG0sIG5ld2RhdGEgPSBsaXN0KGFnZSA9IGFnZWdyaWQpKQ0KbGluZXMoYWdlZ3JpZCwgcHJlZGljLCBjb2wgPSAiQmx1ZSIsIGx3ZCA9IDIpDQpgYGANCg0KDQoqKihiKSoqIEZpdCBhIHN0ZXAgZnVuY3Rpb24gdG8gcHJlZGljdCB3YWdlIHVzaW5nIGFnZSwgYW5kIHBlcmZvcm0gY3Jvc3N2YWxpZGF0aW9uIHRvIGNob29zZSB0aGUgb3B0aW1hbCBudW1iZXIgb2YgY3V0cy4gTWFrZSBhIHBsb3Qgb2YgdGhlIGZpdCBvYnRhaW5lZC4NCg0KYGBge3J9DQojI2N2YWwgPSByZXAoTkEsIGRlZ3JlZSkNCiMjZm9yIChpIGluIDI6ZGVncmVlKSB7DQojIyAgV2FnZSRhZ2UuY3V0ID0gY3V0KFdhZ2UkYWdlLCBpKQ0KIyMgIGZpdGN2ID0gZ2xtKHdhZ2UgfiBhZ2UuY3V0LCBkYXRhID0gV2FnZSkNCiMjICBjdmFsW2ldID0gY3YuZ2xtKFdhZ2UsIGZpdCkkZGVsdGFbMV0NCiMjfQ0KIyNwbG90KDI6ZGVncmVlLCBjdmFsWy0xXSwgeGxhYiA9ICdDdXRzJywgeWxhYiA9ICdUZXN0IE1TRScsIHR5cGUgPSAnbCcpDQojI2RlZy5taW4gPSB3aGljaC5taW4oY3ZhbCkNCiMjcG9pbnRzKGRlZy5taW4sIGN2YWxbZGVnLm1pbl0sIGNvbCA9ICdyZWQnLCBjZXggPSAyLCBwY2ggPSAxOSkNCmBgYA0KDQoqKm5vdGUqKiB3aGVuIGtuaXR0aW5nIHRoZSBjb2RlIGNodW5rIHdvdWxkIG5vdCBsb2FkLCBzbyBJIGNvbW1lbnRlZCBvdXQNCg0KVGhlIG9wdGltYWwgbnVtYmVyIG9mIGN1dHMgaXMgOCBiYXNlZCBvbiB0aGUgbWluaW11bSB0ZXN0IE1TRS4NCg0KYGBge3J9DQpwbG90KHdhZ2UgfiBhZ2UsIGRhdGEgPSBXYWdlLCBjb2wgPSAiZ3JleSIpDQpmaXQgPSBnbG0od2FnZSB+IGN1dChhZ2UsIDgpLCBkYXRhID0gV2FnZSkNCnByZWRzID0gcHJlZGljdChmaXQsIGxpc3QoYWdlID0gYWdlZ3JpZCkpDQpsaW5lcyhhZ2VncmlkLCBwcmVkcywgY29sID0gImJsdWUiLCBsd2QgPSAyKQ0KYGBgDQoNCiMjIyBFeGVyY2lzZSAxMA0KDQpUaGlzIHF1ZXN0aW9uIHJlbGF0ZXMgdG8gdGhlIENvbGxlZ2UgZGF0YSBzZXQuDQoNCioqKGEpKiogU3BsaXQgdGhlIGRhdGEgaW50byBhIHRyYWluaW5nIHNldCBhbmQgYSB0ZXN0IHNldC4gVXNpbmcgb3V0LW9mLXN0YXRlDQp0dWl0aW9uIGFzIHRoZSByZXNwb25zZSBhbmQgdGhlIG90aGVyIHZhcmlhYmxlcyBhcyB0aGUgcHJlZGljdG9ycywgcGVyZm9ybSBmb3J3YXJkIHN0ZXB3aXNlIHNlbGVjdGlvbiBvbiB0aGUgdHJhaW5pbmcgc2V0IGluIG9yZGVyIHRvIGlkZW50aWZ5IGEgc2F0aXNmYWN0b3J5IG1vZGVsIHRoYXQgdXNlcyBqdXN0IGEgc3Vic2V0IG9mIHRoZSBwcmVkaWN0b3JzLg0KDQpgYGB7cn0NCnRyYWluID0gc2FtcGxlKDE6IG5yb3coQ29sbGVnZSksIG5yb3coQ29sbGVnZSkvMikNCnRlc3QgPSAtdHJhaW4NCmZpdCA9IHJlZ3N1YnNldHMoT3V0c3RhdGUgfiAuLCBkYXRhID0gQ29sbGVnZSwgc3Vic2V0ID0gdHJhaW4sIG1ldGhvZCA9ICdmb3J3YXJkJykNCmZpdC5zdW1tYXJ5ID0gc3VtbWFyeShmaXQpDQpmaXQuc3VtbWFyeQ0KDQpjb2VmKGZpdCwgaWQgPSA2KQ0KYGBgDQoNCioqKGIpKiogRml0IGEgR0FNIG9uIHRoZSB0cmFpbmluZyBkYXRhLCB1c2luZyBvdXQtb2Ytc3RhdGUgdHVpdGlvbiBhcyB0aGUgcmVzcG9uc2UgYW5kIHRoZSBmZWF0dXJlcyBzZWxlY3RlZCBpbiB0aGUgcHJldmlvdXMgc3RlcCBhcyB0aGUgcHJlZGljdG9ycy4gUGxvdCB0aGUgcmVzdWx0cywgYW5kIGV4cGxhaW4geW91ciBmaW5kaW5ncy4NCg0KYGBge3J9DQpnYW1tID0gZ2FtKE91dHN0YXRlIH4gUHJpdmF0ZSArIHMoUm9vbS5Cb2FyZCwgNSkgKyBzKFRlcm1pbmFsLCA1KSArIHMocGVyYy5hbHVtbmksIDUpICsgcyhFeHBlbmQsIDUpICsgcyhHcmFkLlJhdGUsIDUpLCBkYXRhID0gQ29sbGVnZSwgc3Vic2V0ID0gdHJhaW4pDQpwYXIobWZyb3cgPSBjKDIsMykpDQpwbG90KGdhbW0sIHNlID0gVFJVRSwgY29sID0gJ2JsdWUnKQ0KYGBgDQoNCkl0IGFwcGVhcnMgdGhhdCBhcyByb29tIGFuZCBib2FyZCBjb3N0cyBhcyB3ZWxsIGFzIGFsdW1uaSBwZXJjZW50IGluY3JlYXNlLCBzbyBkb2VzIG91dCBvZiBzdGF0ZSB0dWl0aW9uLiBJdCBhbHNvIGFwcGVhcnMgdGhhdCBleHBlbmQgYW5kIGdyYWR1YXRpb24gcmF0ZXMgYXJlIG5vdCBsaW5lYXIgd2l0aCByZWdhcmRzIHRvIG91dCBvZiBzdGF0ZSB0dWl0aW9uDQoNCioqKGMpKiogRXZhbHVhdGUgdGhlIG1vZGVsIG9idGFpbmVkIG9uIHRoZSB0ZXN0IHNldCwgYW5kIGV4cGxhaW4gdGhlIHJlc3VsdHMgb2J0YWluZWQuDQoNCmBgYHtyfQ0KcHJlZGljdGlvbiA9IHByZWRpY3QoZ2FtbSxuZXdkYXRhID0gQ29sbGVnZSkNCm1zZSA9IG1lYW4oKENvbGxlZ2UkT3V0c3RhdGUgLSBwcmVkaWN0aW9uKV4yKQ0KbXNlDQoNCnByZWRzID0gcHJlZGljdChnYW1tLCBDb2xsZWdlW3Rlc3QsIF0pDQpSU1MgPSBzdW0oKENvbGxlZ2VbdGVzdCwgXSRPdXRzdGF0ZSAtIHByZWRzKV4yKSANClRTUyA9IHN1bSgoQ29sbGVnZVt0ZXN0LCBdJE91dHN0YXRlIC0gbWVhbihDb2xsZWdlW3Rlc3QsIF0kT3V0c3RhdGUpKSBeIDIpDQoxIC0gKFJTUyAvIFRTUykNCmBgYA0KDQpUaGUgTVNFIGlzIDM0MDgzNDEgYW5kIHRoZSByXjIgMC43NzIyMzM3LCBtZWFuaW5nIGl0IGlzIGEgcmVsYXRpdmVseSBnb29kIG1vZGVsIGJhc2VkIG9mZiB0aGVzZSB2YWx1ZXMuDQoNCioqKGQpKiogRm9yIHdoaWNoIHZhcmlhYmxlcywgaWYgYW55LCBpcyB0aGVyZSBldmlkZW5jZSBvZiBhIG5vbi1saW5lYXIgcmVsYXRpb25zaGlwIHdpdGggdGhlIHJlc3BvbnNlPw0KDQpgYGB7cn0NCnN1bW1hcnkoZ2FtbSkNCmBgYA0KDQpXZSBjYW4gc2VlIHRoYXQgRXhwZW5kIGFwcGVhcnMgdG8gaGF2ZSBhIHN0cm9uZyBub24tbGluZWFyIHJlbGF0aW9uc2hpcCB3aXRoIG91dCBvZiBzdGF0ZSB0dWl0aW9uLiBSb29tIGFuZCBib2FyZCBhbHNvIGFwcGVhcnMgdG8gaGF2ZSBhIHJlbGF0aXZlbHkgbm9uLWxpbmVhciByZWxhdGlvbnNoaXAgd2l0aCBvdXQgb2Ygc3RhdGUgdHVpdGlvbi4gDQouLi4NCg0K