6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(ISLR2)
library(boot)
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:openintro':
## 
##     salinity
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.5
library(gam)
## Warning: package 'gam' was built under R version 4.0.5
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.0.5
## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loaded gam 1.20.1

##(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resultng polynomial fit to the data.

set.seed(1)
Wage = Wage;
head(Wage)
##        year age           maritl     race       education             region
## 231655 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 86582  2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443  2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 376662 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##              jobclass         health health_ins  logwage      wage
## 231655  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 86582  2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 161300  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 11443  2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 376662 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574
crossval = rep(0, 10)
for (i in 1:10) {
  fit = glm(wage ~ poly(age, i), data = Wage)
  crossval[i] = cv.glm(Wage, fit)$delta[1]
}


degree=10;
plot(1:degree, crossval, xlab = 'degree', ylab = 'Test MSE', type = 'l')

deg.min = which.min(crossval)
#points(deg.min, crossval[deg.min], col = 'red', cex = 2, pch = 19)

# We see that the minimum test MSE is the 9th degree and the 4th degree is small enough
plot(wage ~ age, data = Wage, col = "grey")
agerange = range(Wage$age)
agegrid = seq(from = agerange[1], to = agerange[2])
fitlm = lm(wage ~ poly(age, 3), data = Wage)
predic = predict(fitlm, newdata = list(age = agegrid))
lines(agegrid, predic, col = "Blue", lwd = 2)

##(b) Fit a step function to predict wage using age, and perform cross validation to choose the optimal number of cuts. Make a plot of the fit obtained

cval = rep(NA, degree)
for (i in 2:degree) {
  Wage$age.cut = cut(Wage$age, i)
  fitcv = glm(wage ~ age.cut, data = Wage)
  cval[i] = cv.glm(Wage, fit)$delta[1]
}
plot(2:degree, cval[-1], xlab = 'Cuts', ylab = 'Test MSE', type = 'l')
deg.min = which.min(cval)
points(deg.min, cval[deg.min], col = 'red', cex = 2, pch = 19)

##The optimal number of cuts is 9 because it has the smallest MSE


plot(wage ~ age, data = Wage, col = "grey")
fit = glm(wage ~ cut(age, 9), data = Wage)
preds = predict(fit, list(age = agegrid))
lines(agegrid, preds, col = "blue", lwd = 2)

10

This question relates to the College data set.

##(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

train = sample(1: nrow(College), nrow(College)/2)
test = -train
fit = regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.sum = summary(fit)
fit.sum
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 3  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 4  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 ) "*"        " "  " "    " "    " "       "*"       " "        
## 8  ( 1 ) "*"        " "  " "    " "    " "       "*"       " "        
##          P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 2  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 3  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 ) " "         "*"        " "   " "      "*" " "      " "      
## 8  ( 1 ) " "         "*"        " "   "*"      "*" " "      " "      
##          perc.alumni Expend Grad.Rate
## 1  ( 1 ) " "         " "    " "      
## 2  ( 1 ) "*"         " "    " "      
## 3  ( 1 ) "*"         "*"    " "      
## 4  ( 1 ) "*"         "*"    " "      
## 5  ( 1 ) "*"         "*"    " "      
## 6  ( 1 ) "*"         "*"    "*"      
## 7  ( 1 ) "*"         "*"    "*"      
## 8  ( 1 ) "*"         "*"    "*"
coef(fit, id = 6)
##   (Intercept)    PrivateYes    Room.Board           PhD   perc.alumni 
## -3815.6574509  2880.3858979     0.9861841    43.6735045    40.4602197 
##        Expend     Grad.Rate 
##     0.1770944    30.8363935

##(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gamm = gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = College, subset = train)
par(mfrow = c(2,3))
plot(gamm, se = TRUE, col = 'blue')

#If we take a look at the plots, we see that as room and board costs and alumni percent increase, so does out of state tuition. We also see that expend and graduation rates are not linear with out of state tuition

##(c) Evaluate the model obtained on the test set, and explain the results obtained.

prediction = predict(gamm,newdata = College)
mse = mean((College$Outstate - prediction)^2)
mse
## [1] 3399540
preds = predict(gamm, College[test, ])
RSS = sum((College[test, ]$Outstate - preds)^2) 
TSS = sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS / TSS)
## [1] 0.7649038
#We get an MSE of 3408341 and R^2 of 0.7722337, meaning it is a moderately good model based off these values.

##(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gamm)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 
##     5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), 
##     data = College, subset = train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -7289.5 -1004.3    18.3  1123.6  4218.8 
## 
## (Dispersion Parameter for gaussian family taken to be 3138798)
## 
##     Null Deviance: 6139053909 on 387 degrees of freedom
## Residual Deviance: 1133105994 on 361 degrees of freedom
## AIC: 6933.339 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 1658551575 1658551575 528.404 < 2.2e-16 ***
## s(Room.Board, 5)    1 1093958629 1093958629 348.528 < 2.2e-16 ***
## s(Terminal, 5)      1  239592419  239592419  76.332 < 2.2e-16 ***
## s(perc.alumni, 5)   1  189302589  189302589  60.310 8.461e-14 ***
## s(Expend, 5)        1  671008681  671008681 213.779 < 2.2e-16 ***
## s(Grad.Rate, 5)     1   87504239   87504239  27.878 2.236e-07 ***
## Residuals         361 1133105994    3138798                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Room.Board, 5)        4  3.6201  0.006576 ** 
## s(Terminal, 5)          4  2.3018  0.058243 .  
## s(perc.alumni, 5)       4  0.8690  0.482600    
## s(Expend, 5)            4 28.0768 < 2.2e-16 ***
## s(Grad.Rate, 5)         4  2.7848  0.026556 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#We can see that Expend has a strong non-linear relationship with out of state tuition. Room and board also have a relatively non-linear relationship with out of state tuition.