Problem 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

#load the packages used in these exercises
library(ISLR)    # Wage and College data sets
library(boot)    # cv.glm() for k-fold cross-validation
library(splines) # ns() for natural splines
library(leaps)   # regsubsets() for stepwise selection
library(gam)     # gam() and s() for smoothing-spline GAMs

data(Wage)

6(a)

Perform polynomial regression to predict wage using age.

#polynomial regression, orthogonal
fit <- lm(wage ~ poly(age, 5), data = Wage)
coef(summary(fit))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01
#polynomial, raw
fit1 <- lm(wage ~ poly(age, 5, raw=TRUE), data = Wage)
coef(fit1)
##               (Intercept) poly(age, 5, raw = TRUE)1 poly(age, 5, raw = TRUE)2 
##             -4.970462e+01              3.993043e+00              2.759768e-01 
## poly(age, 5, raw = TRUE)3 poly(age, 5, raw = TRUE)4 poly(age, 5, raw = TRUE)5 
##             -1.264979e-02              1.834622e-04             -9.157095e-07
#same as `fit1`, different method
fit2 <- lm(wage ~ age + I(age^2) + I(age^3) + I(age^4) + I(age^5), data=Wage)
coef(fit2)
##   (Intercept)           age      I(age^2)      I(age^3)      I(age^4) 
## -4.970462e+01  3.993043e+00  2.759768e-01 -1.264979e-02  1.834622e-04 
##      I(age^5) 
## -9.157095e-07
#Estimates from `fit1` are equivalent to coefficients from `fit2`.
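
As a quick sanity check, the orthogonal and raw parameterizations are different bases for the same model space, so they produce identical fitted values (the difference below should be on the order of floating-point rounding error):

#confirm the two parameterizations give the same fitted values
max(abs(fitted(fit) - fitted(fit1)))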

#what degree polynomial is best? hypothesis testing with ANOVA
fit.1 <- lm(wage~age,data=Wage)
fit.2 <- lm(wage~poly(age,2), data=Wage)
fit.3 <- lm(wage~poly(age,3), data=Wage)
fit.4 <- lm(wage~poly(age,4), data=Wage)
fit.5 <- lm(wage~poly(age,5), data=Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(summary(fit.5))
##                 Estimate Std. Error     t value     Pr(>|t|)
## (Intercept)    111.70361  0.7287647 153.2780243 0.000000e+00
## poly(age, 5)1  447.06785 39.9160847  11.2001930 1.491111e-28
## poly(age, 5)2 -478.31581 39.9160847 -11.9830341 2.367734e-32
## poly(age, 5)3  125.52169 39.9160847   3.1446392 1.679213e-03
## poly(age, 5)4  -77.91118 39.9160847  -1.9518743 5.104623e-02
## poly(age, 5)5  -35.81289 39.9160847  -0.8972045 3.696820e-01
#notice that the coefficient table also gives p-values for each model
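
Because `poly()` builds orthogonal polynomials, the sequential ANOVA F-tests match these t-tests: each F-statistic is the square of the corresponding t-statistic from the degree-5 fit. As a quick check for the degree-4 term:

#the squared t-statistic reproduces the ANOVA F-statistic for degree 4
(-1.9518743)^2
## [1] 3.809813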

Use cross-validation to select the optimal degree d for the polynomial.

set.seed(892)
cv.errors <- rep(0, 5)
for (i in 1:5) {
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.errors[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]  # CV estimate of test MSE
}
cv.errors
## [1] 1676.700 1601.163 1596.438 1594.250 1595.231
which.min(cv.errors)
## [1] 4
plot(cv.errors, xlab="Polynomial Degree d", ylab="Test MSE", type = "l")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col='red', cex=2, pch=20)

What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

The 4th-degree polynomial (d = 4) was chosen by k-fold cross-validation with k = 10, though the CV errors for d = 2 through d = 5 differ by less than 0.5%, so a quadratic or cubic would perform nearly as well. This is broadly consistent with the ANOVA hypothesis tests: the degree-3 term is clearly significant (p ≈ 0.002), the degree-4 term is borderline (p ≈ 0.051), and the degree-5 term is not significant (p ≈ 0.37).

#set up prediction with standard error bands
attach(Wage)
agelims <- range(age)
age.grid <- seq(from=agelims[1],to=agelims[2])
pred <- predict(fit.4, newdata=list(age=age.grid), se=TRUE)
se.bands <- cbind(pred$fit + 2*pred$se.fit, pred$fit - 2*pred$se.fit)  #~95% pointwise bands

#plot fit
plot(age, wage, xlim=agelims, cex=.5, col="darkgrey")
title("Degree-4 Polynomial")
lines(age.grid, pred$fit, lwd=2, col="blue")
matlines(age.grid, se.bands, lwd=1, col="blue", lty=3)

6(b)

Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

#example fit
table(cut(age, 4))
## 
## (17.9,33.5]   (33.5,49]   (49,64.5] (64.5,80.1] 
##         750        1399         779          72
step.fit.ex <- lm(wage~cut(age, 4), data=Wage)
coef(summary(step.fit.ex))
##                         Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)            94.158392   1.476069 63.789970 0.000000e+00
## cut(age, 4)(33.5,49]   24.053491   1.829431 13.148074 1.982315e-38
## cut(age, 4)(49,64.5]   23.664559   2.067958 11.443444 1.040750e-29
## cut(age, 4)(64.5,80.1]  7.640592   4.987424  1.531972 1.256350e-01
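
To interpret these coefficients: the intercept is the mean wage in the first age bin, and each remaining coefficient is that bin's offset from the first. A quick check, with bin edges taken from the table above:

#mean wage per bin recovers the intercept and offsets
mean(wage[age <= 33.5])             #~94.16, the intercept
mean(wage[age > 33.5 & age <= 49])  #~118.21 = 94.16 + 24.05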
set.seed(892)
step.cv <- rep(0, 9)  # one slot for each of 2 through 10 intervals
for (i in 2:10) {
  Wage$age.cut <- cut(age, i)  # i intervals
  step.fit <- glm(wage ~ age.cut, data = Wage)
  step.cv[i-1] <- cv.glm(Wage, step.fit, K = 10)$delta[1]
}
step.cv
## [1] 1733.731 1683.333 1637.331 1630.151 1625.323 1610.571 1602.049 1610.665
## [9] 1605.653
which.min(step.cv)
## [1] 7
#index 7 of `step.cv` corresponds to i = 8 intervals
plot(2:10, step.cv, xlab="Number of Intervals", ylab="Test MSE", type = "l")
points(which.min(step.cv) + 1, min(step.cv), col='red', cex=2, pch=20)

#the CV minimum is at 8 intervals, but the 7-interval error (1610.57) is within
#0.6% of the minimum (1602.05), so we fit the simpler 7-interval step function
table(cut(age, 7))
## 
## (17.9,26.9] (26.9,35.7] (35.7,44.6] (44.6,53.4] (53.4,62.3] (62.3,71.1] 
##         278         623         799         757         433          89 
## (71.1,80.1] 
##          21
step.fit <- lm(wage~cut(age, 7), data=Wage)
coef(summary(step.fit))
##                        Estimate Std. Error   t value      Pr(>|t|)
## (Intercept)            80.06402   2.405222 33.287586 5.517532e-207
## cut(age, 7)(26.9,35.7] 24.51568   2.892501  8.475600  3.617008e-17
## cut(age, 7)(35.7,44.6] 38.59253   2.792477 13.820180  3.695152e-42
## cut(age, 7)(44.6,53.4] 38.05482   2.812402 13.531077  1.556835e-40
## cut(age, 7)(53.4,62.3] 39.03969   3.082094 12.666611  7.411125e-36
## cut(age, 7)(62.3,71.1] 31.03930   4.884196  6.355048  2.400708e-10
## cut(age, 7)(71.1,80.1] 15.99445   9.075719  1.762334  7.811488e-02
#set up prediction with standard error bands; predict() re-evaluates cut() on the
#new data, which is safe here because age.grid spans the same range as age
step.pred <- predict(step.fit, newdata=list(age=age.grid), se=TRUE)
step.se.bands <- cbind(step.pred$fit + 2*step.pred$se.fit, step.pred$fit - 2*step.pred$se.fit)

#plot fit
plot(age, wage, xlim=agelims, cex=.5, col="darkgrey")
title("Step Function")
lines(age.grid, step.pred$fit, lwd=2, col="blue")
matlines(age.grid, step.se.bands, lwd=1, col="blue", lty=3)

Problem 10

This question relates to the College data set.

data(College)

10(a)

Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

set.seed(44)
#80-20 train-test split (777 rows in College; floor() makes the size explicit)
train <- sample(nrow(College), floor(0.8 * nrow(College)))
test <- -train

fwd <- regsubsets(Outstate~., data=College[train, ], nvmax=17, method="forward")
summary(fwd)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College[train, ], nvmax = 17, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  " "    " "    "*"       " "       " "        
## 10  ( 1 ) "*"        " "  " "    " "    "*"       " "       " "        
## 11  ( 1 ) "*"        " "  " "    " "    "*"       " "       " "        
## 12  ( 1 ) "*"        " "  "*"    " "    "*"       " "       " "        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       " "        
## 14  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 15  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 16  ( 1 ) "*"        "*"  "*"    "*"    "*"       " "       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      "*" "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      "*" "*"      " "      
## 10  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 11  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 12  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 13  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        "*"   "*"      "*" "*"      "*"      
## 15  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"
#test set validation
fwd.best <- fwd  # identical to the forward-stepwise fit above; no need to refit
test.mat <- model.matrix(Outstate~., data=College[test, ])
val.errors <- rep(NA, 12)  # test MSE for model sizes 1 through 12
for(i in 1:12){
   coefi <- coef(fwd.best, id=i)
   pred <- test.mat[ , names(coefi)] %*% coefi  # manual prediction: X %*% beta
   val.errors[i] <- mean((College$Outstate[test] - pred)^2)
}
val.errors
##  [1] 8967679 6326856 5166524 4464440 4177092 4056817 4009095 3959416 4041145
## [10] 4010691 4062365 4003766
sub <- which.min(val.errors)  # model size with the lowest test MSE
MSE.FW <- val.errors[sub]     # saved for comparison with the GAM in 10(c)
coef(fwd.best, sub)
##   (Intercept)    PrivateYes    Room.Board      Personal           PhD 
## -3420.9617690  2756.4639845     0.9164574    -0.3012005    19.2853601 
##      Terminal   perc.alumni        Expend     Grad.Rate 
##    22.8790223    40.9381406     0.2171245    30.3219798

Forward stepwise selection with test-set validation chose an 8-variable model using Private, Room.Board, Personal, PhD, Terminal, perc.alumni, Expend, and Grad.Rate as predictors of the Outstate response.
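
For reuse, the manual prediction loop above can be wrapped in a `predict` method for regsubsets objects, since the leaps package does not provide one. A minimal sketch, following the common convention of recovering the formula from the stored call:

#S3 predict method for regsubsets objects
predict.regsubsets <- function(object, newdata, id, ...) {
  form <- as.formula(object$call[[2]])  # recover the model formula
  mat <- model.matrix(form, newdata)    # design matrix for the new data
  coefi <- coef(object, id = id)        # coefficients of the size-`id` model
  mat[, names(coefi)] %*% coefi
}
#usage: predict(fwd.best, College[test, ], id = sub)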

10(b)

Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

#natural-spline GAM via lm() and smoothing-spline GAM via gam()
gam.N <- lm(Outstate ~ ns(Room.Board, 5) + ns(Personal, 5) + ns(PhD, 5) +
            ns(Terminal, 5) + ns(perc.alumni, 5) + ns(Expend, 5) +
            ns(Grad.Rate, 5) + Private, data = College[train, ])
gam.S <- gam(Outstate ~ s(Room.Board) + s(Personal) + s(PhD) + s(Terminal) +
             s(perc.alumni) + s(Expend) + s(Grad.Rate) + Private,
             data = College[train, ])
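
Note that `s()` from the gam package fits a smoothing spline with df = 4 per term by default (an assumption consistent with the 3 nonparametric degrees of freedom per smooth reported in the ANOVA output further below); the equivalent explicit form would be:

#same model as `gam.S`, with the default degrees of freedom spelled out
gam.S4 <- gam(Outstate ~ s(Room.Board, df=4) + s(Personal, df=4) + s(PhD, df=4) +
              s(Terminal, df=4) + s(perc.alumni, df=4) + s(Expend, df=4) +
              s(Grad.Rate, df=4) + Private, data = College[train, ])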

#compare plots
par(mfrow=c(2,2))
plot.Gam(gam.N, se=TRUE, col="red")

plot(gam.S, se=TRUE, col="blue")

The plots suggest that Room.Board has an approximately linear relationship with Outstate, while Expend shows a pronounced nonlinear relationship with Outstate.

#`Room.Board` looks approximately linear; what to do?
# option 1, drop it entirely:
gam.m1 <- gam(Outstate ~ s(Personal) + s(PhD) + s(Terminal) + s(perc.alumni) +
              s(Expend) + s(Grad.Rate) + Private, data = College[train, ])
# option 2, keep it as a linear term:
gam.m2 <- gam(Outstate ~ Room.Board + s(Personal) + s(PhD) + s(Terminal) +
              s(perc.alumni) + s(Expend) + s(Grad.Rate) + Private,
              data = College[train, ])

#compare:
anova(gam.m1, gam.m2, gam.S)
## Analysis of Deviance Table
## 
## Model 1: Outstate ~ s(Personal) + s(PhD) + s(Terminal) + s(perc.alumni) + 
##     s(Expend) + s(Grad.Rate) + Private
## Model 2: Outstate ~ Room.Board + s(Personal) + s(PhD) + s(Terminal) + 
##     s(perc.alumni) + s(Expend) + s(Grad.Rate) + Private
## Model 3: Outstate ~ s(Room.Board) + s(Personal) + s(PhD) + s(Terminal) + 
##     s(perc.alumni) + s(Expend) + s(Grad.Rate) + Private
##   Resid. Df Resid. Dev Df  Deviance  Pr(>Chi)    
## 1       595 2153570062                           
## 2       594 1967254204  1 186315859 6.121e-14 ***
## 3       591 1954711445  3  12542759    0.2848    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Adding Room.Board as a linear term significantly improves on omitting it (p ≈ 6e-14), while replacing the linear term with a smoothing spline yields no significant further improvement (p ≈ 0.28). Keeping Room.Board as a linear term is therefore the best treatment of this predictor.
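
As a supplementary check, information criteria can corroborate the deviance comparison (this assumes `AIC()` works on these Gam objects through their glm inheritance, as the AIC line in the summary below suggests):

#lower AIC is better across the three treatments of Room.Board
AIC(gam.m1, gam.m2, gam.S)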

summary(gam.m2)
## 
## Call: gam(formula = Outstate ~ Room.Board + s(Personal) + s(PhD) + 
##     s(Terminal) + s(perc.alumni) + s(Expend) + s(Grad.Rate) + 
##     Private, data = College[train, ])
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7214.76 -1048.62    18.02  1308.61  7356.57 
## 
## (Dispersion Parameter for gaussian family taken to be 3311878)
## 
##     Null Deviance: 10281250334 on 620 degrees of freedom
## Residual Deviance: 1967254204 on 593.9996 degrees of freedom
## AIC: 11113.81 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                 Df     Sum Sq    Mean Sq   F value    Pr(>F)    
## Room.Board       1 3854134786 3854134786 1163.7309 < 2.2e-16 ***
## s(Personal)      1  210951155  210951155   63.6953 7.525e-15 ***
## s(PhD)           1  256238536  256238536   77.3696 < 2.2e-16 ***
## s(Terminal)      1    4729984    4729984    1.4282    0.2325    
## s(perc.alumni)   1  981511703  981511703  296.3611 < 2.2e-16 ***
## s(Expend)        1 1092309436 1092309436  329.8157 < 2.2e-16 ***
## s(Grad.Rate)     1  211532013  211532013   63.8707 6.944e-15 ***
## Private          1  434473276  434473276  131.1864 < 2.2e-16 ***
## Residuals      594 1967254204    3311878                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df Npar F   Pr(F)    
## (Intercept)                              
## Room.Board                               
## s(Personal)          3  3.447 0.01647 *  
## s(PhD)               3  2.890 0.03487 *  
## s(Terminal)          3  1.546 0.20156    
## s(perc.alumni)       3  1.618 0.18393    
## s(Expend)            3 40.261 < 2e-16 ***
## s(Grad.Rate)         3  3.355 0.01866 *  
## Private                                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(2,2))
plot(gam.m2, se=TRUE, col="blue")

10(c)

Evaluate the model obtained on the test set, and explain the results obtained.

gam.pred <- predict(gam.m2, newdata=College[test, ])
gam.MSE <- mean((gam.pred - College[test, ]$Outstate)^2)
gam.MSE
## [1] 3833107
#compare to the test MSE of a linear model on the same 8 predictors
#(this is the model selected in 10(a), so lin.MSE equals MSE.FW)
lin <- glm(Outstate ~ Room.Board + Personal + PhD + Terminal + perc.alumni +
           Expend + Grad.Rate + Private, data=College[train, ])
lin.pred <- predict(lin, College[test, ])
lin.MSE <- mean((lin.pred - College$Outstate[test])^2)
lin.MSE
## [1] 3959416
abs((gam.MSE - lin.MSE)/lin.MSE)*100
## [1] 3.190094

The GAM's test MSE is about 3.2% lower than that of a linear regression on the same predictor subset; however, this modest improvement may not be enough to justify the GAM's added complexity.
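
To put the MSE figures in context, the test-set R-squared of the GAM can be computed as below (a quick supplementary calculation; its value depends on the particular train-test split):

#fraction of test-set variance in Outstate explained by the GAM
tss <- mean((College$Outstate[test] - mean(College$Outstate[test]))^2)
1 - gam.MSE/tss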

10(d)

For which variables, if any, is there evidence of a non-linear relationship with the response?

According to the GAM plots and the ANOVA for nonparametric effects, Expend shows strong evidence of a nonlinear relationship with the Outstate response (p < 2e-16). Personal, PhD, and Grad.Rate show milder evidence of nonlinearity (p < 0.05), so these predictors may warrant further investigation as well.