library(boot)
library(ISLR)
library(leaps)
## Warning: package 'leaps' was built under R version 4.0.2
library(gam)
## Warning: package 'gam' was built under R version 4.0.2
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20

Applied Exercises

Question 6

6. In this exercise, you will further analyze the Wage data set considered throughout this chapter.

(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

attach(Wage)
summary(Wage)
##       year           age                     maritl           race     
##  Min.   :2003   Min.   :18.00   1. Never Married: 648   1. White:2480  
##  1st Qu.:2004   1st Qu.:33.75   2. Married      :2074   2. Black: 293  
##  Median :2006   Median :42.00   3. Widowed      :  19   3. Asian: 190  
##  Mean   :2006   Mean   :42.41   4. Divorced     : 204   4. Other:  37  
##  3rd Qu.:2008   3rd Qu.:51.00   5. Separated    :  55                  
##  Max.   :2009   Max.   :80.00                                          
##                                                                        
##               education                     region               jobclass   
##  1. < HS Grad      :268   2. Middle Atlantic   :3000   1. Industrial :1544  
##  2. HS Grad        :971   1. New England       :   0   2. Information:1456  
##  3. Some College   :650   3. East North Central:   0                        
##  4. College Grad   :685   4. West North Central:   0                        
##  5. Advanced Degree:426   5. South Atlantic    :   0                        
##                           6. East South Central:   0                        
##                           (Other)              :   0                        
##             health      health_ins      logwage           wage       
##  1. <=Good     : 858   1. Yes:2083   Min.   :3.000   Min.   : 20.09  
##  2. >=Very Good:2142   2. No : 917   1st Qu.:4.447   1st Qu.: 85.38  
##                                      Median :4.653   Median :104.92  
##                                      Mean   :4.654   Mean   :111.70  
##                                      3rd Qu.:4.857   3rd Qu.:128.68  
##                                      Max.   :5.763   Max.   :318.34  
## 
#polynomial regressions
cv.error <- rep(0, 10)
for (i in 1:10) {
    glm.fit <- glm(wage ~ poly(age, i), data = Wage)
    cv.error[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

plot(1:10, cv.error, pch = 19, type = "b", xlab = "Polynomial Degree", ylab = "CV Estimate of Prediction Error")

After the 4th degree polynomial, there is not much improvement in the error.

fit.01 <- lm(wage~age, data=Wage)
fit.02 <- lm(wage~poly(age,2), data=Wage)
fit.03 <- lm(wage~poly(age,3), data=Wage)
fit.04 <- lm(wage~poly(age,4), data=Wage)
fit.05 <- lm(wage~poly(age,5), data=Wage)
fit.06 <- lm(wage~poly(age,6), data=Wage)
fit.07 <- lm(wage~poly(age,7), data=Wage)
fit.08 <- lm(wage~poly(age,8), data=Wage)
fit.09 <- lm(wage~poly(age,9), data=Wage)
fit.10 <- lm(wage~poly(age,10), data=Wage)
anova(fit.01,fit.02,fit.03,fit.04,fit.05,fit.06,fit.07,fit.08,fit.09,fit.10)
## Analysis of Variance Table
## 
## Model  1: wage ~ age
## Model  2: wage ~ poly(age, 2)
## Model  3: wage ~ poly(age, 3)
## Model  4: wage ~ poly(age, 4)
## Model  5: wage ~ poly(age, 5)
## Model  6: wage ~ poly(age, 6)
## Model  7: wage ~ poly(age, 7)
## Model  8: wage ~ poly(age, 8)
## Model  9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
##    Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1    2998 5022216                                    
## 2    2997 4793430  1    228786 143.7638 < 2.2e-16 ***
## 3    2996 4777674  1     15756   9.9005  0.001669 ** 
## 4    2995 4771604  1      6070   3.8143  0.050909 .  
## 5    2994 4770322  1      1283   0.8059  0.369398    
## 6    2993 4766389  1      3932   2.4709  0.116074    
## 7    2992 4763834  1      2555   1.6057  0.205199    
## 8    2991 4763707  1       127   0.0796  0.777865    
## 9    2990 4756703  1      7004   4.4014  0.035994 *  
## 10   2989 4756701  1         3   0.0017  0.967529    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA seems to show agreement that using the 4th degree polynomial is the optimal model, we can reject the null hypothesis that a simpler model would be better.

# cross-validation
set.seed(12)
cv.error <- rep(0,10)
for (i in 1:10) {
  glm.fit <- glm(wage~poly(age,i), data=Wage) 
  cv.error[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]  # [1]:std, [2]:bias-corrected
}
cv.error
##  [1] 1676.828 1600.825 1594.923 1594.344 1595.959 1594.117 1594.056 1594.322
##  [9] 1593.886 1593.525
plot(cv.error, type="b")

The resulting plot further supports the optimal solution to be the 4th degree polynomial.

fit.01 <- lm(wage~age, data=Wage)
fit.02 <- lm(wage~poly(age,2), data=Wage)
fit.03 <- lm(wage~poly(age,3), data=Wage)
fit.04 <- lm(wage~poly(age,4), data=Wage)
fit.05 <- lm(wage~poly(age,5), data=Wage)
fit.06 <- lm(wage~poly(age,6), data=Wage)
fit.07 <- lm(wage~poly(age,7), data=Wage)
fit.08 <- lm(wage~poly(age,8), data=Wage)
fit.09 <- lm(wage~poly(age,9), data=Wage)
fit.10 <- lm(wage~poly(age,10), data=Wage)
anova(fit.01,fit.02,fit.03,fit.04,fit.05,fit.06,fit.07,fit.08,fit.09,fit.10)
## Analysis of Variance Table
## 
## Model  1: wage ~ age
## Model  2: wage ~ poly(age, 2)
## Model  3: wage ~ poly(age, 3)
## Model  4: wage ~ poly(age, 4)
## Model  5: wage ~ poly(age, 5)
## Model  6: wage ~ poly(age, 6)
## Model  7: wage ~ poly(age, 7)
## Model  8: wage ~ poly(age, 8)
## Model  9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
##    Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1    2998 5022216                                    
## 2    2997 4793430  1    228786 143.7638 < 2.2e-16 ***
## 3    2996 4777674  1     15756   9.9005  0.001669 ** 
## 4    2995 4771604  1      6070   3.8143  0.050909 .  
## 5    2994 4770322  1      1283   0.8059  0.369398    
## 6    2993 4766389  1      3932   2.4709  0.116074    
## 7    2992 4763834  1      2555   1.6057  0.205199    
## 8    2991 4763707  1       127   0.0796  0.777865    
## 9    2990 4756703  1      7004   4.4014  0.035994 *  
## 10   2989 4756701  1         3   0.0017  0.967529    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
agelims <- range(Wage$age)
age.grid <- seq(agelims[1], agelims[2])
preds <- predict(fit.04, newdata=list(age=age.grid), se=TRUE)
se.bands <- preds$fit + cbind(2*preds$se.fit, -2*preds$se.fit)
par(mfrow=c(1,1), mar=c(4.5,4.5,1,1), oma=c(0,0,4,0))
plot(Wage$age, Wage$wage, xlim=agelims, cex=0.5, col="darkgrey")
title("Degree 4 Polynomial Fit", outer=TRUE)
lines(age.grid, preds$fit, lwd=2, col="darkred")
matlines(age.grid, se.bands, lwd=1, col="red", lty=3)

(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the fit obtained.

cv.results <- rep(NA,10)
for (i in 2:10){
  Wage$age.cut <- cut(age, i)
  lm.fit <- glm(wage~age.cut, data=Wage)
  cv.results[i] <- cv.glm(Wage, lm.fit, K=10)$delta[1]
}
plot(2:10, cv.results[-1], xlab="Number of cuts", ylab="CV error", type="l")
min.point.x <- which.min(cv.results)
points(min.point.x, min(cv.results[-1]), col="red", cex=2, pch=20)

From the graph above we can see that the optimal number of cutoff points is 8.

lm.fit <- glm(wage ~ cut(age, 8), data = Wage)
lm.pred <- predict(lm.fit, newdata = list(age = age.grid))
plot(wage ~ age, data = Wage, col = "darkgrey")
lines(age.grid, lm.pred, col = "blue", lwd = 2)
title("8 Cut Step Function")

Question 10

This question relates to the College data set.

(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

attach(College)
summary(College)
##  Private        Apps           Accept          Enroll       Top10perc    
##  No :212   Min.   :   81   Min.   :   72   Min.   :  35   Min.   : 1.00  
##  Yes:565   1st Qu.:  776   1st Qu.:  604   1st Qu.: 242   1st Qu.:15.00  
##            Median : 1558   Median : 1110   Median : 434   Median :23.00  
##            Mean   : 3002   Mean   : 2019   Mean   : 780   Mean   :27.56  
##            3rd Qu.: 3624   3rd Qu.: 2424   3rd Qu.: 902   3rd Qu.:35.00  
##            Max.   :48094   Max.   :26330   Max.   :6392   Max.   :96.00  
##    Top25perc      F.Undergrad     P.Undergrad         Outstate    
##  Min.   :  9.0   Min.   :  139   Min.   :    1.0   Min.   : 2340  
##  1st Qu.: 41.0   1st Qu.:  992   1st Qu.:   95.0   1st Qu.: 7320  
##  Median : 54.0   Median : 1707   Median :  353.0   Median : 9990  
##  Mean   : 55.8   Mean   : 3700   Mean   :  855.3   Mean   :10441  
##  3rd Qu.: 69.0   3rd Qu.: 4005   3rd Qu.:  967.0   3rd Qu.:12925  
##  Max.   :100.0   Max.   :31643   Max.   :21836.0   Max.   :21700  
##    Room.Board       Books           Personal         PhD        
##  Min.   :1780   Min.   :  96.0   Min.   : 250   Min.   :  8.00  
##  1st Qu.:3597   1st Qu.: 470.0   1st Qu.: 850   1st Qu.: 62.00  
##  Median :4200   Median : 500.0   Median :1200   Median : 75.00  
##  Mean   :4358   Mean   : 549.4   Mean   :1341   Mean   : 72.66  
##  3rd Qu.:5050   3rd Qu.: 600.0   3rd Qu.:1700   3rd Qu.: 85.00  
##  Max.   :8124   Max.   :2340.0   Max.   :6800   Max.   :103.00  
##     Terminal       S.F.Ratio      perc.alumni        Expend     
##  Min.   : 24.0   Min.   : 2.50   Min.   : 0.00   Min.   : 3186  
##  1st Qu.: 71.0   1st Qu.:11.50   1st Qu.:13.00   1st Qu.: 6751  
##  Median : 82.0   Median :13.60   Median :21.00   Median : 8377  
##  Mean   : 79.7   Mean   :14.09   Mean   :22.74   Mean   : 9660  
##  3rd Qu.: 92.0   3rd Qu.:16.50   3rd Qu.:31.00   3rd Qu.:10830  
##  Max.   :100.0   Max.   :39.80   Max.   :64.00   Max.   :56233  
##    Grad.Rate     
##  Min.   : 10.00  
##  1st Qu.: 53.00  
##  Median : 65.00  
##  Mean   : 65.46  
##  3rd Qu.: 78.00  
##  Max.   :118.00
set.seed(1)

train <- sample(1: nrow(College), nrow(College)/2)
test <- -train
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary <- summary(fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
##          PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 3  ( 1 ) " "        " "  " "    " "    " "       " "       " "        
## 4  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 ) "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 ) "*"        " "  " "    " "    "*"       " "       " "        
##          P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 2  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 3  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 ) " "         "*"        " "   " "      " " " "      " "      
## 6  ( 1 ) " "         "*"        " "   " "      " " "*"      " "      
## 7  ( 1 ) " "         "*"        " "   "*"      " " "*"      " "      
## 8  ( 1 ) " "         "*"        " "   "*"      " " "*"      " "      
##          perc.alumni Expend Grad.Rate
## 1  ( 1 ) " "         " "    " "      
## 2  ( 1 ) "*"         " "    " "      
## 3  ( 1 ) "*"         "*"    " "      
## 4  ( 1 ) "*"         "*"    " "      
## 5  ( 1 ) "*"         "*"    "*"      
## 6  ( 1 ) "*"         "*"    "*"      
## 7  ( 1 ) "*"         "*"    "*"      
## 8  ( 1 ) "*"         "*"    "*"

I will use the following 6 predictors.

coef(fit, id = 6)
##   (Intercept)    PrivateYes    Room.Board      Terminal   perc.alumni 
## -4726.8810613  2717.7019276     1.1032433    36.9990286    59.0863753 
##        Expend     Grad.Rate 
##     0.1930814    33.8303314

(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.

gam.mod <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), data = College, subset = train)
par(mfrow = c(2,3))
plot(gam.mod, se = TRUE, col = 'blue')

(c) Evaluate the model obtained on the test set, and explain the results obtained.

preds <- predict(gam.mod, College[test, ])
RSS <- sum((College[test, ]$Outstate - preds)^2) # based on equation (3.16)
TSS <- sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS / TSS)   # R squared
## [1] 0.7652114

(d) For which variables, if any, is there evidence of a non-linear relationship with the response?

summary(gam.mod)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 
##     5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5), 
##     data = College, subset = train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -7270.32 -1115.76   -58.54  1231.45  7013.47 
## 
## (Dispersion Parameter for gaussian family taken to be 3666101)
## 
##     Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1323460787 on 360.9995 degrees of freedom
## AIC: 6993.591 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private             1 1780121359 1780121359 485.562 < 2.2e-16 ***
## s(Room.Board, 5)    1 1635038271 1635038271 445.988 < 2.2e-16 ***
## s(Terminal, 5)      1  281113708  281113708  76.679 < 2.2e-16 ***
## s(perc.alumni, 5)   1  351003562  351003562  95.743 < 2.2e-16 ***
## s(Expend, 5)        1  607193060  607193060 165.624 < 2.2e-16 ***
## s(Grad.Rate, 5)     1   89036829   89036829  24.287 1.267e-06 ***
## Residuals         361 1323460787    3666101                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df  Npar F     Pr(F)    
## (Intercept)                                    
## Private                                        
## s(Room.Board, 5)        4  2.0626    0.0852 .  
## s(Terminal, 5)          4  1.5754    0.1803    
## s(perc.alumni, 5)       4  0.4126    0.7996    
## s(Expend, 5)            4 21.2640 8.882e-16 ***
## s(Grad.Rate, 5)         4  0.8399    0.5006    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA for Nonparametric Effects tells us that Expend has strong non-linear relationshop with the Outstate.