6.

(a)

library(ISLR)
library(boot)
wageDf <- Wage
set.seed(4)
mse <- rep(NA, 10)
for(i in 1:10){
    fit <- glm(wage ~ poly(age, i), data = wageDf)  # degree-i polynomial fit
    mse[i] <- cv.glm(wageDf, fit, K = 10)$delta[1]  # 10-fold CV estimate of test MSE
}

plot(1:10, mse, xlab = "Degree", ylab = "Test MSE", type = "l")

points(which.min(mse), mse[which.min(mse)], col = "red", cex = 1, pch = 20)

min(mse)
## [1] 1594.591
fit1 <- lm(wage ~ age, data = wageDf)
fit2 <- lm(wage ~ poly(age, 2), data = wageDf)
fit3 <- lm(wage ~ poly(age, 3), data = wageDf)
fit4 <- lm(wage ~ poly(age, 4), data = wageDf)
fit5 <- lm(wage ~ poly(age, 5), data = wageDf)
fit6 <- lm(wage ~ poly(age, 6), data = wageDf)
anova(fit1, fit2, fit3, fit4, fit5, fit6)
## Analysis of Variance Table
## 
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.6636 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8936  0.001675 ** 
## 4   2995 4771604  1      6070   3.8117  0.050989 .  
## 5   2994 4770322  1      1283   0.8054  0.369565    
## 6   2993 4766389  1      3932   2.4692  0.116201    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(wage ~ age, data = wageDf, col = "lightgrey")
ageRange <- range(wageDf$age)
ageGrid <- seq(from = ageRange[1], to = ageRange[2])
fitPlot <- lm(wage ~ poly(age, 3), data = wageDf)
preds <- predict(fitPlot, newdata = list(age = ageGrid))
lines(ageGrid, preds, col = "darkblue", lwd = 3)

The first step is 10-fold cross-validation (\(K = 10\)) to choose the degree of the polynomial regression. Cross-validation identifies the 4th-degree polynomial as giving the lowest test MSE, 1594.591. Next, nested models are compared with ANOVA, where \(H_0\): the simpler model is adequate, and \(H_A\): the more complex model is needed. The ANOVA shows that the cubic term is significant (p = 0.0017) while the quartic term is only marginal (p = 0.051), so a 3rd-degree polynomial is used.
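Because poly() generates orthogonal polynomials, the ANOVA F-statistics can be cross-checked against the coefficient t-tests of the largest model, whose squared t-statistics equal the F-statistics above; a minimal sketch:

# Each squared t value for the poly(age, 6) terms reproduces the
# corresponding F statistic from the nested ANOVA above.
coef(summary(fit6))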

(b)

set.seed(4)
mse2 <- rep(NA, 10)
for(i in 2:10){
    wageDf$ageCut <- cut(wageDf$age, i)
    fit <- glm(wage ~ ageCut, data = wageDf)  # refit the step function for i cuts
    mse2[i] <- cv.glm(wageDf, fit, K = 10)$delta[1]
}

plot(2:10, mse2[-1], xlab = "Number of cuts", ylab = "Test MSE", type = "l")

points(which.min(mse2), mse2[which.min(mse2)], col = "green", cex = 3, pch = 21)

plot(wage ~ age, data = wageDf, col = "lightgrey")
ageRange2 <- range(wageDf$age)
ageGrid2 <- seq(from = ageRange2[1], to = ageRange2[2])
fit <- glm(wage ~ cut(age, 5), data = wageDf)
preds2 <- predict(fit, data.frame(age = ageGrid2))
lines(ageGrid2, preds2, col = "green", lwd = 4)
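The 5 intervals above are hard-coded; the CV winner can be substituted directly instead. A minimal sketch (with this seed the selected number of cuts may differ from 5):

bestCut <- which.min(mse2)  # number of intervals with the lowest CV error
fitBest <- glm(wage ~ cut(age, bestCut), data = wageDf)
lines(ageGrid2, predict(fitBest, data.frame(age = ageGrid2)), col = "red", lwd = 2, lty = 2)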

10.

(a)

library(leaps)
collegeDf <- College
set.seed(4)
train <- sample(NROW(collegeDf), NROW(collegeDf) / 2)
test <- -train

collegeDfTrain <- collegeDf[train,]
collegeDfTest <- collegeDf[test,]

col.fit <- regsubsets(Outstate ~ ., data = collegeDfTrain, nvmax = 17, method = "forward")

col.summary <- summary(col.fit)
par(mfrow = c(1, 3))
plot(col.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")

min.cp <- min(col.summary$cp)
std.cp <- sd(col.summary$cp)

abline(h = min.cp + 0.2 * std.cp, col = "blue", lty = 3)
abline(h = min.cp - 0.2 * std.cp, col = "blue", lty = 3)
plot(col.summary$bic, xlab = "Number of variables", ylab = "BIC", type='l')

min.bic <- min(col.summary$bic)
std.bic <- sd(col.summary$bic)

abline(h = min.bic + 0.2 * std.bic, col = "blue", lty = 3)
abline(h = min.bic - 0.2 * std.bic, col = "blue", lty = 3)
plot(col.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.84))
max.adjr2 <- max(col.summary$adjr2)
std.adjr2 <- sd(col.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "blue", lty = 3)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "blue", lty = 3)

Using Mallows' Cp, BIC, and adjusted \(R^2\), a 7-variable model is the smallest whose scores fall within the 0.2-standard-deviation bands around the optimum, so 7 predictors are selected.
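The same rule can be read off programmatically rather than from the plots; a minimal sketch using the statistics computed above:

# smallest model size whose score lands within 0.2 SD of the optimum
min(which(col.summary$cp <= min.cp + 0.2 * std.cp))
min(which(col.summary$bic <= min.bic + 0.2 * std.bic))
min(which(col.summary$adjr2 >= max.adjr2 - 0.2 * std.adjr2))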

# refit forward stepwise selection on the full data set and extract the 7-variable model
fit.sub <- regsubsets(Outstate ~ ., data = collegeDf, method = "forward")
coeffs <- coef(fit.sub, id = 7)
names(coeffs)
## [1] "(Intercept)" "PrivateYes"  "Room.Board"  "Personal"    "PhD"        
## [6] "perc.alumni" "Expend"      "Grad.Rate"

(b)

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
fit.gam <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                   s(Personal, df = 5) + s(perc.alumni, df = 2) + s(Expend, df = 5) +
                   s(Grad.Rate, df = 2), data = collegeDfTrain)
par(mfrow = c(2, 3))
plot(fit.gam, se = T, col = "green")

(c)

preds.gam <- predict(fit.gam, collegeDfTest)
err <- mean((collegeDfTest$Outstate - preds.gam)^2)  # test MSE
tss <- mean((collegeDfTest$Outstate - mean(collegeDfTest$Outstate))^2)
r2 <- 1 - err / tss  # test R-squared
r2
## [1] 0.7687917

The GAM achieves a test \(R^2\) of 0.769 using the 7 selected predictors.
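For context, an ordinary linear model on the same seven predictors can be scored on the same test set; a minimal sketch (the resulting \(R^2\) depends on the split, so no value is assumed here):

fit.lm <- lm(Outstate ~ Private + Room.Board + PhD + Personal + perc.alumni +
                 Expend + Grad.Rate, data = collegeDfTrain)
preds.lm <- predict(fit.lm, collegeDfTest)
1 - mean((collegeDfTest$Outstate - preds.lm)^2) / tss  # test R-squared for the linear baseline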

(d)

summary(fit.gam)
## 
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, 
##     df = 2) + s(Personal, df = 5) + s(perc.alumni, df = 2) + 
##     s(Expend, df = 5) + s(Grad.Rate, df = 2), data = collegeDfTrain)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -6458.28  -991.75   -30.68  1226.99  6136.38 
## 
## (Dispersion Parameter for gaussian family taken to be 3349851)
## 
##     Null Deviance: 6412929838 on 387 degrees of freedom
## Residual Deviance: 1232745324 on 368 degrees of freedom
## AIC: 6952.04 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                         Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private                  1 1559149971 1559149971 465.4385 < 2.2e-16 ***
## s(Room.Board, df = 2)    1 1363211397 1363211397 406.9468 < 2.2e-16 ***
## s(PhD, df = 2)           1  312174053  312174053  93.1904 < 2.2e-16 ***
## s(Personal, df = 5)      1   25314427   25314427   7.5569  0.006273 ** 
## s(perc.alumni, df = 2)   1  328652235  328652235  98.1095 < 2.2e-16 ***
## s(Expend, df = 5)        1  665241029  665241029 198.5882 < 2.2e-16 ***
## s(Grad.Rate, df = 2)     1   93190390   93190390  27.8193 2.278e-07 ***
## Residuals              368 1232745324    3349851                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                        Npar Df  Npar F    Pr(F)    
## (Intercept)                                        
## Private                                            
## s(Room.Board, df = 2)        1  6.0190 0.014613 *  
## s(PhD, df = 2)               1  2.5392 0.111911    
## s(Personal, df = 5)          4  2.4022 0.049524 *  
## s(perc.alumni, df = 2)       1  7.0110 0.008451 ** 
## s(Expend, df = 5)            4 22.2618 2.22e-16 ***
## s(Grad.Rate, df = 2)         1  1.1501 0.284236    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The ANOVA for nonparametric effects suggests that Room.Board, Personal, perc.alumni, and Expend have non-linear relationships with Outstate at the 5% level, with the evidence strongest for Expend; PhD and Grad.Rate show no significant departure from linearity.
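A single term's nonlinearity can also be tested directly by comparing a GAM in which that term enters linearly against the fitted model; a sketch for Expend, using the gam package's anova() method for fitted GAMs:

# fit the same model but with Expend as a linear term, then compare
fit.gam.lin <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) +
                       s(Personal, df = 5) + s(perc.alumni, df = 2) + Expend +
                       s(Grad.Rate, df = 2), data = collegeDfTrain)
anova(fit.gam.lin, fit.gam, test = "F")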