library(ISLR)
library(boot)
wageDf <- Wage
set.seed(4)
mse <- rep(NA, 10)
for (i in 1:10) {
  fit <- glm(wage ~ poly(age, i), data = wageDf)
  mse[i] <- cv.glm(wageDf, fit, K = 10)$delta[1]
}
plot(1:10, mse, xlab = "Degree", ylab = "Test MSE", type = "l")
points(which.min(mse), mse[which.min(mse)], col = "red", cex = 1, pch = 20)
min(mse)
## [1] 1594.591
fit1 <- lm(wage ~ age, data = wageDf)
fit2 <- lm(wage ~ poly(age, 2), data = wageDf)
fit3 <- lm(wage ~ poly(age, 3), data = wageDf)
fit4 <- lm(wage ~ poly(age, 4), data = wageDf)
fit5 <- lm(wage ~ poly(age, 5), data = wageDf)
fit6 <- lm(wage ~ poly(age, 6), data = wageDf)
anova(fit1, fit2, fit3, fit4, fit5, fit6)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6636 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8936 0.001675 **
## 4 2995 4771604 1 6070 3.8117 0.050989 .
## 5 2994 4770322 1 1283 0.8054 0.369565
## 6 2993 4766389 1 3932 2.4692 0.116201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(wage ~ age, data = wageDf, col = "lightgrey")
ageRange <- range(wageDf$age)
ageGrid <- seq(from = ageRange[1], to = ageRange[2])
fitPlot <- lm(wage ~ poly(age, 3), data = wageDf)
preds <- predict(fitPlot, newdata = list(age = ageGrid))
lines(ageGrid, preds, col = "darkblue", lwd = 3)
The first step is 10-fold cross-validation (\(K = 10\)) to choose the degree of the polynomial regression. The cross-validation shows that a 4th-degree polynomial gives the lowest test MSE of 1594.591. Next, nested models are compared with ANOVA, where \(H_0\): the simpler model is adequate, versus \(H_A\): the more complex model is needed. In this ANOVA the cubic term is significant (p = 0.0017) while the quartic term falls just short of the 5% level (p = 0.051), so a 3rd-degree polynomial is used.
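As a quick cross-check (a small sketch reusing the fits above): because poly() generates orthogonal polynomials, the squared t-statistics from the largest model reproduce the F-statistics in the ANOVA table, so the same conclusions can be read from a single coefficient summary.
# Squared t-statistics here match the ANOVA F-statistics above,
# since poly() produces orthogonal polynomials.
coef(summary(fit6))
which.min(mse)  # degree minimizing the 10-fold CV error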
set.seed(4)
mse2 <- rep(NA, 10)
for (i in 2:10) {
  wageDf$ageCut <- cut(wageDf$age, i)
  fit <- glm(wage ~ ageCut, data = wageDf)  # refit with i intervals
  mse2[i] <- cv.glm(wageDf, fit, K = 10)$delta[1]
}
plot(2:10, mse2[-1], xlab = "Number of cuts", ylab = "Test MSE", type = "l")
points(which.min(mse2), mse2[which.min(mse2)], col = "green", cex = 3, pch = 21)
A step function with five intervals is then fit and plotted over the data.
plot(wage ~ age, data = wageDf, col = "lightgrey")
ageRange2 <- range(wageDf$age)
ageGrid2 <- seq(from = ageRange2[1], to = ageRange2[2])
fit <- glm(wage ~ cut(age, 5), data = wageDf)
preds2 <- predict(fit, data.frame(age = ageGrid2))
lines(ageGrid2, preds2, col = "green", lwd = 4)
library(leaps)
collegeDf <- College
set.seed(4)
train <- sample(nrow(collegeDf), nrow(collegeDf) / 2)
test <- -train
collegeDfTrain <- collegeDf[train,]
collegeDfTest <- collegeDf[test,]
col.fit <- regsubsets(Outstate ~ ., data = collegeDfTrain, nvmax = 17, method = "forward")
col.summary <- summary(col.fit)
par(mfrow = c(1, 3))
plot(col.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "l")
min.cp <- min(col.summary$cp)
std.cp <- sd(col.summary$cp)
abline(h = min.cp + 0.2 * std.cp, col = "blue", lty = 3)
abline(h = min.cp - 0.2 * std.cp, col = "blue", lty = 3)
plot(col.summary$bic, xlab = "Number of variables", ylab = "BIC", type='l')
min.bic <- min(col.summary$bic)
std.bic <- sd(col.summary$bic)
abline(h = min.bic + 0.2 * std.bic, col = "blue", lty = 3)
abline(h = min.bic - 0.2 * std.bic, col = "blue", lty = 3)
plot(col.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R2", type = "l", ylim = c(0.4, 0.84))
max.adjr2 <- max(col.summary$adjr2)
std.adjr2 <- sd(col.summary$adjr2)
abline(h = max.adjr2 + 0.2 * std.adjr2, col = "blue", lty = 3)
abline(h = max.adjr2 - 0.2 * std.adjr2, col = "blue", lty = 3)
Judging by Mallows' \(C_p\), BIC, and adjusted \(R^2\), a 7-variable model is the smallest whose scores fall within 0.2 standard deviations of each criterion's optimum, so seven predictors are selected.
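As a sanity check on reading the plots, the same 0.2-SD rule can be applied programmatically (a small sketch reusing the quantities computed above):
# Smallest model size whose Cp, BIC, and adjusted R^2 all fall within 0.2 SD
# of that criterion's optimum (Cp and BIC are minimized, adjusted R^2 maximized).
ok.cp <- which(col.summary$cp <= min.cp + 0.2 * std.cp)
ok.bic <- which(col.summary$bic <= min.bic + 0.2 * std.bic)
ok.adjr2 <- which(col.summary$adjr2 >= max.adjr2 - 0.2 * std.adjr2)
min(Reduce(intersect, list(ok.cp, ok.bic, ok.adjr2)))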
fit.sub <- regsubsets(Outstate ~ ., data = collegeDf, method = "forward")
coeffs <- coef(fit.sub, id = 7)
names(coeffs)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "Personal" "PhD"
## [6] "perc.alumni" "Expend" "Grad.Rate"
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.20
fit.gam <- gam(Outstate ~ Private + s(Room.Board, df = 2) + s(PhD, df = 2) + s(Personal, df = 5) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate, df = 2), data = collegeDfTrain)
par(mfrow = c(2, 3))
plot(fit.gam, se = TRUE, col = "green")
### (c)
preds.gam <- predict(fit.gam, collegeDfTest)
err <- mean((collegeDfTest$Outstate - preds.gam)^2)  # test MSE
tss <- mean((collegeDfTest$Outstate - mean(collegeDfTest$Outstate))^2)  # test variance
testR2 <- 1 - err/tss  # test-set R^2
testR2
## [1] 0.7687917
The GAM achieves \(R^2 = 0.769\) on the test set using 7 predictors.
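For reference, the quantity computed above is the test-set \(R^2\):

\[ R^2_{\text{test}} = 1 - \frac{\tfrac{1}{n}\sum_i (y_i - \hat{y}_i)^2}{\tfrac{1}{n}\sum_i (y_i - \bar{y})^2} = 1 - \frac{\text{test MSE}}{\text{test variance}}, \]

which is what testR2 evaluates, with the \(1/n\) factors cancelling.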
summary(fit.gam)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(Personal, df = 5) + s(perc.alumni, df = 2) +
## s(Expend, df = 5) + s(Grad.Rate, df = 2), data = collegeDfTrain)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6458.28 -991.75 -30.68 1226.99 6136.38
##
## (Dispersion Parameter for gaussian family taken to be 3349851)
##
## Null Deviance: 6412929838 on 387 degrees of freedom
## Residual Deviance: 1232745324 on 368 degrees of freedom
## AIC: 6952.04
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1559149971 1559149971 465.4385 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1363211397 1363211397 406.9468 < 2.2e-16 ***
## s(PhD, df = 2) 1 312174053 312174053 93.1904 < 2.2e-16 ***
## s(Personal, df = 5) 1 25314427 25314427 7.5569 0.006273 **
## s(perc.alumni, df = 2) 1 328652235 328652235 98.1095 < 2.2e-16 ***
## s(Expend, df = 5) 1 665241029 665241029 198.5882 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 93190390 93190390 27.8193 2.278e-07 ***
## Residuals 368 1232745324 3349851
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 6.0190 0.014613 *
## s(PhD, df = 2) 1 2.5392 0.111911
## s(Personal, df = 5) 4 2.4022 0.049524 *
## s(perc.alumni, df = 2) 1 7.0110 0.008451 **
## s(Expend, df = 5) 4 22.2618 2.22e-16 ***
## s(Grad.Rate, df = 2) 1 1.1501 0.284236
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA for nonparametric effects suggests that Room.Board, Personal, perc.alumni, and Expend all have statistically significant non-linear relationships with Outstate, with the strongest evidence for Expend; there is no evidence of non-linearity for PhD or Grad.Rate.
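One natural follow-up (a sketch, not part of the analysis above): refit the GAM with linear terms for PhD and Grad.Rate, whose nonparametric effects were not significant, and test whether the smaller model fits significantly worse.
# Hypothetical comparison: smooths replaced by linear terms where the
# nonparametric ANOVA found no evidence of non-linearity.
fit.gam.lin <- gam(Outstate ~ Private + s(Room.Board, df = 2) + PhD + s(Personal, df = 5) + s(perc.alumni, df = 2) + s(Expend, df = 5) + Grad.Rate, data = collegeDfTrain)
anova(fit.gam.lin, fit.gam, test = "F")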