Load Libraries and attach data sets
library(ISLR2)
library(caret)
library(gam)
library(leaps)
library(pander)
library(boot)
library(tidyverse)
attach(Wage)
attach(College)
Wage data set considered throughout this chapter.wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.Using the Polynomial Regression with cross-validation, the degree with the lowest error was 7; however, further analysis of the plot shows that a degree of 4 has a very similar error value and would result in a less complex model. So 4 would be my choice as the optimal degree using this Polynomial Regression.
Using the ANOVA model, a degree of 3 would cause you to reject the Null hypothesis while 4 is right on the cusp. In this case, I would choose to reject the null hypothesis for a degree of 4 and choose it as the optimal degree in the ANOVA model as well.
Fit Polynomial Regression
set.seed(22)
d <- 10
cv.err <- rep(NA, d)
for (i in 1:d){
fit <- glm(wage~poly(age,i),data=Wage)
cv.err[i] <- cv.glm(Wage,fit,K=d)$delta[1]
}
d.min <- which.min(cv.err)
Plot Optimal Degree from Polynomial Regression
plot(cv.err, xlab = 'Degree', ylab = 'MSE', type = 'l', col = 'light Blue')
points(c(4,d.min), c(cv.err[d.min],cv.err[d.min]), col = c('Dark Green','Dark Blue'), bg = c('Dark Green','Dark Blue'), pch = 21, fill = T, cex = 1)
text(c(4,7), c(1595,1595), pos = 3, labels = c('Optimal Degree','Minimum CV Error'), col = c('Dark Green','Dark Blue'))
ANOVA Test
set.seed(22)
fit1=lm(wage~poly(age,1),data=Wage)
fit2=lm(wage~poly(age,2),data=Wage)
fit3=lm(wage~poly(age,3),data=Wage)
fit4=lm(wage~poly(age,4),data=Wage)
fit5=lm(wage~poly(age,5),data=Wage)
fit6=lm(wage~poly(age,6),data=Wage)
Print Anova Table
pander(anova(fit1, fit2, fit3, fit4, fit5, fit6))
| Res.Df | RSS | Df | Sum of Sq | F | Pr(>F) |
|---|---|---|---|---|---|
| 2998 | 5022216 | NA | NA | NA | NA |
| 2997 | 4793430 | 1 | 228786 | 143.7 | 2.29e-32 |
| 2996 | 4777674 | 1 | 15756 | 9.894 | 0.001675 |
| 2995 | 4771604 | 1 | 6070 | 3.812 | 0.05099 |
| 2994 | 4770322 | 1 | 1283 | 0.8054 | 0.3696 |
| 2993 | 4766389 | 1 | 3932 | 2.469 | 0.1162 |
Plot of Fit to Data
plot(wage ~ age, data = Wage, col = 'dark grey')
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
preds <- predict(fit4, newdata = list(age = age.grid), se = T)
se.bands=cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
lines(age.grid, preds$fit, col = 'dark blue', lwd = 2)
matlines(age.grid,se.bands,lwd=1,col='light blue',lty=3)
wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.Using cross-validation, the optimal number of cuts is 8.
Fit Step Function
set.seed(22)
for (i in 2:d) {
Wage$age.cut <- cut(Wage$age, i)
fit <- glm(wage ~ age.cut, data = Wage)
cv.err[i] <- cv.glm(Wage, fit)$delta[1]
}
d.min <- which.min(cv.err)
#Optimal number of cuts:
d.min
## [1] 8
Plot of Fit to Data
plot(wage ~ age, data = Wage, col = 'darkgrey')
fit <- glm(wage ~ cut(age, d.min), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = 'dark blue', lwd = 2)
College data set.Looking at the plots of the forward regression (below), it appears that the optimal number of predictors are arguably between 5 to 6. In this case, after evaluating the curves, I selected 6 as the optimal number of predictors. I will note however, that running the model with the top 5 predictors (excluding Grad.Rate) results in only a slightly lower \(R^{2}\), which might arguably be worth the reduction in model complexity. I elected to stay with 6, as that is what the plots indicated to be best.
Split Data into Train & Test
set.seed(22)
in_train <- sample(nrow(College) * 0.75)
train <- College[in_train, ]
test <- College[-in_train, ]
Perform Stepwise selection on the training set
set.seed(22)
fwd_step <- regsubsets(Outstate ~ ., data = train, method = 'forward')
fwd_step_summary <- summary(fwd_step)
fwd_step_summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train, method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" "*" " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
Create 3 plots to determine best model
par(mfrow=c(1, 3))
#Plot 1
plot(fwd_step_summary$cp, xlab = '# of Predictors', ylab = 'Cp', type = 'l')
min_cp <- min(fwd_step_summary$cp)
std_cp <- sd(fwd_step_summary$cp)
abline(h = min_cp + std_cp, col = 'red', lty = 2)
#Plot 2
plot(fwd_step_summary$bic,xlab = '# of Predictors', ylab = 'BIC', type = 'l')
min_bic <- min(fwd_step_summary$bic)
std_bic <- sd(fwd_step_summary$bic)
abline(h = min_bic + std_bic, col = 'red', lty=2)
#Plot 3
plot(fwd_step_summary$adjr2, xlab = '# of Predictors', ylab = 'Adjusted R2', type = 'l')
max_adjr2 <- max(fwd_step_summary$adjr2)
std_adjr2 <- sd(fwd_step_summary$adjr2)
abline(h=max_adjr2 - std_adjr2, col = 'red', lty = 2)
Plotting the results with a smoothing spline using 4 degrees of freedom shows that there are are not linear relationships between most of the predictors and the response; however, it does appear that there might be a slightly more linear relationships with perc.alumni and Room.Board than the others.
List Optimal (Top 6) predictors
predictors <- (names(coef(fwd_step, 6)))[2:7]
predictors
## [1] "PrivateYes" "Room.Board" "PhD" "perc.alumni" "Expend"
## [6] "Grad.Rate"
Train and plot a GAM
coefs <- names(coef(fwd_step, 6))
gam_fit <- gam(Outstate ~ Private +
s(Room.Board, 4) +
s(PhD, 4) +
s(perc.alumni, 4) +
s(Expend, 4) +
s(Grad.Rate, 4),
data = train)
gam_fit_summary <- (summary(gam_fit))
par(mfrow=c(2, 3))
plot(gam_fit, se = TRUE, col = 'dark blue')
The model has an \(R^{2}\) of .76, which means that approximately 76% of the variance in Outstate can be explained by the six predictors Private, Room.Board, PhD, perc.alumni, Expend, and Grad.Rate.
Evaluate Model using R2
gam_pred <- predict(gam_fit, test)
gam_err <- mean((test$Outstate - gam_pred)^2)
gam_tss <- mean((test$Outstate - mean(test$Outstate))^2)
gam_r2 <- 1 - gam_err / gam_tss
gam_r2
## [1] 0.7588494
The Anova for Nonparametric Effects (in the summary below) shows that there is a very significant (alpha = 0) non-linear relationship between Expend and Outstate (the response). It also shows that there is a significant (alpha = .05) non-linear relationship between the response and PhD as well as Grad.Rate. As a result, if we remove smoothing for perc.alumni, we wind up with a very slight increase in our \(R^{2}\) (.7598 vice .7588).
Print a Summary of the Model
gam_fit_summary
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 4) + s(PhD,
## 4) + s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4),
## data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6855.38 -1064.53 16.12 1242.89 7131.43
##
## (Dispersion Parameter for gaussian family taken to be 3279494)
##
## Null Deviance: 9261203461 on 581 degrees of freedom
## Residual Deviance: 1836516009 on 559.9998 degrees of freedom
## AIC: 10407.08
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2167242895 2167242895 660.847 < 2.2e-16 ***
## s(Room.Board, 4) 1 1783298982 1783298982 543.773 < 2.2e-16 ***
## s(PhD, 4) 1 660457576 660457576 201.390 < 2.2e-16 ***
## s(perc.alumni, 4) 1 382401587 382401587 116.604 < 2.2e-16 ***
## s(Expend, 4) 1 852532186 852532186 259.959 < 2.2e-16 ***
## s(Grad.Rate, 4) 1 119779828 119779828 36.524 2.754e-09 ***
## Residuals 560 1836516009 3279494
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 4) 3 3.121 0.02564 *
## s(PhD, 4) 3 2.947 0.03235 *
## s(perc.alumni, 4) 3 1.256 0.28865
## s(Expend, 4) 3 39.022 < 2e-16 ***
## s(Grad.Rate, 4) 3 3.080 0.02708 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Train without smoothing per.alumni
gam_fit2 <- gam(Outstate ~ Private +
s(Room.Board, 4) +
s(PhD, 4) +
perc.alumni +
s(Expend, 4) +
s(Grad.Rate, 4),
data = train)
gam2_pred <- predict(gam_fit2, test)
gam2_err <- mean((test$Outstate - gam2_pred)^2)
gam2_tss <- mean((test$Outstate - mean(test$Outstate))^2)
gam2_r2 <- 1 - gam2_err / gam2_tss
gam2_r2
## [1] 0.7598218