Wage
data set considered throughout this chapter.##
## Attaching package: 'boot'
## The following object is masked from 'package:openintro':
##
## salinity
attach(Wage)
set.seed(15)
cv.error <- rep(0,5)
for (i in 1:5){
glm.fit <- glm(wage ~ poly(age,i),data=Wage)
cv.error[i]<- cv.glm(Wage,glm.fit,K=10)$delta[1]
}
cv.error## [1] 1676.192 1599.786 1596.256 1593.755 1595.106
plot(cv.error, type="b", xlab="Degree", ylab="Test MSE")
points(which.min(cv.error), cv.error[4], col="red", pch=20, cex=2)The optimal degree chosen by cross-validation is 4.
fit_1 <- lm(wage ~ age, data=Wage)
fit_2 <- lm(wage ~ poly(age, 2), data=Wage)
fit_3 <- lm(wage ~ poly(age, 3), data=Wage)
fit_4 <- lm(wage ~ poly(age, 4), data=Wage)
fit_5 <- lm(wage ~ poly(age, 5), data=Wage)
anova(fit_1, fit_2, fit_3, fit_4, fit_5)## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value indicates that either a quadratic or cubic fit provide a reasonable fit to the data. A degree fit of 4 is sufficient as well but the previously mentioned degrees are better suited for this model.
age_lim <- range(age)
age_grid <- seq(from=age_lim[1], to=age_lim[2])
preds <- predict(fit_4, newdata=list(age=age_grid),se=TRUE)
se_bands <- cbind(preds$fit+2*preds$se.fit,preds$fit-2*preds$se.fit)
plot(age, wage, xlim=age_lim, cex=.5, col="darkgrey")
lines(age_grid, preds$fit, lwd=2, col="blue")
matlines(age_grid, se_bands, lwd=1, col="blue", lty=3)wage using age, and perform
cross-validation to choose the optimal number of cuts. Make a plot of
the fit obtained.set.seed(2)
cv.errors <- rep(NA, 10)
for(i in 2:10){
Wage$age.cut <- cut(Wage$age,i)
glm.fit <- glm(wage ~ age.cut, data=Wage)
cv.errors[i] <- cv.glm(Wage, glm.fit, K=10)$delta[1]
}
cv.errors## [1] NA 1734.448 1681.623 1635.869 1632.864 1621.643 1613.148 1601.850
## [9] 1610.852 1603.607
plot(2:10, cv.errors[-1], type="b", xlab="Number of cuts", ylab="Test MSE")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col="red", pch=20, cex=2)The optimal number of cuts is 8.
fit_step <- glm(wage ~ cut(age, 8), data=Wage)
preds <- predict(fit_step, data.frame(age = age_grid))
plot(age, wage, col="darkgray")
lines(age_grid, preds, col="darkgreen", lwd=2)College data set.attach(College)
set.seed(5)
test_sample <- sample(1:nrow(College), nrow(College)/4)
train <- College[-test_sample, ]
test <- College[test_sample, ]
library(leaps)
fwd <- regsubsets(Outstate ~ ., data=train, nvmax=17, method='forward')
fwd_sum <- summary(fwd)
par(mfrow=c(2,2))
plot(fwd_sum$cp ,xlab="Number of Variables ", ylab="Cp",
type="b")
points(which.min(fwd_sum$cp), fwd_sum$cp[which.min(fwd_sum$cp)], col="red",
cex=2, pch=20)
plot(fwd_sum$bic ,xlab="Number of Variables ",
ylab="BIC",type="b")
points(which.min(fwd_sum$bic), fwd_sum$bic[which.min(fwd_sum$bic)], col="red",
cex=2, pch=20)
plot(fwd_sum$adjr2 ,xlab="Number of Variables ",
ylab="Adjusted R^2^",type="b")
points(which.max(fwd_sum$adjr2), fwd_sum$adjr2[which.max(fwd_sum$adjr2)],
col="red", cex=2, pch=20)
which.min(fwd_sum$cp)## [1] 13
## [1] 11
## [1] 14
test_matrix <- model.matrix(Outstate~., data=test)
val.errors <- rep(NA,17)
for(i in 1:17){
coefi <- coef(fwd,id=i)
pred <- test_matrix[,names(coefi)]%*%coefi
val.errors[i] <- mean((test$Outstate-pred)^2)
}
which.min(val.errors)## [1] 6
The best model selected using the forward stepwise method is one that has 12 variables, but by looking at the plot, we see that a simpler model could be enough, as the test MSE doesn’t decrease significantly after 6 variables.
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3553.2345268 2768.6347025 0.9679086 35.5283359 48.4221031
## Expend Grad.Rate
## 0.2210255 29.7119093
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.22-5
library(splines)
gam_fit <- gam(Outstate ~ Private + s(Room.Board, 3) + s(Terminal, 3) + s(perc.alumni, 3) + s(Expend, 3) + s(Grad.Rate, 3), data=train)
par(mfrow=c(2,3))
plot(gam_fit, se=TRUE, col="blue")preds <- predict(gam_fit, newdata = test)
error <- mean((test$Outstate-preds)^2)
val.errors[6]-error## [1] 388542.3
The GAM model does better than a simple linear model with 6 variables.
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 3) + s(Terminal,
## 3) + s(perc.alumni, 3) + s(Expend, 3) + s(Grad.Rate, 3),
## data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7067.838 -1138.361 4.964 1256.279 8163.609
##
## (Dispersion Parameter for gaussian family taken to be 3646032)
##
## Null Deviance: 9466747723 on 582 degrees of freedom
## Residual Deviance: 2063655284 on 566.0003 degrees of freedom
## AIC: 10481.86
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2545651515 2545651515 698.198 < 2.2e-16 ***
## s(Room.Board, 3) 1 1826513428 1826513428 500.959 < 2.2e-16 ***
## s(Terminal, 3) 1 507722090 507722090 139.253 < 2.2e-16 ***
## s(perc.alumni, 3) 1 341093757 341093757 93.552 < 2.2e-16 ***
## s(Expend, 3) 1 802720318 802720318 220.163 < 2.2e-16 ***
## s(Grad.Rate, 3) 1 141392931 141392931 38.780 9.253e-10 ***
## Residuals 566 2063655284 3646032
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 3) 2 2.506 0.08252 .
## s(Terminal, 3) 2 1.620 0.19880
## s(perc.alumni, 3) 2 0.997 0.36953
## s(Expend, 3) 2 45.483 < 2e-16 ***
## s(Grad.Rate, 3) 2 3.191 0.04188 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The model suggests a strong non-linear relationship between “Outstate” and “Expend”, and a moderate non-linear relationship between “Outstate” and “Grad.rate”.