# assumes the ISLR, boot, leaps, and gam packages are loaded and the Wage data frame is attached
set.seed(1)
set.training <- sample(nrow(Wage), nrow(Wage) * .66)
wage.training <- Wage[set.training, ]
wage.test <- Wage[-set.training, ]
# validation-set MSE for polynomial degrees 1 through 5
wage.error <- rep(0, 5)
for (i in 1:5) {
  lm.wage.fit <- lm(wage ~ poly(age, i), data = wage.training)
  pred.wage.lm <- predict(lm.wage.fit, newdata = wage.test)
  wage.error[i] <- mean((wage.test$wage - pred.wage.lm)^2)
}
wage.error
## [1] 1886.737 1795.899 1794.159 1794.524 1798.749
plot(wage.error, type = 'b', xlab = 'Degree', ylab = 'Test MSE', main = 'Validation Set MSEs')
points(which.min(wage.error), wage.error[which.min(wage.error)], col = 'hotpink', pch = 20, cex = 2)
Using the validation-set approach (a 66/34 train/test split), the smallest test MSE belongs to the degree-3 model, with MSE = 1794.159.
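As a cross-check, the same degree selection could be done with 10-fold cross-validation on the full Wage data; a minimal sketch using cv.glm() from the boot package (results will differ slightly from the validation-set MSEs above; output not shown):
cv.errors <- rep(0, 5)
for (i in 1:5) {
  glm.wage.fit <- glm(wage ~ poly(age, i), data = Wage)  # glm() so cv.glm() can be applied
  cv.errors[i] <- cv.glm(Wage, glm.wage.fit, K = 10)$delta[1]
}
which.min(cv.errors)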
lm.wage.anova <- lm(wage ~ poly(age, 5), data = wage.training)
summary(lm.wage.anova)
##
## Call:
## lm(formula = wage ~ poly(age, 5), data = wage.training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.062 -23.822 -4.968 15.091 202.384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 110.7900 0.8675 127.709 < 2e-16 ***
## poly(age, 5)1 349.2509 38.6020 9.047 < 2e-16 ***
## poly(age, 5)2 -368.4715 38.6020 -9.545 < 2e-16 ***
## poly(age, 5)3 117.5607 38.6020 3.045 0.00235 **
## poly(age, 5)4 -77.6699 38.6020 -2.012 0.04435 *
## poly(age, 5)5 -62.6628 38.6020 -1.623 0.10468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 38.6 on 1974 degrees of freedom
## Multiple R-squared: 0.08735, Adjusted R-squared: 0.08504
## F-statistic: 37.79 on 5 and 1974 DF, p-value: < 2.2e-16
The ANOVA approach points to a 4th degree model. Because poly() produces orthogonal polynomials, the t-tests in the summary above are equivalent to sequential ANOVA F-tests comparing each model to the next-simpler one.
H0: The simpler (lower-degree) model is adequate
Ha: Adding the higher-degree term significantly improves the fit
The terms of degree one through four all have p-values below α = 0.05, so at each of those steps we reject the null hypothesis: each term up to degree 4 adds explanatory power. The degree-5 term has a p-value of 0.10468, so we fail to reject the null hypothesis that the degree-4 model is adequate. Therefore the 4th degree model is the best fit according to ANOVA.
It should be noted that the p-value for the degree-4 term (0.04435) is very close to α = 0.05, and the validation-set MSEs for the degree-3 and degree-4 models are nearly identical. The improvement from the 3rd to the 4th degree may therefore not be large enough to justify the more complicated model.
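The equivalence to a sequential ANOVA can also be checked directly with anova() on the nested fits; a minimal sketch (output not shown; the F statistics are the squares of the t values in the summary above):
fit.1 <- lm(wage ~ age, data = wage.training)
fit.2 <- lm(wage ~ poly(age, 2), data = wage.training)
fit.3 <- lm(wage ~ poly(age, 3), data = wage.training)
fit.4 <- lm(wage ~ poly(age, 4), data = wage.training)
fit.5 <- lm(wage ~ poly(age, 5), data = wage.training)
anova(fit.1, fit.2, fit.3, fit.4, fit.5)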
lm.wage.3 <- lm(wage ~ poly(age,3), data = Wage)
age.range <- range(age)
age.grid <- seq(from = age.range[1], to = age.range[2])
pred.wage <- predict(lm.wage.3, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(pred.wage$fit + 2 * pred.wage$se.fit, pred.wage$fit - 2*pred.wage$se.fit)
plot(age, wage, xlim = age.range, cex = 0.5, col = 'darkgrey')
title('Degree 3 Polynomial')
lines(age.grid, pred.wage$fit, lwd = 2, col = 'gold2')
matlines(age.grid, se.bands, lwd =.5, col = 'darkturquoise')
wage.errors.cv <- rep(NA, 10)
for (i in 2:10) {
  Wage$agecut <- cut(Wage$age, i)
  lm.wage.cut <- glm(wage ~ agecut, data = Wage)
  wage.errors.cv[i] <- cv.glm(Wage, lm.wage.cut, K = 10)$delta[2]
}
wage.errors.cv
## [1] NA 1734.280 1682.103 1635.397 1632.528 1621.638 1612.030 1599.777
## [9] 1613.488 1602.661
plot(2:10, wage.errors.cv[-1], xlab = 'Cuts', ylab = 'Error', type = 'b', pch = 20, lwd = 1)
title('Optimal Number of Cuts')
points(which.min(wage.errors.cv), wage.errors.cv[which.min(wage.errors.cv)], col = 'forestgreen', pch = 20, cex = 2)
The optimal number of cuts for this data set is k = 8, with a cross-validation error of 1599.777.
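For reference, a quick sketch of the eight intervals cut() creates and how many observations fall in each (output not shown):
table(cut(Wage$age, 8))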
lm.wage.kfcv <- glm(wage ~ cut(age, 8), data = Wage)
summary(lm.wage.kfcv)
##
## Call:
## glm(formula = wage ~ cut(age, 8), data = Wage)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -99.697 -24.552 -5.307 15.417 198.560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.282 2.630 29.007 < 2e-16 ***
## cut(age, 8)(25.8,33.5] 25.833 3.161 8.172 4.44e-16 ***
## cut(age, 8)(33.5,41.2] 40.226 3.049 13.193 < 2e-16 ***
## cut(age, 8)(41.2,49] 43.501 3.018 14.412 < 2e-16 ***
## cut(age, 8)(49,56.8] 40.136 3.177 12.634 < 2e-16 ***
## cut(age, 8)(56.8,64.5] 44.102 3.564 12.373 < 2e-16 ***
## cut(age, 8)(64.5,72.2] 28.948 6.042 4.792 1.74e-06 ***
## cut(age, 8)(72.2,80.1] 15.224 9.781 1.556 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1597.576)
##
## Null deviance: 5222086 on 2999 degrees of freedom
## Residual deviance: 4779946 on 2992 degrees of freedom
## AIC: 30652
##
## Number of Fisher Scoring iterations: 2
age.range.kfcv <- range(Wage$age)
age.grid.kfcv <- seq(from = age.range.kfcv[1], to = age.range.kfcv[2])
pred.wage.kfcv <- predict(lm.wage.kfcv, data.frame(age = age.grid.kfcv))
plot(age, wage, col = 'darkgrey')
lines(age.grid.kfcv, pred.wage.kfcv, col = 'darkviolet', lwd = 2)
detach(Wage)
set.seed(1)
set.college.training <- sample(nrow(College), nrow(College) * .66)
training.College <- College[set.college.training, ]
test.College <- College[-set.college.training, ]
fwd.college <- regsubsets(Outstate ~ ., data = training.College, nvmax = (ncol(College)-1) , method = 'forward')
fwd.college.summary <- summary(fwd.college)
par(mfrow = c(1, 3))
plot(fwd.college.summary$cp, xlab = 'Number of Variables', ylab = 'Cp',type = 'o')
points(which.min(fwd.college.summary$cp), fwd.college.summary$cp[which.min(fwd.college.summary$cp)], col = 'deeppink', cex = 2, pch = 20)
title('Best # of Variables: Cp')
abline(h = min(fwd.college.summary$cp) + (0.2 * sd(fwd.college.summary$cp)), col = 'darkorchid', lty = 2)
abline(h = min(fwd.college.summary$cp) - (0.2 * sd(fwd.college.summary$cp)), col = 'darkorchid', lty = 2)
plot(fwd.college.summary$bic, xlab = 'Number of Variables', ylab = 'BIC',type = 'o')
points(which.min(fwd.college.summary$bic), fwd.college.summary$bic[which.min(fwd.college.summary$bic)], col = 'deepskyblue4', cex = 2, pch = 20)
title('Best # of Variables: BIC')
abline(h = min(fwd.college.summary$bic) + (0.2 * sd(fwd.college.summary$bic)), col = 'aquamarine3', lty = 2)
abline(h = min(fwd.college.summary$bic) - (0.2 * sd(fwd.college.summary$bic)), col = 'aquamarine3', lty = 2)
plot(fwd.college.summary$adjr2, xlab = 'Number of Variables', ylab = 'Adjusted RSq',type = 'o')
points(which.max(fwd.college.summary$adjr2), fwd.college.summary$adjr2[which.max(fwd.college.summary$adjr2)], col = 'darkseagreen4', cex = 2, pch = 20)
title('Best # of Variables: Adj RSq')
abline(h = max(fwd.college.summary$adjr2) + 0.2 * sd(fwd.college.summary$adjr2), col = 'darkgoldenrod2', lty = 2)
abline(h = max(fwd.college.summary$adjr2) - (0.2 * sd(fwd.college.summary$adjr2)), col = 'darkgoldenrod2', lty = 2)
print('Best Cp')
## [1] "Best Cp"
which.min(fwd.college.summary$cp)
## [1] 12
print('Best BIC')
## [1] "Best BIC"
which.min(fwd.college.summary$bic)
## [1] 12
print('Best Adjusted RSq')
## [1] "Best Adjusted RSq"
which.max(fwd.college.summary$adjr2)
## [1] 13
Both Cp and BIC are minimized by 12-variable models, while adjusted R^2 favors 13 variables. However, the dashed bands in the plots (the optimum ± 0.2 standard deviations of each criterion) show the scores flattening out quickly: the smallest model falling near the optimum for all three criteria uses roughly 5 to 6 variables, so a 6-variable model is a reasonable parsimonious choice.
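One way to sanity-check that a roughly 6-variable model is sufficient is to compute the test-set MSE for each model size directly; a minimal sketch (regsubsets() has no predict() method, so predictions are built from the coefficient vectors; output not shown):
test.mat <- model.matrix(Outstate ~ ., data = test.College)
val.errors <- rep(NA, 17)
for (i in 1:17) {
  coefi <- coef(fwd.college, id = i)                      # coefficients of the i-variable model
  pred <- test.mat[, names(coefi)] %*% coefi              # test-set predictions
  val.errors[i] <- mean((test.College$Outstate - pred)^2) # test MSE for model size i
}
which.min(val.errors)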
max(fwd.college.summary$rsq)
## [1] 0.770329
fwd.college.summary$rsq[6]
## [1] 0.7489739
fwd.college.full <- regsubsets(Outstate ~ ., data = College, nvmax = ncol(College) - 1, method = 'forward')
coef(fwd.college.full, 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3553.2345268 2768.6347025 0.9679086 35.5283359 48.4221031
## Expend Grad.Rate
## 0.2210255 29.7119093
A 13-variable model (the adjusted-R^2 choice) has an R^2 of roughly 0.77, while the six-variable model already achieves an R^2 of about 0.75, so restricting to six predictors sacrifices very little explanatory power.
gam.college <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = training.College)
par(mfrow = c(2,3))
plot.Gam(gam.college, se = TRUE, col = 'darkseagreen4')
pred.college.gam <- predict.Gam(gam.college, newdata = test.College)
err.college.gam <- mean((test.College$Outstate - pred.college.gam)^2)
print('MSE')
## [1] "MSE"
err.college.gam
## [1] 3150000
# baseline spread of Outstate (a TSS-style quantity) used to turn the test MSE into a test R^2
tss.College <- mean((College$Outstate - mean(test.College$Outstate))^2)
rsq.College.gam <- 1 - err.college.gam / tss.College
print("R Sq")
## [1] "R Sq"
rsq.College.gam
## [1] 0.8066743
OLS, 13 variables: R^2 ≈ 0.77
OLS, 6 variables: R^2 ≈ 0.75
GAM, 6 predictors: R^2 ≈ 0.81
Using a GAM with the six best predictors we end up with an R^2 of 0.81 on the test set. This beats both the OLS model with 13 variables (harder to interpret and possibly overfit) and the OLS model with 6 variables, and the comparison is conservative: the OLS R^2 values above are training R^2, whereas the GAM's 0.81 is measured on held-out data.
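For a more direct apples-to-apples comparison, a minimal sketch of the test R^2 for a plain linear model with the same six predictors, fit on the same training split (output not shown):
lm.college.6 <- lm(Outstate ~ Private + Room.Board + PhD + perc.alumni + Expend + Grad.Rate, data = training.College)
pred.college.lm <- predict(lm.college.6, newdata = test.College)
err.college.lm <- mean((test.College$Outstate - pred.college.lm)^2)
1 - err.college.lm / mean((test.College$Outstate - mean(test.College$Outstate))^2)  # test R^2 for the linear fit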
summary(gam.college)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = training.College)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7554.29 -1164.44 54.37 1311.52 7757.12
##
## (Dispersion Parameter for gaussian family taken to be 3670985)
##
## Null Deviance: 8925069297 on 511 degrees of freedom
## Residual Deviance: 1798780854 on 489.9995 degrees of freedom
## AIC: 9215.884
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2265694003 2265694003 617.190 < 2.2e-16 ***
## s(Room.Board) 1 1787499460 1787499460 486.926 < 2.2e-16 ***
## s(PhD) 1 553207366 553207366 150.697 < 2.2e-16 ***
## s(perc.alumni) 1 397951072 397951072 108.404 < 2.2e-16 ***
## s(Expend) 1 684585013 684585013 186.485 < 2.2e-16 ***
## s(Grad.Rate) 1 91338430 91338430 24.881 8.481e-07 ***
## Residuals 490 1798780854 3670985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 2.7381 0.04293 *
## s(PhD) 3 1.1188 0.34095
## s(perc.alumni) 3 0.8140 0.48654
## s(Expend) 3 31.1130 < 2e-16 ***
## s(Grad.Rate) 3 1.8221 0.14218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA for nonparametric effects shows strong evidence of a non-linear relationship between the response and Expend (p < 2e-16), and weaker evidence of a non-linear relationship with Room.Board (p = 0.043). The remaining smooth terms show no significant departure from linearity.
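One way to test the Expend non-linearity formally is to compare the fitted GAM against a version in which Expend enters linearly; a minimal sketch using an F test (output not shown):
gam.college.lin <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + Expend + s(Grad.Rate), data = training.College)
anova(gam.college.lin, gam.college, test = 'F')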
detach(College)