library(boot)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
library(ISLR)
#Problems 6 and 10 from Chapter 7 of ISLR2
#Exercise 6
In this exercise, you will further analyze the Wage data set considered throughout this chapter.
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
attach(Wage)
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
library(boot)
set.seed(1)
delta <- rep(NA, 10)
for (i in 1:10) {
  fit <- glm(wage ~ poly(age, i), data = Wage)
  delta[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
plot(1:10, delta, xlab = "Degree", ylab = "Test MSE", type = "l")
d.min <- which.min(delta)
points(d.min, delta[d.min], col = "red", cex = 2, pch = 20)
From the plot above, we can see that cross-validation selects a polynomial of degree 9 as the optimal fit.
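To make this concrete, we can print the selected degree and the full vector of CV estimates (a quick sketch using the delta vector computed above):
d.min            # degree selected by 10-fold CV
round(delta, 1)  # CV error estimates for degrees 1 to 10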
set.seed(1)
fit.1 <- lm(wage ~ poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
fit.6 <- lm(wage ~ poly(age, 6), data = Wage)
fit.7 <- lm(wage ~ poly(age, 7), data = Wage)
fit.8 <- lm(wage ~ poly(age, 8), data = Wage)
fit.9 <- lm(wage ~ poly(age, 9), data = Wage)
fit.10 <- lm(wage ~ poly(age, 10), data = Wage)
anova(fit.1,fit.2,fit.3,fit.4,fit.5,fit.6,fit.7,fit.8,fit.9,fit.10)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Model 8: wage ~ poly(age, 8)
## Model 9: wage ~ poly(age, 9)
## Model 10: wage ~ poly(age, 10)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.7638 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.9005 0.001669 **
## 4 2995 4771604 1 6070 3.8143 0.050909 .
## 5 2994 4770322 1 1283 0.8059 0.369398
## 6 2993 4766389 1 3932 2.4709 0.116074
## 7 2992 4763834 1 2555 1.6057 0.205199
## 8 2991 4763707 1 127 0.0796 0.777865
## 9 2990 4756703 1 7004 4.4014 0.035994 *
## 10 2989 4756701 1 3 0.0017 0.967529
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the ANOVA results, the quadratic and cubic terms are highly significant, the degree-4 term is borderline, and the degree-9 term is also significant at the 5% level. A cubic polynomial (or possibly degree 9) therefore appears to be a reasonable fit.
Let us plot the fit using the 3rd-degree polynomial first.
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
fit <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
Now let’s plot the data using the 9th order polynomial.
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
fit <- lm(wage ~ poly(age, 9), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
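As an optional side-by-side comparison (a sketch reusing age.grid from above), both fits can be overlaid on one scatterplot:
plot(wage ~ age, data = Wage, col = "darkgrey")
preds.3 <- predict(lm(wage ~ poly(age, 3), data = Wage), newdata = list(age = age.grid))
preds.9 <- predict(lm(wage ~ poly(age, 9), data = Wage), newdata = list(age = age.grid))
lines(age.grid, preds.3, col = "red", lwd = 2)
lines(age.grid, preds.9, col = "blue", lwd = 2)
legend("topright", legend = c("Degree 3", "Degree 9"), col = c("red", "blue"), lwd = 2)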
(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
set.seed(1)
cvs <- rep(NA, 10)  # NA (not 0) so which.min() ignores the unused first slot
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  fit <- glm(wage ~ age.cut, data = Wage)
  cvs[i] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
plot(2:10, cvs[-1], xlab = "Cuts", ylab = "CV Error", type = "l")
d.min <- which.min(cvs)
points(d.min, cvs[d.min], col = "red", cex = 2, pch = 20)
The plot shows that the optimal number of cuts is 8.
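As a quick illustration (a sketch), cut() with 8 breaks splits age into eight equal-width intervals; tabulating it shows the interval boundaries and the number of observations in each:
table(cut(Wage$age, 8))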
plot(wage ~ age, data = Wage, col = "darkgrey")
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
fit <- lm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
#Exercise 10
This question relates to the College data set.
detach(Wage)
attach(College)
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
library(caret)
set.seed(1)
train.Index <- createDataPartition(Outstate, p=0.8, list = FALSE, times = 1)
College.train <- College[train.Index,]
College.test <- College[-train.Index,]
ctrl <- trainControl(method = "repeatedcv",
                     number = 10,
                     repeats = 1,
                     selectionFunction = "oneSE")
set.seed(1)
model_forward <- train(Outstate ~ .,
                       data = College.train,
                       method = "leapForward",
                       maximize = FALSE,
                       trControl = ctrl,
                       tuneGrid = data.frame(nvmax = 1:17))
model_forward
## Linear Regression with Forward Selection
##
## 623 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 560, 560, 562, 560, 561, 561, ...
## Resampling results across tuning parameters:
##
## nvmax RMSE Rsquared MAE
## 1 3169.576 0.4229785 2473.480
## 2 2673.444 0.5769138 2027.409
## 3 2315.210 0.6760545 1813.115
## 4 2119.253 0.7250021 1666.489
## 5 2051.198 0.7413953 1615.754
## 6 2011.108 0.7507054 1583.030
## 7 2032.862 0.7457135 1601.087
## 8 2054.518 0.7404374 1611.457
## 9 2054.324 0.7402399 1614.336
## 10 2033.989 0.7455431 1608.504
## 11 2036.540 0.7447230 1608.870
## 12 2033.428 0.7453446 1598.234
## 13 2022.145 0.7492188 1592.980
## 14 1999.300 0.7544045 1572.820
## 15 1996.559 0.7547738 1575.071
## 16 1994.824 0.7550069 1575.145
## 17 1995.677 0.7548594 1575.779
##
## RMSE was used to select the optimal model using the one SE rule.
## The final value used for the model was nvmax = 6.
Using forward selection with the one-standard-error rule, the model with nvmax = 6 is chosen: although the lowest cross-validated RMSE occurs at nvmax = 16, the RMSE at nvmax = 6 is within one standard error of that minimum, so the simpler six-variable model is preferred.
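To see how the rule arrives at nvmax = 6, caret's oneSE() helper can be applied directly to the resampling summary (a sketch; num is the number of resamples, here 10 folds x 1 repeat):
best_row <- oneSE(model_forward$results, metric = "RMSE", num = 10, maximize = FALSE)
model_forward$results[best_row, "nvmax"]  # size selected by the one-SE rule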
coef(model_forward$finalModel, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3748.7268770 2885.1884620 0.9448039 41.0276564 51.9906135
## Expend Grad.Rate
## 0.1997015 28.4932592
The predictors selected by forward stepwise selection for the final model are Private, Room.Board, PhD, perc.alumni, Expend, and Grad.Rate.
library(gam)
gam.model <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = College.train)
par(mfrow = c(2,3))
plot(gam.model, se = T, col = 'red')
The plots suggest a clearly non-linear relationship for Expend and perhaps a slight non-linearity for Grad.Rate; for the remaining predictors, a linear relationship appears adequate.
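One way to check this more formally (a sketch; gam.lin is an illustrative reduced model, not part of the original analysis) is to refit the GAM with linear terms for every predictor except Expend and compare it to the full smooth model with an F test:
gam.lin <- gam(Outstate ~ Private + Room.Board + PhD + perc.alumni + s(Expend) + Grad.Rate,
               data = College.train)
anova(gam.lin, gam.model, test = "F")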
Test MSE for the GAM model:
mean((predict(gam.model, newdata = College.test) - College.test$Outstate)^2)
## [1] 3816401
Test MSE for the model chosen by forward stepwise selection:
mean((predict(model_forward, newdata = College.test) - College.test$Outstate)^2)
## [1] 4960797
The GAM has a lower test MSE than the model chosen by forward stepwise selection.
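For easier comparison with the cross-validated RMSE reported by caret, the same test errors can be expressed as RMSEs (a sketch using the quantities computed above):
sqrt(mean((predict(gam.model, newdata = College.test) - College.test$Outstate)^2))     # GAM test RMSE
sqrt(mean((predict(model_forward, newdata = College.test) - College.test$Outstate)^2)) # forward selection test RMSE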
summary(gam.model)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = College.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7238.47 -1185.55 85.31 1280.35 4604.05
##
## (Dispersion Parameter for gaussian family taken to be 3411970)
##
## Null Deviance: 9778743994 on 622 degrees of freedom
## Residual Deviance: 2050592947 on 600.9997 degrees of freedom
## AIC: 11163.26
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2634772462 2634772462 772.21 < 2.2e-16 ***
## s(Room.Board) 1 1955517765 1955517765 573.13 < 2.2e-16 ***
## s(PhD) 1 756514727 756514727 221.72 < 2.2e-16 ***
## s(perc.alumni) 1 386996490 386996490 113.42 < 2.2e-16 ***
## s(Expend) 1 681419771 681419771 199.71 < 2.2e-16 ***
## s(Grad.Rate) 1 103860089 103860089 30.44 5.123e-08 ***
## Residuals 601 2050592947 3411970
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 1.9582 0.11910
## s(PhD) 3 1.3924 0.24407
## s(perc.alumni) 3 1.1768 0.31779
## s(Expend) 3 27.3661 < 2e-16 ***
## s(Grad.Rate) 3 2.3871 0.06803 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Examining the ANOVA results for the GAM model, the ANOVA for parametric effects shows that the linear component of every predictor is highly significant, so Room.Board, PhD, perc.alumni, Expend, and Grad.Rate all have at least a strong linear association with Outstate.
In the ANOVA for nonparametric effects, only Expend has a clearly significant p-value (Grad.Rate is marginal at about 0.07), so there is strong evidence of a non-linear relationship between Expend and Outstate. This matches the behavior we observed in the GAM plots.