In this exercise we further analyze the Wage data set considered throughout this chapter.
library(ISLR)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
attach(Wage)
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
(a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
ctrl = trainControl(method = "cv", number = 10)
cv.error = c()
set.seed(123)
for (i in 1:10){
  lm.fit = train(y = Wage$wage,
                 x = poly(Wage$age, i, raw = TRUE),
                 method = "lm",
                 metric = "RMSE",
                 trControl = ctrl)
  cv.error[i] = lm.fit$results$RMSE
}
data_frame = data.frame(degree = 1:10, CV_RMSE = cv.error)
data_frame
## degree CV_RMSE
## 1 1 40.95455
## 2 2 39.53766
## 3 3 39.78744
## 4 4 40.30738
## 5 5 39.84529
## 6 6 40.21556
## 7 7 40.39497
## 8 8 39.84910
## 9 9 39.78724
## 10 10 40.42590
min(cv.error)
## [1] 39.53766
data_frame %>%
  mutate(min_cv_error = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = degree, y = CV_RMSE)) +
  geom_line(col = "black") +
  geom_point(size = 2, aes(col = factor(min_cv_error))) +
  scale_x_continuous(breaks = seq(1, 10), minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "Polynomial degree with Cross-validation",
       x = "Degree",
       y = "CV RMSE")
The optimal degree \(d\) for the polynomial according to cross-validation is 2.
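As a cross-check on the caret loop above, the same degree selection can be run with boot::cv.glm, which reports the 10-fold CV estimate of the MSE rather than the RMSE. This is a minimal sketch (not part of the original output), assuming the boot package is installed.
library(boot)
set.seed(123)
cv.mse <- sapply(1:10, function(d) {
  glm.fit <- glm(wage ~ poly(age, d), data = Wage)  # gaussian glm is equivalent to lm
  cv.glm(Wage, glm.fit, K = 10)$delta[1]            # raw 10-fold CV estimate of the MSE
})
which.min(cv.mse)  # degree with the lowest CV error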
lm.fit.1 <- lm(wage ~ age, data = Wage)
lm.fit.2 <- lm(wage ~ poly(age, 2, raw = T), data = Wage)
lm.fit.3 <- lm(wage ~ poly(age, 3, raw = T), data = Wage)
lm.fit.4 <- lm(wage ~ poly(age, 4, raw = T), data = Wage)
lm.fit.5 <- lm(wage ~ poly(age, 5, raw = T), data = Wage)
anova(lm.fit.1, lm.fit.2, lm.fit.3, lm.fit.4, lm.fit.5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2, raw = T)
## Model 3: wage ~ poly(age, 3, raw = T)
## Model 4: wage ~ poly(age, 4, raw = T)
## Model 5: wage ~ poly(age, 5, raw = T)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
According to the ANOVA, the quadratic and cubic terms are clearly significant, the degree-4 term is borderline (p ≈ 0.051), and the degree-5 term is not needed. Hypothesis testing therefore points to a degree of 3 or 4, whereas cross-validation chose 2; since the CV RMSE values for degrees 2 through 4 are nearly identical, the two approaches are broadly consistent.
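Because poly() uses orthogonal polynomials by default, the same conclusions can be read directly from the coefficient t-tests of a single degree-5 fit (the squared t-statistics equal the ANOVA F-statistics). A quick sketch, output not shown:
fit5.orth <- lm(wage ~ poly(age, 5), data = Wage)  # orthogonal (default) polynomials
round(coef(summary(fit5.orth)), 4)                 # per-degree t-tests; t^2 matches the ANOVA F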
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ poly(x, 2, raw = T)") +
labs(title = "Polynomial Regression",
subtitle = "Predicting 'wage' with a 2nd Degree Polynomial of 'age'")
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ poly(x, 3, raw = T)") +
labs(title = "Polynomial Regression",
subtitle = "Predicting 'wage' with a 3er Degree Polynomial of 'age'")
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ poly(x, 4, raw = T)") +
labs(title = "Polynomial Regression",
subtitle = "Predicting 'wage' with a 4th Degree Polynomial of 'age'")
(b) Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
cv.error_step = c()
set.seed(123)
for (i in 2:20){
  lm.fit_step = train(y = Wage$wage,
                      x = data.frame(cut(Wage$age, i)),
                      method = "lm",
                      metric = "RMSE",
                      trControl = ctrl)
  cv.error_step[i-1] = lm.fit_step$results$RMSE
}
data_frame_step = data.frame(cuts = 2:20, CV_RMSE = cv.error_step)
data_frame_step
## cuts CV_RMSE
## 1 2 41.71835
## 2 3 40.52840
## 3 4 40.29139
## 4 5 40.77998
## 5 6 40.22279
## 6 7 40.44401
## 7 8 40.47687
## 8 9 40.03964
## 9 10 39.92958
## 10 11 40.50768
## 11 12 39.94227
## 12 13 39.71342
## 13 14 40.56430
## 14 15 39.83774
## 15 16 40.18459
## 16 17 40.08346
## 17 18 40.12602
## 18 19 40.01920
## 19 20 39.95358
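The smallest CV RMSE in this table occurs at 13 intervals, which is the number of cuts used below. A quick check:
data_frame_step$cuts[which.min(data_frame_step$CV_RMSE)]
## [1] 13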
table(cut(age, 13))
##
## (17.9,22.8] (22.8,27.5] (27.5,32.3] (32.3,37.1] (37.1,41.8] (41.8,46.6]
## 98 233 332 381 377 454
## (46.6,51.4] (51.4,56.2] (56.2,60.9] (60.9,65.7] (65.7,70.5] (70.5,75.2]
## 451 326 175 109 35 21
## (75.2,80.1]
## 8
data_frame_step %>%
  mutate(min_cv_error = as.numeric(min(CV_RMSE) == CV_RMSE)) %>%
  ggplot(aes(x = cuts, y = CV_RMSE)) +
  geom_line(col = "black") +
  geom_point(size = 2, aes(col = factor(min_cv_error))) +
  scale_x_continuous(breaks = seq(2, 20), minor_breaks = NULL) +
  theme(legend.position = "none") +
  labs(title = "Cut points with Cross-validation",
       x = "Intervals",
       y = "CV RMSE")
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ cut(x, 13)") +
labs(title = "Step Function",
subtitle = "Predicting 'wage' with a 13-interval step function of 'age'")
This question relates to the College data set.
attach(College)
str(College)
## 'data.frame': 777 obs. of 18 variables:
## $ Private : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Apps : num 1660 2186 1428 417 193 ...
## $ Accept : num 1232 1924 1097 349 146 ...
## $ Enroll : num 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : num 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : num 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: num 2885 2683 1036 510 249 ...
## $ P.Undergrad: num 537 1227 99 63 869 ...
## $ Outstate : num 7440 12280 11250 12960 7560 ...
## $ Room.Board : num 3300 6450 3750 5450 4120 ...
## $ Books : num 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : num 2200 1500 1165 875 1500 ...
## $ PhD : num 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : num 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: num 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : num 7041 10527 8735 19016 10922 ...
## $ Grad.Rate : num 60 56 54 59 15 55 63 73 80 52 ...
(a) Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward step-wise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
set.seed(1)
trainIndex <- createDataPartition(College$Outstate, p = 0.7, list = FALSE)
train <- College[trainIndex, ]
test <- College[-trainIndex, ]
set.seed(42)
model_null <- lm(Outstate ~ 1, data = train)
model_full <- lm(Outstate ~ ., data = train)
model_forward <- step(model_null, direction = "forward", scope = formula(model_full), trace = 0)
summary(model_forward)
##
## Call:
## lm(formula = Outstate ~ Expend + Private + Room.Board + perc.alumni +
## PhD + Grad.Rate + Terminal + S.F.Ratio + Personal + Top10perc,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6357.4 -1257.9 -56.9 1285.6 5784.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.247e+03 8.707e+02 -2.581 0.010124 *
## Expend 1.417e-01 2.345e-02 6.043 2.84e-09 ***
## PrivateYes 2.801e+03 2.486e+02 11.265 < 2e-16 ***
## Room.Board 8.855e-01 9.706e-02 9.123 < 2e-16 ***
## perc.alumni 4.136e+01 8.703e+00 4.752 2.59e-06 ***
## PhD 2.583e+01 1.066e+01 2.423 0.015743 *
## Grad.Rate 2.433e+01 6.584e+00 3.695 0.000242 ***
## Terminal 2.473e+01 1.132e+01 2.185 0.029343 *
## S.F.Ratio -6.897e+01 3.032e+01 -2.275 0.023309 *
## Personal -3.028e-01 1.278e-01 -2.370 0.018148 *
## Top10perc 1.365e+01 6.962e+00 1.961 0.050401 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1918 on 535 degrees of freedom
## Multiple R-squared: 0.7725, Adjusted R-squared: 0.7682
## F-statistic: 181.6 on 10 and 535 DF, p-value: < 2.2e-16
model_forward
##
## Call:
## lm(formula = Outstate ~ Expend + Private + Room.Board + perc.alumni +
## PhD + Grad.Rate + Terminal + S.F.Ratio + Personal + Top10perc,
## data = train)
##
## Coefficients:
## (Intercept) Expend PrivateYes Room.Board perc.alumni PhD
## -2246.9583 0.1417 2800.7577 0.8855 41.3565 25.8289
## Grad.Rate Terminal S.F.Ratio Personal Top10perc
## 24.3273 24.7326 -68.9744 -0.3027 13.6529
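MASS is already loaded, so the same AIC-based forward search can be reproduced with MASS::stepAIC as a sanity check. A minimal sketch, output not shown:
model_forward_aic <- stepAIC(model_null, direction = "forward",
                             scope = formula(model_full), trace = 0)
formula(model_forward_aic)  # expected to match the step() model above, since step() also uses AIC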
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
car::vif(model_forward)
## Expend Private Room.Board perc.alumni PhD Grad.Rate
## 2.560131 1.893810 1.665366 1.736361 4.179037 1.826930
## Terminal S.F.Ratio Personal Top10perc
## 3.930903 1.927617 1.201035 2.279483
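All VIFs are below 5, so multicollinearity is not a serious concern here; the two largest values (PhD and Terminal) simply reflect that the share of faculty with a PhD and with a terminal degree move together, which can be checked directly:
cor(train$PhD, train$Terminal)  # pairwise correlation behind the two largest VIFs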
ctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 1,
selectionFunction = "oneSE")
set.seed(11)
forward <- train(Outstate ~ .,
                 data = train,
                 method = "leapForward",
                 metric = "RMSE",
                 trControl = ctrl,
                 tuneGrid = data.frame(nvmax = 1:17))
forward
## Linear Regression with Forward Selection
##
## 546 samples
## 17 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 491, 492, 494, 492, 491, 490, ...
## Resampling results across tuning parameters:
##
## nvmax RMSE Rsquared MAE
## 1 3145.790 0.4213291 2490.437
## 2 2542.574 0.6128139 1919.293
## 3 2318.515 0.6697868 1771.760
## 4 2140.920 0.7126656 1674.759
## 5 1991.787 0.7512806 1572.973
## 6 1949.028 0.7620367 1545.758
## 7 1964.222 0.7581387 1556.702
## 8 1960.076 0.7591031 1563.759
## 9 1943.231 0.7633189 1550.525
## 10 1925.727 0.7671333 1533.877
## 11 1928.084 0.7663182 1539.351
## 12 1932.577 0.7655672 1531.555
## 13 1905.470 0.7716630 1517.793
## 14 1897.805 0.7735142 1513.250
## 15 1902.198 0.7725648 1516.383
## 16 1904.340 0.7721352 1517.022
## 17 1904.639 0.7720839 1516.796
##
## RMSE was used to select the optimal model using the one SE rule.
## The final value used for the model was nvmax = 6.
coef(forward$finalModel, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3896.4692078 2996.1334413 0.9113605 46.2771415 50.9134890
## Expend Grad.Rate
## 0.1822738 28.3550059
(b) Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
library(gam)
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.20
model_gam1 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                    s(Expend) + s(Grad.Rate) + s(Terminal) + s(S.F.Ratio) +
                    s(Personal) + s(Top10perc), data = train)
par(mfrow = c(5, 2))
plot(model_gam1, se = T, col = "blue")
model_gam2 <- gam(Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
                    s(Expend) + s(Grad.Rate), data = train)
par(mfrow = c(2, 3))
plot(model_gam2, se = T, col = "blue")
anova(model_gam1, model_gam2, test = "F")
## Analysis of Deviance Table
##
## Model 1: Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate) + s(Terminal) + s(S.F.Ratio) + s(Personal) +
## s(Top10perc)
## Model 2: Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 508 1575791119
## 2 524 1693648333 -16 -117857213 2.3747 0.002003 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The F-test favours the larger model on the training data, but we keep the more parsimonious model_gam2, which uses the six predictors chosen by forward selection, and evaluate it on the test set in part (c).
summary(model_gam2)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(PhD) + s(perc.alumni) +
## s(Expend) + s(Grad.Rate), data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6980.56 -1188.05 20.76 1261.21 4694.53
##
## (Dispersion Parameter for gaussian family taken to be 3232157)
##
## Null Deviance: 8648835880 on 545 degrees of freedom
## Residual Deviance: 1693648333 on 523.9995 degrees of freedom
## AIC: 9756.834
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2529911179 2529911179 782.732 < 2.2e-16 ***
## s(Room.Board) 1 1638249614 1638249614 506.860 < 2.2e-16 ***
## s(PhD) 1 643605089 643605089 199.126 < 2.2e-16 ***
## s(perc.alumni) 1 347392831 347392831 107.480 < 2.2e-16 ***
## s(Expend) 1 643202403 643202403 199.001 < 2.2e-16 ***
## s(Grad.Rate) 1 82045608 82045608 25.384 6.476e-07 ***
## Residuals 524 1693648333 3232157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 2.4737 0.06083 .
## s(PhD) 3 1.1030 0.34739
## s(perc.alumni) 3 0.7430 0.52676
## s(Expend) 3 27.5206 2.22e-16 ***
## s(Grad.Rate) 3 1.7663 0.15257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(c) Evaluate the model obtained on the test set, and explain the results obtained.
model_pred <- predict(model_gam2, newdata = test)
mean((model_pred - test$Outstate)^2)
## [1] 4224718
test_TSS <- sum((test$Outstate - mean(test$Outstate))^2)
test_RSS <- sum((model_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7504265
forward_pred <- predict(forward, newdata = test)
mean((forward_pred - test$Outstate)^2)
## [1] 5073114
forward_test_RSS <- sum((forward_pred - test$Outstate)^2)
1 - forward_test_RSS/test_TSS
## [1] 0.7003078
The GAM (model_gam2) performs better than the forward-selection linear model: its test MSE is lower (about 4.22 million vs. 5.07 million) and its test \(R^2\) is higher (0.750 vs. 0.700).
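For comparability with the CV RMSE values reported during model selection, the test errors can also be expressed as RMSEs (a quick sketch; roughly 2055 for the GAM and 2252 for the linear model, given the MSEs above):
sqrt(mean((model_pred - test$Outstate)^2))    # GAM test RMSE
sqrt(mean((forward_pred - test$Outstate)^2))  # forward-selection linear model test RMSE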
(d) For which variables, if any, is there evidence of a non-linear relationship with the response?
Based on summary(model_gam2), the "Anova for Nonparametric Effects" table shows a highly significant smooth term for Expend and a marginally significant one for Room.Board, while the smooth terms for PhD, perc.alumni and Grad.Rate are not significant. A new model is fit that keeps the smooth terms for only these two variables, to check whether there is evidence of a non-linear relationship with the response.
model_gam3 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + s(Expend) + Grad.Rate, data = train)
model_gam3_pred <- predict(model_gam3, newdata = test)
mean((model_gam3_pred - test$Outstate)^2)
## [1] 4234908
test_RSS <- sum((model_gam3_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7498245
par(mfrow = c(2, 3))
plot(model_gam3, se = T, col = "blue")
model_gam4 <- gam(Outstate ~ Private + s(Room.Board) + PhD + perc.alumni + Expend + Grad.Rate, data = train)
model_gam4_pred <- predict(model_gam4, newdata = test)
mean((model_gam4_pred - test$Outstate)^2)
## [1] 5056435
test_RSS <- sum((model_gam4_pred - test$Outstate)^2)
1 - test_RSS/test_TSS
## [1] 0.7012932
par(mfrow = c(2, 3))
plot(model_gam4, se = T, col = "blue")
Removing the smoothing spline on Expend as well (model_gam4) causes test performance to drop noticeably: the test MSE rises from about 4.23 million to 5.06 million and \(R^2\) falls from 0.750 to 0.701, whereas replacing the other smooth terms with linear ones (model_gam3) barely changed performance. This indicates a clearly non-linear relationship between Expend and the response, and at most weak evidence of non-linearity (p ≈ 0.06 for the nonparametric effect) for Room.Board.
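A formal version of the same check compares model_gam4 (linear Expend) with model_gam3 (smooth Expend) via an F-test on the training data. A minimal sketch, output not shown:
anova(model_gam4, model_gam3, test = "F")  # a significant F here supports a non-linear effect of Expend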