library(ISLR)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart.plot)
## Loading required package: rpart
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
carseats_dat <- Carseats
skim_to_wide(carseats_dat)
## Warning: 'skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
Data summary
| Name |
Piped data |
| Number of rows |
400 |
| Number of columns |
11 |
| _______________________ |
|
| Column type frequency: |
|
| factor |
3 |
| numeric |
8 |
| ________________________ |
|
| Group variables |
None |
Variable type: factor
| ShelveLoc |
0 |
1 |
FALSE |
3 |
Med: 219, Bad: 96, Goo: 85 |
| Urban |
0 |
1 |
FALSE |
2 |
Yes: 282, No: 118 |
| US |
0 |
1 |
FALSE |
2 |
Yes: 258, No: 142 |
Variable type: numeric
| Sales |
0 |
1 |
7.50 |
2.82 |
0 |
5.39 |
7.49 |
9.32 |
16.27 |
▁▆▇▃▁ |
| CompPrice |
0 |
1 |
124.97 |
15.33 |
77 |
115.00 |
125.00 |
135.00 |
175.00 |
▁▅▇▃▁ |
| Income |
0 |
1 |
68.66 |
27.99 |
21 |
42.75 |
69.00 |
91.00 |
120.00 |
▇▆▇▆▅ |
| Advertising |
0 |
1 |
6.64 |
6.65 |
0 |
0.00 |
5.00 |
12.00 |
29.00 |
▇▃▃▁▁ |
| Population |
0 |
1 |
264.84 |
147.38 |
10 |
139.00 |
272.00 |
398.50 |
509.00 |
▇▇▇▇▇ |
| Price |
0 |
1 |
115.79 |
23.68 |
24 |
100.00 |
117.00 |
131.00 |
191.00 |
▁▂▇▆▁ |
| Age |
0 |
1 |
53.32 |
16.20 |
25 |
39.75 |
54.50 |
66.00 |
80.00 |
▇▆▇▇▇ |
| Education |
0 |
1 |
13.90 |
2.62 |
10 |
12.00 |
14.00 |
16.00 |
18.00 |
▇▇▃▇▇ |
partition <- createDataPartition(y = carseats_dat$Sales, p = 0.8, list = FALSE)
carseats.train <- carseats_dat[partition, ]
carseats.test <- carseats_dat[-partition, ]
rm(partition)
set.seed(0725)
# Specify model = TRUE to handle plotting splits with factor variables.
carseats.full_anova <- rpart(formula = Sales ~ .,
data = carseats.train,
method = "anova",
xval = 10,
model = TRUE)
rpart.plot(carseats.full_anova, yesno = TRUE)

printcp(carseats.full_anova)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = carseats.train, method = "anova",
## model = TRUE, xval = 10)
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Income Population Price
## [7] ShelveLoc
##
## Root node error: 2545.6/321 = 7.9303
##
## n= 321
##
## CP nsplit rel error xerror xstd
## 1 0.249572 0 1.00000 1.00918 0.075278
## 2 0.102466 1 0.75043 0.75779 0.058253
## 3 0.056684 2 0.64796 0.68316 0.052016
## 4 0.048653 3 0.59128 0.66291 0.051455
## 5 0.042774 4 0.54262 0.65399 0.052269
## 6 0.029989 5 0.49985 0.62273 0.050640
## 7 0.021486 6 0.46986 0.60368 0.047149
## 8 0.017726 7 0.44838 0.60292 0.045379
## 9 0.016790 9 0.41292 0.60020 0.043976
## 10 0.015466 10 0.39613 0.58153 0.043447
## 11 0.014463 11 0.38067 0.57162 0.042612
## 12 0.013021 12 0.36620 0.55928 0.042178
## 13 0.012956 13 0.35318 0.56007 0.042104
## 14 0.012404 14 0.34023 0.55754 0.042310
## 15 0.011942 15 0.32782 0.55010 0.042199
## 16 0.010592 16 0.31588 0.55640 0.042214
## 17 0.010416 17 0.30529 0.54425 0.041110
## 18 0.010000 18 0.29487 0.54781 0.041209
plotcp(carseats.full_anova)

carseats.anova <- prune(carseats.full_anova,
cp = 0.039)
rpart.plot(carseats.anova, yesno = TRUE)

rm(carseats.full_anova)
carseats.anova.pred <- predict(carseats.anova, carseats.test, type = "vector")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")
abline(0, 1)

(carseats.anova.rmse <- RMSE(pred = carseats.anova.pred,
obs = carseats.test$Sales))
## [1] 2.247746
rm(carseats.anova.pred)
carseats.anova2 = train(Sales ~ .,
data = carseats.train,
method = "rpart", # for classification tree
tuneLength = 5, # choose up to 5 combinations of tuning parameters (cp)
metric = "RMSE", # evaluate hyperparamter combinations with RMSE
trControl = trainControl(
method = "cv", # k-fold cross validation
number = 10, # 10 folds
savePredictions = "final" # save predictions for the optimal tuning parameter
)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
carseats.anova2
## CART
##
## 321 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 289, 289, 288, 289, 289, 289, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.04277419 2.266994 0.3725934 1.787839
## 0.04865277 2.279546 0.3631046 1.796333
## 0.05668440 2.310869 0.3486055 1.831413
## 0.10246552 2.430971 0.2717422 1.958624
## 0.24957247 2.760834 0.1388108 2.207340
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.04277419.
plot(carseats.anova2)

carseats.anova.pred <- predict(carseats.anova2, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")

(carseats.anova.rmse2 <- RMSE(pred = carseats.anova.pred,
obs = carseats.test$Sales))
## [1] 2.236897
rm(oj.class.pred)
## Warning in rm(oj.class.pred): object 'oj.class.pred' not found
rpart.plot(carseats.anova2$finalModel)

plot(varImp(carseats.anova2), main="Variable Importance with Simple Regression")

myGrid <- expand.grid(cp = (0:2)/10)
carseats.anova3 = train(Sales ~ .,
data = carseats.train,
method = "rpart", # for classification tree
tuneGrid = myGrid, # choose up to 5 combinations of tuning parameters (cp)
metric = "RMSE", # evaluate hyperparamter combinations with RMSE
trControl = trainControl(
method = "cv", # k-fold cross validation
number = 10, # 10 folds
savePredictions = "final" # save predictions for the optimal tuning parameter
)
)
carseats.anova3
## CART
##
## 321 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 289, 289, 288, 289, 289, 289, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.0 2.094905 0.4849439 1.688322
## 0.1 2.393249 0.2810345 1.924768
## 0.2 2.430416 0.2598597 1.982809
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
plot(carseats.anova3)

carseats.anova.pred <- predict(carseats.anova3, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")

(carseats.anova.rmse3 <- RMSE(pred = carseats.anova.pred,
obs = carseats.test$Sales))
## [1] 1.918437
rm(carseats.anova.pred)
rpart.plot(carseats.anova3$finalModel)

plot(varImp(carseats.anova3), main="Variable Importance with Simple Regression")

rbind(data.frame(model = "Manual ANOVA",
RMSE = round(carseats.anova.rmse, 5)),
data.frame(model = "Caret w/tuneLength",
RMSE = round(carseats.anova.rmse2, 5)),
data.frame(model = "Caret w.tuneGrid",
RMSE = round(carseats.anova.rmse3, 5))
)
## model RMSE
## 1 Manual ANOVA 2.24775
## 2 Caret w/tuneLength 2.23690
## 3 Caret w.tuneGrid 1.91844