#1.- BIBLIOTECAS
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
## Loading required package: rpart
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.2.3
#2.- DATOS
carseats_dat <- Carseats
skim_to_wide(carseats_dat)
## Warning: 'skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
| Name | Piped data |
| Number of rows | 400 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| ShelveLoc | 0 | 1 | FALSE | 3 | Med: 219, Bad: 96, Goo: 85 |
| Urban | 0 | 1 | FALSE | 2 | Yes: 282, No: 118 |
| US | 0 | 1 | FALSE | 2 | Yes: 258, No: 142 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Sales | 0 | 1 | 7.50 | 2.82 | 0 | 5.39 | 7.49 | 9.32 | 16.27 | ▁▆▇▃▁ |
| CompPrice | 0 | 1 | 124.97 | 15.33 | 77 | 115.00 | 125.00 | 135.00 | 175.00 | ▁▅▇▃▁ |
| Income | 0 | 1 | 68.66 | 27.99 | 21 | 42.75 | 69.00 | 91.00 | 120.00 | ▇▆▇▆▅ |
| Advertising | 0 | 1 | 6.64 | 6.65 | 0 | 0.00 | 5.00 | 12.00 | 29.00 | ▇▃▃▁▁ |
| Population | 0 | 1 | 264.84 | 147.38 | 10 | 139.00 | 272.00 | 398.50 | 509.00 | ▇▇▇▇▇ |
| Price | 0 | 1 | 115.80 | 23.68 | 24 | 100.00 | 117.00 | 131.00 | 191.00 | ▁▂▇▆▁ |
| Age | 0 | 1 | 53.32 | 16.20 | 25 | 39.75 | 54.50 | 66.00 | 80.00 | ▇▆▇▇▇ |
| Education | 0 | 1 | 13.90 | 2.62 | 10 | 12.00 | 14.00 | 16.00 | 18.00 | ▇▇▃▇▇ |
#3.- DATA PARTION TRAIN & TEST
partition <- createDataPartition(y= carseats_dat$Sales, p = 0.8, list = FALSE)
carseats.train <- carseats_dat[partition, ]
carseats.test <- carseats_dat[-partition, ]
rm(partition)
#4.- MODELO
set.seed(1234)
carseats.full_anova <- rpart(formula = Sales ~.,
data = carseats.train,
method = "anova",
xval = 10,
model = TRUE)
rpart.plot(carseats.full_anova, yesno = TRUE)
printcp(carseats.full_anova)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = carseats.train, method = "anova",
## model = TRUE, xval = 10)
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Income Price ShelveLoc
##
## Root node error: 2671.4/321 = 8.322
##
## n= 321
##
## CP nsplit rel error xerror xstd
## 1 0.263312 0 1.00000 1.00602 0.077768
## 2 0.104108 1 0.73669 0.74454 0.055035
## 3 0.051347 2 0.63258 0.69955 0.049496
## 4 0.050214 3 0.58123 0.71051 0.048805
## 5 0.034708 4 0.53102 0.66278 0.047736
## 6 0.028472 5 0.49631 0.64116 0.045501
## 7 0.026780 6 0.46784 0.62118 0.044831
## 8 0.026305 7 0.44106 0.61440 0.045354
## 9 0.017869 8 0.41475 0.57372 0.042780
## 10 0.016556 9 0.39689 0.58249 0.043348
## 11 0.015792 10 0.38033 0.56590 0.042447
## 12 0.015418 11 0.36454 0.55747 0.042075
## 13 0.012673 12 0.34912 0.55452 0.041938
## 14 0.011726 13 0.33645 0.55288 0.038873
## 15 0.011424 15 0.31300 0.54662 0.038381
## 16 0.010076 16 0.30157 0.54323 0.038714
## 17 0.010000 17 0.29150 0.54891 0.039089
plotcp(carseats.full_anova)
carseats.anova <- prune(carseats.full_anova,
cp = 0.039)
rpart.plot(carseats.anova, yesno = TRUE)
rm(carseats.full_anova)
carseats.anova.pred <- predict(carseats.anova, carseats.test, type = "vector")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
abline(0,1)
carseats.anova.rmse <- RMSE(pred = carseats.anova.pred,
obs = carseats.test$Sales)
head(carseats.anova.rmse)
## [1] 2.230808
rm(carseats.anova.pred)
carseats.anova2 = train(Sales ~ .,
data = carseats.train,
method = "rpart", # for classification tree
tuneLength = 5, # choose up to 5 combinations of tuning parameters (cp)
metric = "RMSE", # evaluate hyperparameter combinations with RMSE
trControl =trainControl(
method = "cv", # k-fold cross-validation
number = 10, # 10 folds
savePredictions = "final" # save predictions for the optimal tuning parameter
)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
carseats.anova2
## CART
##
## 321 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 289, 289, 289, 288, 289, 289, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.03470827 2.256611 0.3957323 1.845247
## 0.05021356 2.359277 0.3469020 1.908966
## 0.05134650 2.359277 0.3469020 1.908966
## 0.10410756 2.471116 0.2786895 2.023435
## 0.26331159 2.744439 0.1686525 2.229575
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.03470827.
plot(carseats.anova2)
carseats.anova.pred <- predict(carseats.anova2, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
carseats.anova.rmse2 <- RMSE(pred = carseats.anova.pred,
obs = carseats.test$Sales)
carseats.anova.rmse2
## [1] 2.230808
rm(carseats.anova.pred)
rpart.plot(carseats.anova2$finalModel)
plot(varImp(carseats.anova2), main="Variable Importance with Simple Regression")
myGrid <- expand.grid(cp = (0:2)/10)
carseats.anova3 = train(Sales ~ .,
data = carseats.train,
method = "rpart", # for classification tree
tuneGrid = myGrid, # choose up to 5 combinations of tuning parameters (cp)
metric = "RMSE", # evaluate hyperparamter combinations with RMSE
trControl = trainControl(
method = "cv", # k-fold cross validation
number = 10, # 10 folds
savePredictions = "final" # save predictions for the optimal tuning parameter
)
)
carseats.anova3
## CART
##
## 321 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 289, 289, 288, 289, 289, 289, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.0 2.213092 0.4541088 1.803401
## 0.1 2.404378 0.3177583 1.968127
## 0.2 2.471603 0.2720979 2.035684
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
plot(carseats.anova3)
carseats.anova.pred <- predict(carseats.anova3, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
carseats.anova.rmse3 <- RMSE(pred = carseats.anova.pred,
obs=carseats.test$Sales)
rm(carseats.anova.pred)
rpart.plot(carseats.anova3$finalModel)
plot(varImp(carseats.anova3), main = "Variable Importance with Simple Regression")
rbind(data.frame(model = "Manual ANOVA",
RMSE = round(carseats.anova.rmse , 5)),
data.frame(model = "Caret w/tuneLength",
RMSE = round(carseats.anova.rmse2, 5)),
data.frame(model = "Caret w/tuneGrid",
RMSE = round(carseats.anova.rmse3, 5))
)
## model RMSE
## 1 Manual ANOVA 2.23081
## 2 Caret w/tuneLength 2.23081
## 3 Caret w/tuneGrid 1.98004