#1.- BIBLIOTECAS
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
## Loading required package: rpart
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.2.3
#2.- DATOS
carseats_dat <- Carseats
skim_to_wide(carseats_dat)
## Warning: 'skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
| Name | Piped data |
| Number of rows | 400 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| ShelveLoc | 0 | 1 | FALSE | 3 | Med: 219, Bad: 96, Goo: 85 |
| Urban | 0 | 1 | FALSE | 2 | Yes: 282, No: 118 |
| US | 0 | 1 | FALSE | 2 | Yes: 258, No: 142 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Sales | 0 | 1 | 7.50 | 2.82 | 0 | 5.39 | 7.49 | 9.32 | 16.27 | ▁▆▇▃▁ |
| CompPrice | 0 | 1 | 124.97 | 15.33 | 77 | 115.00 | 125.00 | 135.00 | 175.00 | ▁▅▇▃▁ |
| Income | 0 | 1 | 68.66 | 27.99 | 21 | 42.75 | 69.00 | 91.00 | 120.00 | ▇▆▇▆▅ |
| Advertising | 0 | 1 | 6.64 | 6.65 | 0 | 0.00 | 5.00 | 12.00 | 29.00 | ▇▃▃▁▁ |
| Population | 0 | 1 | 264.84 | 147.38 | 10 | 139.00 | 272.00 | 398.50 | 509.00 | ▇▇▇▇▇ |
| Price | 0 | 1 | 115.80 | 23.68 | 24 | 100.00 | 117.00 | 131.00 | 191.00 | ▁▂▇▆▁ |
| Age | 0 | 1 | 53.32 | 16.20 | 25 | 39.75 | 54.50 | 66.00 | 80.00 | ▇▆▇▇▇ |
| Education | 0 | 1 | 13.90 | 2.62 | 10 | 12.00 | 14.00 | 16.00 | 18.00 | ▇▇▃▇▇ |
#3.- DATA PARTITION TRAIN & TEST
partition <- createDataPartition(y= carseats_dat$Sales, p = 0.8, list = FALSE)
carseats.train <- carseats_dat[partition, ]
carseats.test <- carseats_dat[-partition, ]
rm(partition)
#4.- MODELO
set.seed(1234)
carseats.full_anova <- rpart(formula = Sales ~.,
data = carseats.train,
method = "anova",
xval = 10,
model = TRUE)
rpart.plot(carseats.full_anova, yesno = TRUE)
printcp(carseats.full_anova)
##
## Regression tree:
## rpart(formula = Sales ~ ., data = carseats.train, method = "anova",
## model = TRUE, xval = 10)
##
## Variables actually used in tree construction:
## [1] Advertising Age CompPrice Population Price ShelveLoc
##
## Root node error: 2518.5/321 = 7.8457
##
## n= 321
##
## CP nsplit rel error xerror xstd
## 1 0.238206 0 1.00000 1.00375 0.077986
## 2 0.096230 1 0.76179 0.76865 0.056719
## 3 0.071891 2 0.66556 0.73684 0.055176
## 4 0.049881 3 0.59367 0.64770 0.050228
## 5 0.047304 4 0.54379 0.63547 0.048780
## 6 0.028737 5 0.49649 0.60459 0.044324
## 7 0.027349 6 0.46775 0.60877 0.044084
## 8 0.020187 7 0.44040 0.57363 0.041084
## 9 0.019656 8 0.42022 0.58331 0.041550
## 10 0.019006 9 0.40056 0.58331 0.041550
## 11 0.016982 10 0.38155 0.55866 0.038465
## 12 0.015434 11 0.36457 0.55244 0.038471
## 13 0.011543 12 0.34914 0.55403 0.039429
## 14 0.011158 13 0.33760 0.55275 0.038792
## 15 0.010000 14 0.32644 0.55149 0.038869
plotcp(carseats.full_anova)
carseats.anova <- prune(carseats.full_anova,
cp = 0.039)
rpart.plot(carseats.anova, yesno = TRUE)
rm(carseats.full_anova)
carseats.anova.pred <- predict(carseats.anova, carseats.test, type = "vector")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
abline(0,1)
carseats.anova.rmse <- RMSE(pred = carseats.anova.pred,
obs = carseats.test$Sales)
head(carseats.anova.rmse)
## [1] 2.254917
rm(carseats.anova.pred)
carseats.anova2 = train(Sales ~ .,
data = carseats.train,
method = "rpart", # for classification tree
tuneLength = 5, # choose up to 5 combinations of tuning parameters (cp)
metric = "RMSE", # evaluate hyperparameter combinations with RMSE
trControl =trainControl(
method = "cv", # k-fold cross-validation
number = 10, # 10 folds
savePredictions = "final" # save predictions for the optimal tuning parameter
)
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
carseats.anova2
## CART
##
## 321 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 289, 289, 289, 288, 289, 289, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.04730392 2.223201 0.3842487 1.804767
## 0.04988066 2.241514 0.3728225 1.837737
## 0.07189137 2.372612 0.2974763 1.947141
## 0.09622972 2.451818 0.2530545 2.018379
## 0.23820603 2.687358 0.1584233 2.181634
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.04730392.
plot(carseats.anova2)
carseats.anova.pred <- predict(carseats.anova2, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
carseats.anova.rmse2 <- RMSE(pred = carseats.anova.pred,
obs = carseats.test$Sales)
carseats.anova.rmse2
## [1] 2.327818
rm(carseats.anova.pred)
rpart.plot(carseats.anova2$finalModel)
plot(varImp(carseats.anova2), main="Variable Importance with Simple Regression")
myGrid <- expand.grid(cp = (0:2)/10)
carseats.anova3 = train(Sales ~ .,
data = carseats.train,
method = "rpart", # for classification tree
tuneGrid = myGrid, # choose up to 5 combinations of tuning parameters (cp)
metric = "RMSE", # evaluate hyperparamter combinations with RMSE
trControl = trainControl(
method = "cv", # k-fold cross validation
number = 10, # 10 folds
savePredictions = "final" # save predictions for the optimal tuning parameter
)
)
carseats.anova3
## CART
##
## 321 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 289, 289, 288, 289, 289, 289, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.0 2.146578 0.4617173 1.751046
## 0.1 2.433628 0.2635371 2.008275
## 0.2 2.444306 0.2661119 2.012999
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
plot(carseats.anova3)
carseats.anova.pred <- predict(carseats.anova3, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
main = "Simple Regression: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
carseats.anova.rmse3 <- RMSE(pred = carseats.anova.pred,
obs=carseats.test$Sales)
rm(carseats.anova.pred)
rpart.plot(carseats.anova3$finalModel)
plot(varImp(carseats.anova3), main = "Variable Importance with Simple Regression")
rbind(data.frame(model = "Manual ANOVA",
RMSE = round(carseats.anova.rmse , 5)),
data.frame(model = "Caret w/tuneLength",
RMSE = round(carseats.anova.rmse2, 5)),
data.frame(model = "Caret w/tuneGrid",
RMSE = round(carseats.anova.rmse3, 5))
)
## model RMSE
## 1 Manual ANOVA 2.25492
## 2 Caret w/tuneLength 2.32782
## 3 Caret w/tuneGrid 2.07894