#1.- LIBRARIES

library(ISLR)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart.plot)
## Loading required package: rpart
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)

#2.- DATA

carseats_dat <- Carseats
skim(carseats_dat)
Data summary
  Name                    carseats_dat
  Number of rows          400
  Number of columns       11
  Column type frequency:  factor 3, numeric 8
  Group variables         None

Variable type: factor

  skim_variable  n_missing  complete_rate  ordered  n_unique  top_counts
  ShelveLoc              0              1  FALSE           3  Med: 219, Bad: 96, Goo: 85
  Urban                  0              1  FALSE           2  Yes: 282, No: 118
  US                     0              1  FALSE           2  Yes: 258, No: 142

Variable type: numeric

  skim_variable  n_missing  complete_rate    mean      sd  p0     p25     p50     p75    p100  hist
  Sales                  0              1    7.50    2.82   0    5.39    7.49    9.32   16.27  ▁▆▇▃▁
  CompPrice              0              1  124.97   15.33  77  115.00  125.00  135.00  175.00  ▁▅▇▃▁
  Income                 0              1   68.66   27.99  21   42.75   69.00   91.00  120.00  ▇▆▇▆▅
  Advertising            0              1    6.64    6.65   0    0.00    5.00   12.00   29.00  ▇▃▃▁▁
  Population             0              1  264.84  147.38  10  139.00  272.00  398.50  509.00  ▇▇▇▇▇
  Price                  0              1  115.80   23.68  24  100.00  117.00  131.00  191.00  ▁▂▇▆▁
  Age                    0              1   53.32   16.20  25   39.75   54.50   66.00   80.00  ▇▆▇▇▇
  Education              0              1   13.90    2.62  10   12.00   14.00   16.00   18.00  ▇▇▃▇▇

#3.- DATA PARTITION: TRAIN & TEST

set.seed(1234)  # make the train/test split reproducible
partition <- createDataPartition(y = carseats_dat$Sales, p = 0.8, list = FALSE)
carseats.train <- carseats_dat[partition, ]
carseats.test <- carseats_dat[-partition, ]
rm(partition)
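
As a quick, optional sanity check (not part of the original workflow), the partition sizes and the Sales distributions can be compared; createDataPartition stratifies on the outcome, so the train and test quartiles should be similar.

dim(carseats.train)   # expect roughly 321 rows (80% of 400)
dim(carseats.test)    # expect roughly 79 rows
summary(carseats.train$Sales)
summary(carseats.test$Sales)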

#4.- MODEL

set.seed(1234)
carseats.full_anova <- rpart(formula = Sales ~ ., 
                             data = carseats.train,
                             method = "anova",  # regression tree
                             xval = 10,         # 10-fold cross-validation for the cp table
                             model = TRUE)      # keep a copy of the model frame
rpart.plot(carseats.full_anova, yesno = TRUE)

printcp(carseats.full_anova)
## 
## Regression tree:
## rpart(formula = Sales ~ ., data = carseats.train, method = "anova", 
##     model = TRUE, xval = 10)
## 
## Variables actually used in tree construction:
## [1] Advertising Age         CompPrice   Income      Price       ShelveLoc  
## 
## Root node error: 2671.4/321 = 8.322
## 
## n= 321 
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.263312      0   1.00000 1.00602 0.077768
## 2  0.104108      1   0.73669 0.74454 0.055035
## 3  0.051347      2   0.63258 0.69955 0.049496
## 4  0.050214      3   0.58123 0.71051 0.048805
## 5  0.034708      4   0.53102 0.66278 0.047736
## 6  0.028472      5   0.49631 0.64116 0.045501
## 7  0.026780      6   0.46784 0.62118 0.044831
## 8  0.026305      7   0.44106 0.61440 0.045354
## 9  0.017869      8   0.41475 0.57372 0.042780
## 10 0.016556      9   0.39689 0.58249 0.043348
## 11 0.015792     10   0.38033 0.56590 0.042447
## 12 0.015418     11   0.36454 0.55747 0.042075
## 13 0.012673     12   0.34912 0.55452 0.041938
## 14 0.011726     13   0.33645 0.55288 0.038873
## 15 0.011424     15   0.31300 0.54662 0.038381
## 16 0.010076     16   0.30157 0.54323 0.038714
## 17 0.010000     17   0.29150 0.54891 0.039089
plotcp(carseats.full_anova)
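
The prune() call below uses cp = 0.039, read off the cp table and the plotcp output by eye. A programmatic alternative (illustrative sketch, not part of the original code) is the 1-SE rule: keep the smallest tree whose cross-validated error is within one standard error of the minimum.

cp_tab <- carseats.full_anova$cptable
best   <- which.min(cp_tab[, "xerror"])                       # row with the minimum cross-validated error
ok     <- cp_tab[, "xerror"] <= cp_tab[best, "xerror"] + cp_tab[best, "xstd"]
cp_1se <- cp_tab[which(ok)[1], "CP"]                          # smallest tree within 1 SE of the minimum
cp_1se                                                        # could be passed to prune() instead of 0.039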

carseats.anova <- prune(carseats.full_anova,
                        cp = 0.039)
rpart.plot(carseats.anova, yesno = TRUE)

rm(carseats.full_anova)
carseats.anova.pred <- predict(carseats.anova, carseats.test, type = "vector")
plot(carseats.test$Sales, carseats.anova.pred,
     main = "Simple Regression: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")
abline(0,1)

carseats.anova.rmse <- RMSE(pred = carseats.anova.pred,
                            obs = carseats.test$Sales)
carseats.anova.rmse
## [1] 2.230808
rm(carseats.anova.pred)
carseats.anova2 <- train(Sales ~ .,
                    data = carseats.train,
                    method = "rpart",  # CART regression tree
                    tuneLength = 5,    # evaluate 5 candidate values of cp
                    metric = "RMSE",   # select the cp with the lowest cross-validated RMSE
                    trControl = trainControl(
                      method = "cv",   # k-fold cross-validation
                      number = 10,     # 10 folds
                      savePredictions = "final"  # save predictions for the optimal cp only
                      )
                    )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
carseats.anova2
## CART 
## 
## 321 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 289, 289, 289, 288, 289, 289, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.03470827  2.256611  0.3957323  1.845247
##   0.05021356  2.359277  0.3469020  1.908966
##   0.05134650  2.359277  0.3469020  1.908966
##   0.10410756  2.471116  0.2786895  2.023435
##   0.26331159  2.744439  0.1686525  2.229575
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.03470827.
plot(carseats.anova2)
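
Besides the printed summary, the train object stores the selected tuning value and the per-fold resampling results directly (standard caret components); a quick look:

carseats.anova2$bestTune          # the cp value caret selected
head(carseats.anova2$resample)    # RMSE, Rsquared and MAE for individual folds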

carseats.anova.pred <- predict(carseats.anova2, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
     main = "Regression Tree: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")
abline(0, 1)

carseats.anova.rmse2 <- RMSE(pred = carseats.anova.pred,
                             obs = carseats.test$Sales)
carseats.anova.rmse2
## [1] 2.230808
rm(carseats.anova.pred)
rpart.plot(carseats.anova2$finalModel)

plot(varImp(carseats.anova2), main = "Variable Importance: Regression Tree (tuneLength)")

myGrid <- expand.grid(cp = (0:2)/10)  # candidate cp values: 0, 0.1, 0.2
carseats.anova3 <- train(Sales ~ .,
                    data = carseats.train,
                    method = "rpart",   # CART regression tree
                    tuneGrid = myGrid,  # evaluate only the cp values in myGrid
                    metric = "RMSE",    # select the cp with the lowest cross-validated RMSE
                    trControl = trainControl(
                      method = "cv",    # k-fold cross-validation
                      number = 10,      # 10 folds
                      savePredictions = "final"  # save predictions for the optimal cp only
                      )
                    )
carseats.anova3
## CART 
## 
## 321 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 289, 289, 288, 289, 289, 289, ... 
## Resampling results across tuning parameters:
## 
##   cp   RMSE      Rsquared   MAE     
##   0.0  2.213092  0.4541088  1.803401
##   0.1  2.404378  0.3177583  1.968127
##   0.2  2.471603  0.2720979  2.035684
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
plot(carseats.anova3)
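
The selected cp = 0 sits on the edge of myGrid, which suggests the grid is too coarse. A finer grid near zero would let caret distinguish an unpruned tree from lightly pruned ones; the sketch below is illustrative and not run here (carseats.anova3b is a hypothetical name).

fineGrid <- expand.grid(cp = seq(0, 0.05, by = 0.005))
carseats.anova3b <- train(Sales ~ ., data = carseats.train, method = "rpart",
                          tuneGrid = fineGrid, metric = "RMSE",
                          trControl = trainControl(method = "cv", number = 10))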

carseats.anova.pred <- predict(carseats.anova3, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
     main = "Regression Tree: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")
abline(0, 1)

carseats.anova.rmse3 <- RMSE(pred = carseats.anova.pred,
                             obs = carseats.test$Sales)
rm(carseats.anova.pred)
rpart.plot(carseats.anova3$finalModel)

plot(varImp(carseats.anova3), main = "Variable Importance: Regression Tree (tuneGrid)")

rbind(data.frame(model = "Manual ANOVA",
                 RMSE = round(carseats.anova.rmse , 5)),
      data.frame(model = "Caret w/tuneLength",
                 RMSE = round(carseats.anova.rmse2, 5)),
      data.frame(model = "Caret w/tuneGrid",
                 RMSE = round(carseats.anova.rmse3, 5))
)
##                model    RMSE
## 1       Manual ANOVA 2.23081
## 2 Caret w/tuneLength 2.23081
## 3   Caret w/tuneGrid 1.98004
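
For a fuller comparison (illustrative; postResample is the caret helper that returns RMSE, Rsquared and MAE together), the three models can be scored on the same test set:

postResample(pred = predict(carseats.anova,  carseats.test), obs = carseats.test$Sales)
postResample(pred = predict(carseats.anova2, carseats.test), obs = carseats.test$Sales)
postResample(pred = predict(carseats.anova3, carseats.test), obs = carseats.test$Sales)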