#1.- LIBRARIES

library(ISLR)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart.plot)
## Loading required package: rpart
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)

#2.- DATA

carseats_dat <- Carseats
carseats_dat %>% skim()
Data summary

Name                     Piped data
Number of rows           400
Number of columns        11
Column type frequency:
  factor                 3
  numeric                8
Group variables          None

Variable type: factor

skim_variable   n_missing   complete_rate   ordered   n_unique   top_counts
ShelveLoc               0               1   FALSE            3   Med: 219, Bad: 96, Goo: 85
Urban                   0               1   FALSE            2   Yes: 282, No: 118
US                      0               1   FALSE            2   Yes: 258, No: 142

Variable type: numeric

skim_variable   n_missing   complete_rate     mean       sd   p0      p25      p50      p75     p100   hist
Sales                   0               1     7.50     2.82    0     5.39     7.49     9.32    16.27   ▁▆▇▃▁
CompPrice               0               1   124.97    15.33   77   115.00   125.00   135.00   175.00   ▁▅▇▃▁
Income                  0               1    68.66    27.99   21    42.75    69.00    91.00   120.00   ▇▆▇▆▅
Advertising             0               1     6.64     6.65    0     0.00     5.00    12.00    29.00   ▇▃▃▁▁
Population              0               1   264.84   147.38   10   139.00   272.00   398.50   509.00   ▇▇▇▇▇
Price                   0               1   115.80    23.68   24   100.00   117.00   131.00   191.00   ▁▂▇▆▁
Age                     0               1    53.32    16.20   25    39.75    54.50    66.00    80.00   ▇▆▇▇▇
Education               0               1    13.90     2.62   10    12.00    14.00    16.00    18.00   ▇▇▃▇▇
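
skim reports zero missing values in every one of the 11 columns; a one-line cross-check (illustrative, not part of the original output):

colSums(is.na(carseats_dat))  # expect all zeros, matching the n_missing column above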

#3.- DATA PARTITION TRAIN & TEST

partition <- createDataPartition(y= carseats_dat$Sales, p = 0.8, list = FALSE)
carseats.train <- carseats_dat[partition, ]
carseats.test <- carseats_dat[-partition, ]
rm(partition)
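
createDataPartition() draws a random, stratified sample, so without a prior set.seed() the exact split changes between runs; here it leaves 321 rows for training (the n reported by printcp() below) and 79 for testing. A quick sanity check (illustrative):

nrow(carseats.train)  # 321 rows (~80%)
nrow(carseats.test)   # 79 rows (~20%)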

#4.- MODEL

set.seed(1234)
carseats.full_anova <- rpart(formula = Sales ~ .,
                             data = carseats.train,
                             method = "anova",
                             xval = 10,
                             model = TRUE)
rpart.plot(carseats.full_anova, yesno = TRUE)
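
Note that the xval = 10 argument is simply forwarded to rpart.control(); the same model could be specified with the control object written out explicitly (equivalent sketch, not run separately here):

carseats.full_anova <- rpart(Sales ~ ., data = carseats.train, method = "anova",
                             model = TRUE, control = rpart.control(xval = 10))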

printcp(carseats.full_anova)
## 
## Regression tree:
## rpart(formula = Sales ~ ., data = carseats.train, method = "anova", 
##     model = TRUE, xval = 10)
## 
## Variables actually used in tree construction:
## [1] Advertising Age         CompPrice   Population  Price       ShelveLoc  
## 
## Root node error: 2518.5/321 = 7.8457
## 
## n= 321 
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.238206      0   1.00000 1.00375 0.077986
## 2  0.096230      1   0.76179 0.76865 0.056719
## 3  0.071891      2   0.66556 0.73684 0.055176
## 4  0.049881      3   0.59367 0.64770 0.050228
## 5  0.047304      4   0.54379 0.63547 0.048780
## 6  0.028737      5   0.49649 0.60459 0.044324
## 7  0.027349      6   0.46775 0.60877 0.044084
## 8  0.020187      7   0.44040 0.57363 0.041084
## 9  0.019656      8   0.42022 0.58331 0.041550
## 10 0.019006      9   0.40056 0.58331 0.041550
## 11 0.016982     10   0.38155 0.55866 0.038465
## 12 0.015434     11   0.36457 0.55244 0.038471
## 13 0.011543     12   0.34914 0.55403 0.039429
## 14 0.011158     13   0.33760 0.55275 0.038792
## 15 0.010000     14   0.32644 0.55149 0.038869
plotcp(carseats.full_anova)
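
The cp = 0.039 used for pruning below was read off the CP table above by eye; the same kind of choice can be automated from the cptable matrix, for example with the usual one-standard-error rule (a sketch; cp_1se is an illustrative name):

cp_tab <- carseats.full_anova$cptable
best   <- which.min(cp_tab[, "xerror"])                   # row with the lowest cross-validated error
thresh <- cp_tab[best, "xerror"] + cp_tab[best, "xstd"]   # one-standard-error threshold
cp_1se <- cp_tab[cp_tab[, "xerror"] <= thresh, "CP"][1]   # simplest tree within 1 SE of the minimum
cp_1se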

carseats.anova <- prune(carseats.full_anova,
                        cp = 0.039)
rpart.plot(carseats.anova, yesno = TRUE)

rm(carseats.full_anova)
carseats.anova.pred <- predict(carseats.anova, carseats.test, type = "vector")
plot(carseats.test$Sales, carseats.anova.pred,
     main = "Simple Regression: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")
abline(0,1)
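
The banded look of this plot is expected: a regression tree predicts the mean of its terminal node, so there are only as many distinct predicted values as the pruned tree has leaves (quick check, illustrative):

length(unique(carseats.anova.pred))  # one distinct prediction per terminal node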

carseats.anova.rmse <- RMSE(pred = carseats.anova.pred,
                            obs = carseats.test$Sales)
carseats.anova.rmse
## [1] 2.254917
rm(carseats.anova.pred)
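
For reference, caret's RMSE() helper is nothing more than the root mean squared error; an equivalent written by hand (rmse_manual is an illustrative name):

rmse_manual <- function(pred, obs) sqrt(mean((obs - pred)^2))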
carseats.anova2 = train(Sales ~ .,
                    data = carseats.train,
                    method = "rpart", # for classification tree
                    tuneLength = 5,  # choose up to 5 combinations of tuning parameters (cp)
                    metric = "RMSE", # evaluate hyperparameter combinations with RMSE
                    trControl =trainControl(
                      method = "cv", # k-fold cross-validation
                      number = 10, # 10 folds
                      savePredictions = "final"    # save predictions for the optimal tuning parameter
                      )
                    )
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
carseats.anova2
## CART 
## 
## 321 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 289, 289, 289, 288, 289, 289, ... 
## Resampling results across tuning parameters:
## 
##   cp          RMSE      Rsquared   MAE     
##   0.04730392  2.223201  0.3842487  1.804767
##   0.04988066  2.241514  0.3728225  1.837737
##   0.07189137  2.372612  0.2974763  1.947141
##   0.09622972  2.451818  0.2530545  2.018379
##   0.23820603  2.687358  0.1584233  2.181634
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.04730392.
plot(carseats.anova2)
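
The earlier warning about missing values in the resampled performance measures is typical for the largest cp values: when a resample's tree collapses to the root, its predictions are constant and Rsquared cannot be computed (an interpretation, not caret output). The tuning results can also be inspected programmatically (illustrative):

carseats.anova2$bestTune   # the selected value, cp = 0.04730392
carseats.anova2$results    # RMSE / Rsquared / MAE for every cp tried
carseats.anova2$resample   # per-fold performance of the final model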

carseats.anova.pred <- predict(carseats.anova2, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
     main = "Caret w/tuneLength: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")
abline(0, 1)

carseats.anova.rmse2 <- RMSE(pred = carseats.anova.pred,
                             obs = carseats.test$Sales)
carseats.anova.rmse2
## [1] 2.327818
rm(carseats.anova.pred)
rpart.plot(carseats.anova2$finalModel)

plot(varImp(carseats.anova2), main = "Variable Importance (Caret w/tuneLength)")
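
If a table is preferred over the plot, varImp() on a train object returns a varImp.train object whose scores sit in its $importance slot (a sketch; vi is an illustrative name):

vi <- varImp(carseats.anova2)
vi$importance[order(-vi$importance$Overall), , drop = FALSE]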

myGrid <- expand.grid(cp = (0:2)/10)  # candidate cp values: 0, 0.1, 0.2
carseats.anova3 = train(Sales ~ .,
                    data = carseats.train,
                    method = "rpart",  # for classification tree
                    tuneGrid = myGrid,  # choose up to 5 combinations of tuning parameters (cp)
                    metric = "RMSE",  # evaluate hyperparamter combinations with RMSE
                    trControl = trainControl(
                      method = "cv",  # k-fold cross validation
                      number = 10,  # 10 folds
                      savePredictions = "final"       # save predictions for the optimal tuning parameter
                      )
                    )
carseats.anova3
## CART 
## 
## 321 samples
##  10 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 289, 289, 288, 289, 289, 289, ... 
## Resampling results across tuning parameters:
## 
##   cp   RMSE      Rsquared   MAE     
##   0.0  2.146578  0.4617173  1.751046
##   0.1  2.433628  0.2635371  2.008275
##   0.2  2.444306  0.2661119  2.012999
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.
plot(carseats.anova3)
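
With candidate values of only 0, 0.1 and 0.2 the search bottoms out at cp = 0, i.e. a fully grown tree, which mostly tells us the optimum lies somewhere below 0.1. A finer grid around that region would narrow it down (a sketch, not run here; fineGrid is an illustrative name):

fineGrid <- expand.grid(cp = seq(0, 0.05, by = 0.005))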

carseats.anova.pred <- predict(carseats.anova3, carseats.test, type = "raw")
plot(carseats.test$Sales, carseats.anova.pred,
     main = "Caret w/tuneGrid: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")
abline(0, 1)

carseats.anova.rmse3 <- RMSE(pred = carseats.anova.pred,
                             obs = carseats.test$Sales)
rm(carseats.anova.pred)
rpart.plot(carseats.anova3$finalModel)

plot(varImp(carseats.anova3), main = "Variable Importance (Caret w/tuneGrid)")

rbind(data.frame(model = "Manual ANOVA",
                 RMSE = round(carseats.anova.rmse , 5)),
      data.frame(model = "Caret w/tuneLength",
                 RMSE = round(carseats.anova.rmse2, 5)),
      data.frame(model = "Caret w/tuneGrid",
                 RMSE = round(carseats.anova.rmse3, 5))
)
##                model    RMSE
## 1       Manual ANOVA 2.25492
## 2 Caret w/tuneLength 2.32782
## 3   Caret w/tuneGrid 2.07894