#1.- LIBRARIES
library(ISLR)
library(caret)
library(rpart.plot)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
#2.- DATA
oj_dat <- OJ
skim(oj_dat)   # skim_to_wide() is deprecated; skim() gives the same summary
| Data summary |  |
|---|---|
| Name | oj_dat |
| Number of rows | 1070 |
| Number of columns | 18 |
| Column type frequency: |  |
| factor | 2 |
| numeric | 16 |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Purchase | 0 | 1 | FALSE | 2 | CH: 653, MM: 417 |
| Store7 | 0 | 1 | FALSE | 2 | No: 714, Yes: 356 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WeekofPurchase | 0 | 1 | 254.38 | 15.56 | 227.00 | 240.00 | 257.00 | 268.00 | 278.00 | ▆▅▅▇▇ |
| StoreID | 0 | 1 | 3.96 | 2.31 | 1.00 | 2.00 | 3.00 | 7.00 | 7.00 | ▇▅▃▁▇ |
| PriceCH | 0 | 1 | 1.87 | 0.10 | 1.69 | 1.79 | 1.86 | 1.99 | 2.09 | ▅▂▇▆▁ |
| PriceMM | 0 | 1 | 2.09 | 0.13 | 1.69 | 1.99 | 2.09 | 2.18 | 2.29 | ▂▁▃▇▆ |
| DiscCH | 0 | 1 | 0.05 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 0.50 | ▇▁▁▁▁ |
| DiscMM | 0 | 1 | 0.12 | 0.21 | 0.00 | 0.00 | 0.00 | 0.23 | 0.80 | ▇▁▂▁▁ |
| SpecialCH | 0 | 1 | 0.15 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| SpecialMM | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| LoyalCH | 0 | 1 | 0.57 | 0.31 | 0.00 | 0.33 | 0.60 | 0.85 | 1.00 | ▅▃▆▆▇ |
| SalePriceMM | 0 | 1 | 1.96 | 0.25 | 1.19 | 1.69 | 2.09 | 2.13 | 2.29 | ▁▂▂▂▇ |
| SalePriceCH | 0 | 1 | 1.82 | 0.14 | 1.39 | 1.75 | 1.86 | 1.89 | 2.09 | ▂▁▇▇▅ |
| PriceDiff | 0 | 1 | 0.15 | 0.27 | -0.67 | 0.00 | 0.23 | 0.32 | 0.64 | ▁▂▃▇▂ |
| PctDiscMM | 0 | 1 | 0.06 | 0.10 | 0.00 | 0.00 | 0.00 | 0.11 | 0.40 | ▇▁▂▁▁ |
| PctDiscCH | 0 | 1 | 0.03 | 0.06 | 0.00 | 0.00 | 0.00 | 0.00 | 0.25 | ▇▁▁▁▁ |
| ListPriceDiff | 0 | 1 | 0.22 | 0.11 | 0.00 | 0.14 | 0.24 | 0.30 | 0.44 | ▂▃▆▇▁ |
| STORE | 0 | 1 | 1.63 | 1.43 | 0.00 | 0.00 | 2.00 | 3.00 | 4.00 | ▇▃▅▅▃ |
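Purchase is moderately imbalanced: 653 of the 1,070 observations are CH (about 61%), which is the baseline that reappears later as the No Information Rate in the confusion matrices. A quick check of that balance (a small sketch, not part of the skim output):
prop.table(table(oj_dat$Purchase))   # roughly 0.61 CH vs. 0.39 MM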
#3.- TRAIN AND TEST PARTITIONS
set.seed(12345)
partition <- createDataPartition(y = oj_dat$Purchase, p = 0.8, list = FALSE)
oj.train <- oj_dat[partition, ]
oj.test <- oj_dat[-partition, ]
rm(partition)
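As a sanity check (a sketch, not in the original script): with p = 0.8 and stratification on Purchase, the split leaves 857 rows for training (the sample size reported by train() below) and 213 for testing, with roughly the same CH/MM mix in each partition.
dim(oj.train)
dim(oj.test)
round(prop.table(table(oj.train$Purchase)), 3)   # should mirror the full-data split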
#4.- DECISION TREES: BAGGING AND RANDOM FOREST
set.seed(123)
oj.bag <- train(Purchase ~ .,
                data = oj.train,
                method = "treebag",               # bagged CART (bagging)
                tuneLength = 5,                   # treebag exposes no tuning parameters, so this has no real effect
                metric = "ROC",                   # evaluate models with ROC
                trControl = trainControl(
                  method = "cv",                  # k-fold cross-validation
                  number = 10,                    # k = 10 folds
                  savePredictions = "final",      # save predictions for the final model
                  classProbs = TRUE,              # return class probabilities in addition to predicted classes
                  summaryFunction = twoClassSummary  # ROC / Sens / Spec for a binary response
                )
)
oj.bag
## Bagged CART
##
## 857 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 770, 772, 771, 772, 772, 771, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8485285 0.8220972 0.7093583
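A note on this output: bagged CART ("treebag", from the ipred package) has no tuning parameters, so tuneLength has no effect and caret reports a single resampling result. This can be confirmed, and the fold-level metrics behind the averages inspected, with a short sketch:
modelLookup("treebag")   # only a placeholder parameter row: nothing to tune
oj.bag$resample          # per-fold ROC / Sens / Spec for the final model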
oj.pred <- predict(oj.bag, oj.test, type = "raw")
plot(oj.test$Purchase, oj.pred,
main = "Bagging Classification: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")
oj.conf <- confusionMatrix(data = oj.pred,
reference = oj.test$Purchase)
oj.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 109 19
## MM 21 64
##
## Accuracy : 0.8122
## 95% CI : (0.7532, 0.8623)
## No Information Rate : 0.6103
## P-Value [Acc > NIR] : 1.758e-10
##
## Kappa : 0.6069
##
## Mcnemar's Test P-Value : 0.8744
##
## Sensitivity : 0.8385
## Specificity : 0.7711
## Pos Pred Value : 0.8516
## Neg Pred Value : 0.7529
## Prevalence : 0.6103
## Detection Rate : 0.5117
## Detection Prevalence : 0.6009
## Balanced Accuracy : 0.8048
##
## 'Positive' Class : CH
##
oj.bag.acc <- as.numeric(oj.conf$overall[1])
rm(oj.pred)
rm(oj.conf)
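Since the models are tuned on ROC, a test-set AUC is a natural companion to accuracy. A sketch using the pROC package (an extra dependency, not loaded above):
library(pROC)
oj.prob <- predict(oj.bag, oj.test, type = "prob")   # class probabilities
roc.bag <- roc(response = oj.test$Purchase,
               predictor = oj.prob$CH,
               levels = c("MM", "CH"))               # MM = control, CH = case
auc(roc.bag)
plot(roc.bag, main = "Bagging: Test-Set ROC Curve")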
plot(varImp(oj.bag), main = "Variable Importance with Bagging")
oj.frst <- train(Purchase ~ .,
                 data = oj.train,
                 method = "ranger",                # random forest via ranger
                 tuneLength = 5,                   # 5 values per tuning parameter; caret builds the grid
                 metric = "ROC",                   # evaluate hyperparameter combinations with ROC
                 trControl = trainControl(
                   method = "cv",                  # k-fold cross-validation
                   number = 10,                    # k = 10 folds
                   savePredictions = "final",      # save predictions for the optimal tuning parameters
                   classProbs = TRUE,              # return class probabilities in addition to predicted classes
                   summaryFunction = twoClassSummary  # ROC / Sens / Spec for a binary response
                 )
)
oj.frst
## Random Forest
##
## 857 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 772, 771, 771, 772, 771, 771, ...
## Resampling results across tuning parameters:
##
## mtry splitrule ROC Sens Spec
## 2 gini 0.8606099 0.8643687 0.7089127
## 2 extratrees 0.8517494 0.8910740 0.6403743
## 5 gini 0.8662687 0.8509434 0.7271836
## 5 extratrees 0.8629548 0.8624093 0.6942068
## 9 gini 0.8665160 0.8355588 0.7363636
## 9 extratrees 0.8633796 0.8431785 0.7122103
## 13 gini 0.8624577 0.8241292 0.7454545
## 13 extratrees 0.8599718 0.8316401 0.7244207
## 17 gini 0.8603882 0.8165094 0.7456328
## 17 extratrees 0.8606794 0.8259434 0.7213904
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 9, splitrule = gini
## and min.node.size = 1.
plot(oj.frst)
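With tuneLength = 5, caret chose the candidate grid itself: five mtry values crossed with both split rules, with min.node.size held at 1. For finer control, an explicit grid can be supplied instead; a sketch centered on the selected values (mtry = 9, gini):
rf.grid <- expand.grid(mtry = c(5, 7, 9, 11, 13),
                       splitrule = "gini",
                       min.node.size = c(1, 5, 10))
# refit with tuneGrid = rf.grid in place of tuneLength = 5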
oj.pred <- predict(oj.frst, oj.test, type = "raw")
plot(oj.test$Purchase, oj.pred,
main = "Random Forest Classification: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")
oj.conf <- confusionMatrix(data = oj.pred,
reference = oj.test$Purchase)
oj.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 110 16
## MM 20 67
##
## Accuracy : 0.831
## 95% CI : (0.7738, 0.8787)
## No Information Rate : 0.6103
## P-Value [Acc > NIR] : 2.296e-12
##
## Kappa : 0.6477
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.8462
## Specificity : 0.8072
## Pos Pred Value : 0.8730
## Neg Pred Value : 0.7701
## Prevalence : 0.6103
## Detection Rate : 0.5164
## Detection Prevalence : 0.5915
## Balanced Accuracy : 0.8267
##
## 'Positive' Class : CH
##
oj.frst.acc <- as.numeric(oj.conf$overall[1])
rm(oj.pred)
rm(oj.conf)
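ranger does not compute variable importance unless asked to, so varImp() on oj.frst as fitted above would likely fail. A sketch of how the random-forest importances could be obtained, refitting at the selected tuning values with impurity importance switched on (importance = "impurity" is passed through to ranger):
oj.frst.imp <- train(Purchase ~ .,
                     data = oj.train,
                     method = "ranger",
                     importance = "impurity",      # ask ranger for impurity importance
                     tuneGrid = oj.frst$bestTune,  # reuse mtry = 9, gini, min.node.size = 1
                     metric = "ROC",
                     trControl = trainControl(method = "cv", number = 10,
                                              classProbs = TRUE,
                                              summaryFunction = twoClassSummary))
plot(varImp(oj.frst.imp), main = "Variable Importance with Random Forest")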
rbind(
  data.frame(model = "Bagging", Accuracy = round(oj.bag.acc, 5)),
  data.frame(model = "Random Forest", Accuracy = round(oj.frst.acc, 5))
) %>% arrange(desc(Accuracy))
## model Accuracy
## 1 Random Forest 0.83099
## 2 Bagging 0.81221
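The accuracies above come from a single 20% hold-out split. The two models can also be compared on their cross-validation resamples with caret's resamples(); a sketch (for a strictly paired comparison, both fits would need the same fold indices supplied via trainControl(index = ...)):
cv.compare <- resamples(list(Bagging = oj.bag, `Random Forest` = oj.frst))
summary(cv.compare)
bwplot(cv.compare, metric = "ROC")   # lattice is already loaded by caret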