#1.- LIBRARIES

library(ISLR)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(rpart.plot)
## Loading required package: rpart
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)

#2.- DATA

oj_dat <- OJ
skim(oj_dat)
Data summary
Name oj_dat
Number of rows 1070
Number of columns 18
_______________________
Column type frequency:
factor 2
numeric 16
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Purchase 0 1 FALSE 2 CH: 653, MM: 417
Store7 0 1 FALSE 2 No: 714, Yes: 356

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WeekofPurchase 0 1 254.38 15.56 227.00 240.00 257.00 268.00 278.00 ▆▅▅▇▇
StoreID 0 1 3.96 2.31 1.00 2.00 3.00 7.00 7.00 ▇▅▃▁▇
PriceCH 0 1 1.87 0.10 1.69 1.79 1.86 1.99 2.09 ▅▂▇▆▁
PriceMM 0 1 2.09 0.13 1.69 1.99 2.09 2.18 2.29 ▂▁▃▇▆
DiscCH 0 1 0.05 0.12 0.00 0.00 0.00 0.00 0.50 ▇▁▁▁▁
DiscMM 0 1 0.12 0.21 0.00 0.00 0.00 0.23 0.80 ▇▁▂▁▁
SpecialCH 0 1 0.15 0.35 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
SpecialMM 0 1 0.16 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
LoyalCH 0 1 0.57 0.31 0.00 0.33 0.60 0.85 1.00 ▅▃▆▆▇
SalePriceMM 0 1 1.96 0.25 1.19 1.69 2.09 2.13 2.29 ▁▂▂▂▇
SalePriceCH 0 1 1.82 0.14 1.39 1.75 1.86 1.89 2.09 ▂▁▇▇▅
PriceDiff 0 1 0.15 0.27 -0.67 0.00 0.23 0.32 0.64 ▁▂▃▇▂
PctDiscMM 0 1 0.06 0.10 0.00 0.00 0.00 0.11 0.40 ▇▁▂▁▁
PctDiscCH 0 1 0.03 0.06 0.00 0.00 0.00 0.00 0.25 ▇▁▁▁▁
ListPriceDiff 0 1 0.22 0.11 0.00 0.14 0.24 0.30 0.44 ▂▃▆▇▁
STORE 0 1 1.63 1.43 0.00 0.00 2.00 3.00 4.00 ▇▃▅▅▃
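
From the skim output, CH accounts for 653 of the 1070 purchases (about 61%); that majority share is the No Information Rate that confusionMatrix() reports further below. A quick check, not part of the original output:

# Class balance of the response; the CH share (~0.61) is the baseline
# "No Information Rate" used by confusionMatrix() later on.
prop.table(table(oj_dat$Purchase))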

#3.- TRAIN AND TEST PARTITIONS

set.seed(12345)
partition <- createDataPartition(y = oj_dat$Purchase, p = 0.8, list = FALSE)
oj.train <- oj_dat[partition, ]
oj.test <- oj_dat[-partition, ]
rm(partition)
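
createDataPartition() samples within each level of Purchase, so the CH/MM proportions in the train and test sets should closely match the full data. A minimal sanity check:

# Confirm the stratified split kept the class proportions in both subsets.
prop.table(table(oj.train$Purchase))
prop.table(table(oj.test$Purchase))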

#4.- DECISION TREES: BAGGING AND RANDOM FOREST

set.seed(123)
oj.bag <- train(Purchase ~ .,
                data = oj.train,
                method = "treebag", # bagged CART
                tuneLength = 5, # no effect here: treebag has no tuning parameters
                metric = "ROC", # select the model by cross-validated ROC
                trControl = trainControl(
                  method = "cv", # k-fold cross-validation
                  number = 10, # k = 10 folds
                  savePredictions = "final", # keep hold-out predictions of the final model
                  classProbs = TRUE, # return class probabilities as well as predicted classes
                  summaryFunction = twoClassSummary # ROC, Sens and Spec for a binary response
                ))
oj.bag
## Bagged CART 
## 
## 857 samples
##  17 predictor
##   2 classes: 'CH', 'MM' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 770, 772, 771, 772, 772, 771, ... 
## Resampling results:
## 
##   ROC        Sens       Spec     
##   0.8485285  0.8220972  0.7093583
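
ROC, Sens and Spec above are averages over the 10 cross-validation folds. Because savePredictions = "final" and classProbs = TRUE were set, the fold-level hold-out predictions are stored in the fitted object and can be inspected directly, e.g.:

# Hold-out predictions saved by savePredictions = "final": one row per training
# observation, with predicted/observed classes, CH/MM probabilities and the fold.
head(oj.bag$pred)
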
oj.pred <- predict(oj.bag, oj.test, type = "raw")
plot(oj.test$Purchase, oj.pred,
     main = "Bagging Classification: Predicted vs. Actual",
     xlab = "Actual",
     ylab = "Predicted")

oj.conf <- confusionMatrix(data = oj.pred,
                            reference = oj.test$Purchase)
oj.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 109  19
##         MM  21  64
##                                           
##                Accuracy : 0.8122          
##                  95% CI : (0.7532, 0.8623)
##     No Information Rate : 0.6103          
##     P-Value [Acc > NIR] : 1.758e-10       
##                                           
##                   Kappa : 0.6069          
##                                           
##  Mcnemar's Test P-Value : 0.8744          
##                                           
##             Sensitivity : 0.8385          
##             Specificity : 0.7711          
##          Pos Pred Value : 0.8516          
##          Neg Pred Value : 0.7529          
##              Prevalence : 0.6103          
##          Detection Rate : 0.5117          
##    Detection Prevalence : 0.6009          
##       Balanced Accuracy : 0.8048          
##                                           
##        'Positive' Class : CH              
## 
oj.bag.acc <- as.numeric(oj.conf$overall[1])
rm(oj.pred)
rm(oj.conf)
plot(varImp(oj.bag), main = "Variable Importance with Bagging")
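
Accuracy fixes the probability cut-off at 0.5; a test-set ROC curve summarises performance over all cut-offs. A minimal sketch with the pROC package (not loaded in the libraries section above; oj.prob and roc.bag are illustrative names):

# Test-set AUC for the bagged model; type = "prob" returns CH/MM class probabilities.
library(pROC)
oj.prob <- predict(oj.bag, oj.test, type = "prob")
roc.bag <- roc(response = oj.test$Purchase,
               predictor = oj.prob[, "CH"],
               levels = c("MM", "CH")) # controls = MM, cases = CH
auc(roc.bag)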

oj.frst <- train(Purchase ~ .,
                 data = oj.train,
                 method = "ranger", # random forest via the ranger package
                 tuneLength = 5, # try up to 5 values per tuning parameter
                 metric = "ROC", # select the model by cross-validated ROC
                 importance = "impurity", # passed on to ranger so varImp() works below
                 trControl = trainControl(
                   method = "cv", # k-fold cross-validation
                   number = 10, # k = 10 folds
                   savePredictions = "final", # keep hold-out predictions of the final model
                   classProbs = TRUE, # return class probabilities as well as predicted classes
                   summaryFunction = twoClassSummary # ROC, Sens and Spec for a binary response
                 ))
oj.frst
## Random Forest 
## 
## 857 samples
##  17 predictor
##   2 classes: 'CH', 'MM' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 772, 771, 771, 772, 771, 771, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   ROC        Sens       Spec     
##    2    gini        0.8606099  0.8643687  0.7089127
##    2    extratrees  0.8517494  0.8910740  0.6403743
##    5    gini        0.8662687  0.8509434  0.7271836
##    5    extratrees  0.8629548  0.8624093  0.6942068
##    9    gini        0.8665160  0.8355588  0.7363636
##    9    extratrees  0.8633796  0.8431785  0.7122103
##   13    gini        0.8624577  0.8241292  0.7454545
##   13    extratrees  0.8599718  0.8316401  0.7244207
##   17    gini        0.8603882  0.8165094  0.7456328
##   17    extratrees  0.8606794  0.8259434  0.7213904
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 9, splitrule = gini
##  and min.node.size = 1.
plot(oj.frst)
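
With tuneLength = 5, caret builds its own grid (the five mtry values crossed with both split rules shown above, with min.node.size held at 1). The grid can instead be set explicitly through tuneGrid; a sketch with illustrative values around the selected mtry = 9 (rf.grid and oj.frst2 are hypothetical names):

# Explicit tuning grid for method = "ranger": caret exposes mtry, splitrule
# and min.node.size.
rf.grid <- expand.grid(mtry = c(7, 9, 11),
                       splitrule = "gini",
                       min.node.size = c(1, 5, 10))
oj.frst2 <- train(Purchase ~ ., data = oj.train, method = "ranger",
                  tuneGrid = rf.grid, metric = "ROC",
                  trControl = trainControl(method = "cv", number = 10,
                                           classProbs = TRUE,
                                           summaryFunction = twoClassSummary))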

oj.pred <- predict(oj.frst, oj.test, type = "raw")
plot(oj.test$Purchase, oj.pred,
     main = "Random Forest Classification: Predicted vs. Actual",
     xlab = "Actual",
     ylab = "Predicted")

oj.conf <- confusionMatrix(data = oj.pred,
                             reference = oj.test$Purchase)
oj.conf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 110  16
##         MM  20  67
##                                           
##                Accuracy : 0.831           
##                  95% CI : (0.7738, 0.8787)
##     No Information Rate : 0.6103          
##     P-Value [Acc > NIR] : 2.296e-12       
##                                           
##                   Kappa : 0.6477          
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.8462          
##             Specificity : 0.8072          
##          Pos Pred Value : 0.8730          
##          Neg Pred Value : 0.7701          
##              Prevalence : 0.6103          
##          Detection Rate : 0.5164          
##    Detection Prevalence : 0.5915          
##       Balanced Accuracy : 0.8267          
##                                           
##        'Positive' Class : CH              
## 
oj.frst.acc <- as.numeric(oj.conf$overall[1])
rm(oj.pred)
rm(oj.conf)
plot(varImp(oj.frst), main = "Variable Importance with Random Forest")


rbind(
      data.frame(model = "Bagging", Accuracy = round(j.bag.acc, 5)),
      data.frame(model = "Random Forest", Accuracy = round(oj.frst.acc, 5))
) %>% arrange(desc(Accuracy))
##           model Accuracy
## 1 Random Forest  0.83099
## 2       Bagging  0.81221
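
The table above compares the models on a single 213-observation test split. Since both were fit with 10-fold cross-validation, caret's resamples() can also compare their fold-level metrics; a sketch (cv.results is an illustrative name, and ideally both fits would share fold indices via the index argument of trainControl, so treat the comparison as indicative):

# Compare cross-validated ROC, Sens and Spec of the two models.
cv.results <- resamples(list(Bagging = oj.bag, RandomForest = oj.frst))
summary(cv.results)
bwplot(cv.results, metric = "ROC") # fold-level ROC as a box-and-whisker plot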