#1.- LIBRARIES
library(ISLR)
library(caret)
library(rpart.plot)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
#2.- DATA
oj_dat <- OJ
skim(oj_dat)   # skim_to_wide() is deprecated; skim() gives the same summary
| Data summary |  |
|---|---|
| Name | oj_dat |
| Number of rows | 1070 |
| Number of columns | 18 |
| Column type frequency: |  |
| factor | 2 |
| numeric | 16 |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Purchase | 0 | 1 | FALSE | 2 | CH: 653, MM: 417 |
| Store7 | 0 | 1 | FALSE | 2 | No: 714, Yes: 356 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WeekofPurchase | 0 | 1 | 254.38 | 15.56 | 227.00 | 240.00 | 257.00 | 268.00 | 278.00 | ▆▅▅▇▇ |
| StoreID | 0 | 1 | 3.96 | 2.31 | 1.00 | 2.00 | 3.00 | 7.00 | 7.00 | ▇▅▃▁▇ |
| PriceCH | 0 | 1 | 1.87 | 0.10 | 1.69 | 1.79 | 1.86 | 1.99 | 2.09 | ▅▂▇▆▁ |
| PriceMM | 0 | 1 | 2.09 | 0.13 | 1.69 | 1.99 | 2.09 | 2.18 | 2.29 | ▂▁▃▇▆ |
| DiscCH | 0 | 1 | 0.05 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 0.50 | ▇▁▁▁▁ |
| DiscMM | 0 | 1 | 0.12 | 0.21 | 0.00 | 0.00 | 0.00 | 0.23 | 0.80 | ▇▁▂▁▁ |
| SpecialCH | 0 | 1 | 0.15 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| SpecialMM | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| LoyalCH | 0 | 1 | 0.57 | 0.31 | 0.00 | 0.33 | 0.60 | 0.85 | 1.00 | ▅▃▆▆▇ |
| SalePriceMM | 0 | 1 | 1.96 | 0.25 | 1.19 | 1.69 | 2.09 | 2.13 | 2.29 | ▁▂▂▂▇ |
| SalePriceCH | 0 | 1 | 1.82 | 0.14 | 1.39 | 1.75 | 1.86 | 1.89 | 2.09 | ▂▁▇▇▅ |
| PriceDiff | 0 | 1 | 0.15 | 0.27 | -0.67 | 0.00 | 0.23 | 0.32 | 0.64 | ▁▂▃▇▂ |
| PctDiscMM | 0 | 1 | 0.06 | 0.10 | 0.00 | 0.00 | 0.00 | 0.11 | 0.40 | ▇▁▂▁▁ |
| PctDiscCH | 0 | 1 | 0.03 | 0.06 | 0.00 | 0.00 | 0.00 | 0.00 | 0.25 | ▇▁▁▁▁ |
| ListPriceDiff | 0 | 1 | 0.22 | 0.11 | 0.00 | 0.14 | 0.24 | 0.30 | 0.44 | ▂▃▆▇▁ |
| STORE | 0 | 1 | 1.63 | 1.43 | 0.00 | 0.00 | 2.00 | 3.00 | 4.00 | ▇▃▅▅▃ |
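Purchase is moderately imbalanced: 653 of the 1,070 observations are CH (about 61%), which is the baseline that reappears later as the No Information Rate in the confusion matrices. A quick check of that balance (a small sketch, not part of the skim output):
prop.table(table(oj_dat$Purchase))   # roughly 0.61 CH vs. 0.39 MM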
#3.- TRAIN AND TEST PARTITIONS
set.seed(12345)
partition <- createDataPartition(y = oj_dat$Purchase, p = 0.8, list = FALSE)
oj.train <- oj_dat[partition, ]
oj.test <- oj_dat[-partition, ]
rm(partition)
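As a sanity check (a sketch, not in the original script): with p = 0.8 and stratification on Purchase, the split leaves 857 rows for training (the sample size reported by train() below) and 213 for testing, with roughly the same CH/MM mix in each partition.
dim(oj.train)
dim(oj.test)
round(prop.table(table(oj.train$Purchase)), 3)   # should mirror the full-data split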
#4.- DECISION TREES: BAGGING AND RANDOM FOREST
set.seed(123)
oj.bag <- train(Purchase ~ .,
                data = oj.train,
                method = "treebag",               # bagged CART (bagging)
                tuneLength = 5,                   # treebag exposes no tuning parameters, so this has no real effect
                metric = "ROC",                   # evaluate models with ROC
                trControl = trainControl(
                  method = "cv",                  # k-fold cross-validation
                  number = 10,                    # k = 10 folds
                  savePredictions = "final",      # save predictions for the final model
                  classProbs = TRUE,              # return class probabilities in addition to predicted classes
                  summaryFunction = twoClassSummary  # ROC / Sens / Spec for a binary response
                )
)
oj.bag
## Bagged CART
##
## 857 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 770, 772, 771, 772, 772, 771, ...
## Resampling results:
##
## ROC Sens Spec
## 0.8485285 0.8220972 0.7093583
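A note on this output: bagged CART ("treebag", from the ipred package) has no tuning parameters, so tuneLength has no effect and caret reports a single resampling result. This can be confirmed, and the fold-level metrics behind the averages inspected, with a short sketch:
modelLookup("treebag")   # only a placeholder parameter row: nothing to tune
oj.bag$resample          # per-fold ROC / Sens / Spec for the final model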
oj.pred <- predict(oj.bag, oj.test, type = "raw")
plot(oj.test$Purchase, oj.pred,
main = "Bagging Classification: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")
oj.conf <- confusionMatrix(data = oj.pred,
reference = oj.test$Purchase)
oj.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 109 19
## MM 21 64
##
## Accuracy : 0.8122
## 95% CI : (0.7532, 0.8623)
## No Information Rate : 0.6103
## P-Value [Acc > NIR] : 1.758e-10
##
## Kappa : 0.6069
##
## Mcnemar's Test P-Value : 0.8744
##
## Sensitivity : 0.8385
## Specificity : 0.7711
## Pos Pred Value : 0.8516
## Neg Pred Value : 0.7529
## Prevalence : 0.6103
## Detection Rate : 0.5117
## Detection Prevalence : 0.6009
## Balanced Accuracy : 0.8048
##
## 'Positive' Class : CH
##
oj.bag.acc <- as.numeric(oj.conf$overall[1])
rm(oj.pred)
rm(oj.conf)
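Since the models are tuned on ROC, a test-set AUC is a natural companion to accuracy. A sketch using the pROC package (an extra dependency, not loaded above):
library(pROC)
oj.prob <- predict(oj.bag, oj.test, type = "prob")   # class probabilities
roc.bag <- roc(response = oj.test$Purchase,
               predictor = oj.prob$CH,
               levels = c("MM", "CH"))               # MM = control, CH = case
auc(roc.bag)
plot(roc.bag, main = "Bagging: Test-Set ROC Curve")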
plot(varImp(oj.bag), main = "Variable Importance with Bagging")
oj.frst <- train(Purchase ~ .,
                 data = oj.train,
                 method = "ranger",                # random forest via ranger
                 tuneLength = 5,                   # 5 values per tuning parameter; caret builds the grid
                 metric = "ROC",                   # evaluate hyperparameter combinations with ROC
                 trControl = trainControl(
                   method = "cv",                  # k-fold cross-validation
                   number = 10,                    # k = 10 folds
                   savePredictions = "final",      # save predictions for the optimal tuning parameters
                   classProbs = TRUE,              # return class probabilities in addition to predicted classes
                   summaryFunction = twoClassSummary  # ROC / Sens / Spec for a binary response
                 )
)
oj.frst
## Random Forest
##
## 857 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 772, 771, 771, 772, 771, 771, ...
## Resampling results across tuning parameters:
##
## mtry splitrule ROC Sens Spec
## 2 gini 0.8606099 0.8643687 0.7089127
## 2 extratrees 0.8517494 0.8910740 0.6403743
## 5 gini 0.8662687 0.8509434 0.7271836
## 5 extratrees 0.8629548 0.8624093 0.6942068
## 9 gini 0.8665160 0.8355588 0.7363636
## 9 extratrees 0.8633796 0.8431785 0.7122103
## 13 gini 0.8624577 0.8241292 0.7454545
## 13 extratrees 0.8599718 0.8316401 0.7244207
## 17 gini 0.8603882 0.8165094 0.7456328
## 17 extratrees 0.8606794 0.8259434 0.7213904
##
## Tuning parameter 'min.node.size' was held constant at a value of 1
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 9, splitrule = gini
## and min.node.size = 1.
plot(oj.frst)
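With tuneLength = 5, caret chose the candidate grid itself: five mtry values crossed with both split rules, with min.node.size held at 1. For finer control, an explicit grid can be supplied instead; a sketch centered on the selected values (mtry = 9, gini):
rf.grid <- expand.grid(mtry = c(5, 7, 9, 11, 13),
                       splitrule = "gini",
                       min.node.size = c(1, 5, 10))
# refit with tuneGrid = rf.grid in place of tuneLength = 5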
oj.pred <- predict(oj.frst, oj.test, type = "raw")
plot(oj.test$Purchase, oj.pred,
main = "Random Forest Classification: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")
oj.conf <- confusionMatrix(data = oj.pred,
reference = oj.test$Purchase)
oj.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 110 16
## MM 20 67
##
## Accuracy : 0.831
## 95% CI : (0.7738, 0.8787)
## No Information Rate : 0.6103
## P-Value [Acc > NIR] : 2.296e-12
##
## Kappa : 0.6477
##
## Mcnemar's Test P-Value : 0.6171
##
## Sensitivity : 0.8462
## Specificity : 0.8072
## Pos Pred Value : 0.8730
## Neg Pred Value : 0.7701
## Prevalence : 0.6103
## Detection Rate : 0.5164
## Detection Prevalence : 0.5915
## Balanced Accuracy : 0.8267
##
## 'Positive' Class : CH
##
oj.frst.acc <- as.numeric(oj.conf$overall[1])
rm(oj.pred)
rm(oj.conf)
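ranger does not compute variable importance unless asked to, so varImp() on oj.frst as fitted above would likely fail. A sketch of how the random-forest importances could be obtained, refitting at the selected tuning values with impurity importance switched on (importance = "impurity" is passed through to ranger):
oj.frst.imp <- train(Purchase ~ .,
                     data = oj.train,
                     method = "ranger",
                     importance = "impurity",      # ask ranger for impurity importance
                     tuneGrid = oj.frst$bestTune,  # reuse mtry = 9, gini, min.node.size = 1
                     metric = "ROC",
                     trControl = trainControl(method = "cv", number = 10,
                                              classProbs = TRUE,
                                              summaryFunction = twoClassSummary))
plot(varImp(oj.frst.imp), main = "Variable Importance with Random Forest")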
rbind(
  data.frame(model = "Bagging", Accuracy = round(oj.bag.acc, 5)),
  data.frame(model = "Random Forest", Accuracy = round(oj.frst.acc, 5))
) %>% arrange(desc(Accuracy))
## model Accuracy
## 1 Random Forest 0.83099
## 2 Bagging 0.81221
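The accuracies above come from a single 20% hold-out split. The two models can also be compared on their cross-validation resamples with caret's resamples(); a sketch (for a strictly paired comparison, both fits would need the same fold indices supplied via trainControl(index = ...)):
cv.compare <- resamples(list(Bagging = oj.bag, `Random Forest` = oj.frst))
summary(cv.compare)
bwplot(cv.compare, metric = "ROC")   # lattice is already loaded by caret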