#1.- BIBLIOTECAS
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.2.3
library(caret)
## Warning: package 'caret' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.3
## Loading required package: lattice
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.3
## Loading required package: rpart
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
## Warning: package 'skimr' was built under R version 4.2.3
#2.- DATOS
oj_dat <- OJ
skim_to_wide(oj_dat)
## Warning: 'skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")
| Name | Piped data |
| Number of rows | 1070 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| factor | 2 |
| numeric | 16 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Purchase | 0 | 1 | FALSE | 2 | CH: 653, MM: 417 |
| Store7 | 0 | 1 | FALSE | 2 | No: 714, Yes: 356 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| WeekofPurchase | 0 | 1 | 254.38 | 15.56 | 227.00 | 240.00 | 257.00 | 268.00 | 278.00 | ▆▅▅▇▇ |
| StoreID | 0 | 1 | 3.96 | 2.31 | 1.00 | 2.00 | 3.00 | 7.00 | 7.00 | ▇▅▃▁▇ |
| PriceCH | 0 | 1 | 1.87 | 0.10 | 1.69 | 1.79 | 1.86 | 1.99 | 2.09 | ▅▂▇▆▁ |
| PriceMM | 0 | 1 | 2.09 | 0.13 | 1.69 | 1.99 | 2.09 | 2.18 | 2.29 | ▂▁▃▇▆ |
| DiscCH | 0 | 1 | 0.05 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 0.50 | ▇▁▁▁▁ |
| DiscMM | 0 | 1 | 0.12 | 0.21 | 0.00 | 0.00 | 0.00 | 0.23 | 0.80 | ▇▁▂▁▁ |
| SpecialCH | 0 | 1 | 0.15 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| SpecialMM | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| LoyalCH | 0 | 1 | 0.57 | 0.31 | 0.00 | 0.33 | 0.60 | 0.85 | 1.00 | ▅▃▆▆▇ |
| SalePriceMM | 0 | 1 | 1.96 | 0.25 | 1.19 | 1.69 | 2.09 | 2.13 | 2.29 | ▁▂▂▂▇ |
| SalePriceCH | 0 | 1 | 1.82 | 0.14 | 1.39 | 1.75 | 1.86 | 1.89 | 2.09 | ▂▁▇▇▅ |
| PriceDiff | 0 | 1 | 0.15 | 0.27 | -0.67 | 0.00 | 0.23 | 0.32 | 0.64 | ▁▂▃▇▂ |
| PctDiscMM | 0 | 1 | 0.06 | 0.10 | 0.00 | 0.00 | 0.00 | 0.11 | 0.40 | ▇▁▂▁▁ |
| PctDiscCH | 0 | 1 | 0.03 | 0.06 | 0.00 | 0.00 | 0.00 | 0.00 | 0.25 | ▇▁▁▁▁ |
| ListPriceDiff | 0 | 1 | 0.22 | 0.11 | 0.00 | 0.14 | 0.24 | 0.30 | 0.44 | ▂▃▆▇▁ |
| STORE | 0 | 1 | 1.63 | 1.43 | 0.00 | 0.00 | 2.00 | 3.00 | 4.00 | ▇▃▅▅▃ |
#3.- PARTICIONES TRAIN Y TEST
set.seed(12345)
partition <- createDataPartition(y= oj_dat$Purchase, p = 0.8, list = FALSE)
oj.train <- oj_dat[partition, ]
oj.test <- oj_dat[-partition, ]
rm(partition)
#4.- ÁRBOL DE DECISIÓN
set.seed(123)
oj.full_class <- rpart(formula = Purchase ~ .,
data = oj.train,
method = "class", # clasificación (no regresión)
xval = 10 # validación cruzada de 10 veces
)
rpart.plot(oj.full_class, yesno = TRUE)
printcp(oj.full_class)
##
## Classification tree:
## rpart(formula = Purchase ~ ., data = oj.train, method = "class",
## xval = 10)
##
## Variables actually used in tree construction:
## [1] LoyalCH PriceDiff SalePriceMM
##
## Root node error: 334/857 = 0.38973
##
## n= 857
##
## CP nsplit rel error xerror xstd
## 1 0.479042 0 1.00000 1.00000 0.042745
## 2 0.032934 1 0.52096 0.54192 0.035775
## 3 0.013473 3 0.45509 0.47006 0.033905
## 4 0.010000 5 0.42814 0.46407 0.033736
The algorithm included 13 variables in the full tree. The root node contains 334 errors out of the 857 values (39%).
The second step is to prune the tree to optimal size (to avoid overfitting). The CP table in the model summary shows the relavent statistics to choose the appropriate pruning paramter. The rel error column is the error rate / root node error produced when pruning the tree using complexity parameter CP to nsplits splits. The xerror column shows the error rate. A plot of xerror vs cp shows the relationship.
plotcp(oj.full_class)
The dashed line is set at the minimum xerror + xstd. Any value below the
line would be considered statistically significant. A good choice for CP
is often the largest value for which the error is within a standard
deviation of the mimimum error. In this case, the smallest cp is at
0.01. A good way to detect and capture the correct cp is with the
which.min() function, but if you want to choose the smallest
statistically equivalent tree, specify it manually. Use the prune()
function to prune the tree by specifying the associated cost-complexity
cp.
oj.class <- prune(oj.full_class,
cp =
oj.full_class$cptable[which.min(oj.full_class$cptable[, "xerror"]), "CP"])
rm(oj.full_class)
rpart.plot(oj.class, yesno = TRUE)
oj.class.pred <- predict(oj.class, oj.test, type = "class")
plot(oj.test$Purchase, oj.class.pred,
main = "Simple Classification: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
oj.class.conf <- confusionMatrix(data = oj.class.pred,
reference = oj.test$Purchase)
oj.class.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 117 18
## MM 13 65
##
## Accuracy : 0.8545
## 95% CI : (0.7998, 0.8989)
## No Information Rate : 0.6103
## P-Value [Acc > NIR] : 4.83e-15
##
## Kappa : 0.6907
##
## Mcnemar's Test P-Value : 0.4725
##
## Sensitivity : 0.9000
## Specificity : 0.7831
## Pos Pred Value : 0.8667
## Neg Pred Value : 0.8333
## Prevalence : 0.6103
## Detection Rate : 0.5493
## Detection Prevalence : 0.6338
## Balanced Accuracy : 0.8416
##
## 'Positive' Class : CH
##
oj.class.acc <- as.numeric(oj.class.conf$overall[1])
rm(oj.class.pred)
rm(oj.class.conf)
oj.class2 = train(Purchase ~ .,
data = oj.train,
method = "rpart", # for classification tree
tuneLength = 5, # choose up to 5 combination of tuning parameters (cp)
metric='ROC', # evaluate hyperparameter combinations with ROC
trControl = trainControl(
method = "cv", # k-fold cross validation
number = 10, # 10 folds
savePredictions = "final", # save predictions fot the optimal tuning parameter
classProbs = TRUE, # return class probabilities in addition to predicted values
summaryFunction = twoClassSummary # for binary response variable
)
)
oj.class2
## CART
##
## 857 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 771, 772, 772, 772, 770, 770, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.005988024 0.8497527 0.8545718 0.7427807
## 0.008982036 0.8495648 0.8431060 0.7309269
## 0.013473054 0.8385888 0.8279390 0.7426916
## 0.032934132 0.7895605 0.8543904 0.6948307
## 0.479041916 0.6213113 0.9042090 0.3384135
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005988024.
plot(oj.class2)
oj.class.pred <- predict(oj.class2, oj.test, type = "raw")
plot(oj.test$Purchase, oj.class.pred,
main = "Simple Classification: Predicted vs. Actual",
xlab = "Actual",
ylab = "Predicted")
oj.class.conf <- confusionMatrix(data= oj.class.pred,
reference = oj.test$Purchase)
oj.class.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 115 18
## MM 15 65
##
## Accuracy : 0.8451
## 95% CI : (0.7894, 0.8909)
## No Information Rate : 0.6103
## P-Value [Acc > NIR] : 6.311e-14
##
## Kappa : 0.6721
##
## Mcnemar's Test P-Value : 0.7277
##
## Sensitivity : 0.8846
## Specificity : 0.7831
## Pos Pred Value : 0.8647
## Neg Pred Value : 0.8125
## Prevalence : 0.6103
## Detection Rate : 0.5399
## Detection Prevalence : 0.6244
## Balanced Accuracy : 0.8339
##
## 'Positive' Class : CH
##
oj.class.acc2 <- as.numeric(oj.class.conf$overall[1])
rm(oj.class.pred)
rm(oj.class.conf)
rpart.plot(oj.class2$finalModel)
plot(varImp(oj.class2), main = "Variable Importance with Simple Classification")
myGrid <- expand.grid(cp = (0:1)/10)
oj.class3 = train(Purchase ~ .,
data = oj.train,
method = "rpart", # for classification tree
tuneGrid = myGrid, # choose up to 5 combinations of tuning parameters (cp)
metric='ROC', # evaluate hyperparamter combinations with ROC
trControl = trainControl(
method = "cv", # k-fold cross validation
number = 10, # 10 folds
savePredictions = "final", # save predictions for the optimal tuning parameter
classProbs = TRUE, # return class probabilities in addition to predicted values
summaryFunction = twoClassSummary # for binary response variable
)
)
rm(myGrid)
oj.class3
## CART
##
## 857 samples
## 17 predictor
## 2 classes: 'CH', 'MM'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 770, 772, 772, 772, 771, 771, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.0 0.8509904 0.820029 0.7331551
## 0.1 0.7747727 0.826016 0.7235294
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
plot(oj.class3)
oj.class.pred <- predict(oj.class3, oj.test, type = "raw")
plot(oj.test$Purchase, oj.class.pred,
main = "Simple Classification: Predicted vs Actual",
xlab = "Actual",
ylab = "Predicted")
oj.class.conf <- confusionMatrix(data = oj.class.pred,
reference = oj.test$Purchase)
oj.class.conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction CH MM
## CH 116 18
## MM 14 65
##
## Accuracy : 0.8498
## 95% CI : (0.7946, 0.8949)
## No Information Rate : 0.6103
## P-Value [Acc > NIR] : 1.778e-14
##
## Kappa : 0.6814
##
## Mcnemar's Test P-Value : 0.5959
##
## Sensitivity : 0.8923
## Specificity : 0.7831
## Pos Pred Value : 0.8657
## Neg Pred Value : 0.8228
## Prevalence : 0.6103
## Detection Rate : 0.5446
## Detection Prevalence : 0.6291
## Balanced Accuracy : 0.8377
##
## 'Positive' Class : CH
##
oj.class.acc3 <- as.numeric(oj.class.conf$overall[1])
rm(oj.class.pred)
rm(oj.class.conf)
rpart.plot(oj.class3$finalModel)
plot(varImp(oj.class3), main = "Variable Importance with Simple Classification")
rbind(data.frame(model = "Manual Class", Acc = round(oj.class.acc, 5)),
data.frame(model = "Caret w/tuneLength", Acc = round(oj.class.acc2, 5)),
data.frame(model = "Caret w/tuneGrid", Acc = round(oj.class.acc3, 5))
)
## model Acc
## 1 Manual Class 0.85446
## 2 Caret w/tuneLength 0.84507
## 3 Caret w/tuneGrid 0.84977