DECISION TREES 1

#1.- BIBLIOTECAS

library(ISLR)

## Warning: package 'ISLR' was built under R version 4.2.3

library(caret)

## Warning: package 'caret' was built under R version 4.2.3

## Loading required package: ggplot2

## Warning: package 'ggplot2' was built under R version 4.2.3

## Loading required package: lattice

library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.2.3

## Loading required package: rpart

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.3

## Warning: package 'tibble' was built under R version 4.2.3

## Warning: package 'tidyr' was built under R version 4.2.3

## Warning: package 'readr' was built under R version 4.2.3

## Warning: package 'purrr' was built under R version 4.2.3

## Warning: package 'dplyr' was built under R version 4.2.3

## Warning: package 'stringr' was built under R version 4.2.3

## Warning: package 'forcats' was built under R version 4.2.3

## Warning: package 'lubridate' was built under R version 4.2.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ purrr::lift()   masks caret::lift()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(skimr)

## Warning: package 'skimr' was built under R version 4.2.3

#2.- DATOS

oj_dat <- OJ
skim_to_wide(oj_dat)

## Warning: 'skim_to_wide' is deprecated.
## Use 'skim()' instead.
## See help("Deprecated")

Data summary
Name	Piped data
Number of rows	1070
Number of columns	18
_______________________
Column type frequency:
factor	2
numeric	16
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
Purchase	0	1	FALSE	2	CH: 653, MM: 417
Store7	0	1	FALSE	2	No: 714, Yes: 356

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
WeekofPurchase	1	254.38	15.56	227.00	240.00	257.00	268.00	278.00	▆▅▅▇▇
StoreID	1	3.96	2.31	1.00	2.00	3.00	7.00	7.00	▇▅▃▁▇
PriceCH	1	1.87	0.10	1.69	1.79	1.86	1.99	2.09	▅▂▇▆▁
PriceMM	1	2.09	0.13	1.69	1.99	2.09	2.18	2.29	▂▁▃▇▆
DiscCH	1	0.05	0.12	0.00	0.00	0.00	0.00	0.50	▇▁▁▁▁
DiscMM	1	0.12	0.21	0.00	0.00	0.00	0.23	0.80	▇▁▂▁▁
SpecialCH	1	0.15	0.35	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
SpecialMM	1	0.16	0.37	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
LoyalCH	1	0.57	0.31	0.00	0.33	0.60	0.85	1.00	▅▃▆▆▇
SalePriceMM	1	1.96	0.25	1.19	1.69	2.09	2.13	2.29	▁▂▂▂▇
SalePriceCH	1	1.82	0.14	1.39	1.75	1.86	1.89	2.09	▂▁▇▇▅
PriceDiff	1	0.15	0.27	-0.67	0.00	0.23	0.32	0.64	▁▂▃▇▂
PctDiscMM	1	0.06	0.10	0.00	0.00	0.00	0.11	0.40	▇▁▂▁▁
PctDiscCH	1	0.03	0.06	0.00	0.00	0.00	0.00	0.25	▇▁▁▁▁
ListPriceDiff	1	0.22	0.11	0.00	0.14	0.24	0.30	0.44	▂▃▆▇▁
STORE	1	1.63	1.43	0.00	0.00	2.00	3.00	4.00	▇▃▅▅▃

#3.- PARTICIONES TRAIN Y TEST

set.seed(12345)

partition <- createDataPartition(y= oj_dat$Purchase, p = 0.8, list = FALSE)

oj.train <- oj_dat[partition, ]

oj.test <- oj_dat[-partition, ]

rm(partition)

#4.- ÁRBOL DE DECISIÓN

set.seed(123)

oj.full_class <- rpart(formula = Purchase ~ .,
                       data = oj.train,
                       method = "class", # clasificación (no regresión)
                       xval = 10  # validación cruzada de 10 veces
                       )

rpart.plot(oj.full_class, yesno = TRUE)

printcp(oj.full_class)

## 
## Classification tree:
## rpart(formula = Purchase ~ ., data = oj.train, method = "class", 
##     xval = 10)
## 
## Variables actually used in tree construction:
## [1] LoyalCH     PriceDiff   SalePriceMM
## 
## Root node error: 334/857 = 0.38973
## 
## n= 857 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.479042      0   1.00000 1.00000 0.042745
## 2 0.032934      1   0.52096 0.54192 0.035775
## 3 0.013473      3   0.45509 0.47006 0.033905
## 4 0.010000      5   0.42814 0.46407 0.033736

The algorithm included 13 variables in the full tree. The root node contains 334 errors out of the 857 values (39%).

The second step is to prune the tree to optimal size (to avoid overfitting). The CP table in the model summary shows the relavent statistics to choose the appropriate pruning paramter. The rel error column is the error rate / root node error produced when pruning the tree using complexity parameter CP to nsplits splits. The xerror column shows the error rate. A plot of xerror vs cp shows the relationship.

plotcp(oj.full_class)

The dashed line is set at the minimum xerror + xstd. Any value below the line would be considered statistically significant. A good choice for CP is often the largest value for which the error is within a standard deviation of the mimimum error. In this case, the smallest cp is at 0.01. A good way to detect and capture the correct cp is with the which.min() function, but if you want to choose the smallest statistically equivalent tree, specify it manually. Use the prune() function to prune the tree by specifying the associated cost-complexity cp.

oj.class <- prune(oj.full_class,
                  cp = 
oj.full_class$cptable[which.min(oj.full_class$cptable[, "xerror"]), "CP"])
rm(oj.full_class)
rpart.plot(oj.class, yesno = TRUE)

oj.class.pred <- predict(oj.class, oj.test, type = "class")
plot(oj.test$Purchase, oj.class.pred,
     main = "Simple Classification: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")

oj.class.conf <- confusionMatrix(data = oj.class.pred,
                                  reference = oj.test$Purchase)

oj.class.conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 117  18
##         MM  13  65
##                                           
##                Accuracy : 0.8545          
##                  95% CI : (0.7998, 0.8989)
##     No Information Rate : 0.6103          
##     P-Value [Acc > NIR] : 4.83e-15        
##                                           
##                   Kappa : 0.6907          
##                                           
##  Mcnemar's Test P-Value : 0.4725          
##                                           
##             Sensitivity : 0.9000          
##             Specificity : 0.7831          
##          Pos Pred Value : 0.8667          
##          Neg Pred Value : 0.8333          
##              Prevalence : 0.6103          
##          Detection Rate : 0.5493          
##    Detection Prevalence : 0.6338          
##       Balanced Accuracy : 0.8416          
##                                           
##        'Positive' Class : CH              
##

oj.class.acc <- as.numeric(oj.class.conf$overall[1])
rm(oj.class.pred)
rm(oj.class.conf)

oj.class2 = train(Purchase ~ .,
                    data = oj.train,
                    method = "rpart", # for classification tree
                    tuneLength = 5,  # choose up to 5 combination of tuning parameters (cp)
                    metric='ROC',    # evaluate hyperparameter combinations with ROC
                    trControl = trainControl(
                      method = "cv", # k-fold cross validation
                      number = 10, # 10 folds
                      savePredictions = "final",    # save predictions fot the optimal tuning parameter
                      classProbs = TRUE, # return class probabilities in addition to predicted values
                      summaryFunction = twoClassSummary # for binary response variable
                      )
                    )
oj.class2

## CART 
## 
## 857 samples
##  17 predictor
##   2 classes: 'CH', 'MM' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 771, 772, 772, 772, 770, 770, ... 
## Resampling results across tuning parameters:
## 
##   cp           ROC        Sens       Spec     
##   0.005988024  0.8497527  0.8545718  0.7427807
##   0.008982036  0.8495648  0.8431060  0.7309269
##   0.013473054  0.8385888  0.8279390  0.7426916
##   0.032934132  0.7895605  0.8543904  0.6948307
##   0.479041916  0.6213113  0.9042090  0.3384135
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005988024.

plot(oj.class2)

oj.class.pred <- predict(oj.class2, oj.test, type = "raw")
plot(oj.test$Purchase, oj.class.pred,
     main = "Simple Classification: Predicted vs. Actual",
     xlab = "Actual",
     ylab = "Predicted")

oj.class.conf <- confusionMatrix(data= oj.class.pred,
                                  reference = oj.test$Purchase)

oj.class.conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 115  18
##         MM  15  65
##                                           
##                Accuracy : 0.8451          
##                  95% CI : (0.7894, 0.8909)
##     No Information Rate : 0.6103          
##     P-Value [Acc > NIR] : 6.311e-14       
##                                           
##                   Kappa : 0.6721          
##                                           
##  Mcnemar's Test P-Value : 0.7277          
##                                           
##             Sensitivity : 0.8846          
##             Specificity : 0.7831          
##          Pos Pred Value : 0.8647          
##          Neg Pred Value : 0.8125          
##              Prevalence : 0.6103          
##          Detection Rate : 0.5399          
##    Detection Prevalence : 0.6244          
##       Balanced Accuracy : 0.8339          
##                                           
##        'Positive' Class : CH              
##

oj.class.acc2 <- as.numeric(oj.class.conf$overall[1])
rm(oj.class.pred)
rm(oj.class.conf)
rpart.plot(oj.class2$finalModel)

plot(varImp(oj.class2), main = "Variable Importance with Simple Classification")

myGrid <-  expand.grid(cp = (0:1)/10)
oj.class3 = train(Purchase ~ .,
                    data = oj.train,
                    method = "rpart",  # for classification tree
                    tuneGrid = myGrid,  # choose up to 5 combinations of tuning parameters (cp)
                    metric='ROC',  # evaluate hyperparamter combinations with ROC
                    trControl = trainControl(
                      method = "cv",  # k-fold cross validation
                      number = 10,  # 10 folds
                      savePredictions = "final",       # save predictions for the optimal tuning parameter
                      classProbs = TRUE,  # return class probabilities in addition to predicted values
                      summaryFunction = twoClassSummary  # for binary response variable
                      )
                    )
rm(myGrid)
oj.class3

## CART 
## 
## 857 samples
##  17 predictor
##   2 classes: 'CH', 'MM' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 770, 772, 772, 772, 771, 771, ... 
## Resampling results across tuning parameters:
## 
##   cp   ROC        Sens      Spec     
##   0.0  0.8509904  0.820029  0.7331551
##   0.1  0.7747727  0.826016  0.7235294
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.

plot(oj.class3)

oj.class.pred <- predict(oj.class3, oj.test, type = "raw")
plot(oj.test$Purchase, oj.class.pred,
     main = "Simple Classification: Predicted vs Actual",
     xlab = "Actual",
     ylab = "Predicted")

oj.class.conf <- confusionMatrix(data = oj.class.pred,
                                  reference = oj.test$Purchase)

oj.class.conf

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 116  18
##         MM  14  65
##                                           
##                Accuracy : 0.8498          
##                  95% CI : (0.7946, 0.8949)
##     No Information Rate : 0.6103          
##     P-Value [Acc > NIR] : 1.778e-14       
##                                           
##                   Kappa : 0.6814          
##                                           
##  Mcnemar's Test P-Value : 0.5959          
##                                           
##             Sensitivity : 0.8923          
##             Specificity : 0.7831          
##          Pos Pred Value : 0.8657          
##          Neg Pred Value : 0.8228          
##              Prevalence : 0.6103          
##          Detection Rate : 0.5446          
##    Detection Prevalence : 0.6291          
##       Balanced Accuracy : 0.8377          
##                                           
##        'Positive' Class : CH              
##

oj.class.acc3 <- as.numeric(oj.class.conf$overall[1])
rm(oj.class.pred)
rm(oj.class.conf)
rpart.plot(oj.class3$finalModel)

plot(varImp(oj.class3), main = "Variable Importance with Simple Classification")

rbind(data.frame(model = "Manual Class", Acc = round(oj.class.acc, 5)),
      data.frame(model = "Caret w/tuneLength", Acc = round(oj.class.acc2, 5)),
      data.frame(model = "Caret w/tuneGrid", Acc = round(oj.class.acc3, 5))
)

##                model     Acc
## 1       Manual Class 0.85446
## 2 Caret w/tuneLength 0.84507
## 3   Caret w/tuneGrid 0.84977

DECISION TREES 1

2023-10-21