Decision tree adalah salah satu algoritma machine learning yang digunakan untuk klasifikasi dan regresi. Algoritma ini membangun sebuah struktur pohon keputusan berdasarkan fitur-fitur dari data input dan labelnya (atau nilai target). Pohon keputusan ini dapat digunakan untuk membuat prediksi terhadap data yang baru.

Library

library(caret)
library(DescTools)
library(rpart)
library(rpart.plot)
library(readxl)
library(ISLR)  # For OJ datasets

Data

Deskripsi Data

Data yang digunakan adalah data OJ dari package ISLR. Data tersebut berisi 1070 pembelian di mana pelanggan membeli Citrus Hill atau Minute Maid Orange Juice. Sejumlah karakteristik pelanggan dan produk dicatat. Data OJ memiliki 18 variabel sebagai berikut:

Purchase A factor with levels CH and MM indicating whether the customer purchased Citrus Hill or Minute Maid Orange Juice

WeekofPurchase Week of purchase

StoreID Store ID

PriceCH Price charged for CH

PriceMM Price charged for MM

DiscCH Discount offered for CH

DiscMM Discount offered for MM

SpecialCH Indicator of special on CH

SpecialMM Indicator of special on MM

LoyalCH Customer brand loyalty for CH

SalePriceMM Sale price for MM

SalePriceCH Sale price for CH

PriceDiff Sale price of MM less sale price of CH

Store7 A factor with levels No and Yes indicating whether the sale is at Store 7

PctDiscMM Percentage discount for MM

PctDiscCH Percentage discount for CH

ListPriceDiff List price of MM less list price of CH

STORE Which of 5 possible stores the sale occurred at

# Bind the OJ data set (made available by the ISLR package) to a working name.
data_OJ <- OJ

# Preview the first six rows to check column names and value formats.
head(data_OJ, n = 6L)

##   Purchase WeekofPurchase StoreID PriceCH PriceMM DiscCH DiscMM SpecialCH
## 1       CH            237       1    1.75    1.99   0.00    0.0         0
## 2       CH            239       1    1.75    1.99   0.00    0.3         0
## 3       CH            245       1    1.86    2.09   0.17    0.0         0
## 4       MM            227       1    1.69    1.69   0.00    0.0         0
## 5       CH            228       7    1.69    1.69   0.00    0.0         0
## 6       CH            230       7    1.69    1.99   0.00    0.0         0
##   SpecialMM  LoyalCH SalePriceMM SalePriceCH PriceDiff Store7 PctDiscMM
## 1         0 0.500000        1.99        1.75      0.24     No  0.000000
## 2         1 0.600000        1.69        1.75     -0.06     No  0.150754
## 3         0 0.680000        2.09        1.69      0.40     No  0.000000
## 4         0 0.400000        1.69        1.69      0.00     No  0.000000
## 5         0 0.956535        1.69        1.69      0.00    Yes  0.000000
## 6         1 0.965228        1.99        1.69      0.30    Yes  0.000000
##   PctDiscCH ListPriceDiff STORE
## 1  0.000000          0.24     1
## 2  0.000000          0.24     1
## 3  0.091398          0.23     1
## 4  0.000000          0.00     1
## 5  0.000000          0.00     0
## 6  0.000000          0.30     0

# Per-column summary: level counts for factors, quartiles and mean for numerics.
summary(object = data_OJ)

##  Purchase WeekofPurchase     StoreID        PriceCH         PriceMM     
##  CH:653   Min.   :227.0   Min.   :1.00   Min.   :1.690   Min.   :1.690  
##  MM:417   1st Qu.:240.0   1st Qu.:2.00   1st Qu.:1.790   1st Qu.:1.990  
##           Median :257.0   Median :3.00   Median :1.860   Median :2.090  
##           Mean   :254.4   Mean   :3.96   Mean   :1.867   Mean   :2.085  
##           3rd Qu.:268.0   3rd Qu.:7.00   3rd Qu.:1.990   3rd Qu.:2.180  
##           Max.   :278.0   Max.   :7.00   Max.   :2.090   Max.   :2.290  
##      DiscCH            DiscMM         SpecialCH        SpecialMM     
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.05186   Mean   :0.1234   Mean   :0.1477   Mean   :0.1617  
##  3rd Qu.:0.00000   3rd Qu.:0.2300   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :0.50000   Max.   :0.8000   Max.   :1.0000   Max.   :1.0000  
##     LoyalCH          SalePriceMM     SalePriceCH      PriceDiff       Store7   
##  Min.   :0.000011   Min.   :1.190   Min.   :1.390   Min.   :-0.6700   No :714  
##  1st Qu.:0.325257   1st Qu.:1.690   1st Qu.:1.750   1st Qu.: 0.0000   Yes:356  
##  Median :0.600000   Median :2.090   Median :1.860   Median : 0.2300            
##  Mean   :0.565782   Mean   :1.962   Mean   :1.816   Mean   : 0.1465            
##  3rd Qu.:0.850873   3rd Qu.:2.130   3rd Qu.:1.890   3rd Qu.: 0.3200            
##  Max.   :0.999947   Max.   :2.290   Max.   :2.090   Max.   : 0.6400            
##    PctDiscMM        PctDiscCH       ListPriceDiff       STORE      
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.140   1st Qu.:0.000  
##  Median :0.0000   Median :0.00000   Median :0.240   Median :2.000  
##  Mean   :0.0593   Mean   :0.02731   Mean   :0.218   Mean   :1.631  
##  3rd Qu.:0.1127   3rd Qu.:0.00000   3rd Qu.:0.300   3rd Qu.:3.000  
##  Max.   :0.4020   Max.   :0.25269   Max.   :0.440   Max.   :4.000

Eksplorasi Data

library(DataExplorer)
# Quick structural overview of the data set (column types and missingness).
plot_intro(data = data_OJ)

library(dplyr)
library(ggplot2)
# Percentage of observations per Purchase class, ordered by descending factor
# level, together with the mid-slice position at which each label is drawn.
plotdata <- data_OJ %>%
  count(Purchase) %>%
  arrange(desc(Purchase)) %>%
  mutate(
    prop = round(100 * n / sum(n), 1),
    lab.ypos = cumsum(prop) - prop / 2
  )

# Pie chart: a single stacked bar wrapped onto polar coordinates, one slice
# per Purchase class, labelled with its percentage.
ggplot(data = plotdata, mapping = aes(x = "", y = prop, fill = Purchase)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  geom_text(mapping = aes(y = lab.ypos, label = prop), color = "black") +
  theme_void() +
  labs(title = "Persentase Purchase")

Partisi Data

# Fix the RNG state so the random split below is reproducible.
set.seed(123)

# Share of the observations assigned to the training set.
train_proportion <- 0.8

# Row indices of the training observations, sampled within each level of
# Purchase (stratified) and returned as a matrix rather than a list.
train_index <- createDataPartition(
  y = data_OJ$Purchase,
  times = 1,
  p = train_proportion,
  list = FALSE
)

# Training rows are the sampled indices; the test set is everything else.
test_set1 <- data_OJ[-train_index, ]
train_set1 <- data_OJ[train_index, ]

Decision Tree

# Fix the RNG state before fitting so the result is repeatable
# (presumably relevant for rpart's internal cross-validation -- confirm).
set.seed(123)

# Classification tree predicting Purchase from all other columns of the
# training set; method = "class" requests classification, not regression.
dataoj_class <- rpart(
  Purchase ~ .,
  data = train_set1,
  method = "class"
)

# Draw the fitted tree.
rpart.plot(dataoj_class, yesno = TRUE)

# Predicted class label for every observation in the held-out test set.
prediksi_test <- predict(dataoj_class, newdata = test_set1, type = "class")

# Actual vs. predicted classes; both are factors, so this is a categorical
# comparison plot rather than a scatter plot.
plot(
  x = test_set1$Purchase,
  y = prediksi_test,
  main = "Simple Classification: Predicted vs. Actual",
  xlab = "Actual",
  ylab = "Predicted"
)

# Cross-tabulate the predictions against the true test labels, keep the
# result for later use, and print the table with its summary statistics.
conf_mat_oj <- confusionMatrix(
  data = prediksi_test,
  reference = test_set1$Purchase
)
conf_mat_oj

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  CH  MM
##         CH 109  23
##         MM  21  60
##                                           
##                Accuracy : 0.7934          
##                  95% CI : (0.7328, 0.8457)
##     No Information Rate : 0.6103          
##     P-Value [Acc > NIR] : 8.319e-09       
##                                           
##                   Kappa : 0.5638          
##                                           
##  Mcnemar's Test P-Value : 0.8802          
##                                           
##             Sensitivity : 0.8385          
##             Specificity : 0.7229          
##          Pos Pred Value : 0.8258          
##          Neg Pred Value : 0.7407          
##              Prevalence : 0.6103          
##          Detection Rate : 0.5117          
##    Detection Prevalence : 0.6197          
##       Balanced Accuracy : 0.7807          
##                                           
##        'Positive' Class : CH              
##