1 Read Data

library(dplyr)

purchase <- read.csv("Customer_Behaviour.csv", stringsAsFactors = T)
purchase

Deskripsi data:

Gender: Jenis Kelamin (Male, Female)
Age: Range usia (< 30, 30-50, > 50)
Salary: Kategori Gaji Customer (Low, Medium, High)
Purchased: Apakah klien membeli produk kita atau tidak (Yes, No)

2 Cross Validation

RNGkind(sample.kind = "Rounding")
set.seed(100)

purchase_intrain <- sample(nrow(purchase), nrow(purchase)*0.8)
purchase_train <- purchase[purchase_intrain, ]
purchase_test <- purchase[-purchase_intrain, ]

prop.table(table(purchase_train$Purchase))

## 
##       No      Yes 
## 0.640625 0.359375

3 Upsampling

# upsampling

library(caret)

RNGkind(sample.kind = "Rounding")
set.seed(100)


purchase_train <- upSample(x = purchase_train %>% select(-Purchased),
                         y = purchase_train$Purchased,
                         yname = "Purchased")

prop.table(table(purchase_train$Purchased))

## 
##  No Yes 
## 0.5 0.5

4 Naive Bayes

4.1 Modelling

library(e1071)

naive_model <- naiveBayes(x = purchase_train %>% select(-Purchased), # kolom-kolom prediktor
                          y = purchase_train$Purchased) # kolom target variable
naive_model

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = purchase_train %>% select(-Purchased), 
##     y = purchase_train$Purchased)
## 
## A-priori probabilities:
## purchase_train$Purchased
##  No Yes 
## 0.5 0.5 
## 
## Conditional probabilities:
##                         Gender
## purchase_train$Purchased    Female      Male
##                      No  0.5170732 0.4829268
##                      Yes 0.5756098 0.4243902
## 
##                         Age
## purchase_train$Purchased        < 30        > 50       30-50
##                      No  0.346341463 0.009756098 0.643902439
##                      Yes 0.034146341 0.336585366 0.629268293
## 
##                         Salary
## purchase_train$Purchased       High        Low     Medium
##                      No  0.06829268 0.22439024 0.70731707
##                      Yes 0.48292683 0.29756098 0.21951220

4.2 Confusion Matrix

#confusion matrix data train
prediction_naive_train <- predict(object = naive_model, # nama model
        newdata = purchase_train,
        type = "class") # probabilitas
library(caret)
eval_naive_train <- confusionMatrix(prediction_naive_train, reference = purchase_train$Purchased, positive = "Yes")

#confusion matrix data test
prediction_naive_test <- predict(object = naive_model, # nama model
        newdata = purchase_test,
        type = "class") # probabilitas

library(caret)
eval_naive_test <- confusionMatrix(prediction_naive_test, reference = purchase_test$Purchased, positive = "Yes")
eval_naive_test

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  46   1
##        Yes  6  27
##                                          
##                Accuracy : 0.9125         
##                  95% CI : (0.828, 0.9641)
##     No Information Rate : 0.65           
##     P-Value [Acc > NIR] : 5.423e-08      
##                                          
##                   Kappa : 0.8153         
##                                          
##  Mcnemar's Test P-Value : 0.1306         
##                                          
##             Sensitivity : 0.9643         
##             Specificity : 0.8846         
##          Pos Pred Value : 0.8182         
##          Neg Pred Value : 0.9787         
##              Prevalence : 0.3500         
##          Detection Rate : 0.3375         
##    Detection Prevalence : 0.4125         
##       Balanced Accuracy : 0.9245         
##                                          
##        'Positive' Class : Yes            
##

4.3 ROC & AUC

# ambil hasil prediksi dalam bentuk probability
purchase_pred_prob <- predict(object = naive_model, 
                          newdata = purchase_test,
                          type = "raw")

head(purchase_pred_prob)

##             No        Yes
## [1,] 0.8969466 0.10305340
## [2,] 0.9670607 0.03293928
## [3,] 0.9738155 0.02618454
## [4,] 0.8969466 0.10305340
## [5,] 0.9738155 0.02618454
## [6,] 0.4675379 0.53246208

# menyiapkan pred vs actual
data_roc_naive <- data.frame(pred_prob = purchase_pred_prob[,"Yes"],
                       actual = ifelse(purchase_test$Purchased == "Yes", 1, 0))


head(data_roc_naive)

Membuat ROC dengan menyiapkan objek prediction()

library(ROCR)

#object prediction

naive_roc <- prediction(predictions = data_roc_naive$pred_prob,
                      labels = data_roc_naive$actual)

# nilai AUC

naive_auc <- performance(naive_roc, measure = "auc")
naive_auc@y.values[[1]]

## [1] 0.9330357

4.4 ROC curve and AUC

plot(performance(naive_roc, "tpr", "fpr"))
abline(0, 1, lty = 2)
text(0.4, 0.6, paste("AUC = ", round(naive_auc@y.values[[1]], 2)))

5 Decision Tree

5.1 Modelling

library(partykit)
model_dt <- ctree(formula = purchase_train$Purchased ~.,
                  data = purchase_train %>% select(-Purchased),
                  control = ctree_control(mincriterion=0.95))
plot(model_dt, type = "simple")

5.2 Confusion Matrix

# prediction to data train
pred_train_dt <- predict(model_dt, newdata = purchase_train)
confusionMatrix(pred_train_dt, reference = purchase_train$Purchased, positive = "Yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  167  28
##        Yes  38 177
##                                           
##                Accuracy : 0.839           
##                  95% CI : (0.7998, 0.8733)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.678           
##                                           
##  Mcnemar's Test P-Value : 0.2679          
##                                           
##             Sensitivity : 0.8634          
##             Specificity : 0.8146          
##          Pos Pred Value : 0.8233          
##          Neg Pred Value : 0.8564          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4317          
##    Detection Prevalence : 0.5244          
##       Balanced Accuracy : 0.8390          
##                                           
##        'Positive' Class : Yes             
##

# prediction to data test
pred_test_dt <- predict(model_dt, newdata = purchase_test)
eval_dt_test <- confusionMatrix(pred_test_dt, reference = purchase_test$Purchased, positive = "Yes")
eval_dt_test

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  44   1
##        Yes  8  27
##                                           
##                Accuracy : 0.8875          
##                  95% CI : (0.7972, 0.9472)
##     No Information Rate : 0.65            
##     P-Value [Acc > NIR] : 1.228e-06       
##                                           
##                   Kappa : 0.7662          
##                                           
##  Mcnemar's Test P-Value : 0.0455          
##                                           
##             Sensitivity : 0.9643          
##             Specificity : 0.8462          
##          Pos Pred Value : 0.7714          
##          Neg Pred Value : 0.9778          
##              Prevalence : 0.3500          
##          Detection Rate : 0.3375          
##    Detection Prevalence : 0.4375          
##       Balanced Accuracy : 0.9052          
##                                           
##        'Positive' Class : Yes             
##

5.3 ROC & AUC

# ambil hasil prediksi dalam bentuk probability
purchase_pred_prob <- predict(object = model_dt, 
                          newdata = purchase_test,
                          type = "prob")

head(purchase_pred_prob)

##           No       Yes
## 1  1.0000000 0.0000000
## 3  1.0000000 0.0000000
## 6  1.0000000 0.0000000
## 9  1.0000000 0.0000000
## 16 1.0000000 0.0000000
## 17 0.3898305 0.6101695

# menyiapkan pred vs actual
data_roc_dt <- data.frame(pred_prob = purchase_pred_prob[,"Yes"],
                       actual = ifelse(purchase_test$Purchased == "Yes", 1, 0))


head(data_roc_dt)

Membuat ROC dengan menyiapkan objek prediction()

library(ROCR)

#object prediction

dt_roc <- prediction(predictions = data_roc_dt$pred_prob,
                      labels = data_roc_dt$actual)

# nilai AUC

dt_auc <- performance(dt_roc, measure = "auc")
dt_auc@y.values[[1]]

## [1] 0.9350962

5.4 ROC curve and AUC

plot(performance(dt_roc, "tpr", "fpr"))
abline(0, 1, lty = 2)
text(0.4, 0.6, paste("AUC = ", round(dt_auc@y.values[[1]], 2)))

6 Random Forest

6.1 Modelling

set.seed(417)

ctrl <- trainControl(method = "repeatedcv",
                      number = 5, # k-fold
                      repeats = 3) # repetisi
purchase_forest <- train(Purchased ~ .,
                    data = purchase_train,
                    method = "rf", # random forest
                    trControl = ctrl)

6.2 Confusion Matrix

#confusion matrix data train
pred_train_rf <- predict(purchase_forest, newdata = purchase_train)
confusionMatrix(pred_train_rf, reference = purchase_train$Purchased, positive = "Yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  167  28
##        Yes  38 177
##                                           
##                Accuracy : 0.839           
##                  95% CI : (0.7998, 0.8733)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.678           
##                                           
##  Mcnemar's Test P-Value : 0.2679          
##                                           
##             Sensitivity : 0.8634          
##             Specificity : 0.8146          
##          Pos Pred Value : 0.8233          
##          Neg Pred Value : 0.8564          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4317          
##    Detection Prevalence : 0.5244          
##       Balanced Accuracy : 0.8390          
##                                           
##        'Positive' Class : Yes             
##

#confusion matrix data test
pred_test_rf <- predict(purchase_forest, newdata = purchase_test)
eval_rf_test <- confusionMatrix(pred_test_rf, reference = purchase_test$Purchased, positive = "Yes")
eval_rf_test

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  44   1
##        Yes  8  27
##                                           
##                Accuracy : 0.8875          
##                  95% CI : (0.7972, 0.9472)
##     No Information Rate : 0.65            
##     P-Value [Acc > NIR] : 1.228e-06       
##                                           
##                   Kappa : 0.7662          
##                                           
##  Mcnemar's Test P-Value : 0.0455          
##                                           
##             Sensitivity : 0.9643          
##             Specificity : 0.8462          
##          Pos Pred Value : 0.7714          
##          Neg Pred Value : 0.9778          
##              Prevalence : 0.3500          
##          Detection Rate : 0.3375          
##    Detection Prevalence : 0.4375          
##       Balanced Accuracy : 0.9052          
##                                           
##        'Positive' Class : Yes             
##

6.3 ROC & AUC

# ambil hasil prediksi dalam bentuk probability
purchase_pred_prob <- predict(object = purchase_forest, 
                          newdata = purchase_test,
                          type = "prob")

head(purchase_pred_prob)

# menyiapkan pred vs actual
data_roc_rf <- data.frame(pred_prob = purchase_pred_prob[,"Yes"],
                       actual = ifelse(purchase_test$Purchased == "Yes", 1, 0))


head(data_roc_rf)

Membuat ROC dengan menyiapkan objek prediction()

library(ROCR)

#object prediction

rf_roc <- prediction(predictions = data_roc_rf$pred_prob,
                      labels = data_roc_rf$actual)

# nilai AUC

rf_auc <- performance(rf_roc, measure = "auc")
rf_auc@y.values[[1]]

## [1] 0.9138049

6.4 ROC curve and AUC

plot(performance(rf_roc, "tpr", "fpr"))
abline(0, 1, lty = 2)
text(0.4, 0.6, paste("AUC = ", round(rf_auc@y.values[[1]], 2)))

7 Conclusion

eval_naive_test <- data_frame(Accuracy = eval_naive_test$overall[1],
           Recall = eval_naive_test$byClass[1],
           Specificity = eval_naive_test$byClass[2],
           Precision = eval_naive_test$byClass[3],
           AUC=naive_auc@y.values[[1]])
eval_dt_test <- data_frame(Accuracy = eval_dt_test$overall[1],
           Recall = eval_dt_test$byClass[1],
           Specificity = eval_dt_test$byClass[2],
           Precision = eval_dt_test$byClass[3],
           AUC=dt_auc@y.values[[1]])
eval_rf_test <- data_frame(Accuracy = eval_rf_test$overall[1],
           Recall = eval_rf_test$byClass[1],
           Specificity = eval_rf_test$byClass[2],
           Precision = eval_rf_test$byClass[3],
           AUC=rf_auc@y.values[[1]])
b <- rbind("Naive Bayes" = eval_naive_test, "Decision Tree" = eval_dt_test, "Random Forest" = eval_rf_test)
cbind(b)

Based on confusion matrix of data test, we can conclude that in this case Naive Bayes is the best model to be used for prediction, with highest accuracy, recall, specificity, precision, and AUC than other model. But overall, all model already gave a good prediction results and can be used for prediction.

LBB ML2: Naive Bayes, Decision Tree, and Random Forest

Debrina Sugiharto

2/7/2021

1 Read Data

2 Cross Validation

3 Upsampling

4 Naive Bayes

4.1 Modelling

4.2 Confusion Matrix

4.3 ROC & AUC

4.4 ROC curve and AUC

5 Decision Tree

5.1 Modelling

5.2 Confusion Matrix

5.3 ROC & AUC

5.4 ROC curve and AUC

6 Random Forest

6.1 Modelling

6.2 Confusion Matrix

6.3 ROC & AUC

6.4 ROC curve and AUC

7 Conclusion