library(dplyr)
purchase <- read.csv("Customer_Behaviour.csv", stringsAsFactors = T)
purchase
Deskripsi data:
RNGkind(sample.kind = "Rounding")
set.seed(100)
purchase_intrain <- sample(nrow(purchase), nrow(purchase)*0.8)
purchase_train <- purchase[purchase_intrain, ]
purchase_test <- purchase[-purchase_intrain, ]
prop.table(table(purchase_train$Purchase))
##
## No Yes
## 0.640625 0.359375
# upsampling
library(caret)
RNGkind(sample.kind = "Rounding")
set.seed(100)
purchase_train <- upSample(x = purchase_train %>% select(-Purchased),
y = purchase_train$Purchased,
yname = "Purchased")
prop.table(table(purchase_train$Purchased))
##
## No Yes
## 0.5 0.5
library(e1071)
naive_model <- naiveBayes(x = purchase_train %>% select(-Purchased), # kolom-kolom prediktor
y = purchase_train$Purchased) # kolom target variable
naive_model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = purchase_train %>% select(-Purchased),
## y = purchase_train$Purchased)
##
## A-priori probabilities:
## purchase_train$Purchased
## No Yes
## 0.5 0.5
##
## Conditional probabilities:
## Gender
## purchase_train$Purchased Female Male
## No 0.5170732 0.4829268
## Yes 0.5756098 0.4243902
##
## Age
## purchase_train$Purchased < 30 > 50 30-50
## No 0.346341463 0.009756098 0.643902439
## Yes 0.034146341 0.336585366 0.629268293
##
## Salary
## purchase_train$Purchased High Low Medium
## No 0.06829268 0.22439024 0.70731707
## Yes 0.48292683 0.29756098 0.21951220
#confusion matrix data train
prediction_naive_train <- predict(object = naive_model, # nama model
newdata = purchase_train,
type = "class") # probabilitas
library(caret)
eval_naive_train <- confusionMatrix(prediction_naive_train, reference = purchase_train$Purchased, positive = "Yes")
#confusion matrix data test
prediction_naive_test <- predict(object = naive_model, # nama model
newdata = purchase_test,
type = "class") # probabilitas
library(caret)
eval_naive_test <- confusionMatrix(prediction_naive_test, reference = purchase_test$Purchased, positive = "Yes")
eval_naive_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 46 1
## Yes 6 27
##
## Accuracy : 0.9125
## 95% CI : (0.828, 0.9641)
## No Information Rate : 0.65
## P-Value [Acc > NIR] : 5.423e-08
##
## Kappa : 0.8153
##
## Mcnemar's Test P-Value : 0.1306
##
## Sensitivity : 0.9643
## Specificity : 0.8846
## Pos Pred Value : 0.8182
## Neg Pred Value : 0.9787
## Prevalence : 0.3500
## Detection Rate : 0.3375
## Detection Prevalence : 0.4125
## Balanced Accuracy : 0.9245
##
## 'Positive' Class : Yes
##
# ambil hasil prediksi dalam bentuk probability
purchase_pred_prob <- predict(object = naive_model,
newdata = purchase_test,
type = "raw")
head(purchase_pred_prob)
## No Yes
## [1,] 0.8969466 0.10305340
## [2,] 0.9670607 0.03293928
## [3,] 0.9738155 0.02618454
## [4,] 0.8969466 0.10305340
## [5,] 0.9738155 0.02618454
## [6,] 0.4675379 0.53246208
# menyiapkan pred vs actual
data_roc_naive <- data.frame(pred_prob = purchase_pred_prob[,"Yes"],
actual = ifelse(purchase_test$Purchased == "Yes", 1, 0))
head(data_roc_naive)
Membuat ROC dengan menyiapkan objek prediction()
library(ROCR)
#object prediction
naive_roc <- prediction(predictions = data_roc_naive$pred_prob,
labels = data_roc_naive$actual)
# nilai AUC
naive_auc <- performance(naive_roc, measure = "auc")
naive_auc@y.values[[1]]
## [1] 0.9330357
plot(performance(naive_roc, "tpr", "fpr"))
abline(0, 1, lty = 2)
text(0.4, 0.6, paste("AUC = ", round(naive_auc@y.values[[1]], 2)))
library(partykit)
model_dt <- ctree(formula = purchase_train$Purchased ~.,
data = purchase_train %>% select(-Purchased),
control = ctree_control(mincriterion=0.95))
plot(model_dt, type = "simple")
# prediction to data train
pred_train_dt <- predict(model_dt, newdata = purchase_train)
confusionMatrix(pred_train_dt, reference = purchase_train$Purchased, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 167 28
## Yes 38 177
##
## Accuracy : 0.839
## 95% CI : (0.7998, 0.8733)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.678
##
## Mcnemar's Test P-Value : 0.2679
##
## Sensitivity : 0.8634
## Specificity : 0.8146
## Pos Pred Value : 0.8233
## Neg Pred Value : 0.8564
## Prevalence : 0.5000
## Detection Rate : 0.4317
## Detection Prevalence : 0.5244
## Balanced Accuracy : 0.8390
##
## 'Positive' Class : Yes
##
# prediction to data test
pred_test_dt <- predict(model_dt, newdata = purchase_test)
eval_dt_test <- confusionMatrix(pred_test_dt, reference = purchase_test$Purchased, positive = "Yes")
eval_dt_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 44 1
## Yes 8 27
##
## Accuracy : 0.8875
## 95% CI : (0.7972, 0.9472)
## No Information Rate : 0.65
## P-Value [Acc > NIR] : 1.228e-06
##
## Kappa : 0.7662
##
## Mcnemar's Test P-Value : 0.0455
##
## Sensitivity : 0.9643
## Specificity : 0.8462
## Pos Pred Value : 0.7714
## Neg Pred Value : 0.9778
## Prevalence : 0.3500
## Detection Rate : 0.3375
## Detection Prevalence : 0.4375
## Balanced Accuracy : 0.9052
##
## 'Positive' Class : Yes
##
# ambil hasil prediksi dalam bentuk probability
purchase_pred_prob <- predict(object = model_dt,
newdata = purchase_test,
type = "prob")
head(purchase_pred_prob)
## No Yes
## 1 1.0000000 0.0000000
## 3 1.0000000 0.0000000
## 6 1.0000000 0.0000000
## 9 1.0000000 0.0000000
## 16 1.0000000 0.0000000
## 17 0.3898305 0.6101695
# menyiapkan pred vs actual
data_roc_dt <- data.frame(pred_prob = purchase_pred_prob[,"Yes"],
actual = ifelse(purchase_test$Purchased == "Yes", 1, 0))
head(data_roc_dt)
Membuat ROC dengan menyiapkan objek prediction()
library(ROCR)
#object prediction
dt_roc <- prediction(predictions = data_roc_dt$pred_prob,
labels = data_roc_dt$actual)
# nilai AUC
dt_auc <- performance(dt_roc, measure = "auc")
dt_auc@y.values[[1]]
## [1] 0.9350962
plot(performance(dt_roc, "tpr", "fpr"))
abline(0, 1, lty = 2)
text(0.4, 0.6, paste("AUC = ", round(dt_auc@y.values[[1]], 2)))
set.seed(417)
ctrl <- trainControl(method = "repeatedcv",
number = 5, # k-fold
repeats = 3) # repetisi
purchase_forest <- train(Purchased ~ .,
data = purchase_train,
method = "rf", # random forest
trControl = ctrl)
#confusion matrix data train
pred_train_rf <- predict(purchase_forest, newdata = purchase_train)
confusionMatrix(pred_train_rf, reference = purchase_train$Purchased, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 167 28
## Yes 38 177
##
## Accuracy : 0.839
## 95% CI : (0.7998, 0.8733)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.678
##
## Mcnemar's Test P-Value : 0.2679
##
## Sensitivity : 0.8634
## Specificity : 0.8146
## Pos Pred Value : 0.8233
## Neg Pred Value : 0.8564
## Prevalence : 0.5000
## Detection Rate : 0.4317
## Detection Prevalence : 0.5244
## Balanced Accuracy : 0.8390
##
## 'Positive' Class : Yes
##
#confusion matrix data test
pred_test_rf <- predict(purchase_forest, newdata = purchase_test)
eval_rf_test <- confusionMatrix(pred_test_rf, reference = purchase_test$Purchased, positive = "Yes")
eval_rf_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 44 1
## Yes 8 27
##
## Accuracy : 0.8875
## 95% CI : (0.7972, 0.9472)
## No Information Rate : 0.65
## P-Value [Acc > NIR] : 1.228e-06
##
## Kappa : 0.7662
##
## Mcnemar's Test P-Value : 0.0455
##
## Sensitivity : 0.9643
## Specificity : 0.8462
## Pos Pred Value : 0.7714
## Neg Pred Value : 0.9778
## Prevalence : 0.3500
## Detection Rate : 0.3375
## Detection Prevalence : 0.4375
## Balanced Accuracy : 0.9052
##
## 'Positive' Class : Yes
##
# ambil hasil prediksi dalam bentuk probability
purchase_pred_prob <- predict(object = purchase_forest,
newdata = purchase_test,
type = "prob")
head(purchase_pred_prob)
# menyiapkan pred vs actual
data_roc_rf <- data.frame(pred_prob = purchase_pred_prob[,"Yes"],
actual = ifelse(purchase_test$Purchased == "Yes", 1, 0))
head(data_roc_rf)
Membuat ROC dengan menyiapkan objek prediction()
library(ROCR)
#object prediction
rf_roc <- prediction(predictions = data_roc_rf$pred_prob,
labels = data_roc_rf$actual)
# nilai AUC
rf_auc <- performance(rf_roc, measure = "auc")
rf_auc@y.values[[1]]
## [1] 0.9138049
plot(performance(rf_roc, "tpr", "fpr"))
abline(0, 1, lty = 2)
text(0.4, 0.6, paste("AUC = ", round(rf_auc@y.values[[1]], 2)))
eval_naive_test <- data_frame(Accuracy = eval_naive_test$overall[1],
Recall = eval_naive_test$byClass[1],
Specificity = eval_naive_test$byClass[2],
Precision = eval_naive_test$byClass[3],
AUC=naive_auc@y.values[[1]])
eval_dt_test <- data_frame(Accuracy = eval_dt_test$overall[1],
Recall = eval_dt_test$byClass[1],
Specificity = eval_dt_test$byClass[2],
Precision = eval_dt_test$byClass[3],
AUC=dt_auc@y.values[[1]])
eval_rf_test <- data_frame(Accuracy = eval_rf_test$overall[1],
Recall = eval_rf_test$byClass[1],
Specificity = eval_rf_test$byClass[2],
Precision = eval_rf_test$byClass[3],
AUC=rf_auc@y.values[[1]])
b <- rbind("Naive Bayes" = eval_naive_test, "Decision Tree" = eval_dt_test, "Random Forest" = eval_rf_test)
cbind(b)
Based on confusion matrix of data test, we can conclude that in this case Naive Bayes is the best model to be used for prediction, with highest accuracy, recall, specificity, precision, and AUC than other model. But overall, all model already gave a good prediction results and can be used for prediction.