# Class counts of the response `y` at this point in the script.
table(data[["y"]])
## 
##     0     1 
## 36548  4640

# Encoding ----

library(tidyverse)

# Turn every character column into a factor, then drop the `duration` column.
data <- data %>%
  mutate(across(where(is.character), as.factor)) %>%
  select(-duration)

# Duplicates ----

# Keep only the first occurrence of each row. Because `duration` has already
# been removed, rows that differed only in `duration` are dropped here too.
data <- data[!duplicated(data), ]

# Train/test split ----

library(caret)

# Fixed seed so the random partition below can be reproduced.
set.seed(123)

# Row indices of an 80% training sample, returned as a matrix (list = FALSE).
# NOTE(review): createDataPartition() stratifies by class only when `y` is a
# factor; a numeric `y` is binned by quantiles instead. `y` looks numeric at
# this point -- confirm, and consider passing factor(data$y).
trainIndex <- createDataPartition(y = data$y, p = 0.8, list = FALSE)

train <- data[trainIndex, ]
test  <- data[-trainIndex, ]

# Check the class counts in each split.
table(train$y)
## 
##     0     1 
## 27848  3676
table(test$y)
## 
##    0    1 
## 6958  922

# Logistic regression ----

# Regress `y` on every other column of the training split.
model_log <- glm(y ~ ., data = train, family = binomial)

# Prediction + evaluation ----

# Predicted probabilities for the held-out rows.
pred_log_prob <- predict(model_log, newdata = test, type = "response")

# Label a row "1" when its predicted probability exceeds 0.5, otherwise "0".
pred_log <- ifelse(pred_log_prob > 0.5, "1", "0")

# confusionMatrix() needs both inputs as factors with the same levels.
pred_log <- factor(pred_log, levels = c("0", "1"))
test$y   <- factor(test$y, levels = c("0", "1"))

# caret treats the first factor level ("0", the majority class) as the
# positive class unless told otherwise. Setting positive = "1" makes
# sensitivity and precision describe the minority class instead.
conf_log <- confusionMatrix(pred_log, test$y, positive = "1")
conf_log
# Recorded output; the class-wise statistics are derived from the recorded
# table with "1" as the positive class.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6846  688
##          1  112  234
##                                           
##                Accuracy : 0.8985          
##                  95% CI : (0.8916, 0.9051)
##     No Information Rate : 0.883           
##     P-Value [Acc > NIR] : 7.13e-06        
##                                           
##                   Kappa : 0.3261          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.2538          
##             Specificity : 0.9839          
##          Pos Pred Value : 0.6763          
##          Neg Pred Value : 0.9087          
##              Prevalence : 0.1170          
##          Detection Rate : 0.0297          
##    Detection Prevalence : 0.0439          
##       Balanced Accuracy : 0.6188          
##                                           
##        'Positive' Class : 1               
## 
# Random forest ----

# randomForest() fits a classification forest only when the response is a
# factor. The explicit levels match those used for the logistic predictions.
train$y <- factor(train$y, levels = c("0", "1"))
test$y  <- factor(test$y, levels = c("0", "1"))

# Namespace-qualified because no library(randomForest) call is visible in
# this part of the script.
model_rf <- randomForest::randomForest(y ~ ., data = train)

# Predicted class labels for the held-out rows.
pred_rf <- predict(model_rf, newdata = test)

# positive = "1" reports sensitivity and precision for the minority class
# instead of caret's default (the first level, "0").
conf_rf <- confusionMatrix(pred_rf, test$y, positive = "1")
conf_rf
# Recorded output; the class-wise statistics are derived from the recorded
# table with "1" as the positive class.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6775  647
##          1  183  275
##                                           
##                Accuracy : 0.8947          
##                  95% CI : (0.8877, 0.9014)
##     No Information Rate : 0.883           
##     P-Value [Acc > NIR] : 0.0005762       
##                                           
##                   Kappa : 0.3479          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.2983          
##             Specificity : 0.9737          
##          Pos Pred Value : 0.6004          
##          Neg Pred Value : 0.9128          
##              Prevalence : 0.1170          
##          Detection Rate : 0.0349          
##    Detection Prevalence : 0.0581          
##       Balanced Accuracy : 0.6360          
##                                           
##        'Positive' Class : 1               
## 
# Overall accuracy of each model, taken from the `overall` element of its
# confusion-matrix object.
conf_log[["overall"]]["Accuracy"]
##  Accuracy 
## 0.8984772
conf_rf[["overall"]]["Accuracy"]
##  Accuracy 
## 0.8946701