1. LIBRARY

Berikut adalah package yang digunakan dalam analisis ini. Package caret digunakan untuk proses pemodelan dan evaluasi model, readxl untuk membaca data, dplyr untuk manipulasi data, ggplot2 untuk visualisasi data, rpart dan rpart.plot untuk algoritma dan visualisasi Decision Tree, naivebayes sebagai package utama algoritma Naive Bayes, smotefamily untuk penanganan data tidak seimbang menggunakan SMOTE, serta reshape2 untuk transformasi struktur data.

library(readxl)

## Warning: package 'readxl' was built under R version 4.5.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.5.3

library(caret)

## Warning: package 'caret' was built under R version 4.5.3

## Loading required package: lattice

library(rpart)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.5.3

library(naivebayes)

## Warning: package 'naivebayes' was built under R version 4.5.3

## naivebayes 1.0.0 loaded

## For more information please visit:

## https://majkamichal.github.io/naivebayes/

library(smotefamily)

## Warning: package 'smotefamily' was built under R version 4.5.3

library(reshape2)

2. IMPORT DATA

Dataset yang digunakan dalam penelitian ini bersumber dari Indonesia Demographic and Health Survey (IDHS) tahun 2017. Dataset tersebut terdiri atas 10.009 data responden dengan 10 variabel, yaitu 9 variabel independen dan 1 variabel dependen berupa status merokok yang diklasifikasikan menjadi tidak merokok (0) dan merokok (1).

# Read the smoking dataset from an Excel workbook into a tibble and preview
# the first rows. The path is relative to the user's home directory, so the
# workbook must exist at this location for the script to run.
datarokok <- read_excel("~/Lidia Semester 6/Data Mininng/datarokok.xlsx")
head(datarokok)

## # A tibble: 6 × 10
##   `Frequency currently smokes tobacco (Y)` `Current age` Highest educational l…¹
##                                      <dbl>         <dbl>                   <dbl>
## 1                                        1            33                       1
## 2                                        1            45                       1
## 3                                        0            39                       3
## 4                                        1            44                       2
## 5                                        1            42                       2
## 6                                        1            28                       3
## # ℹ abbreviated name: ¹`Highest educational level`
## # ℹ 7 more variables: `Currently working` <dbl>, Occupation <dbl>,
## #   `Current marital status` <dbl>, `Type of place of residence` <dbl>,
## #   `Wealth index combined` <dbl>, `Covered by health insurance` <dbl>,
## #   `Frequency of watching television` <dbl>

cat("Dimensi data:", dim(datarokok), "\n")

## Dimensi data: 10009 10

# Replace the long survey column labels with short identifiers. The names are
# assigned positionally, so the order must match the column order of the
# workbook (target first, then the nine predictors).
datarokok <- setNames(
  datarokok,
  c(
    "Y",
    "usia",
    "pendidikan",
    "bekerja",
    "pekerjaan",
    "status_pernikahan",
    "tempat_tinggal",
    "kekayaan",
    "asuransi",
    "tv"
  )
)

3. PREPROCESSING DATA

3.1 Missing Value

Dilakukan pemeriksaan dan penanganan missing value dengan mengubah kode khusus (9, 98, dan 99) menjadi nilai NA. Selanjutnya, data yang mengandung missing value dihapus menggunakan metode listwise deletion (na.omit)

cat("Missing value sebelum preprocessing:\n")

## Missing value sebelum preprocessing:

print(colSums(is.na(datarokok)))

##                 Y              usia        pendidikan           bekerja 
##                 0                 0               204                 0 
##         pekerjaan status_pernikahan    tempat_tinggal          kekayaan 
##                 0                 0                 0                 0 
##          asuransi                tv 
##                 0                 0

# Recode the survey's special "missing" codes to NA so that na.omit() can drop
# the affected rows. `%in%` is used instead of `==` because it never returns
# NA, which keeps the logical masks safe even if a column already holds NA.
datarokok$Y[datarokok$Y %in% 9]                         <- NA
datarokok$bekerja[datarokok$bekerja %in% 9]             <- NA
datarokok$pekerjaan[datarokok$pekerjaan %in% c(98, 99)] <- NA
# NOTE(review): asuransi only takes the values 0, 1 and 9 in this data, so 9
# is presumably the same "missing" code used by the other columns; confirm
# against the survey codebook. Without this line the two rows coded 9 survive
# preprocessing and become a spurious third factor level.
datarokok$asuransi[datarokok$asuransi %in% 9]           <- NA
datarokok$tv[datarokok$tv %in% 9]                       <- NA

cat("\nMissing value setelah penggantian kode (9/98/99 -> NA):\n")

## 
## Missing value setelah penggantian kode (9/98/99 -> NA):

print(colSums(is.na(datarokok)))

##                 Y              usia        pendidikan           bekerja 
##                 3                 0               204                 2 
##         pekerjaan status_pernikahan    tempat_tinggal          kekayaan 
##                18                 0                 0                 0 
##          asuransi                tv 
##                 0                16

cat("Total missing value:", sum(is.na(datarokok)), "\n")

## Total missing value: 243

cat("Persentase missing per kolom (%):\n")

## Persentase missing per kolom (%):

print(round(colSums(is.na(datarokok)) / nrow(datarokok) * 100, 2))

##                 Y              usia        pendidikan           bekerja 
##              0.03              0.00              2.04              0.02 
##         pekerjaan status_pernikahan    tempat_tinggal          kekayaan 
##              0.18              0.00              0.00              0.00 
##          asuransi                tv 
##              0.00              0.16

# Listwise deletion: record the row count, drop every row that contains at
# least one NA, then record the row count again for the report below.
# na.omit() also attaches an "na.action" attribute listing the removed rows.
n_sebelum <- nrow(datarokok)
datarokok  <- na.omit(datarokok)
n_sesudah  <- nrow(datarokok)

cat("\nJumlah baris sebelum hapus missing:", n_sebelum, "\n")

## 
## Jumlah baris sebelum hapus missing: 10009

cat("Jumlah baris setelah hapus missing :", n_sesudah, "\n")

## Jumlah baris setelah hapus missing : 9766

cat("Baris yang dibuang                 :", n_sebelum - n_sesudah, "\n")

## Baris yang dibuang                 : 243

cat("Persentase data terbuang           :",
    round((n_sebelum - n_sesudah) / n_sebelum * 100, 2), "%\n")

## Persentase data terbuang           : 2.43 %

cat("\nMissing value setelah na.omit:\n")

## 
## Missing value setelah na.omit:

print(colSums(is.na(datarokok)))

##                 Y              usia        pendidikan           bekerja 
##                 0                 0                 0                 0 
##         pekerjaan status_pernikahan    tempat_tinggal          kekayaan 
##                 0                 0                 0                 0 
##          asuransi                tv 
##                 0                 0

3.2 Binarisasi variabel target

Variabel target dikategorikan menjadi tidak merokok (0) dan merokok (1).

# Collapse the target to two classes: code 0 stays 0 ("Tidak_Merokok"), every
# other non-missing code becomes 1 ("Merokok"). The result is stored as a
# factor whose first level is the non-smoker class.
datarokok$Y <- factor(
  as.integer(datarokok$Y != 0),
  levels = 0:1,
  labels = c("Tidak_Merokok", "Merokok")
)

3.3 Konversi Ke Faktor

Variabel kategorik dikonversi ke dalam bentuk faktor sebelum dilakukan pemodelan.

# Convert every categorical predictor to a factor; `usia` is left numeric and
# `Y` was already converted above.
kolom_faktor <- c(
  "pendidikan", "bekerja", "pekerjaan", "status_pernikahan",
  "tempat_tinggal", "kekayaan", "asuransi", "tv"
)
for (nama_kolom in kolom_faktor) {
  datarokok[[nama_kolom]] <- factor(datarokok[[nama_kolom]])
}
cat("\nStruktur data setelah preprocessing:\n")

## 
## Struktur data setelah preprocessing:

str(datarokok)

## tibble [9,766 × 10] (S3: tbl_df/tbl/data.frame)
##  $ Y                : Factor w/ 2 levels "Tidak_Merokok",..: 2 2 1 2 2 2 1 1 2 2 ...
##  $ usia             : num [1:9766] 33 45 39 44 42 28 45 45 35 37 ...
##  $ pendidikan       : Factor w/ 5 levels "1","2","3","4",..: 1 1 3 2 2 3 3 2 1 2 ...
##  $ bekerja          : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ pekerjaan        : Factor w/ 9 levels "0","1","2","3",..: 7 7 7 8 7 7 8 7 7 7 ...
##  $ status_pernikahan: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ tempat_tinggal   : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ kekayaan         : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 3 4 2 2 4 1 3 ...
##  $ asuransi         : Factor w/ 3 levels "0","1","9": 1 2 2 1 2 2 2 1 1 1 ...
##  $ tv               : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 2 2 3 ...
##  - attr(*, "na.action")= 'omit' Named int [1:243] 76 261 268 316 336 439 473 474 475 476 ...
##   ..- attr(*, "names")= chr [1:243] "76" "261" "268" "316" ...

4. Statistik Deskriptif

4.1 Dimensi Data

Dataset penelitian terdiri atas 9.766 observasi dan 10 variabel. Variabel yang digunakan meliputi 1 variabel numerik dan 9 variabel kategorik, yang selanjutnya digunakan dalam proses analisis klasifikasi status merokok.

# Summary table of the data set's dimensions: number of rows, number of
# columns, and how many columns are numeric versus factor.
statdesk_dimensi2 <- data.frame(
  Keterangan = c(
    "Jumlah Observasi",
    "Jumlah Variabel",
    "Jumlah Variabel Numerik",
    "Jumlah Variabel Kategorik"
  ),
  Nilai = c(
    nrow(datarokok),
    ncol(datarokok),
    sum(vapply(datarokok, is.numeric, logical(1))),
    sum(vapply(datarokok, is.factor, logical(1)))
  )
)
print(statdesk_dimensi2)

##                  Keterangan Nilai
## 1          Jumlah Observasi  9766
## 2           Jumlah Variabel    10
## 3   Jumlah Variabel Numerik     1
## 4 Jumlah Variabel Kategorik     9

4.2 Distribusi Variabel Target

Distribusi variabel target menunjukkan jumlah dan persentase responden pada kategori tidak merokok dan merokok. Informasi ini digunakan untuk mengetahui keseimbangan kelas dalam data sebelum proses pemodelan dilakukan.

# Class counts of the target plus each class's share of all rows, in percent
# rounded to two decimals.
distribusi_target <- dplyr::count(datarokok, Y)
distribusi_target$Persentase <- round(
  distribusi_target$n / sum(distribusi_target$n) * 100,
  2
)
print(distribusi_target)

## # A tibble: 2 × 3
##   Y                 n Persentase
##   <fct>         <int>      <dbl>
## 1 Tidak_Merokok  2778       28.4
## 2 Merokok        6988       71.6

# Bar chart of the target distribution; each bar is annotated with its count
# and percentage, and the legend is hidden because the x axis already names
# the classes.
ggplot(distribusi_target, aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  geom_text(
    aes(label = paste0(n, " (", Persentase, "%)")),
    vjust = -0.3,
    size = 5
  ) +
  scale_fill_manual(
    values = c("Tidak_Merokok" = "steelblue", "Merokok" = "tomato")
  ) +
  labs(
    title = "Distribusi Variabel Target: Status Merokok",
    x = "Status Merokok",
    y = "Frekuensi"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title  = element_text(hjust = 0.5, face = "bold", size = 14),
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.title  = element_text(size = 13, face = "bold")
  )

4.3 Statistik Deskriptif Variabel Kategorik

Statistik deskriptif variabel kategorik digunakan untuk memberikan gambaran umum mengenai karakteristik responden berdasarkan kategori pada setiap variabel penelitian.

# Frequency table for every categorical variable (target included), stacked
# into one long data frame with the columns Variabel, Kategori, Frekuensi and
# Persentase (share within the variable, in percent, two decimals).
var_kategorik <- c(
  "Y", "pendidikan", "bekerja", "pekerjaan", "status_pernikahan",
  "tempat_tinggal", "kekayaan", "asuransi", "tv"
)

statdesk_kategorik <- bind_rows(
  lapply(var_kategorik, function(var) {
    # One row per category of `var`, with its count.
    tbl <- as.data.frame(table(datarokok[[var]]))
    names(tbl) <- c("Kategori", "Frekuensi")
    tbl$Variabel <- rep(var, nrow(tbl))
    tbl$Persentase <- round(tbl$Frekuensi / sum(tbl$Frekuensi) * 100, 2)
    tbl[c("Variabel", "Kategori", "Frekuensi", "Persentase")]
  })
)
print(statdesk_kategorik)

##             Variabel      Kategori Frekuensi Persentase
## 1                  Y Tidak_Merokok      2778      28.45
## 2                  Y       Merokok      6988      71.55
## 3         pendidikan             1      3079      31.53
## 4         pendidikan             2      1899      19.45
## 5         pendidikan             3      3355      34.35
## 6         pendidikan             4       276       2.83
## 7         pendidikan             5      1157      11.85
## 8            bekerja             0       196       2.01
## 9            bekerja             1      9570      97.99
## 10         pekerjaan             0        56       0.57
## 11         pekerjaan             1       758       7.76
## 12         pekerjaan             2       199       2.04
## 13         pekerjaan             3       594       6.08
## 14         pekerjaan             4      1345      13.77
## 15         pekerjaan             5      1683      17.23
## 16         pekerjaan             6      2747      28.13
## 17         pekerjaan             7      2326      23.82
## 18         pekerjaan            96        58       0.59
## 19 status_pernikahan             1      9703      99.35
## 20 status_pernikahan             2        63       0.65
## 21    tempat_tinggal             1      4987      51.06
## 22    tempat_tinggal             2      4779      48.94
## 23          kekayaan             1      2128      21.79
## 24          kekayaan             2      1940      19.86
## 25          kekayaan             3      1935      19.81
## 26          kekayaan             4      1866      19.11
## 27          kekayaan             5      1897      19.42
## 28          asuransi             0      3858      39.50
## 29          asuransi             1      5906      60.48
## 30          asuransi             9         2       0.02
## 31                tv             0       314       3.22
## 32                tv             1      1206      12.35
## 33                tv             2      8246      84.44

5. Fungsi Evaluasi Model

Evaluasi model dilakukan untuk mengetahui tingkat performa model klasifikasi yang dibangun. Penilaian kinerja model dilakukan berdasarkan nilai metrik evaluasi yang diperoleh dari confusion matrix.

# Evaluate one set of predictions against the true labels.
#
# Prints a header (split, condition, method, data set) followed by the caret
# confusion matrix, using "Merokok" as the positive class, and returns the
# headline metrics as a one-row data frame.
#
# Arguments:
#   actual     Factor of true class labels.
#   pred       Factor of predicted class labels; passed to confusionMatrix()
#              as the prediction argument.
#   nama_model Label of the model, stored in the `Model` column.
#   split      Label of the train/test split, stored in `Split`.
#   kondisi    Label of the data condition, stored in `Kondisi`.
#   set_data   Label of the evaluated set, stored in `Set_Data`.
#
# Returns:
#   A one-row data frame with the columns Split, Kondisi, Model, Set_Data,
#   Accuracy, Precision, Recall, F1_Score, Specificity and Kappa.
evaluasi_model <- function(actual, pred, nama_model, split, kondisi, set_data) {
  cat("Split    :", split,      "\n")
  cat("Kondisi  :", kondisi,    "\n")
  cat("Metode   :", nama_model, "\n")
  cat("Set Data :", set_data,   "\n")

  cm <- confusionMatrix(pred, actual, positive = "Merokok")
  print(cm)

  # Per-class metrics are NA when their denominator is zero (for example,
  # precision when the positive class is never predicted); report those as 0.
  # The value is extracted unnamed so that the metric name does not end up as
  # the row name of the result.
  metrik_kelas <- function(nama) {
    nilai <- as.numeric(cm$byClass[[nama]])
    if (is.na(nilai)) 0 else nilai
  }

  data.frame(
    Split       = split,
    Kondisi     = kondisi,
    Model       = nama_model,
    Set_Data    = set_data,
    Accuracy    = as.numeric(cm$overall["Accuracy"]),
    Precision   = metrik_kelas("Precision"),
    Recall      = metrik_kelas("Recall"),
    F1_Score    = metrik_kelas("F1"),
    Specificity = metrik_kelas("Specificity"),
    Kappa       = as.numeric(cm$overall["Kappa"])
  )
}

6. Fungsi SMOTE

SMOTE (Synthetic Minority Over-sampling Technique) digunakan untuk mengatasi masalah ketidakseimbangan kelas (class imbalance) pada data.

# Balance a training set with SMOTE.
#
# Every predictor is converted to numeric through as.factor(), so values are
# replaced by their factor level index (this also applies to columns that
# were already numeric). The target is encoded as 0/1 from its factor codes,
# SMOTE() is run with K = 5 and dup_size = 0, and the result is returned with
# the target restored as a factor with the labels "Tidak_Merokok" and
# "Merokok". Class counts before and after are printed along the way.
#
# Arguments:
#   train_df  Data frame with a two-level factor column `Y` plus predictors.
#
# Returns:
#   The data produced by SMOTE(): numeric predictors plus a factor column `Y`
#   (the `class` column created by SMOTE() is removed).
smote_data <- function(train_df) {
  kolom_fitur <- setdiff(names(train_df), "Y")

  # Numeric copy of the training data: level indices for the predictors and
  # 0/1 for the target (factor codes 1/2 shifted down by one).
  train_num <- train_df
  for (nama in kolom_fitur) {
    train_num[[nama]] <- as.numeric(as.factor(train_num[[nama]]))
  }
  train_num$Y <- as.numeric(train_num$Y) - 1

  jumlah_tidak <- sum(train_df$Y == "Tidak_Merokok")
  jumlah_rokok <- sum(train_df$Y == "Merokok")

  cat("\n--- Info SMOTE ---\n")
  cat("Sebelum SMOTE - Tidak_Merokok:", jumlah_tidak,
      "| Merokok:", jumlah_rokok, "\n")

  smote_res <- SMOTE(
    X        = train_num[, kolom_fitur],
    target   = train_num$Y,
    K        = 5,
    dup_size = 0
  )
  tb <- smote_res$data

  cat("\nStruktur hasil SMOTE:\n")
  str(tb$class)

  # SMOTE() returns the target in a column named `class`; map it back to the
  # labelled factor and drop the raw column.
  tb$Y <- factor(
    as.numeric(tb$class),
    levels = c(0, 1),
    labels = c("Tidak_Merokok", "Merokok")
  )
  tb$class <- NULL

  cat("Sesudah SMOTE - Tidak_Merokok:", sum(tb$Y == "Tidak_Merokok"),
      "| Merokok:", sum(tb$Y == "Merokok"), "\n")

  tb
}

7. FUNGSI KONVERSI NUMERIK

Fungsi konversi numerik digunakan untuk mengubah seluruh variabel prediktor yang bertipe kategorik menjadi numerik, sedangkan variabel target tetap dipertahankan. Tahap ini dilakukan untuk memastikan data dapat diproses oleh metode SMOTE yang memerlukan input berupa data numerik.

# Convert every predictor column of `df` to numeric, leaving `Y` untouched.
#
# Each column is passed through as.factor() first, so the numeric value is
# the factor level index. This also applies to columns that are already
# numeric: their values are replaced by the rank of each distinct value.
#
# Arguments:
#   df  Data frame; a column named `Y`, if present, is kept as is.
#
# Returns:
#   A copy of `df` with all columns other than `Y` converted to numeric.
konversi_numerik <- function(df) {
  ke_kode_level <- function(kolom) {
    as.numeric(as.factor(kolom))
  }

  hasil <- df
  for (nama in setdiff(names(hasil), "Y")) {
    hasil[[nama]] <- ke_kode_level(hasil[[nama]])
  }
  hasil
}

8. Split Data (90:10 | 80:20 | 70:30)

Data dibagi menjadi data pelatihan dan data pengujian dengan rasio 90:10, 80:20, dan 70:30. Pembagian ini bertujuan untuk membangun model serta mengukur kemampuan model dalam melakukan prediksi pada data baru.

# Stratified train/test partitions at three ratios. The seed is set once, so
# the three createDataPartition() calls draw from the same random stream in
# this order: 90:10, then 80:20, then 70:30.
set.seed(123)

# 90:10 split
index_90 <- createDataPartition(datarokok$Y, p = 0.90, list = FALSE)
train_90 <- datarokok[index_90, ]
test_10  <- datarokok[-index_90, ]

# 80:20 split
index_80 <- createDataPartition(datarokok$Y, p = 0.80, list = FALSE)
train_80 <- datarokok[index_80, ]
test_20  <- datarokok[-index_80, ]

# 70:30 split
index_70 <- createDataPartition(datarokok$Y, p = 0.70, list = FALSE)
train_70 <- datarokok[index_70, ]
test_30  <- datarokok[-index_70, ]

cat("\n--- Distribusi Target per Split ---\n")

## 
## --- Distribusi Target per Split ---

cat("Train 90:10\n"); print(summary(train_90$Y))

## Train 90:10

## Tidak_Merokok       Merokok 
##          2501          6290

cat("Train 80:20\n"); print(summary(train_80$Y))

## Train 80:20

## Tidak_Merokok       Merokok 
##          2223          5591

cat("Train 70:30\n"); print(summary(train_70$Y))

## Train 70:30

## Tidak_Merokok       Merokok 
##          1945          4892

9. Fungsi Pemodelan Klasifikasi KNN, Decision Tree, dan Naive Bayes untuk Data Asli dan Data Hasil SMOTE

Fungsi ini digunakan untuk menjalankan proses pemodelan klasifikasi pada data asli dan data hasil SMOTE menggunakan metode Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Selanjutnya, kinerja masing-masing model dievaluasi berdasarkan hasil klasifikasi pada data pelatihan dan data pengujian.

# Fit KNN, Decision Tree and Naive Bayes on one train/test pair and evaluate
# each model on both sets.
#
# For every model the function prints a banner and a small specification
# table, fits the model after resetting the seed to 123, and then calls
# evaluasi_model() on the training predictions followed by the testing
# predictions.
#
# Arguments:
#   train_data  Training data with target column `Y`.
#   test_data   Testing data with target column `Y`.
#   split_name  Label of the split, forwarded to evaluasi_model().
#   kondisi     Label of the data condition, forwarded to evaluasi_model().
#
# Returns:
#   A list with `hasil` (six metric rows: training and testing for each of
#   the three models, in the order KNN, Decision Tree, Naive Bayes) and the
#   fitted models `knn`, `dt` and `nb`.
jalankan_model <- function(train_data, test_data, split_name, kondisi) {

  fitur_latih <- dplyr::select(train_data, -Y)
  label_latih <- train_data$Y
  fitur_uji   <- dplyr::select(test_data, -Y)
  label_uji   <- test_data$Y

  # Score one fitted model: training set first, then testing set, so the
  # printed reports appear in that order. `prediksi` maps a predictor table
  # to predicted classes; predictions are re-levelled to the levels of the
  # matching label vector before evaluation.
  nilai_model <- function(prediksi, nama_model) {
    pred_latih <- factor(prediksi(fitur_latih), levels = levels(label_latih))
    skor_latih <- evaluasi_model(label_latih, pred_latih,
                                 nama_model, split_name, kondisi, "Training")

    pred_uji <- factor(prediksi(fitur_uji), levels = levels(label_uji))
    skor_uji <- evaluasi_model(label_uji, pred_uji,
                               nama_model, split_name, kondisi, "Testing")

    bind_rows(skor_latih, skor_uji)
  }

  # A. KNN (k = 5), predictors centred and scaled by caret
  cat("\n>>> KNN <<<\n")

  spek_knn <- data.frame(
    Parameter = c("Metode", "k (default)", "Preprocessing", "Platform"),
    Nilai     = c("KNN", "5", "Center + Scale", "R - caret")
  )
  cat("Spesifikasi KNN:\n")
  print(spek_knn)

  set.seed(123)
  model_knn <- train(
    Y ~ .,
    data       = train_data,
    method     = "knn",
    trControl  = trainControl(method = "none"),
    tuneGrid   = data.frame(k = 5),
    preProcess = c("center", "scale")
  )
  hasil_knn <- nilai_model(function(x) predict(model_knn, x), "KNN")

  # B. Decision Tree (rpart, cp = 0.01, maxdepth = 10)
  cat("\n>>> Decision Tree <<<\n")

  spek_dt <- data.frame(
    Parameter = c("Metode", "Splitting Criterion", "cp (default)",
                  "maxdepth", "Platform"),
    Nilai     = c("rpart", "Gini Index", "0.01", "10", "R - rpart")
  )
  cat("Spesifikasi Decision Tree:\n")
  print(spek_dt)

  set.seed(123)
  model_dt <- rpart(
    Y ~ .,
    data    = train_data,
    method  = "class",
    control = rpart.control(cp = 0.01, maxdepth = 10)
  )
  hasil_dt <- nilai_model(
    function(x) predict(model_dt, x, type = "class"),
    "Decision Tree"
  )

  # C. Naive Bayes (laplace = 0, usekernel = FALSE, adjust = 1)
  cat("\n>>> Naive Bayes <<<\n")

  spek_nb <- data.frame(
    Parameter = c("Metode", "laplace (default)", "usekernel (default)",
                  "adjust (default)", "Platform"),
    Nilai     = c("naive_bayes", "0", "FALSE", "1", "R - naivebayes")
  )
  cat("Spesifikasi Naive Bayes:\n")
  print(spek_nb)

  set.seed(123)
  model_nb <- train(
    Y ~ .,
    data      = train_data,
    method    = "naive_bayes",
    trControl = trainControl(method = "none"),
    tuneGrid  = data.frame(laplace   = 0,
                           usekernel = FALSE,
                           adjust    = 1)
  )
  hasil_nb <- nilai_model(function(x) predict(model_nb, x), "Naive Bayes")

  # Collect the metric rows and hand back the fitted models alongside them.
  list(
    hasil = bind_rows(data.frame(), hasil_knn, hasil_dt, hasil_nb),
    knn   = model_knn,
    dt    = model_dt,
    nb    = model_nb
  )
}

10. Hasil Model Sebelum SMOTE

Berdasarkan fungsi pemodelan yang telah dibangun sebelumnya, dilakukan proses klasifikasi pada data sebelum penerapan SMOTE menggunakan metode Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Hasil evaluasi masing-masing model disajikan pada bagian berikut.

res_awal_90 <- jalankan_model(train_90, test_10, "90:10", "Sebelum SMOTE")

## 
## >>> KNN <<<
## Spesifikasi KNN:
##       Parameter          Nilai
## 1        Metode            KNN
## 2   k (default)              5
## 3 Preprocessing Center + Scale
## 4      Platform      R - caret
## Split    : 90:10 
## Kondisi  : Sebelum SMOTE 
## Metode   : KNN 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           907     524
##   Merokok                1594    5766
##                                        
##                Accuracy : 0.7591       
##                  95% CI : (0.75, 0.768)
##     No Information Rate : 0.7155       
##     P-Value [Acc > NIR] : < 2.2e-16    
##                                        
##                   Kappa : 0.3207       
##                                        
##  Mcnemar's Test P-Value : < 2.2e-16    
##                                        
##             Sensitivity : 0.9167       
##             Specificity : 0.3627       
##          Pos Pred Value : 0.7834       
##          Neg Pred Value : 0.6338       
##              Prevalence : 0.7155       
##          Detection Rate : 0.6559       
##    Detection Prevalence : 0.8372       
##       Balanced Accuracy : 0.6397       
##                                        
##        'Positive' Class : Merokok      
##                                        
## Split    : 90:10 
## Kondisi  : Sebelum SMOTE 
## Metode   : KNN 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok            57      96
##   Merokok                 220     602
##                                           
##                Accuracy : 0.6759          
##                  95% CI : (0.6455, 0.7052)
##     No Information Rate : 0.7159          
##     P-Value [Acc > NIR] : 0.9972          
##                                           
##                   Kappa : 0.0789          
##                                           
##  Mcnemar's Test P-Value : 4.539e-12       
##                                           
##             Sensitivity : 0.8625          
##             Specificity : 0.2058          
##          Pos Pred Value : 0.7324          
##          Neg Pred Value : 0.3725          
##              Prevalence : 0.7159          
##          Detection Rate : 0.6174          
##    Detection Prevalence : 0.8431          
##       Balanced Accuracy : 0.5341          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
##             Parameter      Nilai
## 1              Metode      rpart
## 2 Splitting Criterion Gini Index
## 3        cp (default)       0.01
## 4            maxdepth         10
## 5            Platform  R - rpart
## Split    : 90:10 
## Kondisi  : Sebelum SMOTE 
## Metode   : Decision Tree 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok             0       0
##   Merokok                2501    6290
##                                           
##                Accuracy : 0.7155          
##                  95% CI : (0.7059, 0.7249)
##     No Information Rate : 0.7155          
##     P-Value [Acc > NIR] : 0.5054          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7155          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7155          
##          Detection Rate : 0.7155          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 90:10 
## Kondisi  : Sebelum SMOTE 
## Metode   : Decision Tree 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok             0       0
##   Merokok                 277     698
##                                          
##                Accuracy : 0.7159         
##                  95% CI : (0.6865, 0.744)
##     No Information Rate : 0.7159         
##     P-Value [Acc > NIR] : 0.5162         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.7159         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.7159         
##          Detection Rate : 0.7159         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : Merokok        
##                                          
## 
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
##             Parameter          Nilai
## 1              Metode    naive_bayes
## 2   laplace (default)              0
## 3 usekernel (default)          FALSE
## 4    adjust (default)              1
## 5            Platform R - naivebayes
## Split    : 90:10 
## Kondisi  : Sebelum SMOTE 
## Metode   : Naive Bayes 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          1878    3735
##   Merokok                 623    2555
##                                           
##                Accuracy : 0.5043          
##                  95% CI : (0.4938, 0.5148)
##     No Information Rate : 0.7155          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1143          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.4062          
##             Specificity : 0.7509          
##          Pos Pred Value : 0.8040          
##          Neg Pred Value : 0.3346          
##              Prevalence : 0.7155          
##          Detection Rate : 0.2906          
##    Detection Prevalence : 0.3615          
##       Balanced Accuracy : 0.5785          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 90:10 
## Kondisi  : Sebelum SMOTE 
## Metode   : Naive Bayes 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           194     425
##   Merokok                  83     273
##                                           
##                Accuracy : 0.479           
##                  95% CI : (0.4472, 0.5109)
##     No Information Rate : 0.7159          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0667          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.3911          
##             Specificity : 0.7004          
##          Pos Pred Value : 0.7669          
##          Neg Pred Value : 0.3134          
##              Prevalence : 0.7159          
##          Detection Rate : 0.2800          
##    Detection Prevalence : 0.3651          
##       Balanced Accuracy : 0.5457          
##                                           
##        'Positive' Class : Merokok         
##

res_awal_80 <- jalankan_model(train_80, test_20, "80:20", "Sebelum SMOTE")

## 
## >>> KNN <<<
## Spesifikasi KNN:
##       Parameter          Nilai
## 1        Metode            KNN
## 2   k (default)              5
## 3 Preprocessing Center + Scale
## 4      Platform      R - caret
## Split    : 80:20 
## Kondisi  : Sebelum SMOTE 
## Metode   : KNN 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           867     501
##   Merokok                1356    5090
##                                           
##                Accuracy : 0.7623          
##                  95% CI : (0.7528, 0.7718)
##     No Information Rate : 0.7155          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3398          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9104          
##             Specificity : 0.3900          
##          Pos Pred Value : 0.7896          
##          Neg Pred Value : 0.6338          
##              Prevalence : 0.7155          
##          Detection Rate : 0.6514          
##    Detection Prevalence : 0.8249          
##       Balanced Accuracy : 0.6502          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : Sebelum SMOTE 
## Metode   : KNN 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           134     239
##   Merokok                 421    1158
##                                           
##                Accuracy : 0.6619          
##                  95% CI : (0.6404, 0.6829)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0781          
##                                           
##  Mcnemar's Test P-Value : 1.849e-12       
##                                           
##             Sensitivity : 0.8289          
##             Specificity : 0.2414          
##          Pos Pred Value : 0.7334          
##          Neg Pred Value : 0.3592          
##              Prevalence : 0.7157          
##          Detection Rate : 0.5932          
##    Detection Prevalence : 0.8089          
##       Balanced Accuracy : 0.5352          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
##             Parameter      Nilai
## 1              Metode      rpart
## 2 Splitting Criterion Gini Index
## 3        cp (default)       0.01
## 4            maxdepth         10
## 5            Platform  R - rpart
## Split    : 80:20 
## Kondisi  : Sebelum SMOTE 
## Metode   : Decision Tree 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok             0       0
##   Merokok                2223    5591
##                                           
##                Accuracy : 0.7155          
##                  95% CI : (0.7054, 0.7255)
##     No Information Rate : 0.7155          
##     P-Value [Acc > NIR] : 0.5057          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7155          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7155          
##          Detection Rate : 0.7155          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : Sebelum SMOTE 
## Metode   : Decision Tree 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok             0       0
##   Merokok                 555    1397
##                                           
##                Accuracy : 0.7157          
##                  95% CI : (0.6951, 0.7356)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 0.5114          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7157          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7157          
##          Detection Rate : 0.7157          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
##             Parameter          Nilai
## 1              Metode    naive_bayes
## 2   laplace (default)              0
## 3 usekernel (default)          FALSE
## 4    adjust (default)              1
## 5            Platform R - naivebayes
## Split    : 80:20 
## Kondisi  : Sebelum SMOTE 
## Metode   : Naive Bayes 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          1557    2985
##   Merokok                 666    2606
##                                           
##                Accuracy : 0.5328          
##                  95% CI : (0.5216, 0.5439)
##     No Information Rate : 0.7155          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1267          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.4661          
##             Specificity : 0.7004          
##          Pos Pred Value : 0.7965          
##          Neg Pred Value : 0.3428          
##              Prevalence : 0.7155          
##          Detection Rate : 0.3335          
##    Detection Prevalence : 0.4187          
##       Balanced Accuracy : 0.5833          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : Sebelum SMOTE 
## Metode   : Naive Bayes 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           382     759
##   Merokok                 173     638
##                                           
##                Accuracy : 0.5225          
##                  95% CI : (0.5001, 0.5449)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.11            
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.4567          
##             Specificity : 0.6883          
##          Pos Pred Value : 0.7867          
##          Neg Pred Value : 0.3348          
##              Prevalence : 0.7157          
##          Detection Rate : 0.3268          
##    Detection Prevalence : 0.4155          
##       Balanced Accuracy : 0.5725          
##                                           
##        'Positive' Class : Merokok         
##

# Run the modelling helper on the 70:30 split with the original (pre-SMOTE)
# training data; the labels "70:30" and "Sebelum SMOTE" tag the printed results.
res_awal_70 <- jalankan_model(train_70, test_30, "70:30", "Sebelum SMOTE")

## 
## >>> KNN <<<
## Spesifikasi KNN:
##       Parameter          Nilai
## 1        Metode            KNN
## 2   k (default)              5
## 3 Preprocessing Center + Scale
## 4      Platform      R - caret
## Split    : 70:30 
## Kondisi  : Sebelum SMOTE 
## Metode   : KNN 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           733     409
##   Merokok                1212    4483
##                                           
##                Accuracy : 0.7629          
##                  95% CI : (0.7526, 0.7729)
##     No Information Rate : 0.7155          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3349          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9164          
##             Specificity : 0.3769          
##          Pos Pred Value : 0.7872          
##          Neg Pred Value : 0.6419          
##              Prevalence : 0.7155          
##          Detection Rate : 0.6557          
##    Detection Prevalence : 0.8330          
##       Balanced Accuracy : 0.6466          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 70:30 
## Kondisi  : Sebelum SMOTE 
## Metode   : KNN 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           209     305
##   Merokok                 624    1791
##                                           
##                Accuracy : 0.6828          
##                  95% CI : (0.6656, 0.6997)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1191          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8545          
##             Specificity : 0.2509          
##          Pos Pred Value : 0.7416          
##          Neg Pred Value : 0.4066          
##              Prevalence : 0.7156          
##          Detection Rate : 0.6115          
##    Detection Prevalence : 0.8245          
##       Balanced Accuracy : 0.5527          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
##             Parameter      Nilai
## 1              Metode      rpart
## 2 Splitting Criterion Gini Index
## 3        cp (default)       0.01
## 4            maxdepth         10
## 5            Platform  R - rpart
## Split    : 70:30 
## Kondisi  : Sebelum SMOTE 
## Metode   : Decision Tree 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok             0       0
##   Merokok                1945    4892
##                                           
##                Accuracy : 0.7155          
##                  95% CI : (0.7047, 0.7262)
##     No Information Rate : 0.7155          
##     P-Value [Acc > NIR] : 0.5061          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7155          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7155          
##          Detection Rate : 0.7155          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 70:30 
## Kondisi  : Sebelum SMOTE 
## Metode   : Decision Tree 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok             0       0
##   Merokok                 833    2096
##                                           
##                Accuracy : 0.7156          
##                  95% CI : (0.6989, 0.7319)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 0.5093          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.7156          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.7156          
##          Detection Rate : 0.7156          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
##             Parameter          Nilai
## 1              Metode    naive_bayes
## 2   laplace (default)              0
## 3 usekernel (default)          FALSE
## 4    adjust (default)              1
## 5            Platform R - naivebayes
## Split    : 70:30 
## Kondisi  : Sebelum SMOTE 
## Metode   : Naive Bayes 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          1567    3320
##   Merokok                 378    1572
##                                          
##                Accuracy : 0.4591         
##                  95% CI : (0.4473, 0.471)
##     No Information Rate : 0.7155         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.0872         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.3213         
##             Specificity : 0.8057         
##          Pos Pred Value : 0.8062         
##          Neg Pred Value : 0.3206         
##              Prevalence : 0.7155         
##          Detection Rate : 0.2299         
##    Detection Prevalence : 0.2852         
##       Balanced Accuracy : 0.5635         
##                                          
##        'Positive' Class : Merokok        
##                                          
## Split    : 70:30 
## Kondisi  : Sebelum SMOTE 
## Metode   : Naive Bayes 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           676    1409
##   Merokok                 157     687
##                                           
##                Accuracy : 0.4653          
##                  95% CI : (0.4472, 0.4836)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0959          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.3278          
##             Specificity : 0.8115          
##          Pos Pred Value : 0.8140          
##          Neg Pred Value : 0.3242          
##              Prevalence : 0.7156          
##          Detection Rate : 0.2346          
##    Detection Prevalence : 0.2882          
##       Balanced Accuracy : 0.5696          
##                                           
##        'Positive' Class : Merokok         
##

# Collect the `hasil` metric table of each pre-SMOTE run (90:10, 80:20, 70:30)
# and stack them, in that order, into one summary data frame.
hasil_awal <- bind_rows(
  lapply(
    list(res_awal_90, res_awal_80, res_awal_70),
    function(res) res$hasil
  )
)
print(hasil_awal)

##                Split       Kondisi         Model Set_Data  Accuracy Precision
## Precision...1  90:10 Sebelum SMOTE           KNN Training 0.7590718 0.7834239
## Precision...2  90:10 Sebelum SMOTE           KNN  Testing 0.6758974 0.7323601
## Precision...3  90:10 Sebelum SMOTE Decision Tree Training 0.7155045 0.7155045
## Precision...4  90:10 Sebelum SMOTE Decision Tree  Testing 0.7158974 0.7158974
## Precision...5  90:10 Sebelum SMOTE   Naive Bayes Training 0.5042657 0.8039648
## Precision...6  90:10 Sebelum SMOTE   Naive Bayes  Testing 0.4789744 0.7668539
## Precision...7  80:20 Sebelum SMOTE           KNN Training 0.7623496 0.7896370
## Precision...8  80:20 Sebelum SMOTE           KNN  Testing 0.6618852 0.7333756
## Precision...9  80:20 Sebelum SMOTE Decision Tree Training 0.7155106 0.7155106
## Precision...10 80:20 Sebelum SMOTE Decision Tree  Testing 0.7156762 0.7156762
## Precision...11 80:20 Sebelum SMOTE   Naive Bayes Training 0.5327617 0.7964548
## Precision...12 80:20 Sebelum SMOTE   Naive Bayes  Testing 0.5225410 0.7866831
## Precision...13 70:30 Sebelum SMOTE           KNN Training 0.7629077 0.7871817
## Precision...14 70:30 Sebelum SMOTE           KNN  Testing 0.6828269 0.7416149
## Precision...15 70:30 Sebelum SMOTE Decision Tree Training 0.7155185 0.7155185
## Precision...16 70:30 Sebelum SMOTE Decision Tree  Testing 0.7156026 0.7156026
## Precision...17 70:30 Sebelum SMOTE   Naive Bayes Training 0.4591195 0.8061538
## Precision...18 70:30 Sebelum SMOTE   Naive Bayes  Testing 0.4653465 0.8139810
##                   Recall  F1_Score Specificity      Kappa
## Precision...1  0.9166932 0.8448352   0.3626549 0.32066914
## Precision...2  0.8624642 0.7921053   0.2057762 0.07889072
## Precision...3  1.0000000 0.8341622   0.0000000 0.00000000
## Precision...4  1.0000000 0.8344292   0.0000000 0.00000000
## Precision...5  0.4062003 0.5397127   0.7508996 0.11427327
## Precision...6  0.3911175 0.5180266   0.7003610 0.06665863
## Precision...7  0.9103917 0.8457257   0.3900135 0.33976551
## Precision...8  0.8289191 0.7782258   0.2414414 0.07807641
## Precision...9  1.0000000 0.8341664   0.0000000 0.00000000
## Precision...10 1.0000000 0.8342789   0.0000000 0.00000000
## Precision...11 0.4661062 0.5880627   0.7004049 0.12670095
## Precision...12 0.4566929 0.5778986   0.6882883 0.10998482
## Precision...13 0.9163941 0.8468877   0.3768638 0.33490458
## Precision...14 0.8544847 0.7940590   0.2509004 0.11913025
## Precision...15 1.0000000 0.8341717   0.0000000 0.00000000
## Precision...16 1.0000000 0.8342289   0.0000000 0.00000000
## Precision...17 0.3213410 0.4595148   0.8056555 0.08724664
## Precision...18 0.3277672 0.4673469   0.8115246 0.09587575

11. Penerapan SMOTE

Berdasarkan fungsi SMOTE yang telah dibuat pada tahap sebelumnya, SMOTE diterapkan pada data training untuk menyeimbangkan data dengan menambahkan data sintetis pada kelas minoritas. Penerapan metode ini bertujuan untuk menghasilkan distribusi kelas yang lebih seimbang sehingga dapat meningkatkan kinerja model klasifikasi. Hasil penerapan SMOTE disajikan pada bagian berikut.

# Apply the smote_data() helper to the 90% training split; it reports the class
# counts before and after oversampling.
train_90_smote <- smote_data(train_90)

## 
## --- Info SMOTE ---
## Sebelum SMOTE - Tidak_Merokok: 2501 | Merokok: 6290 
## 
## Struktur hasil SMOTE:
##  chr [1:11292] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...
## Sesudah SMOTE - Tidak_Merokok: 5002 | Merokok: 6290

# Apply the smote_data() helper to the 80% training split; it reports the class
# counts before and after oversampling.
train_80_smote <- smote_data(train_80)

## 
## --- Info SMOTE ---
## Sebelum SMOTE - Tidak_Merokok: 2223 | Merokok: 5591 
## 
## Struktur hasil SMOTE:
##  chr [1:10037] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...
## Sesudah SMOTE - Tidak_Merokok: 4446 | Merokok: 5591

# Apply the smote_data() helper to the 70% training split; it reports the class
# counts before and after oversampling.
train_70_smote <- smote_data(train_70)

## 
## --- Info SMOTE ---
## Sebelum SMOTE - Tidak_Merokok: 1945 | Merokok: 4892 
## 
## Struktur hasil SMOTE:
##  chr [1:8782] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...
## Sesudah SMOTE - Tidak_Merokok: 3890 | Merokok: 4892

# Convert each test set to numeric so it matches the format of the SMOTE
# output, then put the original Y column back in place.
test_10_num <- konversi_numerik(test_10)
test_10_num$Y <- test_10$Y

test_20_num <- konversi_numerik(test_20)
test_20_num$Y <- test_20$Y

test_30_num <- konversi_numerik(test_30)
test_30_num$Y <- test_30$Y

# Compare the class distribution of Y before vs. after SMOTE: tabulate Y for
# every training split and stack the counts into one long table, with the list
# names stored in the "Keterangan" column.
distribusi_smote <- list(
  "90:10 Sebelum SMOTE" = train_90,
  "80:20 Sebelum SMOTE" = train_80,
  "70:30 Sebelum SMOTE" = train_70,
  "90:10 Sesudah SMOTE" = train_90_smote,
  "80:20 Sesudah SMOTE" = train_80_smote,
  "70:30 Sesudah SMOTE" = train_70_smote
) |>
  lapply(function(d) as.data.frame(table(d$Y))) |>
  bind_rows(.id = "Keterangan")

# Columns 2 and 3 hold the class label and its count from table().
names(distribusi_smote)[2:3] <- c("Status", "Frekuensi")
cat("\n--- Distribusi Data Sebelum dan Sesudah SMOTE ---\n")

## 
## --- Distribusi Data Sebelum dan Sesudah SMOTE ---

# Show the per-split class counts before and after SMOTE.
print(distribusi_smote)

##             Keterangan        Status Frekuensi
## 1  90:10 Sebelum SMOTE Tidak_Merokok      2501
## 2  90:10 Sebelum SMOTE       Merokok      6290
## 3  80:20 Sebelum SMOTE Tidak_Merokok      2223
## 4  80:20 Sebelum SMOTE       Merokok      5591
## 5  70:30 Sebelum SMOTE Tidak_Merokok      1945
## 6  70:30 Sebelum SMOTE       Merokok      4892
## 7  90:10 Sesudah SMOTE Tidak_Merokok      5002
## 8  90:10 Sesudah SMOTE       Merokok      6290
## 9  80:20 Sesudah SMOTE Tidak_Merokok      4446
## 10 80:20 Sesudah SMOTE       Merokok      5591
## 11 70:30 Sesudah SMOTE Tidak_Merokok      3890
## 12 70:30 Sesudah SMOTE       Merokok      4892

# Class counts of Y in the SMOTE-processed 90% training split.
table(train_90_smote[["Y"]])

## 
## Tidak_Merokok       Merokok 
##          5002          6290

# Class counts of Y in the SMOTE-processed 80% training split.
table(train_80_smote[["Y"]])

## 
## Tidak_Merokok       Merokok 
##          4446          5591

# Class counts of Y in the SMOTE-processed 70% training split.
table(train_70_smote[["Y"]])

## 
## Tidak_Merokok       Merokok 
##          3890          4892

12. Hasil Model Setelah Penerapan SMOTE

Berdasarkan data hasil penerapan SMOTE, proses pemodelan klasifikasi dilakukan menggunakan metode Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Hasil yang diperoleh kemudian dievaluasi untuk mengetahui kinerja model setelah penyeimbangan kelas dilakukan.

# Run the modelling helper on the 90:10 split, training on the SMOTE-processed
# data and evaluating on the numerically converted test set.
res_smote_90 <- jalankan_model(train_90_smote, test_10_num, "90:10", "Sesudah SMOTE")

## 
## >>> KNN <<<
## Spesifikasi KNN:
##       Parameter          Nilai
## 1        Metode            KNN
## 2   k (default)              5
## 3 Preprocessing Center + Scale
## 4      Platform      R - caret
## Split    : 90:10 
## Kondisi  : Sesudah SMOTE 
## Metode   : KNN 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          3386    1051
##   Merokok                1616    5239
##                                           
##                Accuracy : 0.7638          
##                  95% CI : (0.7559, 0.7716)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5158          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8329          
##             Specificity : 0.6769          
##          Pos Pred Value : 0.7643          
##          Neg Pred Value : 0.7631          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4640          
##    Detection Prevalence : 0.6071          
##       Balanced Accuracy : 0.7549          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 90:10 
## Kondisi  : Sesudah SMOTE 
## Metode   : KNN 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok            95     177
##   Merokok                 182     521
##                                           
##                Accuracy : 0.6318          
##                  95% CI : (0.6006, 0.6621)
##     No Information Rate : 0.7159          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.0899          
##                                           
##  Mcnemar's Test P-Value : 0.8328          
##                                           
##             Sensitivity : 0.7464          
##             Specificity : 0.3430          
##          Pos Pred Value : 0.7411          
##          Neg Pred Value : 0.3493          
##              Prevalence : 0.7159          
##          Detection Rate : 0.5344          
##    Detection Prevalence : 0.7210          
##       Balanced Accuracy : 0.5447          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
##             Parameter      Nilai
## 1              Metode      rpart
## 2 Splitting Criterion Gini Index
## 3        cp (default)       0.01
## 4            maxdepth         10
## 5            Platform  R - rpart
## Split    : 90:10 
## Kondisi  : Sesudah SMOTE 
## Metode   : Decision Tree 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2302    1197
##   Merokok                2700    5093
##                                          
##                Accuracy : 0.6549         
##                  95% CI : (0.646, 0.6637)
##     No Information Rate : 0.557          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.2785         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.8097         
##             Specificity : 0.4602         
##          Pos Pred Value : 0.6535         
##          Neg Pred Value : 0.6579         
##              Prevalence : 0.5570         
##          Detection Rate : 0.4510         
##    Detection Prevalence : 0.6901         
##       Balanced Accuracy : 0.6350         
##                                          
##        'Positive' Class : Merokok        
##                                          
## Split    : 90:10 
## Kondisi  : Sesudah SMOTE 
## Metode   : Decision Tree 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           100     136
##   Merokok                 177     562
##                                           
##                Accuracy : 0.679           
##                  95% CI : (0.6486, 0.7082)
##     No Information Rate : 0.7159          
##     P-Value [Acc > NIR] : 0.99484         
##                                           
##                   Kappa : 0.1739          
##                                           
##  Mcnemar's Test P-Value : 0.02376         
##                                           
##             Sensitivity : 0.8052          
##             Specificity : 0.3610          
##          Pos Pred Value : 0.7605          
##          Neg Pred Value : 0.4237          
##              Prevalence : 0.7159          
##          Detection Rate : 0.5764          
##    Detection Prevalence : 0.7579          
##       Balanced Accuracy : 0.5831          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
##             Parameter          Nilai
## 1              Metode    naive_bayes
## 2   laplace (default)              0
## 3 usekernel (default)          FALSE
## 4    adjust (default)              1
## 5            Platform R - naivebayes
## Split    : 90:10 
## Kondisi  : Sesudah SMOTE 
## Metode   : Naive Bayes 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          3445    3172
##   Merokok                1557    3118
##                                          
##                Accuracy : 0.5812         
##                  95% CI : (0.572, 0.5903)
##     No Information Rate : 0.557          
##     P-Value [Acc > NIR] : 1.154e-07      
##                                          
##                   Kappa : 0.1785         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.4957         
##             Specificity : 0.6887         
##          Pos Pred Value : 0.6670         
##          Neg Pred Value : 0.5206         
##              Prevalence : 0.5570         
##          Detection Rate : 0.2761         
##    Detection Prevalence : 0.4140         
##       Balanced Accuracy : 0.5922         
##                                          
##        'Positive' Class : Merokok        
##                                          
## Split    : 90:10 
## Kondisi  : Sesudah SMOTE 
## Metode   : Naive Bayes 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           188     338
##   Merokok                  89     360
##                                           
##                Accuracy : 0.5621          
##                  95% CI : (0.5302, 0.5935)
##     No Information Rate : 0.7159          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.153           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5158          
##             Specificity : 0.6787          
##          Pos Pred Value : 0.8018          
##          Neg Pred Value : 0.3574          
##              Prevalence : 0.7159          
##          Detection Rate : 0.3692          
##    Detection Prevalence : 0.4605          
##       Balanced Accuracy : 0.5972          
##                                           
##        'Positive' Class : Merokok         
##

# Run the modelling helper on the 80:20 split, training on the SMOTE-processed
# data and evaluating on the numerically converted test set.
res_smote_80 <- jalankan_model(train_80_smote, test_20_num, "80:20", "Sesudah SMOTE")

## 
## >>> KNN <<<
## Spesifikasi KNN:
##       Parameter          Nilai
## 1        Metode            KNN
## 2   k (default)              5
## 3 Preprocessing Center + Scale
## 4      Platform      R - caret
## Split    : 80:20 
## Kondisi  : Sesudah SMOTE 
## Metode   : KNN 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          3138     965
##   Merokok                1308    4626
##                                           
##                Accuracy : 0.7735          
##                  95% CI : (0.7652, 0.7817)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5374          
##                                           
##  Mcnemar's Test P-Value : 7.315e-13       
##                                           
##             Sensitivity : 0.8274          
##             Specificity : 0.7058          
##          Pos Pred Value : 0.7796          
##          Neg Pred Value : 0.7648          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4609          
##    Detection Prevalence : 0.5912          
##       Balanced Accuracy : 0.7666          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : Sesudah SMOTE 
## Metode   : KNN 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           194     385
##   Merokok                 361    1012
##                                           
##                Accuracy : 0.6178          
##                  95% CI : (0.5959, 0.6394)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.073           
##                                           
##  Mcnemar's Test P-Value : 0.3997          
##                                           
##             Sensitivity : 0.7244          
##             Specificity : 0.3495          
##          Pos Pred Value : 0.7371          
##          Neg Pred Value : 0.3351          
##              Prevalence : 0.7157          
##          Detection Rate : 0.5184          
##    Detection Prevalence : 0.7034          
##       Balanced Accuracy : 0.5370          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
##             Parameter      Nilai
## 1              Metode      rpart
## 2 Splitting Criterion Gini Index
## 3        cp (default)       0.01
## 4            maxdepth         10
## 5            Platform  R - rpart
## Split    : 80:20 
## Kondisi  : Sesudah SMOTE 
## Metode   : Decision Tree 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2083    1033
##   Merokok                2363    4558
##                                           
##                Accuracy : 0.6617          
##                  95% CI : (0.6523, 0.6709)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.2927          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8152          
##             Specificity : 0.4685          
##          Pos Pred Value : 0.6586          
##          Neg Pred Value : 0.6685          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4541          
##    Detection Prevalence : 0.6895          
##       Balanced Accuracy : 0.6419          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : Sesudah SMOTE 
## Metode   : Decision Tree 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           199     263
##   Merokok                 356    1134
##                                           
##                Accuracy : 0.6829          
##                  95% CI : (0.6617, 0.7035)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 0.9993233       
##                                           
##                   Kappa : 0.1794          
##                                           
##  Mcnemar's Test P-Value : 0.0002175       
##                                           
##             Sensitivity : 0.8117          
##             Specificity : 0.3586          
##          Pos Pred Value : 0.7611          
##          Neg Pred Value : 0.4307          
##              Prevalence : 0.7157          
##          Detection Rate : 0.5809          
##    Detection Prevalence : 0.7633          
##       Balanced Accuracy : 0.5851          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
##             Parameter          Nilai
## 1              Metode    naive_bayes
## 2   laplace (default)              0
## 3 usekernel (default)          FALSE
## 4    adjust (default)              1
## 5            Platform R - naivebayes
## Split    : 80:20 
## Kondisi  : Sesudah SMOTE 
## Metode   : Naive Bayes 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2960    2617
##   Merokok                1486    2974
##                                           
##                Accuracy : 0.5912          
##                  95% CI : (0.5815, 0.6009)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : 2.549e-12       
##                                           
##                   Kappa : 0.1927          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5319          
##             Specificity : 0.6658          
##          Pos Pred Value : 0.6668          
##          Neg Pred Value : 0.5308          
##              Prevalence : 0.5570          
##          Detection Rate : 0.2963          
##    Detection Prevalence : 0.4444          
##       Balanced Accuracy : 0.5988          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : Sesudah SMOTE 
## Metode   : Naive Bayes 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           345     647
##   Merokok                 210     750
##                                           
##                Accuracy : 0.561           
##                  95% CI : (0.5386, 0.5831)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1281          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5369          
##             Specificity : 0.6216          
##          Pos Pred Value : 0.7812          
##          Neg Pred Value : 0.3478          
##              Prevalence : 0.7157          
##          Detection Rate : 0.3842          
##    Detection Prevalence : 0.4918          
##       Balanced Accuracy : 0.5792          
##                                           
##        'Positive' Class : Merokok         
##

# Fit and evaluate the baseline classifiers for the 70:30 split, training on
# the SMOTE-balanced training set and scoring on the held-out 30% test set.
res_smote_70 <- jalankan_model(
  train_70_smote,
  test_30_num,
  "70:30",
  "Sesudah SMOTE"
)

## 
## >>> KNN <<<
## Spesifikasi KNN:
##       Parameter          Nilai
## 1        Metode            KNN
## 2   k (default)              5
## 3 Preprocessing Center + Scale
## 4      Platform      R - caret
## Split    : 70:30 
## Kondisi  : Sesudah SMOTE 
## Metode   : KNN 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2735     807
##   Merokok                1155    4085
##                                           
##                Accuracy : 0.7766          
##                  95% CI : (0.7677, 0.7853)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5431          
##                                           
##  Mcnemar's Test P-Value : 4.728e-15       
##                                           
##             Sensitivity : 0.8350          
##             Specificity : 0.7031          
##          Pos Pred Value : 0.7796          
##          Neg Pred Value : 0.7722          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4652          
##    Detection Prevalence : 0.5967          
##       Balanced Accuracy : 0.7691          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 70:30 
## Kondisi  : Sesudah SMOTE 
## Metode   : KNN 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           320     594
##   Merokok                 513    1502
##                                           
##                Accuracy : 0.6221          
##                  95% CI : (0.6042, 0.6397)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.0979          
##                                           
##  Mcnemar's Test P-Value : 0.0162          
##                                           
##             Sensitivity : 0.7166          
##             Specificity : 0.3842          
##          Pos Pred Value : 0.7454          
##          Neg Pred Value : 0.3501          
##              Prevalence : 0.7156          
##          Detection Rate : 0.5128          
##    Detection Prevalence : 0.6879          
##       Balanced Accuracy : 0.5504          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
##             Parameter      Nilai
## 1              Metode      rpart
## 2 Splitting Criterion Gini Index
## 3        cp (default)       0.01
## 4            maxdepth         10
## 5            Platform  R - rpart
## Split    : 70:30 
## Kondisi  : Sesudah SMOTE 
## Metode   : Decision Tree 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          1756     920
##   Merokok                2134    3972
##                                           
##                Accuracy : 0.6522          
##                  95% CI : (0.6422, 0.6622)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.272           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8119          
##             Specificity : 0.4514          
##          Pos Pred Value : 0.6505          
##          Neg Pred Value : 0.6562          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4523          
##    Detection Prevalence : 0.6953          
##       Balanced Accuracy : 0.6317          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 70:30 
## Kondisi  : Sesudah SMOTE 
## Metode   : Decision Tree 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           322     376
##   Merokok                 511    1720
##                                           
##                Accuracy : 0.6972          
##                  95% CI : (0.6802, 0.7138)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 0.9868          
##                                           
##                   Kappa : 0.2178          
##                                           
##  Mcnemar's Test P-Value : 6.818e-06       
##                                           
##             Sensitivity : 0.8206          
##             Specificity : 0.3866          
##          Pos Pred Value : 0.7710          
##          Neg Pred Value : 0.4613          
##              Prevalence : 0.7156          
##          Detection Rate : 0.5872          
##    Detection Prevalence : 0.7617          
##       Balanced Accuracy : 0.6036          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
##             Parameter          Nilai
## 1              Metode    naive_bayes
## 2   laplace (default)              0
## 3 usekernel (default)          FALSE
## 4    adjust (default)              1
## 5            Platform R - naivebayes
## Split    : 70:30 
## Kondisi  : Sesudah SMOTE 
## Metode   : Naive Bayes 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2589    2349
##   Merokok                1301    2543
##                                          
##                Accuracy : 0.5844         
##                  95% CI : (0.574, 0.5947)
##     No Information Rate : 0.557          
##     P-Value [Acc > NIR] : 1.257e-07      
##                                          
##                   Kappa : 0.1804         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.5198         
##             Specificity : 0.6656         
##          Pos Pred Value : 0.6616         
##          Neg Pred Value : 0.5243         
##              Prevalence : 0.5570         
##          Detection Rate : 0.2896         
##    Detection Prevalence : 0.4377         
##       Balanced Accuracy : 0.5927         
##                                          
##        'Positive' Class : Merokok        
##                                          
## Split    : 70:30 
## Kondisi  : Sesudah SMOTE 
## Metode   : Naive Bayes 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           553     978
##   Merokok                 280    1118
##                                           
##                Accuracy : 0.5705          
##                  95% CI : (0.5523, 0.5885)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1575          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5334          
##             Specificity : 0.6639          
##          Pos Pred Value : 0.7997          
##          Neg Pred Value : 0.3612          
##              Prevalence : 0.7156          
##          Detection Rate : 0.3817          
##    Detection Prevalence : 0.4773          
##       Balanced Accuracy : 0.5986          
##                                           
##        'Positive' Class : Merokok         
##

# Stack the per-split metric tables (90:10, 80:20, 70:30) obtained after SMOTE
# into a single comparison table and show it.
# NOTE(review): the "Precision...N" row names in the printed table look like
# repaired duplicate row names coming from the per-model tables - confirm
# against evaluasi_model() if clean row names are wanted.
hasil_smote <- list(
  res_smote_90$hasil,
  res_smote_80$hasil,
  res_smote_70$hasil
) %>%
  bind_rows()
print(hasil_smote)

##                Split       Kondisi         Model Set_Data  Accuracy Precision
## Precision...1  90:10 Sesudah SMOTE           KNN Training 0.7638151 0.7642597
## Precision...2  90:10 Sesudah SMOTE           KNN  Testing 0.6317949 0.7411095
## Precision...3  90:10 Sesudah SMOTE Decision Tree Training 0.6548884 0.6535352
## Precision...4  90:10 Sesudah SMOTE Decision Tree  Testing 0.6789744 0.7604871
## Precision...5  90:10 Sesudah SMOTE   Naive Bayes Training 0.5812079 0.6669519
## Precision...6  90:10 Sesudah SMOTE   Naive Bayes  Testing 0.5620513 0.8017817
## Precision...7  80:20 Sesudah SMOTE           KNN Training 0.7735379 0.7795753
## Precision...8  80:20 Sesudah SMOTE           KNN  Testing 0.6178279 0.7370721
## Precision...9  80:20 Sesudah SMOTE Decision Tree Training 0.6616519 0.6585754
## Precision...10 80:20 Sesudah SMOTE Decision Tree  Testing 0.6828893 0.7610738
## Precision...11 80:20 Sesudah SMOTE   Naive Bayes Training 0.5912125 0.6668161
## Precision...12 80:20 Sesudah SMOTE   Naive Bayes  Testing 0.5609631 0.7812500
## Precision...13 70:30 Sesudah SMOTE           KNN Training 0.7765885 0.7795802
## Precision...14 70:30 Sesudah SMOTE           KNN  Testing 0.6220553 0.7454094
## Precision...15 70:30 Sesudah SMOTE Decision Tree Training 0.6522432 0.6505077
## Precision...16 70:30 Sesudah SMOTE Decision Tree  Testing 0.6971663 0.7709547
## Precision...17 70:30 Sesudah SMOTE   Naive Bayes Training 0.5843771 0.6615505
## Precision...18 70:30 Sesudah SMOTE   Naive Bayes  Testing 0.5705019 0.7997139
##                   Recall  F1_Score Specificity      Kappa
## Precision...1  0.8329094 0.7971092   0.6769292 0.51580380
## Precision...2  0.7464183 0.7437545   0.3429603 0.08986783
## Precision...3  0.8096979 0.7232834   0.4602159 0.27848119
## Precision...4  0.8051576 0.7821851   0.3610108 0.17393234
## Precision...5  0.4957075 0.5687187   0.6887245 0.17853030
## Precision...6  0.5157593 0.6277245   0.6787004 0.15298634
## Precision...7  0.8274012 0.8027766   0.7058030 0.53744984
## Precision...8  0.7244094 0.7306859   0.3495495 0.07300758
## Precision...9  0.8152388 0.7285806   0.4685110 0.29271613
## Precision...10 0.8117394 0.7855906   0.3585586 0.17935511
## Precision...11 0.5319263 0.5917819   0.6657670 0.19267445
## Precision...12 0.5368647 0.6364022   0.6216216 0.12809180
## Precision...13 0.8350368 0.8063561   0.7030848 0.54309738
## Precision...14 0.7166031 0.7307225   0.3841537 0.09788887
## Precision...15 0.8119379 0.7223131   0.4514139 0.27204662
## Precision...16 0.8206107 0.7950081   0.3865546 0.21780010
## Precision...17 0.5198283 0.5821886   0.6655527 0.18040353
## Precision...18 0.5333969 0.6399542   0.6638655 0.15750004

13. Penerapan Grid Search dengan 5-Fold Cross Validation

Proses hyperparameter tuning dilakukan menggunakan Grid Search dengan 5-Fold Cross Validation. Pendekatan ini digunakan untuk memperoleh kombinasi parameter terbaik berdasarkan hasil evaluasi pada lima lipatan data pelatihan.

# Resampling setup shared by every grid search below: 5-fold cross-validation,
# grid search strategy, class probabilities enabled, and caret's default
# summary (Accuracy and Kappa) as the performance summary.
set.seed(123)
ctrl_grid <- trainControl(
  method = "cv",
  number = 5,
  search = "grid",
  summaryFunction = defaultSummary,
  classProbs = TRUE
)

13. Fungsi Hyperparameter Tuning Grid Search pada model Klasifikasi

Hyperparameter tuning dengan metode Grid Search dilakukan pada data hasil SMOTE untuk memperoleh kombinasi parameter yang optimal pada model Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Parameter terbaik yang diperoleh kemudian digunakan dalam proses pemodelan dan evaluasi model.

#' Grid-search tuning of KNN, Decision Tree and Naive Bayes
#'
#' For each of the three classifiers this runs `caret::train()` over a fixed
#' hyperparameter grid using the resampling scheme stored in the global
#' `ctrl_grid`, prints the resampling results, the selected setting and a small
#' specification table, and then scores the tuned model on `train_data` and on
#' `test_data` through `evaluasi_model()` (defined elsewhere in this file).
#'
#' The Decision Tree grid search is run with the same depth limit as the final
#' refitted tree, so the `cp` value picked by cross-validation belongs to the
#' model that is actually evaluated and reported.
#'
#' @param train_data Data frame holding the target column `Y` and the
#'   predictors; used for tuning and fitting. `Y` is expected to be a factor,
#'   since its levels are reused for the predictions.
#' @param test_data Data frame with the same columns, used only for scoring.
#' @param split_name Label of the train/test split (e.g. "90:10"), forwarded to
#'   `evaluasi_model()`.
#' @param dt_maxdepth Maximum depth of the decision tree, applied both during
#'   the grid search and in the final fit. Defaults to 10.
#' @return A list with `hasil` (the combined evaluation rows), `knn` (tuned
#'   caret KNN model), `dt` (final rpart tree), `dt_cv` (caret grid-search
#'   object for the tree) and `nb` (tuned caret Naive Bayes model).
tuning_model_grid <- function(train_data, test_data, split_name,
                              dt_maxdepth = 10) {
  kondisi <- "SMOTE + Grid Search"

  train_x <- train_data %>% dplyr::select(-Y)
  train_y <- train_data$Y
  test_x  <- test_data %>% dplyr::select(-Y)
  test_y  <- test_data$Y

  # NOTE(review): callers pass an already SMOTE-balanced training set, so the
  # CV folds may contain synthetic rows derived from rows in other folds;
  # treat the CV accuracies as optimistic - confirm against the SMOTE step.

  # Scores one tuned model on the training set and then on the test set (in
  # that order, so the printed reports keep their sequence) and returns both
  # evaluation rows. `prediksi` maps a predictor data frame to class labels.
  evaluasi_train_test <- function(prediksi, nama_model) {
    pred_train <- factor(prediksi(train_x), levels = levels(train_y))
    skor_train <- evaluasi_model(train_y, pred_train,
                                 nama_model, split_name, kondisi, "Training")

    pred_test <- factor(prediksi(test_x), levels = levels(test_y))
    skor_test <- evaluasi_model(test_y, pred_test,
                                nama_model, split_name, kondisi, "Testing")

    bind_rows(skor_train, skor_test)
  }

  # A. KNN - grid over k
  cat("\n>>> KNN Grid Search <<<\n")

  grid_knn <- expand.grid(k = c(3, 5, 7, 9, 11, 13, 15))

  set.seed(123)
  tune_knn <- train(Y ~ ., data = train_data, method = "knn",
                    trControl  = ctrl_grid,
                    tuneGrid   = grid_knn,
                    preProcess = c("center", "scale"),
                    metric     = "Accuracy")

  cat("\nHasil Grid KNN:\n")
  print(tune_knn$results)
  cat("\nBest Tune KNN:\n")
  print(tune_knn$bestTune)

  spek_knn_tuning <- data.frame(
    Parameter = c("Metode Tuning", "Search Strategy", "Cross-Validation",
                  "Grid k", "Best k"),
    Nilai     = c("Grid Search", "Grid", "5-Fold CV",
                  paste(grid_knn$k, collapse = ", "),
                  as.character(tune_knn$bestTune$k))
  )
  print(spek_knn_tuning)

  hasil_knn <- evaluasi_train_test(
    function(newdata) predict(tune_knn, newdata),
    "KNN Tuning"
  )

  # B. Decision Tree - grid over cp
  cat("\n>>> Decision Tree Grid Search <<<\n")

  grid_dt <- expand.grid(cp = c(0.0001, 0.001, 0.005, 0.01, 0.05, 0.1))

  # The depth limit is forwarded to rpart during the grid search as well;
  # otherwise cp would be tuned for trees under rpart's default depth limit
  # while the final tree below is capped at dt_maxdepth.
  set.seed(123)
  tune_dt_cv <- train(Y ~ ., data = train_data, method = "rpart",
                      trControl = ctrl_grid,
                      tuneGrid  = grid_dt,
                      metric    = "Accuracy",
                      control   = rpart.control(maxdepth = dt_maxdepth))

  cat("\nHasil Grid Decision Tree:\n")
  print(tune_dt_cv$results)
  cat("\nBest Tune Decision Tree:\n")
  print(tune_dt_cv$bestTune)

  spek_dt_tuning <- data.frame(
    Parameter = c("Metode Tuning", "Search Strategy", "Cross-Validation",
                  "Grid cp", "Splitting Criterion", "Best cp", "maxdepth"),
    Nilai     = c("Grid Search", "Grid", "5-Fold CV",
                  paste(grid_dt$cp, collapse = ", "),
                  "Gini Index",
                  as.character(round(tune_dt_cv$bestTune$cp, 6)),
                  as.character(dt_maxdepth))
  )
  print(spek_dt_tuning)

  # Refit a plain rpart tree with the selected cp so that callers receive an
  # rpart object in `dt`.
  tune_dt <- rpart(Y ~ ., data = train_data, method = "class",
                   control = rpart.control(cp       = tune_dt_cv$bestTune$cp,
                                           maxdepth = dt_maxdepth))

  hasil_dt <- evaluasi_train_test(
    function(newdata) predict(tune_dt, newdata, type = "class"),
    "Decision Tree Tuning"
  )

  # C. Naive Bayes - grid over laplace, usekernel, adjust
  cat("\n>>> Naive Bayes Grid Search <<<\n")

  grid_nb <- expand.grid(
    laplace   = c(0, 0.5, 1),
    usekernel = c(FALSE, TRUE),
    adjust    = c(0.5, 1, 1.5, 2)
  )

  set.seed(123)
  tune_nb <- train(Y ~ ., data = train_data, method = "naive_bayes",
                   trControl = ctrl_grid,
                   tuneGrid  = grid_nb,
                   metric    = "Accuracy")

  cat("\nHasil Grid Naive Bayes:\n")
  print(tune_nb$results)
  cat("\nBest Tune Naive Bayes:\n")
  print(tune_nb$bestTune)

  spek_nb_tuning <- data.frame(
    Parameter = c("Metode Tuning", "Search Strategy", "Cross-Validation",
                  "Grid laplace", "Grid usekernel", "Grid adjust",
                  "Best laplace", "Best usekernel", "Best adjust"),
    Nilai     = c("Grid Search", "Grid", "5-Fold CV",
                  paste(unique(grid_nb$laplace),   collapse = ", "),
                  paste(unique(grid_nb$usekernel), collapse = ", "),
                  paste(unique(grid_nb$adjust),    collapse = ", "),
                  as.character(tune_nb$bestTune$laplace),
                  as.character(tune_nb$bestTune$usekernel),
                  as.character(tune_nb$bestTune$adjust))
  )
  print(spek_nb_tuning)

  hasil_nb <- evaluasi_train_test(
    function(newdata) predict(tune_nb, newdata),
    "Naive Bayes Tuning"
  )

  list(
    hasil = bind_rows(hasil_knn, hasil_dt, hasil_nb),
    knn   = tune_knn,
    dt    = tune_dt,
    dt_cv = tune_dt_cv,
    nb    = tune_nb
  )
}

14. Proses Hyperparameter Tuning Grid Search pada Data Hasil SMOTE

Pada tahap ini dilakukan proses hyperparameter tuning menggunakan metode Grid Search pada data hasil SMOTE untuk memperoleh kombinasi parameter terbaik pada model klasifikasi, yaitu Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN).

# Grid-search tuning for the 90:10 split: tune on the SMOTE-balanced training
# set and score on the 10% test set.
tuning_90 <- tuning_model_grid(
  train_data = train_90_smote,
  test_data  = test_10_num,
  split_name = "90:10"
)

## 
## >>> KNN Grid Search <<<
## 
## Hasil Grid KNN:
##    k  Accuracy     Kappa  AccuracySD     KappaSD
## 1  3 0.6526736 0.2891045 0.006751281 0.014633744
## 2  5 0.6483350 0.2785450 0.004970076 0.010820510
## 3  7 0.6462088 0.2728919 0.005466383 0.011393553
## 4  9 0.6436411 0.2661629 0.002691579 0.006242274
## 5 11 0.6412496 0.2606083 0.004882112 0.010561670
## 6 13 0.6393010 0.2556801 0.008963066 0.019413253
## 7 15 0.6377956 0.2510948 0.008086724 0.017535153
## 
## Best Tune KNN:
##   k
## 1 3
##          Parameter                  Nilai
## 1    Metode Tuning            Grid Search
## 2  Search Strategy                   Grid
## 3 Cross-Validation              5-Fold CV
## 4           Grid k 3, 5, 7, 9, 11, 13, 15
## 5           Best k                      3
## Split    : 90:10 
## Kondisi  : SMOTE + Grid Search 
## Metode   : KNN Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          3792     886
##   Merokok                1210    5404
##                                           
##                Accuracy : 0.8144          
##                  95% CI : (0.8071, 0.8215)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6214          
##                                           
##  Mcnemar's Test P-Value : 1.724e-12       
##                                           
##             Sensitivity : 0.8591          
##             Specificity : 0.7581          
##          Pos Pred Value : 0.8171          
##          Neg Pred Value : 0.8106          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4786          
##    Detection Prevalence : 0.5857          
##       Balanced Accuracy : 0.8086          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 90:10 
## Kondisi  : SMOTE + Grid Search 
## Metode   : KNN Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok            89     188
##   Merokok                 188     510
##                                         
##                Accuracy : 0.6144        
##                  95% CI : (0.583, 0.645)
##     No Information Rate : 0.7159        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.052         
##                                         
##  Mcnemar's Test P-Value : 1             
##                                         
##             Sensitivity : 0.7307        
##             Specificity : 0.3213        
##          Pos Pred Value : 0.7307        
##          Neg Pred Value : 0.3213        
##              Prevalence : 0.7159        
##          Detection Rate : 0.5231        
##    Detection Prevalence : 0.7159        
##       Balanced Accuracy : 0.5260        
##                                         
##        'Positive' Class : Merokok       
##                                         
## 
## >>> Decision Tree Grid Search <<<
## 
## Hasil Grid Decision Tree:
##      cp  Accuracy     Kappa  AccuracySD    KappaSD
## 1 1e-04 0.6871226 0.3562079 0.006777489 0.01519806
## 2 1e-03 0.6948268 0.3612574 0.008934419 0.01877888
## 3 5e-03 0.6592271 0.2842918 0.007847472 0.01622820
## 4 1e-02 0.6480692 0.2619817 0.011089281 0.02330709
## 5 5e-02 0.6152145 0.1799497 0.005137419 0.01091414
## 6 1e-01 0.6152145 0.1799497 0.005137419 0.01091414
## 
## Best Tune Decision Tree:
##      cp
## 2 0.001
##             Parameter                                Nilai
## 1       Metode Tuning                          Grid Search
## 2     Search Strategy                                 Grid
## 3    Cross-Validation                            5-Fold CV
## 4             Grid cp 1e-04, 0.001, 0.005, 0.01, 0.05, 0.1
## 5 Splitting Criterion                           Gini Index
## 6             Best cp                                0.001
## 7            maxdepth                                   10
## Split    : 90:10 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Decision Tree Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2391     707
##   Merokok                2611    5583
##                                           
##                Accuracy : 0.7062          
##                  95% CI : (0.6977, 0.7146)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3804          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8876          
##             Specificity : 0.4780          
##          Pos Pred Value : 0.6814          
##          Neg Pred Value : 0.7718          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4944          
##    Detection Prevalence : 0.7256          
##       Balanced Accuracy : 0.6828          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 90:10 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Decision Tree Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok            67      84
##   Merokok                 210     614
##                                           
##                Accuracy : 0.6985          
##                  95% CI : (0.6686, 0.7271)
##     No Information Rate : 0.7159          
##     P-Value [Acc > NIR] : 0.8925          
##                                           
##                   Kappa : 0.1409          
##                                           
##  Mcnemar's Test P-Value : 3.096e-13       
##                                           
##             Sensitivity : 0.8797          
##             Specificity : 0.2419          
##          Pos Pred Value : 0.7451          
##          Neg Pred Value : 0.4437          
##              Prevalence : 0.7159          
##          Detection Rate : 0.6297          
##    Detection Prevalence : 0.8451          
##       Balanced Accuracy : 0.5608          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes Grid Search <<<
## 
## Hasil Grid Naive Bayes:
##    laplace usekernel adjust  Accuracy     Kappa  AccuracySD     KappaSD
## 1      0.0     FALSE    0.5 0.5757183 0.1690515 0.012728918 0.022399680
## 2      0.0     FALSE    1.0 0.5757183 0.1690515 0.012728918 0.022399680
## 3      0.0     FALSE    1.5 0.5757183 0.1690515 0.012728918 0.022399680
## 4      0.0     FALSE    2.0 0.5757183 0.1690515 0.012728918 0.022399680
## 5      0.0      TRUE    0.5 0.6923478 0.3724414 0.005540614 0.012912229
## 6      0.0      TRUE    1.0 0.6541803 0.2987830 0.005573450 0.008554418
## 7      0.0      TRUE    1.5 0.6250451 0.2506262 0.011094902 0.018577419
## 8      0.0      TRUE    2.0 0.6047652 0.2178888 0.007280343 0.009985229
## 9      0.5     FALSE    0.5 0.5757183 0.1690515 0.012728918 0.022399680
## 10     0.5     FALSE    1.0 0.5757183 0.1690515 0.012728918 0.022399680
## 11     0.5     FALSE    1.5 0.5757183 0.1690515 0.012728918 0.022399680
## 12     0.5     FALSE    2.0 0.5757183 0.1690515 0.012728918 0.022399680
## 13     0.5      TRUE    0.5 0.6923478 0.3724414 0.005540614 0.012912229
## 14     0.5      TRUE    1.0 0.6541803 0.2987830 0.005573450 0.008554418
## 15     0.5      TRUE    1.5 0.6250451 0.2506262 0.011094902 0.018577419
## 16     0.5      TRUE    2.0 0.6047652 0.2178888 0.007280343 0.009985229
## 17     1.0     FALSE    0.5 0.5757183 0.1690515 0.012728918 0.022399680
## 18     1.0     FALSE    1.0 0.5757183 0.1690515 0.012728918 0.022399680
## 19     1.0     FALSE    1.5 0.5757183 0.1690515 0.012728918 0.022399680
## 20     1.0     FALSE    2.0 0.5757183 0.1690515 0.012728918 0.022399680
## 21     1.0      TRUE    0.5 0.6923478 0.3724414 0.005540614 0.012912229
## 22     1.0      TRUE    1.0 0.6541803 0.2987830 0.005573450 0.008554418
## 23     1.0      TRUE    1.5 0.6250451 0.2506262 0.011094902 0.018577419
## 24     1.0      TRUE    2.0 0.6047652 0.2178888 0.007280343 0.009985229
## 
## Best Tune Naive Bayes:
##   laplace usekernel adjust
## 5       0      TRUE    0.5
##          Parameter          Nilai
## 1    Metode Tuning    Grid Search
## 2  Search Strategy           Grid
## 3 Cross-Validation      5-Fold CV
## 4     Grid laplace      0, 0.5, 1
## 5   Grid usekernel    FALSE, TRUE
## 6      Grid adjust 0.5, 1, 1.5, 2
## 7     Best laplace              0
## 8   Best usekernel           TRUE
## 9      Best adjust            0.5
## Split    : 90:10 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Naive Bayes Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          3120    1551
##   Merokok                1882    4739
##                                           
##                Accuracy : 0.696           
##                  95% CI : (0.6874, 0.7045)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3797          
##                                           
##  Mcnemar's Test P-Value : 1.779e-08       
##                                           
##             Sensitivity : 0.7534          
##             Specificity : 0.6238          
##          Pos Pred Value : 0.7158          
##          Neg Pred Value : 0.6680          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4197          
##    Detection Prevalence : 0.5863          
##       Balanced Accuracy : 0.6886          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 90:10 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Naive Bayes Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           115     162
##   Merokok                 162     536
##                                           
##                Accuracy : 0.6677          
##                  95% CI : (0.6371, 0.6972)
##     No Information Rate : 0.7159          
##     P-Value [Acc > NIR] : 0.9996          
##                                           
##                   Kappa : 0.1831          
##                                           
##  Mcnemar's Test P-Value : 1.0000          
##                                           
##             Sensitivity : 0.7679          
##             Specificity : 0.4152          
##          Pos Pred Value : 0.7679          
##          Neg Pred Value : 0.4152          
##              Prevalence : 0.7159          
##          Detection Rate : 0.5497          
##    Detection Prevalence : 0.7159          
##       Balanced Accuracy : 0.5915          
##                                           
##        'Positive' Class : Merokok         
##

# Grid-search tuning for the 80:20 split. The log printed below shows KNN,
# Decision Tree and Naive Bayes each being tuned with 5-fold CV and then
# scored on both the training and the testing data.
# NOTE(review): tuning_model_grid() is defined outside this section; the
# argument roles (SMOTE-balanced training set, numeric test set, split label)
# are inferred from the variable names -- confirm against its definition.
tuning_80 <- tuning_model_grid(train_80_smote, test_20_num, "80:20")

## 
## >>> KNN Grid Search <<<
## 
## Hasil Grid KNN:
##    k  Accuracy     Kappa  AccuracySD     KappaSD
## 1  3 0.6603563 0.3061821 0.002219518 0.003602833
## 2  5 0.6578639 0.2981994 0.010000064 0.020695645
## 3  7 0.6533814 0.2877321 0.005278955 0.010749839
## 4  9 0.6476025 0.2749781 0.008234597 0.016831306
## 5 11 0.6447138 0.2680527 0.004509737 0.009778393
## 6 13 0.6428203 0.2630049 0.007938468 0.016046519
## 7 15 0.6424224 0.2609627 0.005359674 0.011055896
## 
## Best Tune KNN:
##   k
## 1 3
##          Parameter                  Nilai
## 1    Metode Tuning            Grid Search
## 2  Search Strategy                   Grid
## 3 Cross-Validation              5-Fold CV
## 4           Grid k 3, 5, 7, 9, 11, 13, 15
## 5           Best k                      3
## Split    : 80:20 
## Kondisi  : SMOTE + Grid Search 
## Metode   : KNN Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          3475     825
##   Merokok                 971    4766
##                                           
##                Accuracy : 0.8211          
##                  95% CI : (0.8134, 0.8285)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6362          
##                                           
##  Mcnemar's Test P-Value : 0.0006228       
##                                           
##             Sensitivity : 0.8524          
##             Specificity : 0.7816          
##          Pos Pred Value : 0.8307          
##          Neg Pred Value : 0.8081          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4748          
##    Detection Prevalence : 0.5716          
##       Balanced Accuracy : 0.8170          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : SMOTE + Grid Search 
## Metode   : KNN Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           179     390
##   Merokok                 376    1007
##                                           
##                Accuracy : 0.6076          
##                  95% CI : (0.5855, 0.6293)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 1.0000          
##                                           
##                   Kappa : 0.043           
##                                           
##  Mcnemar's Test P-Value : 0.6386          
##                                           
##             Sensitivity : 0.7208          
##             Specificity : 0.3225          
##          Pos Pred Value : 0.7281          
##          Neg Pred Value : 0.3146          
##              Prevalence : 0.7157          
##          Detection Rate : 0.5159          
##    Detection Prevalence : 0.7085          
##       Balanced Accuracy : 0.5217          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree Grid Search <<<
## 
## Hasil Grid Decision Tree:
##      cp  Accuracy     Kappa  AccuracySD     KappaSD
## 1 1e-04 0.6937310 0.3705395 0.010407544 0.023202931
## 2 1e-03 0.6941313 0.3636930 0.004555454 0.009537227
## 3 5e-03 0.6717145 0.3094030 0.003795039 0.009691605
## 4 1e-02 0.6574673 0.2811684 0.005214245 0.014762149
## 5 5e-02 0.6166197 0.1830240 0.006949055 0.016473587
## 6 1e-01 0.6166197 0.1830240 0.006949055 0.016473587
## 
## Best Tune Decision Tree:
##      cp
## 2 0.001
##             Parameter                                Nilai
## 1       Metode Tuning                          Grid Search
## 2     Search Strategy                                 Grid
## 3    Cross-Validation                            5-Fold CV
## 4             Grid cp 1e-04, 0.001, 0.005, 0.01, 0.05, 0.1
## 5 Splitting Criterion                           Gini Index
## 6             Best cp                                0.001
## 7            maxdepth                                   10
## Split    : 80:20 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Decision Tree Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2127     634
##   Merokok                2319    4957
##                                           
##                Accuracy : 0.7058          
##                  95% CI : (0.6968, 0.7147)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3797          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.8866          
##             Specificity : 0.4784          
##          Pos Pred Value : 0.6813          
##          Neg Pred Value : 0.7704          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4939          
##    Detection Prevalence : 0.7249          
##       Balanced Accuracy : 0.6825          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Decision Tree Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           147     182
##   Merokok                 408    1215
##                                           
##                Accuracy : 0.6977          
##                  95% CI : (0.6768, 0.7181)
##     No Information Rate : 0.7157          
##     P-Value [Acc > NIR] : 0.9619          
##                                           
##                   Kappa : 0.1534          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8697          
##             Specificity : 0.2649          
##          Pos Pred Value : 0.7486          
##          Neg Pred Value : 0.4468          
##              Prevalence : 0.7157          
##          Detection Rate : 0.6224          
##    Detection Prevalence : 0.8315          
##       Balanced Accuracy : 0.5673          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes Grid Search <<<
## 
## Hasil Grid Naive Bayes:
##    laplace usekernel adjust  Accuracy     Kappa AccuracySD    KappaSD
## 1      0.0     FALSE    0.5 0.5891196 0.1893072 0.01244551 0.01948005
## 2      0.0     FALSE    1.0 0.5891196 0.1893072 0.01244551 0.01948005
## 3      0.0     FALSE    1.5 0.5891196 0.1893072 0.01244551 0.01948005
## 4      0.0     FALSE    2.0 0.5891196 0.1893072 0.01244551 0.01948005
## 5      0.0      TRUE    0.5 0.6692221 0.3345024 0.01880158 0.02933722
## 6      0.0      TRUE    1.0 0.6319608 0.2654318 0.01792967 0.02585402
## 7      0.0      TRUE    1.5 0.6105399 0.2333733 0.02231004 0.03222596
## 8      0.0      TRUE    2.0 0.5962928 0.2092832 0.01965208 0.02728571
## 9      0.5     FALSE    0.5 0.5891196 0.1893072 0.01244551 0.01948005
## 10     0.5     FALSE    1.0 0.5891196 0.1893072 0.01244551 0.01948005
## 11     0.5     FALSE    1.5 0.5891196 0.1893072 0.01244551 0.01948005
## 12     0.5     FALSE    2.0 0.5891196 0.1893072 0.01244551 0.01948005
## 13     0.5      TRUE    0.5 0.6692221 0.3345024 0.01880158 0.02933722
## 14     0.5      TRUE    1.0 0.6319608 0.2654318 0.01792967 0.02585402
## 15     0.5      TRUE    1.5 0.6105399 0.2333733 0.02231004 0.03222596
## 16     0.5      TRUE    2.0 0.5962928 0.2092832 0.01965208 0.02728571
## 17     1.0     FALSE    0.5 0.5891196 0.1893072 0.01244551 0.01948005
## 18     1.0     FALSE    1.0 0.5891196 0.1893072 0.01244551 0.01948005
## 19     1.0     FALSE    1.5 0.5891196 0.1893072 0.01244551 0.01948005
## 20     1.0     FALSE    2.0 0.5891196 0.1893072 0.01244551 0.01948005
## 21     1.0      TRUE    0.5 0.6692221 0.3345024 0.01880158 0.02933722
## 22     1.0      TRUE    1.0 0.6319608 0.2654318 0.01792967 0.02585402
## 23     1.0      TRUE    1.5 0.6105399 0.2333733 0.02231004 0.03222596
## 24     1.0      TRUE    2.0 0.5962928 0.2092832 0.01965208 0.02728571
## 
## Best Tune Naive Bayes:
##   laplace usekernel adjust
## 5       0      TRUE    0.5
##          Parameter          Nilai
## 1    Metode Tuning    Grid Search
## 2  Search Strategy           Grid
## 3 Cross-Validation      5-Fold CV
## 4     Grid laplace      0, 0.5, 1
## 5   Grid usekernel    FALSE, TRUE
## 6      Grid adjust 0.5, 1, 1.5, 2
## 7     Best laplace              0
## 8   Best usekernel           TRUE
## 9      Best adjust            0.5
## Split    : 80:20 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Naive Bayes Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2968    1832
##   Merokok                1478    3759
##                                           
##                Accuracy : 0.6702          
##                  95% CI : (0.6609, 0.6794)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.3371          
##                                           
##  Mcnemar's Test P-Value : 8.481e-10       
##                                           
##             Sensitivity : 0.6723          
##             Specificity : 0.6676          
##          Pos Pred Value : 0.7178          
##          Neg Pred Value : 0.6183          
##              Prevalence : 0.5570          
##          Detection Rate : 0.3745          
##    Detection Prevalence : 0.5218          
##       Balanced Accuracy : 0.6699          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 80:20 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Naive Bayes Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           281     433
##   Merokok                 274     964
##                                          
##                Accuracy : 0.6378         
##                  95% CI : (0.616, 0.6592)
##     No Information Rate : 0.7157         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1808         
##                                          
##  Mcnemar's Test P-Value : 2.812e-09      
##                                          
##             Sensitivity : 0.6901         
##             Specificity : 0.5063         
##          Pos Pred Value : 0.7787         
##          Neg Pred Value : 0.3936         
##              Prevalence : 0.7157         
##          Detection Rate : 0.4939         
##    Detection Prevalence : 0.6342         
##       Balanced Accuracy : 0.5982         
##                                          
##        'Positive' Class : Merokok        
##

# Grid-search tuning for the 70:30 split. The log printed below shows KNN,
# Decision Tree and Naive Bayes each being tuned with 5-fold CV and then
# scored on both the training and the testing data.
# NOTE(review): tuning_model_grid() is defined outside this section; the
# argument roles (SMOTE-balanced training set, numeric test set, split label)
# are inferred from the variable names -- confirm against its definition.
tuning_70 <- tuning_model_grid(train_70_smote, test_30_num, "70:30")

## 
## >>> KNN Grid Search <<<
## 
## Hasil Grid KNN:
##    k  Accuracy     Kappa AccuracySD    KappaSD
## 1  3 0.6654539 0.3169267 0.01203198 0.02371039
## 2  5 0.6581671 0.2994442 0.01358479 0.02928201
## 3  7 0.6513347 0.2831086 0.01310253 0.02631077
## 4  9 0.6466662 0.2727664 0.01174490 0.02290493
## 5 11 0.6397197 0.2576399 0.01200059 0.02403883
## 6 13 0.6415406 0.2607905 0.01445131 0.02948253
## 7 15 0.6391503 0.2547549 0.01519120 0.03127271
## 
## Best Tune KNN:
##   k
## 1 3
##          Parameter                  Nilai
## 1    Metode Tuning            Grid Search
## 2  Search Strategy                   Grid
## 3 Cross-Validation              5-Fold CV
## 4           Grid k 3, 5, 7, 9, 11, 13, 15
## 5           Best k                      3
## Split    : 70:30 
## Kondisi  : SMOTE + Grid Search 
## Metode   : KNN Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          3036     671
##   Merokok                 854    4221
##                                           
##                Accuracy : 0.8263          
##                  95% CI : (0.8183, 0.8342)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6464          
##                                           
##  Mcnemar's Test P-Value : 3.154e-06       
##                                           
##             Sensitivity : 0.8628          
##             Specificity : 0.7805          
##          Pos Pred Value : 0.8317          
##          Neg Pred Value : 0.8190          
##              Prevalence : 0.5570          
##          Detection Rate : 0.4806          
##    Detection Prevalence : 0.5779          
##       Balanced Accuracy : 0.8217          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 70:30 
## Kondisi  : SMOTE + Grid Search 
## Metode   : KNN Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           318     587
##   Merokok                 515    1509
##                                           
##                Accuracy : 0.6238          
##                  95% CI : (0.6059, 0.6413)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 1.00000         
##                                           
##                   Kappa : 0.0991          
##                                           
##  Mcnemar's Test P-Value : 0.03245         
##                                           
##             Sensitivity : 0.7199          
##             Specificity : 0.3818          
##          Pos Pred Value : 0.7456          
##          Neg Pred Value : 0.3514          
##              Prevalence : 0.7156          
##          Detection Rate : 0.5152          
##    Detection Prevalence : 0.6910          
##       Balanced Accuracy : 0.5508          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Decision Tree Grid Search <<<
## 
## Hasil Grid Decision Tree:
##      cp  Accuracy     Kappa  AccuracySD     KappaSD
## 1 1e-04 0.6938071 0.3704462 0.012072840 0.023340879
## 2 1e-03 0.6876561 0.3463824 0.007264530 0.014419113
## 3 5e-03 0.6629465 0.2835895 0.005438087 0.013400742
## 4 1e-02 0.6437012 0.2536315 0.010797773 0.023369060
## 5 5e-02 0.6032791 0.1476151 0.002392875 0.009688167
## 6 1e-01 0.6032791 0.1476151 0.002392875 0.009688167
## 
## Best Tune Decision Tree:
##      cp
## 1 1e-04
##             Parameter                                Nilai
## 1       Metode Tuning                          Grid Search
## 2     Search Strategy                                 Grid
## 3    Cross-Validation                            5-Fold CV
## 4             Grid cp 1e-04, 0.001, 0.005, 0.01, 0.05, 0.1
## 5 Splitting Criterion                           Gini Index
## 6             Best cp                                1e-04
## 7            maxdepth                                   10
## Split    : 70:30 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Decision Tree Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          1927     508
##   Merokok                1963    4384
##                                          
##                Accuracy : 0.7186         
##                  95% CI : (0.7091, 0.728)
##     No Information Rate : 0.557          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.4071         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.8962         
##             Specificity : 0.4954         
##          Pos Pred Value : 0.6907         
##          Neg Pred Value : 0.7914         
##              Prevalence : 0.5570         
##          Detection Rate : 0.4992         
##    Detection Prevalence : 0.7227         
##       Balanced Accuracy : 0.6958         
##                                          
##        'Positive' Class : Merokok        
##                                          
## Split    : 70:30 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Decision Tree Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           221     250
##   Merokok                 612    1846
##                                           
##                Accuracy : 0.7057          
##                  95% CI : (0.6888, 0.7222)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 0.8863          
##                                           
##                   Kappa : 0.168           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8807          
##             Specificity : 0.2653          
##          Pos Pred Value : 0.7510          
##          Neg Pred Value : 0.4692          
##              Prevalence : 0.7156          
##          Detection Rate : 0.6302          
##    Detection Prevalence : 0.8392          
##       Balanced Accuracy : 0.5730          
##                                           
##        'Positive' Class : Merokok         
##                                           
## 
## >>> Naive Bayes Grid Search <<<
## 
## Hasil Grid Naive Bayes:
##    laplace usekernel adjust  Accuracy     Kappa  AccuracySD    KappaSD
## 1      0.0     FALSE    0.5 0.5824406 0.1769451 0.008527046 0.01581122
## 2      0.0     FALSE    1.0 0.5824406 0.1769451 0.008527046 0.01581122
## 3      0.0     FALSE    1.5 0.5824406 0.1769451 0.008527046 0.01581122
## 4      0.0     FALSE    2.0 0.5824406 0.1769451 0.008527046 0.01581122
## 5      0.0      TRUE    0.5 0.6709171 0.3354427 0.006595026 0.01107505
## 6      0.0      TRUE    1.0 0.6323153 0.2633345 0.008541983 0.01555615
## 7      0.0      TRUE    1.5 0.6068084 0.2233559 0.010308988 0.01873607
## 8      0.0      TRUE    2.0 0.5946238 0.2032387 0.009400893 0.01646852
## 9      0.5     FALSE    0.5 0.5824406 0.1769451 0.008527046 0.01581122
## 10     0.5     FALSE    1.0 0.5824406 0.1769451 0.008527046 0.01581122
## 11     0.5     FALSE    1.5 0.5824406 0.1769451 0.008527046 0.01581122
## 12     0.5     FALSE    2.0 0.5824406 0.1769451 0.008527046 0.01581122
## 13     0.5      TRUE    0.5 0.6709171 0.3354427 0.006595026 0.01107505
## 14     0.5      TRUE    1.0 0.6323153 0.2633345 0.008541983 0.01555615
## 15     0.5      TRUE    1.5 0.6068084 0.2233559 0.010308988 0.01873607
## 16     0.5      TRUE    2.0 0.5946238 0.2032387 0.009400893 0.01646852
## 17     1.0     FALSE    0.5 0.5824406 0.1769451 0.008527046 0.01581122
## 18     1.0     FALSE    1.0 0.5824406 0.1769451 0.008527046 0.01581122
## 19     1.0     FALSE    1.5 0.5824406 0.1769451 0.008527046 0.01581122
## 20     1.0     FALSE    2.0 0.5824406 0.1769451 0.008527046 0.01581122
## 21     1.0      TRUE    0.5 0.6709171 0.3354427 0.006595026 0.01107505
## 22     1.0      TRUE    1.0 0.6323153 0.2633345 0.008541983 0.01555615
## 23     1.0      TRUE    1.5 0.6068084 0.2233559 0.010308988 0.01873607
## 24     1.0      TRUE    2.0 0.5946238 0.2032387 0.009400893 0.01646852
## 
## Best Tune Naive Bayes:
##   laplace usekernel adjust
## 5       0      TRUE    0.5
##          Parameter          Nilai
## 1    Metode Tuning    Grid Search
## 2  Search Strategy           Grid
## 3 Cross-Validation      5-Fold CV
## 4     Grid laplace      0, 0.5, 1
## 5   Grid usekernel    FALSE, TRUE
## 6      Grid adjust 0.5, 1, 1.5, 2
## 7     Best laplace              0
## 8   Best usekernel           TRUE
## 9      Best adjust            0.5
## Split    : 70:30 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Naive Bayes Tuning 
## Set Data : Training 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok          2514    1469
##   Merokok                1376    3423
##                                           
##                Accuracy : 0.676           
##                  95% CI : (0.6661, 0.6858)
##     No Information Rate : 0.557           
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.3451          
##                                           
##  Mcnemar's Test P-Value : 0.08456         
##                                           
##             Sensitivity : 0.6997          
##             Specificity : 0.6463          
##          Pos Pred Value : 0.7133          
##          Neg Pred Value : 0.6312          
##              Prevalence : 0.5570          
##          Detection Rate : 0.3898          
##    Detection Prevalence : 0.5465          
##       Balanced Accuracy : 0.6730          
##                                           
##        'Positive' Class : Merokok         
##                                           
## Split    : 70:30 
## Kondisi  : SMOTE + Grid Search 
## Metode   : Naive Bayes Tuning 
## Set Data : Testing 
## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           437     587
##   Merokok                 396    1509
##                                          
##                Accuracy : 0.6644         
##                  95% CI : (0.647, 0.6815)
##     No Information Rate : 0.7156         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2287         
##                                          
##  Mcnemar's Test P-Value : 1.361e-09      
##                                          
##             Sensitivity : 0.7199         
##             Specificity : 0.5246         
##          Pos Pred Value : 0.7921         
##          Neg Pred Value : 0.4268         
##              Prevalence : 0.7156         
##          Detection Rate : 0.5152         
##    Detection Prevalence : 0.6504         
##       Balanced Accuracy : 0.6223         
##                                          
##        'Positive' Class : Merokok        
##

15. Hasil Parameter Optimal Model pada Kondisi SMOTE dengan Grid Search

Berdasarkan proses Grid Search pada data hasil SMOTE, diperoleh parameter optimal untuk model Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Parameter tersebut digunakan sebagai dasar dalam pembentukan model klasifikasi yang akan dievaluasi pada tahap selanjutnya. Hasil parameter optimal masing-masing model disajikan pada tabel berikut.

# --- KNN ---
# Summarise the number of neighbours (k) selected by grid search, one row per
# train/test split. Built inside local() so the helper list does not leak
# into the global environment.
best_knn <- local({
  tuned <- list("90:10" = tuning_90, "80:20" = tuning_80, "70:30" = tuning_70)
  # use.names = FALSE keeps the vector unnamed so data.frame() assigns the
  # default integer row names.
  chosen_k <- unlist(
    lapply(tuned, function(res) res$knn$bestTune$k),
    use.names = FALSE
  )
  data.frame(
    Model = "KNN",
    Split = names(tuned),
    k     = chosen_k
  )
})
cat("\n--- Parameter Terbaik KNN ---\n")

## 
## --- Parameter Terbaik KNN ---

# Display the per-split table of tuned k values.
print(best_knn)

##   Model Split k
## 1   KNN 90:10 3
## 2   KNN 80:20 3
## 3   KNN 70:30 3

# --- Decision Tree ---
# Summarise the complexity parameter (cp) selected by grid search, one row per
# train/test split; values are rounded to six decimals for display.
best_dt <- local({
  tuned <- list("90:10" = tuning_90, "80:20" = tuning_80, "70:30" = tuning_70)
  # use.names = FALSE keeps the vector unnamed so data.frame() assigns the
  # default integer row names.
  chosen_cp <- unlist(
    lapply(tuned, function(res) res$dt_cv$bestTune$cp),
    use.names = FALSE
  )
  data.frame(
    Model = "Decision Tree",
    Split = names(tuned),
    cp    = round(chosen_cp, 6)
  )
})
cat("\n--- Parameter Terbaik Decision Tree ---\n")

## 
## --- Parameter Terbaik Decision Tree ---

# Display the per-split table of tuned cp values.
print(best_dt)

##           Model Split    cp
## 1 Decision Tree 90:10 1e-03
## 2 Decision Tree 80:20 1e-03
## 3 Decision Tree 70:30 1e-04

# --- Naive Bayes ---
# Summarise the Naive Bayes settings (laplace, usekernel, adjust) selected by
# grid search, one row per train/test split.
best_nb <- local({
  tuned <- list("90:10" = tuning_90, "80:20" = tuning_80, "70:30" = tuning_70)
  # Pull one bestTune column from every split's Naive Bayes fit as an unnamed
  # vector, so data.frame() assigns the default integer row names.
  pick_param <- function(param) {
    unlist(
      lapply(tuned, function(res) res$nb$bestTune[[param]]),
      use.names = FALSE
    )
  }
  data.frame(
    Model     = "Naive Bayes",
    Split     = names(tuned),
    laplace   = pick_param("laplace"),
    usekernel = pick_param("usekernel"),
    adjust    = pick_param("adjust")
  )
})
cat("\n--- Parameter Terbaik Naive Bayes ---\n")

## 
## --- Parameter Terbaik Naive Bayes ---

# Display the per-split table of tuned Naive Bayes settings.
print(best_nb)

##         Model Split laplace usekernel adjust
## 1 Naive Bayes 90:10       0      TRUE    0.5
## 2 Naive Bayes 80:20       0      TRUE    0.5
## 3 Naive Bayes 70:30       0      TRUE    0.5

16. Evaluasi Model Menggunakan Parameter Optimal

Pada tahap ini dilakukan evaluasi model klasifikasi dengan menggunakan parameter optimal yang telah diperoleh dari proses hyperparameter tuning Grid Search. Evaluasi dilakukan untuk mengukur kinerja masing-masing model pada data training dan testing. Hasil evaluasi ini menjadi dasar dalam menentukan model terbaik.

# Evaluation results: stack the per-split metric tables produced by the
# grid-search runs (90:10, then 80:20, then 70:30) into a single data frame.
# NOTE(review): the "Precision...N" row labels in the printed table look like
# row names carried over from the per-model result rows -- consider clearing
# them where those rows are built.
hasil_tuning_semua <- bind_rows(
  lapply(
    list(tuning_90, tuning_80, tuning_70),
    function(res) res$hasil
  )
)
print(hasil_tuning_semua)

##                Split             Kondisi                Model Set_Data
## Precision...1  90:10 SMOTE + Grid Search           KNN Tuning Training
## Precision...2  90:10 SMOTE + Grid Search           KNN Tuning  Testing
## Precision...3  90:10 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...4  90:10 SMOTE + Grid Search Decision Tree Tuning  Testing
## Precision...5  90:10 SMOTE + Grid Search   Naive Bayes Tuning Training
## Precision...6  90:10 SMOTE + Grid Search   Naive Bayes Tuning  Testing
## Precision...7  80:20 SMOTE + Grid Search           KNN Tuning Training
## Precision...8  80:20 SMOTE + Grid Search           KNN Tuning  Testing
## Precision...9  80:20 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...10 80:20 SMOTE + Grid Search Decision Tree Tuning  Testing
## Precision...11 80:20 SMOTE + Grid Search   Naive Bayes Tuning Training
## Precision...12 80:20 SMOTE + Grid Search   Naive Bayes Tuning  Testing
## Precision...13 70:30 SMOTE + Grid Search           KNN Tuning Training
## Precision...14 70:30 SMOTE + Grid Search           KNN Tuning  Testing
## Precision...15 70:30 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...16 70:30 SMOTE + Grid Search Decision Tree Tuning  Testing
## Precision...17 70:30 SMOTE + Grid Search   Naive Bayes Tuning Training
## Precision...18 70:30 SMOTE + Grid Search   Naive Bayes Tuning  Testing
##                 Accuracy Precision    Recall  F1_Score Specificity      Kappa
## Precision...1  0.8143819 0.8170547 0.8591415 0.8375697   0.7580968 0.62135903
## Precision...2  0.6143590 0.7306590 0.7306590 0.7306590   0.3212996 0.05195866
## Precision...3  0.7061637 0.6813522 0.8875994 0.7709196   0.4780088 0.38043466
## Precision...4  0.6984615 0.7451456 0.8796562 0.8068331   0.2418773 0.14085588
## Precision...5  0.6959795 0.7157529 0.7534181 0.7341027   0.6237505 0.37974143
## Precision...6  0.6676923 0.7679083 0.7679083 0.7679083   0.4151625 0.18307076
## Precision...7  0.8210621 0.8307478 0.8524414 0.8414548   0.7816014 0.63618206
## Precision...8  0.6075820 0.7281273 0.7208304 0.7244604   0.3225225 0.04302580
## Precision...9  0.7057886 0.6812809 0.8866035 0.7704982   0.4784076 0.37974813
## Precision...10 0.6977459 0.7486137 0.8697208 0.8046358   0.2648649 0.15341177
## Precision...11 0.6702202 0.7177774 0.6723305 0.6943110   0.6675664 0.33714810
## Precision...12 0.6378074 0.7786753 0.6900501 0.7316888   0.5063063 0.18075116
## Precision...13 0.8263494 0.8317241 0.8628373 0.8469951   0.7804627 0.64641433
## Precision...14 0.6237624 0.7455534 0.7199427 0.7325243   0.3817527 0.09911434
## Precision...15 0.7186290 0.6907200 0.8961570 0.7801406   0.4953728 0.40712500
## Precision...16 0.7057016 0.7510171 0.8807252 0.8107159   0.2653061 0.16803208
## Precision...17 0.6760419 0.7132736 0.6997138 0.7064286   0.6462725 0.34514126
## Precision...18 0.6643906 0.7921260 0.7199427 0.7543114   0.5246098 0.22874989

17. Penentuan Model Terbaik Keseluruhan

# Combine the metric tables from all three conditions (baseline, after SMOTE,
# SMOTE + grid search) into one comparison table and display it.
# NOTE(review): nothing in this chunk appears to draw random numbers; the seed
# is kept so the RNG state seen by later chunks is unchanged -- confirm
# whether it is still needed.
set.seed(123)
perbandingan_final <- bind_rows(
  list(hasil_awal, hasil_smote, hasil_tuning_semua)
)
print(perbandingan_final)

##                Split             Kondisi                Model Set_Data
## Precision...1  90:10       Sebelum SMOTE                  KNN Training
## Precision...2  90:10       Sebelum SMOTE                  KNN  Testing
## Precision...3  90:10       Sebelum SMOTE        Decision Tree Training
## Precision...4  90:10       Sebelum SMOTE        Decision Tree  Testing
## Precision...5  90:10       Sebelum SMOTE          Naive Bayes Training
## Precision...6  90:10       Sebelum SMOTE          Naive Bayes  Testing
## Precision...7  80:20       Sebelum SMOTE                  KNN Training
## Precision...8  80:20       Sebelum SMOTE                  KNN  Testing
## Precision...9  80:20       Sebelum SMOTE        Decision Tree Training
## Precision...10 80:20       Sebelum SMOTE        Decision Tree  Testing
## Precision...11 80:20       Sebelum SMOTE          Naive Bayes Training
## Precision...12 80:20       Sebelum SMOTE          Naive Bayes  Testing
## Precision...13 70:30       Sebelum SMOTE                  KNN Training
## Precision...14 70:30       Sebelum SMOTE                  KNN  Testing
## Precision...15 70:30       Sebelum SMOTE        Decision Tree Training
## Precision...16 70:30       Sebelum SMOTE        Decision Tree  Testing
## Precision...17 70:30       Sebelum SMOTE          Naive Bayes Training
## Precision...18 70:30       Sebelum SMOTE          Naive Bayes  Testing
## Precision...19 90:10       Sesudah SMOTE                  KNN Training
## Precision...20 90:10       Sesudah SMOTE                  KNN  Testing
## Precision...21 90:10       Sesudah SMOTE        Decision Tree Training
## Precision...22 90:10       Sesudah SMOTE        Decision Tree  Testing
## Precision...23 90:10       Sesudah SMOTE          Naive Bayes Training
## Precision...24 90:10       Sesudah SMOTE          Naive Bayes  Testing
## Precision...25 80:20       Sesudah SMOTE                  KNN Training
## Precision...26 80:20       Sesudah SMOTE                  KNN  Testing
## Precision...27 80:20       Sesudah SMOTE        Decision Tree Training
## Precision...28 80:20       Sesudah SMOTE        Decision Tree  Testing
## Precision...29 80:20       Sesudah SMOTE          Naive Bayes Training
## Precision...30 80:20       Sesudah SMOTE          Naive Bayes  Testing
## Precision...31 70:30       Sesudah SMOTE                  KNN Training
## Precision...32 70:30       Sesudah SMOTE                  KNN  Testing
## Precision...33 70:30       Sesudah SMOTE        Decision Tree Training
## Precision...34 70:30       Sesudah SMOTE        Decision Tree  Testing
## Precision...35 70:30       Sesudah SMOTE          Naive Bayes Training
## Precision...36 70:30       Sesudah SMOTE          Naive Bayes  Testing
## Precision...37 90:10 SMOTE + Grid Search           KNN Tuning Training
## Precision...38 90:10 SMOTE + Grid Search           KNN Tuning  Testing
## Precision...39 90:10 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...40 90:10 SMOTE + Grid Search Decision Tree Tuning  Testing
## Precision...41 90:10 SMOTE + Grid Search   Naive Bayes Tuning Training
## Precision...42 90:10 SMOTE + Grid Search   Naive Bayes Tuning  Testing
## Precision...43 80:20 SMOTE + Grid Search           KNN Tuning Training
## Precision...44 80:20 SMOTE + Grid Search           KNN Tuning  Testing
## Precision...45 80:20 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...46 80:20 SMOTE + Grid Search Decision Tree Tuning  Testing
## Precision...47 80:20 SMOTE + Grid Search   Naive Bayes Tuning Training
## Precision...48 80:20 SMOTE + Grid Search   Naive Bayes Tuning  Testing
## Precision...49 70:30 SMOTE + Grid Search           KNN Tuning Training
## Precision...50 70:30 SMOTE + Grid Search           KNN Tuning  Testing
## Precision...51 70:30 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...52 70:30 SMOTE + Grid Search Decision Tree Tuning  Testing
## Precision...53 70:30 SMOTE + Grid Search   Naive Bayes Tuning Training
## Precision...54 70:30 SMOTE + Grid Search   Naive Bayes Tuning  Testing
##                 Accuracy Precision    Recall  F1_Score Specificity      Kappa
## Precision...1  0.7590718 0.7834239 0.9166932 0.8448352   0.3626549 0.32066914
## Precision...2  0.6758974 0.7323601 0.8624642 0.7921053   0.2057762 0.07889072
## Precision...3  0.7155045 0.7155045 1.0000000 0.8341622   0.0000000 0.00000000
## Precision...4  0.7158974 0.7158974 1.0000000 0.8344292   0.0000000 0.00000000
## Precision...5  0.5042657 0.8039648 0.4062003 0.5397127   0.7508996 0.11427327
## Precision...6  0.4789744 0.7668539 0.3911175 0.5180266   0.7003610 0.06665863
## Precision...7  0.7623496 0.7896370 0.9103917 0.8457257   0.3900135 0.33976551
## Precision...8  0.6618852 0.7333756 0.8289191 0.7782258   0.2414414 0.07807641
## Precision...9  0.7155106 0.7155106 1.0000000 0.8341664   0.0000000 0.00000000
## Precision...10 0.7156762 0.7156762 1.0000000 0.8342789   0.0000000 0.00000000
## Precision...11 0.5327617 0.7964548 0.4661062 0.5880627   0.7004049 0.12670095
## Precision...12 0.5225410 0.7866831 0.4566929 0.5778986   0.6882883 0.10998482
## Precision...13 0.7629077 0.7871817 0.9163941 0.8468877   0.3768638 0.33490458
## Precision...14 0.6828269 0.7416149 0.8544847 0.7940590   0.2509004 0.11913025
## Precision...15 0.7155185 0.7155185 1.0000000 0.8341717   0.0000000 0.00000000
## Precision...16 0.7156026 0.7156026 1.0000000 0.8342289   0.0000000 0.00000000
## Precision...17 0.4591195 0.8061538 0.3213410 0.4595148   0.8056555 0.08724664
## Precision...18 0.4653465 0.8139810 0.3277672 0.4673469   0.8115246 0.09587575
## Precision...19 0.7638151 0.7642597 0.8329094 0.7971092   0.6769292 0.51580380
## Precision...20 0.6317949 0.7411095 0.7464183 0.7437545   0.3429603 0.08986783
## Precision...21 0.6548884 0.6535352 0.8096979 0.7232834   0.4602159 0.27848119
## Precision...22 0.6789744 0.7604871 0.8051576 0.7821851   0.3610108 0.17393234
## Precision...23 0.5812079 0.6669519 0.4957075 0.5687187   0.6887245 0.17853030
## Precision...24 0.5620513 0.8017817 0.5157593 0.6277245   0.6787004 0.15298634
## Precision...25 0.7735379 0.7795753 0.8274012 0.8027766   0.7058030 0.53744984
## Precision...26 0.6178279 0.7370721 0.7244094 0.7306859   0.3495495 0.07300758
## Precision...27 0.6616519 0.6585754 0.8152388 0.7285806   0.4685110 0.29271613
## Precision...28 0.6828893 0.7610738 0.8117394 0.7855906   0.3585586 0.17935511
## Precision...29 0.5912125 0.6668161 0.5319263 0.5917819   0.6657670 0.19267445
## Precision...30 0.5609631 0.7812500 0.5368647 0.6364022   0.6216216 0.12809180
## Precision...31 0.7765885 0.7795802 0.8350368 0.8063561   0.7030848 0.54309738
## Precision...32 0.6220553 0.7454094 0.7166031 0.7307225   0.3841537 0.09788887
## Precision...33 0.6522432 0.6505077 0.8119379 0.7223131   0.4514139 0.27204662
## Precision...34 0.6971663 0.7709547 0.8206107 0.7950081   0.3865546 0.21780010
## Precision...35 0.5843771 0.6615505 0.5198283 0.5821886   0.6655527 0.18040353
## Precision...36 0.5705019 0.7997139 0.5333969 0.6399542   0.6638655 0.15750004
## Precision...37 0.8143819 0.8170547 0.8591415 0.8375697   0.7580968 0.62135903
## Precision...38 0.6143590 0.7306590 0.7306590 0.7306590   0.3212996 0.05195866
## Precision...39 0.7061637 0.6813522 0.8875994 0.7709196   0.4780088 0.38043466
## Precision...40 0.6984615 0.7451456 0.8796562 0.8068331   0.2418773 0.14085588
## Precision...41 0.6959795 0.7157529 0.7534181 0.7341027   0.6237505 0.37974143
## Precision...42 0.6676923 0.7679083 0.7679083 0.7679083   0.4151625 0.18307076
## Precision...43 0.8210621 0.8307478 0.8524414 0.8414548   0.7816014 0.63618206
## Precision...44 0.6075820 0.7281273 0.7208304 0.7244604   0.3225225 0.04302580
## Precision...45 0.7057886 0.6812809 0.8866035 0.7704982   0.4784076 0.37974813
## Precision...46 0.6977459 0.7486137 0.8697208 0.8046358   0.2648649 0.15341177
## Precision...47 0.6702202 0.7177774 0.6723305 0.6943110   0.6675664 0.33714810
## Precision...48 0.6378074 0.7786753 0.6900501 0.7316888   0.5063063 0.18075116
## Precision...49 0.8263494 0.8317241 0.8628373 0.8469951   0.7804627 0.64641433
## Precision...50 0.6237624 0.7455534 0.7199427 0.7325243   0.3817527 0.09911434
## Precision...51 0.7186290 0.6907200 0.8961570 0.7801406   0.4953728 0.40712500
## Precision...52 0.7057016 0.7510171 0.8807252 0.8107159   0.2653061 0.16803208
## Precision...53 0.6760419 0.7132736 0.6997138 0.7064286   0.6462725 0.34514126
## Precision...54 0.6643906 0.7921260 0.7199427 0.7543114   0.5246098 0.22874989

# --- Testing-set summary ---
# Keep only rows evaluated on the test set, keep the identifying columns plus
# the six metrics (Set_Data is dropped), and round every numeric column to
# four decimals.
perbandingan_testing <- perbandingan_final %>%
  dplyr::filter(Set_Data == "Testing") %>%
  dplyr::select(
    dplyr::all_of(c(
      "Split", "Kondisi", "Model",
      "Accuracy", "Precision", "Recall", "F1_Score", "Specificity", "Kappa"
    ))
  ) %>%
  dplyr::mutate(
    dplyr::across(where(is.numeric), function(nilai) round(nilai, digits = 4))
  )


cat("\n\n========== PENENTUAN METODE TERBAIK ==========\n")

## 
## 
## ========== PENENTUAN METODE TERBAIK ==========

# Test-set rows with the six metric columns rounded to four decimals.
# NOTE(review): the ranking steps that follow sort on these already-rounded
# values, so near-ties are decided at four-decimal precision.
best_analysis <- perbandingan_final %>%
  dplyr::filter(Set_Data == "Testing") %>%
  dplyr::mutate(
    dplyr::across(
      dplyr::all_of(c(
        "Accuracy", "Precision", "Recall", "F1_Score", "Specificity", "Kappa"
      )),
      function(nilai) round(nilai, digits = 4)
    )
  )

# --- Best per split ---
cat("\n--- Metode Terbaik per Split (F1-Score Tertinggi) ---\n")

## 
## --- Metode Terbaik per Split (F1-Score Tertinggi) ---

# For each split keep exactly one test-set row: the one with the highest
# F1 score (with_ties = FALSE guarantees a single row per group).
# NOTE(review): in the printed result every winner has Recall 1 with
# Specificity 0 and Kappa 0 -- F1 alone favours a model that predicts only
# the positive class; consider whether that is the intended criterion.
best_per_split <- best_analysis %>%
  dplyr::group_by(Split) %>%
  dplyr::slice_max(F1_Score, n = 1, with_ties = FALSE) %>%
  dplyr::ungroup() %>%
  dplyr::select(
    dplyr::all_of(c(
      "Split", "Kondisi", "Model", "Accuracy", "Precision",
      "Recall", "F1_Score", "Specificity", "Kappa"
    ))
  )
print(best_per_split)

## # A tibble: 3 × 9
##   Split Kondisi       Model Accuracy Precision Recall F1_Score Specificity Kappa
##   <chr> <chr>         <chr>    <dbl>     <dbl>  <dbl>    <dbl>       <dbl> <dbl>
## 1 70:30 Sebelum SMOTE Deci…    0.716     0.716      1    0.834           0     0
## 2 80:20 Sebelum SMOTE Deci…    0.716     0.716      1    0.834           0     0
## 3 90:10 Sebelum SMOTE Deci…    0.716     0.716      1    0.834           0     0

# --- Full ranking ---
cat("\n--- Ranking Lengkap Semua Model berdasarkan F1-Score (Testing) ---\n")

## 
## --- Ranking Lengkap Semua Model berdasarkan F1-Score (Testing) ---

# Order all test-set results by F1 score, breaking ties by Accuracy and then
# Kappa (all descending), and number the rows 1..n in that order.
ranking_semua <- best_analysis %>%
  dplyr::arrange(
    dplyr::desc(F1_Score),
    dplyr::desc(Accuracy),
    dplyr::desc(Kappa)
  ) %>%
  dplyr::mutate(Rank = seq_len(dplyr::n())) %>%
  dplyr::select(
    dplyr::all_of(c(
      "Rank", "Split", "Kondisi", "Model",
      "Accuracy", "Precision", "Recall", "F1_Score", "Specificity", "Kappa"
    ))
  )
print(ranking_semua)

##                Rank Split             Kondisi                Model Accuracy
## Precision...1     1 90:10       Sebelum SMOTE        Decision Tree   0.7159
## Precision...2     2 80:20       Sebelum SMOTE        Decision Tree   0.7157
## Precision...3     3 70:30       Sebelum SMOTE        Decision Tree   0.7156
## Precision...4     4 70:30 SMOTE + Grid Search Decision Tree Tuning   0.7057
## Precision...5     5 90:10 SMOTE + Grid Search Decision Tree Tuning   0.6985
## Precision...6     6 80:20 SMOTE + Grid Search Decision Tree Tuning   0.6977
## Precision...7     7 70:30       Sesudah SMOTE        Decision Tree   0.6972
## Precision...8     8 70:30       Sebelum SMOTE                  KNN   0.6828
## Precision...9     9 90:10       Sebelum SMOTE                  KNN   0.6759
## Precision...10   10 80:20       Sesudah SMOTE        Decision Tree   0.6829
## Precision...11   11 90:10       Sesudah SMOTE        Decision Tree   0.6790
## Precision...12   12 80:20       Sebelum SMOTE                  KNN   0.6619
## Precision...13   13 90:10 SMOTE + Grid Search   Naive Bayes Tuning   0.6677
## Precision...14   14 70:30 SMOTE + Grid Search   Naive Bayes Tuning   0.6644
## Precision...15   15 90:10       Sesudah SMOTE                  KNN   0.6318
## Precision...16   16 70:30 SMOTE + Grid Search           KNN Tuning   0.6238
## Precision...17   17 80:20 SMOTE + Grid Search   Naive Bayes Tuning   0.6378
## Precision...18   18 70:30       Sesudah SMOTE                  KNN   0.6221
## Precision...19   19 80:20       Sesudah SMOTE                  KNN   0.6178
## Precision...20   20 90:10 SMOTE + Grid Search           KNN Tuning   0.6144
## Precision...21   21 80:20 SMOTE + Grid Search           KNN Tuning   0.6076
## Precision...22   22 70:30       Sesudah SMOTE          Naive Bayes   0.5705
## Precision...23   23 80:20       Sesudah SMOTE          Naive Bayes   0.5610
## Precision...24   24 90:10       Sesudah SMOTE          Naive Bayes   0.5621
## Precision...25   25 80:20       Sebelum SMOTE          Naive Bayes   0.5225
## Precision...26   26 90:10       Sebelum SMOTE          Naive Bayes   0.4790
## Precision...27   27 70:30       Sebelum SMOTE          Naive Bayes   0.4653
##                Precision Recall F1_Score Specificity  Kappa
## Precision...1     0.7159 1.0000   0.8344      0.0000 0.0000
## Precision...2     0.7157 1.0000   0.8343      0.0000 0.0000
## Precision...3     0.7156 1.0000   0.8342      0.0000 0.0000
## Precision...4     0.7510 0.8807   0.8107      0.2653 0.1680
## Precision...5     0.7451 0.8797   0.8068      0.2419 0.1409
## Precision...6     0.7486 0.8697   0.8046      0.2649 0.1534
## Precision...7     0.7710 0.8206   0.7950      0.3866 0.2178
## Precision...8     0.7416 0.8545   0.7941      0.2509 0.1191
## Precision...9     0.7324 0.8625   0.7921      0.2058 0.0789
## Precision...10    0.7611 0.8117   0.7856      0.3586 0.1794
## Precision...11    0.7605 0.8052   0.7822      0.3610 0.1739
## Precision...12    0.7334 0.8289   0.7782      0.2414 0.0781
## Precision...13    0.7679 0.7679   0.7679      0.4152 0.1831
## Precision...14    0.7921 0.7199   0.7543      0.5246 0.2287
## Precision...15    0.7411 0.7464   0.7438      0.3430 0.0899
## Precision...16    0.7456 0.7199   0.7325      0.3818 0.0991
## Precision...17    0.7787 0.6901   0.7317      0.5063 0.1808
## Precision...18    0.7454 0.7166   0.7307      0.3842 0.0979
## Precision...19    0.7371 0.7244   0.7307      0.3495 0.0730
## Precision...20    0.7307 0.7307   0.7307      0.3213 0.0520
## Precision...21    0.7281 0.7208   0.7245      0.3225 0.0430
## Precision...22    0.7997 0.5334   0.6400      0.6639 0.1575
## Precision...23    0.7812 0.5369   0.6364      0.6216 0.1281
## Precision...24    0.8018 0.5158   0.6277      0.6787 0.1530
## Precision...25    0.7867 0.4567   0.5779      0.6883 0.1100
## Precision...26    0.7669 0.3911   0.5180      0.7004 0.0667
## Precision...27    0.8140 0.3278   0.4673      0.8115 0.0959

# --- Best overall ---
# Take the rank-1 row of the full ranking.
# NOTE(review): per the printed output this row has Recall 1, Specificity 0
# and Kappa 0, i.e. the model labels every test case "Merokok" and does no
# better than the no-information rate. Confirm that F1 on the majority class
# is really the criterion wanted for "best".
best_overall <- dplyr::filter(ranking_semua, Rank == 1)
cat("\n>>> METODE TERBAIK KESELURUHAN <<<\n")

## 
## >>> METODE TERBAIK KESELURUHAN <<<

print(best_overall)

##           Rank Split       Kondisi         Model Accuracy Precision Recall
## Precision    1 90:10 Sebelum SMOTE Decision Tree   0.7159    0.7159      1
##           F1_Score Specificity Kappa
## Precision   0.8344           0     0

# Print the rank-1 row field by field as a labelled summary. Each cat() call
# writes one "label : value" line; the "##" lines are the rendered output.
cat("\nKesimpulan:\n")

## 
## Kesimpulan:

cat("  Split      :", best_overall$Split,       "\n")

##   Split      : 90:10

cat("  Kondisi    :", best_overall$Kondisi,     "\n")

##   Kondisi    : Sebelum SMOTE

cat("  Model      :", best_overall$Model,       "\n")

##   Model      : Decision Tree

cat("  Accuracy   :", best_overall$Accuracy,    "\n")

##   Accuracy   : 0.7159

cat("  Precision  :", best_overall$Precision,   "\n")

##   Precision  : 0.7159

cat("  Recall     :", best_overall$Recall,      "\n")

##   Recall     : 1

cat("  F1-Score   :", best_overall$F1_Score,    "\n")

##   F1-Score   : 0.8344

cat("  Specificity:", best_overall$Specificity, "\n")

##   Specificity: 0

cat("  Kappa      :", best_overall$Kappa,       "\n")

##   Kappa      : 0

18. Confusion Matrix Model Terbaik Keseluruhan

Berikut disajikan visualisasi confusion matrix dari model terbaik keseluruhan. Confusion matrix digunakan untuk mengevaluasi kinerja model dalam mengklasifikasikan status merokok dengan melihat hasil prediksi yang benar maupun salah, yang terdiri dari True Positive (TP), True Negative (TN), False Positive (FP), dan False Negative (FN).

# Confusion matrix of the overall rank-1 model on its test set.
# NOTE(review): the model and test set are hard-coded. They presumably
# correspond to the 90:10 "Sebelum SMOTE" Decision Tree row printed as
# best_overall -- confirm, and update by hand if the ranking changes.
model_best <- res_awal_90$dt
test_data  <- test_10

# caret is already attached at the top of the script, so the repeated
# library(caret) call was dropped; confusionMatrix() is namespaced instead.
test_x <- test_data %>% dplyr::select(-Y)
test_y <- test_data$Y

# type = "class" asks for hard class labels rather than probabilities.
pred_best <- predict(model_best, test_x, type = "class")
pred_best <- factor(pred_best, levels = levels(test_y))

# Re-levelling turns any label that is not in levels(test_y) into NA, and
# such rows would presumably be left out of the table without notice.
# Surface that instead of letting the metrics shrink silently.
if (anyNA(pred_best)) {
  warning(
    "Ada prediksi bernilai NA setelah penyesuaian level; ",
    "periksa level kelas pada model dan data uji.",
    call. = FALSE
  )
}

# "Merokok" is the positive class for Sensitivity / Precision / F1.
cm_best <- caret::confusionMatrix(pred_best, test_y, positive = "Merokok")
cm_best

## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok             0       0
##   Merokok                 277     698
##                                          
##                Accuracy : 0.7159         
##                  95% CI : (0.6865, 0.744)
##     No Information Rate : 0.7159         
##     P-Value [Acc > NIR] : 0.5162         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.7159         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.7159         
##          Detection Rate : 0.7159         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : Merokok        
##

# Flatten the confusion-matrix counts to one row per (Prediction, Reference)
# cell so that each cell can be drawn as a tile.
cm_table <- as.data.frame(cm_best$table)

# Tag every cell with its outcome type, with "Merokok" as the positive class.
cm_table <- dplyr::mutate(
  cm_table,
  Kategori = dplyr::case_when(
    Reference == "Merokok" & Prediction == "Merokok" ~ "TP (True Positive)",
    Reference == "Tidak_Merokok" & Prediction == "Tidak_Merokok" ~ "TN (True Negative)",
    Reference == "Tidak_Merokok" & Prediction == "Merokok" ~ "FP (False Positive)",
    Reference == "Merokok" & Prediction == "Tidak_Merokok" ~ "FN (False Negative)"
  )
)

# Fill colour per outcome type.
warna_kategori <- c(
  "TP (True Positive)" = "#2ECC71",   # green
  "TN (True Negative)" = "#3498DB",   # blue
  "FP (False Positive)" = "#E74C3C",  # red
  "FN (False Negative)" = "#F1C40F"   # yellow
)

# Heat-map style confusion matrix: actual class on x, predicted class on y,
# with the cell count printed inside each tile.
ggplot(
  data = cm_table,
  mapping = aes(x = Reference, y = Prediction, fill = Kategori)
) +
  geom_tile(color = "white", linewidth = 1.2) +
  geom_text(mapping = aes(label = Freq), size = 6, fontface = "bold") +
  scale_fill_manual(values = warna_kategori) +
  labs(
    title = "Confusion Matrix - Model Terbaik (Decision Tree)",
    subtitle = "Klasifikasi Perilaku Merokok",
    x = "Aktual",
    y = "Prediksi",
    fill = "Kategori"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text = element_text(face = "bold"),
    legend.position = "right"
  )

19. Penentuan Model Terbaik dari Hasil Tuning Parameter

Pada tahap ini dilakukan penentuan model terbaik berdasarkan hasil hyperparameter tuning menggunakan metode Grid Search dengan 5-Fold Cross Validation.

# Freeze the comparison table: drop exact duplicate rows and make sure the six
# metric columns are numeric, so that later sorting is done on numbers.
# The seed call is kept as in the original; it only resets the RNG state.
set.seed(123)
kolom_metrik <- c(
  "Accuracy", "Precision", "Recall", "F1_Score", "Specificity", "Kappa"
)

perbandingan_final <- perbandingan_final %>%
  dplyr::distinct() %>%
  dplyr::mutate(dplyr::across(dplyr::all_of(kolom_metrik), as.numeric))


# Rank the tuned models (test set, "SMOTE + Grid Search" only).
ranking_smote_grid <- perbandingan_final %>%
  dplyr::filter(
    Set_Data == "Testing",
    Kondisi == "SMOTE + Grid Search"
  ) %>%

  # Sort on the full-precision metrics: F1 first, then Accuracy, Kappa and
  # Recall as successive tie-breakers (all descending).
  dplyr::arrange(
    dplyr::desc(F1_Score),
    dplyr::desc(Accuracy),
    dplyr::desc(Kappa),
    dplyr::desc(Recall)
  ) %>%

  dplyr::mutate(Rank = dplyr::row_number()) %>%

  # Round only after ranking, so rounding affects the display and never the
  # order (previously the values were rounded before arrange()).
  dplyr::mutate(
    dplyr::across(
      dplyr::all_of(kolom_metrik),
      function(nilai) round(nilai, 4)
    )
  ) %>%

  dplyr::select(Rank, Split, Kondisi, Model,
                Accuracy, Precision, Recall,
                F1_Score, Specificity, Kappa)

cat("\n--- Ranking Lengkap Model SMOTE + Grid Search ---\n")

## 
## --- Ranking Lengkap Model SMOTE + Grid Search ---

print(ranking_smote_grid)

##               Rank Split             Kondisi                Model Accuracy
## Precision...1    1 70:30 SMOTE + Grid Search Decision Tree Tuning   0.7057
## Precision...2    2 90:10 SMOTE + Grid Search Decision Tree Tuning   0.6985
## Precision...3    3 80:20 SMOTE + Grid Search Decision Tree Tuning   0.6977
## Precision...4    4 90:10 SMOTE + Grid Search   Naive Bayes Tuning   0.6677
## Precision...5    5 70:30 SMOTE + Grid Search   Naive Bayes Tuning   0.6644
## Precision...6    6 70:30 SMOTE + Grid Search           KNN Tuning   0.6238
## Precision...7    7 80:20 SMOTE + Grid Search   Naive Bayes Tuning   0.6378
## Precision...8    8 90:10 SMOTE + Grid Search           KNN Tuning   0.6144
## Precision...9    9 80:20 SMOTE + Grid Search           KNN Tuning   0.6076
##               Precision Recall F1_Score Specificity  Kappa
## Precision...1    0.7510 0.8807   0.8107      0.2653 0.1680
## Precision...2    0.7451 0.8797   0.8068      0.2419 0.1409
## Precision...3    0.7486 0.8697   0.8046      0.2649 0.1534
## Precision...4    0.7679 0.7679   0.7679      0.4152 0.1831
## Precision...5    0.7921 0.7199   0.7543      0.5246 0.2287
## Precision...6    0.7456 0.7199   0.7325      0.3818 0.0991
## Precision...7    0.7787 0.6901   0.7317      0.5063 0.1808
## Precision...8    0.7307 0.7307   0.7307      0.3213 0.0520
## Precision...9    0.7281 0.7208   0.7245      0.3225 0.0430

# --- Guard: the tuned-model ranking must contain at least one row ---
if (nrow(ranking_smote_grid) < 1) {
  stop("Filter 'SMOTE + Grid Search' kosong. Cek label kondisi dengan unique(perbandingan_final$Kondisi)")
}

# --- Pull out the rank-1 row and the fields that identify the model ---
best_sg     <- dplyr::filter(ranking_smote_grid, Rank == 1)
best_mod_sg <- best_sg[["Model"]]
best_spl_sg <- best_sg[["Split"]]
best_kon_sg <- best_sg[["Kondisi"]]

cat("\n>>> MODEL TERBAIK SMOTE + GRID SEARCH <<<\n")

## 
## >>> MODEL TERBAIK SMOTE + GRID SEARCH <<<

# Print the rank-1 tuned model field by field. Each cat() call writes one
# "label : value" line; the "##" lines are the rendered output.
cat("  Model      :", best_mod_sg,           "\n")

##   Model      : Decision Tree Tuning

cat("  Split      :", best_spl_sg,           "\n")

##   Split      : 70:30

cat("  Kondisi    :", best_kon_sg,           "\n")

##   Kondisi    : SMOTE + Grid Search

cat("  Accuracy   :", best_sg$Accuracy,      "\n")

##   Accuracy   : 0.7057

cat("  Precision  :", best_sg$Precision,     "\n")

##   Precision  : 0.751

cat("  Recall     :", best_sg$Recall,        "\n")

##   Recall     : 0.8807

cat("  F1-Score   :", best_sg$F1_Score,      "\n")

##   F1-Score   : 0.8107

cat("  Specificity:", best_sg$Specificity,   "\n")

##   Specificity: 0.2653

cat("  Kappa      :", best_sg$Kappa,         "\n")

##   Kappa      : 0.168

# --- Pick the tuning result object that belongs to the winning split ---
# An unrecognised split label now raises an error; the previous nested
# if/else silently fell back to tuning_70 for anything other than
# "90:10" or "80:20".
# NOTE(review): res_sg is not referenced again in the visible part of the
# script -- confirm it is still needed.
res_sg <- switch(best_spl_sg,
  "90:10" = tuning_90,
  "80:20" = tuning_80,
  "70:30" = tuning_70,
  stop("Split tidak dikenali: ", best_spl_sg, call. = FALSE)
)

20. Confusion Matrix Hasil Model Terbaik dari Hasil Tuning Parameter

Pada tahap ini dilakukan visualisasi terhadap model terbaik yang diperoleh dari proses hyperparameter tuning menggunakan Grid Search dengan 5-Fold Cross Validation. Evaluasi dilakukan menggunakan confusion matrix untuk melihat performa klasifikasi secara lebih rinci dalam membedakan kelas “Merokok” dan “Tidak Merokok” pada data pengujian.

# Confusion matrix for the rank-1 "SMOTE + Grid Search" model:
# look up which split and which model won.

best_grid    <- dplyr::filter(ranking_smote_grid, Rank == 1)
best_split_g <- best_grid[["Split"]]
best_model_g <- best_grid[["Model"]]

cat(">>> Model Terbaik SMOTE + Grid Search (Rank 1) <<<\n")

## >>> Model Terbaik SMOTE + Grid Search (Rank 1) <<<

cat("Split :", best_split_g, "\n")

## Split : 70:30

cat("Model :", best_model_g, "\n")

## Model : Decision Tree Tuning

# Test set that belongs to the winning split. The unnamed last argument is
# switch()'s default: an unknown label stops here with a clear message
# instead of yielding NULL and failing later inside predict().
test_data_grid <- switch(best_split_g,
  "90:10" = test_10_num,
  "80:20" = test_20_num,
  "70:30" = test_30_num,
  stop("Split tidak dikenali: ", best_split_g, call. = FALSE)
)

# Tuning result of the winning split, then the fitted model inside it.
# Two flat lookups replace the former three-way nested switch().
tuning_terpilih <- switch(best_split_g,
  "90:10" = tuning_90,
  "80:20" = tuning_80,
  "70:30" = tuning_70,
  stop("Split tidak dikenali: ", best_split_g, call. = FALSE)
)

model_grid_obj <- switch(best_model_g,
  "KNN Tuning"           = tuning_terpilih$knn,
  "Decision Tree Tuning" = tuning_terpilih$dt,
  "Naive Bayes Tuning"   = tuning_terpilih$nb,
  stop("Model tidak dikenali: ", best_model_g, call. = FALSE)
)

if (is.null(model_grid_obj)) {
  stop(
    "Objek model '", best_model_g, "' untuk split ", best_split_g,
    " tidak ditemukan.",
    call. = FALSE
  )
}

# Predictors and true labels of the test set.
test_x_g <- test_data_grid %>% dplyr::select(-Y)
test_y_g  <- test_data_grid$Y

# The decision tree is asked explicitly for class labels; the other models
# are predicted with their default output, as in the original code.
pred_mentah <- if (best_model_g == "Decision Tree Tuning") {
  predict(model_grid_obj, test_x_g, type = "class")
} else {
  predict(model_grid_obj, test_x_g)
}
pred_grid <- factor(pred_mentah, levels = levels(test_y_g))

# Re-levelling turns any value that is not in levels(test_y_g) into NA, and
# such rows would presumably be left out of the table without notice.
if (anyNA(pred_grid)) {
  warning(
    "Ada prediksi bernilai NA setelah penyesuaian level; ",
    "periksa keluaran predict() dan level kelas data uji.",
    call. = FALSE
  )
}

# Confusion matrix with "Merokok" as the positive class.
cm_grid <- caret::confusionMatrix(pred_grid, test_y_g, positive = "Merokok")
print(cm_grid)

## Confusion Matrix and Statistics
## 
##                Reference
## Prediction      Tidak_Merokok Merokok
##   Tidak_Merokok           221     250
##   Merokok                 612    1846
##                                           
##                Accuracy : 0.7057          
##                  95% CI : (0.6888, 0.7222)
##     No Information Rate : 0.7156          
##     P-Value [Acc > NIR] : 0.8863          
##                                           
##                   Kappa : 0.168           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8807          
##             Specificity : 0.2653          
##          Pos Pred Value : 0.7510          
##          Neg Pred Value : 0.4692          
##              Prevalence : 0.7156          
##          Detection Rate : 0.6302          
##    Detection Prevalence : 0.8392          
##       Balanced Accuracy : 0.5730          
##                                           
##        'Positive' Class : Merokok         
##

# Individual cell counts of the confusion matrix (rows = prediction,
# columns = reference), with "Merokok" as the positive class.
cm_tbl  <- cm_grid$table
TP <- cm_tbl["Merokok",       "Merokok"]
FP <- cm_tbl["Merokok",       "Tidak_Merokok"]
FN <- cm_tbl["Tidak_Merokok", "Merokok"]
TN <- cm_tbl["Tidak_Merokok", "Tidak_Merokok"]

# Long names used inside the tile labels.
nama_panjang <- c(
  TP = "True Positive",
  TN = "True Negative",
  FP = "False Positive",
  FN = "False Negative"
)

# One row per cell, tagged with its outcome type and a three-line tile label
# of the form "<code>\n(<long name>)\n<count>".
cm_df <- as.data.frame(cm_grid$table) %>%
  dplyr::mutate(
    Kategori = dplyr::case_when(
      Reference == "Merokok"       & Prediction == "Merokok"       ~ "TP",
      Reference == "Tidak_Merokok" & Prediction == "Tidak_Merokok" ~ "TN",
      Reference == "Tidak_Merokok" & Prediction == "Merokok"       ~ "FP",
      Reference == "Merokok"       & Prediction == "Tidak_Merokok" ~ "FN"
    ),
    Label = paste0(Kategori, "\n(", nama_panjang[Kategori], ")\n", Freq)
  )

# Headline metrics as percentages with two decimals, for the subtitle.
ke_persen <- function(nilai) round(nilai * 100, 2)
acc  <- ke_persen(cm_grid$overall["Accuracy"])
prec <- ke_persen(cm_grid$byClass["Precision"])
rec  <- ke_persen(cm_grid$byClass["Recall"])
f1   <- ke_persen(cm_grid$byClass["F1"])
spec <- ke_persen(cm_grid$byClass["Specificity"])

# Fill colours and legend text per outcome type.
warna_cm <- c(
  "TP" = "#2ECC71",
  "TN" = "#3498DB",
  "FP" = "#E74C3C",
  "FN" = "#F1C40F"
)
legenda_cm <- c(
  "TP" = "TP (True Positive)",
  "TN" = "TN (True Negative)",
  "FP" = "FP (False Positive)",
  "FN" = "FN (False Negative)"
)

# Two-line subtitle: split/condition on the first line, metrics on the second.
subjudul_cm <- paste0(
  "Split: ", best_split_g, "  |  Kondisi: SMOTE + Grid Search\n",
  "Accuracy: ", acc, "%  |  Precision: ", prec, "%  |  ",
  "Recall: ", rec, "%  |  F1-Score: ", f1, "%  |  Specificity: ", spec, "%"
)

# Tile plot: actual class on x, predicted class on y, label inside each tile.
ggplot(
  data = cm_df,
  mapping = aes(x = Reference, y = Prediction, fill = Kategori)
) +
  geom_tile(color = "white", linewidth = 1.5) +
  geom_text(
    mapping = aes(label = Label),
    size = 5.5,
    fontface = "bold",
    lineheight = 1.3
  ) +
  scale_fill_manual(values = warna_cm, labels = legenda_cm) +
  labs(
    title    = paste0("Confusion Matrix — ", best_model_g),
    subtitle = subjudul_cm,
    x    = "Aktual",
    y    = "Prediksi",
    fill = "Kategori"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    plot.title      = element_text(hjust = 0.5, face = "bold", size = 16),
    plot.subtitle   = element_text(hjust = 0.5, size = 10, color = "gray40"),
    axis.text       = element_text(face = "bold", size = 12),
    axis.title      = element_text(face = "bold", size = 13),
    legend.position = "right",
    legend.title    = element_text(face = "bold"),
    panel.grid      = element_blank()
  )

21. PERBANDINGAN ACCURACY & F1-SCORE — SEMUA MODEL (TESTING)

Pada bagian ini dilakukan perbandingan performa seluruh model yang diuji berdasarkan nilai accuracy dan F1-score pada data testing. Perbandingan ini bertujuan untuk mengetahui model mana yang memiliki kinerja terbaik dalam melakukan klasifikasi, khususnya dalam menyeimbangkan ketepatan prediksi dan kemampuan model dalam menangani ketidakseimbangan kelas.

# Test-set rows with the two plotted metrics rounded to four decimals, so the
# bar labels show at most four digits after the decimal point.
perbandingan_viz <- perbandingan_final %>%
  dplyr::filter(Set_Data == "Testing") %>%
  dplyr::mutate(
    Accuracy = round(Accuracy, 4),
    F1_Score = round(F1_Score, 4)
  )

#' Grouped bar chart of one metric per model, condition and split
#'
#' The Accuracy and F1-Score charts were two copies of the same plot that
#' differed only in the metric column, the title and the y-axis label; this
#' helper holds the shared definition.
#'
#' @param data Data frame with columns Model, Kondisi, Split and the metric.
#' @param metrik Name of the metric column to plot, as a string.
#' @param judul Plot title.
#' @param label_y Y-axis label.
#' @return A ggplot object (drawn when printed).
plot_perbandingan_testing <- function(data, metrik, judul, label_y) {
  # Same dodge width for bars and labels keeps each label above its own bar.
  geser <- position_dodge(width = 0.8)

  ggplot(data,
         aes(x    = Model,
             y    = .data[[metrik]],
             fill = Kondisi)) +

    geom_col(position = geser, width = 0.7) +

    geom_text(aes(label = .data[[metrik]]),
              position = geser,
              vjust    = -0.4,
              size     = 3,
              fontface = "bold") +

    scale_fill_manual(values = c(
      "Sebelum SMOTE"       = "#52BE80",
      "Sesudah SMOTE"       = "#F0956A",
      "SMOTE + Grid Search" = "#AED6F1"
    )) +

    # One panel per train/test split.
    facet_wrap(~ Split, ncol = 3) +

    # The upper limit goes past 1 to leave room for the labels above the bars.
    scale_y_continuous(limits = c(0, 1.25),
                       breaks = seq(0, 1.2, 0.2),
                       expand = expansion(mult = c(0, 0))) +

    labs(
      title = judul,
      x     = "Model",
      y     = label_y,
      fill  = "Kondisi"
    ) +

    theme_minimal(base_size = 11) +
    theme(
      plot.title         = element_text(hjust = 0.5, face = "bold", size = 13),
      axis.text.x        = element_text(angle = 45, hjust = 1,
                                        face = "bold", size = 8),
      axis.text.y        = element_text(size = 9),
      axis.title         = element_text(face = "bold", size = 11),
      strip.text         = element_text(face = "bold", size = 11),
      strip.background   = element_rect(fill = "gray95", color = NA),
      legend.position    = "bottom",
      legend.title       = element_text(face = "bold"),
      panel.grid.major.x = element_blank()
    )
}

# --- ACCURACY PLOT ---
plot_perbandingan_testing(
  perbandingan_viz,
  metrik  = "Accuracy",
  judul   = "Perbandingan Akurasi per Model, Kondisi & Split (Testing)",
  label_y = "Akurasi"
)

# --- F1-SCORE PLOT ---
plot_perbandingan_testing(
  perbandingan_viz,
  metrik  = "F1_Score",
  judul   = "Perbandingan F1-Score per Model, Kondisi & Split (Testing)",
  label_y = "F1-Score"
)

22. VARIABLE IMPORTANCE - Model Terbaik SMOTE + Grid Search

Pada bagian ini dilakukan analisis variable importance pada model terbaik yang diperoleh dari kombinasi SMOTE dan Grid Search. Analisis ini bertujuan untuk mengetahui variabel-variabel yang paling berpengaruh dalam proses klasifikasi, sehingga dapat memberikan interpretasi yang lebih jelas terhadap faktor-faktor yang berkontribusi terhadap hasil prediksi model.

# Select the top-ranked SMOTE + Grid Search configuration.
# slice_min(..., with_ties = FALSE) returns at most one row, so a tie for first
# place (several rows with Rank == 1, or averaged ranks such as 1.5 with no
# exact 1) cannot yield empty or length-2 labels, which would break the scalar
# switch()/if dispatch that consumes best_split_g and best_model_g.
best_grid <- ranking_smote_grid %>%
  dplyr::ungroup() %>%
  dplyr::slice_min(Rank, n = 1, with_ties = FALSE)

if (nrow(best_grid) != 1) {
  stop("ranking_smote_grid tidak memiliki baris yang dapat dipilih sebagai model terbaik.")
}

# as.character() guards against factor columns, which switch() would treat as
# integer codes rather than labels.
best_split_g <- as.character(best_grid$Split)
best_model_g <- as.character(best_grid$Model)

# Print a banner followed by the split and model labels of the selected
# configuration (the `##` lines are the rendered console output).
cat(">>> Variable Importance — Model Terbaik SMOTE + Grid Search <<<\n")

## >>> Variable Importance — Model Terbaik SMOTE + Grid Search <<<

cat("Split :", best_split_g, "\n")

## Split : 70:30

cat("Model :", best_model_g, "\n\n")

## Model : Decision Tree Tuning

# Fetch the fitted model object for the selected split/model pair.
# The outer switch picks the tuning result list by split label; the inner
# switch picks the element by model label. switch() returns NULL invisibly
# when no arm matches, so every level has an explicit default arm that stops
# with the offending label instead of letting a NULL flow downstream.
model_grid_obj <- switch(best_split_g,
  "90:10" = switch(best_model_g,
    "KNN Tuning"           = tuning_90$knn,
    "Decision Tree Tuning" = tuning_90$dt,
    "Naive Bayes Tuning"   = tuning_90$nb,
    stop("Model tidak dikenali: ", best_model_g)),
  "80:20" = switch(best_model_g,
    "KNN Tuning"           = tuning_80$knn,
    "Decision Tree Tuning" = tuning_80$dt,
    "Naive Bayes Tuning"   = tuning_80$nb,
    stop("Model tidak dikenali: ", best_model_g)),
  "70:30" = switch(best_model_g,
    "KNN Tuning"           = tuning_70$knn,
    "Decision Tree Tuning" = tuning_70$dt,
    "Naive Bayes Tuning"   = tuning_70$nb,
    stop("Model tidak dikenali: ", best_model_g)),
  stop("Split tidak dikenali: ", best_split_g)
)

# A matched arm can still be NULL when the tuning list lacks that element.
if (is.null(model_grid_obj)) {
  stop("Objek model untuk ", best_model_g, " (split ", best_split_g,
       ") tidak ditemukan.")
}


# Compute variable importance according to the model type.
# Every branch produces vi_df with columns Variabel (name) and Importance
# (rounded to 2 decimals), sorted from most to least important. An
# unrecognised model label stops with an error, so vi_df is never left
# undefined or stale for the table and plot that use it afterwards.

if (best_model_g == "Decision Tree Tuning") {

  # Read the importance vector stored on the model object.
  vi_raw <- model_grid_obj$variable.importance

  if (is.null(vi_raw) || length(vi_raw) == 0) {
    stop("variable.importance kosong — pohon mungkin hanya root node.")
  }

  vi_df <- data.frame(
    Variabel   = names(vi_raw),
    Importance = as.numeric(vi_raw),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::filter(nchar(Variabel) > 0) %>%
    # Rescale so the largest importance equals 100.
    dplyr::mutate(
      Importance = round(Importance / max(Importance) * 100, 2)
    ) %>%
    dplyr::arrange(dplyr::desc(Importance))

} else if (best_model_g == "KNN Tuning") {

  vi_raw <- caret::varImp(model_grid_obj, scale = TRUE)$importance

  # Only the first column of the importance table is used.
  vi_df <- data.frame(
    Variabel   = rownames(vi_raw),
    Importance = round(vi_raw[, 1], 2),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::filter(nchar(Variabel) > 0) %>%
    dplyr::arrange(dplyr::desc(Importance))

} else if (best_model_g == "Naive Bayes Tuning") {

  vi_raw <- caret::varImp(model_grid_obj, scale = TRUE)$importance

  # Importance is averaged across all columns of the importance table.
  vi_df <- data.frame(
    Variabel   = rownames(vi_raw),
    Importance = round(rowMeans(vi_raw), 2),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::filter(nchar(Variabel) > 0) %>%
    dplyr::arrange(dplyr::desc(Importance))

} else {

  stop("Variable importance belum didukung untuk model: ", best_model_g)
}

# Show the table: print a section header, then the vi_df data frame
# (the `##` lines are the rendered console output).
cat("\n--- Tabel Variable Importance ---\n")

## 
## --- Tabel Variable Importance ---

print(vi_df)

##            Variabel Importance
## 1          kekayaan     100.00
## 2          asuransi      45.72
## 3        pendidikan      42.56
## 4         pekerjaan      40.08
## 5              usia      38.73
## 6    tempat_tinggal      25.96
## 7                tv       9.92
## 8           bekerja       0.60
## 9 status_pernikahan       0.53

# Visualization
# Horizontal bar chart of vi_df: variables are ordered by Importance, and the
# bar colour is shaded by the same value.

ggplot(
  data    = vi_df,
  mapping = aes(
    x    = reorder(Variabel, Importance),
    y    = Importance,
    fill = Importance
  )
) +
  geom_col(show.legend = FALSE, width = 0.7) +
  # Each bar is labelled with its importance value, placed just past the bar end.
  geom_text(
    mapping  = aes(label = paste0(Importance, "%")),
    fontface = "bold",
    size     = 4,
    hjust    = -0.15
  ) +
  # Flip so the variable names run along the vertical axis.
  coord_flip() +
  # Extra space at the upper end only, so the labels are not clipped.
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  scale_fill_gradient(high = "#1A5276", low = "#AED6F1") +
  labs(
    y        = "Importance (%)",
    x        = "Variabel",
    subtitle = paste0("Split: ", best_split_g,
                      "  |  Kondisi: SMOTE + Grid Search"),
    title    = paste0("Variable Importance — ", best_model_g)
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.major.y = element_blank(),
    axis.title         = element_text(face = "bold", size = 12),
    axis.text          = element_text(face = "bold", size = 11),
    plot.subtitle      = element_text(hjust = 0.5, size = 10, color = "gray40"),
    plot.title         = element_text(hjust = 0.5, face = "bold", size = 15)
  )

Perbandingan K-Nearest Neighbor, Naive Bayes, dan Decision Tree untuk Klasifikasi Perilaku Merokok di Indonesia Menggunakan SMOTE dan Grid Search

Ghea Ananta Ramadani, Nur Hasanah, Maulidia Anam, Alifia Meidika

2026-05-31