Berikut adalah package yang digunakan dalam analisis ini. Package caret digunakan untuk proses pemodelan dan evaluasi model, readxl untuk membaca data, dplyr untuk manipulasi data, ggplot2 untuk visualisasi data, rpart dan rpart.plot untuk algoritma dan visualisasi Decision Tree, naivebayes sebagai package utama algoritma Naive Bayes, smotefamily untuk penanganan data tidak seimbang menggunakan SMOTE, serta reshape2 untuk transformasi struktur data.
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.3
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.3
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Loading required package: lattice
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.5.3
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.5.3
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.5.3
library(reshape2)
Dataset yang digunakan dalam penelitian ini bersumber dari Indonesia Demographic and Health Survey (IDHS) tahun 2017. Dataset tersebut terdiri atas 10.009 data responden dengan 9 variabel independen dan 1 variabel dependen berupa status merokok yang diklasifikasikan menjadi tidak merokok (0) dan merokok (1).
# Load the survey workbook into a tibble and preview its first six rows.
datarokok <- readxl::read_excel(
  path = "~/Lidia Semester 6/Data Mininng/datarokok.xlsx"
)
head(datarokok, n = 6L)
## # A tibble: 6 × 10
## `Frequency currently smokes tobacco (Y)` `Current age` Highest educational l…¹
## <dbl> <dbl> <dbl>
## 1 1 33 1
## 2 1 45 1
## 3 0 39 3
## 4 1 44 2
## 5 1 42 2
## 6 1 28 3
## # ℹ abbreviated name: ¹`Highest educational level`
## # ℹ 7 more variables: `Currently working` <dbl>, Occupation <dbl>,
## # `Current marital status` <dbl>, `Type of place of residence` <dbl>,
## # `Wealth index combined` <dbl>, `Covered by health insurance` <dbl>,
## # `Frequency of watching television` <dbl>
# Report the number of rows and columns of the raw data.
cat("Dimensi data:", nrow(datarokok), ncol(datarokok), "\n")
## Dimensi data: 10009 10
# Replace the long survey labels with short names, in column order;
# the first column (the smoking-frequency item) becomes the target Y.
colnames(datarokok) <- c(
  "Y", "usia", "pendidikan", "bekerja", "pekerjaan",
  "status_pernikahan", "tempat_tinggal", "kekayaan",
  "asuransi", "tv"
)
Dilakukan pemeriksaan dan penanganan missing value dengan mengubah kode khusus (9, 98, dan 99) menjadi nilai NA. Selanjutnya, data yang mengandung missing value dihapus menggunakan metode listwise deletion (na.omit).
# Per-column NA counts in the raw data, before any special code is recoded.
cat("Missing value sebelum preprocessing:\n")
## Missing value sebelum preprocessing:
datarokok |>
  is.na() |>
  colSums() |>
  print()
## Y usia pendidikan bekerja
## 0 0 204 0
## pekerjaan status_pernikahan tempat_tinggal kekayaan
## 0 0 0 0
## asuransi tv
## 0 0
# Recode the survey's special "missing" codes to NA so that na.omit() can
# drop the affected rows. `%in%` is used instead of `==` because it never
# yields NA, so the recoding stays safe for columns that already hold NA.
datarokok$Y[datarokok$Y %in% 9] <- NA
datarokok$bekerja[datarokok$bekerja %in% 9] <- NA
datarokok$pekerjaan[datarokok$pekerjaan %in% c(98, 99)] <- NA
# asuransi also carries code 9 (two rows in the frequency table of this
# report); it was previously left in place and became a third factor level.
# NOTE(review): assumes 9 means "missing" here as for the other items -
# confirm against the survey codebook. Console output recorded in this
# document was rendered before this recode was added.
datarokok$asuransi[datarokok$asuransi %in% 9] <- NA
datarokok$tv[datarokok$tv %in% 9] <- NA
# Per-column NA counts once the special codes have been turned into NA.
cat("\nMissing value setelah penggantian kode (9/98/99 -> NA):\n")
##
## Missing value setelah penggantian kode (9/98/99 -> NA):
datarokok |>
  is.na() |>
  colSums() |>
  print()
## Y usia pendidikan bekerja
## 3 0 204 2
## pekerjaan status_pernikahan tempat_tinggal kekayaan
## 18 0 0 0
## asuransi tv
## 0 16
# Total number of NA cells across the whole table.
cat("Total missing value:", sum(is.na(datarokok)), "\n")
## Total missing value: 243
# Share of NA cells per column, as a percentage rounded to two decimals.
cat("Persentase missing per kolom (%):\n")
## Persentase missing per kolom (%):
print(round(colSums(is.na(datarokok)) / nrow(datarokok) * 100, digits = 2))
## Y usia pendidikan bekerja
## 0.03 0.00 2.04 0.02
## pekerjaan status_pernikahan tempat_tinggal kekayaan
## 0.18 0.00 0.00 0.00
## asuransi tv
## 0.00 0.16
# Listwise deletion: drop every row that contains at least one NA and report
# how many rows were lost. The row count must be captured before na.omit()
# runs, so the order of the first three statements matters.
n_sebelum <- nrow(datarokok)
datarokok <- na.omit(datarokok)
n_sesudah <- nrow(datarokok)
# Row counts before and after the deletion.
cat("\nJumlah baris sebelum hapus missing:", n_sebelum, "\n")
##
## Jumlah baris sebelum hapus missing: 10009
cat("Jumlah baris setelah hapus missing :", n_sesudah, "\n")
## Jumlah baris setelah hapus missing : 9766
# Number of discarded rows, absolute and as a percentage of the original.
cat("Baris yang dibuang :", n_sebelum - n_sesudah, "\n")
## Baris yang dibuang : 243
cat("Persentase data terbuang :",
round((n_sebelum - n_sesudah) / n_sebelum * 100, 2), "%\n")
## Persentase data terbuang : 2.43 %
# Sanity check: every column should now report zero NA values.
cat("\nMissing value setelah na.omit:\n")
##
## Missing value setelah na.omit:
print(colSums(is.na(datarokok)))
## Y usia pendidikan bekerja
## 0 0 0 0
## pekerjaan status_pernikahan tempat_tinggal kekayaan
## 0 0 0 0
## asuransi tv
## 0 0
Variabel target dikategorikan menjadi tidak merokok (0) dan merokok (1).
# Collapse the target to two classes: 0 stays "Tidak_Merokok", every other
# value becomes "Merokok". The result is stored as a labelled factor.
datarokok$Y <- factor(
  as.numeric(datarokok$Y != 0),
  levels = c(0, 1),
  labels = c("Tidak_Merokok", "Merokok")
)
Variabel kategorik dikonversi ke dalam bentuk faktor sebelum dilakukan pemodelan.
# Turn every categorical predictor into a factor; usia stays numeric.
kolom_kategorik <- c(
  "pendidikan", "bekerja", "pekerjaan", "status_pernikahan",
  "tempat_tinggal", "kekayaan", "asuransi", "tv"
)
for (nama_kolom in kolom_kategorik) {
  datarokok[[nama_kolom]] <- factor(datarokok[[nama_kolom]])
}
# Show the resulting column types.
cat("\nStruktur data setelah preprocessing:\n")
##
## Struktur data setelah preprocessing:
str(datarokok)
## tibble [9,766 × 10] (S3: tbl_df/tbl/data.frame)
## $ Y : Factor w/ 2 levels "Tidak_Merokok",..: 2 2 1 2 2 2 1 1 2 2 ...
## $ usia : num [1:9766] 33 45 39 44 42 28 45 45 35 37 ...
## $ pendidikan : Factor w/ 5 levels "1","2","3","4",..: 1 1 3 2 2 3 3 2 1 2 ...
## $ bekerja : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ pekerjaan : Factor w/ 9 levels "0","1","2","3",..: 7 7 7 8 7 7 8 7 7 7 ...
## $ status_pernikahan: Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ tempat_tinggal : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ kekayaan : Factor w/ 5 levels "1","2","3","4",..: 1 1 2 3 4 2 2 4 1 3 ...
## $ asuransi : Factor w/ 3 levels "0","1","9": 1 2 2 1 2 2 2 1 1 1 ...
## $ tv : Factor w/ 3 levels "0","1","2": 3 3 3 3 3 3 3 2 2 3 ...
## - attr(*, "na.action")= 'omit' Named int [1:243] 76 261 268 316 336 439 473 474 475 476 ...
## ..- attr(*, "names")= chr [1:243] "76" "261" "268" "316" ...
Dataset penelitian terdiri atas 9.766 observasi dan 10 variabel. Variabel yang digunakan meliputi 1 variabel numerik dan 9 variabel kategorik, yang selanjutnya digunakan dalam proses analisis klasifikasi status merokok.
# Summary table of the cleaned data: number of rows, number of columns, and
# how many columns are numeric versus factor.
statdesk_dimensi2 <- data.frame(
  Keterangan = c(
    "Jumlah Observasi",
    "Jumlah Variabel",
    "Jumlah Variabel Numerik",
    "Jumlah Variabel Kategorik"
  ),
  Nilai = c(
    nrow(datarokok),
    ncol(datarokok),
    sum(vapply(datarokok, is.numeric, logical(1))),
    sum(vapply(datarokok, is.factor, logical(1)))
  )
)
print(statdesk_dimensi2)
## Keterangan Nilai
## 1 Jumlah Observasi 9766
## 2 Jumlah Variabel 10
## 3 Jumlah Variabel Numerik 1
## 4 Jumlah Variabel Kategorik 9
## 4.2 Distribusi Variabel Target

Distribusi variabel target menunjukkan jumlah dan persentase responden pada kategori tidak merokok dan merokok. Informasi ini digunakan untuk mengetahui keseimbangan kelas dalam data sebelum proses pemodelan dilakukan.
# Class counts of the target plus each class's share in percent (2 decimals).
distribusi_target <- dplyr::mutate(
  dplyr::count(datarokok, Y),
  Persentase = round(n / sum(n) * 100, 2)
)
print(distribusi_target)
## # A tibble: 2 × 3
## Y n Persentase
## <fct> <int> <dbl>
## 1 Tidak_Merokok 2778 28.4
## 2 Merokok 6988 71.6
# Bar chart of the target distribution; each bar is annotated with its count
# and percentage. The legend is hidden because the x axis already names the
# classes.
ggplot(data = distribusi_target, mapping = aes(x = Y, y = n, fill = Y)) +
  geom_col() +
  geom_text(
    mapping = aes(label = paste0(n, " (", Persentase, "%)")),
    vjust = -0.3,
    size = 5
  ) +
  scale_fill_manual(
    values = c("Tidak_Merokok" = "steelblue", "Merokok" = "tomato")
  ) +
  labs(
    title = "Distribusi Variabel Target: Status Merokok",
    x = "Status Merokok",
    y = "Frekuensi"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold", size = 14),
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 13, face = "bold")
  )
## 4.3 Statistik Deskriptif Variabel Kategorik

Statistik deskriptif variabel kategorik digunakan untuk memberikan gambaran umum mengenai karakteristik responden berdasarkan kategori pada setiap variabel penelitian.
# Names of all factor columns, target included.
var_kategorik <- c(
  "Y", "pendidikan", "bekerja", "pekerjaan", "status_pernikahan",
  "tempat_tinggal", "kekayaan", "asuransi", "tv"
)

# One frequency table per categorical column (category, count, percentage
# rounded to two decimals), stacked into a single long table.
statdesk_kategorik <- bind_rows(
  lapply(var_kategorik, function(nama_var) {
    frek <- as.data.frame(table(datarokok[[nama_var]]))
    colnames(frek) <- c("Kategori", "Frekuensi")
    frek$Variabel <- nama_var
    frek$Persentase <- round(frek$Frekuensi / sum(frek$Frekuensi) * 100, 2)
    dplyr::select(frek, Variabel, Kategori, Frekuensi, Persentase)
  })
)
print(statdesk_kategorik)
## Variabel Kategori Frekuensi Persentase
## 1 Y Tidak_Merokok 2778 28.45
## 2 Y Merokok 6988 71.55
## 3 pendidikan 1 3079 31.53
## 4 pendidikan 2 1899 19.45
## 5 pendidikan 3 3355 34.35
## 6 pendidikan 4 276 2.83
## 7 pendidikan 5 1157 11.85
## 8 bekerja 0 196 2.01
## 9 bekerja 1 9570 97.99
## 10 pekerjaan 0 56 0.57
## 11 pekerjaan 1 758 7.76
## 12 pekerjaan 2 199 2.04
## 13 pekerjaan 3 594 6.08
## 14 pekerjaan 4 1345 13.77
## 15 pekerjaan 5 1683 17.23
## 16 pekerjaan 6 2747 28.13
## 17 pekerjaan 7 2326 23.82
## 18 pekerjaan 96 58 0.59
## 19 status_pernikahan 1 9703 99.35
## 20 status_pernikahan 2 63 0.65
## 21 tempat_tinggal 1 4987 51.06
## 22 tempat_tinggal 2 4779 48.94
## 23 kekayaan 1 2128 21.79
## 24 kekayaan 2 1940 19.86
## 25 kekayaan 3 1935 19.81
## 26 kekayaan 4 1866 19.11
## 27 kekayaan 5 1897 19.42
## 28 asuransi 0 3858 39.50
## 29 asuransi 1 5906 60.48
## 30 asuransi 9 2 0.02
## 31 tv 0 314 3.22
## 32 tv 1 1206 12.35
## 33 tv 2 8246 84.44
Evaluasi model dilakukan untuk mengetahui tingkat performa model klasifikasi yang dibangun. Penilaian kinerja model dilakukan berdasarkan nilai metrik evaluasi yang diperoleh dari confusion matrix.
# Evaluate one set of predictions against the true labels.
#
# Prints a short header and the full confusion matrix (positive class
# "Merokok"), then returns a one-row data frame of summary metrics.
#
# Arguments:
#   actual     factor of true labels.
#   pred       factor of predicted labels, with the same levels as `actual`.
#   nama_model model label stored in the result (e.g. "KNN").
#   split      split label stored in the result (e.g. "90:10").
#   kondisi    condition label stored in the result (e.g. "Sebelum SMOTE").
#   set_data   data-set label stored in the result ("Training"/"Testing").
#
# Returns a data frame with columns Split, Kondisi, Model, Set_Data,
# Accuracy, Precision, Recall, F1_Score, Specificity and Kappa. Per-class
# metrics that come back as NA/NaN are reported as 0.
evaluasi_model <- function(actual, pred, nama_model, split, kondisi, set_data) {
  cat("Split :", split, "\n")
  cat("Kondisi :", kondisi, "\n")
  cat("Metode :", nama_model, "\n")
  cat("Set Data :", set_data, "\n")

  cm <- confusionMatrix(pred, actual, positive = "Merokok")
  print(cm)

  # `[[` drops the element name. Keeping it (as `[` does) makes data.frame()
  # adopt it as a row name, which bind_rows() later turns into row labels
  # such as "Precision...1".
  metrik_kelas <- function(nama) {
    nilai <- cm$byClass[[nama]]
    if (is.na(nilai)) 0 else nilai
  }

  data.frame(
    Split = split,
    Kondisi = kondisi,
    Model = nama_model,
    Set_Data = set_data,
    Accuracy = cm$overall[["Accuracy"]],
    Precision = metrik_kelas("Precision"),
    Recall = metrik_kelas("Recall"),
    F1_Score = metrik_kelas("F1"),
    Specificity = metrik_kelas("Specificity"),
    Kappa = cm$overall[["Kappa"]]
  )
}
SMOTE (Synthetic Minority Over-sampling Technique) digunakan untuk mengatasi masalah ketidakseimbangan kelas (class imbalance) pada data.
# Oversample the training data with SMOTE.
#
# Every predictor is re-coded to the integer codes of its factor levels, the
# target is mapped to 0 (Tidak_Merokok) / 1 (Merokok), and SMOTE() is run
# with K = 5 and dup_size = 0. Class counts are printed before and after.
#
# Arguments:
#   train_df  data frame with a two-level factor column Y plus predictors.
#
# Returns the data produced by SMOTE() with the numeric predictor columns
# and a factor column Y (levels "Tidak_Merokok", "Merokok").
#
# NOTE(review): numeric predictors such as usia are also passed through
# as.factor(), so they are replaced by the rank of each distinct value within
# this data set rather than kept on their original scale - confirm this is
# intended and that test data are encoded consistently.
# NOTE(review): in the runs recorded in this document the minority class
# doubles rather than reaching parity - confirm dup_size = 0 is the setting
# wanted.
smote_data <- function(train_df) {
  nama_prediktor <- setdiff(names(train_df), "Y")

  # Integer-code the predictors; the factor target becomes 0/1.
  data_kode <- train_df
  data_kode[nama_prediktor] <- lapply(
    train_df[nama_prediktor],
    function(nilai) as.numeric(as.factor(nilai))
  )
  target_kode <- as.numeric(train_df$Y) - 1

  cat("\n--- Info SMOTE ---\n")
  cat(
    "Sebelum SMOTE - Tidak_Merokok:", sum(train_df$Y == "Tidak_Merokok"),
    "| Merokok:", sum(train_df$Y == "Merokok"),
    "\n"
  )

  hasil <- SMOTE(
    X = data_kode[, nama_prediktor],
    target = target_kode,
    K = 5,
    dup_size = 0
  )$data

  cat("\nStruktur hasil SMOTE:\n")
  str(hasil$class)

  # SMOTE() reports the label in column `class`; rebuild the factor target
  # from it and drop the helper column.
  hasil$Y <- factor(
    as.numeric(hasil$class),
    levels = c(0, 1),
    labels = c("Tidak_Merokok", "Merokok")
  )
  hasil$class <- NULL

  cat(
    "Sesudah SMOTE - Tidak_Merokok:", sum(hasil$Y == "Tidak_Merokok"),
    "| Merokok:", sum(hasil$Y == "Merokok"),
    "\n"
  )
  hasil
}
Fungsi konversi numerik digunakan untuk mengubah seluruh variabel prediktor yang bertipe kategorik menjadi numerik, sedangkan variabel target tetap dipertahankan. Tahap ini dilakukan untuk memastikan data dapat diproses oleh metode SMOTE yang memerlukan input berupa data numerik.
# Replace every column except Y with the integer codes of its factor levels.
#
# Arguments:
#   df  data frame; a column named Y, if present, is left untouched.
#
# Returns `df` with all other columns converted to numeric codes.
#
# NOTE(review): non-factor columns are passed through as.factor() first, so a
# numeric column is replaced by the rank of each distinct value found in
# `df`; the codes therefore depend on which values are present - confirm this
# matches the encoding used for the training data.
konversi_numerik <- function(df) {
  nama_prediktor <- setdiff(names(df), "Y")
  df[nama_prediktor] <- lapply(
    df[nama_prediktor],
    function(nilai) as.numeric(as.factor(nilai))
  )
  df
}
Data dibagi menjadi data pelatihan dan data pengujian dengan rasio 90:10, 80:20, dan 70:30. Pembagian ini bertujuan untuk membangun model serta mengukur kemampuan model dalam melakukan prediksi pada data baru.
# Three train/test partitions (90:10, 80:20, 70:30) drawn with
# createDataPartition() on Y. The seed is set once, so the three draws
# consume the random stream in this order.
set.seed(123)

index_90 <- createDataPartition(y = datarokok$Y, p = 0.90, list = FALSE)
train_90 <- datarokok[index_90, ]
test_10 <- datarokok[-index_90, ]

index_80 <- createDataPartition(y = datarokok$Y, p = 0.80, list = FALSE)
train_80 <- datarokok[index_80, ]
test_20 <- datarokok[-index_80, ]

index_70 <- createDataPartition(y = datarokok$Y, p = 0.70, list = FALSE)
train_70 <- datarokok[index_70, ]
test_30 <- datarokok[-index_70, ]
# Class counts of the target in each training partition.
cat("\n--- Distribusi Target per Split ---\n")
##
## --- Distribusi Target per Split ---
cat("Train 90:10\n")
print(summary(train_90$Y))
## Train 90:10
## Tidak_Merokok Merokok
## 2501 6290
cat("Train 80:20\n")
print(summary(train_80$Y))
## Train 80:20
## Tidak_Merokok Merokok
## 2223 5591
cat("Train 70:30\n")
print(summary(train_70$Y))
## Train 70:30
## Tidak_Merokok Merokok
## 1945 4892
Fungsi ini digunakan untuk menjalankan proses pemodelan klasifikasi pada data asli dan data hasil SMOTE menggunakan metode Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Selanjutnya, kinerja masing-masing model dievaluasi berdasarkan hasil klasifikasi pada data pelatihan dan data pengujian.
# Fit KNN, a decision tree and Naive Bayes on one train/test pair and
# evaluate each model on both the training and the test rows.
#
# Arguments:
#   train_data  data frame with target column Y; used for fitting.
#   test_data   data frame with the same columns; used for testing.
#   split_name  split label stored in the result rows (e.g. "90:10").
#   kondisi     condition label stored in the result rows.
#
# Returns a list with
#   hasil  six metric rows from evaluasi_model() in the order KNN train/test,
#          Decision Tree train/test, Naive Bayes train/test;
#   knn, dt, nb  the fitted model objects.
#
# Each model's specification table and every confusion matrix are printed as
# a side effect. set.seed(123) is called before every fit.
jalankan_model <- function(train_data, test_data, split_name, kondisi) {
  train_x <- dplyr::select(train_data, -Y)
  train_y <- train_data$Y
  test_x <- dplyr::select(test_data, -Y)
  test_y <- test_data$Y

  # Score one fitted model: predict and evaluate on the training rows first,
  # then on the test rows, appending one metrics row after each evaluation.
  # `prediksi` maps a predictor table to a vector of predicted classes.
  tambah_evaluasi <- function(hasil, prediksi, nama_model) {
    pred_train <- factor(prediksi(train_x), levels = levels(train_y))
    hasil <- bind_rows(
      hasil,
      evaluasi_model(train_y, pred_train, nama_model,
                     split_name, kondisi, "Training")
    )
    pred_test <- factor(prediksi(test_x), levels = levels(test_y))
    bind_rows(
      hasil,
      evaluasi_model(test_y, pred_test, nama_model,
                     split_name, kondisi, "Testing")
    )
  }

  hasil_semua <- data.frame()

  # A. KNN: k fixed at 5, predictors centred and scaled, no resampling.
  cat("\n>>> KNN <<<\n")
  spek_knn <- data.frame(
    Parameter = c("Metode", "k (default)", "Preprocessing", "Platform"),
    Nilai = c("KNN", "5", "Center + Scale", "R - caret")
  )
  cat("Spesifikasi KNN:\n")
  print(spek_knn)
  set.seed(123)
  model_knn <- train(
    Y ~ .,
    data = train_data,
    method = "knn",
    trControl = trainControl(method = "none"),
    tuneGrid = data.frame(k = 5),
    preProcess = c("center", "scale")
  )
  hasil_semua <- tambah_evaluasi(
    hasil_semua,
    function(data_x) predict(model_knn, data_x),
    "KNN"
  )

  # B. Decision tree: rpart classification tree, cp = 0.01, maxdepth = 10.
  cat("\n>>> Decision Tree <<<\n")
  spek_dt <- data.frame(
    Parameter = c("Metode", "Splitting Criterion", "cp (default)",
                  "maxdepth", "Platform"),
    Nilai = c("rpart", "Gini Index", "0.01", "10", "R - rpart")
  )
  cat("Spesifikasi Decision Tree:\n")
  print(spek_dt)
  set.seed(123)
  model_dt <- rpart(
    Y ~ .,
    data = train_data,
    method = "class",
    control = rpart.control(cp = 0.01, maxdepth = 10)
  )
  hasil_semua <- tambah_evaluasi(
    hasil_semua,
    function(data_x) predict(model_dt, data_x, type = "class"),
    "Decision Tree"
  )

  # C. Naive Bayes: laplace = 0, usekernel = FALSE, adjust = 1, no resampling.
  cat("\n>>> Naive Bayes <<<\n")
  spek_nb <- data.frame(
    Parameter = c("Metode", "laplace (default)", "usekernel (default)",
                  "adjust (default)", "Platform"),
    Nilai = c("naive_bayes", "0", "FALSE", "1", "R - naivebayes")
  )
  cat("Spesifikasi Naive Bayes:\n")
  print(spek_nb)
  set.seed(123)
  model_nb <- train(
    Y ~ .,
    data = train_data,
    method = "naive_bayes",
    trControl = trainControl(method = "none"),
    tuneGrid = data.frame(laplace = 0, usekernel = FALSE, adjust = 1)
  )
  hasil_semua <- tambah_evaluasi(
    hasil_semua,
    function(data_x) predict(model_nb, data_x),
    "Naive Bayes"
  )

  # Hand back the metrics together with the fitted models.
  list(
    hasil = hasil_semua,
    knn = model_knn,
    dt = model_dt,
    nb = model_nb
  )
}
Berdasarkan fungsi pemodelan yang telah dibangun sebelumnya, dilakukan proses klasifikasi pada data sebelum penerapan SMOTE menggunakan metode Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Hasil evaluasi masing-masing model disajikan pada bagian berikut.
# Baseline run on the 90:10 split, without SMOTE.
res_awal_90 <- jalankan_model(
  train_data = train_90,
  test_data = test_10,
  split_name = "90:10",
  kondisi = "Sebelum SMOTE"
)
##
## >>> KNN <<<
## Spesifikasi KNN:
## Parameter Nilai
## 1 Metode KNN
## 2 k (default) 5
## 3 Preprocessing Center + Scale
## 4 Platform R - caret
## Split : 90:10
## Kondisi : Sebelum SMOTE
## Metode : KNN
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 907 524
## Merokok 1594 5766
##
## Accuracy : 0.7591
## 95% CI : (0.75, 0.768)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3207
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9167
## Specificity : 0.3627
## Pos Pred Value : 0.7834
## Neg Pred Value : 0.6338
## Prevalence : 0.7155
## Detection Rate : 0.6559
## Detection Prevalence : 0.8372
## Balanced Accuracy : 0.6397
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : Sebelum SMOTE
## Metode : KNN
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 57 96
## Merokok 220 602
##
## Accuracy : 0.6759
## 95% CI : (0.6455, 0.7052)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 0.9972
##
## Kappa : 0.0789
##
## Mcnemar's Test P-Value : 4.539e-12
##
## Sensitivity : 0.8625
## Specificity : 0.2058
## Pos Pred Value : 0.7324
## Neg Pred Value : 0.3725
## Prevalence : 0.7159
## Detection Rate : 0.6174
## Detection Prevalence : 0.8431
## Balanced Accuracy : 0.5341
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
## Parameter Nilai
## 1 Metode rpart
## 2 Splitting Criterion Gini Index
## 3 cp (default) 0.01
## 4 maxdepth 10
## 5 Platform R - rpart
## Split : 90:10
## Kondisi : Sebelum SMOTE
## Metode : Decision Tree
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 0 0
## Merokok 2501 6290
##
## Accuracy : 0.7155
## 95% CI : (0.7059, 0.7249)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : 0.5054
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7155
## Neg Pred Value : NaN
## Prevalence : 0.7155
## Detection Rate : 0.7155
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : Sebelum SMOTE
## Metode : Decision Tree
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 0 0
## Merokok 277 698
##
## Accuracy : 0.7159
## 95% CI : (0.6865, 0.744)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 0.5162
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7159
## Neg Pred Value : NaN
## Prevalence : 0.7159
## Detection Rate : 0.7159
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
## Parameter Nilai
## 1 Metode naive_bayes
## 2 laplace (default) 0
## 3 usekernel (default) FALSE
## 4 adjust (default) 1
## 5 Platform R - naivebayes
## Split : 90:10
## Kondisi : Sebelum SMOTE
## Metode : Naive Bayes
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 1878 3735
## Merokok 623 2555
##
## Accuracy : 0.5043
## 95% CI : (0.4938, 0.5148)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1143
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.4062
## Specificity : 0.7509
## Pos Pred Value : 0.8040
## Neg Pred Value : 0.3346
## Prevalence : 0.7155
## Detection Rate : 0.2906
## Detection Prevalence : 0.3615
## Balanced Accuracy : 0.5785
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : Sebelum SMOTE
## Metode : Naive Bayes
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 194 425
## Merokok 83 273
##
## Accuracy : 0.479
## 95% CI : (0.4472, 0.5109)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0667
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.3911
## Specificity : 0.7004
## Pos Pred Value : 0.7669
## Neg Pred Value : 0.3134
## Prevalence : 0.7159
## Detection Rate : 0.2800
## Detection Prevalence : 0.3651
## Balanced Accuracy : 0.5457
##
## 'Positive' Class : Merokok
##
# Baseline run on the 80:20 split, without SMOTE.
res_awal_80 <- jalankan_model(
  train_data = train_80,
  test_data = test_20,
  split_name = "80:20",
  kondisi = "Sebelum SMOTE"
)
##
## >>> KNN <<<
## Spesifikasi KNN:
## Parameter Nilai
## 1 Metode KNN
## 2 k (default) 5
## 3 Preprocessing Center + Scale
## 4 Platform R - caret
## Split : 80:20
## Kondisi : Sebelum SMOTE
## Metode : KNN
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 867 501
## Merokok 1356 5090
##
## Accuracy : 0.7623
## 95% CI : (0.7528, 0.7718)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3398
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9104
## Specificity : 0.3900
## Pos Pred Value : 0.7896
## Neg Pred Value : 0.6338
## Prevalence : 0.7155
## Detection Rate : 0.6514
## Detection Prevalence : 0.8249
## Balanced Accuracy : 0.6502
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : Sebelum SMOTE
## Metode : KNN
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 134 239
## Merokok 421 1158
##
## Accuracy : 0.6619
## 95% CI : (0.6404, 0.6829)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0781
##
## Mcnemar's Test P-Value : 1.849e-12
##
## Sensitivity : 0.8289
## Specificity : 0.2414
## Pos Pred Value : 0.7334
## Neg Pred Value : 0.3592
## Prevalence : 0.7157
## Detection Rate : 0.5932
## Detection Prevalence : 0.8089
## Balanced Accuracy : 0.5352
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
## Parameter Nilai
## 1 Metode rpart
## 2 Splitting Criterion Gini Index
## 3 cp (default) 0.01
## 4 maxdepth 10
## 5 Platform R - rpart
## Split : 80:20
## Kondisi : Sebelum SMOTE
## Metode : Decision Tree
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 0 0
## Merokok 2223 5591
##
## Accuracy : 0.7155
## 95% CI : (0.7054, 0.7255)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : 0.5057
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7155
## Neg Pred Value : NaN
## Prevalence : 0.7155
## Detection Rate : 0.7155
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : Sebelum SMOTE
## Metode : Decision Tree
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 0 0
## Merokok 555 1397
##
## Accuracy : 0.7157
## 95% CI : (0.6951, 0.7356)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 0.5114
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7157
## Neg Pred Value : NaN
## Prevalence : 0.7157
## Detection Rate : 0.7157
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
## Parameter Nilai
## 1 Metode naive_bayes
## 2 laplace (default) 0
## 3 usekernel (default) FALSE
## 4 adjust (default) 1
## 5 Platform R - naivebayes
## Split : 80:20
## Kondisi : Sebelum SMOTE
## Metode : Naive Bayes
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 1557 2985
## Merokok 666 2606
##
## Accuracy : 0.5328
## 95% CI : (0.5216, 0.5439)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1267
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.4661
## Specificity : 0.7004
## Pos Pred Value : 0.7965
## Neg Pred Value : 0.3428
## Prevalence : 0.7155
## Detection Rate : 0.3335
## Detection Prevalence : 0.4187
## Balanced Accuracy : 0.5833
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : Sebelum SMOTE
## Metode : Naive Bayes
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 382 759
## Merokok 173 638
##
## Accuracy : 0.5225
## 95% CI : (0.5001, 0.5449)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.11
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.4567
## Specificity : 0.6883
## Pos Pred Value : 0.7867
## Neg Pred Value : 0.3348
## Prevalence : 0.7157
## Detection Rate : 0.3268
## Detection Prevalence : 0.4155
## Balanced Accuracy : 0.5725
##
## 'Positive' Class : Merokok
##
# Baseline run on the 70:30 split, without SMOTE.
res_awal_70 <- jalankan_model(
  train_data = train_70,
  test_data = test_30,
  split_name = "70:30",
  kondisi = "Sebelum SMOTE"
)
##
## >>> KNN <<<
## Spesifikasi KNN:
## Parameter Nilai
## 1 Metode KNN
## 2 k (default) 5
## 3 Preprocessing Center + Scale
## 4 Platform R - caret
## Split : 70:30
## Kondisi : Sebelum SMOTE
## Metode : KNN
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 733 409
## Merokok 1212 4483
##
## Accuracy : 0.7629
## 95% CI : (0.7526, 0.7729)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3349
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9164
## Specificity : 0.3769
## Pos Pred Value : 0.7872
## Neg Pred Value : 0.6419
## Prevalence : 0.7155
## Detection Rate : 0.6557
## Detection Prevalence : 0.8330
## Balanced Accuracy : 0.6466
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : Sebelum SMOTE
## Metode : KNN
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 209 305
## Merokok 624 1791
##
## Accuracy : 0.6828
## 95% CI : (0.6656, 0.6997)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1191
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8545
## Specificity : 0.2509
## Pos Pred Value : 0.7416
## Neg Pred Value : 0.4066
## Prevalence : 0.7156
## Detection Rate : 0.6115
## Detection Prevalence : 0.8245
## Balanced Accuracy : 0.5527
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
## Parameter Nilai
## 1 Metode rpart
## 2 Splitting Criterion Gini Index
## 3 cp (default) 0.01
## 4 maxdepth 10
## 5 Platform R - rpart
## Split : 70:30
## Kondisi : Sebelum SMOTE
## Metode : Decision Tree
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 0 0
## Merokok 1945 4892
##
## Accuracy : 0.7155
## 95% CI : (0.7047, 0.7262)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : 0.5061
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7155
## Neg Pred Value : NaN
## Prevalence : 0.7155
## Detection Rate : 0.7155
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : Sebelum SMOTE
## Metode : Decision Tree
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 0 0
## Merokok 833 2096
##
## Accuracy : 0.7156
## 95% CI : (0.6989, 0.7319)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 0.5093
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7156
## Neg Pred Value : NaN
## Prevalence : 0.7156
## Detection Rate : 0.7156
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
## Parameter Nilai
## 1 Metode naive_bayes
## 2 laplace (default) 0
## 3 usekernel (default) FALSE
## 4 adjust (default) 1
## 5 Platform R - naivebayes
## Split : 70:30
## Kondisi : Sebelum SMOTE
## Metode : Naive Bayes
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 1567 3320
## Merokok 378 1572
##
## Accuracy : 0.4591
## 95% CI : (0.4473, 0.471)
## No Information Rate : 0.7155
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0872
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.3213
## Specificity : 0.8057
## Pos Pred Value : 0.8062
## Neg Pred Value : 0.3206
## Prevalence : 0.7155
## Detection Rate : 0.2299
## Detection Prevalence : 0.2852
## Balanced Accuracy : 0.5635
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : Sebelum SMOTE
## Metode : Naive Bayes
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 676 1409
## Merokok 157 687
##
## Accuracy : 0.4653
## 95% CI : (0.4472, 0.4836)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0959
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.3278
## Specificity : 0.8115
## Pos Pred Value : 0.8140
## Neg Pred Value : 0.3242
## Prevalence : 0.7156
## Detection Rate : 0.2346
## Detection Prevalence : 0.2882
## Balanced Accuracy : 0.5696
##
## 'Positive' Class : Merokok
##
# Stack the metric tables of the three baseline runs (90:10, 80:20, 70:30).
hasil_awal <- bind_rows(
  list(
    res_awal_90[["hasil"]],
    res_awal_80[["hasil"]],
    res_awal_70[["hasil"]]
  )
)
print(hasil_awal)
## Split Kondisi Model Set_Data Accuracy Precision
## Precision...1 90:10 Sebelum SMOTE KNN Training 0.7590718 0.7834239
## Precision...2 90:10 Sebelum SMOTE KNN Testing 0.6758974 0.7323601
## Precision...3 90:10 Sebelum SMOTE Decision Tree Training 0.7155045 0.7155045
## Precision...4 90:10 Sebelum SMOTE Decision Tree Testing 0.7158974 0.7158974
## Precision...5 90:10 Sebelum SMOTE Naive Bayes Training 0.5042657 0.8039648
## Precision...6 90:10 Sebelum SMOTE Naive Bayes Testing 0.4789744 0.7668539
## Precision...7 80:20 Sebelum SMOTE KNN Training 0.7623496 0.7896370
## Precision...8 80:20 Sebelum SMOTE KNN Testing 0.6618852 0.7333756
## Precision...9 80:20 Sebelum SMOTE Decision Tree Training 0.7155106 0.7155106
## Precision...10 80:20 Sebelum SMOTE Decision Tree Testing 0.7156762 0.7156762
## Precision...11 80:20 Sebelum SMOTE Naive Bayes Training 0.5327617 0.7964548
## Precision...12 80:20 Sebelum SMOTE Naive Bayes Testing 0.5225410 0.7866831
## Precision...13 70:30 Sebelum SMOTE KNN Training 0.7629077 0.7871817
## Precision...14 70:30 Sebelum SMOTE KNN Testing 0.6828269 0.7416149
## Precision...15 70:30 Sebelum SMOTE Decision Tree Training 0.7155185 0.7155185
## Precision...16 70:30 Sebelum SMOTE Decision Tree Testing 0.7156026 0.7156026
## Precision...17 70:30 Sebelum SMOTE Naive Bayes Training 0.4591195 0.8061538
## Precision...18 70:30 Sebelum SMOTE Naive Bayes Testing 0.4653465 0.8139810
## Recall F1_Score Specificity Kappa
## Precision...1 0.9166932 0.8448352 0.3626549 0.32066914
## Precision...2 0.8624642 0.7921053 0.2057762 0.07889072
## Precision...3 1.0000000 0.8341622 0.0000000 0.00000000
## Precision...4 1.0000000 0.8344292 0.0000000 0.00000000
## Precision...5 0.4062003 0.5397127 0.7508996 0.11427327
## Precision...6 0.3911175 0.5180266 0.7003610 0.06665863
## Precision...7 0.9103917 0.8457257 0.3900135 0.33976551
## Precision...8 0.8289191 0.7782258 0.2414414 0.07807641
## Precision...9 1.0000000 0.8341664 0.0000000 0.00000000
## Precision...10 1.0000000 0.8342789 0.0000000 0.00000000
## Precision...11 0.4661062 0.5880627 0.7004049 0.12670095
## Precision...12 0.4566929 0.5778986 0.6882883 0.10998482
## Precision...13 0.9163941 0.8468877 0.3768638 0.33490458
## Precision...14 0.8544847 0.7940590 0.2509004 0.11913025
## Precision...15 1.0000000 0.8341717 0.0000000 0.00000000
## Precision...16 1.0000000 0.8342289 0.0000000 0.00000000
## Precision...17 0.3213410 0.4595148 0.8056555 0.08724664
## Precision...18 0.3277672 0.4673469 0.8115246 0.09587575
Berdasarkan fungsi SMOTE yang telah dibuat pada tahap sebelumnya, dilakukan penerapan SMOTE untuk menyeimbangkan data dengan menambahkan data sintetis pada kelas minoritas. Penerapan metode ini bertujuan untuk menghasilkan distribusi kelas yang lebih seimbang sehingga dapat meningkatkan kinerja model klasifikasi. Hasil penerapan SMOTE disajikan pada bagian berikut.
# Apply the smote_data() helper (defined earlier in the file) to the 90:10 training split
train_90_smote <- smote_data(train_90)
##
## --- Info SMOTE ---
## Sebelum SMOTE - Tidak_Merokok: 2501 | Merokok: 6290
##
## Struktur hasil SMOTE:
## chr [1:11292] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...
## Sesudah SMOTE - Tidak_Merokok: 5002 | Merokok: 6290
# Apply the smote_data() helper (defined earlier in the file) to the 80:20 training split
train_80_smote <- smote_data(train_80)
##
## --- Info SMOTE ---
## Sebelum SMOTE - Tidak_Merokok: 2223 | Merokok: 5591
##
## Struktur hasil SMOTE:
## chr [1:10037] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...
## Sesudah SMOTE - Tidak_Merokok: 4446 | Merokok: 5591
# Apply the smote_data() helper (defined earlier in the file) to the 70:30 training split
train_70_smote <- smote_data(train_70)
##
## --- Info SMOTE ---
## Sebelum SMOTE - Tidak_Merokok: 1945 | Merokok: 4892
##
## Struktur hasil SMOTE:
## chr [1:8782] "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" "0" ...
## Sesudah SMOTE - Tidak_Merokok: 3890 | Merokok: 4892
# Convert each test set with konversi_numerik() so its format matches the
# SMOTE-processed training data, then set Y from the original test set.
test_10_num <- konversi_numerik(test_10)
test_10_num$Y <- test_10$Y

test_20_num <- konversi_numerik(test_20)
test_20_num$Y <- test_20$Y

test_30_num <- konversi_numerik(test_30)
test_30_num$Y <- test_30$Y
# Class distribution of Y before vs. after SMOTE, one frequency table per
# training set, stacked into a single data frame keyed by "Keterangan".
distribusi_smote <- list(
  "90:10 Sebelum SMOTE" = train_90,
  "80:20 Sebelum SMOTE" = train_80,
  "70:30 Sebelum SMOTE" = train_70,
  "90:10 Sesudah SMOTE" = train_90_smote,
  "80:20 Sesudah SMOTE" = train_80_smote,
  "70:30 Sesudah SMOTE" = train_70_smote
) %>%
  lapply(function(d) as.data.frame(table(d$Y))) %>%
  bind_rows(.id = "Keterangan")

# Columns 2 and 3 are the class label and its count produced by table()
names(distribusi_smote)[2:3] <- c("Status", "Frekuensi")

cat("\n--- Distribusi Data Sebelum dan Sesudah SMOTE ---\n")
##
## --- Distribusi Data Sebelum dan Sesudah SMOTE ---
# Display the per-split class counts before and after SMOTE
print(distribusi_smote)
## Keterangan Status Frekuensi
## 1 90:10 Sebelum SMOTE Tidak_Merokok 2501
## 2 90:10 Sebelum SMOTE Merokok 6290
## 3 80:20 Sebelum SMOTE Tidak_Merokok 2223
## 4 80:20 Sebelum SMOTE Merokok 5591
## 5 70:30 Sebelum SMOTE Tidak_Merokok 1945
## 6 70:30 Sebelum SMOTE Merokok 4892
## 7 90:10 Sesudah SMOTE Tidak_Merokok 5002
## 8 90:10 Sesudah SMOTE Merokok 6290
## 9 80:20 Sesudah SMOTE Tidak_Merokok 4446
## 10 80:20 Sesudah SMOTE Merokok 5591
## 11 70:30 Sesudah SMOTE Tidak_Merokok 3890
## 12 70:30 Sesudah SMOTE Merokok 4892
# Class counts of Y in the SMOTE-processed 90:10 training set
table(train_90_smote$Y)
##
## Tidak_Merokok Merokok
## 5002 6290
# Class counts of Y in the SMOTE-processed 80:20 training set
table(train_80_smote$Y)
##
## Tidak_Merokok Merokok
## 4446 5591
# Class counts of Y in the SMOTE-processed 70:30 training set
table(train_70_smote$Y)
##
## Tidak_Merokok Merokok
## 3890 4892
Berdasarkan data hasil penerapan SMOTE, proses pemodelan klasifikasi dilakukan menggunakan metode Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Hasil yang diperoleh kemudian dievaluasi untuk mengetahui kinerja model setelah penyeimbangan kelas dilakukan.
# Run jalankan_model() (KNN, Decision Tree, Naive Bayes) on the SMOTE-processed
# 90:10 training set and evaluate on the numeric-converted 10% test set
res_smote_90 <- jalankan_model(train_90_smote, test_10_num, "90:10", "Sesudah SMOTE")
##
## >>> KNN <<<
## Spesifikasi KNN:
## Parameter Nilai
## 1 Metode KNN
## 2 k (default) 5
## 3 Preprocessing Center + Scale
## 4 Platform R - caret
## Split : 90:10
## Kondisi : Sesudah SMOTE
## Metode : KNN
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 3386 1051
## Merokok 1616 5239
##
## Accuracy : 0.7638
## 95% CI : (0.7559, 0.7716)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5158
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8329
## Specificity : 0.6769
## Pos Pred Value : 0.7643
## Neg Pred Value : 0.7631
## Prevalence : 0.5570
## Detection Rate : 0.4640
## Detection Prevalence : 0.6071
## Balanced Accuracy : 0.7549
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : Sesudah SMOTE
## Metode : KNN
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 95 177
## Merokok 182 521
##
## Accuracy : 0.6318
## 95% CI : (0.6006, 0.6621)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.0899
##
## Mcnemar's Test P-Value : 0.8328
##
## Sensitivity : 0.7464
## Specificity : 0.3430
## Pos Pred Value : 0.7411
## Neg Pred Value : 0.3493
## Prevalence : 0.7159
## Detection Rate : 0.5344
## Detection Prevalence : 0.7210
## Balanced Accuracy : 0.5447
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
## Parameter Nilai
## 1 Metode rpart
## 2 Splitting Criterion Gini Index
## 3 cp (default) 0.01
## 4 maxdepth 10
## 5 Platform R - rpart
## Split : 90:10
## Kondisi : Sesudah SMOTE
## Metode : Decision Tree
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2302 1197
## Merokok 2700 5093
##
## Accuracy : 0.6549
## 95% CI : (0.646, 0.6637)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2785
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8097
## Specificity : 0.4602
## Pos Pred Value : 0.6535
## Neg Pred Value : 0.6579
## Prevalence : 0.5570
## Detection Rate : 0.4510
## Detection Prevalence : 0.6901
## Balanced Accuracy : 0.6350
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : Sesudah SMOTE
## Metode : Decision Tree
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 100 136
## Merokok 177 562
##
## Accuracy : 0.679
## 95% CI : (0.6486, 0.7082)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 0.99484
##
## Kappa : 0.1739
##
## Mcnemar's Test P-Value : 0.02376
##
## Sensitivity : 0.8052
## Specificity : 0.3610
## Pos Pred Value : 0.7605
## Neg Pred Value : 0.4237
## Prevalence : 0.7159
## Detection Rate : 0.5764
## Detection Prevalence : 0.7579
## Balanced Accuracy : 0.5831
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
## Parameter Nilai
## 1 Metode naive_bayes
## 2 laplace (default) 0
## 3 usekernel (default) FALSE
## 4 adjust (default) 1
## 5 Platform R - naivebayes
## Split : 90:10
## Kondisi : Sesudah SMOTE
## Metode : Naive Bayes
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 3445 3172
## Merokok 1557 3118
##
## Accuracy : 0.5812
## 95% CI : (0.572, 0.5903)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : 1.154e-07
##
## Kappa : 0.1785
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4957
## Specificity : 0.6887
## Pos Pred Value : 0.6670
## Neg Pred Value : 0.5206
## Prevalence : 0.5570
## Detection Rate : 0.2761
## Detection Prevalence : 0.4140
## Balanced Accuracy : 0.5922
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : Sesudah SMOTE
## Metode : Naive Bayes
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 188 338
## Merokok 89 360
##
## Accuracy : 0.5621
## 95% CI : (0.5302, 0.5935)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.153
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5158
## Specificity : 0.6787
## Pos Pred Value : 0.8018
## Neg Pred Value : 0.3574
## Prevalence : 0.7159
## Detection Rate : 0.3692
## Detection Prevalence : 0.4605
## Balanced Accuracy : 0.5972
##
## 'Positive' Class : Merokok
##
# Run jalankan_model() (KNN, Decision Tree, Naive Bayes) on the SMOTE-processed
# 80:20 training set and evaluate on the numeric-converted 20% test set
res_smote_80 <- jalankan_model(train_80_smote, test_20_num, "80:20", "Sesudah SMOTE")
##
## >>> KNN <<<
## Spesifikasi KNN:
## Parameter Nilai
## 1 Metode KNN
## 2 k (default) 5
## 3 Preprocessing Center + Scale
## 4 Platform R - caret
## Split : 80:20
## Kondisi : Sesudah SMOTE
## Metode : KNN
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 3138 965
## Merokok 1308 4626
##
## Accuracy : 0.7735
## 95% CI : (0.7652, 0.7817)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5374
##
## Mcnemar's Test P-Value : 7.315e-13
##
## Sensitivity : 0.8274
## Specificity : 0.7058
## Pos Pred Value : 0.7796
## Neg Pred Value : 0.7648
## Prevalence : 0.5570
## Detection Rate : 0.4609
## Detection Prevalence : 0.5912
## Balanced Accuracy : 0.7666
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : Sesudah SMOTE
## Metode : KNN
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 194 385
## Merokok 361 1012
##
## Accuracy : 0.6178
## 95% CI : (0.5959, 0.6394)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.073
##
## Mcnemar's Test P-Value : 0.3997
##
## Sensitivity : 0.7244
## Specificity : 0.3495
## Pos Pred Value : 0.7371
## Neg Pred Value : 0.3351
## Prevalence : 0.7157
## Detection Rate : 0.5184
## Detection Prevalence : 0.7034
## Balanced Accuracy : 0.5370
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
## Parameter Nilai
## 1 Metode rpart
## 2 Splitting Criterion Gini Index
## 3 cp (default) 0.01
## 4 maxdepth 10
## 5 Platform R - rpart
## Split : 80:20
## Kondisi : Sesudah SMOTE
## Metode : Decision Tree
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2083 1033
## Merokok 2363 4558
##
## Accuracy : 0.6617
## 95% CI : (0.6523, 0.6709)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.2927
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8152
## Specificity : 0.4685
## Pos Pred Value : 0.6586
## Neg Pred Value : 0.6685
## Prevalence : 0.5570
## Detection Rate : 0.4541
## Detection Prevalence : 0.6895
## Balanced Accuracy : 0.6419
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : Sesudah SMOTE
## Metode : Decision Tree
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 199 263
## Merokok 356 1134
##
## Accuracy : 0.6829
## 95% CI : (0.6617, 0.7035)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 0.9993233
##
## Kappa : 0.1794
##
## Mcnemar's Test P-Value : 0.0002175
##
## Sensitivity : 0.8117
## Specificity : 0.3586
## Pos Pred Value : 0.7611
## Neg Pred Value : 0.4307
## Prevalence : 0.7157
## Detection Rate : 0.5809
## Detection Prevalence : 0.7633
## Balanced Accuracy : 0.5851
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
## Parameter Nilai
## 1 Metode naive_bayes
## 2 laplace (default) 0
## 3 usekernel (default) FALSE
## 4 adjust (default) 1
## 5 Platform R - naivebayes
## Split : 80:20
## Kondisi : Sesudah SMOTE
## Metode : Naive Bayes
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2960 2617
## Merokok 1486 2974
##
## Accuracy : 0.5912
## 95% CI : (0.5815, 0.6009)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : 2.549e-12
##
## Kappa : 0.1927
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5319
## Specificity : 0.6658
## Pos Pred Value : 0.6668
## Neg Pred Value : 0.5308
## Prevalence : 0.5570
## Detection Rate : 0.2963
## Detection Prevalence : 0.4444
## Balanced Accuracy : 0.5988
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : Sesudah SMOTE
## Metode : Naive Bayes
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 345 647
## Merokok 210 750
##
## Accuracy : 0.561
## 95% CI : (0.5386, 0.5831)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1281
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5369
## Specificity : 0.6216
## Pos Pred Value : 0.7812
## Neg Pred Value : 0.3478
## Prevalence : 0.7157
## Detection Rate : 0.3842
## Detection Prevalence : 0.4918
## Balanced Accuracy : 0.5792
##
## 'Positive' Class : Merokok
##
# Run jalankan_model() (KNN, Decision Tree, Naive Bayes) on the SMOTE-processed
# 70:30 training set and evaluate on the numeric-converted 30% test set
res_smote_70 <- jalankan_model(train_70_smote, test_30_num, "70:30", "Sesudah SMOTE")
##
## >>> KNN <<<
## Spesifikasi KNN:
## Parameter Nilai
## 1 Metode KNN
## 2 k (default) 5
## 3 Preprocessing Center + Scale
## 4 Platform R - caret
## Split : 70:30
## Kondisi : Sesudah SMOTE
## Metode : KNN
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2735 807
## Merokok 1155 4085
##
## Accuracy : 0.7766
## 95% CI : (0.7677, 0.7853)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5431
##
## Mcnemar's Test P-Value : 4.728e-15
##
## Sensitivity : 0.8350
## Specificity : 0.7031
## Pos Pred Value : 0.7796
## Neg Pred Value : 0.7722
## Prevalence : 0.5570
## Detection Rate : 0.4652
## Detection Prevalence : 0.5967
## Balanced Accuracy : 0.7691
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : Sesudah SMOTE
## Metode : KNN
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 320 594
## Merokok 513 1502
##
## Accuracy : 0.6221
## 95% CI : (0.6042, 0.6397)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.0979
##
## Mcnemar's Test P-Value : 0.0162
##
## Sensitivity : 0.7166
## Specificity : 0.3842
## Pos Pred Value : 0.7454
## Neg Pred Value : 0.3501
## Prevalence : 0.7156
## Detection Rate : 0.5128
## Detection Prevalence : 0.6879
## Balanced Accuracy : 0.5504
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree <<<
## Spesifikasi Decision Tree:
## Parameter Nilai
## 1 Metode rpart
## 2 Splitting Criterion Gini Index
## 3 cp (default) 0.01
## 4 maxdepth 10
## 5 Platform R - rpart
## Split : 70:30
## Kondisi : Sesudah SMOTE
## Metode : Decision Tree
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 1756 920
## Merokok 2134 3972
##
## Accuracy : 0.6522
## 95% CI : (0.6422, 0.6622)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.272
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8119
## Specificity : 0.4514
## Pos Pred Value : 0.6505
## Neg Pred Value : 0.6562
## Prevalence : 0.5570
## Detection Rate : 0.4523
## Detection Prevalence : 0.6953
## Balanced Accuracy : 0.6317
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : Sesudah SMOTE
## Metode : Decision Tree
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 322 376
## Merokok 511 1720
##
## Accuracy : 0.6972
## 95% CI : (0.6802, 0.7138)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 0.9868
##
## Kappa : 0.2178
##
## Mcnemar's Test P-Value : 6.818e-06
##
## Sensitivity : 0.8206
## Specificity : 0.3866
## Pos Pred Value : 0.7710
## Neg Pred Value : 0.4613
## Prevalence : 0.7156
## Detection Rate : 0.5872
## Detection Prevalence : 0.7617
## Balanced Accuracy : 0.6036
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes <<<
## Spesifikasi Naive Bayes:
## Parameter Nilai
## 1 Metode naive_bayes
## 2 laplace (default) 0
## 3 usekernel (default) FALSE
## 4 adjust (default) 1
## 5 Platform R - naivebayes
## Split : 70:30
## Kondisi : Sesudah SMOTE
## Metode : Naive Bayes
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2589 2349
## Merokok 1301 2543
##
## Accuracy : 0.5844
## 95% CI : (0.574, 0.5947)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : 1.257e-07
##
## Kappa : 0.1804
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5198
## Specificity : 0.6656
## Pos Pred Value : 0.6616
## Neg Pred Value : 0.5243
## Prevalence : 0.5570
## Detection Rate : 0.2896
## Detection Prevalence : 0.4377
## Balanced Accuracy : 0.5927
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : Sesudah SMOTE
## Metode : Naive Bayes
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 553 978
## Merokok 280 1118
##
## Accuracy : 0.5705
## 95% CI : (0.5523, 0.5885)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1575
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5334
## Specificity : 0.6639
## Pos Pred Value : 0.7997
## Neg Pred Value : 0.3612
## Prevalence : 0.7156
## Detection Rate : 0.3817
## Detection Prevalence : 0.4773
## Balanced Accuracy : 0.5986
##
## 'Positive' Class : Merokok
##
# Stack the metric tables ($hasil) of the three splits into one data frame
# and display it
hasil_smote <- bind_rows(res_smote_90$hasil, res_smote_80$hasil, res_smote_70$hasil)
print(hasil_smote)
## Split Kondisi Model Set_Data Accuracy Precision
## Precision...1 90:10 Sesudah SMOTE KNN Training 0.7638151 0.7642597
## Precision...2 90:10 Sesudah SMOTE KNN Testing 0.6317949 0.7411095
## Precision...3 90:10 Sesudah SMOTE Decision Tree Training 0.6548884 0.6535352
## Precision...4 90:10 Sesudah SMOTE Decision Tree Testing 0.6789744 0.7604871
## Precision...5 90:10 Sesudah SMOTE Naive Bayes Training 0.5812079 0.6669519
## Precision...6 90:10 Sesudah SMOTE Naive Bayes Testing 0.5620513 0.8017817
## Precision...7 80:20 Sesudah SMOTE KNN Training 0.7735379 0.7795753
## Precision...8 80:20 Sesudah SMOTE KNN Testing 0.6178279 0.7370721
## Precision...9 80:20 Sesudah SMOTE Decision Tree Training 0.6616519 0.6585754
## Precision...10 80:20 Sesudah SMOTE Decision Tree Testing 0.6828893 0.7610738
## Precision...11 80:20 Sesudah SMOTE Naive Bayes Training 0.5912125 0.6668161
## Precision...12 80:20 Sesudah SMOTE Naive Bayes Testing 0.5609631 0.7812500
## Precision...13 70:30 Sesudah SMOTE KNN Training 0.7765885 0.7795802
## Precision...14 70:30 Sesudah SMOTE KNN Testing 0.6220553 0.7454094
## Precision...15 70:30 Sesudah SMOTE Decision Tree Training 0.6522432 0.6505077
## Precision...16 70:30 Sesudah SMOTE Decision Tree Testing 0.6971663 0.7709547
## Precision...17 70:30 Sesudah SMOTE Naive Bayes Training 0.5843771 0.6615505
## Precision...18 70:30 Sesudah SMOTE Naive Bayes Testing 0.5705019 0.7997139
## Recall F1_Score Specificity Kappa
## Precision...1 0.8329094 0.7971092 0.6769292 0.51580380
## Precision...2 0.7464183 0.7437545 0.3429603 0.08986783
## Precision...3 0.8096979 0.7232834 0.4602159 0.27848119
## Precision...4 0.8051576 0.7821851 0.3610108 0.17393234
## Precision...5 0.4957075 0.5687187 0.6887245 0.17853030
## Precision...6 0.5157593 0.6277245 0.6787004 0.15298634
## Precision...7 0.8274012 0.8027766 0.7058030 0.53744984
## Precision...8 0.7244094 0.7306859 0.3495495 0.07300758
## Precision...9 0.8152388 0.7285806 0.4685110 0.29271613
## Precision...10 0.8117394 0.7855906 0.3585586 0.17935511
## Precision...11 0.5319263 0.5917819 0.6657670 0.19267445
## Precision...12 0.5368647 0.6364022 0.6216216 0.12809180
## Precision...13 0.8350368 0.8063561 0.7030848 0.54309738
## Precision...14 0.7166031 0.7307225 0.3841537 0.09788887
## Precision...15 0.8119379 0.7223131 0.4514139 0.27204662
## Precision...16 0.8206107 0.7950081 0.3865546 0.21780010
## Precision...17 0.5198283 0.5821886 0.6655527 0.18040353
## Precision...18 0.5333969 0.6399542 0.6638655 0.15750004
Proses hyperparameter tuning dilakukan menggunakan Grid Search dengan 5-Fold Cross Validation. Pendekatan ini digunakan untuk memperoleh kombinasi parameter terbaik berdasarkan hasil evaluasi pada lima lipatan data pelatihan.
# Resampling configuration shared by every grid search below:
# 5-fold cross-validation, grid search strategy, class probabilities
# enabled, and caret's default summary function.
set.seed(123)
ctrl_grid <- trainControl(method = "cv", number = 5, search = "grid",
                          classProbs = TRUE,
                          summaryFunction = defaultSummary)
Hyperparameter tuning dengan metode Grid Search dilakukan pada data hasil SMOTE untuk memperoleh kombinasi parameter yang optimal pada model Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Parameter terbaik yang diperoleh kemudian digunakan dalam proses pemodelan dan evaluasi model.
#' Grid-search tuning of KNN, Decision Tree and Naive Bayes
#'
#' Tunes each model with caret::train() over a fixed grid, prints the grid
#' results and a small specification table, then evaluates the tuned model on
#' the training and the test set through evaluasi_model().
#'
#' @param train_data Data frame holding the outcome factor `Y` plus the
#'   predictors (in this file: the SMOTE-processed training sets).
#' @param test_data Data frame with the same columns, used for testing.
#' @param split_name Label of the split (e.g. "90:10"), forwarded to
#'   evaluasi_model().
#' @param ctrl caret trainControl object used by all three searches. Defaults
#'   to the file-level `ctrl_grid` (looked up lazily when the function runs).
#' @param maxdepth Maximum depth of the decision tree. Applied both while
#'   cross-validating cp and in the final rpart fit.
#' @return A list with `hasil` (stacked metric rows), `knn` and `nb` (caret
#'   train objects), `dt` (final rpart tree) and `dt_cv` (caret train object
#'   of the cp search).
tuning_model_grid <- function(train_data, test_data, split_name,
                              ctrl = ctrl_grid, maxdepth = 10) {
  # NOTE(review): in this file train_data has already been oversampled before
  # cross-validation, so validation folds may contain synthetic rows derived
  # from rows in the fitting folds. CV accuracy is then likely optimistic;
  # consider resampling inside CV instead -- TODO confirm with the author.
  hasil_tuning <- data.frame()
  kondisi <- "SMOTE + Grid Search"

  train_x <- train_data %>% dplyr::select(-Y)
  train_y <- train_data$Y
  test_x <- test_data %>% dplyr::select(-Y)
  test_y <- test_data$Y

  # Label for the spec tables, derived from the control object actually used
  cv_label <- if (identical(ctrl$method, "cv")) {
    paste0(ctrl$number, "-Fold CV")
  } else {
    paste0(ctrl$method, " (", ctrl$number, ")")
  }

  # Evaluate one fitted model on training then testing data (same order as
  # before, so the printed output order is unchanged) and append both rows.
  evaluasi_train_test <- function(hasil, predict_fun, nama_model) {
    pred_train <- factor(predict_fun(train_x), levels = levels(train_y))
    hasil <- bind_rows(hasil,
                       evaluasi_model(train_y, pred_train,
                                      nama_model, split_name, kondisi,
                                      "Training"))
    pred_test <- factor(predict_fun(test_x), levels = levels(test_y))
    bind_rows(hasil,
              evaluasi_model(test_y, pred_test,
                             nama_model, split_name, kondisi, "Testing"))
  }

  # A. KNN -- grid over k
  cat("\n>>> KNN Grid Search <<<\n")
  grid_knn <- expand.grid(k = c(3, 5, 7, 9, 11, 13, 15))
  set.seed(123)
  tune_knn <- train(Y ~ ., data = train_data, method = "knn",
                    trControl = ctrl,
                    tuneGrid = grid_knn,
                    preProcess = c("center", "scale"),
                    metric = "Accuracy")
  cat("\nHasil Grid KNN:\n"); print(tune_knn$results)
  cat("\nBest Tune KNN:\n"); print(tune_knn$bestTune)
  spek_knn_tuning <- data.frame(
    Parameter = c("Metode Tuning", "Search Strategy", "Cross-Validation",
                  "Grid k", "Best k"),
    Nilai = c("Grid Search", "Grid", cv_label,
              paste(grid_knn$k, collapse = ", "),
              as.character(tune_knn$bestTune$k))
  )
  print(spek_knn_tuning)
  hasil_tuning <- evaluasi_train_test(
    hasil_tuning, function(x) predict(tune_knn, x), "KNN Tuning"
  )

  # B. Decision Tree -- grid over cp
  cat("\n>>> Decision Tree Grid Search <<<\n")
  grid_dt <- expand.grid(cp = c(0.0001, 0.001, 0.005, 0.01, 0.05, 0.1))
  set.seed(123)
  # Pass the same depth limit to the cross-validated fits so that the cp
  # chosen here is validated for the tree that is actually refit below.
  tune_dt_cv <- train(Y ~ ., data = train_data, method = "rpart",
                      trControl = ctrl,
                      tuneGrid = grid_dt,
                      control = rpart.control(maxdepth = maxdepth),
                      metric = "Accuracy")
  cat("\nHasil Grid Decision Tree:\n"); print(tune_dt_cv$results)
  cat("\nBest Tune Decision Tree:\n"); print(tune_dt_cv$bestTune)
  spek_dt_tuning <- data.frame(
    Parameter = c("Metode Tuning", "Search Strategy", "Cross-Validation",
                  "Grid cp", "Splitting Criterion", "Best cp", "maxdepth"),
    Nilai = c("Grid Search", "Grid", cv_label,
              paste(grid_dt$cp, collapse = ", "),
              "Gini Index",
              as.character(round(tune_dt_cv$bestTune$cp, 6)),
              as.character(maxdepth))
  )
  print(spek_dt_tuning)
  # Refit a plain rpart tree with the best cp; this is the object returned
  # as `dt`.
  tune_dt <- rpart(Y ~ ., data = train_data, method = "class",
                   control = rpart.control(cp = tune_dt_cv$bestTune$cp,
                                           maxdepth = maxdepth))
  hasil_tuning <- evaluasi_train_test(
    hasil_tuning, function(x) predict(tune_dt, x, type = "class"),
    "Decision Tree Tuning"
  )

  # C. Naive Bayes -- grid over laplace, usekernel, adjust
  cat("\n>>> Naive Bayes Grid Search <<<\n")
  grid_nb <- expand.grid(
    laplace = c(0, 0.5, 1),
    usekernel = c(FALSE, TRUE),
    adjust = c(0.5, 1, 1.5, 2)
  )
  set.seed(123)
  tune_nb <- train(Y ~ ., data = train_data, method = "naive_bayes",
                   trControl = ctrl,
                   tuneGrid = grid_nb,
                   metric = "Accuracy")
  cat("\nHasil Grid Naive Bayes:\n"); print(tune_nb$results)
  cat("\nBest Tune Naive Bayes:\n"); print(tune_nb$bestTune)
  spek_nb_tuning <- data.frame(
    Parameter = c("Metode Tuning", "Search Strategy", "Cross-Validation",
                  "Grid laplace", "Grid usekernel", "Grid adjust",
                  "Best laplace", "Best usekernel", "Best adjust"),
    Nilai = c("Grid Search", "Grid", cv_label,
              paste(unique(grid_nb$laplace), collapse = ", "),
              paste(unique(grid_nb$usekernel), collapse = ", "),
              paste(unique(grid_nb$adjust), collapse = ", "),
              as.character(tune_nb$bestTune$laplace),
              as.character(tune_nb$bestTune$usekernel),
              as.character(tune_nb$bestTune$adjust))
  )
  print(spek_nb_tuning)
  hasil_tuning <- evaluasi_train_test(
    hasil_tuning, function(x) predict(tune_nb, x), "Naive Bayes Tuning"
  )

  list(
    hasil = hasil_tuning,
    knn = tune_knn,
    dt = tune_dt,
    dt_cv = tune_dt_cv,
    nb = tune_nb
  )
}
Pada tahap ini dilakukan proses hyperparameter tuning menggunakan metode Grid Search pada data hasil SMOTE untuk memperoleh kombinasi parameter terbaik pada model klasifikasi, yaitu Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN).
# Grid-search tuning on the SMOTE-processed 90:10 training set, evaluated on
# the numeric-converted 10% test set
tuning_90 <- tuning_model_grid(train_90_smote, test_10_num, "90:10")
##
## >>> KNN Grid Search <<<
##
## Hasil Grid KNN:
## k Accuracy Kappa AccuracySD KappaSD
## 1 3 0.6526736 0.2891045 0.006751281 0.014633744
## 2 5 0.6483350 0.2785450 0.004970076 0.010820510
## 3 7 0.6462088 0.2728919 0.005466383 0.011393553
## 4 9 0.6436411 0.2661629 0.002691579 0.006242274
## 5 11 0.6412496 0.2606083 0.004882112 0.010561670
## 6 13 0.6393010 0.2556801 0.008963066 0.019413253
## 7 15 0.6377956 0.2510948 0.008086724 0.017535153
##
## Best Tune KNN:
## k
## 1 3
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid k 3, 5, 7, 9, 11, 13, 15
## 5 Best k 3
## Split : 90:10
## Kondisi : SMOTE + Grid Search
## Metode : KNN Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 3792 886
## Merokok 1210 5404
##
## Accuracy : 0.8144
## 95% CI : (0.8071, 0.8215)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6214
##
## Mcnemar's Test P-Value : 1.724e-12
##
## Sensitivity : 0.8591
## Specificity : 0.7581
## Pos Pred Value : 0.8171
## Neg Pred Value : 0.8106
## Prevalence : 0.5570
## Detection Rate : 0.4786
## Detection Prevalence : 0.5857
## Balanced Accuracy : 0.8086
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : SMOTE + Grid Search
## Metode : KNN Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 89 188
## Merokok 188 510
##
## Accuracy : 0.6144
## 95% CI : (0.583, 0.645)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.052
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.7307
## Specificity : 0.3213
## Pos Pred Value : 0.7307
## Neg Pred Value : 0.3213
## Prevalence : 0.7159
## Detection Rate : 0.5231
## Detection Prevalence : 0.7159
## Balanced Accuracy : 0.5260
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree Grid Search <<<
##
## Hasil Grid Decision Tree:
## cp Accuracy Kappa AccuracySD KappaSD
## 1 1e-04 0.6871226 0.3562079 0.006777489 0.01519806
## 2 1e-03 0.6948268 0.3612574 0.008934419 0.01877888
## 3 5e-03 0.6592271 0.2842918 0.007847472 0.01622820
## 4 1e-02 0.6480692 0.2619817 0.011089281 0.02330709
## 5 5e-02 0.6152145 0.1799497 0.005137419 0.01091414
## 6 1e-01 0.6152145 0.1799497 0.005137419 0.01091414
##
## Best Tune Decision Tree:
## cp
## 2 0.001
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid cp 1e-04, 0.001, 0.005, 0.01, 0.05, 0.1
## 5 Splitting Criterion Gini Index
## 6 Best cp 0.001
## 7 maxdepth 10
## Split : 90:10
## Kondisi : SMOTE + Grid Search
## Metode : Decision Tree Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2391 707
## Merokok 2611 5583
##
## Accuracy : 0.7062
## 95% CI : (0.6977, 0.7146)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3804
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8876
## Specificity : 0.4780
## Pos Pred Value : 0.6814
## Neg Pred Value : 0.7718
## Prevalence : 0.5570
## Detection Rate : 0.4944
## Detection Prevalence : 0.7256
## Balanced Accuracy : 0.6828
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : SMOTE + Grid Search
## Metode : Decision Tree Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 67 84
## Merokok 210 614
##
## Accuracy : 0.6985
## 95% CI : (0.6686, 0.7271)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 0.8925
##
## Kappa : 0.1409
##
## Mcnemar's Test P-Value : 3.096e-13
##
## Sensitivity : 0.8797
## Specificity : 0.2419
## Pos Pred Value : 0.7451
## Neg Pred Value : 0.4437
## Prevalence : 0.7159
## Detection Rate : 0.6297
## Detection Prevalence : 0.8451
## Balanced Accuracy : 0.5608
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes Grid Search <<<
##
## Hasil Grid Naive Bayes:
## laplace usekernel adjust Accuracy Kappa AccuracySD KappaSD
## 1 0.0 FALSE 0.5 0.5757183 0.1690515 0.012728918 0.022399680
## 2 0.0 FALSE 1.0 0.5757183 0.1690515 0.012728918 0.022399680
## 3 0.0 FALSE 1.5 0.5757183 0.1690515 0.012728918 0.022399680
## 4 0.0 FALSE 2.0 0.5757183 0.1690515 0.012728918 0.022399680
## 5 0.0 TRUE 0.5 0.6923478 0.3724414 0.005540614 0.012912229
## 6 0.0 TRUE 1.0 0.6541803 0.2987830 0.005573450 0.008554418
## 7 0.0 TRUE 1.5 0.6250451 0.2506262 0.011094902 0.018577419
## 8 0.0 TRUE 2.0 0.6047652 0.2178888 0.007280343 0.009985229
## 9 0.5 FALSE 0.5 0.5757183 0.1690515 0.012728918 0.022399680
## 10 0.5 FALSE 1.0 0.5757183 0.1690515 0.012728918 0.022399680
## 11 0.5 FALSE 1.5 0.5757183 0.1690515 0.012728918 0.022399680
## 12 0.5 FALSE 2.0 0.5757183 0.1690515 0.012728918 0.022399680
## 13 0.5 TRUE 0.5 0.6923478 0.3724414 0.005540614 0.012912229
## 14 0.5 TRUE 1.0 0.6541803 0.2987830 0.005573450 0.008554418
## 15 0.5 TRUE 1.5 0.6250451 0.2506262 0.011094902 0.018577419
## 16 0.5 TRUE 2.0 0.6047652 0.2178888 0.007280343 0.009985229
## 17 1.0 FALSE 0.5 0.5757183 0.1690515 0.012728918 0.022399680
## 18 1.0 FALSE 1.0 0.5757183 0.1690515 0.012728918 0.022399680
## 19 1.0 FALSE 1.5 0.5757183 0.1690515 0.012728918 0.022399680
## 20 1.0 FALSE 2.0 0.5757183 0.1690515 0.012728918 0.022399680
## 21 1.0 TRUE 0.5 0.6923478 0.3724414 0.005540614 0.012912229
## 22 1.0 TRUE 1.0 0.6541803 0.2987830 0.005573450 0.008554418
## 23 1.0 TRUE 1.5 0.6250451 0.2506262 0.011094902 0.018577419
## 24 1.0 TRUE 2.0 0.6047652 0.2178888 0.007280343 0.009985229
##
## Best Tune Naive Bayes:
## laplace usekernel adjust
## 5 0 TRUE 0.5
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid laplace 0, 0.5, 1
## 5 Grid usekernel FALSE, TRUE
## 6 Grid adjust 0.5, 1, 1.5, 2
## 7 Best laplace 0
## 8 Best usekernel TRUE
## 9 Best adjust 0.5
## Split : 90:10
## Kondisi : SMOTE + Grid Search
## Metode : Naive Bayes Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 3120 1551
## Merokok 1882 4739
##
## Accuracy : 0.696
## 95% CI : (0.6874, 0.7045)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3797
##
## Mcnemar's Test P-Value : 1.779e-08
##
## Sensitivity : 0.7534
## Specificity : 0.6238
## Pos Pred Value : 0.7158
## Neg Pred Value : 0.6680
## Prevalence : 0.5570
## Detection Rate : 0.4197
## Detection Prevalence : 0.5863
## Balanced Accuracy : 0.6886
##
## 'Positive' Class : Merokok
##
## Split : 90:10
## Kondisi : SMOTE + Grid Search
## Metode : Naive Bayes Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 115 162
## Merokok 162 536
##
## Accuracy : 0.6677
## 95% CI : (0.6371, 0.6972)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 0.9996
##
## Kappa : 0.1831
##
## Mcnemar's Test P-Value : 1.0000
##
## Sensitivity : 0.7679
## Specificity : 0.4152
## Pos Pred Value : 0.7679
## Neg Pred Value : 0.4152
## Prevalence : 0.7159
## Detection Rate : 0.5497
## Detection Prevalence : 0.7159
## Balanced Accuracy : 0.5915
##
## 'Positive' Class : Merokok
##
# Run the grid-search tuning pipeline (tuning_model_grid, defined elsewhere in
# this file) for the 80:20 split. Judging by the argument names, the first
# argument is the SMOTE-balanced training set and the second the numeric test
# set -- TODO confirm against the function definition. The returned list is
# read later through its $knn, $dt_cv, $nb and $hasil components.
tuning_80 <- tuning_model_grid(train_80_smote, test_20_num, "80:20")
##
## >>> KNN Grid Search <<<
##
## Hasil Grid KNN:
## k Accuracy Kappa AccuracySD KappaSD
## 1 3 0.6603563 0.3061821 0.002219518 0.003602833
## 2 5 0.6578639 0.2981994 0.010000064 0.020695645
## 3 7 0.6533814 0.2877321 0.005278955 0.010749839
## 4 9 0.6476025 0.2749781 0.008234597 0.016831306
## 5 11 0.6447138 0.2680527 0.004509737 0.009778393
## 6 13 0.6428203 0.2630049 0.007938468 0.016046519
## 7 15 0.6424224 0.2609627 0.005359674 0.011055896
##
## Best Tune KNN:
## k
## 1 3
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid k 3, 5, 7, 9, 11, 13, 15
## 5 Best k 3
## Split : 80:20
## Kondisi : SMOTE + Grid Search
## Metode : KNN Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 3475 825
## Merokok 971 4766
##
## Accuracy : 0.8211
## 95% CI : (0.8134, 0.8285)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6362
##
## Mcnemar's Test P-Value : 0.0006228
##
## Sensitivity : 0.8524
## Specificity : 0.7816
## Pos Pred Value : 0.8307
## Neg Pred Value : 0.8081
## Prevalence : 0.5570
## Detection Rate : 0.4748
## Detection Prevalence : 0.5716
## Balanced Accuracy : 0.8170
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : SMOTE + Grid Search
## Metode : KNN Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 179 390
## Merokok 376 1007
##
## Accuracy : 0.6076
## 95% CI : (0.5855, 0.6293)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 1.0000
##
## Kappa : 0.043
##
## Mcnemar's Test P-Value : 0.6386
##
## Sensitivity : 0.7208
## Specificity : 0.3225
## Pos Pred Value : 0.7281
## Neg Pred Value : 0.3146
## Prevalence : 0.7157
## Detection Rate : 0.5159
## Detection Prevalence : 0.7085
## Balanced Accuracy : 0.5217
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree Grid Search <<<
##
## Hasil Grid Decision Tree:
## cp Accuracy Kappa AccuracySD KappaSD
## 1 1e-04 0.6937310 0.3705395 0.010407544 0.023202931
## 2 1e-03 0.6941313 0.3636930 0.004555454 0.009537227
## 3 5e-03 0.6717145 0.3094030 0.003795039 0.009691605
## 4 1e-02 0.6574673 0.2811684 0.005214245 0.014762149
## 5 5e-02 0.6166197 0.1830240 0.006949055 0.016473587
## 6 1e-01 0.6166197 0.1830240 0.006949055 0.016473587
##
## Best Tune Decision Tree:
## cp
## 2 0.001
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid cp 1e-04, 0.001, 0.005, 0.01, 0.05, 0.1
## 5 Splitting Criterion Gini Index
## 6 Best cp 0.001
## 7 maxdepth 10
## Split : 80:20
## Kondisi : SMOTE + Grid Search
## Metode : Decision Tree Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2127 634
## Merokok 2319 4957
##
## Accuracy : 0.7058
## 95% CI : (0.6968, 0.7147)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3797
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8866
## Specificity : 0.4784
## Pos Pred Value : 0.6813
## Neg Pred Value : 0.7704
## Prevalence : 0.5570
## Detection Rate : 0.4939
## Detection Prevalence : 0.7249
## Balanced Accuracy : 0.6825
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : SMOTE + Grid Search
## Metode : Decision Tree Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 147 182
## Merokok 408 1215
##
## Accuracy : 0.6977
## 95% CI : (0.6768, 0.7181)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 0.9619
##
## Kappa : 0.1534
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8697
## Specificity : 0.2649
## Pos Pred Value : 0.7486
## Neg Pred Value : 0.4468
## Prevalence : 0.7157
## Detection Rate : 0.6224
## Detection Prevalence : 0.8315
## Balanced Accuracy : 0.5673
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes Grid Search <<<
##
## Hasil Grid Naive Bayes:
## laplace usekernel adjust Accuracy Kappa AccuracySD KappaSD
## 1 0.0 FALSE 0.5 0.5891196 0.1893072 0.01244551 0.01948005
## 2 0.0 FALSE 1.0 0.5891196 0.1893072 0.01244551 0.01948005
## 3 0.0 FALSE 1.5 0.5891196 0.1893072 0.01244551 0.01948005
## 4 0.0 FALSE 2.0 0.5891196 0.1893072 0.01244551 0.01948005
## 5 0.0 TRUE 0.5 0.6692221 0.3345024 0.01880158 0.02933722
## 6 0.0 TRUE 1.0 0.6319608 0.2654318 0.01792967 0.02585402
## 7 0.0 TRUE 1.5 0.6105399 0.2333733 0.02231004 0.03222596
## 8 0.0 TRUE 2.0 0.5962928 0.2092832 0.01965208 0.02728571
## 9 0.5 FALSE 0.5 0.5891196 0.1893072 0.01244551 0.01948005
## 10 0.5 FALSE 1.0 0.5891196 0.1893072 0.01244551 0.01948005
## 11 0.5 FALSE 1.5 0.5891196 0.1893072 0.01244551 0.01948005
## 12 0.5 FALSE 2.0 0.5891196 0.1893072 0.01244551 0.01948005
## 13 0.5 TRUE 0.5 0.6692221 0.3345024 0.01880158 0.02933722
## 14 0.5 TRUE 1.0 0.6319608 0.2654318 0.01792967 0.02585402
## 15 0.5 TRUE 1.5 0.6105399 0.2333733 0.02231004 0.03222596
## 16 0.5 TRUE 2.0 0.5962928 0.2092832 0.01965208 0.02728571
## 17 1.0 FALSE 0.5 0.5891196 0.1893072 0.01244551 0.01948005
## 18 1.0 FALSE 1.0 0.5891196 0.1893072 0.01244551 0.01948005
## 19 1.0 FALSE 1.5 0.5891196 0.1893072 0.01244551 0.01948005
## 20 1.0 FALSE 2.0 0.5891196 0.1893072 0.01244551 0.01948005
## 21 1.0 TRUE 0.5 0.6692221 0.3345024 0.01880158 0.02933722
## 22 1.0 TRUE 1.0 0.6319608 0.2654318 0.01792967 0.02585402
## 23 1.0 TRUE 1.5 0.6105399 0.2333733 0.02231004 0.03222596
## 24 1.0 TRUE 2.0 0.5962928 0.2092832 0.01965208 0.02728571
##
## Best Tune Naive Bayes:
## laplace usekernel adjust
## 5 0 TRUE 0.5
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid laplace 0, 0.5, 1
## 5 Grid usekernel FALSE, TRUE
## 6 Grid adjust 0.5, 1, 1.5, 2
## 7 Best laplace 0
## 8 Best usekernel TRUE
## 9 Best adjust 0.5
## Split : 80:20
## Kondisi : SMOTE + Grid Search
## Metode : Naive Bayes Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2968 1832
## Merokok 1478 3759
##
## Accuracy : 0.6702
## 95% CI : (0.6609, 0.6794)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3371
##
## Mcnemar's Test P-Value : 8.481e-10
##
## Sensitivity : 0.6723
## Specificity : 0.6676
## Pos Pred Value : 0.7178
## Neg Pred Value : 0.6183
## Prevalence : 0.5570
## Detection Rate : 0.3745
## Detection Prevalence : 0.5218
## Balanced Accuracy : 0.6699
##
## 'Positive' Class : Merokok
##
## Split : 80:20
## Kondisi : SMOTE + Grid Search
## Metode : Naive Bayes Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 281 433
## Merokok 274 964
##
## Accuracy : 0.6378
## 95% CI : (0.616, 0.6592)
## No Information Rate : 0.7157
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1808
##
## Mcnemar's Test P-Value : 2.812e-09
##
## Sensitivity : 0.6901
## Specificity : 0.5063
## Pos Pred Value : 0.7787
## Neg Pred Value : 0.3936
## Prevalence : 0.7157
## Detection Rate : 0.4939
## Detection Prevalence : 0.6342
## Balanced Accuracy : 0.5982
##
## 'Positive' Class : Merokok
##
# Run the grid-search tuning pipeline (tuning_model_grid, defined elsewhere in
# this file) for the 70:30 split. Judging by the argument names, the first
# argument is the SMOTE-balanced training set and the second the numeric test
# set -- TODO confirm against the function definition. The returned list is
# read later through its $knn, $dt_cv, $nb and $hasil components.
tuning_70 <- tuning_model_grid(train_70_smote, test_30_num, "70:30")
##
## >>> KNN Grid Search <<<
##
## Hasil Grid KNN:
## k Accuracy Kappa AccuracySD KappaSD
## 1 3 0.6654539 0.3169267 0.01203198 0.02371039
## 2 5 0.6581671 0.2994442 0.01358479 0.02928201
## 3 7 0.6513347 0.2831086 0.01310253 0.02631077
## 4 9 0.6466662 0.2727664 0.01174490 0.02290493
## 5 11 0.6397197 0.2576399 0.01200059 0.02403883
## 6 13 0.6415406 0.2607905 0.01445131 0.02948253
## 7 15 0.6391503 0.2547549 0.01519120 0.03127271
##
## Best Tune KNN:
## k
## 1 3
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid k 3, 5, 7, 9, 11, 13, 15
## 5 Best k 3
## Split : 70:30
## Kondisi : SMOTE + Grid Search
## Metode : KNN Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 3036 671
## Merokok 854 4221
##
## Accuracy : 0.8263
## 95% CI : (0.8183, 0.8342)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6464
##
## Mcnemar's Test P-Value : 3.154e-06
##
## Sensitivity : 0.8628
## Specificity : 0.7805
## Pos Pred Value : 0.8317
## Neg Pred Value : 0.8190
## Prevalence : 0.5570
## Detection Rate : 0.4806
## Detection Prevalence : 0.5779
## Balanced Accuracy : 0.8217
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : SMOTE + Grid Search
## Metode : KNN Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 318 587
## Merokok 515 1509
##
## Accuracy : 0.6238
## 95% CI : (0.6059, 0.6413)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 1.00000
##
## Kappa : 0.0991
##
## Mcnemar's Test P-Value : 0.03245
##
## Sensitivity : 0.7199
## Specificity : 0.3818
## Pos Pred Value : 0.7456
## Neg Pred Value : 0.3514
## Prevalence : 0.7156
## Detection Rate : 0.5152
## Detection Prevalence : 0.6910
## Balanced Accuracy : 0.5508
##
## 'Positive' Class : Merokok
##
##
## >>> Decision Tree Grid Search <<<
##
## Hasil Grid Decision Tree:
## cp Accuracy Kappa AccuracySD KappaSD
## 1 1e-04 0.6938071 0.3704462 0.012072840 0.023340879
## 2 1e-03 0.6876561 0.3463824 0.007264530 0.014419113
## 3 5e-03 0.6629465 0.2835895 0.005438087 0.013400742
## 4 1e-02 0.6437012 0.2536315 0.010797773 0.023369060
## 5 5e-02 0.6032791 0.1476151 0.002392875 0.009688167
## 6 1e-01 0.6032791 0.1476151 0.002392875 0.009688167
##
## Best Tune Decision Tree:
## cp
## 1 1e-04
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid cp 1e-04, 0.001, 0.005, 0.01, 0.05, 0.1
## 5 Splitting Criterion Gini Index
## 6 Best cp 1e-04
## 7 maxdepth 10
## Split : 70:30
## Kondisi : SMOTE + Grid Search
## Metode : Decision Tree Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 1927 508
## Merokok 1963 4384
##
## Accuracy : 0.7186
## 95% CI : (0.7091, 0.728)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4071
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8962
## Specificity : 0.4954
## Pos Pred Value : 0.6907
## Neg Pred Value : 0.7914
## Prevalence : 0.5570
## Detection Rate : 0.4992
## Detection Prevalence : 0.7227
## Balanced Accuracy : 0.6958
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : SMOTE + Grid Search
## Metode : Decision Tree Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 221 250
## Merokok 612 1846
##
## Accuracy : 0.7057
## 95% CI : (0.6888, 0.7222)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 0.8863
##
## Kappa : 0.168
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8807
## Specificity : 0.2653
## Pos Pred Value : 0.7510
## Neg Pred Value : 0.4692
## Prevalence : 0.7156
## Detection Rate : 0.6302
## Detection Prevalence : 0.8392
## Balanced Accuracy : 0.5730
##
## 'Positive' Class : Merokok
##
##
## >>> Naive Bayes Grid Search <<<
##
## Hasil Grid Naive Bayes:
## laplace usekernel adjust Accuracy Kappa AccuracySD KappaSD
## 1 0.0 FALSE 0.5 0.5824406 0.1769451 0.008527046 0.01581122
## 2 0.0 FALSE 1.0 0.5824406 0.1769451 0.008527046 0.01581122
## 3 0.0 FALSE 1.5 0.5824406 0.1769451 0.008527046 0.01581122
## 4 0.0 FALSE 2.0 0.5824406 0.1769451 0.008527046 0.01581122
## 5 0.0 TRUE 0.5 0.6709171 0.3354427 0.006595026 0.01107505
## 6 0.0 TRUE 1.0 0.6323153 0.2633345 0.008541983 0.01555615
## 7 0.0 TRUE 1.5 0.6068084 0.2233559 0.010308988 0.01873607
## 8 0.0 TRUE 2.0 0.5946238 0.2032387 0.009400893 0.01646852
## 9 0.5 FALSE 0.5 0.5824406 0.1769451 0.008527046 0.01581122
## 10 0.5 FALSE 1.0 0.5824406 0.1769451 0.008527046 0.01581122
## 11 0.5 FALSE 1.5 0.5824406 0.1769451 0.008527046 0.01581122
## 12 0.5 FALSE 2.0 0.5824406 0.1769451 0.008527046 0.01581122
## 13 0.5 TRUE 0.5 0.6709171 0.3354427 0.006595026 0.01107505
## 14 0.5 TRUE 1.0 0.6323153 0.2633345 0.008541983 0.01555615
## 15 0.5 TRUE 1.5 0.6068084 0.2233559 0.010308988 0.01873607
## 16 0.5 TRUE 2.0 0.5946238 0.2032387 0.009400893 0.01646852
## 17 1.0 FALSE 0.5 0.5824406 0.1769451 0.008527046 0.01581122
## 18 1.0 FALSE 1.0 0.5824406 0.1769451 0.008527046 0.01581122
## 19 1.0 FALSE 1.5 0.5824406 0.1769451 0.008527046 0.01581122
## 20 1.0 FALSE 2.0 0.5824406 0.1769451 0.008527046 0.01581122
## 21 1.0 TRUE 0.5 0.6709171 0.3354427 0.006595026 0.01107505
## 22 1.0 TRUE 1.0 0.6323153 0.2633345 0.008541983 0.01555615
## 23 1.0 TRUE 1.5 0.6068084 0.2233559 0.010308988 0.01873607
## 24 1.0 TRUE 2.0 0.5946238 0.2032387 0.009400893 0.01646852
##
## Best Tune Naive Bayes:
## laplace usekernel adjust
## 5 0 TRUE 0.5
## Parameter Nilai
## 1 Metode Tuning Grid Search
## 2 Search Strategy Grid
## 3 Cross-Validation 5-Fold CV
## 4 Grid laplace 0, 0.5, 1
## 5 Grid usekernel FALSE, TRUE
## 6 Grid adjust 0.5, 1, 1.5, 2
## 7 Best laplace 0
## 8 Best usekernel TRUE
## 9 Best adjust 0.5
## Split : 70:30
## Kondisi : SMOTE + Grid Search
## Metode : Naive Bayes Tuning
## Set Data : Training
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 2514 1469
## Merokok 1376 3423
##
## Accuracy : 0.676
## 95% CI : (0.6661, 0.6858)
## No Information Rate : 0.557
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.3451
##
## Mcnemar's Test P-Value : 0.08456
##
## Sensitivity : 0.6997
## Specificity : 0.6463
## Pos Pred Value : 0.7133
## Neg Pred Value : 0.6312
## Prevalence : 0.5570
## Detection Rate : 0.3898
## Detection Prevalence : 0.5465
## Balanced Accuracy : 0.6730
##
## 'Positive' Class : Merokok
##
## Split : 70:30
## Kondisi : SMOTE + Grid Search
## Metode : Naive Bayes Tuning
## Set Data : Testing
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 437 587
## Merokok 396 1509
##
## Accuracy : 0.6644
## 95% CI : (0.647, 0.6815)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2287
##
## Mcnemar's Test P-Value : 1.361e-09
##
## Sensitivity : 0.7199
## Specificity : 0.5246
## Pos Pred Value : 0.7921
## Neg Pred Value : 0.4268
## Prevalence : 0.7156
## Detection Rate : 0.5152
## Detection Prevalence : 0.6504
## Balanced Accuracy : 0.6223
##
## 'Positive' Class : Merokok
##
Berdasarkan proses Grid Search pada data hasil SMOTE, diperoleh parameter optimal untuk model Decision Tree, Naive Bayes, dan K-Nearest Neighbor (KNN). Parameter tersebut digunakan sebagai dasar dalam pembentukan model klasifikasi yang akan dievaluasi pada tahap selanjutnya. Hasil parameter optimal masing-masing model disajikan pada tabel berikut.
# --- KNN: best k selected by the grid search, one row per split ---
best_knn <- data.frame(
  Model = "KNN",
  Split = c("90:10", "80:20", "70:30"),
  # Tuned k for each split, in the same order as the Split labels above.
  k = unlist(lapply(
    list(tuning_90, tuning_80, tuning_70),
    function(res) res$knn$bestTune$k
  ))
)
cat("\n--- Parameter Terbaik KNN ---\n")
##
## --- Parameter Terbaik KNN ---
print(best_knn)
## Model Split k
## 1 KNN 90:10 3
## 2 KNN 80:20 3
## 3 KNN 70:30 3
# --- Decision Tree: best cp selected by the grid search, one row per split ---
best_dt <- data.frame(
  Model = "Decision Tree",
  Split = c("90:10", "80:20", "70:30"),
  # Tuned complexity parameter for each split, rounded to 6 decimals.
  cp = round(
    unlist(lapply(
      list(tuning_90, tuning_80, tuning_70),
      function(res) res$dt_cv$bestTune$cp
    )),
    digits = 6
  )
)
cat("\n--- Parameter Terbaik Decision Tree ---\n")
##
## --- Parameter Terbaik Decision Tree ---
print(best_dt)
## Model Split cp
## 1 Decision Tree 90:10 1e-03
## 2 Decision Tree 80:20 1e-03
## 3 Decision Tree 70:30 1e-04
# --- Naive Bayes: best laplace / usekernel / adjust per split ---
best_nb <- local({
  # bestTune rows of the three Naive Bayes fits, ordered 90:10, 80:20, 70:30.
  nb_tunes <- lapply(
    list(tuning_90, tuning_80, tuning_70),
    function(res) res$nb$bestTune
  )
  # Pull one tuning parameter out of every bestTune row as a plain vector.
  pick <- function(kolom) {
    unlist(lapply(nb_tunes, function(tune) tune[[kolom]]))
  }
  data.frame(
    Model = "Naive Bayes",
    Split = c("90:10", "80:20", "70:30"),
    laplace = pick("laplace"),
    usekernel = pick("usekernel"),
    adjust = pick("adjust")
  )
})
cat("\n--- Parameter Terbaik Naive Bayes ---\n")
##
## --- Parameter Terbaik Naive Bayes ---
print(best_nb)
## Model Split laplace usekernel adjust
## 1 Naive Bayes 90:10 0 TRUE 0.5
## 2 Naive Bayes 80:20 0 TRUE 0.5
## 3 Naive Bayes 70:30 0 TRUE 0.5
Pada tahap ini dilakukan evaluasi model klasifikasi dengan menggunakan parameter optimal yang telah diperoleh dari proses hyperparameter tuning Grid Search. Evaluasi dilakukan untuk mengukur kinerja masing-masing model pada data training dan testing. Hasil evaluasi ini menjadi dasar dalam menentukan model terbaik.
# Evaluation results: stack the metric tables ($hasil) of the three tuned
# splits into one data frame, in the order 90:10, 80:20, 70:30.
hasil_tuning_semua <- bind_rows(
  lapply(
    list(tuning_90, tuning_80, tuning_70),
    function(res) res$hasil
  )
)
print(hasil_tuning_semua)
## Split Kondisi Model Set_Data
## Precision...1 90:10 SMOTE + Grid Search KNN Tuning Training
## Precision...2 90:10 SMOTE + Grid Search KNN Tuning Testing
## Precision...3 90:10 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...4 90:10 SMOTE + Grid Search Decision Tree Tuning Testing
## Precision...5 90:10 SMOTE + Grid Search Naive Bayes Tuning Training
## Precision...6 90:10 SMOTE + Grid Search Naive Bayes Tuning Testing
## Precision...7 80:20 SMOTE + Grid Search KNN Tuning Training
## Precision...8 80:20 SMOTE + Grid Search KNN Tuning Testing
## Precision...9 80:20 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...10 80:20 SMOTE + Grid Search Decision Tree Tuning Testing
## Precision...11 80:20 SMOTE + Grid Search Naive Bayes Tuning Training
## Precision...12 80:20 SMOTE + Grid Search Naive Bayes Tuning Testing
## Precision...13 70:30 SMOTE + Grid Search KNN Tuning Training
## Precision...14 70:30 SMOTE + Grid Search KNN Tuning Testing
## Precision...15 70:30 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...16 70:30 SMOTE + Grid Search Decision Tree Tuning Testing
## Precision...17 70:30 SMOTE + Grid Search Naive Bayes Tuning Training
## Precision...18 70:30 SMOTE + Grid Search Naive Bayes Tuning Testing
## Accuracy Precision Recall F1_Score Specificity Kappa
## Precision...1 0.8143819 0.8170547 0.8591415 0.8375697 0.7580968 0.62135903
## Precision...2 0.6143590 0.7306590 0.7306590 0.7306590 0.3212996 0.05195866
## Precision...3 0.7061637 0.6813522 0.8875994 0.7709196 0.4780088 0.38043466
## Precision...4 0.6984615 0.7451456 0.8796562 0.8068331 0.2418773 0.14085588
## Precision...5 0.6959795 0.7157529 0.7534181 0.7341027 0.6237505 0.37974143
## Precision...6 0.6676923 0.7679083 0.7679083 0.7679083 0.4151625 0.18307076
## Precision...7 0.8210621 0.8307478 0.8524414 0.8414548 0.7816014 0.63618206
## Precision...8 0.6075820 0.7281273 0.7208304 0.7244604 0.3225225 0.04302580
## Precision...9 0.7057886 0.6812809 0.8866035 0.7704982 0.4784076 0.37974813
## Precision...10 0.6977459 0.7486137 0.8697208 0.8046358 0.2648649 0.15341177
## Precision...11 0.6702202 0.7177774 0.6723305 0.6943110 0.6675664 0.33714810
## Precision...12 0.6378074 0.7786753 0.6900501 0.7316888 0.5063063 0.18075116
## Precision...13 0.8263494 0.8317241 0.8628373 0.8469951 0.7804627 0.64641433
## Precision...14 0.6237624 0.7455534 0.7199427 0.7325243 0.3817527 0.09911434
## Precision...15 0.7186290 0.6907200 0.8961570 0.7801406 0.4953728 0.40712500
## Precision...16 0.7057016 0.7510171 0.8807252 0.8107159 0.2653061 0.16803208
## Precision...17 0.6760419 0.7132736 0.6997138 0.7064286 0.6462725 0.34514126
## Precision...18 0.6643906 0.7921260 0.7199427 0.7543114 0.5246098 0.22874989
# Fix the RNG state. Nothing in this block draws random numbers, so the seed
# only matters for code that runs afterwards.
set.seed(123)
# Full comparison table: baseline results, SMOTE results, then SMOTE + grid
# search results, stacked in that order.
perbandingan_final <- list(hasil_awal, hasil_smote, hasil_tuning_semua) %>%
  bind_rows()
print(perbandingan_final)
## Split Kondisi Model Set_Data
## Precision...1 90:10 Sebelum SMOTE KNN Training
## Precision...2 90:10 Sebelum SMOTE KNN Testing
## Precision...3 90:10 Sebelum SMOTE Decision Tree Training
## Precision...4 90:10 Sebelum SMOTE Decision Tree Testing
## Precision...5 90:10 Sebelum SMOTE Naive Bayes Training
## Precision...6 90:10 Sebelum SMOTE Naive Bayes Testing
## Precision...7 80:20 Sebelum SMOTE KNN Training
## Precision...8 80:20 Sebelum SMOTE KNN Testing
## Precision...9 80:20 Sebelum SMOTE Decision Tree Training
## Precision...10 80:20 Sebelum SMOTE Decision Tree Testing
## Precision...11 80:20 Sebelum SMOTE Naive Bayes Training
## Precision...12 80:20 Sebelum SMOTE Naive Bayes Testing
## Precision...13 70:30 Sebelum SMOTE KNN Training
## Precision...14 70:30 Sebelum SMOTE KNN Testing
## Precision...15 70:30 Sebelum SMOTE Decision Tree Training
## Precision...16 70:30 Sebelum SMOTE Decision Tree Testing
## Precision...17 70:30 Sebelum SMOTE Naive Bayes Training
## Precision...18 70:30 Sebelum SMOTE Naive Bayes Testing
## Precision...19 90:10 Sesudah SMOTE KNN Training
## Precision...20 90:10 Sesudah SMOTE KNN Testing
## Precision...21 90:10 Sesudah SMOTE Decision Tree Training
## Precision...22 90:10 Sesudah SMOTE Decision Tree Testing
## Precision...23 90:10 Sesudah SMOTE Naive Bayes Training
## Precision...24 90:10 Sesudah SMOTE Naive Bayes Testing
## Precision...25 80:20 Sesudah SMOTE KNN Training
## Precision...26 80:20 Sesudah SMOTE KNN Testing
## Precision...27 80:20 Sesudah SMOTE Decision Tree Training
## Precision...28 80:20 Sesudah SMOTE Decision Tree Testing
## Precision...29 80:20 Sesudah SMOTE Naive Bayes Training
## Precision...30 80:20 Sesudah SMOTE Naive Bayes Testing
## Precision...31 70:30 Sesudah SMOTE KNN Training
## Precision...32 70:30 Sesudah SMOTE KNN Testing
## Precision...33 70:30 Sesudah SMOTE Decision Tree Training
## Precision...34 70:30 Sesudah SMOTE Decision Tree Testing
## Precision...35 70:30 Sesudah SMOTE Naive Bayes Training
## Precision...36 70:30 Sesudah SMOTE Naive Bayes Testing
## Precision...37 90:10 SMOTE + Grid Search KNN Tuning Training
## Precision...38 90:10 SMOTE + Grid Search KNN Tuning Testing
## Precision...39 90:10 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...40 90:10 SMOTE + Grid Search Decision Tree Tuning Testing
## Precision...41 90:10 SMOTE + Grid Search Naive Bayes Tuning Training
## Precision...42 90:10 SMOTE + Grid Search Naive Bayes Tuning Testing
## Precision...43 80:20 SMOTE + Grid Search KNN Tuning Training
## Precision...44 80:20 SMOTE + Grid Search KNN Tuning Testing
## Precision...45 80:20 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...46 80:20 SMOTE + Grid Search Decision Tree Tuning Testing
## Precision...47 80:20 SMOTE + Grid Search Naive Bayes Tuning Training
## Precision...48 80:20 SMOTE + Grid Search Naive Bayes Tuning Testing
## Precision...49 70:30 SMOTE + Grid Search KNN Tuning Training
## Precision...50 70:30 SMOTE + Grid Search KNN Tuning Testing
## Precision...51 70:30 SMOTE + Grid Search Decision Tree Tuning Training
## Precision...52 70:30 SMOTE + Grid Search Decision Tree Tuning Testing
## Precision...53 70:30 SMOTE + Grid Search Naive Bayes Tuning Training
## Precision...54 70:30 SMOTE + Grid Search Naive Bayes Tuning Testing
## Accuracy Precision Recall F1_Score Specificity Kappa
## Precision...1 0.7590718 0.7834239 0.9166932 0.8448352 0.3626549 0.32066914
## Precision...2 0.6758974 0.7323601 0.8624642 0.7921053 0.2057762 0.07889072
## Precision...3 0.7155045 0.7155045 1.0000000 0.8341622 0.0000000 0.00000000
## Precision...4 0.7158974 0.7158974 1.0000000 0.8344292 0.0000000 0.00000000
## Precision...5 0.5042657 0.8039648 0.4062003 0.5397127 0.7508996 0.11427327
## Precision...6 0.4789744 0.7668539 0.3911175 0.5180266 0.7003610 0.06665863
## Precision...7 0.7623496 0.7896370 0.9103917 0.8457257 0.3900135 0.33976551
## Precision...8 0.6618852 0.7333756 0.8289191 0.7782258 0.2414414 0.07807641
## Precision...9 0.7155106 0.7155106 1.0000000 0.8341664 0.0000000 0.00000000
## Precision...10 0.7156762 0.7156762 1.0000000 0.8342789 0.0000000 0.00000000
## Precision...11 0.5327617 0.7964548 0.4661062 0.5880627 0.7004049 0.12670095
## Precision...12 0.5225410 0.7866831 0.4566929 0.5778986 0.6882883 0.10998482
## Precision...13 0.7629077 0.7871817 0.9163941 0.8468877 0.3768638 0.33490458
## Precision...14 0.6828269 0.7416149 0.8544847 0.7940590 0.2509004 0.11913025
## Precision...15 0.7155185 0.7155185 1.0000000 0.8341717 0.0000000 0.00000000
## Precision...16 0.7156026 0.7156026 1.0000000 0.8342289 0.0000000 0.00000000
## Precision...17 0.4591195 0.8061538 0.3213410 0.4595148 0.8056555 0.08724664
## Precision...18 0.4653465 0.8139810 0.3277672 0.4673469 0.8115246 0.09587575
## Precision...19 0.7638151 0.7642597 0.8329094 0.7971092 0.6769292 0.51580380
## Precision...20 0.6317949 0.7411095 0.7464183 0.7437545 0.3429603 0.08986783
## Precision...21 0.6548884 0.6535352 0.8096979 0.7232834 0.4602159 0.27848119
## Precision...22 0.6789744 0.7604871 0.8051576 0.7821851 0.3610108 0.17393234
## Precision...23 0.5812079 0.6669519 0.4957075 0.5687187 0.6887245 0.17853030
## Precision...24 0.5620513 0.8017817 0.5157593 0.6277245 0.6787004 0.15298634
## Precision...25 0.7735379 0.7795753 0.8274012 0.8027766 0.7058030 0.53744984
## Precision...26 0.6178279 0.7370721 0.7244094 0.7306859 0.3495495 0.07300758
## Precision...27 0.6616519 0.6585754 0.8152388 0.7285806 0.4685110 0.29271613
## Precision...28 0.6828893 0.7610738 0.8117394 0.7855906 0.3585586 0.17935511
## Precision...29 0.5912125 0.6668161 0.5319263 0.5917819 0.6657670 0.19267445
## Precision...30 0.5609631 0.7812500 0.5368647 0.6364022 0.6216216 0.12809180
## Precision...31 0.7765885 0.7795802 0.8350368 0.8063561 0.7030848 0.54309738
## Precision...32 0.6220553 0.7454094 0.7166031 0.7307225 0.3841537 0.09788887
## Precision...33 0.6522432 0.6505077 0.8119379 0.7223131 0.4514139 0.27204662
## Precision...34 0.6971663 0.7709547 0.8206107 0.7950081 0.3865546 0.21780010
## Precision...35 0.5843771 0.6615505 0.5198283 0.5821886 0.6655527 0.18040353
## Precision...36 0.5705019 0.7997139 0.5333969 0.6399542 0.6638655 0.15750004
## Precision...37 0.8143819 0.8170547 0.8591415 0.8375697 0.7580968 0.62135903
## Precision...38 0.6143590 0.7306590 0.7306590 0.7306590 0.3212996 0.05195866
## Precision...39 0.7061637 0.6813522 0.8875994 0.7709196 0.4780088 0.38043466
## Precision...40 0.6984615 0.7451456 0.8796562 0.8068331 0.2418773 0.14085588
## Precision...41 0.6959795 0.7157529 0.7534181 0.7341027 0.6237505 0.37974143
## Precision...42 0.6676923 0.7679083 0.7679083 0.7679083 0.4151625 0.18307076
## Precision...43 0.8210621 0.8307478 0.8524414 0.8414548 0.7816014 0.63618206
## Precision...44 0.6075820 0.7281273 0.7208304 0.7244604 0.3225225 0.04302580
## Precision...45 0.7057886 0.6812809 0.8866035 0.7704982 0.4784076 0.37974813
## Precision...46 0.6977459 0.7486137 0.8697208 0.8046358 0.2648649 0.15341177
## Precision...47 0.6702202 0.7177774 0.6723305 0.6943110 0.6675664 0.33714810
## Precision...48 0.6378074 0.7786753 0.6900501 0.7316888 0.5063063 0.18075116
## Precision...49 0.8263494 0.8317241 0.8628373 0.8469951 0.7804627 0.64641433
## Precision...50 0.6237624 0.7455534 0.7199427 0.7325243 0.3817527 0.09911434
## Precision...51 0.7186290 0.6907200 0.8961570 0.7801406 0.4953728 0.40712500
## Precision...52 0.7057016 0.7510171 0.8807252 0.8107159 0.2653061 0.16803208
## Precision...53 0.6760419 0.7132736 0.6997138 0.7064286 0.6462725 0.34514126
## Precision...54 0.6643906 0.7921260 0.7199427 0.7543114 0.5246098 0.22874989
# --- Testing summary: test-set rows only, Set_Data dropped, numeric columns
# rounded to 4 decimals ---
perbandingan_testing <- perbandingan_final %>%
  filter(Set_Data == "Testing") %>%
  dplyr::select(all_of(c(
    "Split", "Kondisi", "Model",
    "Accuracy", "Precision", "Recall", "F1_Score", "Specificity", "Kappa"
  ))) %>%
  mutate(across(
    where(is.numeric),
    function(nilai) round(nilai, digits = 4)
  ))
cat("\n\n========== PENENTUAN METODE TERBAIK ==========\n")
##
##
## ========== PENENTUAN METODE TERBAIK ==========
# Working table for model selection: test-set rows of the full comparison
# table with the six metric columns rounded to 4 decimals. All other columns
# (including Set_Data) are kept.
best_analysis <- perbandingan_final %>%
  filter(Set_Data == "Testing") %>%
  mutate(across(
    all_of(c(
      "Accuracy", "Precision", "Recall", "F1_Score", "Specificity", "Kappa"
    )),
    function(nilai) round(nilai, digits = 4)
  ))
# --- Best per split: within each Split, keep the single test-set row with
# the highest F1_Score (with_ties = FALSE keeps one row even on ties) ---
# NOTE(review): in the printed result every winner is the pre-SMOTE Decision
# Tree with Recall 1, Specificity 0 and Kappa 0, i.e. a model that labels every
# test case as the positive class. F1 on the majority class rewards this;
# consider whether Kappa or balanced accuracy is a better selection criterion.
cat("\n--- Metode Terbaik per Split (F1-Score Tertinggi) ---\n")
##
## --- Metode Terbaik per Split (F1-Score Tertinggi) ---
best_per_split <- best_analysis %>%
  group_by(Split) %>%
  slice_max(F1_Score, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  dplyr::select(all_of(c(
    "Split", "Kondisi", "Model", "Accuracy", "Precision",
    "Recall", "F1_Score", "Specificity", "Kappa"
  )))
print(best_per_split)
## # A tibble: 3 × 9
## Split Kondisi Model Accuracy Precision Recall F1_Score Specificity Kappa
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 70:30 Sebelum SMOTE Deci… 0.716 0.716 1 0.834 0 0
## 2 80:20 Sebelum SMOTE Deci… 0.716 0.716 1 0.834 0 0
## 3 90:10 Sebelum SMOTE Deci… 0.716 0.716 1 0.834 0 0
# --- Full ranking: all test-set results sorted by F1_Score, with Accuracy
# and then Kappa as tie-breakers (all descending); Rank is the row position
# after sorting ---
cat("\n--- Ranking Lengkap Semua Model berdasarkan F1-Score (Testing) ---\n")
##
## --- Ranking Lengkap Semua Model berdasarkan F1-Score (Testing) ---
ranking_semua <- best_analysis %>%
  arrange(desc(F1_Score), desc(Accuracy), desc(Kappa)) %>%
  mutate(Rank = seq_len(n())) %>%
  dplyr::select(all_of(c(
    "Rank", "Split", "Kondisi", "Model",
    "Accuracy", "Precision", "Recall", "F1_Score", "Specificity", "Kappa"
  )))
print(ranking_semua)
## Rank Split Kondisi Model Accuracy
## Precision...1 1 90:10 Sebelum SMOTE Decision Tree 0.7159
## Precision...2 2 80:20 Sebelum SMOTE Decision Tree 0.7157
## Precision...3 3 70:30 Sebelum SMOTE Decision Tree 0.7156
## Precision...4 4 70:30 SMOTE + Grid Search Decision Tree Tuning 0.7057
## Precision...5 5 90:10 SMOTE + Grid Search Decision Tree Tuning 0.6985
## Precision...6 6 80:20 SMOTE + Grid Search Decision Tree Tuning 0.6977
## Precision...7 7 70:30 Sesudah SMOTE Decision Tree 0.6972
## Precision...8 8 70:30 Sebelum SMOTE KNN 0.6828
## Precision...9 9 90:10 Sebelum SMOTE KNN 0.6759
## Precision...10 10 80:20 Sesudah SMOTE Decision Tree 0.6829
## Precision...11 11 90:10 Sesudah SMOTE Decision Tree 0.6790
## Precision...12 12 80:20 Sebelum SMOTE KNN 0.6619
## Precision...13 13 90:10 SMOTE + Grid Search Naive Bayes Tuning 0.6677
## Precision...14 14 70:30 SMOTE + Grid Search Naive Bayes Tuning 0.6644
## Precision...15 15 90:10 Sesudah SMOTE KNN 0.6318
## Precision...16 16 70:30 SMOTE + Grid Search KNN Tuning 0.6238
## Precision...17 17 80:20 SMOTE + Grid Search Naive Bayes Tuning 0.6378
## Precision...18 18 70:30 Sesudah SMOTE KNN 0.6221
## Precision...19 19 80:20 Sesudah SMOTE KNN 0.6178
## Precision...20 20 90:10 SMOTE + Grid Search KNN Tuning 0.6144
## Precision...21 21 80:20 SMOTE + Grid Search KNN Tuning 0.6076
## Precision...22 22 70:30 Sesudah SMOTE Naive Bayes 0.5705
## Precision...23 23 80:20 Sesudah SMOTE Naive Bayes 0.5610
## Precision...24 24 90:10 Sesudah SMOTE Naive Bayes 0.5621
## Precision...25 25 80:20 Sebelum SMOTE Naive Bayes 0.5225
## Precision...26 26 90:10 Sebelum SMOTE Naive Bayes 0.4790
## Precision...27 27 70:30 Sebelum SMOTE Naive Bayes 0.4653
## Precision Recall F1_Score Specificity Kappa
## Precision...1 0.7159 1.0000 0.8344 0.0000 0.0000
## Precision...2 0.7157 1.0000 0.8343 0.0000 0.0000
## Precision...3 0.7156 1.0000 0.8342 0.0000 0.0000
## Precision...4 0.7510 0.8807 0.8107 0.2653 0.1680
## Precision...5 0.7451 0.8797 0.8068 0.2419 0.1409
## Precision...6 0.7486 0.8697 0.8046 0.2649 0.1534
## Precision...7 0.7710 0.8206 0.7950 0.3866 0.2178
## Precision...8 0.7416 0.8545 0.7941 0.2509 0.1191
## Precision...9 0.7324 0.8625 0.7921 0.2058 0.0789
## Precision...10 0.7611 0.8117 0.7856 0.3586 0.1794
## Precision...11 0.7605 0.8052 0.7822 0.3610 0.1739
## Precision...12 0.7334 0.8289 0.7782 0.2414 0.0781
## Precision...13 0.7679 0.7679 0.7679 0.4152 0.1831
## Precision...14 0.7921 0.7199 0.7543 0.5246 0.2287
## Precision...15 0.7411 0.7464 0.7438 0.3430 0.0899
## Precision...16 0.7456 0.7199 0.7325 0.3818 0.0991
## Precision...17 0.7787 0.6901 0.7317 0.5063 0.1808
## Precision...18 0.7454 0.7166 0.7307 0.3842 0.0979
## Precision...19 0.7371 0.7244 0.7307 0.3495 0.0730
## Precision...20 0.7307 0.7307 0.7307 0.3213 0.0520
## Precision...21 0.7281 0.7208 0.7245 0.3225 0.0430
## Precision...22 0.7997 0.5334 0.6400 0.6639 0.1575
## Precision...23 0.7812 0.5369 0.6364 0.6216 0.1281
## Precision...24 0.8018 0.5158 0.6277 0.6787 0.1530
## Precision...25 0.7867 0.4567 0.5779 0.6883 0.1100
## Precision...26 0.7669 0.3911 0.5180 0.7004 0.0667
## Precision...27 0.8140 0.3278 0.4673 0.8115 0.0959
# --- Overall best ---
# Keep only the row of ranking_semua whose Rank equals 1.
best_overall <- ranking_semua %>% filter(Rank == 1)
cat("\n>>> METODE TERBAIK KESELURUHAN <<<\n")
##
## >>> METODE TERBAIK KESELURUHAN <<<
# NOTE(review): the row printed below reports Recall 1 together with
# Specificity 0 and Kappa 0 -- confirm that the ranking criterion used to build
# ranking_semua selects the intended model.
print(best_overall)
## Rank Split Kondisi Model Accuracy Precision Recall
## Precision 1 90:10 Sebelum SMOTE Decision Tree 0.7159 0.7159 1
## F1_Score Specificity Kappa
## Precision 0.8344 0 0
# Print every field of the selected row on its own line.
cat("\nKesimpulan:\n")
##
## Kesimpulan:
cat(" Split :", best_overall$Split, "\n")
## Split : 90:10
cat(" Kondisi :", best_overall$Kondisi, "\n")
## Kondisi : Sebelum SMOTE
cat(" Model :", best_overall$Model, "\n")
## Model : Decision Tree
cat(" Accuracy :", best_overall$Accuracy, "\n")
## Accuracy : 0.7159
cat(" Precision :", best_overall$Precision, "\n")
## Precision : 0.7159
cat(" Recall :", best_overall$Recall, "\n")
## Recall : 1
cat(" F1-Score :", best_overall$F1_Score, "\n")
## F1-Score : 0.8344
cat(" Specificity:", best_overall$Specificity, "\n")
## Specificity: 0
cat(" Kappa :", best_overall$Kappa, "\n")
## Kappa : 0
Berikut disajikan visualisasi confusion matrix dari model terbaik keseluruhan. Confusion matrix digunakan untuk mengevaluasi kinerja model dalam mengklasifikasikan status merokok dengan melihat hasil prediksi yang benar maupun salah, yang terdiri dari True Positive (TP), True Negative (TN), False Positive (FP), dan False Negative (FN).
# Confusion matrix of the model stored in res_awal_90$dt, evaluated on test_10
# (both objects are created earlier in the file).
library(caret)

model_best <- res_awal_90$dt
test_data <- test_10

# Separate the observed class (Y) from the predictor columns.
test_y <- test_data$Y
test_x <- dplyr::select(test_data, -Y)

# Predict class labels and give them the same factor levels as the reference,
# so that confusionMatrix() compares like with like.
pred_best <- factor(
  predict(model_best, test_x, type = "class"),
  levels = levels(test_y)
)

# "Merokok" is treated as the positive class.
cm_best <- confusionMatrix(pred_best, test_y, positive = "Merokok")
cm_best
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 0 0
## Merokok 277 698
##
## Accuracy : 0.7159
## 95% CI : (0.6865, 0.744)
## No Information Rate : 0.7159
## P-Value [Acc > NIR] : 0.5162
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.7159
## Neg Pred Value : NaN
## Prevalence : 0.7159
## Detection Rate : 0.7159
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : Merokok
##
# Long-format confusion matrix (Prediction, Reference, Freq) with every cell
# tagged as TP / TN / FP / FN; "Merokok" is the positive class.
cm_table <- as.data.frame(cm_best$table) |>
  dplyr::mutate(
    Kategori = dplyr::case_when(
      Prediction == "Tidak_Merokok" & Reference == "Merokok" ~
        "FN (False Negative)",
      Prediction == "Merokok" & Reference == "Tidak_Merokok" ~
        "FP (False Positive)",
      Prediction == "Tidak_Merokok" & Reference == "Tidak_Merokok" ~
        "TN (True Negative)",
      Prediction == "Merokok" & Reference == "Merokok" ~
        "TP (True Positive)"
    )
  )

# Heat-map style view: one tile per cell, coloured by category and annotated
# with the cell count.
ggplot(
  data = cm_table,
  mapping = aes(x = Reference, y = Prediction, fill = Kategori)
) +
  geom_tile(colour = "white", linewidth = 1.2) +
  geom_text(mapping = aes(label = Freq), fontface = "bold", size = 6) +
  scale_fill_manual(
    values = c(
      "TP (True Positive)" = "#2ECC71",  # green
      "TN (True Negative)" = "#3498DB",  # blue
      "FP (False Positive)" = "#E74C3C", # red
      "FN (False Negative)" = "#F1C40F"  # yellow
    )
  ) +
  labs(
    x = "Aktual",
    y = "Prediksi",
    fill = "Kategori",
    title = "Confusion Matrix - Model Terbaik (Decision Tree)",
    subtitle = "Klasifikasi Perilaku Merokok"
  ) +
  theme_minimal() +
  theme(
    legend.position = "right",
    axis.text = element_text(face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 15)
  )
# 19. Penentuan Model Terbaik dari Hasil Tuning Parameter
Pada tahap ini dilakukan penentuan model terbaik berdasarkan hasil hyperparameter tuning menggunakan metode Grid Search dengan 5-Fold Cross Validation.
# Reset the RNG seed. Every step in this block is deterministic, so the seed
# does not influence the ranking; the call is kept so that the RNG state seen
# by any later code is unchanged.
set.seed(123)

# Remove duplicated rows and coerce all metric columns to numeric.
perbandingan_final <- perbandingan_final %>%
  dplyr::distinct() %>%
  dplyr::mutate(
    dplyr::across(
      dplyr::all_of(c(
        "Accuracy", "Precision", "Recall",
        "F1_Score", "Specificity", "Kappa"
      )),
      as.numeric
    )
  )

# Ranking of the tuned models (test-set rows of "SMOTE + Grid Search").
ranking_smote_grid <- perbandingan_final %>%
  dplyr::filter(
    Set_Data == "Testing",
    Kondisi == "SMOTE + Grid Search"
  ) %>%
  # Sort on the full-precision metrics: F1 first, then Accuracy, Kappa and
  # Recall as successive tie-breakers. Sorting must happen BEFORE rounding,
  # otherwise models that differ only beyond the 4th decimal are treated as
  # tied and ranked by the wrong criterion.
  dplyr::arrange(
    dplyr::desc(F1_Score),
    dplyr::desc(Accuracy),
    dplyr::desc(Kappa),
    dplyr::desc(Recall)
  ) %>%
  dplyr::mutate(Rank = dplyr::row_number()) %>%
  # Rounding is for display only and is applied after the rank is fixed.
  dplyr::mutate(
    dplyr::across(
      dplyr::all_of(c(
        "Accuracy", "Precision", "Recall",
        "F1_Score", "Specificity", "Kappa"
      )),
      \(x) round(x, 4)
    )
  ) %>%
  dplyr::select(
    Rank, Split, Kondisi, Model,
    Accuracy, Precision, Recall,
    F1_Score, Specificity, Kappa
  )
cat("\n--- Ranking Lengkap Model SMOTE + Grid Search ---\n")
##
## --- Ranking Lengkap Model SMOTE + Grid Search ---
print(ranking_smote_grid)
## Rank Split Kondisi Model Accuracy
## Precision...1 1 70:30 SMOTE + Grid Search Decision Tree Tuning 0.7057
## Precision...2 2 90:10 SMOTE + Grid Search Decision Tree Tuning 0.6985
## Precision...3 3 80:20 SMOTE + Grid Search Decision Tree Tuning 0.6977
## Precision...4 4 90:10 SMOTE + Grid Search Naive Bayes Tuning 0.6677
## Precision...5 5 70:30 SMOTE + Grid Search Naive Bayes Tuning 0.6644
## Precision...6 6 70:30 SMOTE + Grid Search KNN Tuning 0.6238
## Precision...7 7 80:20 SMOTE + Grid Search Naive Bayes Tuning 0.6378
## Precision...8 8 90:10 SMOTE + Grid Search KNN Tuning 0.6144
## Precision...9 9 80:20 SMOTE + Grid Search KNN Tuning 0.6076
## Precision Recall F1_Score Specificity Kappa
## Precision...1 0.7510 0.8807 0.8107 0.2653 0.1680
## Precision...2 0.7451 0.8797 0.8068 0.2419 0.1409
## Precision...3 0.7486 0.8697 0.8046 0.2649 0.1534
## Precision...4 0.7679 0.7679 0.7679 0.4152 0.1831
## Precision...5 0.7921 0.7199 0.7543 0.5246 0.2287
## Precision...6 0.7456 0.7199 0.7325 0.3818 0.0991
## Precision...7 0.7787 0.6901 0.7317 0.5063 0.1808
## Precision...8 0.7307 0.7307 0.7307 0.3213 0.0520
## Precision...9 0.7281 0.7208 0.7245 0.3225 0.0430
# --- Guard: the filtered ranking must contain at least one row ---
if (nrow(ranking_smote_grid) < 1) {
  stop("Filter 'SMOTE + Grid Search' kosong. Cek label kondisi dengan unique(perbandingan_final$Kondisi)")
}
# --- Extract the Rank 1 row and its identifying fields ---
best_sg <- dplyr::filter(ranking_smote_grid, Rank == 1)
best_mod_sg <- best_sg[["Model"]]
best_spl_sg <- best_sg[["Split"]]
best_kon_sg <- best_sg[["Kondisi"]]
cat("\n>>> MODEL TERBAIK SMOTE + GRID SEARCH <<<\n")
##
## >>> MODEL TERBAIK SMOTE + GRID SEARCH <<<
cat(" Model :", best_mod_sg, "\n")
## Model : Decision Tree Tuning
cat(" Split :", best_spl_sg, "\n")
## Split : 70:30
cat(" Kondisi :", best_kon_sg, "\n")
## Kondisi : SMOTE + Grid Search
cat(" Accuracy :", best_sg[["Accuracy"]], "\n")
## Accuracy : 0.7057
cat(" Precision :", best_sg[["Precision"]], "\n")
## Precision : 0.751
cat(" Recall :", best_sg[["Recall"]], "\n")
## Recall : 0.8807
cat(" F1-Score :", best_sg[["F1_Score"]], "\n")
## F1-Score : 0.8107
cat(" Specificity:", best_sg[["Specificity"]], "\n")
## Specificity: 0.2653
cat(" Kappa :", best_sg[["Kappa"]], "\n")
## Kappa : 0.168
# --- Pick the tuning result that belongs to the best split ---
# Any split label other than "90:10" / "80:20" falls through to tuning_70.
if (best_spl_sg == "90:10") {
  res_sg <- tuning_90
} else if (best_spl_sg == "80:20") {
  res_sg <- tuning_80
} else {
  res_sg <- tuning_70
}
Pada tahap ini dilakukan visualisasi terhadap model terbaik yang diperoleh dari proses hyperparameter tuning menggunakan Grid Search dengan 5-Fold Cross Validation. Evaluasi dilakukan menggunakan confusion matrix untuk melihat performa klasifikasi secara lebih rinci dalam membedakan kelas “Merokok” dan “Tidak Merokok” pada data pengujian.
# Confusion matrix for the best SMOTE + Grid Search model (Rank 1).
best_grid <- ranking_smote_grid %>% dplyr::filter(Rank == 1)
best_split_g <- best_grid$Split
best_model_g <- best_grid$Model
cat(">>> Model Terbaik SMOTE + Grid Search (Rank 1) <<<\n")
## >>> Model Terbaik SMOTE + Grid Search (Rank 1) <<<
cat("Split :", best_split_g, "\n")
## Split : 70:30
cat("Model :", best_model_g, "\n")
## Model : Decision Tree Tuning
# Test set that matches the winning split. switch() without a default returns
# NULL invisibly for an unknown label, which would only fail later with an
# unrelated message, so an explicit error is raised instead.
test_data_grid <- switch(best_split_g,
  "90:10" = test_10_num,
  "80:20" = test_20_num,
  "70:30" = test_30_num,
  stop("Split tidak dikenali: ", best_split_g)
)
# Fitted model object for the winning split and model label. local() keeps the
# intermediate tuning list out of the global environment.
model_grid_obj <- local({
  tuning_split <- switch(best_split_g,
    "90:10" = tuning_90,
    "80:20" = tuning_80,
    "70:30" = tuning_70,
    stop("Split tidak dikenali: ", best_split_g)
  )
  switch(best_model_g,
    "KNN Tuning" = tuning_split$knn,
    "Decision Tree Tuning" = tuning_split$dt,
    "Naive Bayes Tuning" = tuning_split$nb,
    stop("Model tidak dikenali: ", best_model_g)
  )
})
if (is.null(model_grid_obj)) {
  stop(
    "Objek model untuk '", best_model_g, "' pada split ", best_split_g,
    " tidak ditemukan."
  )
}
# Predictions: the decision tree is asked for class labels explicitly
# (type = "class"); the other models use predict()'s default output.
test_x_g <- test_data_grid %>% dplyr::select(-Y)
test_y_g <- test_data_grid$Y
pred_grid <- if (best_model_g == "Decision Tree Tuning") {
  predict(model_grid_obj, test_x_g, type = "class")
} else {
  predict(model_grid_obj, test_x_g)
}
# Align the factor levels of the predictions with those of the reference.
pred_grid <- factor(pred_grid, levels = levels(test_y_g))
# Confusion matrix with "Merokok" as the positive class.
cm_grid <- confusionMatrix(pred_grid, test_y_g, positive = "Merokok")
print(cm_grid)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Tidak_Merokok Merokok
## Tidak_Merokok 221 250
## Merokok 612 1846
##
## Accuracy : 0.7057
## 95% CI : (0.6888, 0.7222)
## No Information Rate : 0.7156
## P-Value [Acc > NIR] : 0.8863
##
## Kappa : 0.168
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8807
## Specificity : 0.2653
## Pos Pred Value : 0.7510
## Neg Pred Value : 0.4692
## Prevalence : 0.7156
## Detection Rate : 0.6302
## Detection Prevalence : 0.8392
## Balanced Accuracy : 0.5730
##
## 'Positive' Class : Merokok
##
# Cell counts of the confusion matrix. Rows of the table are predictions and
# columns are the reference classes; "Merokok" is the positive class.
cm_tbl <- cm_grid$table
TN <- cm_tbl["Tidak_Merokok", "Tidak_Merokok"]
TP <- cm_tbl["Merokok", "Merokok"]
FN <- cm_tbl["Tidak_Merokok", "Merokok"]
FP <- cm_tbl["Merokok", "Tidak_Merokok"]

# Plotting table: one row per cell with its category code and a three-line
# label (code, long name, count).
cm_df <- as.data.frame(cm_tbl) |>
  dplyr::mutate(
    Kategori = dplyr::case_when(
      Prediction == "Tidak_Merokok" & Reference == "Merokok" ~ "FN",
      Prediction == "Merokok" & Reference == "Tidak_Merokok" ~ "FP",
      Prediction == "Tidak_Merokok" & Reference == "Tidak_Merokok" ~ "TN",
      Prediction == "Merokok" & Reference == "Merokok" ~ "TP"
    ),
    Label = dplyr::case_when(
      Kategori == "FN" ~ paste0("FN\n(False Negative)\n", Freq),
      Kategori == "FP" ~ paste0("FP\n(False Positive)\n", Freq),
      Kategori == "TN" ~ paste0("TN\n(True Negative)\n", Freq),
      Kategori == "TP" ~ paste0("TP\n(True Positive)\n", Freq)
    )
  )

# Headline metrics, expressed as percentages, for the plot subtitle.
spec <- round(100 * cm_grid$byClass["Specificity"], 2)
f1 <- round(100 * cm_grid$byClass["F1"], 2)
rec <- round(100 * cm_grid$byClass["Recall"], 2)
prec <- round(100 * cm_grid$byClass["Precision"], 2)
acc <- round(100 * cm_grid$overall["Accuracy"], 2)

# Annotated confusion-matrix tile plot.
ggplot(
  data = cm_df,
  mapping = aes(x = Reference, y = Prediction, fill = Kategori)
) +
  geom_tile(colour = "white", linewidth = 1.5) +
  geom_text(
    mapping = aes(label = Label),
    lineheight = 1.3,
    fontface = "bold",
    size = 5.5
  ) +
  scale_fill_manual(
    values = c(
      "TP" = "#2ECC71",
      "TN" = "#3498DB",
      "FP" = "#E74C3C",
      "FN" = "#F1C40F"
    ),
    labels = c(
      "TP" = "TP (True Positive)",
      "TN" = "TN (True Negative)",
      "FP" = "FP (False Positive)",
      "FN" = "FN (False Negative)"
    )
  ) +
  labs(
    x = "Aktual",
    y = "Prediksi",
    fill = "Kategori",
    title = paste0("Confusion Matrix — ", best_model_g),
    subtitle = paste0(
      "Split: ", best_split_g, " | Kondisi: SMOTE + Grid Search\n",
      "Accuracy: ", acc, "% | Precision: ", prec, "% | ",
      "Recall: ", rec, "% | F1-Score: ", f1, "% | Specificity: ", spec, "%"
    )
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid = element_blank(),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold", size = 13),
    axis.text = element_text(face = "bold", size = 12),
    plot.subtitle = element_text(hjust = 0.5, size = 10, color = "gray40"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16)
  )
Pada bagian ini dilakukan perbandingan performa seluruh model yang diuji berdasarkan nilai accuracy dan F1-score pada data testing. Perbandingan ini bertujuan untuk mengetahui model mana yang memiliki kinerja terbaik dalam melakukan klasifikasi, khususnya dalam menyeimbangkan ketepatan prediksi dan kemampuan model dalam menangani ketidakseimbangan kelas.
# Test-set rows only, with the two plotted metrics rounded for labelling.
perbandingan_viz <- perbandingan_final %>%
  dplyr::filter(Set_Data == "Testing") %>%
  dplyr::mutate(
    Accuracy = round(Accuracy, 4),
    F1_Score = round(F1_Score, 4)
  )

# Grouped bar chart of one metric per model, coloured by condition and
# facetted by split. The accuracy and F1 plots are identical apart from the
# metric column, the y-axis label and the title, so both are drawn by this
# single helper.
#
# data    : data frame with columns Model, Kondisi, Split and the metric.
# metrik  : name (string) of the metric column to plot.
# label_y : y-axis label.
# judul   : plot title.
# Returns a ggplot object.
plot_perbandingan_metrik <- function(data, metrik, label_y, judul) {
  stopifnot(
    is.character(metrik),
    length(metrik) == 1,
    metrik %in% names(data)
  )

  ggplot(data, aes(x = Model, y = .data[[metrik]], fill = Kondisi)) +
    geom_col(position = position_dodge(width = 0.8), width = 0.7) +
    geom_text(
      aes(label = .data[[metrik]]),
      position = position_dodge(width = 0.8),
      vjust = -0.4,
      size = 3,
      fontface = "bold"
    ) +
    scale_fill_manual(values = c(
      "Sebelum SMOTE" = "#52BE80",
      "Sesudah SMOTE" = "#F0956A",
      "SMOTE + Grid Search" = "#AED6F1"
    )) +
    facet_wrap(~ Split, ncol = 3) +
    # The upper limit is above 1 so the value labels over the bars fit.
    scale_y_continuous(
      limits = c(0, 1.25),
      breaks = seq(0, 1.2, 0.2),
      expand = expansion(mult = c(0, 0))
    ) +
    labs(title = judul, x = "Model", y = label_y, fill = "Kondisi") +
    theme_minimal(base_size = 11) +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
      axis.text.x = element_text(
        angle = 45, hjust = 1, face = "bold", size = 8
      ),
      axis.text.y = element_text(size = 9),
      axis.title = element_text(face = "bold", size = 11),
      strip.text = element_text(face = "bold", size = 11),
      strip.background = element_rect(fill = "gray95", color = NA),
      legend.position = "bottom",
      legend.title = element_text(face = "bold"),
      panel.grid.major.x = element_blank()
    )
}

# --- Accuracy plot ---
plot_perbandingan_metrik(
  perbandingan_viz,
  metrik = "Accuracy",
  label_y = "Akurasi",
  judul = "Perbandingan Akurasi per Model, Kondisi & Split (Testing)"
)

# --- F1-score plot ---
plot_perbandingan_metrik(
  perbandingan_viz,
  metrik = "F1_Score",
  label_y = "F1-Score",
  judul = "Perbandingan F1-Score per Model, Kondisi & Split (Testing)"
)
Pada bagian ini dilakukan analisis variable importance pada model terbaik yang diperoleh dari kombinasi SMOTE dan Grid Search. Analisis ini bertujuan untuk mengetahui variabel-variabel yang paling berpengaruh dalam proses klasifikasi, sehingga dapat memberikan interpretasi yang lebih jelas terhadap faktor-faktor yang berkontribusi terhadap hasil prediksi model.
# Variable importance of the best SMOTE + Grid Search model (Rank 1).
best_grid <- ranking_smote_grid %>% dplyr::filter(Rank == 1)
best_split_g <- best_grid$Split
best_model_g <- best_grid$Model
cat(">>> Variable Importance — Model Terbaik SMOTE + Grid Search <<<\n")
## >>> Variable Importance — Model Terbaik SMOTE + Grid Search <<<
cat("Split :", best_split_g, "\n")
## Split : 70:30
cat("Model :", best_model_g, "\n\n")
## Model : Decision Tree Tuning
# Fitted model object for the winning split and model label. switch() without
# a default returns NULL invisibly for an unknown label, so unknown labels are
# rejected explicitly. local() keeps the intermediate tuning list out of the
# global environment.
model_grid_obj <- local({
  tuning_split <- switch(best_split_g,
    "90:10" = tuning_90,
    "80:20" = tuning_80,
    "70:30" = tuning_70,
    stop("Split tidak dikenali: ", best_split_g)
  )
  switch(best_model_g,
    "KNN Tuning" = tuning_split$knn,
    "Decision Tree Tuning" = tuning_split$dt,
    "Naive Bayes Tuning" = tuning_split$nb,
    stop("Model tidak dikenali: ", best_model_g)
  )
})
if (is.null(model_grid_obj)) {
  stop(
    "Objek model untuk '", best_model_g, "' pada split ", best_split_g,
    " tidak ditemukan."
  )
}
# Compute variable importance according to the model type. Every branch
# produces vi_df with columns Variabel and Importance, sorted descending.
if (best_model_g == "Decision Tree Tuning") {
  # Read the importance vector stored on the model object.
  vi_raw <- model_grid_obj$variable.importance
  if (is.null(vi_raw) || length(vi_raw) == 0) {
    stop("variable.importance kosong — pohon mungkin hanya root node.")
  }
  vi_df <- data.frame(
    Variabel = names(vi_raw),
    Importance = as.numeric(vi_raw),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::filter(nchar(Variabel) > 0) %>%
    # Rescale so that the most important variable equals 100.
    dplyr::mutate(
      Importance = round(Importance / max(Importance) * 100, 2)
    ) %>%
    dplyr::arrange(dplyr::desc(Importance))
} else if (best_model_g == "KNN Tuning") {
  # Uses the first column of the caret::varImp() importance table.
  vi_raw <- caret::varImp(model_grid_obj, scale = TRUE)$importance
  vi_df <- data.frame(
    Variabel = rownames(vi_raw),
    Importance = round(vi_raw[, 1], 2),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::filter(nchar(Variabel) > 0) %>%
    dplyr::arrange(dplyr::desc(Importance))
} else {
  # "Naive Bayes Tuning" -- the only other label accepted by the switch
  # above. Importance is averaged across the columns of the varImp() table.
  vi_raw <- caret::varImp(model_grid_obj, scale = TRUE)$importance
  vi_df <- data.frame(
    Variabel = rownames(vi_raw),
    Importance = round(rowMeans(vi_raw), 2),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::filter(nchar(Variabel) > 0) %>%
    dplyr::arrange(dplyr::desc(Importance))
}
# Show the table
cat("\n--- Tabel Variable Importance ---\n")
##
## --- Tabel Variable Importance ---
print(vi_df)
## Variabel Importance
## 1 kekayaan 100.00
## 2 asuransi 45.72
## 3 pendidikan 42.56
## 4 pekerjaan 40.08
## 5 usia 38.73
## 6 tempat_tinggal 25.96
## 7 tv 9.92
## 8 bekerja 0.60
## 9 status_pernikahan 0.53
# Horizontal bar chart of variable importance. Variables are ordered by their
# importance and the bar colour follows the same value.
ggplot(
  data = vi_df,
  mapping = aes(
    x = reorder(Variabel, Importance),
    y = Importance,
    fill = Importance
  )
) +
  geom_col(show.legend = FALSE, width = 0.7) +
  geom_text(
    mapping = aes(label = paste0(Importance, "%")),
    fontface = "bold",
    size = 4,
    hjust = -0.15
  ) +
  scale_fill_gradient(low = "#AED6F1", high = "#1A5276") +
  coord_flip() +
  # Extra room at the upper end of the axis for the value labels.
  scale_y_continuous(expand = expansion(mult = c(0, 0.15))) +
  labs(
    x = "Variabel",
    y = "Importance (%)",
    title = paste0("Variable Importance — ", best_model_g),
    subtitle = paste0(
      "Split: ", best_split_g,
      " | Kondisi: SMOTE + Grid Search"
    )
  ) +
  theme_minimal(base_size = 13) +
  theme(
    panel.grid.major.y = element_blank(),
    axis.title = element_text(face = "bold", size = 12),
    axis.text = element_text(face = "bold", size = 11),
    plot.subtitle = element_text(hjust = 0.5, size = 10, color = "gray40"),
    plot.title = element_text(hjust = 0.5, face = "bold", size = 15)
  )