library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(moments)
##
## Attaching package: 'moments'
## The following objects are masked from 'package:e1071':
##
## kurtosis, moment, skewness
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.4.3
set.seed(123)
# Sesuaikan path file jika perlu
train <- read_excel("C:/Users/LENOVO/Downloads/Data Training.xlsx")
test <- read_excel("C:/Users/LENOVO/Downloads/Data Testing.xlsx")
train <- as.data.frame(train)
test <- as.data.frame(test)
cat("Dimensi data training :", dim(train), "\n")
## Dimensi data training : 41188 21
cat("Dimensi data testing :", dim(test), "\n")
## Dimensi data testing : 4119 21
cat("\nNama kolom training:\n")
##
## Nama kolom training:
print(names(train))
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
cat("\nNama kolom testing:\n")
##
## Nama kolom testing:
print(names(test))
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
# Target
train$y <- factor(train$y, levels = c("no", "yes"))
test$y <- factor(test$y, levels = c("no", "yes"))
cat_vars <- c("job", "marital", "education", "default", "housing", "loan",
"contact", "month", "day_of_week", "poutcome")
for (v in cat_vars) {
train[[v]] <- as.factor(train[[v]])
test[[v]] <- as.factor(test[[v]])
}
num_vars <- setdiff(names(train), c(cat_vars, "y"))
for (v in num_vars) {
train[[v]] <- as.numeric(train[[v]])
test[[v]] <- as.numeric(test[[v]])
}
# CEK DATA AWAL
cat("\n===== Missing Value Training =====\n")
##
## ===== Missing Value Training =====
print(colSums(is.na(train)))
## age job marital education default
## 0 330 80 1731 8597
## housing loan contact month day_of_week
## 990 990 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
cat("\n===== Missing Value Testing =====\n")
##
## ===== Missing Value Testing =====
print(colSums(is.na(test)))
## age job marital education default
## 0 39 11 167 803
## housing loan contact month day_of_week
## 105 105 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
cat("\n===== Distribusi Target Training =====\n")
##
## ===== Distribusi Target Training =====
print(table(train$y))
##
## no yes
## 36548 4640
print(prop.table(table(train$y)))
##
## no yes
## 0.8873458 0.1126542
cat("\n===== Distribusi Target Testing =====\n")
##
## ===== Distribusi Target Testing =====
print(table(test$y))
##
## no yes
## 3668 451
print(prop.table(table(test$y)))
##
## no yes
## 0.8905074 0.1094926
Pada hasil pengecekan missing value dapat diketahui bahwa terdapat
beberapa variabel yang memiliki missing value, yaitu variabel
job, marital, education,
default, housing, dan loan Selain
itu, distribusi target menunjukkan adanya ketidakseimbangan kelas,
dengan kelas “no” lebih dominan dibandingkan kelas “yes”. Hal ini perlu
diperhatikan dalam tahap pemodelan selanjutnya, terutama dalam
penanganan missing value dan ketidakseimbangan kelas.
impute_mode <- function(x) {
x <- x[!is.na(x)]
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
impute_data <- function(df, ref_df = NULL, cat_vars, target_var = "y") {
if (is.null(ref_df)) ref_df <- df
out <- df
for (col in names(out)) {
if (col == target_var) next
if (col %in% cat_vars) {
mode_value <- impute_mode(ref_df[[col]])
out[[col]][is.na(out[[col]])] <- mode_value
out[[col]] <- factor(out[[col]], levels = levels(ref_df[[col]]))
} else {
med_value <- median(ref_df[[col]], na.rm = TRUE)
out[[col]][is.na(out[[col]])] <- med_value
}
}
return(out)
}
train_imp <- impute_data(train, ref_df = train, cat_vars = cat_vars, target_var = "y")
test_imp <- impute_data(test, ref_df = train, cat_vars = cat_vars, target_var = "y")
cat("\n===== Missing Value Setelah Imputasi =====\n")
##
## ===== Missing Value Setelah Imputasi =====
print(colSums(is.na(train_imp)))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
print(colSums(is.na(test_imp)))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
train_out <- train_imp
test_out <- test_imp
for (v in num_vars) {
q1 <- quantile(train_out[[v]], 0.25, na.rm = TRUE)
q3 <- quantile(train_out[[v]], 0.75, na.rm = TRUE)
iqr_val <- q3 - q1
lower <- q1 - 1.5 * iqr_val
upper <- q3 + 1.5 * iqr_val
train_out[[v]][train_out[[v]] < lower] <- lower
train_out[[v]][train_out[[v]] > upper] <- upper
test_out[[v]][test_out[[v]] < lower] <- lower
test_out[[v]][test_out[[v]] > upper] <- upper
}
safe_skewness <- function(x) {
x <- x[is.finite(x)]
if (length(x) < 3) return(NA)
if (sd(x) == 0) return(NA)
moments::skewness(x)
}
skew_vals <- sapply(train_out[num_vars], safe_skewness)
cat("\n===== Skewness Variabel Numerik (Training) =====\n")
##
## ===== Skewness Variabel Numerik (Training) =====
print(skew_vals)
## age duration campaign pdays previous
## 0.5675749 1.0448624 1.2121184 NA NA
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## -0.7240692 -0.2308792 0.3008034 -0.7091621 -1.0442244
invalid_skew <- names(skew_vals[!is.finite(skew_vals)])
if (length(invalid_skew) > 0) {
cat("\nVariabel dengan skewness tidak valid:\n")
print(invalid_skew)
}
##
## Variabel dengan skewness tidak valid:
## [1] "pdays" "previous"
Berdasarkan hasil pengecekan skewness, sebagian besar variabel
numerik dalam data training memiliki distribusi yang cukup baik, dengan
beberapa variabel seperti age, duration, dan
campaign memiliki skewness positif, menandakan distribusi
yang sedikit miring ke kanan. Sementara itu, emp.var.rate,
euribor3m, dan nr.employed memiliki skewness
negatif, menunjukkan distribusi yang miring ke kiri. Variabel
cons.price.idx dan cons.conf.idx relatif
simetris dengan skewness yang kecil. Namun, variabel pdays dan previous
tidak memiliki nilai skewness yang valid (NA), kemungkinan besar karena
distribusi keduanya sangat terpusat pada satu nilai, yang sering terjadi
pada variabel seperti pdays yang memiliki nilai 999 untuk nasabah yang
belum dihubungi sebelumnya.
skew_threshold <- 1
train_prep <- train_out
test_prep <- test_out
for (v in num_vars) {
if (v == "pdays") {
cat("Transformasi dilewati untuk:", v, "\n")
next
}
if (is.finite(skew_vals[v]) && abs(skew_vals[v]) > skew_threshold) {
if (min(train_prep[[v]], na.rm = TRUE) >= 0) {
cat("Transformasi log1p pada:", v, "\n")
train_prep[[v]] <- log1p(train_prep[[v]])
test_prep[[v]] <- log1p(test_prep[[v]])
} else {
cat("Tidak ditransformasi (ada nilai negatif):", v, "\n")
}
} else {
cat("Skewness tidak valid / tidak perlu transformasi:", v, "\n")
}
}
## Skewness tidak valid / tidak perlu transformasi: age
## Transformasi log1p pada: duration
## Transformasi log1p pada: campaign
## Transformasi dilewati untuk: pdays
## Skewness tidak valid / tidak perlu transformasi: previous
## Skewness tidak valid / tidak perlu transformasi: emp.var.rate
## Skewness tidak valid / tidak perlu transformasi: cons.price.idx
## Skewness tidak valid / tidak perlu transformasi: cons.conf.idx
## Skewness tidak valid / tidak perlu transformasi: euribor3m
## Transformasi log1p pada: nr.employed
Berdasarkan hasil transformasi skewness, hanya beberapa variabel numerik yang memerlukan penyesuaian untuk mengurangi ketidaksimetrian distribusi, yaitu duration, campaign, dan nr.employed, yang kemudian ditransformasi menggunakan log1p(). Proses transformasi bertujuan agar variabel yang sangat miring menjadi lebih stabil dan lebih sesuai untuk digunakan dalam pemodelan.
use_duration <- FALSE
if (!use_duration) {
train_model <- train_prep %>% select(-duration)
test_model <- test_prep %>% select(-duration)
cat("\nModel menggunakan skenario REALISTIS (tanpa duration)\n")
} else {
train_model <- train_prep
test_model <- test_prep
cat("\nModel menggunakan skenario BENCHMARK (dengan duration)\n")
}
##
## Model menggunakan skenario REALISTIS (tanpa duration)
cat_vars_model <- names(train_model)[sapply(train_model, is.factor)]
cat_vars_model <- setdiff(cat_vars_model, "y")
for (v in cat_vars_model) {
all_levels <- union(levels(train_model[[v]]), levels(test_model[[v]]))
train_model[[v]] <- factor(train_model[[v]], levels = all_levels)
test_model[[v]] <- factor(test_model[[v]], levels = all_levels)
}
cat("\n===== Distribusi Kelas Sebelum SMOTE (Training) =====\n")
##
## ===== Distribusi Kelas Sebelum SMOTE (Training) =====
print(table(train_model$y))
##
## no yes
## 36548 4640
print(prop.table(table(train_model$y)))
##
## no yes
## 0.8873458 0.1126542
y_train <- train_model$y
y_test <- test_model$y
x_train_raw <- train_model[, setdiff(names(train_model), "y"), drop = FALSE]
x_test_raw <- test_model[, setdiff(names(test_model), "y"), drop = FALSE]
gabung_data <- rbind(x_train_raw, x_test_raw)
dummy_all <- as.data.frame(model.matrix(~ . - 1, data = gabung_data))
n_train <- nrow(x_train_raw)
x_train <- dummy_all[1:n_train, , drop = FALSE]
x_test <- dummy_all[(n_train + 1):nrow(dummy_all), , drop = FALSE]
cat("\nDimensi x_train setelah dummy:", dim(x_train), "\n")
##
## Dimensi x_train setelah dummy: 41188 47
cat("Dimensi x_test setelah dummy :", dim(x_test), "\n")
## Dimensi x_test setelah dummy : 4119 47
y_train_num <- ifelse(y_train == "yes", 1, 0)
smote_result <- smotefamily::SMOTE(
X = x_train,
target = y_train_num,
K = 5,
dup_size = 2
)
train_smote_data <- smote_result$data
# pastikan nama kolom target hasil SMOTE
target_col <- ncol(train_smote_data)
x_train_smote <- train_smote_data[, -target_col, drop = FALSE]
y_train_smote_num <- train_smote_data[, target_col]
y_train_smote <- factor(ifelse(y_train_smote_num == 1, "yes", "no"),
levels = c("no", "yes"))
cat("\n===== Distribusi Kelas Setelah SMOTE =====\n")
##
## ===== Distribusi Kelas Setelah SMOTE =====
print(table(y_train_smote))
## y_train_smote
## no yes
## 36548 13920
print(prop.table(table(y_train_smote)))
## y_train_smote
## no yes
## 0.7241817 0.2758183
Hasil SMOTE pada y_train menunjukkan bahwa distribusi kelas pada data training menjadi lebih seimbang dibandingkan sebelum oversampling, dengan jumlah kelas no sebanyak 36.548 dan kelas yes sebanyak 13.920. Secara proporsi, kelas mayoritas no menjadi 72,42%, sedangkan kelas minoritas yes meningkat menjadi 27,58%. Hal ini menunjukkan bahwa SMOTE berhasil menambah representasi kelas minoritas sehingga ketimpangan kelas berkurang dan model diharapkan lebih mampu mengenali nasabah yang berpotensi berlangganan deposito.
zero_var_cols <- names(x_train_smote)[apply(x_train_smote, 2, function(z) var(z, na.rm = TRUE) == 0)]
cat("\nKolom zero variance:\n")
##
## Kolom zero variance:
print(zero_var_cols)
## [1] "pdays" "previous"
if (length(zero_var_cols) > 0) {
x_train_smote <- x_train_smote[, !(names(x_train_smote) %in% zero_var_cols), drop = FALSE]
x_test <- x_test[, !(names(x_test) %in% zero_var_cols), drop = FALSE]
}
cat("\nDimensi x_train_smote setelah hapus zero variance:", dim(x_train_smote), "\n")
##
## Dimensi x_train_smote setelah hapus zero variance: 50468 45
cat("Dimensi x_test setelah hapus zero variance :", dim(x_test), "\n")
## Dimensi x_test setelah hapus zero variance : 4119 45
train_means <- sapply(x_train_smote, mean, na.rm = TRUE)
train_sds <- sapply(x_train_smote, sd, na.rm = TRUE)
train_sds[train_sds == 0] <- 1
x_train_sc <- as.data.frame(scale(x_train_smote, center = train_means, scale = train_sds))
x_test_sc <- as.data.frame(scale(x_test, center = train_means, scale = train_sds))
# pastikan tidak ada NA/Inf setelah scaling
x_train_sc[!is.finite(as.matrix(x_train_sc))] <- 0
x_test_sc[!is.finite(as.matrix(x_test_sc))] <- 0
model_svm <- svm(
x = x_train_sc,
y = y_train_smote,
kernel = "radial",
cost = 1,
gamma = 1 / ncol(x_train_sc),
probability = FALSE
)
cat("\n===== Ringkasan Model SVM =====\n")
##
## ===== Ringkasan Model SVM =====
print(model_svm)
##
## Call:
## svm.default(x = x_train_sc, y = y_train_smote, kernel = "radial",
## gamma = 1/ncol(x_train_sc), cost = 1, probability = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 18778
Berdasarkan pemodelan SVM diatas menunjukkan bahwa model yang dibangun adalah Support Vector Machine (SVM) untuk klasifikasi biner dengan kernel radial, sehingga model dapat menangkap hubungan yang bersifat nonlinier antar variabel prediktor dan target. Parameter cost = 1 menunjukkan bahwa model menggunakan penalti kesalahan klasifikasi pada tingkat moderat, sedangkan nilai gamma = 1/ncol(x_train_sc) digunakan sebagai pengaturan awal berdasarkan jumlah fitur hasil preprocessing. Selain itu, model menghasilkan 18.778 support vectors, yang berarti terdapat cukup banyak observasi training yang berperan penting dalam membentuk batas pemisah antar kelas, sehingga dapat diindikasikan bahwa pola pemisahan antara kelas yes dan no pada data cukup kompleks.
# Pastikan data numerik
x_train_sc <- as.data.frame(x_train_sc)
x_test_sc <- as.data.frame(x_test_sc)
pred_train <- e1071:::predict.svm(model_svm, newdata = x_train_sc)
pred_test <- e1071:::predict.svm(model_svm, newdata = x_test_sc)
pred_train <- factor(as.character(pred_train), levels = c("no", "yes"))
pred_test <- factor(as.character(pred_test), levels = c("no", "yes"))
evaluate_model <- function(actual, predicted, positive_class = "yes") {
actual <- factor(actual, levels = c("no", "yes"))
predicted <- factor(predicted, levels = c("no", "yes"))
cm <- confusionMatrix(predicted, actual, positive = positive_class)
acc <- as.numeric(cm$overall["Accuracy"])
sens <- as.numeric(cm$byClass["Sensitivity"])
spec <- as.numeric(cm$byClass["Specificity"])
bal_acc <- (sens + spec) / 2
precision <- as.numeric(cm$byClass["Pos Pred Value"])
f1 <- ifelse((precision + sens) == 0, 0,
2 * precision * sens / (precision + sens))
metrics <- data.frame(
Accuracy = acc,
Sensitivity = sens,
Specificity = spec,
Balanced_Accuracy = bal_acc,
F1_Score = f1
)
return(list(conf_matrix = cm, metrics = metrics))
}
eval_train <- evaluate_model(y_train_smote, pred_train)
cat("\n=============================\n")
##
## =============================
cat("CONFUSION MATRIX - TRAINING SMOTE\n")
## CONFUSION MATRIX - TRAINING SMOTE
cat("=============================\n")
## =============================
print(eval_train$conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 35461 4415
## yes 1087 9505
##
## Accuracy : 0.891
## 95% CI : (0.8882, 0.8937)
## No Information Rate : 0.7242
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7053
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6828
## Specificity : 0.9703
## Pos Pred Value : 0.8974
## Neg Pred Value : 0.8893
## Prevalence : 0.2758
## Detection Rate : 0.1883
## Detection Prevalence : 0.2099
## Balanced Accuracy : 0.8265
##
## 'Positive' Class : yes
##
cat("\n=============================\n")
##
## =============================
cat("METRICS - TRAINING SMOTE\n")
## METRICS - TRAINING SMOTE
cat("=============================\n")
## =============================
print(eval_train$metrics)
## Accuracy Sensitivity Specificity Balanced_Accuracy F1_Score
## 1 0.8909804 0.6828305 0.9702583 0.8265444 0.7755385
eval_test <- evaluate_model(y_test, pred_test)
cat("\n=============================\n")
##
## =============================
cat("CONFUSION MATRIX - TESTING\n")
## CONFUSION MATRIX - TESTING
cat("=============================\n")
## =============================
print(eval_test$conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 3567 250
## yes 101 201
##
## Accuracy : 0.9148
## 95% CI : (0.9058, 0.9231)
## No Information Rate : 0.8905
## P-Value [Acc > NIR] : 1.405e-07
##
## Kappa : 0.489
##
## Mcnemar's Test P-Value : 2.797e-15
##
## Sensitivity : 0.44568
## Specificity : 0.97246
## Pos Pred Value : 0.66556
## Neg Pred Value : 0.93450
## Prevalence : 0.10949
## Detection Rate : 0.04880
## Detection Prevalence : 0.07332
## Balanced Accuracy : 0.70907
##
## 'Positive' Class : yes
##
cat("\n=============================\n")
##
## =============================
cat("METRICS - TESTING\n")
## METRICS - TESTING
cat("=============================\n")
## =============================
print(eval_test$metrics)
## Accuracy Sensitivity Specificity Balanced_Accuracy F1_Score
## 1 0.9147851 0.4456763 0.9724646 0.7090704 0.5338645
hasil_akhir <- bind_rows(
cbind(Model = "SVM + SMOTE", Data = "Training_SMOTE", eval_train$metrics),
cbind(Model = "SVM + SMOTE", Data = "Testing", eval_test$metrics)
)
cat("\n=============================\n")
##
## =============================
cat("RINGKASAN HASIL AKHIR\n")
## RINGKASAN HASIL AKHIR
cat("=============================\n")
## =============================
print(hasil_akhir)
## Model Data Accuracy Sensitivity Specificity
## 1 SVM + SMOTE Training_SMOTE 0.8909804 0.6828305 0.9702583
## 2 SVM + SMOTE Testing 0.9147851 0.4456763 0.9724646
## Balanced_Accuracy F1_Score
## 1 0.8265444 0.7755385
## 2 0.7090704 0.5338645
Berdasarkan ringkasan hasil akhir, model SVM dengan penanganan SMOTE menunjukkan kinerja yang cukup baik, terutama pada data testing dengan akurasi 0,9138. Nilai specificity sebesar 0,9714 menunjukkan bahwa model sangat baik dalam mengidentifikasi nasabah yang tidak berlangganan deposito (no). Namun, sensitivity sebesar 0,4457 menandakan bahwa kemampuan model dalam mendeteksi nasabah yang berpotensi berlangganan deposito (yes) masih relatif terbatas. Nilai balanced accuracy sebesar 0,7085 dan F1-score sebesar 0,5310 menunjukkan bahwa model sudah cukup mampu menangani ketidakseimbangan kelas, tetapi performanya pada kelas minoritas belum optimal.
Jika dibandingkan antara training SMOTE dan testing, terlihat bahwa performa testing masih stabil dan tidak menunjukkan penurunan yang ekstrem, sehingga model dapat dikatakan tidak mengalami overfitting yang berat. Secara keseluruhan, model SVM + SMOTE layak digunakan sebagai model klasifikasi pada data ini, terutama jika fokus utama adalah memprediksi kelas mayoritas dengan baik. Namun, apabila tujuan penelitian lebih menekankan pada pendeteksian nasabah yang akan berlangganan deposito (yes), maka model ini masih perlu ditingkatkan, misalnya melalui tuning parameter SVM, penyesuaian rasio SMOTE, atau pengaturan threshold klasifikasi agar sensitivity dan F1-score dapat meningkat.