1 Import Library & Data

# Import library
library(readxl) # untuk import data excel
library(dplyr) # untuk memanipulasi/mengolah dataset

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(moments) # untuk menghitung momen statistik dari distribusi data
library(smotefamily) # untuk mengatasi imbalanced data
library(e1071) # untuk membuat model SVM

## 
## Attaching package: 'e1071'

## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness

library(caret) # untuk evaluasi metrik

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:e1071':
## 
##     element

## Loading required package: lattice

# Import data
set.seed(123)
train <- read_excel("C:/Users/Lenovo/Documents/Teknik Pembelajaran Mesin/Tugas Individu/Tugas Individu 1/Data Training.xlsx")
test <- read_excel("C:/Users/Lenovo/Documents/Teknik Pembelajaran Mesin/Tugas Individu/Tugas Individu 1/Data Testing.xlsx")

# Cek struktur data
str(train)

## tibble [41,188 × 21] (S3: tbl_df/tbl/data.frame)
##  $ age           : num [1:41188] 56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr [1:41188] "housemaid" "services" "services" "admin." ...
##  $ marital       : chr [1:41188] "married" "married" "married" "married" ...
##  $ education     : chr [1:41188] "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr [1:41188] "no" NA "no" "no" ...
##  $ housing       : chr [1:41188] "no" "no" "yes" "no" ...
##  $ loan          : chr [1:41188] "no" "no" "no" "no" ...
##  $ contact       : chr [1:41188] "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr [1:41188] "may" "may" "may" "may" ...
##  $ day_of_week   : chr [1:41188] "mon" "mon" "mon" "mon" ...
##  $ duration      : num [1:41188] 261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : num [1:41188] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : num [1:41188] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : num [1:41188] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr [1:41188] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num [1:41188] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num [1:41188] 94 94 94 94 94 ...
##  $ cons.conf.idx : num [1:41188] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num [1:41188] 4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num [1:41188] 5191 5191 5191 5191 5191 ...
##  $ y             : chr [1:41188] "no" "no" "no" "no" ...

str(test)

## tibble [4,119 × 21] (S3: tbl_df/tbl/data.frame)
##  $ age           : num [1:4119] 30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : chr [1:4119] "blue-collar" "services" "services" "services" ...
##  $ marital       : chr [1:4119] "married" "single" "married" "married" ...
##  $ education     : chr [1:4119] "basic.9y" "high.school" "high.school" "basic.9y" ...
##  $ default       : chr [1:4119] "no" "no" "no" "no" ...
##  $ housing       : chr [1:4119] "yes" "no" "yes" NA ...
##  $ loan          : chr [1:4119] "no" "no" "no" NA ...
##  $ contact       : chr [1:4119] "cellular" "telephone" "telephone" "telephone" ...
##  $ month         : chr [1:4119] "may" "may" "jun" "jun" ...
##  $ day_of_week   : chr [1:4119] "fri" "fri" "wed" "fri" ...
##  $ duration      : num [1:4119] 487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : num [1:4119] 2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : num [1:4119] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : num [1:4119] 0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : chr [1:4119] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num [1:4119] -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num [1:4119] 92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num [1:4119] -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num [1:4119] 1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num [1:4119] 5099 5191 5228 5228 5196 ...
##  $ y             : chr [1:4119] "no" "no" "no" "no" ...

2 Data Preprocessing

2.1 Transformasi Peubah

# Tranformasi peubah respon jadi factor
train$y <- factor(train$y, levels = c("no", "yes"))
test$y  <- factor(test$y, levels = c("no", "yes"))

# Identifikasi daftar peubah kategorik
cat_vars <- c("job", "marital", "education", "default"
              , "housing", "loan", "contact", "month",
              "day_of_week", "poutcome")

# Ubah semua peubah kategorik (dalam cat_vars) jadi tipe faktor
for (v in cat_vars) {
  train[[v]] <- as.factor(train[[v]])
  test[[v]]  <- as.factor(test[[v]])
}

# Identifikasi daftar peubah numberik
num_vars <- setdiff(names(train), c(cat_vars, "y"))

# Pastikan semua peubah numerik diidentifikasi numerik di R
for (v in num_vars) {
  train[[v]] <- as.numeric(train[[v]])
  test[[v]]  <- as.numeric(test[[v]])
}

2.2 Identifikasi Missing Value

Missing Value Data Training

print(colSums(is.na(train)))

##            age            job        marital      education        default 
##              0            330             80           1731           8597 
##        housing           loan        contact          month    day_of_week 
##            990            990              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0

Missing Value Data Testing

print(colSums(is.na(test)))

##            age            job        marital      education        default 
##              0             39             11            167            803 
##        housing           loan        contact          month    day_of_week 
##            105            105              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0

Identifikasi Distribusi Target

Distribusi Target Data Training

print(table(train$y))

## 
##    no   yes 
## 36548  4640

print(prop.table(table(train$y)))

## 
##        no       yes 
## 0.8873458 0.1126542

Distribusi Target Data Testing

print(table(test$y))

## 
##   no  yes 
## 3668  451

print(prop.table(table(test$y)))

## 
##        no       yes 
## 0.8905074 0.1094926

2.3 Imputasi Missing Value

# Membuat fungsi impute_mode untuk mencari nilai modus dari sebuah vektor/peubah
impute_mode <- function(x) {
  x <- x[!is.na(x)]
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

# Membuat fungsi impute_data untuk melakukan imputasi missing value pada dataset
impute_data <- function(df, ref_df = NULL, cat_vars, target_var = "y") {
  if (is.null(ref_df)) ref_df <- df
  
  out <- df
  
  for (col in names(out)) {
    if (col == target_var) next
    
    if (col %in% cat_vars) {
      mode_value <- impute_mode(ref_df[[col]])
      out[[col]][is.na(out[[col]])] <- mode_value
      out[[col]] <- factor(out[[col]], levels = levels(ref_df[[col]]))
    } else {
      med_value <- median(ref_df[[col]], na.rm = TRUE)
      out[[col]][is.na(out[[col]])] <- med_value
    }
  }
  
  return(out)
}

# Melakukan imputasi pada data training dan data testing dengan referensi dari data training
train_imp <- impute_data(train, ref_df = train, cat_vars = cat_vars, target_var = "y")
test_imp  <- impute_data(test,  ref_df = train, cat_vars = cat_vars, target_var = "y")

# Mengecek jumlah missing value setelah imputasi
print(colSums(is.na(train_imp)))

##            age            job        marital      education        default 
##              0              0              0              0              0 
##        housing           loan        contact          month    day_of_week 
##              0              0              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0

2.4 Penanganan Outlier

# Menyalin data training dan data testing yang sudah dilakukan imputasi untuk ditangani outliernya
train_out <- train_imp
test_out  <- test_imp

# Membuat fungsi untuk menangani outlier 
# Nilai yang lebih kecil dari batas bawah data training akan diganti jadi nilai batas bawah data training. 
# Nilai yang lebih besar dari batas atas data training akan diganti jadi nilai batas atas data training.
for (v in num_vars) {
  q1 <- quantile(train_out[[v]], 0.25, na.rm = TRUE)
  q3 <- quantile(train_out[[v]], 0.75, na.rm = TRUE)
  iqr_val <- q3 - q1
  
  lower <- q1 - 1.5 * iqr_val # Batas Bawah
  upper <- q3 + 1.5 * iqr_val # Batas Atas
  
  train_out[[v]][train_out[[v]] < lower] <- lower
  train_out[[v]][train_out[[v]] > upper] <- upper
  
  test_out[[v]][test_out[[v]] < lower] <- lower
  test_out[[v]][test_out[[v]] > upper] <- upper
}

2.5 Penanganan Skewness yang Invalid

# Membuat fungsi untuk menghitung skewness peubah numerik
safe_skewness <- function(x) {
  x <- x[is.finite(x)]
  if (length(x) < 3) return(NA)
  if (sd(x) == 0) return(NA)
  moments::skewness(x)
}

skew_vals <- sapply(train_out[num_vars], safe_skewness)

print(skew_vals)

##            age       duration       campaign          pdays       previous 
##      0.5675749      1.0448624      1.2121184             NA             NA 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##     -0.7240692     -0.2308792      0.3008034     -0.7091621     -1.0442244

# Identifikasi peubah yang skewness-nya tidak valid
invalid_skew <- names(skew_vals[!is.finite(skew_vals)])
if (length(invalid_skew) > 0) {
  print(invalid_skew)
}

## [1] "pdays"    "previous"

# Transformasi skewness yang tidak valid
skew_threshold <- 1

train_prep <- train_out
test_prep  <- test_out

for (v in num_vars) {
  if (v == "pdays") {
    cat("Transformasi dilewati untuk:", v, "\n")
    next
  }
  
  if (is.finite(skew_vals[v]) && abs(skew_vals[v]) > skew_threshold) {
    if (min(train_prep[[v]], na.rm = TRUE) >= 0) {
      cat("Transformasi log1p pada:", v, "\n")
      train_prep[[v]] <- log1p(train_prep[[v]])
      test_prep[[v]]  <- log1p(test_prep[[v]])
    } else {
      cat("Tidak ditransformasi (ada nilai negatif):", v, "\n")
    }
  } else {
    cat("Skewness valid / tidak perlu transformasi:", v, "\n")
  }
}

## Skewness valid / tidak perlu transformasi: age 
## Transformasi log1p pada: duration 
## Transformasi log1p pada: campaign 
## Transformasi dilewati untuk: pdays 
## Skewness valid / tidak perlu transformasi: previous 
## Skewness valid / tidak perlu transformasi: emp.var.rate 
## Skewness valid / tidak perlu transformasi: cons.price.idx 
## Skewness valid / tidak perlu transformasi: cons.conf.idx 
## Skewness valid / tidak perlu transformasi: euribor3m 
## Transformasi log1p pada: nr.employed

2.6 Opsi Skenario Fitur dan Penyamaan Level Faktor Train dan Test

use_duration <- FALSE

if (!use_duration) {
  train_model <- train_prep %>% select(-duration)
  test_model  <- test_prep %>% select(-duration)
  cat("\nModel menggunakan skenario REALISTIS (tanpa duration)\n")
} else {
  train_model <- train_prep
  test_model  <- test_prep
  cat("\nModel menggunakan skenario BENCHMARK (dengan duration)\n")
}

## 
## Model menggunakan skenario REALISTIS (tanpa duration)

cat_vars_model <- names(train_model)[sapply(train_model, is.factor)]
cat_vars_model <- setdiff(cat_vars_model, "y")

for (v in cat_vars_model) {
  all_levels <- union(levels(train_model[[v]]), levels(test_model[[v]]))
  train_model[[v]] <- factor(train_model[[v]], levels = all_levels)
  test_model[[v]]  <- factor(test_model[[v]], levels = all_levels)
}

Distribusi Kelas Sebelum SMOTE (Training)

print(table(train_model$y))

## 
##    no   yes 
## 36548  4640

print(prop.table(table(train_model$y)))

## 
##        no       yes 
## 0.8873458 0.1126542

2.7 Pembuatan Peubah Dummy

y_train <- train_model$y
y_test  <- test_model$y

x_train_raw <- train_model[, setdiff(names(train_model), "y"), drop = FALSE]
x_test_raw  <- test_model[, setdiff(names(test_model), "y"), drop = FALSE]

gabung_data <- rbind(x_train_raw, x_test_raw)

dummy_all <- as.data.frame(model.matrix(~ . - 1, data = gabung_data))

n_train <- nrow(x_train_raw)
x_train <- dummy_all[1:n_train, , drop = FALSE]
x_test  <- dummy_all[(n_train + 1):nrow(dummy_all), , drop = FALSE]

Cek dimensi data setelah menggunakan peubah dummy

cat("\nDimensi x_train setelah dummy:", dim(x_train), "\n")

## 
## Dimensi x_train setelah dummy: 41188 47

cat("Dimensi x_test setelah dummy :", dim(x_test), "\n")

## Dimensi x_test setelah dummy : 4119 47

Cek ringkasan data setelah menggunakan peubah dummy

str(x_train)

## 'data.frame':    41188 obs. of  47 variables:
##  $ age                         : num  56 57 37 40 56 45 59 41 24 25 ...
##  $ jobadmin.                   : num  0 0 0 1 0 0 1 0 0 0 ...
##  $ jobblue-collar              : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ jobentrepreneur             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ jobhousemaid                : num  1 0 0 0 0 0 0 0 0 0 ...
##  $ jobmanagement               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ jobretired                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ jobself-employed            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ jobservices                 : num  0 1 1 0 1 1 0 0 0 1 ...
##  $ jobstudent                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ jobtechnician               : num  0 0 0 0 0 0 0 0 1 0 ...
##  $ jobunemployed               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ maritalmarried              : num  1 1 1 1 1 1 1 1 0 0 ...
##  $ maritalsingle               : num  0 0 0 0 0 0 0 0 1 1 ...
##  $ educationbasic.6y           : num  0 0 0 1 0 0 0 0 0 0 ...
##  $ educationbasic.9y           : num  0 0 0 0 0 1 0 0 0 0 ...
##  $ educationhigh.school        : num  0 1 1 0 1 0 0 0 0 1 ...
##  $ educationilliterate         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ educationprofessional.course: num  0 0 0 0 0 0 1 0 1 0 ...
##  $ educationuniversity.degree  : num  0 0 0 0 0 0 0 1 0 0 ...
##  $ defaultyes                  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ housingyes                  : num  0 0 1 0 0 0 0 0 1 1 ...
##  $ loanyes                     : num  0 0 0 0 1 0 0 0 0 0 ...
##  $ contacttelephone            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ monthaug                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ monthdec                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ monthjul                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ monthjun                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ monthmar                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ monthmay                    : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ monthnov                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ monthoct                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ monthsep                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ day_of_weekmon              : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ day_of_weekthu              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ day_of_weektue              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ day_of_weekwed              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ campaign                    : num  0.693 0.693 0.693 0.693 0.693 ...
##  $ pdays                       : num  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous                    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcomenonexistent         : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ poutcomesuccess             : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ emp.var.rate                : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx              : num  94 94 94 94 94 ...
##  $ cons.conf.idx               : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m                   : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed                 : num  8.55 8.55 8.55 8.55 8.55 ...

2.8 SMOTE

# Mengubah peubah target y_train menjadi bentuk numerik dengan "yes" = 1 dan "no" = 0
y_train_num <- ifelse(y_train == "yes", 1, 0)

# Menerapkan metode SMOTE dari package smotefamily untuk menyeimbangkan kelas pada data training
smote_result <- smotefamily::SMOTE(
  X = x_train,
  target = y_train_num,
  K = 5,
  dup_size = 2
)

# Mengambil dataset hasil SMOTE yang berisi peubah prediktor dan target
train_smote_data <- smote_result$data

# Menentukan posisi kolom terakhir sebagai kolom target pada data hasil SMOTE
target_col <- ncol(train_smote_data)

# Mengambil semua kolom kecuali kolom terakhir sebagai peubah prediktor
x_train_smote <- train_smote_data[, -target_col, drop = FALSE]

# Mengambil kolom terakhir sebagai peubah target dalam bentuk numerik
y_train_smote_num <- train_smote_data[, target_col]

# Mengubah kembali target numerik menjadi faktor dengan label "yes" dan "no"
y_train_smote <- factor(ifelse(y_train_smote_num == 1, "yes", "no"), levels = c("no", "yes"))

Distribusi Kelas Setelah SMOTE

print(table(y_train_smote))

## y_train_smote
##    no   yes 
## 36548 13920

print(prop.table(table(y_train_smote)))

## y_train_smote
##        no       yes 
## 0.7241817 0.2758183

2.9 Penanganan Zero Variance dan Scaling

2.9.1 Zero Variance

Identifikasi peubah dengan zero variance (semua peubah nilainya sama)

zero_var_cols <- names(x_train_smote)[apply(x_train_smote, 2, function(z) var(z, na.rm = TRUE) == 0)]
print(zero_var_cols)

## [1] "pdays"    "previous"

Penghapusan zero variance

if (length(zero_var_cols) > 0) {
  x_train_smote <- x_train_smote[, !(names(x_train_smote) %in% zero_var_cols), drop = FALSE]
  x_test        <- x_test[, !(names(x_test) %in% zero_var_cols), drop = FALSE]
}

Cek dimensi data setelah penghapusan zero variance

cat("\nDimensi x_train_smote setelah hapus zero variance:", dim(x_train_smote), "\n")

## 
## Dimensi x_train_smote setelah hapus zero variance: 50468 45

cat("Dimensi x_test setelah hapus zero variance       :", dim(x_test), "\n")

## Dimensi x_test setelah hapus zero variance       : 4119 45

2.9.2 Scaling

train_means <- sapply(x_train_smote, mean, na.rm = TRUE)
train_sds   <- sapply(x_train_smote, sd, na.rm = TRUE)

train_sds[train_sds == 0] <- 1

x_train_sc <- as.data.frame(scale(x_train_smote, center = train_means, scale = train_sds))
x_test_sc  <- as.data.frame(scale(x_test, center = train_means, scale = train_sds))

# Memastikan tidak ada missing value  setelah scaling
x_train_sc[!is.finite(as.matrix(x_train_sc))] <- 0
x_test_sc[!is.finite(as.matrix(x_test_sc))] <- 0

3 Pemodelan Support Vector Machine (SVM)

model_svm <- svm(
  x = x_train_sc,
  y = y_train_smote,
  kernel = "radial",
  cost = 1,
  gamma = 1 / ncol(x_train_sc),
  probability = FALSE
)

print(model_svm)

## 
## Call:
## svm.default(x = x_train_sc, y = y_train_smote, kernel = "radial", 
##     gamma = 1/ncol(x_train_sc), cost = 1, probability = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  18778

4 Prediksi

x_train_sc <- as.data.frame(x_train_sc)
x_test_sc  <- as.data.frame(x_test_sc)

pred_train <- e1071:::predict.svm(model_svm, newdata = x_train_sc)
pred_test  <- e1071:::predict.svm(model_svm, newdata = x_test_sc)

pred_train <- factor(as.character(pred_train), levels = c("no", "yes"))
pred_test  <- factor(as.character(pred_test), levels = c("no", "yes"))

5 Evaluasi Model

evaluate_model <- function(actual, predicted, positive_class = "yes") {
  actual <- factor(actual, levels = c("no", "yes"))
  predicted <- factor(predicted, levels = c("no", "yes"))
  
  cm <- confusionMatrix(predicted, actual, positive = positive_class)
  
  acc  <- as.numeric(cm$overall["Accuracy"])
  sens <- as.numeric(cm$byClass["Sensitivity"])
  spec <- as.numeric(cm$byClass["Specificity"])
  bal_acc <- (sens + spec) / 2
  precision <- as.numeric(cm$byClass["Pos Pred Value"])
  f1 <- ifelse((precision + sens) == 0, 0,
               2 * precision * sens / (precision + sens))
  
  metrics <- data.frame(
    Accuracy = acc,
    Sensitivity = sens,
    Specificity = spec,
    Balanced_Accuracy = bal_acc,
    F1_Score = f1
  )
  
  return(list(conf_matrix = cm, metrics = metrics))
}

eval_train <- evaluate_model(y_train_smote, pred_train)

eval_test <- evaluate_model(y_test, pred_test)

5.0.1 Data Training

Confusion Matrix

print(eval_train$conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    no   yes
##        no  35461  4415
##        yes  1087  9505
##                                           
##                Accuracy : 0.891           
##                  95% CI : (0.8882, 0.8937)
##     No Information Rate : 0.7242          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7053          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6828          
##             Specificity : 0.9703          
##          Pos Pred Value : 0.8974          
##          Neg Pred Value : 0.8893          
##              Prevalence : 0.2758          
##          Detection Rate : 0.1883          
##    Detection Prevalence : 0.2099          
##       Balanced Accuracy : 0.8265          
##                                           
##        'Positive' Class : yes             
##

Metrics

print(eval_train$metrics)

##    Accuracy Sensitivity Specificity Balanced_Accuracy  F1_Score
## 1 0.8909804   0.6828305   0.9702583         0.8265444 0.7755385

5.0.2 Data Testing

Confusion Matrix

print(eval_test$conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3567  250
##        yes  101  201
##                                           
##                Accuracy : 0.9148          
##                  95% CI : (0.9058, 0.9231)
##     No Information Rate : 0.8905          
##     P-Value [Acc > NIR] : 1.405e-07       
##                                           
##                   Kappa : 0.489           
##                                           
##  Mcnemar's Test P-Value : 2.797e-15       
##                                           
##             Sensitivity : 0.44568         
##             Specificity : 0.97246         
##          Pos Pred Value : 0.66556         
##          Neg Pred Value : 0.93450         
##              Prevalence : 0.10949         
##          Detection Rate : 0.04880         
##    Detection Prevalence : 0.07332         
##       Balanced Accuracy : 0.70907         
##                                           
##        'Positive' Class : yes             
##

Metrics

print(eval_test$metrics)

##    Accuracy Sensitivity Specificity Balanced_Accuracy  F1_Score
## 1 0.9147851   0.4456763   0.9724646         0.7090704 0.5338645

5.0.3 Ringkasan Hasil Evaluasi Model

hasil_akhir <- bind_rows(
  cbind(Model = "SVM + SMOTE", Data = "Training_SMOTE", eval_train$metrics),
  cbind(Model = "SVM + SMOTE", Data = "Testing", eval_test$metrics)
)

print(hasil_akhir)

##         Model           Data  Accuracy Sensitivity Specificity
## 1 SVM + SMOTE Training_SMOTE 0.8909804   0.6828305   0.9702583
## 2 SVM + SMOTE        Testing 0.9147851   0.4456763   0.9724646
##   Balanced_Accuracy  F1_Score
## 1         0.8265444 0.7755385
## 2         0.7090704 0.5338645

Hasil evaluasi menunjukkan bahwa model Support Vector Machine yang dilatih dengan teknik SMOTE memiliki performa yang cukup baik pada data training maupun testing dari sisi akurasi dan spesifisitas. Pada data training SMOTE, model menghasilkan akurasi sebesar 0.891 dengan sensitivitas 0.683 dan spesifisitas 0.970, serta balanced accuracy 0.827 dan F1-score 0.776. Hal ini menunjukkan bahwa model mampu mengklasifikasikan sebagian besar data dengan benar, terutama dalam mengenali kelas mayoritas, yang terlihat dari nilai spesifisitas yang sangat tinggi.

Namun pada data testing, meskipun akurasi masih cukup tinggi yaitu 0.915 dan spesifisitas tetap tinggi (0.972), nilai sensitivitas menurun menjadi 0.446 dan F1-score menjadi 0.534. Penurunan ini menunjukkan bahwa model masih kurang baik dalam mendeteksi kelas positif (minoritas) pada data baru, meskipun teknik SMOTE telah digunakan untuk menyeimbangkan data saat pelatihan. Dengan demikian, model cenderung lebih baik dalam mengidentifikasi kelas negatif dibandingkan kelas positif, sehingga peningkatan performa terutama pada sensitivitas masih diperlukan agar model lebih seimbang dalam mendeteksi kedua kelas.

Prediksi dengan Pendekatan Support Vector Machine (SVM)

Cut Ashifa Sawallida

2026-03-06

1 Import Library & Data

2 Data Preprocessing

2.1 Transformasi Peubah

2.2 Identifikasi Missing Value

2.3 Imputasi Missing Value

2.4 Penanganan Outlier

2.5 Penanganan Skewness yang Invalid

2.6 Opsi Skenario Fitur dan Penyamaan Level Faktor Train dan Test

2.7 Pembuatan Peubah Dummy

2.8 SMOTE

2.9 Penanganan Zero Variance dan Scaling

2.9.1 Zero Variance

2.9.2 Scaling

3 Pemodelan Support Vector Machine (SVM)

4 Prediksi

5 Evaluasi Model

5.0.1 Data Training

5.0.2 Data Testing

5.0.3 Ringkasan Hasil Evaluasi Model