library(stringr)
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.3
## Loading required package: lattice
library(MLmetrics)
## Warning: package 'MLmetrics' was built under R version 4.3.3
##
## Attaching package: 'MLmetrics'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:base':
##
## Recall
library(class)
library(FSelector)
## Warning: package 'FSelector' was built under R version 4.3.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(party)
## Warning: package 'party' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.3.3
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
##
## Predict
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
##
## Attaching package: 'strucchange'
## The following object is masked from 'package:stringr':
##
## boundary
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
library(nnet)
## Warning: package 'nnet' was built under R version 4.3.3
# Read the data from the CSV file
databank <- read.csv("bank latih.csv")
Data exploration is the initial stage of data analysis, aimed at understanding the characteristics of the dataset before any preprocessing or modeling. Its main goals are to:

- determine the number of observations and variables in the dataset;
- understand the data type of each variable;
- identify missing or invalid data;
- assess the distribution of the data with descriptive statistics.
# Check the number of observations (rows) and attributes/variables (columns)
dim(databank) # Display the number of rows and columns
## [1] 4521 17
# Check the data type of each variable
str(databank) # Inspect the data structure
## 'data.frame': 4521 obs. of 17 variables:
## $ Age : int 30 33 35 30 59 35 36 39 41 43 ...
## $ job : chr "unemployed" "services" "management" "management" ...
## $ marital : chr "married" "married" "single" "married" ...
## $ education: chr "primary" "secondary" "tertiary" "tertier" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : chr "no" "yes" "yes" "yes" ...
## $ loan : chr "no" "yes" "no" "yes" ...
## $ contact : chr "cellular" "cellular" "cellular" "unknown" ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : chr "10" "may" "apr" "jun" ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : chr "unknown" "failure" "failure" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
# Display summary statistics of the data
summary(databank)
## Age job marital education
## Min. :19.00 Length:4521 Length:4521 Length:4521
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :41.17
## 3rd Qu.:49.00
## Max. :87.00
## default balance housing loan
## Length:4521 Min. :-3313 Length:4521 Length:4521
## Class :character 1st Qu.: 69 Class :character Class :character
## Mode :character Median : 444 Mode :character Mode :character
## Mean : 1423
## 3rd Qu.: 1480
## Max. :71188
## contact day month duration
## Length:4521 Min. : 1.00 Length:4521 Min. : 4
## Class :character 1st Qu.: 9.00 Class :character 1st Qu.: 104
## Mode :character Median :16.00 Mode :character Median : 185
## Mean :15.92 Mean : 264
## 3rd Qu.:21.00 3rd Qu.: 329
## Max. :31.00 Max. :3025
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.00 Min. : 0.0000 Length:4521
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.00 Median : 0.0000 Mode :character
## Mean : 2.794 Mean : 39.77 Mean : 0.5426
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :50.000 Max. :871.00 Max. :25.0000
## y
## Length:4521
## Class :character
## Mode :character
##
##
##
# Check for missing values
missing_summary <- colSums(is.na(databank))
# Overall summary of missing values
print(missing_summary)
## Age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
# Convert all text to lowercase (case folding)
databank <- databank %>%
mutate(across(where(is.character), tolower))
# Check whether any character column contains a period
kolom_karakter <- select(databank, where(is.character))
ada_titik <- sapply(kolom_karakter, function(x) any(grepl("\\.", x)))
# Display which columns contain a period
kolom_dengan_titik <- names(ada_titik[ada_titik == TRUE])
print(paste("Kolom yang mengandung tanda titik:", paste(kolom_dengan_titik, collapse = ", ")))
## [1] "Kolom yang mengandung tanda titik: job"
# Count the periods (.) in the job column
jumlah_titik <- sum(str_count(databank$job, "\\."))
print(paste("Jumlah tanda titik di kolom job:", jumlah_titik))
## [1] "Jumlah tanda titik di kolom job: 478"
# Remove the periods from the job column
databank <- databank %>%
mutate(job = gsub("\\.", "", job))
print("Tanda titik pada kolom job telah dihapus")
## [1] "Tanda titik pada kolom job telah dihapus"
# Recount the periods (.) in the job column to confirm the removal
jumlah_titik <- sum(str_count(databank$job, "\\."))
print(paste("Jumlah tanda titik di kolom job:", jumlah_titik))
## [1] "Jumlah tanda titik di kolom job: 0"
# List the unique values of every non-numeric column
categorical_cols <- sapply(databank, is.character) # Select the categorical columns
databank_categorical <- databank[, categorical_cols]
# Check the unique values in each categorical column
lapply(databank_categorical, unique)
## $job
## [1] "unemployed" "services" "management" "blue-collar"
## [5] "self-employed" "technician" "entrepreneur" "admin"
## [9] "student" "housemaid" "retired" "unknown"
##
## $marital
## [1] "married" "single" "menikah" "divorced"
##
## $education
## [1] "primary" "secondary" "tertiary" "tertier" "sekunder" "unknown"
##
## $default
## [1] "no" "yes"
##
## $housing
## [1] "no" "yes" "tidak"
##
## $loan
## [1] "no" "yes" "tidak"
##
## $contact
## [1] "cellular" "unknown" "seluler" "telephone"
##
## $month
## [1] "10" "may" "apr" "jun" "feb" "aug" "jan" "7" "nov" "jul" "oct" "sep"
## [13] "mar" "dec"
##
## $poutcome
## [1] "unknown" "failure" "other" "success"
##
## $y
## [1] "no" "iya" "yes"
# Handle inconsistent entries
databank <- databank %>%
mutate(across(c(marital, education, housing, loan, contact, y, month), as.character)) %>% # Ensure these columns are character
mutate(across(marital, ~ gsub("menikah", "married", .))) %>%
mutate(across(education, ~ gsub("sekunder", "secondary", .))) %>%
mutate(across(education, ~ gsub("tertier", "tertiary", .))) %>%
mutate(across(housing, ~ gsub("tidak", "no", .))) %>%
mutate(across(loan, ~ gsub("tidak", "no", .))) %>%
mutate(across(contact, ~ gsub("seluler", "cellular", .))) %>%
mutate(across(month, ~ gsub("^10$", "oct", .))) %>%
mutate(across(month, ~ gsub("^7$", "jul", .))) %>%
mutate(across(y, ~ gsub("iya", "yes", .)))
# Check the result of the changes
# Select only the categorical columns
categorical_cols <- sapply(databank, is.character)
databank_categorical <- databank[, categorical_cols]
# Loop over each categorical column
for (col in colnames(databank_categorical)) {
# Count the occurrences of each unique value
count_data <- databank %>%
count(!!sym(col)) %>%
arrange(desc(n)) # Sort in descending order
# Compute the overall total
total_count <- sum(count_data$n)
# Determine the maximum width of the category labels
max_width <- max(nchar(count_data[[col]]), na.rm = TRUE)
# Print the category header
cat("\n", col, "\n", strrep("=", nchar(col)), "\n")
cat(sprintf("%-*s %10s\n", max_width, "Kategori", "Jumlah")) # Table header
cat(strrep("-", max_width + 12), "\n") # Separator line
# Print the category counts in a tidy format
for (i in 1:nrow(count_data)) {
cat(sprintf("%-*s %10d\n", max_width, count_data[[col]][i], count_data$n[i]))
}
# Print the total, aligned with the table
cat(strrep("-", max_width + 12), "\n")
cat(sprintf("%-*s %10d\n\n", max_width, "Total", total_count))
}
##
## job
## ===
## Kategori Jumlah
## -------------------------
## management 969
## blue-collar 946
## technician 768
## admin 478
## services 417
## retired 230
## self-employed 183
## entrepreneur 168
## unemployed 128
## housemaid 112
## student 84
## unknown 38
## -------------------------
## Total 4521
##
##
## marital
## =======
## Kategori Jumlah
## --------------------
## married 2797
## single 1196
## divorced 528
## --------------------
## Total 4521
##
##
## education
## =========
## Kategori Jumlah
## ---------------------
## secondary 2306
## tertiary 1350
## primary 678
## unknown 187
## ---------------------
## Total 4521
##
##
## default
## =======
## Kategori Jumlah
## ---------------
## no 4445
## yes 76
## ---------------
## Total 4521
##
##
## housing
## =======
## Kategori Jumlah
## ---------------
## yes 2559
## no 1962
## ---------------
## Total 4521
##
##
## loan
## ====
## Kategori Jumlah
## ---------------
## no 3830
## yes 691
## ---------------
## Total 4521
##
##
## contact
## =======
## Kategori Jumlah
## ---------------------
## cellular 2896
## unknown 1324
## telephone 301
## ---------------------
## Total 4521
##
##
## month
## =====
## Kategori Jumlah
## ---------------
## may 1398
## jul 706
## aug 633
## jun 531
## nov 389
## apr 293
## feb 222
## jan 148
## oct 80
## sep 52
## mar 49
## dec 20
## ---------------
## Total 4521
##
##
## poutcome
## ========
## Kategori Jumlah
## -------------------
## unknown 3705
## failure 490
## other 197
## success 129
## -------------------
## Total 4521
##
##
## y
## =
## Kategori Jumlah
## ---------------
## no 4000
## yes 521
## ---------------
## Total 4521
# Check the validation rules
# Check for negative values in the Age column
invalid_age <- databank %>% filter(Age < 0)
if (nrow(invalid_age) > 0) {
print("Ditemukan nilai negatif di kolom Age:")
print(invalid_age)
} else {
print("Tidak ada nilai negatif di kolom Age.")
}
## [1] "Tidak ada nilai negatif di kolom Age."
# Check for negative values in the duration column
invalid_duration <- databank %>% filter(duration < 0)
if (nrow(invalid_duration) > 0) {
print("Ditemukan nilai negatif di kolom duration:")
print(invalid_duration)
} else {
print("Tidak ada nilai negatif di kolom duration.")
}
## [1] "Tidak ada nilai negatif di kolom duration."
## Define the maximum number of days for each month
max_days <- c("jan" = 31, "feb" = 29, "mar" = 31, "apr" = 30,
"may" = 31, "jun" = 30, "jul" = 31, "aug" = 31,
"sep" = 30, "oct" = 31, "nov" = 30, "dec" = 31)
# Ensure day is numeric
databank <- databank %>%
mutate(day = as.integer(day))
# Filter the rows with invalid dates, without adding a new column
invalid_rows <- databank %>%
filter(!(day >= 1 & day <= max_days[tolower(month)]))
# Check the validation result
if(nrow(invalid_rows) > 0) {
print("Ada data yang tidak valid:")
print(invalid_rows)
} else {
print("Semua data valid!")
}
## [1] "Semua data valid!"
# Check for duplicated rows
duplikasi <- sum(duplicated(databank))
print(paste("Jumlah baris duplikat:", duplikasi))
## [1] "Jumlah baris duplikat: 0"
Next, data reduction is performed, focusing on feature selection using correlations among the numeric variables and a multicollinearity test with the Variance Inflation Factor (VIF). These results also indicate whether a dimensionality-reduction method such as PCA would be worthwhile later on.
# Feature Selection
# 1. Check the correlations among the numeric variables
library(corrplot)
## corrplot 0.92 loaded
cor_matrix <- cor(databank %>% select_if(is.numeric))
corrplot(cor_matrix, method = "color")
# Select only the numeric variables
num_vars <- databank %>% select_if(is.numeric)
# Compute the correlation matrix
cor_matrix <- cor(num_vars, use = "pairwise.complete.obs")
# Display the correlation matrix rounded to 3 decimals
print(round(cor_matrix, 3))
## Age balance day duration campaign pdays previous
## Age 1.000 0.084 -0.018 -0.002 -0.005 -0.009 -0.004
## balance 0.084 1.000 -0.009 -0.016 -0.010 0.009 0.026
## day -0.018 -0.009 1.000 -0.025 0.161 -0.094 -0.059
## duration -0.002 -0.016 -0.025 1.000 -0.068 0.010 0.018
## campaign -0.005 -0.010 0.161 -0.068 1.000 -0.093 -0.068
## pdays -0.009 0.009 -0.094 0.010 -0.093 1.000 0.578
## previous -0.004 0.026 -0.059 0.018 -0.068 0.578 1.000
# Load the required library
library(car)
# Ensure y exists in databank and is numeric or a binary factor (0/1)
databank$y <- ifelse(databank$y == "yes", 1, 0)
# Select only the numeric variables from databank
num_vars <- databank[, sapply(databank, is.numeric)]
# Drop rows with NA so all variables have the same length
num_vars <- na.omit(num_vars)
# Check that all variables have the same number of rows
if (nrow(num_vars) > 0 && all(sapply(num_vars, length) == nrow(num_vars))) {
# Regression model used to compute the VIF
model <- lm(y ~ ., data = num_vars) # Make sure y is included in num_vars
# Compute the VIF
vif_values <- vif(model)
# Display the VIF values
print(vif_values)
} else {
print("Error: Ada masalah dengan panjang variabel atau dataset kosong!")
}
## Age balance day duration campaign pdays previous
## 1.007513 1.008210 1.033819 1.005384 1.037721 1.513716 1.502266
The corrplot results show that most correlations are low, meaning there is no strong linear relationship among these variables. There is also no strong indication of multicollinearity, since all VIF values are below 5. Because the correlations are low and there is no multicollinearity, dimensionality reduction is not necessary.
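For completeness: had the correlations or VIF values indicated redundancy, a dimensionality-reduction method such as PCA could have been applied to the numeric variables. A minimal sketch with base R's prcomp, hypothetical here since reduction proved unnecessary:
# Hypothetical PCA sketch on the numeric variables selected above (not used further)
pca_result <- prcomp(num_vars, center = TRUE, scale. = TRUE)
summary(pca_result) # proportion of variance explained by each component
head(pca_result$x[, 1:3]) # observation scores on the first three components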
#### Numerosity Reduction
Next, we attempt numerosity reduction using a clustering technique.
# Load the required libraries
library(tidyverse) # For data manipulation
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ strucchange::boundary() masks stringr::boundary()
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ✖ party::where() masks dplyr::where()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster) # For clustering analysis
## Warning: package 'cluster' was built under R version 4.3.3
library(factoextra) # For clustering visualizations
## Warning: package 'factoextra' was built under R version 4.3.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Data preprocessing: select only the numeric variables
num_vars <- databank %>% select_if(is.numeric)
# Normalize the data so the variables share a common scale
num_vars_scaled <- scale(num_vars)
# Determine the optimal number of clusters with the Elbow method
# Elbow plot with a marker line
fviz_nbclust(num_vars_scaled, kmeans, method = "wss") +
geom_vline(xintercept = 7, linetype = "dashed", color = "red") + # Red line at the elbow
labs(title = "Metode Elbow dengan Penanda Optimal Cluster")
# 4. Run K-Means clustering
set.seed(123) # For reproducible results
kmeans_result <- kmeans(num_vars_scaled, centers = 7, nstart = 25) # Adjust the number of clusters as needed
# Add the cluster assignments to the original data
databank$Cluster <- as.factor(kmeans_result$cluster)
# 5. Visualize the clustering results
fviz_cluster(kmeans_result, data = num_vars_scaled,
geom = "point", ellipse.type = "convex") +
ggtitle("Hasil Clustering dengan K-Means")
Data transformation converts the data into a form better suited for data mining. One transformation strategy is discretization: converting numeric values into interval labels (e.g., age: 0–10, 11–20) or conceptual labels (young, adult, elderly).
# Discretize the age variable (Age)
databank <- databank %>%
mutate(age_group = cut(Age, breaks = c(0, 25, 40, 60, Inf), labels = c("Young", "Adult", "Middle-aged", "Senior")))
#### **Encoding Categorical Variables as Numeric**
# Encode the job variable
databank$job <- factor(databank$job,
levels = c("management", "blue-collar", "technician", "admin",
"services", "retired", "self-employed", "entrepreneur",
"unemployed", "housemaid", "student", "unknown"),
labels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
# Encode the marital variable
databank$marital <- factor(databank$marital,
levels = c("married", "single", "divorced"),
labels = c(1, 2, 3))
# Encode the education variable
databank$education <- factor(databank$education,
levels = c("secondary", "tertiary", "primary", "unknown"),
labels = c(1, 2, 3, 4))
# Encode the default variable
databank$default <- factor(databank$default,
levels = c("no", "yes"),
labels = c(0, 1))
# Encode the housing variable
databank$housing <- factor(databank$housing,
levels = c("no", "yes"),
labels = c(0, 1))
# Encode the loan variable
databank$loan <- factor(databank$loan,
levels = c("no", "yes"),
labels = c(0, 1))
# Encode the contact variable
databank$contact <- factor(databank$contact,
levels = c("cellular", "unknown", "telephone"),
labels = c(1, 2, 3))
# Encode the month variable
databank$month <- factor(databank$month,
levels = c("jan", "feb", "mar", "apr", "may", "jun", "jul",
"aug", "sep", "oct", "nov", "dec"),
labels = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
# Encode the poutcome variable
databank$poutcome <- factor(databank$poutcome,
levels = c("unknown", "failure", "other", "success"),
labels = c(1, 2, 3, 4))
# Display the head of the data after encoding
head(databank)
## Age job marital education default balance housing loan contact day month
## 1 30 9 1 3 0 1787 0 0 1 19 10
## 2 33 5 1 1 0 4789 1 1 1 11 5
## 3 35 1 2 2 0 1350 1 0 1 16 4
## 4 30 1 1 2 0 1476 1 1 2 3 6
## 5 59 2 1 1 0 0 1 0 2 5 5
## 6 35 1 2 2 0 747 0 0 1 23 2
## duration campaign pdays previous poutcome y Cluster age_group
## 1 79 1 -1 0 1 0 3 Adult
## 2 220 1 339 4 2 0 5 Adult
## 3 185 1 330 1 2 0 5 Adult
## 4 199 4 -1 0 1 0 7 Adult
## 5 226 1 -1 0 1 0 2 Middle-aged
## 6 141 2 176 3 2 0 5 Adult
# Save the preprocessed data without the Age column
write.csv(databank %>% select(-Age), "D:/3SD2/SEMESTER 6/Datmin/Kelompok 3/hasilPrepro_databank.csv", row.names = FALSE)
print("Preprocessing selesai! Data telah disimpan sebagai hasilPrepro_databank.csv")
## [1] "Preprocessing selesai! Data telah disimpan sebagai hasilPrepro_databank.csv"
data <- read.csv("hasilPrepro_databank.csv", header = TRUE)
# Inspect the data structure
str(data)
## 'data.frame': 4521 obs. of 18 variables:
## $ job : int 9 5 1 1 2 1 7 3 8 5 ...
## $ marital : int 1 1 2 1 1 2 1 1 1 1 ...
## $ education: int 3 1 2 2 1 2 2 1 2 3 ...
## $ default : int 0 0 0 0 0 0 0 0 0 0 ...
## $ balance : int 1787 4789 1350 1476 0 747 307 147 221 -88 ...
## $ housing : int 0 1 1 1 1 0 1 1 1 1 ...
## $ loan : int 0 1 0 1 0 0 0 0 0 1 ...
## $ contact : int 1 1 1 2 2 1 1 1 2 1 ...
## $ day : int 19 11 16 3 5 23 14 6 14 17 ...
## $ month : int 10 5 4 6 5 2 5 5 5 4 ...
## $ duration : int 79 220 185 199 226 141 341 151 57 313 ...
## $ campaign : int 1 1 1 4 1 2 1 2 2 1 ...
## $ pdays : int -1 339 330 -1 -1 176 330 -1 -1 147 ...
## $ previous : int 0 4 1 0 0 3 2 0 0 2 ...
## $ poutcome : int 1 2 2 1 1 2 3 1 1 2 ...
## $ y : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Cluster : int 3 5 5 7 2 5 5 7 7 5 ...
## $ age_group: chr "Adult" "Adult" "Adult" "Adult" ...
# Ensure the target 'y' is a factor
data$y <- as.factor(data$y)
# Convert the categorical variables to factors
categorical_cols <- c("job", "marital", "education", "default", "housing",
"loan", "contact", "month", "poutcome")
data[categorical_cols] <- lapply(data[categorical_cols], as.factor)
# Feature selection using Information Gain
weights <- information.gain(y ~ ., data = data)
weights
## attr_importance
## job 6.925136e-03
## marital 2.060405e-03
## education 1.639668e-03
## default 8.418705e-07
## balance 3.699592e-03
## housing 5.425481e-03
## loan 2.850844e-03
## contact 1.132256e-02
## day 0.000000e+00
## month 2.072607e-02
## duration 7.277524e-02
## campaign 0.000000e+00
## pdays 2.463002e-02
## previous 1.124728e-02
## poutcome 2.604928e-02
## Cluster 3.002471e-01
## age_group 7.798001e-03
selected_features <- cutoff.k(weights, 10) # keep the 10 best features
selected_features
## [1] "Cluster" "duration" "poutcome" "pdays" "month" "contact"
## [7] "previous" "age_group" "job" "housing"
# Subset the data to the selected features
selected_formula <- as.formula(paste("y ~", paste(selected_features, collapse = "+")))
data_selected <- data[, c("y", selected_features)]
Explanation: classification models such as Naive Bayes typically require the target (label) to be a factor rather than numeric. In this dataset it is still numeric (0, 1), so it is first converted to a factor so that it is treated as a categorical label rather than an ordinal value; the same is done for the other categorical variables. Feature selection with information gain is then applied to reduce the dimensionality of the data, so that the model is faster, less prone to overfitting, and easier to interpret, and focuses only on the features most informative for predicting y. Information gain is a natural choice here because it suits classification, where the target y is categorical (here: 0 and 1): information.gain() measures how much each feature reduces the uncertainty (entropy) of the target y, and the larger the information gain, the more relevant the feature is considered. On that basis, the top 10 features by information gain score are selected.
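To make this concrete, information gain can be computed by hand as the drop in the entropy of y after conditioning on a feature. A small base R sketch, illustrative only (FSelector performs the equivalent computation internally; 'housing' is used as the example feature):
# Sketch: information gain of 'housing' with respect to the target y
entropy <- function(p) { p <- p[p > 0]; -sum(p * log2(p)) }
h_y <- entropy(prop.table(table(data$y))) # entropy of the target
h_y_given <- sum(prop.table(table(data$housing)) *
sapply(split(data$y, data$housing), function(s) entropy(prop.table(table(s)))))
h_y - h_y_given # information gain = reduction in entropy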
set.seed(123)
trainIndex <- createDataPartition(data_selected$y, p = 0.75, list = FALSE)
trainData <- data_selected[trainIndex, ]
testData <- data_selected[-trainIndex, ]
# Naive Bayes Model
model_nb <- naiveBayes(selected_formula, data = trainData)
model_nb
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.8846948 0.1153052
##
## Conditional probabilities:
## Cluster
## Y [,1] [,2]
## 0 4.06000 2.0389014
## 1 5.86445 0.5109465
##
## duration
## Y [,1] [,2]
## 0 224.8693 208.7637
## 1 536.6036 369.6727
##
## poutcome
## Y 1 2 3 4
## 0 0.83933333 0.10833333 0.04033333 0.01200000
## 1 0.63427110 0.12276215 0.07416880 0.16879795
##
## pdays
## Y [,1] [,2]
## 0 36.28833 96.14579
## 1 71.58312 126.36681
##
## month
## Y 1 2 3 4 5 6
## 0 0.034666667 0.046000000 0.006000000 0.063333333 0.326000000 0.114333333
## 1 0.035805627 0.071611253 0.043478261 0.107416880 0.194373402 0.092071611
## month
## Y 7 8 9 10 11 12
## 0 0.159000000 0.138333333 0.008666667 0.011000000 0.089666667 0.003000000
## 1 0.125319693 0.140664962 0.030690537 0.066496164 0.071611253 0.020460358
##
## contact
## Y 1 2 3
## 0 0.62600000 0.30833333 0.06566667
## 1 0.80562660 0.10741688 0.08695652
##
## previous
## Y [,1] [,2]
## 0 0.4883333 1.679128
## 1 1.0869565 1.959228
##
## age_group
## Y Adult Middle-aged Senior Young
## 0 0.52066667 0.43700000 0.01966667 0.02266667
## 1 0.47058824 0.37340153 0.10485934 0.05115090
##
## job
## Y 1 2 3 4 5 6
## 0 0.20533333 0.21600000 0.17133333 0.10933333 0.09600000 0.04500000
## 1 0.25319693 0.13299233 0.14833760 0.10485934 0.07672634 0.10230179
## job
## Y 7 8 9 10 11 12
## 0 0.04033333 0.03700000 0.02766667 0.02633333 0.01666667 0.00900000
## 1 0.03580563 0.03324808 0.02813299 0.02813299 0.04092072 0.01534527
##
## housing
## Y 0 1
## 0 0.4153333 0.5846667
## 1 0.5754476 0.4245524
The model used is a Naive Bayes classifier for discrete predictors, meaning every feature is treated as categorical (numeric features are assumed to follow a simple per-class normal distribution). The model assumes independence between features (the features are taken not to influence one another directly). The output first reports the a-priori probabilities (the probabilities before observing the data), which are simply the class proportions in the dataset: 88.47% for class 0 (the majority) and 11.53% for class 1 (the minority), so the dataset is strongly imbalanced toward class 0. It then lists the conditional probabilities per feature. With these, the model learns the conditional-probability patterns needed to classify whether a customer applying to the bank will be given a loan ("yes" = 1) or not ("no" = 0).
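For reference, these two pieces combine through Bayes' rule under the naive independence assumption (standard formulation):

$$P(y \mid x_1, \dots, x_p) \;\propto\; P(y) \prod_{j=1}^{p} P(x_j \mid y)$$

where $P(y)$ is the a-priori probability and each $P(x_j \mid y)$ comes from the conditional-probability tables above; the predicted class is the $y$ that maximizes this product.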
Next, predictions are generated for the training and testing data, which are then compared against the actual labels.
# Predictions for the training and testing data
pred_train <- predict(model_nb, trainData)
pred_test <- predict(model_nb, testData)
The predictions are evaluated using a confusion matrix and a range of metrics: Accuracy, Recall (Sensitivity), F1-score, Balanced Accuracy, and Macro-averaged F1-score.
## Evaluation of the Naive Bayes Predictions
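For reference, the per-class metrics computed below follow the standard definitions (TP, FP, FN counted per class):

$$\text{Precision} = \frac{TP}{TP + FP}, \qquad \text{Recall} = \frac{TP}{TP + FN}, \qquad F_1 = \frac{2\,\text{Precision}\cdot\text{Recall}}{\text{Precision} + \text{Recall}}$$

$$\text{Balanced Accuracy} = \frac{\text{Recall}_0 + \text{Recall}_1}{2}, \qquad \text{Macro } F_1 = \frac{F_{1,0} + F_{1,1}}{2}$$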
### **For the Training Data**
# === Training-set evaluation ===
# Build the confusion matrix
conf_train <- confusionMatrix(pred_train, trainData$y)
# Extract the confusion matrix table
conf_matrix <- conf_train$table
# Number of samples per actual class
jumlah_kelas_0 <- sum(trainData$y == 0)
jumlah_kelas_1 <- sum(trainData$y == 1)
total_data <- jumlah_kelas_0 + jumlah_kelas_1
# === Recall (Sensitivity) ===
recall_kelas_0 <- conf_train$byClass["Sensitivity"] # Recall of class 0 (positive)
recall_kelas_1 <- conf_train$byClass["Specificity"] # Recall of class 1 (negative)
# === Precision ===
# Precision of class 0 = TP_0 / (TP_0 + FP_0)
precision_0 <- conf_matrix[1,1] / (conf_matrix[1,1] + conf_matrix[1,2])
# Precision of class 1 = TP_1 / (TP_1 + FP_1)
precision_1 <- conf_matrix[2,2] / (conf_matrix[2,2] + conf_matrix[2,1])
# === F1-score ===
f1_kelas_0 <- conf_train$byClass["F1"] # F1 for class 0 from caret
f1_kelas_1 <- 2 * (precision_1 * recall_kelas_1) / (precision_1 + recall_kelas_1)
# === Macro & Weighted Averages ===
# Macro
precision_macro <- (precision_0 + precision_1) / 2
recall_macro <- (recall_kelas_0 + recall_kelas_1) / 2
f1_macro <- (f1_kelas_0 + f1_kelas_1) / 2
# Weighted
precision_weighted <- (precision_0 * jumlah_kelas_0 + precision_1 * jumlah_kelas_1) / total_data
recall_weighted <- (recall_kelas_0 * jumlah_kelas_0 + recall_kelas_1 * jumlah_kelas_1) / total_data
# Balanced Accuracy = Macro Recall
balanced_accuracy <- recall_macro
# Accuracy
accuracy_train <- conf_train$overall["Accuracy"]
# Display the evaluation results for the training data
cat("\n=== Evaluasi Data Training ===\n")
##
## === Evaluasi Data Training ===
print(conf_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2837 130
## 1 163 261
##
## Accuracy : 0.9136
## 95% CI : (0.9036, 0.9228)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : 2.54e-08
##
## Kappa : 0.5915
##
## Mcnemar's Test P-Value : 0.06156
##
## Sensitivity : 0.9457
## Specificity : 0.6675
## Pos Pred Value : 0.9562
## Neg Pred Value : 0.6156
## Prevalence : 0.8847
## Detection Rate : 0.8366
## Detection Prevalence : 0.8750
## Balanced Accuracy : 0.8066
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (no): %.4f\n", recall_kelas_0))
## Recall kelas 0 (no): 0.9457
cat(sprintf("Precision kelas 0 (no): %.4f\n", precision_0))
## Precision kelas 0 (no): 0.9562
cat(sprintf("F1-score kelas 0 (no): %.4f\n", f1_kelas_0))
## F1-score kelas 0 (no): 0.9509
cat(sprintf("Recall kelas 1 (yes): %.4f\n", recall_kelas_1))
## Recall kelas 1 (yes): 0.6675
cat(sprintf("Precision kelas 1 (yes): %.4f\n", precision_1))
## Precision kelas 1 (yes): 0.6156
cat(sprintf("F1-score kelas 1 (yes): %.4f\n", f1_kelas_1))
## F1-score kelas 1 (yes): 0.6405
cat(sprintf("Accuracy: %.4f\n", accuracy_train))
## Accuracy: 0.9136
cat(sprintf("Balanced Accuracy: %.4f\n", balanced_accuracy))
## Balanced Accuracy: 0.8066
cat(sprintf("Macro Average Precision: %.4f\n", precision_macro))
## Macro Average Precision: 0.7859
cat(sprintf("Weighted Average Precision: %.4f\n", precision_weighted))
## Weighted Average Precision: 0.9169
cat(sprintf("Macro Average Recall: %.4f\n", recall_macro))
## Macro Average Recall: 0.8066
cat(sprintf("Weighted Average Recall: %.4f\n", recall_weighted))
## Weighted Average Recall: 0.9136
cat(sprintf("Macro Average F1-score: %.4f\n", f1_macro))
## Macro Average F1-score: 0.7957
Interpretation of the output: from the confusion matrix, out of the training data, 2837 observations are correctly classified as class 0 (no) and 261 are correctly classified as class 1 (yes), while 130 class-1 observations are misclassified as 0 (false negatives) and 163 class-0 observations are misclassified as 1 (false positives).
The Balanced Accuracy of 0.8066 is the average recall over the two classes; it is used to avoid bias caused by the class imbalance.
The Macro F1-score of 0.7957 means that, on average, the model balances precision and recall across the two classes (0 and 1) at about 79.6%.
### **For the Testing Data**
# === Testing-set evaluation ===
# Build the confusion matrix
conf_test <- confusionMatrix(pred_test, testData$y)
# Extract the confusion matrix table
conf_matrix_test <- conf_test$table
# Number of samples per actual class
jumlah_kelas_0_test <- sum(testData$y == 0)
jumlah_kelas_1_test <- sum(testData$y == 1)
total_data_test <- jumlah_kelas_0_test + jumlah_kelas_1_test
# === Recall (Sensitivity) ===
recall_kelas_0_test <- conf_test$byClass["Sensitivity"] # Recall of class 0 (positive)
recall_kelas_1_test <- conf_test$byClass["Specificity"] # Recall of class 1 (negative)
# === Precision ===
# Precision of class 0 = TP_0 / (TP_0 + FP_0)
precision_0_test <- conf_matrix_test[1,1] / (conf_matrix_test[1,1] + conf_matrix_test[1,2])
# Precision of class 1 = TP_1 / (TP_1 + FP_1)
precision_1_test <- conf_matrix_test[2,2] / (conf_matrix_test[2,2] + conf_matrix_test[2,1])
# === F1-score ===
f1_kelas_0_test <- conf_test$byClass["F1"] # F1 for class 0 from caret
f1_kelas_1_test <- 2 * (precision_1_test * recall_kelas_1_test) / (precision_1_test + recall_kelas_1_test)
# === Macro & Weighted Averages ===
# Macro
precision_macro_test <- (precision_0_test + precision_1_test) / 2
recall_macro_test <- (recall_kelas_0_test + recall_kelas_1_test) / 2
f1_macro_test <- (f1_kelas_0_test + f1_kelas_1_test) / 2
# Weighted
precision_weighted_test <- (precision_0_test * jumlah_kelas_0_test + precision_1_test * jumlah_kelas_1_test) / total_data_test
recall_weighted_test <- (recall_kelas_0_test * jumlah_kelas_0_test + recall_kelas_1_test * jumlah_kelas_1_test) / total_data_test
# Balanced Accuracy = Macro Recall
balanced_accuracy_test <- recall_macro_test
# Accuracy
accuracy_test <- conf_test$overall["Accuracy"]
# Display the evaluation results for the testing data
cat("\n=== Evaluasi Data Testing ===\n")
##
## === Evaluasi Data Testing ===
print(conf_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 961 44
## 1 39 86
##
## Accuracy : 0.9265
## 95% CI : (0.9098, 0.9411)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 2.202e-06
##
## Kappa : 0.6331
##
## Mcnemar's Test P-Value : 0.6606
##
## Sensitivity : 0.9610
## Specificity : 0.6615
## Pos Pred Value : 0.9562
## Neg Pred Value : 0.6880
## Prevalence : 0.8850
## Detection Rate : 0.8504
## Detection Prevalence : 0.8894
## Balanced Accuracy : 0.8113
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (no): %.4f\n", recall_kelas_0_test))
## Recall kelas 0 (no): 0.9610
cat(sprintf("Precision kelas 0 (no): %.4f\n", precision_0_test))
## Precision kelas 0 (no): 0.9562
cat(sprintf("F1-score kelas 0 (no): %.4f\n", f1_kelas_0_test))
## F1-score kelas 0 (no): 0.9586
cat(sprintf("Recall kelas 1 (yes): %.4f\n", recall_kelas_1_test))
## Recall kelas 1 (yes): 0.6615
cat(sprintf("Precision kelas 1 (yes): %.4f\n", precision_1_test))
## Precision kelas 1 (yes): 0.6880
cat(sprintf("F1-score kelas 1 (yes): %.4f\n", f1_kelas_1_test))
## F1-score kelas 1 (yes): 0.6745
cat(sprintf("Accuracy: %.4f\n", accuracy_test))
## Accuracy: 0.9265
cat(sprintf("Balanced Accuracy: %.4f\n", balanced_accuracy_test))
## Balanced Accuracy: 0.8113
cat(sprintf("Macro Average Precision: %.4f\n", precision_macro_test))
## Macro Average Precision: 0.8221
cat(sprintf("Weighted Average Precision: %.4f\n", precision_weighted_test))
## Weighted Average Precision: 0.9254
cat(sprintf("Macro Average Recall: %.4f\n", recall_macro_test))
## Macro Average Recall: 0.8113
cat(sprintf("Weighted Average Recall: %.4f\n", recall_weighted_test))
## Weighted Average Recall: 0.9265
cat(sprintf("Macro Average F1-score: %.4f\n", f1_macro_test))
## Macro Average F1-score: 0.8166
Interpretation of the output: from the confusion matrix, out of the testing data, 961 observations are correctly classified as class 0 (no) and 86 are correctly classified as class 1 (yes), while 44 class-1 observations are misclassified as 0 (false negatives) and 39 class-0 observations are misclassified as 1 (false positives).
The Balanced Accuracy of 0.8113 is the average recall over the two classes, used to avoid bias caused by the class imbalance; a value of 81.13% shows the model is reasonably balanced in recognizing both classes.
The Macro F1-score of 0.8166 means that, on average, the model balances precision and recall across the two classes at about 81.66%; the model is relatively fair to both classes, although performance on class 1 (yes) is still lower than on class 0.
# -----------------------------------------
# Decision Tree Model using ctree
# -----------------------------------------
# Ensure all character variables are converted to factors
trainData[] <- lapply(trainData, function(x) {
if (is.character(x)) factor(x) else x
})
testData[] <- lapply(testData, function(x) {
if (is.character(x)) factor(x) else x
})
# Align the factor levels between the training and testing data
levels_train <- levels(trainData$y)
trainData$y <- factor(trainData$y, levels = levels_train)
testData$y <- factor(testData$y, levels = levels_train)
# Train the Decision Tree model using ctree
model_tree <- ctree(y ~ ., data = trainData)
print(model_tree)
##
## Conditional inference tree with 26 terminal nodes
##
## Response: y
## Inputs: Cluster, duration, poutcome, pdays, month, contact, previous, age_group, job, housing
## Number of observations: 3391
##
## 1) duration <= 383; criterion = 1, statistic = 523.66
## 2) poutcome == {4}; criterion = 1, statistic = 432.766
## 3) Cluster <= 5; criterion = 1, statistic = 27.718
## 4)* weights = 33
## 3) Cluster > 5
## 5)* weights = 40
## 2) poutcome == {1, 2, 3}
## 6) month == {3, 9, 10}; criterion = 1, statistic = 228.702
## 7) duration <= 184; criterion = 1, statistic = 17.148
## 8)* weights = 60
## 7) duration > 184
## 9) Cluster <= 5; criterion = 0.986, statistic = 10.214
## 10)* weights = 16
## 9) Cluster > 5
## 11)* weights = 22
## 6) month == {1, 2, 4, 5, 6, 7, 8, 11, 12}
## 12) age_group == {Senior, Young}; criterion = 1, statistic = 81.432
## 13) Cluster <= 5; criterion = 1, statistic = 20.057
## 14)* weights = 73
## 13) Cluster > 5
## 15) Cluster <= 6; criterion = 1, statistic = 37
## 16)* weights = 18
## 15) Cluster > 6
## 17)* weights = 20
## 12) age_group == {Adult, Middle-aged}
## 18) duration <= 222; criterion = 1, statistic = 57.511
## 19) Cluster <= 5; criterion = 1, statistic = 36.503
## 20) poutcome == {3}; criterion = 0.967, statistic = 11.387
## 21)* weights = 67
## 20) poutcome == {1, 2}
## 22) pdays <= 366; criterion = 0.995, statistic = 15.39
## 23)* weights = 1248
## 22) pdays > 366
## 24)* weights = 14
## 19) Cluster > 5
## 25) Cluster <= 6; criterion = 1, statistic = 509
## 26)* weights = 23
## 25) Cluster > 6
## 27)* weights = 487
## 18) duration > 222
## 28) poutcome == {3}; criterion = 1, statistic = 39.538
## 29)* weights = 25
## 28) poutcome == {1, 2}
## 30) Cluster <= 3; criterion = 1, statistic = 29.366
## 31)* weights = 312
## 30) Cluster > 3
## 32) Cluster <= 6; criterion = 0.998, statistic = 19.071
## 33) Cluster <= 5; criterion = 1, statistic = 49.222
## 34) month == {1, 2, 4, 5, 6, 11, 12}; criterion = 1, statistic = 36.339
## 35)* weights = 54
## 34) month == {7, 8}
## 36)* weights = 7
## 33) Cluster > 5
## 37)* weights = 28
## 32) Cluster > 6
## 38)* weights = 167
## 1) duration > 383
## 39) Cluster <= 5; criterion = 1, statistic = 129.463
## 40) age_group == {Senior, Young}; criterion = 1, statistic = 54.157
## 41)* weights = 11
## 40) age_group == {Adult, Middle-aged}
## 42) pdays <= 364; criterion = 1, statistic = 37.685
## 43) previous <= 7; criterion = 1, statistic = 24.237
## 44)* weights = 297
## 43) previous > 7
## 45)* weights = 7
## 42) pdays > 364
## 46)* weights = 10
## 39) Cluster > 5
## 47) Cluster <= 6; criterion = 1, statistic = 314.661
## 48) duration <= 1309; criterion = 1, statistic = 47.296
## 49)* weights = 201
## 48) duration > 1309
## 50)* weights = 24
## 47) Cluster > 6
## 51)* weights = 127
# Plot the Decision Tree
plot(model_tree)
# Predict on the training and testing data
pred_tree_train <- predict(model_tree, trainData)
pred_tree_test <- predict(model_tree, testData)
# Align the factor levels of the predictions with the actual data
pred_tree_train <- factor(pred_tree_train, levels = levels_train)
pred_tree_test <- factor(pred_tree_test, levels = levels_train)
# === Evaluation on the Training Set ===
evaluasi_tree_train <- confusionMatrix(pred_tree_train, trainData$y)
tabel_konfusi_tree_train <- evaluasi_tree_train$table
recall_0_tree_train <- evaluasi_tree_train$byClass["Sensitivity"]
recall_1_tree_train <- evaluasi_tree_train$byClass["Specificity"]
f1_0_tree_train <- evaluasi_tree_train$byClass["F1"]
precision_1_tree_train <- tabel_konfusi_tree_train[2,2] / sum(tabel_konfusi_tree_train[2,])
f1_1_tree_train <- 2 * (precision_1_tree_train * recall_1_tree_train) / (precision_1_tree_train + recall_1_tree_train)
akurasi_tree_train <- evaluasi_tree_train$overall["Accuracy"]
akurasi_avg_tree_train <- (recall_0_tree_train + recall_1_tree_train) / 2
f1_macro_tree_train <- (f1_0_tree_train + f1_1_tree_train) / 2
cat("\n=== Evaluasi Model Tree pada Data Training ===\n")
##
## === Evaluasi Model Tree pada Data Training ===
print(evaluasi_tree_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2982 46
## 1 18 345
##
## Accuracy : 0.9811
## 95% CI : (0.976, 0.9854)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9045
##
## Mcnemar's Test P-Value : 0.0007382
##
## Sensitivity : 0.9940
## Specificity : 0.8824
## Pos Pred Value : 0.9848
## Neg Pred Value : 0.9504
## Prevalence : 0.8847
## Detection Rate : 0.8794
## Detection Prevalence : 0.8930
## Balanced Accuracy : 0.9382
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0_tree_train))
## Recall kelas 0 (negatif): 0.9940
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0_tree_train))
## F1-score kelas 0 (negatif): 0.9894
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1_tree_train))
## Recall kelas 1 (positif): 0.8824
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1_tree_train))
## F1-score kelas 1 (positif): 0.9151
cat(sprintf("Akurasi: %.4f\n", akurasi_tree_train))
## Akurasi: 0.9811
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg_tree_train))
## Balanced Accuracy: 0.9382
cat(sprintf("F1-score Macro: %.4f\n", f1_macro_tree_train))
## F1-score Macro: 0.9523
# === Evaluation on the Testing Set ===
evaluasi_tree_test <- confusionMatrix(pred_tree_test, testData$y)
tabel_konfusi_tree_test <- evaluasi_tree_test$table
recall_0_tree_test <- evaluasi_tree_test$byClass["Sensitivity"]
recall_1_tree_test <- evaluasi_tree_test$byClass["Specificity"]
f1_0_tree_test <- evaluasi_tree_test$byClass["F1"]
precision_1_tree_test <- tabel_konfusi_tree_test[2,2] / sum(tabel_konfusi_tree_test[2,])
f1_1_tree_test <- 2 * (precision_1_tree_test * recall_1_tree_test) / (precision_1_tree_test + recall_1_tree_test)
akurasi_tree_test <- evaluasi_tree_test$overall["Accuracy"]
akurasi_avg_tree_test <- (recall_0_tree_test + recall_1_tree_test) / 2
f1_macro_tree_test <- (f1_0_tree_test + f1_1_tree_test) / 2
cat("\n=== Evaluasi Model Tree pada Data Testing ===\n")
##
## === Evaluasi Model Tree pada Data Testing ===
print(evaluasi_tree_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 990 18
## 1 10 112
##
## Accuracy : 0.9752
## 95% CI : (0.9644, 0.9835)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.875
##
## Mcnemar's Test P-Value : 0.1859
##
## Sensitivity : 0.9900
## Specificity : 0.8615
## Pos Pred Value : 0.9821
## Neg Pred Value : 0.9180
## Prevalence : 0.8850
## Detection Rate : 0.8761
## Detection Prevalence : 0.8920
## Balanced Accuracy : 0.9258
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0_tree_test))
## Recall kelas 0 (negatif): 0.9900
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0_tree_test))
## F1-score kelas 0 (negatif): 0.9861
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1_tree_test))
## Recall kelas 1 (positif): 0.8615
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1_tree_test))
## F1-score kelas 1 (positif): 0.8889
cat(sprintf("Akurasi: %.4f\n", akurasi_tree_test))
## Akurasi: 0.9752
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg_tree_test))
## Balanced Accuracy: 0.9258
cat(sprintf("F1-score Macro: %.4f\n", f1_macro_tree_test))
## F1-score Macro: 0.9375
Basic principle: a Random Forest is an ensemble of many decision trees; it makes its prediction by letting the trees "vote".
How it works (see the sketch below):

1. Build many decision trees from random subsets of the data and the features.
2. Each tree makes its own prediction.
3. The final result is decided by majority vote (for classification) or by averaging (for regression).
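As a minimal stand-alone illustration of that voting idea, here is a sketch with the randomForest package loaded above (the model actually evaluated below is instead trained through caret with SMOTE and tuning):
# Sketch only: a plain random forest on the selected features; each of the
# 50 bootstrapped trees votes and the majority vote becomes the prediction
set.seed(123)
rf_sketch <- randomForest(y ~ ., data = trainData, ntree = 50)
head(predict(rf_sketch, testData)) # majority-vote class predictions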
# === 3. SMOTE for the class imbalance (training set only) ===
trainData$y <- as.factor(trainData$y) # SMOTE requires a factor
kontrol_cv <- trainControl(method = "cv", number = 4, sampling = "smote")
# Light tuning of mtry
grid_mtry <- expand.grid(mtry = c(2, 3, 4, 5))
set.seed(123)
model_rf <- train(
y ~ .,
data = trainData,
method = "rf",
trControl = kontrol_cv,
tuneGrid = expand.grid(mtry = c(2, 3, 4)),
ntree = 50,
nodesize = 20, # larger nodesize so individual trees do not grow too deep
maxnodes = 25, # cap the number of nodes to limit overfitting
importance = TRUE
)
## Warning: package 'themis' was built under R version 4.3.3
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.3.3
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats4':
##
## update
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
The model is tuned with 4-fold cross-validation, which reduces the risk of overfitting because in every fold the model is evaluated on data that were not used for training.
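The fold mechanics can be made explicit with caret's createFolds, a sketch in which each observation is held out in exactly one of the 4 folds:
# Sketch: build 4 folds on the training labels (stratified by y)
set.seed(123)
folds <- createFolds(trainData$y, k = 4) # list of held-out index sets
sapply(folds, length) # roughly equal fold sizes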
# === Evaluation on the training set ===
pred_train <- predict(model_rf, trainData)
evaluasi_train <- confusionMatrix(pred_train, trainData$y)
# === Extract and display the metrics ===
tabel_konfusi <- evaluasi_train$table
recall_0 <- evaluasi_train$byClass["Sensitivity"]
recall_1 <- evaluasi_train$byClass["Specificity"]
recall_macro_train <- (recall_0 + recall_1)/2
f1_0 <- evaluasi_train$byClass["F1"]
# Compute precision per class
precision_0 <- tabel_konfusi[1,1] / sum(tabel_konfusi[1,])
precision_1 <- tabel_konfusi[2,2] / sum(tabel_konfusi[2,])
# Compute the F1-score of class 1
f1_1 <- 2 * (precision_1 * recall_1) / (precision_1 + recall_1)
# Compute accuracy & average F1-score
akurasi <- evaluasi_train$overall["Accuracy"]
akurasi_avg <- (recall_0 + recall_1) / 2
f1_macro <- (f1_0 + f1_1) / 2
# Macro and weighted precision
precision_macro <- (precision_0 + precision_1) / 2
n_0 <- sum(trainData$y == "0")
n_1 <- sum(trainData$y == "1")
precision_weighted <- (n_0 * precision_0 + n_1 * precision_1) / (n_0 + n_1)
# === 8. Print the evaluation results ===
cat("\n=== Evaluasi Model pada Data Train ===\n")
##
## === Evaluasi Model pada Data Train ===
print(evaluasi_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2895 64
## 1 105 327
##
## Accuracy : 0.9502
## 95% CI : (0.9423, 0.9572)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7664
##
## Mcnemar's Test P-Value : 0.002091
##
## Sensitivity : 0.9650
## Specificity : 0.8363
## Pos Pred Value : 0.9784
## Neg Pred Value : 0.7569
## Prevalence : 0.8847
## Detection Rate : 0.8537
## Detection Prevalence : 0.8726
## Balanced Accuracy : 0.9007
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0))
## Recall kelas 0 (negatif): 0.9650
cat(sprintf("Precision kelas 0 (negatif): %.4f\n", precision_0))
## Precision kelas 0 (negatif): 0.9784
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0))
## F1-score kelas 0 (negatif): 0.9716
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1))
## Recall kelas 1 (positif): 0.8363
cat(sprintf("Precision kelas 1 (positif): %.4f\n", precision_1))
## Precision kelas 1 (positif): 0.7569
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1))
## F1-score kelas 1 (positif): 0.7947
cat(sprintf("Akurasi: %.4f\n", akurasi))
## Akurasi: 0.9502
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg))
## Balanced Accuracy: 0.9007
cat(sprintf("Precision Macro: %.4f\n", precision_macro))
## Precision Macro: 0.8677
cat(sprintf("Precision Weighted: %.4f\n", precision_weighted))
## Precision Weighted: 0.9528
cat(sprintf("Recall macro: %.4f\n", recall_macro_train))
## Recall macro: 0.9007
cat(sprintf("F1-score Macro: %.4f\n", f1_macro))
## F1-score Macro: 0.8831
# === Evaluation on the testing set ===
pred_test <- predict(model_rf, testData)
evaluasi_test <- confusionMatrix(pred_test, testData$y)
# === Extract and display the metrics ===
tabel_konfusi <- evaluasi_test$table
recall_0 <- evaluasi_test$byClass["Sensitivity"]
recall_1 <- evaluasi_test$byClass["Specificity"]
recall_macro_test <- (recall_0 + recall_1)/2
f1_0 <- evaluasi_test$byClass["F1"]
# Compute precision per class
precision_0 <- tabel_konfusi[1,1] / sum(tabel_konfusi[1,])
precision_1 <- tabel_konfusi[2,2] / sum(tabel_konfusi[2,])
# Compute the F1-score of class 1
f1_1 <- 2 * (precision_1 * recall_1) / (precision_1 + recall_1)
# Compute accuracy & macro F1
akurasi <- evaluasi_test$overall["Accuracy"]
akurasi_avg <- (recall_0 + recall_1) / 2
f1_macro <- (f1_0 + f1_1) / 2
# Compute macro & weighted precision
n_0 <- sum(testData$y == "0")
n_1 <- sum(testData$y == "1")
precision_macro <- (precision_0 + precision_1) / 2
precision_weighted <- (n_0 * precision_0 + n_1 * precision_1) / (n_0 + n_1)
# === Print the evaluation results ===
cat("\n=== Evaluasi Model pada Data Test ===\n")
##
## === Evaluasi Model pada Data Test ===
print(evaluasi_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 970 31
## 1 30 99
##
## Accuracy : 0.946
## 95% CI : (0.9312, 0.9585)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 1.082e-12
##
## Kappa : 0.734
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9700
## Specificity : 0.7615
## Pos Pred Value : 0.9690
## Neg Pred Value : 0.7674
## Prevalence : 0.8850
## Detection Rate : 0.8584
## Detection Prevalence : 0.8858
## Balanced Accuracy : 0.8658
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0))
## Recall kelas 0 (negatif): 0.9700
cat(sprintf("Precision kelas 0 (negatif): %.4f\n", precision_0))
## Precision kelas 0 (negatif): 0.9690
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0))
## F1-score kelas 0 (negatif): 0.9695
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1))
## Recall kelas 1 (positif): 0.7615
cat(sprintf("Precision kelas 1 (positif): %.4f\n", precision_1))
## Precision kelas 1 (positif): 0.7674
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1))
## F1-score kelas 1 (positif): 0.7645
cat(sprintf("Akurasi: %.4f\n", akurasi))
## Akurasi: 0.9460
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg))
## Balanced Accuracy: 0.8658
cat(sprintf("Precision Macro: %.4f\n", precision_macro))
## Precision Macro: 0.8682
cat(sprintf("Precision Weighted: %.4f\n", precision_weighted))
## Precision Weighted: 0.9458
cat(sprintf("Recall Macro: %.4f\n", recall_macro_test))
## Recall Macro: 0.8658
cat(sprintf("F1-score Macro: %.4f\n", f1_macro))
## F1-score Macro: 0.8670
## =====================
## kNN Model with Cross-Validation & Preprocessing
## =====================
# === SMOTE for the class imbalance (training set only) ===
trainData$y <- as.factor(trainData$y) # SMOTE requires a factor
# 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10, sampling = "smote")
# Train the kNN model and search for the best k, with automatic preprocessing
set.seed(123)
knn_fit <- train(
y ~ .,
data = trainData,
method = "knn",
trControl = ctrl,
preProcess = c("center", "scale"), # caret handles the scaling
tuneLength = 25
)
# Inspect the best k and the CV performance
print(knn_fit)
## k-Nearest Neighbors
##
## 3391 samples
## 10 predictor
## 2 classes: '0', '1'
##
## Pre-processing: centered (35), scaled (35)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 3052, 3052, 3051, 3052, 3052, 3052, ...
## Addtional sampling using SMOTE prior to pre-processing
##
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8504902 0.4016002
## 7 0.8384019 0.3868677
## 9 0.8322054 0.3709745
## 11 0.8342634 0.3842933
## 13 0.8389884 0.3902729
## 15 0.8310246 0.3757812
## 17 0.8274866 0.3785580
## 19 0.8322054 0.3836027
## 21 0.8325030 0.3875498
## 23 0.8295523 0.3830672
## 25 0.8245393 0.3748604
## 27 0.8251249 0.3680730
## 29 0.8263049 0.3734873
## 31 0.8292478 0.3812176
## 33 0.8257132 0.3800156
## 35 0.8295462 0.3813649
## 37 0.8192226 0.3650877
## 39 0.8180392 0.3690391
## 41 0.8174518 0.3668668
## 43 0.8147952 0.3673494
## 45 0.8168619 0.3708661
## 47 0.8091983 0.3607834
## 49 0.8094898 0.3634133
## 51 0.8083177 0.3575371
## 53 0.8097857 0.3631431
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 5.
plot(knn_fit)
# One-hot encoding (note: the " ~ ." formula also dummy-codes y, so y.0/y.1 end up among the kNN predictors)
dummies <- dummyVars(" ~ .", data = data_selected)
data_numeric <- data.frame(predict(dummies, newdata = data_selected))
# Add the label back
data_numeric$y <- data_selected$y
# Remove rows with NA/NaN/Inf
data_numeric <- data_numeric[complete.cases(data_numeric), ]
data_numeric <- data_numeric[!apply(data_numeric, 1, function(x) any(is.infinite(x))), ]
# Split the data into train and test sets
set.seed(123)
trainIndex <- createDataPartition(data_numeric$y, p = 0.75, list = FALSE)
trainData <- data_numeric[trainIndex, ]
testData <- data_numeric[-trainIndex, ]
# Separate the features and the label
x_train <- trainData[, setdiff(names(trainData), "y")]
y_train <- trainData$y
x_test <- testData[, setdiff(names(testData), "y")]
y_test <- testData$y
# Run KNN (k = 5, the value selected by cross-validation)
knn_train_pred <- knn(train = x_train, test = x_train, cl = y_train, k = 5)
knn_test_pred <- knn(train = x_train, test = x_test, cl = y_train, k = 5)
# === Evaluation on the Training Set ===
evaluasi_knn_train <- confusionMatrix(knn_train_pred, y_train)
tabel_konfusi_knn_train <- evaluasi_knn_train$table
recall_0_knn_train <- evaluasi_knn_train$byClass["Sensitivity"]
recall_1_knn_train <- evaluasi_knn_train$byClass["Specificity"]
f1_0_knn_train <- evaluasi_knn_train$byClass["F1"]
precision_1_knn_train <- tabel_konfusi_knn_train[2,2] / sum(tabel_konfusi_knn_train[2,])
f1_1_knn_train <- 2 * (precision_1_knn_train * recall_1_knn_train) / (precision_1_knn_train + recall_1_knn_train)
akurasi_knn_train <- evaluasi_knn_train$overall["Accuracy"]
akurasi_avg_knn_train <- (recall_0_knn_train + recall_1_knn_train) / 2
f1_macro_knn_train <- (f1_0_knn_train + f1_1_knn_train) / 2
cat("\n=== Evaluasi Model KNN pada Data Training ===\n")
##
## === Evaluasi Model KNN pada Data Training ===
print(evaluasi_knn_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2912 172
## 1 88 219
##
## Accuracy : 0.9233
## 95% CI : (0.9139, 0.9321)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : 6.326e-14
##
## Kappa : 0.5855
##
## Mcnemar's Test P-Value : 2.641e-07
##
## Sensitivity : 0.9707
## Specificity : 0.5601
## Pos Pred Value : 0.9442
## Neg Pred Value : 0.7134
## Prevalence : 0.8847
## Detection Rate : 0.8587
## Detection Prevalence : 0.9095
## Balanced Accuracy : 0.7654
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0_knn_train))
## Recall kelas 0 (negatif): 0.9707
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0_knn_train))
## F1-score kelas 0 (negatif): 0.9573
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1_knn_train))
## Recall kelas 1 (positif): 0.5601
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1_knn_train))
## F1-score kelas 1 (positif): 0.6275
cat(sprintf("Akurasi: %.4f\n", akurasi_knn_train))
## Akurasi: 0.9233
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg_knn_train))
## Balanced Accuracy: 0.7654
cat(sprintf("F1-score Macro: %.4f\n", f1_macro_knn_train))
## F1-score Macro: 0.7924
# === Evaluation on the Testing Set ===
evaluasi_knn_test <- confusionMatrix(knn_test_pred, y_test)
tabel_konfusi_knn_test <- evaluasi_knn_test$table
recall_0_knn_test <- evaluasi_knn_test$byClass["Sensitivity"]
recall_1_knn_test <- evaluasi_knn_test$byClass["Specificity"]
f1_0_knn_test <- evaluasi_knn_test$byClass["F1"]
precision_1_knn_test <- tabel_konfusi_knn_test[2,2] / sum(tabel_konfusi_knn_test[2,])
f1_1_knn_test <- 2 * (precision_1_knn_test * recall_1_knn_test) / (precision_1_knn_test + recall_1_knn_test)
akurasi_knn_test <- evaluasi_knn_test$overall["Accuracy"]
akurasi_avg_knn_test <- (recall_0_knn_test + recall_1_knn_test) / 2
f1_macro_knn_test <- (f1_0_knn_test + f1_1_knn_test) / 2
cat("\n=== Evaluasi Model KNN pada Data Testing ===\n")
##
## === Evaluasi Model KNN pada Data Testing ===
print(evaluasi_knn_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 960 84
## 1 40 46
##
## Accuracy : 0.8903
## 95% CI : (0.8706, 0.9079)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : 0.3071676
##
## Kappa : 0.368
##
## Mcnemar's Test P-Value : 0.0001127
##
## Sensitivity : 0.9600
## Specificity : 0.3538
## Pos Pred Value : 0.9195
## Neg Pred Value : 0.5349
## Prevalence : 0.8850
## Detection Rate : 0.8496
## Detection Prevalence : 0.9239
## Balanced Accuracy : 0.6569
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0_knn_test))
## Recall kelas 0 (negatif): 0.9600
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0_knn_test))
## F1-score kelas 0 (negatif): 0.9393
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1_knn_test))
## Recall kelas 1 (positif): 0.3538
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1_knn_test))
## F1-score kelas 1 (positif): 0.4259
cat(sprintf("Akurasi: %.4f\n", akurasi_knn_test))
## Akurasi: 0.8903
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg_knn_test))
## Balanced Accuracy: 0.6569
cat(sprintf("F1-score Macro: %.4f\n", f1_macro_knn_test))
## F1-score Macro: 0.6826
# One-hot encoding (note: the " ~ ." formula also dummy-codes y, which is why y.0 and y.1 appear among the network inputs below)
dummies <- dummyVars(" ~ .", data = data_selected)
data_numeric <- data.frame(predict(dummies, newdata = data_selected))
# Add the label back
data_numeric$y <- data_selected$y
# Remove rows with NA/NaN/Inf
data_numeric <- data_numeric[complete.cases(data_numeric), ]
data_numeric <- data_numeric[!apply(data_numeric, 1, function(x) any(is.infinite(x))), ]
# Split the data into train and test sets
set.seed(123)
trainIndex <- createDataPartition(data_numeric$y, p = 0.75, list = FALSE)
trainData <- data_numeric[trainIndex, ]
testData <- data_numeric[-trainIndex, ]
# -----------------------------------------
# Artificial Neural Network (ANN) Model
# -----------------------------------------
# Train the Neural Network (ANN) model
model_ann <- nnet(y ~ ., data = trainData, size = 5, decay = 0.05, maxit = 40, linout = FALSE, trace = FALSE)
print(model_ann)
## a 43-5-1 network with 226 weights
## inputs: y.0 y.1 Cluster duration poutcome.1 poutcome.2 poutcome.3 poutcome.4 pdays month.1 month.2 month.3 month.4 month.5 month.6 month.7 month.8 month.9 month.10 month.11 month.12 contact.1 contact.2 contact.3 previous age_groupAdult age_groupMiddle.aged age_groupSenior age_groupYoung job.1 job.2 job.3 job.4 job.5 job.6 job.7 job.8 job.9 job.10 job.11 job.12 housing.0 housing.1
## output(s): y
## options were - entropy fitting decay=0.05
# Predict on the training and testing data
pred_ann_train <- predict(model_ann, trainData, type = "class")
pred_ann_test <- predict(model_ann, testData, type = "class")
# Align the factor levels of the predictions with the actual data
pred_ann_train <- factor(pred_ann_train, levels = levels_train)
pred_ann_test <- factor(pred_ann_test, levels = levels_train)
# === Evaluation on the Training Set ===
evaluasi_ann_train <- confusionMatrix(pred_ann_train, y_train)
tabel_konfusi_ann_train <- evaluasi_ann_train$table
recall_0_ann_train <- evaluasi_ann_train$byClass["Sensitivity"]
recall_1_ann_train <- evaluasi_ann_train$byClass["Specificity"]
f1_0_ann_train <- evaluasi_ann_train$byClass["F1"]
precision_1_ann_train <- tabel_konfusi_ann_train[2,2] / sum(tabel_konfusi_ann_train[2,])
f1_1_ann_train <- 2 * (precision_1_ann_train * recall_1_ann_train) / (precision_1_ann_train + recall_1_ann_train)
akurasi_ann_train <- evaluasi_ann_train$overall["Accuracy"]
akurasi_avg_ann_train <- (recall_0_ann_train + recall_1_ann_train) / 2
f1_macro_ann_train <- (f1_0_ann_train + f1_1_ann_train) / 2
cat("\n=== Evaluasi Model ANN pada Data Training ===\n")
##
## === Evaluasi Model ANN pada Data Training ===
print(evaluasi_ann_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2999 111
## 1 1 280
##
## Accuracy : 0.967
## 95% CI : (0.9604, 0.9727)
## No Information Rate : 0.8847
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8155
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9997
## Specificity : 0.7161
## Pos Pred Value : 0.9643
## Neg Pred Value : 0.9964
## Prevalence : 0.8847
## Detection Rate : 0.8844
## Detection Prevalence : 0.9171
## Balanced Accuracy : 0.8579
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0_ann_train))
## Recall kelas 0 (negatif): 0.9997
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0_ann_train))
## F1-score kelas 0 (negatif): 0.9817
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1_ann_train))
## Recall kelas 1 (positif): 0.7161
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1_ann_train))
## F1-score kelas 1 (positif): 0.8333
cat(sprintf("Akurasi: %.4f\n", akurasi_ann_train))
## Akurasi: 0.9670
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg_ann_train))
## Balanced Accuracy: 0.8579
cat(sprintf("F1-score Macro: %.4f\n", f1_macro_ann_train))
## F1-score Macro: 0.9075
# === Evaluation on the Testing Set ===
evaluasi_ann_test <- confusionMatrix(pred_ann_test, y_test)
tabel_konfusi_ann_test <- evaluasi_ann_test$table
recall_0_ann_test <- evaluasi_ann_test$byClass["Sensitivity"]
recall_1_ann_test <- evaluasi_ann_test$byClass["Specificity"]
f1_0_ann_test <- evaluasi_ann_test$byClass["F1"]
precision_1_ann_test <- tabel_konfusi_ann_test[2,2] / sum(tabel_konfusi_ann_test[2,])
f1_1_ann_test <- 2 * (precision_1_ann_test * recall_1_ann_test) / (precision_1_ann_test + recall_1_ann_test)
akurasi_ann_test <- evaluasi_ann_test$overall["Accuracy"]
akurasi_avg_ann_test <- (recall_0_ann_test + recall_1_ann_test) / 2
f1_macro_ann_test <- (f1_0_ann_test + f1_1_ann_test) / 2
cat("\n=== Evaluasi Model ANN pada Data Testing ===\n")
##
## === Evaluasi Model ANN pada Data Testing ===
print(evaluasi_ann_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 999 36
## 1 1 94
##
## Accuracy : 0.9673
## 95% CI : (0.9551, 0.9768)
## No Information Rate : 0.885
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8179
##
## Mcnemar's Test P-Value : 2.276e-08
##
## Sensitivity : 0.9990
## Specificity : 0.7231
## Pos Pred Value : 0.9652
## Neg Pred Value : 0.9895
## Prevalence : 0.8850
## Detection Rate : 0.8841
## Detection Prevalence : 0.9159
## Balanced Accuracy : 0.8610
##
## 'Positive' Class : 0
##
cat(sprintf("Recall kelas 0 (negatif): %.4f\n", recall_0_ann_test))
## Recall kelas 0 (negatif): 0.9990
cat(sprintf("F1-score kelas 0 (negatif): %.4f\n", f1_0_ann_test))
## F1-score kelas 0 (negatif): 0.9818
cat(sprintf("Recall kelas 1 (positif): %.4f\n", recall_1_ann_test))
## Recall kelas 1 (positif): 0.7231
cat(sprintf("F1-score kelas 1 (positif): %.4f\n", f1_1_ann_test))
## F1-score kelas 1 (positif): 0.8356
cat(sprintf("Akurasi: %.4f\n", akurasi_ann_test))
## Akurasi: 0.9673
cat(sprintf("Balanced Accuracy: %.4f\n", akurasi_avg_ann_test))
## Balanced Accuracy: 0.8610
cat(sprintf("F1-score Macro: %.4f\n", f1_macro_ann_test))
## F1-score Macro: 0.9087