library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
##
## Attaching package: 'e1071'
##
## The following object is masked from 'package:ggplot2':
##
## element
# LOAD DATA (PENTING: pakai ; )
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
library(caret)
library(ROSE)
# load data (WAJIB pakai ;)
data <- read.csv("C:/Users/USER/Downloads/data bank.csv", sep=";", stringsAsFactors=FALSE)
cat("=== DIMENSI DATA ===\n")
## === DIMENSI DATA ===
kable(data.frame(Keterangan=c("Baris","Kolom"), Nilai=dim(data)))
cat("=== NAMA KOLOM ===\n")
## === NAMA KOLOM ===
kable(data.frame(Kolom=names(data)))
| id |
| age |
| job |
| marital |
| education |
| default |
| housing |
| loan |
| contact |
| month |
| day_of_week |
| campaign |
| pdays |
| previous |
| poutcome |
| emp.var.rate |
| cons.price.idx |
| cons.conf.idx |
| euribor3m |
| nr.employed |
("=== CEK TARGET ===\n")
## [1] "=== CEK TARGET ===\n"
# karena tidak ada y → pakai loan
data$y <- data$loan
data$loan <- NULL # 🔥 INI YANG PENTING
cat("=== DISTRIBUSI TARGET AWAL ===\n")
## === DISTRIBUSI TARGET AWAL ===
kable(data.frame(Kategori=names(table(data$y)),
Jumlah=as.vector(table(data$y))))
| no |
10337 |
| unknown |
286 |
| yes |
1920 |
# ambil hanya yes/no
data <- data[data$y %in% c("yes","no"), ]
data$y <- as.factor(data$y)
cat("=== TARGET SETELAH CLEANING ===\n")
## === TARGET SETELAH CLEANING ===
kable(data.frame(Kategori=names(table(data$y)),
Jumlah=as.vector(table(data$y))))
# ubah unknown jadi NA (kecuali target)
for(col in names(data)){
if(col != "y" && is.character(data[[col]])){
data[[col]][data[[col]] == "unknown"] <- NA
}
}
na_table <- data.frame(
Variabel=names(data),
Jumlah_NA=colSums(is.na(data)),
Persen=round(colSums(is.na(data))/nrow(data)*100,2)
)
cat("=== DETEKSI MISSING VALUE ===\n")
## === DETEKSI MISSING VALUE ===
kable(na_table)
| id |
id |
0 |
0.00 |
| age |
age |
0 |
0.00 |
| job |
job |
86 |
0.70 |
| marital |
marital |
24 |
0.20 |
| education |
education |
511 |
4.17 |
| default |
default |
2574 |
21.00 |
| housing |
housing |
0 |
0.00 |
| contact |
contact |
0 |
0.00 |
| month |
month |
0 |
0.00 |
| day_of_week |
day_of_week |
0 |
0.00 |
| campaign |
campaign |
0 |
0.00 |
| pdays |
pdays |
0 |
0.00 |
| previous |
previous |
0 |
0.00 |
| poutcome |
poutcome |
0 |
0.00 |
| emp.var.rate |
emp.var.rate |
0 |
0.00 |
| cons.price.idx |
cons.price.idx |
0 |
0.00 |
| cons.conf.idx |
cons.conf.idx |
0 |
0.00 |
| euribor3m |
euribor3m |
0 |
0.00 |
| nr.employed |
nr.employed |
0 |
0.00 |
| y |
y |
0 |
0.00 |
mode_func <- function(x){
ux <- na.omit(unique(x))
ux[which.max(tabulate(match(x, ux)))]
}
for(col in names(data)){
if(col != "y"){
if(is.character(data[[col]])){
data[[col]][is.na(data[[col]])] <- mode_func(data[[col]])
} else {
data[[col]][is.na(data[[col]])] <- median(data[[col]], na.rm=TRUE)
}
}
}
cat("=== CEK SETELAH IMPUTASI ===\n")
## === CEK SETELAH IMPUTASI ===
kable(data.frame(Variabel=names(data),
Sisa_NA=colSums(is.na(data))))
| id |
id |
0 |
| age |
age |
0 |
| job |
job |
0 |
| marital |
marital |
0 |
| education |
education |
0 |
| default |
default |
0 |
| housing |
housing |
0 |
| contact |
contact |
0 |
| month |
month |
0 |
| day_of_week |
day_of_week |
0 |
| campaign |
campaign |
0 |
| pdays |
pdays |
0 |
| previous |
previous |
0 |
| poutcome |
poutcome |
0 |
| emp.var.rate |
emp.var.rate |
0 |
| cons.price.idx |
cons.price.idx |
0 |
| cons.conf.idx |
cons.conf.idx |
0 |
| euribor3m |
euribor3m |
0 |
| nr.employed |
nr.employed |
0 |
| y |
y |
0 |
tidak ada missing value
data <- unique(data)
cat("=== JUMLAH DATA ===\n")
## === JUMLAH DATA ===
kable(data.frame(Jumlah=nrow(data)))
data[] <- lapply(data, function(x){
if(is.character(x)) as.factor(x) else x
})
data$y <- as.factor(data$y)
cat("=== STRUKTUR DATA ===\n")
## === STRUKTUR DATA ===
kable(data.frame(
Variabel=names(data),
Tipe=sapply(data,class)
))
| id |
id |
integer |
| age |
age |
integer |
| job |
job |
factor |
| marital |
marital |
factor |
| education |
education |
factor |
| default |
default |
factor |
| housing |
housing |
factor |
| contact |
contact |
factor |
| month |
month |
factor |
| day_of_week |
day_of_week |
factor |
| campaign |
campaign |
integer |
| pdays |
pdays |
integer |
| previous |
previous |
integer |
| poutcome |
poutcome |
factor |
| emp.var.rate |
emp.var.rate |
numeric |
| cons.price.idx |
cons.price.idx |
numeric |
| cons.conf.idx |
cons.conf.idx |
numeric |
| euribor3m |
euribor3m |
numeric |
| nr.employed |
nr.employed |
numeric |
| y |
y |
factor |
cat("=== DISTRIBUSI TARGET FINAL ===\n")
## === DISTRIBUSI TARGET FINAL ===
kable(data.frame(Kategori=names(table(data$y)),
Jumlah=as.vector(table(data$y))))
if(length(unique(data$y)) < 2){
stop("ERROR: Target tidak valid")
}
set.seed(123)
trainIndex <- createDataPartition(data$y, p=0.8, list=FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
cat("=== SPLIT DATA ===\n")
## === SPLIT DATA ===
kable(data.frame(
Dataset=c("Train","Test"),
Jumlah=c(nrow(train), nrow(test))
))
cat("=== SEBELUM BALANCING ===\n")
## === SEBELUM BALANCING ===
kable(data.frame(Kategori=names(table(train$y)),
Jumlah=as.vector(table(train$y))))
train_balanced <- ovun.sample(y ~ ., data=train, method="over")$data
cat("=== SETELAH BALANCING ===\n")
## === SETELAH BALANCING ===
kable(data.frame(Kategori=names(table(train_balanced$y)),
Jumlah=as.vector(table(train_balanced$y))))
model <- glm(y ~ ., data=train_balanced, family=binomial)
summary(model)
##
## Call:
## glm(formula = y ~ ., family = binomial, data = train_balanced)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.403e+01 4.160e+01 -0.337 0.735902
## id -1.129e-07 1.320e-06 -0.085 0.931880
## age 1.672e-03 1.920e-03 0.871 0.383749
## jobblue-collar -6.025e-02 5.651e-02 -1.066 0.286362
## jobentrepreneur -2.389e-01 8.934e-02 -2.674 0.007499 **
## jobhousemaid -1.973e-01 1.058e-01 -1.864 0.062322 .
## jobmanagement -9.471e-02 6.985e-02 -1.356 0.175162
## jobretired -2.720e-01 9.776e-02 -2.783 0.005391 **
## jobself-employed -2.164e-01 9.475e-02 -2.284 0.022386 *
## jobservices -1.087e-01 6.253e-02 -1.738 0.082152 .
## jobstudent -2.717e-02 1.179e-01 -0.230 0.817721
## jobtechnician -2.290e-01 5.606e-02 -4.084 4.43e-05 ***
## jobunemployed -1.977e-01 1.084e-01 -1.824 0.068126 .
## maritalmarried 2.100e-02 5.126e-02 0.410 0.682064
## maritalsingle -1.644e-03 5.895e-02 -0.028 0.977756
## educationbasic.6y -1.906e-01 8.744e-02 -2.180 0.029238 *
## educationbasic.9y -1.223e-02 6.649e-02 -0.184 0.854088
## educationhigh.school -5.373e-04 6.831e-02 -0.008 0.993724
## educationilliterate -1.140e+01 1.115e+02 -0.102 0.918606
## educationprofessional.course 1.906e-01 7.560e-02 2.521 0.011694 *
## educationuniversity.degree -1.647e-02 6.692e-02 -0.246 0.805539
## defaultyes -1.130e+01 1.970e+02 -0.057 0.954233
## housingyes 2.386e-01 3.182e-02 7.498 6.47e-14 ***
## contacttelephone -8.899e-02 6.249e-02 -1.424 0.154448
## monthaug -3.499e-02 1.507e-01 -0.232 0.816361
## monthdec 1.189e-01 2.546e-01 0.467 0.640470
## monthjul 2.246e-02 8.708e-02 0.258 0.796433
## monthjun -1.049e-01 1.503e-01 -0.698 0.485225
## monthmar 8.956e-02 1.792e-01 0.500 0.617301
## monthmay -5.153e-02 8.371e-02 -0.616 0.538169
## monthnov -1.972e-01 1.063e-01 -1.856 0.063462 .
## monthoct -7.174e-01 1.852e-01 -3.874 0.000107 ***
## monthsep 9.552e-02 2.057e-01 0.464 0.642369
## day_of_weekmon -1.117e-01 4.995e-02 -2.237 0.025277 *
## day_of_weekthu -2.288e-02 4.941e-02 -0.463 0.643307
## day_of_weektue -8.149e-02 5.070e-02 -1.607 0.107977
## day_of_weekwed -7.406e-02 5.067e-02 -1.462 0.143795
## campaign 4.953e-03 5.589e-03 0.886 0.375518
## pdays -4.861e-04 2.996e-04 -1.623 0.104694
## previous -4.162e-01 8.734e-02 -4.765 1.89e-06 ***
## poutcomenonexistent -5.031e-01 1.111e-01 -4.527 5.99e-06 ***
## poutcomesuccess -3.810e-01 2.941e-01 -1.296 0.195103
## emp.var.rate -2.159e-01 1.692e-01 -1.276 0.201949
## cons.price.idx 2.293e-01 2.778e-01 0.825 0.409173
## cons.conf.idx -1.996e-03 9.031e-03 -0.221 0.825043
## euribor3m 2.353e-01 1.254e-01 1.876 0.060694 .
## nr.employed -1.421e-03 3.272e-03 -0.434 0.664006
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 22833 on 16470 degrees of freedom
## Residual deviance: 22652 on 16424 degrees of freedom
## AIC: 22746
##
## Number of Fisher Scoring iterations: 10
pred <- predict(model, test, type="response")
kable(head(data.frame(Probabilitas=pred)))
| 19 |
0.4993936 |
| 26 |
0.5189362 |
| 31 |
0.5587478 |
| 37 |
0.4701323 |
| 38 |
0.4522836 |
| 41 |
0.5051517 |
pred_class <- ifelse(pred > 0.5, "yes", "no")
pred_class <- as.factor(pred_class)
kable(data.frame(Kategori=names(table(pred_class)),
Jumlah=as.vector(table(pred_class))))
cat("=== MODEL TANPA BALANCING ===\n")
## === MODEL TANPA BALANCING ===
model_awal <- glm(y ~ ., data=train, family=binomial)
pred_awal_prob <- predict(model_awal, test, type="response")
pred_awal <- ifelse(pred_awal_prob > 0.5, "yes", "no")
pred_awal <- as.factor(pred_awal)
library(caret)
cm_awal <- confusionMatrix(pred_awal, test$y)
## Warning in confusionMatrix.default(pred_awal, test$y): Levels are not in the
## same order for reference and data. Refactoring data to match.
print(cm_awal)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2067 384
## yes 0 0
##
## Accuracy : 0.8433
## 95% CI : (0.8283, 0.8575)
## No Information Rate : 0.8433
## P-Value [Acc > NIR] : 0.5136
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8433
## Neg Pred Value : NaN
## Prevalence : 0.8433
## Detection Rate : 0.8433
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : no
##
cat("=== MODEL DENGAN BALANCING ===\n")
## === MODEL DENGAN BALANCING ===
model_akhir <- glm(y ~ ., data=train_balanced, family=binomial)
pred_akhir_prob <- predict(model_akhir, test, type="response")
pred_akhir <- ifelse(pred_akhir_prob > 0.5, "yes", "no")
pred_akhir <- as.factor(pred_akhir)
cm_akhir <- confusionMatrix(pred_akhir, test$y)
print(cm_akhir)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1101 197
## yes 966 187
##
## Accuracy : 0.5255
## 95% CI : (0.5055, 0.5454)
## No Information Rate : 0.8433
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0108
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5327
## Specificity : 0.4870
## Pos Pred Value : 0.8482
## Neg Pred Value : 0.1622
## Prevalence : 0.8433
## Detection Rate : 0.4492
## Detection Prevalence : 0.5296
## Balanced Accuracy : 0.5098
##
## 'Positive' Class : no
##
cat("=== METRIK MODEL ===\n")
## === METRIK MODEL ===
# ambil metrik dari confusion matrix
get_metrics <- function(cm){
accuracy <- cm$overall["Accuracy"]
precision <- cm$byClass["Precision"]
recall <- cm$byClass["Recall"]
f1 <- cm$byClass["F1"]
return(c(Accuracy=accuracy,
Precision=precision,
Recall=recall,
F1_Score=f1))
}
metrik_awal <- get_metrics(cm_awal)
metrik_akhir <- get_metrics(cm_akhir)
hasil_metrik <- data.frame(
Metrik = names(metrik_awal),
Tanpa_Preprocessing = as.numeric(metrik_awal),
Dengan_Preprocessing = as.numeric(metrik_akhir)
)
library(knitr)
kable(hasil_metrik)
| Accuracy.Accuracy |
0.8433293 |
0.5254998 |
| Precision.Precision |
0.8433293 |
0.8482280 |
| Recall.Recall |
1.0000000 |
0.5326560 |
| F1_Score.F1 |
0.9150066 |
0.6543834 |