library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.4.3
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.4.3
## Loaded ROSE 0.0-4
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element
# LOAD DATA (PENTING: pakai ; )
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.3
library(caret)
library(ROSE)

# load data (WAJIB pakai ;)
data <- read.csv("C:/Users/USER/Downloads/data bank.csv", sep=";", stringsAsFactors=FALSE)

cat("=== DIMENSI DATA ===\n")
## === DIMENSI DATA ===
kable(data.frame(Keterangan=c("Baris","Kolom"), Nilai=dim(data)))
Keterangan Nilai
Baris 12543
Kolom 20
cat("=== NAMA KOLOM ===\n")
## === NAMA KOLOM ===
kable(data.frame(Kolom=names(data)))
Kolom
id
age
job
marital
education
default
housing
loan
contact
month
day_of_week
campaign
pdays
previous
poutcome
emp.var.rate
cons.price.idx
cons.conf.idx
euribor3m
nr.employed
("=== CEK TARGET ===\n")
## [1] "=== CEK TARGET ===\n"
# karena tidak ada y → pakai loan
data$y <- data$loan
data$loan <- NULL   # 🔥 INI YANG PENTING

cat("=== DISTRIBUSI TARGET AWAL ===\n")
## === DISTRIBUSI TARGET AWAL ===
kable(data.frame(Kategori=names(table(data$y)),
                 Jumlah=as.vector(table(data$y))))
Kategori Jumlah
no 10337
unknown 286
yes 1920
# ambil hanya yes/no
data <- data[data$y %in% c("yes","no"), ]

data$y <- as.factor(data$y)

cat("=== TARGET SETELAH CLEANING ===\n")
## === TARGET SETELAH CLEANING ===
kable(data.frame(Kategori=names(table(data$y)),
                 Jumlah=as.vector(table(data$y))))
Kategori Jumlah
no 10337
yes 1920
# ubah unknown jadi NA (kecuali target)
for(col in names(data)){
  if(col != "y" && is.character(data[[col]])){
    data[[col]][data[[col]] == "unknown"] <- NA
  }
}

na_table <- data.frame(
  Variabel=names(data),
  Jumlah_NA=colSums(is.na(data)),
  Persen=round(colSums(is.na(data))/nrow(data)*100,2)
)

cat("=== DETEKSI MISSING VALUE ===\n")
## === DETEKSI MISSING VALUE ===
kable(na_table)
Variabel Jumlah_NA Persen
id id 0 0.00
age age 0 0.00
job job 86 0.70
marital marital 24 0.20
education education 511 4.17
default default 2574 21.00
housing housing 0 0.00
contact contact 0 0.00
month month 0 0.00
day_of_week day_of_week 0 0.00
campaign campaign 0 0.00
pdays pdays 0 0.00
previous previous 0 0.00
poutcome poutcome 0 0.00
emp.var.rate emp.var.rate 0 0.00
cons.price.idx cons.price.idx 0 0.00
cons.conf.idx cons.conf.idx 0 0.00
euribor3m euribor3m 0 0.00
nr.employed nr.employed 0 0.00
y y 0 0.00
mode_func <- function(x){
  ux <- na.omit(unique(x))
  ux[which.max(tabulate(match(x, ux)))]
}

for(col in names(data)){
  if(col != "y"){
    if(is.character(data[[col]])){
      data[[col]][is.na(data[[col]])] <- mode_func(data[[col]])
    } else {
      data[[col]][is.na(data[[col]])] <- median(data[[col]], na.rm=TRUE)
    }
  }
}

cat("=== CEK SETELAH IMPUTASI ===\n")
## === CEK SETELAH IMPUTASI ===
kable(data.frame(Variabel=names(data),
                 Sisa_NA=colSums(is.na(data))))
Variabel Sisa_NA
id id 0
age age 0
job job 0
marital marital 0
education education 0
default default 0
housing housing 0
contact contact 0
month month 0
day_of_week day_of_week 0
campaign campaign 0
pdays pdays 0
previous previous 0
poutcome poutcome 0
emp.var.rate emp.var.rate 0
cons.price.idx cons.price.idx 0
cons.conf.idx cons.conf.idx 0
euribor3m euribor3m 0
nr.employed nr.employed 0
y y 0

tidak ada missing value

data <- unique(data)

cat("=== JUMLAH DATA ===\n")
## === JUMLAH DATA ===
kable(data.frame(Jumlah=nrow(data)))
Jumlah
12257
data[] <- lapply(data, function(x){
  if(is.character(x)) as.factor(x) else x
})

data$y <- as.factor(data$y)

cat("=== STRUKTUR DATA ===\n")
## === STRUKTUR DATA ===
kable(data.frame(
  Variabel=names(data),
  Tipe=sapply(data,class)
))
Variabel Tipe
id id integer
age age integer
job job factor
marital marital factor
education education factor
default default factor
housing housing factor
contact contact factor
month month factor
day_of_week day_of_week factor
campaign campaign integer
pdays pdays integer
previous previous integer
poutcome poutcome factor
emp.var.rate emp.var.rate numeric
cons.price.idx cons.price.idx numeric
cons.conf.idx cons.conf.idx numeric
euribor3m euribor3m numeric
nr.employed nr.employed numeric
y y factor
cat("=== DISTRIBUSI TARGET FINAL ===\n")
## === DISTRIBUSI TARGET FINAL ===
kable(data.frame(Kategori=names(table(data$y)),
                 Jumlah=as.vector(table(data$y))))
Kategori Jumlah
no 10337
yes 1920
if(length(unique(data$y)) < 2){
  stop("ERROR: Target tidak valid")
}
set.seed(123)

trainIndex <- createDataPartition(data$y, p=0.8, list=FALSE)

train <- data[trainIndex, ]
test <- data[-trainIndex, ]

cat("=== SPLIT DATA ===\n")
## === SPLIT DATA ===
kable(data.frame(
  Dataset=c("Train","Test"),
  Jumlah=c(nrow(train), nrow(test))
))
Dataset Jumlah
Train 9806
Test 2451
cat("=== SEBELUM BALANCING ===\n")
## === SEBELUM BALANCING ===
kable(data.frame(Kategori=names(table(train$y)),
                 Jumlah=as.vector(table(train$y))))
Kategori Jumlah
no 8270
yes 1536
train_balanced <- ovun.sample(y ~ ., data=train, method="over")$data

cat("=== SETELAH BALANCING ===\n")
## === SETELAH BALANCING ===
kable(data.frame(Kategori=names(table(train_balanced$y)),
                 Jumlah=as.vector(table(train_balanced$y))))
Kategori Jumlah
no 8270
yes 8201
model <- glm(y ~ ., data=train_balanced, family=binomial)

summary(model)
## 
## Call:
## glm(formula = y ~ ., family = binomial, data = train_balanced)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.403e+01  4.160e+01  -0.337 0.735902    
## id                           -1.129e-07  1.320e-06  -0.085 0.931880    
## age                           1.672e-03  1.920e-03   0.871 0.383749    
## jobblue-collar               -6.025e-02  5.651e-02  -1.066 0.286362    
## jobentrepreneur              -2.389e-01  8.934e-02  -2.674 0.007499 ** 
## jobhousemaid                 -1.973e-01  1.058e-01  -1.864 0.062322 .  
## jobmanagement                -9.471e-02  6.985e-02  -1.356 0.175162    
## jobretired                   -2.720e-01  9.776e-02  -2.783 0.005391 ** 
## jobself-employed             -2.164e-01  9.475e-02  -2.284 0.022386 *  
## jobservices                  -1.087e-01  6.253e-02  -1.738 0.082152 .  
## jobstudent                   -2.717e-02  1.179e-01  -0.230 0.817721    
## jobtechnician                -2.290e-01  5.606e-02  -4.084 4.43e-05 ***
## jobunemployed                -1.977e-01  1.084e-01  -1.824 0.068126 .  
## maritalmarried                2.100e-02  5.126e-02   0.410 0.682064    
## maritalsingle                -1.644e-03  5.895e-02  -0.028 0.977756    
## educationbasic.6y            -1.906e-01  8.744e-02  -2.180 0.029238 *  
## educationbasic.9y            -1.223e-02  6.649e-02  -0.184 0.854088    
## educationhigh.school         -5.373e-04  6.831e-02  -0.008 0.993724    
## educationilliterate          -1.140e+01  1.115e+02  -0.102 0.918606    
## educationprofessional.course  1.906e-01  7.560e-02   2.521 0.011694 *  
## educationuniversity.degree   -1.647e-02  6.692e-02  -0.246 0.805539    
## defaultyes                   -1.130e+01  1.970e+02  -0.057 0.954233    
## housingyes                    2.386e-01  3.182e-02   7.498 6.47e-14 ***
## contacttelephone             -8.899e-02  6.249e-02  -1.424 0.154448    
## monthaug                     -3.499e-02  1.507e-01  -0.232 0.816361    
## monthdec                      1.189e-01  2.546e-01   0.467 0.640470    
## monthjul                      2.246e-02  8.708e-02   0.258 0.796433    
## monthjun                     -1.049e-01  1.503e-01  -0.698 0.485225    
## monthmar                      8.956e-02  1.792e-01   0.500 0.617301    
## monthmay                     -5.153e-02  8.371e-02  -0.616 0.538169    
## monthnov                     -1.972e-01  1.063e-01  -1.856 0.063462 .  
## monthoct                     -7.174e-01  1.852e-01  -3.874 0.000107 ***
## monthsep                      9.552e-02  2.057e-01   0.464 0.642369    
## day_of_weekmon               -1.117e-01  4.995e-02  -2.237 0.025277 *  
## day_of_weekthu               -2.288e-02  4.941e-02  -0.463 0.643307    
## day_of_weektue               -8.149e-02  5.070e-02  -1.607 0.107977    
## day_of_weekwed               -7.406e-02  5.067e-02  -1.462 0.143795    
## campaign                      4.953e-03  5.589e-03   0.886 0.375518    
## pdays                        -4.861e-04  2.996e-04  -1.623 0.104694    
## previous                     -4.162e-01  8.734e-02  -4.765 1.89e-06 ***
## poutcomenonexistent          -5.031e-01  1.111e-01  -4.527 5.99e-06 ***
## poutcomesuccess              -3.810e-01  2.941e-01  -1.296 0.195103    
## emp.var.rate                 -2.159e-01  1.692e-01  -1.276 0.201949    
## cons.price.idx                2.293e-01  2.778e-01   0.825 0.409173    
## cons.conf.idx                -1.996e-03  9.031e-03  -0.221 0.825043    
## euribor3m                     2.353e-01  1.254e-01   1.876 0.060694 .  
## nr.employed                  -1.421e-03  3.272e-03  -0.434 0.664006    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22833  on 16470  degrees of freedom
## Residual deviance: 22652  on 16424  degrees of freedom
## AIC: 22746
## 
## Number of Fisher Scoring iterations: 10
pred <- predict(model, test, type="response")

kable(head(data.frame(Probabilitas=pred)))
Probabilitas
19 0.4993936
26 0.5189362
31 0.5587478
37 0.4701323
38 0.4522836
41 0.5051517
pred_class <- ifelse(pred > 0.5, "yes", "no")
pred_class <- as.factor(pred_class)

kable(data.frame(Kategori=names(table(pred_class)),
                 Jumlah=as.vector(table(pred_class))))
Kategori Jumlah
no 1298
yes 1153
cat("=== MODEL TANPA BALANCING ===\n")
## === MODEL TANPA BALANCING ===
model_awal <- glm(y ~ ., data=train, family=binomial)

pred_awal_prob <- predict(model_awal, test, type="response")
pred_awal <- ifelse(pred_awal_prob > 0.5, "yes", "no")
pred_awal <- as.factor(pred_awal)

library(caret)
cm_awal <- confusionMatrix(pred_awal, test$y)
## Warning in confusionMatrix.default(pred_awal, test$y): Levels are not in the
## same order for reference and data. Refactoring data to match.
print(cm_awal)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2067  384
##        yes    0    0
##                                           
##                Accuracy : 0.8433          
##                  95% CI : (0.8283, 0.8575)
##     No Information Rate : 0.8433          
##     P-Value [Acc > NIR] : 0.5136          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8433          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8433          
##          Detection Rate : 0.8433          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : no              
## 
cat("=== MODEL DENGAN BALANCING ===\n")
## === MODEL DENGAN BALANCING ===
model_akhir <- glm(y ~ ., data=train_balanced, family=binomial)

pred_akhir_prob <- predict(model_akhir, test, type="response")
pred_akhir <- ifelse(pred_akhir_prob > 0.5, "yes", "no")
pred_akhir <- as.factor(pred_akhir)

cm_akhir <- confusionMatrix(pred_akhir, test$y)

print(cm_akhir)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1101  197
##        yes  966  187
##                                           
##                Accuracy : 0.5255          
##                  95% CI : (0.5055, 0.5454)
##     No Information Rate : 0.8433          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0108          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5327          
##             Specificity : 0.4870          
##          Pos Pred Value : 0.8482          
##          Neg Pred Value : 0.1622          
##              Prevalence : 0.8433          
##          Detection Rate : 0.4492          
##    Detection Prevalence : 0.5296          
##       Balanced Accuracy : 0.5098          
##                                           
##        'Positive' Class : no              
## 
cat("=== METRIK MODEL ===\n")
## === METRIK MODEL ===
# ambil metrik dari confusion matrix
get_metrics <- function(cm){
  accuracy <- cm$overall["Accuracy"]
  precision <- cm$byClass["Precision"]
  recall <- cm$byClass["Recall"]
  f1 <- cm$byClass["F1"]
  
  return(c(Accuracy=accuracy,
           Precision=precision,
           Recall=recall,
           F1_Score=f1))
}

metrik_awal <- get_metrics(cm_awal)
metrik_akhir <- get_metrics(cm_akhir)

hasil_metrik <- data.frame(
  Metrik = names(metrik_awal),
  Tanpa_Preprocessing = as.numeric(metrik_awal),
  Dengan_Preprocessing = as.numeric(metrik_akhir)
)

library(knitr)
kable(hasil_metrik)
Metrik Tanpa_Preprocessing Dengan_Preprocessing
Accuracy.Accuracy 0.8433293 0.5254998
Precision.Precision 0.8433293 0.8482280
Recall.Recall 1.0000000 0.5326560
F1_Score.F1 0.9150066 0.6543834