codingan ads

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'tidyr' was built under R version 4.4.3

## Warning: package 'readr' was built under R version 4.4.3

## Warning: package 'purrr' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

## Warning: package 'stringr' was built under R version 4.4.3

## Warning: package 'forcats' was built under R version 4.4.3

## Warning: package 'lubridate' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.4.3

## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(ROSE)

## Warning: package 'ROSE' was built under R version 4.4.3

## Loaded ROSE 0.0-4

library(e1071)

## Warning: package 'e1071' was built under R version 4.4.3

## 
## Attaching package: 'e1071'
## 
## The following object is masked from 'package:ggplot2':
## 
##     element

# LOAD DATA (PENTING: pakai ; )
library(knitr)

## Warning: package 'knitr' was built under R version 4.4.3

library(caret)
library(ROSE)

# Load the data. Passing sep = ";" is mandatory here: the file is read as
# semicolon-separated, and text columns are kept as character vectors.
data <- read.csv(
  file = "C:/Users/USER/Downloads/data bank.csv",
  sep = ";",
  stringsAsFactors = FALSE
)

cat("=== DIMENSI DATA ===\n")

## === DIMENSI DATA ===

kable(data.frame(Keterangan=c("Baris","Kolom"), Nilai=dim(data)))

Keterangan	Nilai
Baris	12543
Kolom	20

cat("=== NAMA KOLOM ===\n")

## === NAMA KOLOM ===

kable(data.frame(Kolom=names(data)))

Kolom
id
age
job
marital
education
default
housing
loan
contact
month
day_of_week
campaign
pdays
previous
poutcome
emp.var.rate
cons.price.idx
cons.conf.idx
euribor3m
nr.employed

# Section header. Printed with cat() like every other header in this script,
# so the trailing "\n" is rendered as a newline instead of the string being
# auto-printed with an index and quotes.
cat("=== CEK TARGET ===\n")

## === CEK TARGET ===

# The data has no `y` column, so `loan` is used as the target instead.
# Important: the original `loan` column is removed afterwards, so it does not
# remain in the data as a copy of the target.
data[["y"]] <- data[["loan"]]
data[["loan"]] <- NULL

cat("=== DISTRIBUSI TARGET AWAL ===\n")

## === DISTRIBUSI TARGET AWAL ===

kable(data.frame(Kategori=names(table(data$y)),
                 Jumlah=as.vector(table(data$y))))

Kategori	Jumlah
no	10337
unknown	286
yes	1920

# Keep only the rows whose target is "yes" or "no" (this drops "unknown").
data <- data[is.element(data$y, c("yes", "no")), ]

# Encode the target as a factor.
data$y <- as.factor(data$y)

cat("=== TARGET SETELAH CLEANING ===\n")

## === TARGET SETELAH CLEANING ===

kable(data.frame(Kategori=names(table(data$y)),
                 Jumlah=as.vector(table(data$y))))

Kategori	Jumlah
no	10337
yes	1920

# Recode the literal string "unknown" as NA in every character column.
# The target column `y` is skipped explicitly.
for (col in names(data)) {
  if (col == "y" || !is.character(data[[col]])) next
  is.na(data[[col]]) <- which(data[[col]] == "unknown")
}

# Per-column summary of missing values: absolute count and percentage of rows
# (rounded to two decimals).
na_counts <- colSums(is.na(data))
na_table <- data.frame(
  Variabel = names(data),
  Jumlah_NA = na_counts,
  Persen = round(na_counts / nrow(data) * 100, 2)
)

cat("=== DETEKSI MISSING VALUE ===\n")

## === DETEKSI MISSING VALUE ===

kable(na_table)

	Variabel	Jumlah_NA	Persen
id	id	0	0.00
age	age	0	0.00
job	job	86	0.70
marital	marital	24	0.20
education	education	511	4.17
default	default	2574	21.00
housing	housing	0	0.00
contact	contact	0	0.00
month	month	0	0.00
day_of_week	day_of_week	0	0.00
campaign	campaign	0	0.00
pdays	pdays	0	0.00
previous	previous	0	0.00
poutcome	poutcome	0	0.00
emp.var.rate	emp.var.rate	0	0.00
cons.price.idx	cons.price.idx	0	0.00
cons.conf.idx	cons.conf.idx	0	0.00
euribor3m	euribor3m	0	0.00
nr.employed	nr.employed	0	0.00
y	y	0	0.00

# Return the most frequent non-missing value of `x`.
#
# Args:
#   x: a vector, possibly containing NA.
#
# Returns:
#   The value with the highest count among the non-NA elements. Ties go to
#   the value that appears first in `x`. NA is returned when `x` has no
#   non-missing element.
mode_func <- function(x) {
  candidates <- unique(x[!is.na(x)])
  # match() yields NA for missing elements and tabulate() ignores those, so
  # only non-missing values are counted.
  counts <- tabulate(match(x, candidates))
  candidates[which.max(counts)]
}

# Fill missing values in every column except the target `y`:
#   - character columns get their most frequent value (mode_func),
#   - all other columns get their median.
# NOTE(review): the fill values are computed from all rows, and this runs
# before the train/test split further down — confirm that using test rows
# for imputation is acceptable here.
for (col in names(data)) {
  if (col == "y") next
  fill_value <- if (is.character(data[[col]])) {
    mode_func(data[[col]])
  } else {
    median(data[[col]], na.rm = TRUE)
  }
  data[[col]][is.na(data[[col]])] <- fill_value
}

cat("=== CEK SETELAH IMPUTASI ===\n")

## === CEK SETELAH IMPUTASI ===

kable(data.frame(Variabel=names(data),
                 Sisa_NA=colSums(is.na(data))))

	Variabel	Sisa_NA
id	id	0
age	age	0
job	job	0
marital	marital	0
education	education	0
default	default	0
housing	housing	0
contact	contact	0
month	month	0
day_of_week	day_of_week	0
campaign	campaign	0
pdays	pdays	0
previous	previous	0
poutcome	poutcome	0
emp.var.rate	emp.var.rate	0
cons.price.idx	cons.price.idx	0
cons.conf.idx	cons.conf.idx	0
euribor3m	euribor3m	0
nr.employed	nr.employed	0
y	y	0

Setelah imputasi, tidak ada lagi missing value pada seluruh variabel.

# Drop rows that are exact duplicates of an earlier row (all columns equal).
# NOTE(review): the comparison includes the `id` column; presumably `id` is
# unique per row, in which case nothing is ever removed here — verify.
data <- data[!duplicated(data), , drop = FALSE]

cat("=== JUMLAH DATA ===\n")

## === JUMLAH DATA ===

kable(data.frame(Jumlah=nrow(data)))

Jumlah
12257

# Convert every character column to a factor; other columns are left as they
# are.
for (nm in names(data)) {
  if (is.character(data[[nm]])) {
    data[[nm]] <- as.factor(data[[nm]])
  }
}

# Make sure the target is a factor as well.
data$y <- as.factor(data$y)

cat("=== STRUKTUR DATA ===\n")

## === STRUKTUR DATA ===

kable(data.frame(
  Variabel=names(data),
  Tipe=sapply(data,class)
))

	Variabel	Tipe
id	id	integer
age	age	integer
job	job	factor
marital	marital	factor
education	education	factor
default	default	factor
housing	housing	factor
contact	contact	factor
month	month	factor
day_of_week	day_of_week	factor
campaign	campaign	integer
pdays	pdays	integer
previous	previous	integer
poutcome	poutcome	factor
emp.var.rate	emp.var.rate	numeric
cons.price.idx	cons.price.idx	numeric
cons.conf.idx	cons.conf.idx	numeric
euribor3m	euribor3m	numeric
nr.employed	nr.employed	numeric
y	y	factor

cat("=== DISTRIBUSI TARGET FINAL ===\n")

## === DISTRIBUSI TARGET FINAL ===

kable(data.frame(Kategori=names(table(data$y)),
                 Jumlah=as.vector(table(data$y))))

Kategori	Jumlah
no	10337
yes	1920

# Abort when the target has fewer than two distinct values.
n_target_values <- length(unique(data$y))
if (n_target_values < 2) {
  stop("ERROR: Target tidak valid")
}

# Fix the RNG seed so the split can be reproduced.
set.seed(123)

# Draw the training row indices with caret's createDataPartition() on the
# target (p = 0.8); list = FALSE returns the indices as a matrix rather than
# a list.
trainIndex <- createDataPartition(y = data$y, p = 0.8, list = FALSE)

# Rows not selected for training form the test set.
test <- data[-trainIndex, ]
train <- data[trainIndex, ]

cat("=== SPLIT DATA ===\n")

## === SPLIT DATA ===

kable(data.frame(
  Dataset=c("Train","Test"),
  Jumlah=c(nrow(train), nrow(test))
))

Dataset	Jumlah
Train	9806
Test	2451

cat("=== SEBELUM BALANCING ===\n")

## === SEBELUM BALANCING ===

kable(data.frame(Kategori=names(table(train$y)),
                 Jumlah=as.vector(table(train$y))))

Kategori	Jumlah
no	8270
yes	1536

# Oversample the training set (method = "over") and keep only the resulting
# data frame. Only the training rows are resampled; the test set is untouched.
train_balanced <- ovun.sample(
  formula = y ~ .,
  data = train,
  method = "over"
)[["data"]]

cat("=== SETELAH BALANCING ===\n")

## === SETELAH BALANCING ===

kable(data.frame(Kategori=names(table(train_balanced$y)),
                 Jumlah=as.vector(table(train_balanced$y))))

Kategori	Jumlah
no	8270
yes	8201

# Logistic regression on the oversampled training set; `y ~ .` uses every
# other column as a predictor.
# NOTE(review): that includes the `id` column — confirm it is meant to be a
# predictor rather than a row identifier.
model <- glm(formula = y ~ ., family = binomial, data = train_balanced)

summary(model)

## 
## Call:
## glm(formula = y ~ ., family = binomial, data = train_balanced)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.403e+01  4.160e+01  -0.337 0.735902    
## id                           -1.129e-07  1.320e-06  -0.085 0.931880    
## age                           1.672e-03  1.920e-03   0.871 0.383749    
## jobblue-collar               -6.025e-02  5.651e-02  -1.066 0.286362    
## jobentrepreneur              -2.389e-01  8.934e-02  -2.674 0.007499 ** 
## jobhousemaid                 -1.973e-01  1.058e-01  -1.864 0.062322 .  
## jobmanagement                -9.471e-02  6.985e-02  -1.356 0.175162    
## jobretired                   -2.720e-01  9.776e-02  -2.783 0.005391 ** 
## jobself-employed             -2.164e-01  9.475e-02  -2.284 0.022386 *  
## jobservices                  -1.087e-01  6.253e-02  -1.738 0.082152 .  
## jobstudent                   -2.717e-02  1.179e-01  -0.230 0.817721    
## jobtechnician                -2.290e-01  5.606e-02  -4.084 4.43e-05 ***
## jobunemployed                -1.977e-01  1.084e-01  -1.824 0.068126 .  
## maritalmarried                2.100e-02  5.126e-02   0.410 0.682064    
## maritalsingle                -1.644e-03  5.895e-02  -0.028 0.977756    
## educationbasic.6y            -1.906e-01  8.744e-02  -2.180 0.029238 *  
## educationbasic.9y            -1.223e-02  6.649e-02  -0.184 0.854088    
## educationhigh.school         -5.373e-04  6.831e-02  -0.008 0.993724    
## educationilliterate          -1.140e+01  1.115e+02  -0.102 0.918606    
## educationprofessional.course  1.906e-01  7.560e-02   2.521 0.011694 *  
## educationuniversity.degree   -1.647e-02  6.692e-02  -0.246 0.805539    
## defaultyes                   -1.130e+01  1.970e+02  -0.057 0.954233    
## housingyes                    2.386e-01  3.182e-02   7.498 6.47e-14 ***
## contacttelephone             -8.899e-02  6.249e-02  -1.424 0.154448    
## monthaug                     -3.499e-02  1.507e-01  -0.232 0.816361    
## monthdec                      1.189e-01  2.546e-01   0.467 0.640470    
## monthjul                      2.246e-02  8.708e-02   0.258 0.796433    
## monthjun                     -1.049e-01  1.503e-01  -0.698 0.485225    
## monthmar                      8.956e-02  1.792e-01   0.500 0.617301    
## monthmay                     -5.153e-02  8.371e-02  -0.616 0.538169    
## monthnov                     -1.972e-01  1.063e-01  -1.856 0.063462 .  
## monthoct                     -7.174e-01  1.852e-01  -3.874 0.000107 ***
## monthsep                      9.552e-02  2.057e-01   0.464 0.642369    
## day_of_weekmon               -1.117e-01  4.995e-02  -2.237 0.025277 *  
## day_of_weekthu               -2.288e-02  4.941e-02  -0.463 0.643307    
## day_of_weektue               -8.149e-02  5.070e-02  -1.607 0.107977    
## day_of_weekwed               -7.406e-02  5.067e-02  -1.462 0.143795    
## campaign                      4.953e-03  5.589e-03   0.886 0.375518    
## pdays                        -4.861e-04  2.996e-04  -1.623 0.104694    
## previous                     -4.162e-01  8.734e-02  -4.765 1.89e-06 ***
## poutcomenonexistent          -5.031e-01  1.111e-01  -4.527 5.99e-06 ***
## poutcomesuccess              -3.810e-01  2.941e-01  -1.296 0.195103    
## emp.var.rate                 -2.159e-01  1.692e-01  -1.276 0.201949    
## cons.price.idx                2.293e-01  2.778e-01   0.825 0.409173    
## cons.conf.idx                -1.996e-03  9.031e-03  -0.221 0.825043    
## euribor3m                     2.353e-01  1.254e-01   1.876 0.060694 .  
## nr.employed                  -1.421e-03  3.272e-03  -0.434 0.664006    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 22833  on 16470  degrees of freedom
## Residual deviance: 22652  on 16424  degrees of freedom
## AIC: 22746
## 
## Number of Fisher Scoring iterations: 10

pred <- predict(model, test, type="response")

kable(head(data.frame(Probabilitas=pred)))

	Probabilitas
19	0.4993936
26	0.5189362
31	0.5587478
37	0.4701323
38	0.4522836
41	0.5051517

# Turn the predicted probabilities into class labels with a 0.5 cutoff.
# The factor levels are taken from the reference column, so both classes are
# always present and in the same order, even if one is never predicted
# (as.factor() would silently drop such a class).
pred_class <- factor(
  ifelse(pred > 0.5, "yes", "no"),
  levels = levels(test$y)
)

kable(data.frame(Kategori=names(table(pred_class)),
                 Jumlah=as.vector(table(pred_class))))

Kategori	Jumlah
no	1298
yes	1153

cat("=== MODEL TANPA BALANCING ===\n")

## === MODEL TANPA BALANCING ===

# Baseline: logistic regression fitted on the original (unbalanced) training
# set, using every other column as a predictor.
model_awal <- glm(y ~ ., data = train, family = binomial)

# Predicted probabilities on the test set, cut at 0.5 into class labels.
pred_awal_prob <- predict(model_awal, test, type = "response")
# Build the factor with the levels of the reference column. With as.factor()
# a prediction vector that contains only one class ends up with a single
# level, which makes confusionMatrix() warn that the levels do not match the
# reference and refactor the data itself.
pred_awal <- factor(
  ifelse(pred_awal_prob > 0.5, "yes", "no"),
  levels = levels(test$y)
)

library(caret)
cm_awal <- confusionMatrix(pred_awal, test$y)

print(cm_awal)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2067  384
##        yes    0    0
##                                           
##                Accuracy : 0.8433          
##                  95% CI : (0.8283, 0.8575)
##     No Information Rate : 0.8433          
##     P-Value [Acc > NIR] : 0.5136          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.8433          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.8433          
##          Detection Rate : 0.8433          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : no              
##

cat("=== MODEL DENGAN BALANCING ===\n")

## === MODEL DENGAN BALANCING ===

# Logistic regression fitted on the oversampled training set (same formula
# and data as `model` fitted earlier in this script).
model_akhir <- glm(y ~ ., data = train_balanced, family = binomial)

# Predicted probabilities on the test set, cut at 0.5 into class labels.
pred_akhir_prob <- predict(model_akhir, test, type = "response")
# Take the factor levels from the reference column so that both classes are
# always present and ordered like the reference, even when one of them is
# never predicted.
pred_akhir <- factor(
  ifelse(pred_akhir_prob > 0.5, "yes", "no"),
  levels = levels(test$y)
)

cm_akhir <- confusionMatrix(pred_akhir, test$y)

print(cm_akhir)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1101  197
##        yes  966  187
##                                           
##                Accuracy : 0.5255          
##                  95% CI : (0.5055, 0.5454)
##     No Information Rate : 0.8433          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0108          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5327          
##             Specificity : 0.4870          
##          Pos Pred Value : 0.8482          
##          Neg Pred Value : 0.1622          
##              Prevalence : 0.8433          
##          Detection Rate : 0.4492          
##    Detection Prevalence : 0.5296          
##       Balanced Accuracy : 0.5098          
##                                           
##        'Positive' Class : no              
##

cat("=== METRIK MODEL ===\n")

## === METRIK MODEL ===

# Extract the headline metrics from a confusion-matrix result.
#
# Args:
#   cm: object with a named numeric vector `overall` containing "Accuracy"
#       and a named numeric vector `byClass` containing "Precision",
#       "Recall" and "F1" (the structure this script gets back from
#       confusionMatrix()).
#
# Returns:
#   Named numeric vector with the elements Accuracy, Precision, Recall and
#   F1_Score.
#
# NOTE(review): the per-class metrics describe whichever class `cm` treats
# as the positive class; confirm that this is the class of interest.
get_metrics <- function(cm) {
  # unname() drops the inner element name. Without it c() combines the outer
  # and inner names into labels such as "Accuracy.Accuracy".
  c(
    Accuracy = unname(cm$overall["Accuracy"]),
    Precision = unname(cm$byClass["Precision"]),
    Recall = unname(cm$byClass["Recall"]),
    F1_Score = unname(cm$byClass["F1"])
  )
}

# Collect the metrics of both models.
metrik_awal <- get_metrics(cm_awal)
metrik_akhir <- get_metrics(cm_akhir)

# Side-by-side comparison table. Both models are fitted on the same
# preprocessed data and differ only in whether the training set was
# balanced, so the columns are labelled by balancing, in line with the
# "MODEL TANPA BALANCING" / "MODEL DENGAN BALANCING" section headers.
hasil_metrik <- data.frame(
  Metrik = names(metrik_awal),
  Tanpa_Balancing = as.numeric(metrik_awal),
  Dengan_Balancing = as.numeric(metrik_akhir)
)

library(knitr)
kable(hasil_metrik)

Metrik	Tanpa_Balancing	Dengan_Balancing
Accuracy.Accuracy	0.8433293	0.5254998
Precision.Precision	0.8433293	0.8482280
Recall.Recall	1.0000000	0.5326560
F1_Score.F1	0.9150066	0.6543834

codingan ads

ANITA 16231008

2026-03-28