GOALS

Memprediksi apakah seorang klien akan gagal bayar (default payment) di bulan berikutnya.

Variabel Target

default.payment.next.month - 1 (gagal bayar) - 0 (tidak gagal bayar)

Variabel Prediktor

Step 1 — Import Library & Load Dataset

# Import Library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.2
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(themis)
## Warning: package 'themis' was built under R version 4.5.2
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.5.2
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
## 
##     step
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.5.2
library(Rborist)
## Warning: package 'Rborist' was built under R version 4.5.2
## Rborist 0.3-11
## Type RboristNews() to see new features/changes/bug fixes.
data_credit <- read_excel("default of credit card clients.xls", sheet = 1)
## New names:
## • `` -> `...1`
head(data_credit)
## # A tibble: 6 × 25
##   ...1  X1     X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12  
##   <chr> <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID    LIMIT… SEX   EDUC… MARR… AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
## 2 1     20000  2     2     1     24    2     2     -1    -1    -2    -2    3913 
## 3 2     120000 2     2     2     26    -1    2     0     0     0     2     2682 
## 4 3     90000  2     2     2     34    0     0     0     0     0     0     29239
## 5 4     50000  2     2     1     37    0     0     0     0     0     0     46990
## 6 5     50000  1     2     1     57    -1    0     -1    0     0     0     8617 
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## #   X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>

Step 2 — Data Cleaning

# Cek Info data
str(data_credit)
## tibble [30,001 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ...1: chr [1:30001] "ID" "1" "2" "3" ...
##  $ X1  : chr [1:30001] "LIMIT_BAL" "20000" "120000" "90000" ...
##  $ X2  : chr [1:30001] "SEX" "2" "2" "2" ...
##  $ X3  : chr [1:30001] "EDUCATION" "2" "2" "2" ...
##  $ X4  : chr [1:30001] "MARRIAGE" "1" "2" "2" ...
##  $ X5  : chr [1:30001] "AGE" "24" "26" "34" ...
##  $ X6  : chr [1:30001] "PAY_0" "2" "-1" "0" ...
##  $ X7  : chr [1:30001] "PAY_2" "2" "2" "0" ...
##  $ X8  : chr [1:30001] "PAY_3" "-1" "0" "0" ...
##  $ X9  : chr [1:30001] "PAY_4" "-1" "0" "0" ...
##  $ X10 : chr [1:30001] "PAY_5" "-2" "0" "0" ...
##  $ X11 : chr [1:30001] "PAY_6" "-2" "2" "0" ...
##  $ X12 : chr [1:30001] "BILL_AMT1" "3913" "2682" "29239" ...
##  $ X13 : chr [1:30001] "BILL_AMT2" "3102" "1725" "14027" ...
##  $ X14 : chr [1:30001] "BILL_AMT3" "689" "2682" "13559" ...
##  $ X15 : chr [1:30001] "BILL_AMT4" "0" "3272" "14331" ...
##  $ X16 : chr [1:30001] "BILL_AMT5" "0" "3455" "14948" ...
##  $ X17 : chr [1:30001] "BILL_AMT6" "0" "3261" "15549" ...
##  $ X18 : chr [1:30001] "PAY_AMT1" "0" "0" "1518" ...
##  $ X19 : chr [1:30001] "PAY_AMT2" "689" "1000" "1500" ...
##  $ X20 : chr [1:30001] "PAY_AMT3" "0" "1000" "1000" ...
##  $ X21 : chr [1:30001] "PAY_AMT4" "0" "1000" "1000" ...
##  $ X22 : chr [1:30001] "PAY_AMT5" "0" "0" "1000" ...
##  $ X23 : chr [1:30001] "PAY_AMT6" "0" "2000" "5000" ...
##  $ Y   : chr [1:30001] "default payment next month" "1" "1" "0" ...
summary(data_credit)
##      ...1                X1                 X2                 X3           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       X4                 X5                 X6                 X7           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       X8                 X9                X10                X11           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      X12                X13                X14                X15           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      X16                X17                X18                X19           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      X20                X21                X22                X23           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       Y            
##  Length:30001      
##  Class :character  
##  Mode  :character
# Cek Missing Value/Data Null
colSums(is.na(data_credit))
## ...1   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  X16  X17  X18  X19  X20  X21  X22  X23    Y 
##    0    0    0    0    0    0    0    0    0
# Cek Duplikasi
data_credit %>% 
  summarise(duplikat = n() - n_distinct(across(.cols = everything())))
## # A tibble: 1 × 1
##   duplikat
##      <int>
## 1        0
# Rename kolom
data_credit <- rename(data_credit,
    ID = `...1`,
    LIMIT_BAL = X1, SEX = X2, EDUCATION = X3, MARRIAGE = X4, AGE = X5,
    PAY_0 = X6, PAY_2 = X7, PAY_3 = X8, PAY_4 = X9, PAY_5 = X10, PAY_6 = X11,
    BILL_AMT1 = X12, BILL_AMT2 = X13, BILL_AMT3 = X14, BILL_AMT4 = X15,
    BILL_AMT5 = X16, BILL_AMT6 = X17, PAY_AMT1 = X18, PAY_AMT2 = X19,
    PAY_AMT3 = X20, PAY_AMT4 = X21, PAY_AMT5 = X22, PAY_AMT6 = X23,
    default_payment_next_month = Y)
head(data_credit)
## # A tibble: 6 × 25
##   ID    LIMIT_BAL SEX   EDUCATION MARRIAGE AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <chr> <chr>     <chr> <chr>     <chr>    <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID    LIMIT_BAL SEX   EDUCATION MARRIAGE AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## 2 1     20000     2     2         1        24    2     2     -1    -1    -2   
## 3 2     120000    2     2         2        26    -1    2     0     0     0    
## 4 3     90000     2     2         2        34    0     0     0     0     0    
## 5 4     50000     2     2         1        37    0     0     0     0     0    
## 6 5     50000     1     2         1        57    -1    0     -1    0     0    
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## #   BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## #   PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## #   PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Menghapus baris pertama
data_credit <- data_credit[-1, ]
head(data_credit)
## # A tibble: 6 × 25
##   ID    LIMIT_BAL SEX   EDUCATION MARRIAGE AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <chr> <chr>     <chr> <chr>     <chr>    <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1     20000     2     2         1        24    2     2     -1    -1    -2   
## 2 2     120000    2     2         2        26    -1    2     0     0     0    
## 3 3     90000     2     2         2        34    0     0     0     0     0    
## 4 4     50000     2     2         1        37    0     0     0     0     0    
## 5 5     50000     1     2         1        57    -1    0     -1    0     0    
## 6 6     50000     1     1         2        37    0     0     0     0     0    
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## #   BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## #   PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## #   PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Menghapus kolom ID
data_credit <- data_credit[, -1]
head(data_credit)
## # A tibble: 6 × 24
##   LIMIT_BAL SEX   EDUCATION MARRIAGE AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
##   <chr>     <chr> <chr>     <chr>    <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 20000     2     2         1        24    2     2     -1    -1    -2    -2   
## 2 120000    2     2         2        26    -1    2     0     0     0     2    
## 3 90000     2     2         2        34    0     0     0     0     0     0    
## 4 50000     2     2         1        37    0     0     0     0     0     0    
## 5 50000     1     2         1        57    -1    0     -1    0     0     0    
## 6 50000     1     1         2        37    0     0     0     0     0     0    
## # ℹ 13 more variables: BILL_AMT1 <chr>, BILL_AMT2 <chr>, BILL_AMT3 <chr>,
## #   BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>, PAY_AMT1 <chr>,
## #   PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>, PAY_AMT5 <chr>,
## #   PAY_AMT6 <chr>, default_payment_next_month <chr>
# Mengubah kolom AGE, LIMIT_BAL, BILL_AMT1 - BILL_AMT6, PAY_AMT1 - PAY_AMT6 menjadi numerik
data_credit <- data_credit %>%
  mutate(across(c(AGE, LIMIT_BAL, BILL_AMT1:BILL_AMT6, PAY_0:PAY_6, PAY_AMT1:PAY_AMT6),
                ~ as.numeric(as.character(.))))
str(data_credit)
## tibble [30,000 × 24] (S3: tbl_df/tbl/data.frame)
##  $ LIMIT_BAL                 : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : chr [1:30000] "2" "2" "2" "2" ...
##  $ EDUCATION                 : chr [1:30000] "2" "2" "2" "2" ...
##  $ MARRIAGE                  : chr [1:30000] "1" "2" "2" "1" ...
##  $ AGE                       : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num [1:30000] 3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num [1:30000] 3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num [1:30000] 689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num [1:30000] 0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num [1:30000] 0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num [1:30000] 0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num [1:30000] 0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num [1:30000] 689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num [1:30000] 0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num [1:30000] 0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num [1:30000] 0 2000 5000 1000 679 ...
##  $ default_payment_next_month: chr [1:30000] "1" "1" "0" "0" ...

Step 3 - Exploratory Data Analysis (EDA)

# Distribusi target
ggplot(data_credit, aes(x = factor(default_payment_next_month), fill = factor(default_payment_next_month))) + 
  geom_bar() + 
  scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Distribusi Default Payment Next Month",
x = "Default (0 = Tidak, 1 = Ya)",
y = "Jumlah") +
theme_minimal()

# Persentase Distribusi
persen_dist <- data_credit %>%
  group_by(default_payment_next_month) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count) * 100)
print(persen_dist)
## # A tibble: 2 × 3
##   default_payment_next_month count percentage
##   <chr>                      <int>      <dbl>
## 1 0                          23364       77.9
## 2 1                           6636       22.1
# Perbandingan Pembayaran bulan berikutnya berdasarkan jenis kelamin
data_credit %>% 
  group_by(SEX, default_payment_next_month) %>% 
  summarise(count = n()) %>%
  mutate(SEX = ifelse(SEX == 1, "Male", "Female")) %>% 
  ggplot(aes(x = SEX, y = count, fill = factor(default_payment_next_month))) + 
  geom_bar(stat = "identity", position = "fill") + 
  scale_fill_manual(values = c("0" = "blue", "1" = "red")) + 
  labs(title = "Proporsi berdasarkan Jenis Kelamin", y = "Proporsi")
## `summarise()` has grouped output by 'SEX'. You can override using the `.groups`
## argument.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Tingkat Pendidikan
data_credit %>%
  mutate(EDUCATION = case_when(
    EDUCATION %in% c(1) ~ "graduate school",
    EDUCATION %in% c(2) ~ "university",
    EDUCATION %in% c(3) ~ "high school",
    TRUE ~ "Others"
  )) %>%
  mutate(EDUCATION = factor(EDUCATION, levels = c("graduate school", "university", "high school", "Others"))) %>%
  group_by(EDUCATION, default_payment_next_month) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = EDUCATION, y = count, fill = factor(default_payment_next_month))) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(values = c("0" = "blue", "1" = "red"))
## `summarise()` has grouped output by 'EDUCATION'. You can override using the
## `.groups` argument.

  labs(title = "Proporsi berdasarkan Pendidikan", x = "Tingkat Pendidikan")
## <ggplot2::labels> List of 2
##  $ x    : chr "Tingkat Pendidikan"
##  $ title: chr "Proporsi berdasarkan Pendidikan"

Insight : Semakin rendah tingkat pendidikan klien, semakin besar risiko gagal bayar. Oleh karena itu, tingkat pendidikan adalah prediktor risiko yang kuat

# Perbandingan Pembayaran bulan berikutnya berdasarkan Status Pernikahan
data_credit %>% 
  filter(MARRIAGE %in% 1:3) %>%
  mutate(MARRIAGE = factor(MARRIAGE,
                           levels = 1:3,
                           labels = c("married", "single", "Others"))) %>%
  group_by(MARRIAGE, default_payment_next_month) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(x = MARRIAGE, y = count, fill = factor(default_payment_next_month))) + 
  geom_bar(stat = "identity", position = "fill") + 
  scale_fill_manual(values = c("0" = "blue", "1" = "red"))
## `summarise()` has grouped output by 'MARRIAGE'. You can override using the
## `.groups` argument.

  labs(title = "Proporsi berdasarkan Status Pernikahan", x = "Status")
## <ggplot2::labels> List of 2
##  $ x    : chr "Status"
##  $ title: chr "Proporsi berdasarkan Status Pernikahan"

Insight : Klien yang bercerai berkontribusi pada tingginya risiko di kelompok “Others”,menunjukkan bahwa perubahan status keluarga (seperti perceraian) dapat menyebabkan ketidakstabilan finansial.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Umur
data_credit %>%
  mutate(Umur = cut(AGE,
                         breaks = c(20, 30, 40, 50, 60, 70, 80),
                         right = FALSE,
                         labels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"))) %>%
  group_by(Umur, default_payment_next_month) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = Umur, y = count, fill = factor(default_payment_next_month))) +
  geom_bar(stat = "identity", position = "fill") + 
  scale_fill_manual(values = c("0" = "blue", "1" = "red"))
## `summarise()` has grouped output by 'Umur'. You can override using the
## `.groups` argument.

  labs(title = "Proporsi berdasarkan Kelompok Umur", x = "Kelompok Umur", y = "Proporsi")
## <ggplot2::labels> List of 3
##  $ x    : chr "Kelompok Umur"
##  $ y    : chr "Proporsi"
##  $ title: chr "Proporsi berdasarkan Kelompok Umur"
# Analisis Pengaruh Status Pembayaran Terakhir (PAY_)
data_credit %>%
  mutate(PAY_0_Faktor = as.factor(PAY_0)) %>%
  group_by(PAY_0_Faktor, default_payment_next_month) %>%
  summarise(count = n(), .groups = 'drop') %>%
  ggplot(aes(x = PAY_0_Faktor, y = count, fill = factor(default_payment_next_month))) +
  geom_bar(stat = "identity", position = "fill") +
  scale_fill_manual(values = c("0" = "blue", "1" = "red"), name = "Default") +
  labs(
    title = "Proporsi Gagal Bayar berdasarkan Status Pembayaran Terakhir (PAY_0)",
    subtitle = "Angka > 0 menunjukkan keterlambatan (1=telat 1 bln, 2=telat 2 bln, dst)",
    x = "Status Pembayaran (PAY_0)",
    y = "Proporsi"
  ) +
  theme_minimal()

- Klien yang memiliki status pembayaran telat > 2 bulan beresiko gagal membayar di bulan berikutnya. Insight : Riwayat pembayaran adalah prediktor utama dari kegagalan kredit di bulan berikutnya. Klien yang mencapai status keterlambatan 2 bulan atau lebih (PAY >= 2) memiliki probabilitas kegagalan (default) lebih tinggi, yang mengindikasikan mereka sangat mungkin gagal membayar di bulan berikutnya.

# Korelasi Antar Numerik
library(reshape2)

num_cols <- c("AGE", "LIMIT_BAL", paste0("BILL_AMT", 1:6), paste0("PAY_AMT", 1:6), paste0("PAY_", c(0, 2:6)))

corr_matrix <- cor(data_credit[num_cols], use = "complete.obs")

# Melt matriks untuk ggplot
melted_corr <- melt(corr_matrix)

# Buat heatmap
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), name = "Korelasi") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
  labs(title = "Korelasi antar Fitur Numerik",x = "", y = "")

Kesimpulan Akhir EDA - 👫 Jenis Kelamin : Proporsi klien Laki-laki yang gagal bayar lebih tinggi dibandingkan dengan perempuan (berdasarkan proporsi dalam masing-masing kelompok gender). Ini menunjukkan bahwa faktor gender perlu dipertimbangkan sebagai prediktor risiko. - 🎓 Tingkat Pendidikan : Semakin rendah tingkat pendidikan klien (khususnya university dan high school mendominasi jumlah default), semakin besar risiko gagal bayar. Hal ini menjadikan tingkat pendidikan sebagai prediktor risiko yang kuat. - 💍 Status Pernikahan : Proposi nya tidak jauh berbeda dengan kelompok yang lain (married dan single), maka ini bukan prediktor yang kuat. - 🎂 Usia : Klien berusia 60 tahun ke atas memiliki risiko gagal bayar tertinggi, yang kemungkinan besar disebabkan oleh penurunan pendapatan pasca-pensiun, menjadikan usia sebagai faktor yang signifikan. - 💳 Status Pembayaran Terakhir : Riwayat pembayaran adalah prediktor utama kegagalan pembayaran di bulan berikutnya. - 📈 Korelasi Antar Variabel Numerik : Ditemukan korelasi yang sangat kuat (tinggi dan positif) antar variabel jumlah tagihan bulanan. Korelasi yang tinggi ini mengindikasikan adanya multikolinearitas antar variabel BILL_AMT dan PAY_AMT berturut-turut.

Step 4 - Modeling & Evaluasi Model

# Bagian 1: Pemrosesan Data & Feature Engineering

# Menghapus baris dengan nilai NA 
data_credit_bersih <- na.omit(data_credit)

# FEATURE ENGINEERING (Untuk mengatasi multikolinearitas)
data_credit_bersih <- data_credit_bersih %>%
  mutate(
    Avg_Bill_Amt = (BILL_AMT1 + BILL_AMT2 + BILL_AMT3 + BILL_AMT4 + BILL_AMT5 + BILL_AMT6) / 6,
    # Rasio tagihan utang
    Debt_Limit_Ratio = Avg_Bill_Amt / LIMIT_BAL,
    Debt_Limit_Ratio = ifelse(is.infinite(Debt_Limit_Ratio) | is.na(Debt_Limit_Ratio), 0, Debt_Limit_Ratio),
    
    Avg_Payment_Status = (PAY_0 + PAY_2 + PAY_3 + PAY_4 + PAY_5 + PAY_6) / 6,
    
    Total_Payment = PAY_AMT1 + PAY_AMT2 + PAY_AMT3 + PAY_AMT4 + PAY_AMT5 + PAY_AMT6
  )

# Pemrosesan Kategori dan Encoding 
data_processing <- data_credit_bersih %>%
  # Mengelompokkan EDUCATION
  mutate(EDUCATION = case_when(EDUCATION %in% c(1, 2, 3) ~ as.character(EDUCATION), TRUE ~ "4")) %>%
  # Mengelompokkan MARRIAGE 
  mutate(MARRIAGE = case_when(MARRIAGE %in% c(1, 2) ~ as.character(MARRIAGE), TRUE ~ "3")) %>%
  mutate(SEX = as.character(SEX))

# Ekstrak Target (Y)
Y_target <- factor(
  data_processing$default_payment_next_month, 
  levels = c(0, 1), 
  labels = c("NoDefault", "Default") 
)

# One-Hot Encoding pada Prediktor (X)
X_prediktor <- data_processing %>% 
select(-default_payment_next_month)
X_encoded <- X_prediktor %>%
  dummy_cols(select_columns = c("SEX", "EDUCATION", "MARRIAGE"),
             remove_selected_columns = TRUE, 
             remove_first_dummy = TRUE)

# Memastikan semua kolom prediktor adalah NUMERIK
X_final <- X_encoded %>%
  mutate(across(everything(), as.numeric))
             
# Pembagian Data (70% Training, 30% Testing)
set.seed(42) 
index_split <- createDataPartition(Y_target, p = 0.7, list = FALSE)

# Variabel training dan testing
train_data_final <- data.frame(
    X_final[index_split, ], 
    default_payment_next_month = Y_target[index_split]
)
test_data_final <- data.frame(
    X_final[-index_split, ], 
    default_payment_next_month = Y_target[-index_split]
)

cat("Fitur Engineering dan Pembagian Data Selesai.\n")
## Fitur Engineering dan Pembagian Data Selesai.
print(table(train_data_final$default_payment_next_month))
## 
## NoDefault   Default 
##     16355      4646
# Bagian 2: Data Balancing dan Persiapan Matriks XGBoost

# Tambahkan balancing dengan SMOTE
train_balanced <- recipe(default_payment_next_month ~ ., data = train_data_final) %>%
  step_smote(default_payment_next_month, over_ratio = 1) %>% 
  prep() %>%
  bake(new_data = NULL)

# Konversi target ke numerik untuk XGBoost 
# Konversi faktor ("NoDefault", "Default") menjadi 0 dan 1.
train_balanced$default_payment_next_month <- as.numeric(train_balanced$default_payment_next_month) - 1
test_data_final$default_payment_next_month <- as.numeric(test_data_final$default_payment_next_month) - 1


# Buat matrix untuk XGBoost
train_matrix <- xgb.DMatrix(data = as.matrix(train_balanced %>% select(-default_payment_next_month)), 
                            label = train_balanced$default_payment_next_month)
test_matrix <- xgb.DMatrix(data = as.matrix(test_data_final %>% select(-default_payment_next_month)), 
                           label = test_data_final$default_payment_next_month)
# Bagian 3 : Pelatihan Model XGBoost
# Tentukan Parameter XGBoost
params <- list(
  objective = "binary:logistic", 
  eval_metric = "logloss",       
  eta = 0.05,                    
  max_depth = 6,                 
  subsample = 0.8,               
  colsample_bytree = 0.8         
)

# Latih Model XGBoost
set.seed(42)
xgb_model <- xgb.train(
  params = params,
  data = train_matrix,
  nrounds = 300,                 
  evals = list(train = train_matrix, test = test_matrix),
  verbose = 1,                   
  early_stopping_rounds = 10   
)
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
## 
## [1]  train-logloss:0.667966  test-logloss:0.674058 
## [2]  train-logloss:0.645963  test-logloss:0.656850 
## [3]  train-logloss:0.625945  test-logloss:0.641466 
## [4]  train-logloss:0.607543  test-logloss:0.627513 
## [5]  train-logloss:0.590345  test-logloss:0.614411 
## [6]  train-logloss:0.574546  test-logloss:0.602337 
## [7]  train-logloss:0.560503  test-logloss:0.592395 
## [8]  train-logloss:0.546715  test-logloss:0.581862 
## [9]  train-logloss:0.533779  test-logloss:0.571984 
## [10] train-logloss:0.521208  test-logloss:0.562896 
## [11] train-logloss:0.509410  test-logloss:0.554322 
## [12] train-logloss:0.498893  test-logloss:0.546362 
## [13] train-logloss:0.489323  test-logloss:0.539245 
## [14] train-logloss:0.479969  test-logloss:0.532327 
## [15] train-logloss:0.470879  test-logloss:0.525670 
## [16] train-logloss:0.462903  test-logloss:0.520169 
## [17] train-logloss:0.455405  test-logloss:0.514863 
## [18] train-logloss:0.448670  test-logloss:0.510097 
## [19] train-logloss:0.441714  test-logloss:0.505133 
## [20] train-logloss:0.435938  test-logloss:0.501230 
## [21] train-logloss:0.430345  test-logloss:0.497451 
## [22] train-logloss:0.424108  test-logloss:0.493172 
## [23] train-logloss:0.418584  test-logloss:0.489485 
## [24] train-logloss:0.413430  test-logloss:0.485837 
## [25] train-logloss:0.408097  test-logloss:0.482372 
## [26] train-logloss:0.403282  test-logloss:0.479164 
## [27] train-logloss:0.399065  test-logloss:0.476283 
## [28] train-logloss:0.394779  test-logloss:0.473466 
## [29] train-logloss:0.390561  test-logloss:0.470709 
## [30] train-logloss:0.386352  test-logloss:0.468196 
## [31] train-logloss:0.382468  test-logloss:0.465829 
## [32] train-logloss:0.378948  test-logloss:0.463655 
## [33] train-logloss:0.375510  test-logloss:0.461576 
## [34] train-logloss:0.372001  test-logloss:0.459507 
## [35] train-logloss:0.368506  test-logloss:0.457476 
## [36] train-logloss:0.365288  test-logloss:0.455689 
## [37] train-logloss:0.362576  test-logloss:0.454059 
## [38] train-logloss:0.359783  test-logloss:0.452477 
## [39] train-logloss:0.357041  test-logloss:0.450935 
## [40] train-logloss:0.354715  test-logloss:0.449587 
## [41] train-logloss:0.352508  test-logloss:0.448294 
## [42] train-logloss:0.350295  test-logloss:0.447165 
## [43] train-logloss:0.347826  test-logloss:0.445822 
## [44] train-logloss:0.345403  test-logloss:0.444491 
## [45] train-logloss:0.343433  test-logloss:0.443453 
## [46] train-logloss:0.341453  test-logloss:0.442527 
## [47] train-logloss:0.339116  test-logloss:0.441335 
## [48] train-logloss:0.337125  test-logloss:0.440376 
## [49] train-logloss:0.335444  test-logloss:0.439585 
## [50] train-logloss:0.333417  test-logloss:0.438658 
## [51] train-logloss:0.331947  test-logloss:0.437879 
## [52] train-logloss:0.330479  test-logloss:0.437095 
## [53] train-logloss:0.328454  test-logloss:0.436331 
## [54] train-logloss:0.326775  test-logloss:0.435580 
## [55] train-logloss:0.325247  test-logloss:0.434918 
## [56] train-logloss:0.323935  test-logloss:0.434313 
## [57] train-logloss:0.323111  test-logloss:0.433957 
## [58] train-logloss:0.321567  test-logloss:0.433261 
## [59] train-logloss:0.320340  test-logloss:0.432789 
## [60] train-logloss:0.318972  test-logloss:0.432239 
## [61] train-logloss:0.317615  test-logloss:0.431739 
## [62] train-logloss:0.316512  test-logloss:0.431391 
## [63] train-logloss:0.315402  test-logloss:0.430917 
## [64] train-logloss:0.314544  test-logloss:0.430608 
## [65] train-logloss:0.313303  test-logloss:0.430028 
## [66] train-logloss:0.312272  test-logloss:0.429654 
## [67] train-logloss:0.311399  test-logloss:0.429375 
## [68] train-logloss:0.310028  test-logloss:0.428930 
## [69] train-logloss:0.308951  test-logloss:0.428465 
## [70] train-logloss:0.307869  test-logloss:0.427959 
## [71] train-logloss:0.306802  test-logloss:0.427593 
## [72] train-logloss:0.305951  test-logloss:0.427223 
## [73] train-logloss:0.305145  test-logloss:0.427006 
## [74] train-logloss:0.304177  test-logloss:0.426529 
## [75] train-logloss:0.303164  test-logloss:0.426130 
## [76] train-logloss:0.302202  test-logloss:0.425858 
## [77] train-logloss:0.301315  test-logloss:0.425562 
## [78] train-logloss:0.300603  test-logloss:0.425335 
## [79] train-logloss:0.299876  test-logloss:0.425145 
## [80] train-logloss:0.299122  test-logloss:0.425027 
## [81] train-logloss:0.298411  test-logloss:0.424896 
## [82] train-logloss:0.297785  test-logloss:0.424742 
## [83] train-logloss:0.297182  test-logloss:0.424752 
## [84] train-logloss:0.296614  test-logloss:0.424654 
## [85] train-logloss:0.295938  test-logloss:0.424473 
## [86] train-logloss:0.295280  test-logloss:0.424370 
## [87] train-logloss:0.294641  test-logloss:0.424302 
## [88] train-logloss:0.294318  test-logloss:0.424214 
## [89] train-logloss:0.293750  test-logloss:0.424074 
## [90] train-logloss:0.293315  test-logloss:0.423956 
## [91] train-logloss:0.292591  test-logloss:0.423852 
## [92] train-logloss:0.292084  test-logloss:0.423745 
## [93] train-logloss:0.291467  test-logloss:0.423622 
## [94] train-logloss:0.290967  test-logloss:0.423457 
## [95] train-logloss:0.290355  test-logloss:0.423311 
## [96] train-logloss:0.289728  test-logloss:0.423122 
## [97] train-logloss:0.289142  test-logloss:0.422998 
## [98] train-logloss:0.288595  test-logloss:0.422901 
## [99] train-logloss:0.288173  test-logloss:0.422871 
## [100]    train-logloss:0.287710  test-logloss:0.422844 
## [101]    train-logloss:0.287275  test-logloss:0.422739 
## [102]    train-logloss:0.286771  test-logloss:0.422588 
## [103]    train-logloss:0.286365  test-logloss:0.422515 
## [104]    train-logloss:0.285912  test-logloss:0.422293 
## [105]    train-logloss:0.285482  test-logloss:0.422257 
## [106]    train-logloss:0.285082  test-logloss:0.422139 
## [107]    train-logloss:0.284575  test-logloss:0.422058 
## [108]    train-logloss:0.284113  test-logloss:0.421974 
## [109]    train-logloss:0.283903  test-logloss:0.421938 
## [110]    train-logloss:0.283390  test-logloss:0.421881 
## [111]    train-logloss:0.282940  test-logloss:0.421840 
## [112]    train-logloss:0.282447  test-logloss:0.421663 
## [113]    train-logloss:0.281972  test-logloss:0.421646 
## [114]    train-logloss:0.281645  test-logloss:0.421612 
## [115]    train-logloss:0.281242  test-logloss:0.421522 
## [116]    train-logloss:0.280878  test-logloss:0.421464 
## [117]    train-logloss:0.280492  test-logloss:0.421411 
## [118]    train-logloss:0.279941  test-logloss:0.421384 
## [119]    train-logloss:0.279373  test-logloss:0.421363 
## [120]    train-logloss:0.278870  test-logloss:0.421321 
## [121]    train-logloss:0.278461  test-logloss:0.421347 
## [122]    train-logloss:0.278001  test-logloss:0.421265 
## [123]    train-logloss:0.277590  test-logloss:0.421254 
## [124]    train-logloss:0.277172  test-logloss:0.421235 
## [125]    train-logloss:0.276770  test-logloss:0.421129 
## [126]    train-logloss:0.276401  test-logloss:0.421133 
## [127]    train-logloss:0.275977  test-logloss:0.421227 
## [128]    train-logloss:0.275610  test-logloss:0.421213 
## [129]    train-logloss:0.275136  test-logloss:0.421145 
## [130]    train-logloss:0.274862  test-logloss:0.421118 
## [131]    train-logloss:0.274494  test-logloss:0.421049 
## [132]    train-logloss:0.274158  test-logloss:0.421046 
## [133]    train-logloss:0.273802  test-logloss:0.421023 
## [134]    train-logloss:0.273417  test-logloss:0.421044 
## [135]    train-logloss:0.273232  test-logloss:0.421021 
## [136]    train-logloss:0.272849  test-logloss:0.421020 
## [137]    train-logloss:0.272540  test-logloss:0.420911 
## [138]    train-logloss:0.272184  test-logloss:0.420841 
## [139]    train-logloss:0.271783  test-logloss:0.420769 
## [140]    train-logloss:0.271512  test-logloss:0.420743 
## [141]    train-logloss:0.271029  test-logloss:0.420796 
## [142]    train-logloss:0.270609  test-logloss:0.420839 
## [143]    train-logloss:0.270466  test-logloss:0.420768 
## [144]    train-logloss:0.269974  test-logloss:0.420841 
## [145]    train-logloss:0.269717  test-logloss:0.420881 
## [146]    train-logloss:0.269323  test-logloss:0.420770 
## [147]    train-logloss:0.269041  test-logloss:0.420679 
## [148]    train-logloss:0.268563  test-logloss:0.420646 
## [149]    train-logloss:0.268273  test-logloss:0.420596 
## [150]    train-logloss:0.267915  test-logloss:0.420593 
## [151]    train-logloss:0.267580  test-logloss:0.420483 
## [152]    train-logloss:0.267319  test-logloss:0.420604 
## [153]    train-logloss:0.266999  test-logloss:0.420543 
## [154]    train-logloss:0.266745  test-logloss:0.420499 
## [155]    train-logloss:0.266429  test-logloss:0.420457 
## [156]    train-logloss:0.266147  test-logloss:0.420395 
## [157]    train-logloss:0.265914  test-logloss:0.420366 
## [158]    train-logloss:0.265705  test-logloss:0.420382 
## [159]    train-logloss:0.265479  test-logloss:0.420346 
## [160]    train-logloss:0.265200  test-logloss:0.420299 
## [161]    train-logloss:0.265001  test-logloss:0.420264 
## [162]    train-logloss:0.264835  test-logloss:0.420168 
## [163]    train-logloss:0.264501  test-logloss:0.420040 
## [164]    train-logloss:0.264225  test-logloss:0.419996 
## [165]    train-logloss:0.263917  test-logloss:0.419989 
## [166]    train-logloss:0.263644  test-logloss:0.419971 
## [167]    train-logloss:0.263351  test-logloss:0.419860 
## [168]    train-logloss:0.263133  test-logloss:0.419881 
## [169]    train-logloss:0.262907  test-logloss:0.419878 
## [170]    train-logloss:0.262545  test-logloss:0.419847 
## [171]    train-logloss:0.262335  test-logloss:0.419914 
## [172]    train-logloss:0.262113  test-logloss:0.419846 
## [173]    train-logloss:0.261776  test-logloss:0.419932 
## [174]    train-logloss:0.261432  test-logloss:0.419898 
## [175]    train-logloss:0.261124  test-logloss:0.419954 
## [176]    train-logloss:0.260818  test-logloss:0.419974 
## [177]    train-logloss:0.260493  test-logloss:0.419946 
## [178]    train-logloss:0.260298  test-logloss:0.419904 
## [179]    train-logloss:0.260063  test-logloss:0.419949 
## [180]    train-logloss:0.259786  test-logloss:0.419932 
## [181]    train-logloss:0.259544  test-logloss:0.419969 
## Stopping. Best iteration:
## [182]    train-logloss:0.259245  test-logloss:0.419947
## 
## [182]    train-logloss:0.259245  test-logloss:0.419947
cat(" Pelatihan Model XGBoost Selesai ")
##  Pelatihan Model XGBoost Selesai
# Bagian 4 : Evaluasi Model XGBoost

# Prediksi Probabilitas pada Data Testing
xgb_pred_prob <- predict(xgb_model, test_matrix)

# Konversi Probabilitas menjadi Prediksi Kelas (Optimal 0.35)
threshold <- 0.35
xgb_pred_class <- as.factor(ifelse(xgb_pred_prob > threshold, 1, 0))

# Pastikan level prediksi dan aktual sama
actual_labels <- as.factor(test_data_final$default_payment_next_month)
levels(xgb_pred_class) <- levels(actual_labels)


# Hitung Confusion Matrix
conf_matrix <- confusionMatrix(xgb_pred_class, actual_labels, positive = "1")
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6198  899
##          1  811 1091
##                                          
##                Accuracy : 0.81           
##                  95% CI : (0.8017, 0.818)
##     No Information Rate : 0.7789         
##     P-Value [Acc > NIR] : 2.506e-13      
##                                          
##                   Kappa : 0.4395         
##                                          
##  Mcnemar's Test P-Value : 0.03539        
##                                          
##             Sensitivity : 0.5482         
##             Specificity : 0.8843         
##          Pos Pred Value : 0.5736         
##          Neg Pred Value : 0.8733         
##              Prevalence : 0.2211         
##          Detection Rate : 0.1212         
##    Detection Prevalence : 0.2114         
##       Balanced Accuracy : 0.7163         
##                                          
##        'Positive' Class : 1              
## 
# Hitung dan Plot ROC Curve
roc_obj <- roc(test_data_final$default_payment_next_month, xgb_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC Curve 
plot(roc_obj, main = "Kurva ROC untuk Model XGBoost", col = "blue", lwd = 2)

# Hitung AUC
auc_value <- auc(roc_obj)
cat(paste("\nAUC (Area Under the Curve):", round(auc_value, 4), "\n"))
## 
## AUC (Area Under the Curve): 0.7981
cat(" Evaluasi Model Selesai ")
##  Evaluasi Model Selesai