Memprediksi apakah seorang klien akan gagal bayar (default payment) di bulan berikutnya.
default.payment.next.month - 1 (gagal bayar) - 0 (tidak gagal bayar)
# Import Library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.2
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(themis)
## Warning: package 'themis' was built under R version 4.5.2
## Loading required package: recipes
## Warning: package 'recipes' was built under R version 4.5.2
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stats':
##
## step
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.5.2
library(Rborist)
## Warning: package 'Rborist' was built under R version 4.5.2
## Rborist 0.3-11
## Type RboristNews() to see new features/changes/bug fixes.
data_credit <- read_excel("default of credit card clients.xls", sheet = 1)
## New names:
## • `` -> `...1`
head(data_credit)
## # A tibble: 6 × 25
## ...1 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID LIMIT… SEX EDUC… MARR… AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
## 2 1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913
## 3 2 120000 2 2 2 26 -1 2 0 0 0 2 2682
## 4 3 90000 2 2 2 34 0 0 0 0 0 0 29239
## 5 4 50000 2 2 1 37 0 0 0 0 0 0 46990
## 6 5 50000 1 2 1 57 -1 0 -1 0 0 0 8617
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## # X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>
# Cek Info data
str(data_credit)
## tibble [30,001 × 25] (S3: tbl_df/tbl/data.frame)
## $ ...1: chr [1:30001] "ID" "1" "2" "3" ...
## $ X1 : chr [1:30001] "LIMIT_BAL" "20000" "120000" "90000" ...
## $ X2 : chr [1:30001] "SEX" "2" "2" "2" ...
## $ X3 : chr [1:30001] "EDUCATION" "2" "2" "2" ...
## $ X4 : chr [1:30001] "MARRIAGE" "1" "2" "2" ...
## $ X5 : chr [1:30001] "AGE" "24" "26" "34" ...
## $ X6 : chr [1:30001] "PAY_0" "2" "-1" "0" ...
## $ X7 : chr [1:30001] "PAY_2" "2" "2" "0" ...
## $ X8 : chr [1:30001] "PAY_3" "-1" "0" "0" ...
## $ X9 : chr [1:30001] "PAY_4" "-1" "0" "0" ...
## $ X10 : chr [1:30001] "PAY_5" "-2" "0" "0" ...
## $ X11 : chr [1:30001] "PAY_6" "-2" "2" "0" ...
## $ X12 : chr [1:30001] "BILL_AMT1" "3913" "2682" "29239" ...
## $ X13 : chr [1:30001] "BILL_AMT2" "3102" "1725" "14027" ...
## $ X14 : chr [1:30001] "BILL_AMT3" "689" "2682" "13559" ...
## $ X15 : chr [1:30001] "BILL_AMT4" "0" "3272" "14331" ...
## $ X16 : chr [1:30001] "BILL_AMT5" "0" "3455" "14948" ...
## $ X17 : chr [1:30001] "BILL_AMT6" "0" "3261" "15549" ...
## $ X18 : chr [1:30001] "PAY_AMT1" "0" "0" "1518" ...
## $ X19 : chr [1:30001] "PAY_AMT2" "689" "1000" "1500" ...
## $ X20 : chr [1:30001] "PAY_AMT3" "0" "1000" "1000" ...
## $ X21 : chr [1:30001] "PAY_AMT4" "0" "1000" "1000" ...
## $ X22 : chr [1:30001] "PAY_AMT5" "0" "0" "1000" ...
## $ X23 : chr [1:30001] "PAY_AMT6" "0" "2000" "5000" ...
## $ Y : chr [1:30001] "default payment next month" "1" "1" "0" ...
summary(data_credit)
## ...1 X1 X2 X3
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X4 X5 X6 X7
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X8 X9 X10 X11
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X12 X13 X14 X15
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X16 X17 X18 X19
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X20 X21 X22 X23
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Y
## Length:30001
## Class :character
## Mode :character
# Cek Missing Value/Data Null
colSums(is.na(data_credit))
## ...1 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## X16 X17 X18 X19 X20 X21 X22 X23 Y
## 0 0 0 0 0 0 0 0 0
# Cek Duplikasi
data_credit %>%
summarise(duplikat = n() - n_distinct(across(.cols = everything())))
## # A tibble: 1 × 1
## duplikat
## <int>
## 1 0
# Rename kolom
data_credit <- rename(data_credit,
ID = `...1`,
LIMIT_BAL = X1, SEX = X2, EDUCATION = X3, MARRIAGE = X4, AGE = X5,
PAY_0 = X6, PAY_2 = X7, PAY_3 = X8, PAY_4 = X9, PAY_5 = X10, PAY_6 = X11,
BILL_AMT1 = X12, BILL_AMT2 = X13, BILL_AMT3 = X14, BILL_AMT4 = X15,
BILL_AMT5 = X16, BILL_AMT6 = X17, PAY_AMT1 = X18, PAY_AMT2 = X19,
PAY_AMT3 = X20, PAY_AMT4 = X21, PAY_AMT5 = X22, PAY_AMT6 = X23,
default_payment_next_month = Y)
head(data_credit)
## # A tibble: 6 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## 2 1 20000 2 2 1 24 2 2 -1 -1 -2
## 3 2 120000 2 2 2 26 -1 2 0 0 0
## 4 3 90000 2 2 2 34 0 0 0 0 0
## 5 4 50000 2 2 1 37 0 0 0 0 0
## 6 5 50000 1 2 1 57 -1 0 -1 0 0
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## # BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## # PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## # PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Menghapus baris pertama
data_credit <- data_credit[-1, ]
head(data_credit)
## # A tibble: 6 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## 6 6 50000 1 1 2 37 0 0 0 0 0
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## # BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## # PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## # PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Menghapus kolom ID
data_credit <- data_credit[, -1]
head(data_credit)
## # A tibble: 6 × 24
## LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 20000 2 2 1 24 2 2 -1 -1 -2 -2
## 2 120000 2 2 2 26 -1 2 0 0 0 2
## 3 90000 2 2 2 34 0 0 0 0 0 0
## 4 50000 2 2 1 37 0 0 0 0 0 0
## 5 50000 1 2 1 57 -1 0 -1 0 0 0
## 6 50000 1 1 2 37 0 0 0 0 0 0
## # ℹ 13 more variables: BILL_AMT1 <chr>, BILL_AMT2 <chr>, BILL_AMT3 <chr>,
## # BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>, PAY_AMT1 <chr>,
## # PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>, PAY_AMT5 <chr>,
## # PAY_AMT6 <chr>, default_payment_next_month <chr>
# Mengubah kolom AGE, LIMIT_BAL, BILL_AMT1 - BILL_AMT6, PAY_AMT1 - PAY_AMT6 menjadi numerik
data_credit <- data_credit %>%
mutate(across(c(AGE, LIMIT_BAL, BILL_AMT1:BILL_AMT6, PAY_0:PAY_6, PAY_AMT1:PAY_AMT6),
~ as.numeric(as.character(.))))
str(data_credit)
## tibble [30,000 × 24] (S3: tbl_df/tbl/data.frame)
## $ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : chr [1:30000] "2" "2" "2" "2" ...
## $ EDUCATION : chr [1:30000] "2" "2" "2" "2" ...
## $ MARRIAGE : chr [1:30000] "1" "2" "2" "1" ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ default_payment_next_month: chr [1:30000] "1" "1" "0" "0" ...
# Distribusi target
ggplot(data_credit, aes(x = factor(default_payment_next_month), fill = factor(default_payment_next_month))) +
geom_bar() +
scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Distribusi Default Payment Next Month",
x = "Default (0 = Tidak, 1 = Ya)",
y = "Jumlah") +
theme_minimal()
# Persentase Distribusi
persen_dist <- data_credit %>%
group_by(default_payment_next_month) %>%
summarise(count = n()) %>%
mutate(percentage = count / sum(count) * 100)
print(persen_dist)
## # A tibble: 2 × 3
## default_payment_next_month count percentage
## <chr> <int> <dbl>
## 1 0 23364 77.9
## 2 1 6636 22.1
# Perbandingan Pembayaran bulan berikutnya berdasarkan jenis kelamin
data_credit %>%
group_by(SEX, default_payment_next_month) %>%
summarise(count = n()) %>%
mutate(SEX = ifelse(SEX == 1, "Male", "Female")) %>%
ggplot(aes(x = SEX, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Proporsi berdasarkan Jenis Kelamin", y = "Proporsi")
## `summarise()` has grouped output by 'SEX'. You can override using the `.groups`
## argument.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Tingkat Pendidikan
data_credit %>%
mutate(EDUCATION = case_when(
EDUCATION %in% c(1) ~ "graduate school",
EDUCATION %in% c(2) ~ "university",
EDUCATION %in% c(3) ~ "high school",
TRUE ~ "Others"
)) %>%
mutate(EDUCATION = factor(EDUCATION, levels = c("graduate school", "university", "high school", "Others"))) %>%
group_by(EDUCATION, default_payment_next_month) %>%
summarise(count = n()) %>%
ggplot(aes(x = EDUCATION, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("0" = "blue", "1" = "red"))
## `summarise()` has grouped output by 'EDUCATION'. You can override using the
## `.groups` argument.
labs(title = "Proporsi berdasarkan Pendidikan", x = "Tingkat Pendidikan")
## <ggplot2::labels> List of 2
## $ x : chr "Tingkat Pendidikan"
## $ title: chr "Proporsi berdasarkan Pendidikan"
Insight : Semakin rendah tingkat pendidikan klien, semakin besar risiko gagal bayar. Oleh karena itu, tingkat pendidikan adalah prediktor risiko yang kuat
# Perbandingan Pembayaran bulan berikutnya berdasarkan Status Pernikahan
data_credit %>%
filter(MARRIAGE %in% 1:3) %>%
mutate(MARRIAGE = factor(MARRIAGE,
levels = 1:3,
labels = c("married", "single", "Others"))) %>%
group_by(MARRIAGE, default_payment_next_month) %>%
summarise(count = n()) %>%
ggplot(aes(x = MARRIAGE, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("0" = "blue", "1" = "red"))
## `summarise()` has grouped output by 'MARRIAGE'. You can override using the
## `.groups` argument.
labs(title = "Proporsi berdasarkan Status Pernikahan", x = "Status")
## <ggplot2::labels> List of 2
## $ x : chr "Status"
## $ title: chr "Proporsi berdasarkan Status Pernikahan"
Insight : Klien yang bercerai berkontribusi pada tingginya risiko di kelompok “Others”,menunjukkan bahwa perubahan status keluarga (seperti perceraian) dapat menyebabkan ketidakstabilan finansial.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Umur
data_credit %>%
mutate(Umur = cut(AGE,
breaks = c(20, 30, 40, 50, 60, 70, 80),
right = FALSE,
labels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"))) %>%
group_by(Umur, default_payment_next_month) %>%
summarise(count = n()) %>%
ggplot(aes(x = Umur, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("0" = "blue", "1" = "red"))
## `summarise()` has grouped output by 'Umur'. You can override using the
## `.groups` argument.
labs(title = "Proporsi berdasarkan Kelompok Umur", x = "Kelompok Umur", y = "Proporsi")
## <ggplot2::labels> List of 3
## $ x : chr "Kelompok Umur"
## $ y : chr "Proporsi"
## $ title: chr "Proporsi berdasarkan Kelompok Umur"
# Analisis Pengaruh Status Pembayaran Terakhir (PAY_)
data_credit %>%
mutate(PAY_0_Faktor = as.factor(PAY_0)) %>%
group_by(PAY_0_Faktor, default_payment_next_month) %>%
summarise(count = n(), .groups = 'drop') %>%
ggplot(aes(x = PAY_0_Faktor, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("0" = "blue", "1" = "red"), name = "Default") +
labs(
title = "Proporsi Gagal Bayar berdasarkan Status Pembayaran Terakhir (PAY_0)",
subtitle = "Angka > 0 menunjukkan keterlambatan (1=telat 1 bln, 2=telat 2 bln, dst)",
x = "Status Pembayaran (PAY_0)",
y = "Proporsi"
) +
theme_minimal()
- Klien yang memiliki status pembayaran telat > 2 bulan beresiko
gagal membayar di bulan berikutnya. Insight : Riwayat
pembayaran adalah prediktor utama dari kegagalan kredit di bulan
berikutnya. Klien yang mencapai status keterlambatan 2 bulan atau lebih
(PAY >= 2) memiliki probabilitas kegagalan (default) lebih tinggi,
yang mengindikasikan mereka sangat mungkin gagal membayar di bulan
berikutnya.
# Korelasi Antar Numerik
library(reshape2)
num_cols <- c("AGE", "LIMIT_BAL", paste0("BILL_AMT", 1:6), paste0("PAY_AMT", 1:6), paste0("PAY_", c(0, 2:6)))
corr_matrix <- cor(data_credit[num_cols], use = "complete.obs")
# Melt matriks untuk ggplot
melted_corr <- melt(corr_matrix)
# Buat heatmap
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), name = "Korelasi") +
geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
labs(title = "Korelasi antar Fitur Numerik",x = "", y = "")
Kesimpulan Akhir EDA - 👫 Jenis Kelamin : Proporsi klien Laki-laki yang gagal bayar lebih tinggi dibandingkan dengan perempuan (berdasarkan proporsi dalam masing-masing kelompok gender). Ini menunjukkan bahwa faktor gender perlu dipertimbangkan sebagai prediktor risiko. - 🎓 Tingkat Pendidikan : Semakin rendah tingkat pendidikan klien (khususnya university dan high school mendominasi jumlah default), semakin besar risiko gagal bayar. Hal ini menjadikan tingkat pendidikan sebagai prediktor risiko yang kuat. - 💍 Status Pernikahan : Proposi nya tidak jauh berbeda dengan kelompok yang lain (married dan single), maka ini bukan prediktor yang kuat. - 🎂 Usia : Klien berusia 60 tahun ke atas memiliki risiko gagal bayar tertinggi, yang kemungkinan besar disebabkan oleh penurunan pendapatan pasca-pensiun, menjadikan usia sebagai faktor yang signifikan. - 💳 Status Pembayaran Terakhir : Riwayat pembayaran adalah prediktor utama kegagalan pembayaran di bulan berikutnya. - 📈 Korelasi Antar Variabel Numerik : Ditemukan korelasi yang sangat kuat (tinggi dan positif) antar variabel jumlah tagihan bulanan. Korelasi yang tinggi ini mengindikasikan adanya multikolinearitas antar variabel BILL_AMT dan PAY_AMT berturut-turut.
# Bagian 1: Pemrosesan Data & Feature Engineering
# Menghapus baris dengan nilai NA
data_credit_bersih <- na.omit(data_credit)
# FEATURE ENGINEERING (Untuk mengatasi multikolinearitas)
data_credit_bersih <- data_credit_bersih %>%
mutate(
Avg_Bill_Amt = (BILL_AMT1 + BILL_AMT2 + BILL_AMT3 + BILL_AMT4 + BILL_AMT5 + BILL_AMT6) / 6,
# Rasio tagihan utang
Debt_Limit_Ratio = Avg_Bill_Amt / LIMIT_BAL,
Debt_Limit_Ratio = ifelse(is.infinite(Debt_Limit_Ratio) | is.na(Debt_Limit_Ratio), 0, Debt_Limit_Ratio),
Avg_Payment_Status = (PAY_0 + PAY_2 + PAY_3 + PAY_4 + PAY_5 + PAY_6) / 6,
Total_Payment = PAY_AMT1 + PAY_AMT2 + PAY_AMT3 + PAY_AMT4 + PAY_AMT5 + PAY_AMT6
)
# Pemrosesan Kategori dan Encoding
data_processing <- data_credit_bersih %>%
# Mengelompokkan EDUCATION
mutate(EDUCATION = case_when(EDUCATION %in% c(1, 2, 3) ~ as.character(EDUCATION), TRUE ~ "4")) %>%
# Mengelompokkan MARRIAGE
mutate(MARRIAGE = case_when(MARRIAGE %in% c(1, 2) ~ as.character(MARRIAGE), TRUE ~ "3")) %>%
mutate(SEX = as.character(SEX))
# Ekstrak Target (Y)
Y_target <- factor(
data_processing$default_payment_next_month,
levels = c(0, 1),
labels = c("NoDefault", "Default")
)
# One-Hot Encoding pada Prediktor (X)
X_prediktor <- data_processing %>%
select(-default_payment_next_month)
X_encoded <- X_prediktor %>%
dummy_cols(select_columns = c("SEX", "EDUCATION", "MARRIAGE"),
remove_selected_columns = TRUE,
remove_first_dummy = TRUE)
# Memastikan semua kolom prediktor adalah NUMERIK
X_final <- X_encoded %>%
mutate(across(everything(), as.numeric))
# Pembagian Data (70% Training, 30% Testing)
set.seed(42)
index_split <- createDataPartition(Y_target, p = 0.7, list = FALSE)
# Variabel training dan testing
train_data_final <- data.frame(
X_final[index_split, ],
default_payment_next_month = Y_target[index_split]
)
test_data_final <- data.frame(
X_final[-index_split, ],
default_payment_next_month = Y_target[-index_split]
)
cat("Fitur Engineering dan Pembagian Data Selesai.\n")
## Fitur Engineering dan Pembagian Data Selesai.
print(table(train_data_final$default_payment_next_month))
##
## NoDefault Default
## 16355 4646
# Bagian 2: Data Balancing dan Persiapan Matriks XGBoost
# Tambahkan balancing dengan SMOTE
train_balanced <- recipe(default_payment_next_month ~ ., data = train_data_final) %>%
step_smote(default_payment_next_month, over_ratio = 1) %>%
prep() %>%
bake(new_data = NULL)
# Konversi target ke numerik untuk XGBoost
# Konversi faktor ("NoDefault", "Default") menjadi 0 dan 1.
train_balanced$default_payment_next_month <- as.numeric(train_balanced$default_payment_next_month) - 1
test_data_final$default_payment_next_month <- as.numeric(test_data_final$default_payment_next_month) - 1
# Buat matrix untuk XGBoost
train_matrix <- xgb.DMatrix(data = as.matrix(train_balanced %>% select(-default_payment_next_month)),
label = train_balanced$default_payment_next_month)
test_matrix <- xgb.DMatrix(data = as.matrix(test_data_final %>% select(-default_payment_next_month)),
label = test_data_final$default_payment_next_month)
# Bagian 3 : Pelatihan Model XGBoost
# Tentukan Parameter XGBoost
params <- list(
objective = "binary:logistic",
eval_metric = "logloss",
eta = 0.05,
max_depth = 6,
subsample = 0.8,
colsample_bytree = 0.8
)
# Latih Model XGBoost
set.seed(42)
xgb_model <- xgb.train(
params = params,
data = train_matrix,
nrounds = 300,
evals = list(train = train_matrix, test = test_matrix),
verbose = 1,
early_stopping_rounds = 10
)
## Multiple eval metrics are present. Will use test_logloss for early stopping.
## Will train until test_logloss hasn't improved in 10 rounds.
##
## [1] train-logloss:0.667966 test-logloss:0.674058
## [2] train-logloss:0.645963 test-logloss:0.656850
## [3] train-logloss:0.625945 test-logloss:0.641466
## [4] train-logloss:0.607543 test-logloss:0.627513
## [5] train-logloss:0.590345 test-logloss:0.614411
## [6] train-logloss:0.574546 test-logloss:0.602337
## [7] train-logloss:0.560503 test-logloss:0.592395
## [8] train-logloss:0.546715 test-logloss:0.581862
## [9] train-logloss:0.533779 test-logloss:0.571984
## [10] train-logloss:0.521208 test-logloss:0.562896
## [11] train-logloss:0.509410 test-logloss:0.554322
## [12] train-logloss:0.498893 test-logloss:0.546362
## [13] train-logloss:0.489323 test-logloss:0.539245
## [14] train-logloss:0.479969 test-logloss:0.532327
## [15] train-logloss:0.470879 test-logloss:0.525670
## [16] train-logloss:0.462903 test-logloss:0.520169
## [17] train-logloss:0.455405 test-logloss:0.514863
## [18] train-logloss:0.448670 test-logloss:0.510097
## [19] train-logloss:0.441714 test-logloss:0.505133
## [20] train-logloss:0.435938 test-logloss:0.501230
## [21] train-logloss:0.430345 test-logloss:0.497451
## [22] train-logloss:0.424108 test-logloss:0.493172
## [23] train-logloss:0.418584 test-logloss:0.489485
## [24] train-logloss:0.413430 test-logloss:0.485837
## [25] train-logloss:0.408097 test-logloss:0.482372
## [26] train-logloss:0.403282 test-logloss:0.479164
## [27] train-logloss:0.399065 test-logloss:0.476283
## [28] train-logloss:0.394779 test-logloss:0.473466
## [29] train-logloss:0.390561 test-logloss:0.470709
## [30] train-logloss:0.386352 test-logloss:0.468196
## [31] train-logloss:0.382468 test-logloss:0.465829
## [32] train-logloss:0.378948 test-logloss:0.463655
## [33] train-logloss:0.375510 test-logloss:0.461576
## [34] train-logloss:0.372001 test-logloss:0.459507
## [35] train-logloss:0.368506 test-logloss:0.457476
## [36] train-logloss:0.365288 test-logloss:0.455689
## [37] train-logloss:0.362576 test-logloss:0.454059
## [38] train-logloss:0.359783 test-logloss:0.452477
## [39] train-logloss:0.357041 test-logloss:0.450935
## [40] train-logloss:0.354715 test-logloss:0.449587
## [41] train-logloss:0.352508 test-logloss:0.448294
## [42] train-logloss:0.350295 test-logloss:0.447165
## [43] train-logloss:0.347826 test-logloss:0.445822
## [44] train-logloss:0.345403 test-logloss:0.444491
## [45] train-logloss:0.343433 test-logloss:0.443453
## [46] train-logloss:0.341453 test-logloss:0.442527
## [47] train-logloss:0.339116 test-logloss:0.441335
## [48] train-logloss:0.337125 test-logloss:0.440376
## [49] train-logloss:0.335444 test-logloss:0.439585
## [50] train-logloss:0.333417 test-logloss:0.438658
## [51] train-logloss:0.331947 test-logloss:0.437879
## [52] train-logloss:0.330479 test-logloss:0.437095
## [53] train-logloss:0.328454 test-logloss:0.436331
## [54] train-logloss:0.326775 test-logloss:0.435580
## [55] train-logloss:0.325247 test-logloss:0.434918
## [56] train-logloss:0.323935 test-logloss:0.434313
## [57] train-logloss:0.323111 test-logloss:0.433957
## [58] train-logloss:0.321567 test-logloss:0.433261
## [59] train-logloss:0.320340 test-logloss:0.432789
## [60] train-logloss:0.318972 test-logloss:0.432239
## [61] train-logloss:0.317615 test-logloss:0.431739
## [62] train-logloss:0.316512 test-logloss:0.431391
## [63] train-logloss:0.315402 test-logloss:0.430917
## [64] train-logloss:0.314544 test-logloss:0.430608
## [65] train-logloss:0.313303 test-logloss:0.430028
## [66] train-logloss:0.312272 test-logloss:0.429654
## [67] train-logloss:0.311399 test-logloss:0.429375
## [68] train-logloss:0.310028 test-logloss:0.428930
## [69] train-logloss:0.308951 test-logloss:0.428465
## [70] train-logloss:0.307869 test-logloss:0.427959
## [71] train-logloss:0.306802 test-logloss:0.427593
## [72] train-logloss:0.305951 test-logloss:0.427223
## [73] train-logloss:0.305145 test-logloss:0.427006
## [74] train-logloss:0.304177 test-logloss:0.426529
## [75] train-logloss:0.303164 test-logloss:0.426130
## [76] train-logloss:0.302202 test-logloss:0.425858
## [77] train-logloss:0.301315 test-logloss:0.425562
## [78] train-logloss:0.300603 test-logloss:0.425335
## [79] train-logloss:0.299876 test-logloss:0.425145
## [80] train-logloss:0.299122 test-logloss:0.425027
## [81] train-logloss:0.298411 test-logloss:0.424896
## [82] train-logloss:0.297785 test-logloss:0.424742
## [83] train-logloss:0.297182 test-logloss:0.424752
## [84] train-logloss:0.296614 test-logloss:0.424654
## [85] train-logloss:0.295938 test-logloss:0.424473
## [86] train-logloss:0.295280 test-logloss:0.424370
## [87] train-logloss:0.294641 test-logloss:0.424302
## [88] train-logloss:0.294318 test-logloss:0.424214
## [89] train-logloss:0.293750 test-logloss:0.424074
## [90] train-logloss:0.293315 test-logloss:0.423956
## [91] train-logloss:0.292591 test-logloss:0.423852
## [92] train-logloss:0.292084 test-logloss:0.423745
## [93] train-logloss:0.291467 test-logloss:0.423622
## [94] train-logloss:0.290967 test-logloss:0.423457
## [95] train-logloss:0.290355 test-logloss:0.423311
## [96] train-logloss:0.289728 test-logloss:0.423122
## [97] train-logloss:0.289142 test-logloss:0.422998
## [98] train-logloss:0.288595 test-logloss:0.422901
## [99] train-logloss:0.288173 test-logloss:0.422871
## [100] train-logloss:0.287710 test-logloss:0.422844
## [101] train-logloss:0.287275 test-logloss:0.422739
## [102] train-logloss:0.286771 test-logloss:0.422588
## [103] train-logloss:0.286365 test-logloss:0.422515
## [104] train-logloss:0.285912 test-logloss:0.422293
## [105] train-logloss:0.285482 test-logloss:0.422257
## [106] train-logloss:0.285082 test-logloss:0.422139
## [107] train-logloss:0.284575 test-logloss:0.422058
## [108] train-logloss:0.284113 test-logloss:0.421974
## [109] train-logloss:0.283903 test-logloss:0.421938
## [110] train-logloss:0.283390 test-logloss:0.421881
## [111] train-logloss:0.282940 test-logloss:0.421840
## [112] train-logloss:0.282447 test-logloss:0.421663
## [113] train-logloss:0.281972 test-logloss:0.421646
## [114] train-logloss:0.281645 test-logloss:0.421612
## [115] train-logloss:0.281242 test-logloss:0.421522
## [116] train-logloss:0.280878 test-logloss:0.421464
## [117] train-logloss:0.280492 test-logloss:0.421411
## [118] train-logloss:0.279941 test-logloss:0.421384
## [119] train-logloss:0.279373 test-logloss:0.421363
## [120] train-logloss:0.278870 test-logloss:0.421321
## [121] train-logloss:0.278461 test-logloss:0.421347
## [122] train-logloss:0.278001 test-logloss:0.421265
## [123] train-logloss:0.277590 test-logloss:0.421254
## [124] train-logloss:0.277172 test-logloss:0.421235
## [125] train-logloss:0.276770 test-logloss:0.421129
## [126] train-logloss:0.276401 test-logloss:0.421133
## [127] train-logloss:0.275977 test-logloss:0.421227
## [128] train-logloss:0.275610 test-logloss:0.421213
## [129] train-logloss:0.275136 test-logloss:0.421145
## [130] train-logloss:0.274862 test-logloss:0.421118
## [131] train-logloss:0.274494 test-logloss:0.421049
## [132] train-logloss:0.274158 test-logloss:0.421046
## [133] train-logloss:0.273802 test-logloss:0.421023
## [134] train-logloss:0.273417 test-logloss:0.421044
## [135] train-logloss:0.273232 test-logloss:0.421021
## [136] train-logloss:0.272849 test-logloss:0.421020
## [137] train-logloss:0.272540 test-logloss:0.420911
## [138] train-logloss:0.272184 test-logloss:0.420841
## [139] train-logloss:0.271783 test-logloss:0.420769
## [140] train-logloss:0.271512 test-logloss:0.420743
## [141] train-logloss:0.271029 test-logloss:0.420796
## [142] train-logloss:0.270609 test-logloss:0.420839
## [143] train-logloss:0.270466 test-logloss:0.420768
## [144] train-logloss:0.269974 test-logloss:0.420841
## [145] train-logloss:0.269717 test-logloss:0.420881
## [146] train-logloss:0.269323 test-logloss:0.420770
## [147] train-logloss:0.269041 test-logloss:0.420679
## [148] train-logloss:0.268563 test-logloss:0.420646
## [149] train-logloss:0.268273 test-logloss:0.420596
## [150] train-logloss:0.267915 test-logloss:0.420593
## [151] train-logloss:0.267580 test-logloss:0.420483
## [152] train-logloss:0.267319 test-logloss:0.420604
## [153] train-logloss:0.266999 test-logloss:0.420543
## [154] train-logloss:0.266745 test-logloss:0.420499
## [155] train-logloss:0.266429 test-logloss:0.420457
## [156] train-logloss:0.266147 test-logloss:0.420395
## [157] train-logloss:0.265914 test-logloss:0.420366
## [158] train-logloss:0.265705 test-logloss:0.420382
## [159] train-logloss:0.265479 test-logloss:0.420346
## [160] train-logloss:0.265200 test-logloss:0.420299
## [161] train-logloss:0.265001 test-logloss:0.420264
## [162] train-logloss:0.264835 test-logloss:0.420168
## [163] train-logloss:0.264501 test-logloss:0.420040
## [164] train-logloss:0.264225 test-logloss:0.419996
## [165] train-logloss:0.263917 test-logloss:0.419989
## [166] train-logloss:0.263644 test-logloss:0.419971
## [167] train-logloss:0.263351 test-logloss:0.419860
## [168] train-logloss:0.263133 test-logloss:0.419881
## [169] train-logloss:0.262907 test-logloss:0.419878
## [170] train-logloss:0.262545 test-logloss:0.419847
## [171] train-logloss:0.262335 test-logloss:0.419914
## [172] train-logloss:0.262113 test-logloss:0.419846
## [173] train-logloss:0.261776 test-logloss:0.419932
## [174] train-logloss:0.261432 test-logloss:0.419898
## [175] train-logloss:0.261124 test-logloss:0.419954
## [176] train-logloss:0.260818 test-logloss:0.419974
## [177] train-logloss:0.260493 test-logloss:0.419946
## [178] train-logloss:0.260298 test-logloss:0.419904
## [179] train-logloss:0.260063 test-logloss:0.419949
## [180] train-logloss:0.259786 test-logloss:0.419932
## [181] train-logloss:0.259544 test-logloss:0.419969
## Stopping. Best iteration:
## [182] train-logloss:0.259245 test-logloss:0.419947
##
## [182] train-logloss:0.259245 test-logloss:0.419947
cat(" Pelatihan Model XGBoost Selesai ")
## Pelatihan Model XGBoost Selesai
# Bagian 4 : Evaluasi Model XGBoost
# Prediksi Probabilitas pada Data Testing
xgb_pred_prob <- predict(xgb_model, test_matrix)
# Konversi Probabilitas menjadi Prediksi Kelas (Optimal 0.35)
threshold <- 0.35
xgb_pred_class <- as.factor(ifelse(xgb_pred_prob > threshold, 1, 0))
# Pastikan level prediksi dan aktual sama
actual_labels <- as.factor(test_data_final$default_payment_next_month)
levels(xgb_pred_class) <- levels(actual_labels)
# Hitung Confusion Matrix
conf_matrix <- confusionMatrix(xgb_pred_class, actual_labels, positive = "1")
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6198 899
## 1 811 1091
##
## Accuracy : 0.81
## 95% CI : (0.8017, 0.818)
## No Information Rate : 0.7789
## P-Value [Acc > NIR] : 2.506e-13
##
## Kappa : 0.4395
##
## Mcnemar's Test P-Value : 0.03539
##
## Sensitivity : 0.5482
## Specificity : 0.8843
## Pos Pred Value : 0.5736
## Neg Pred Value : 0.8733
## Prevalence : 0.2211
## Detection Rate : 0.1212
## Detection Prevalence : 0.2114
## Balanced Accuracy : 0.7163
##
## 'Positive' Class : 1
##
# Hitung dan Plot ROC Curve
roc_obj <- roc(test_data_final$default_payment_next_month, xgb_pred_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Plot ROC Curve
plot(roc_obj, main = "Kurva ROC untuk Model XGBoost", col = "blue", lwd = 2)
# Hitung AUC
auc_value <- auc(roc_obj)
cat(paste("\nAUC (Area Under the Curve):", round(auc_value, 4), "\n"))
##
## AUC (Area Under the Curve): 0.7981
cat(" Evaluasi Model Selesai ")
## Evaluasi Model Selesai