Memprediksi apakah seorang klien akan gagal bayar (default payment) di bulan berikutnya.
default.payment.next.month - 1 (gagal bayar) - 0 (tidak gagal bayar)
# Import Library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.2
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data_credit <- read_excel("default of credit card clients.xls", sheet = 1)
## New names:
## • `` -> `...1`
head(data_credit)
## # A tibble: 6 × 25
## ...1 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID LIMIT… SEX EDUC… MARR… AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
## 2 1 20000 2 2 1 24 2 2 -1 -1 -2 -2 3913
## 3 2 120000 2 2 2 26 -1 2 0 0 0 2 2682
## 4 3 90000 2 2 2 34 0 0 0 0 0 0 29239
## 5 4 50000 2 2 1 37 0 0 0 0 0 0 46990
## 6 5 50000 1 2 1 57 -1 0 -1 0 0 0 8617
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## # X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>
# Cek Info data
str(data_credit)
## tibble [30,001 × 25] (S3: tbl_df/tbl/data.frame)
## $ ...1: chr [1:30001] "ID" "1" "2" "3" ...
## $ X1 : chr [1:30001] "LIMIT_BAL" "20000" "120000" "90000" ...
## $ X2 : chr [1:30001] "SEX" "2" "2" "2" ...
## $ X3 : chr [1:30001] "EDUCATION" "2" "2" "2" ...
## $ X4 : chr [1:30001] "MARRIAGE" "1" "2" "2" ...
## $ X5 : chr [1:30001] "AGE" "24" "26" "34" ...
## $ X6 : chr [1:30001] "PAY_0" "2" "-1" "0" ...
## $ X7 : chr [1:30001] "PAY_2" "2" "2" "0" ...
## $ X8 : chr [1:30001] "PAY_3" "-1" "0" "0" ...
## $ X9 : chr [1:30001] "PAY_4" "-1" "0" "0" ...
## $ X10 : chr [1:30001] "PAY_5" "-2" "0" "0" ...
## $ X11 : chr [1:30001] "PAY_6" "-2" "2" "0" ...
## $ X12 : chr [1:30001] "BILL_AMT1" "3913" "2682" "29239" ...
## $ X13 : chr [1:30001] "BILL_AMT2" "3102" "1725" "14027" ...
## $ X14 : chr [1:30001] "BILL_AMT3" "689" "2682" "13559" ...
## $ X15 : chr [1:30001] "BILL_AMT4" "0" "3272" "14331" ...
## $ X16 : chr [1:30001] "BILL_AMT5" "0" "3455" "14948" ...
## $ X17 : chr [1:30001] "BILL_AMT6" "0" "3261" "15549" ...
## $ X18 : chr [1:30001] "PAY_AMT1" "0" "0" "1518" ...
## $ X19 : chr [1:30001] "PAY_AMT2" "689" "1000" "1500" ...
## $ X20 : chr [1:30001] "PAY_AMT3" "0" "1000" "1000" ...
## $ X21 : chr [1:30001] "PAY_AMT4" "0" "1000" "1000" ...
## $ X22 : chr [1:30001] "PAY_AMT5" "0" "0" "1000" ...
## $ X23 : chr [1:30001] "PAY_AMT6" "0" "2000" "5000" ...
## $ Y : chr [1:30001] "default payment next month" "1" "1" "0" ...
summary(data_credit)
## ...1 X1 X2 X3
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X4 X5 X6 X7
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X8 X9 X10 X11
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X12 X13 X14 X15
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X16 X17 X18 X19
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## X20 X21 X22 X23
## Length:30001 Length:30001 Length:30001 Length:30001
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## Y
## Length:30001
## Class :character
## Mode :character
# Cek Missing Value/Data Null
colSums(is.na(data_credit))
## ...1 X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## X16 X17 X18 X19 X20 X21 X22 X23 Y
## 0 0 0 0 0 0 0 0 0
# Cek Duplikasi
data_credit %>%
summarise(duplikat = n() - n_distinct(across(.cols = everything())))
## # A tibble: 1 × 1
## duplikat
## <int>
## 1 0
# Mengganti nama kolom agar mudah dibaca
library(dplyr)
data_credit <- rename(data_credit,
ID = `...1`,
LIMIT_BAL = X1, SEX = X2, EDUCATION = X3, MARRIAGE = X4, AGE = X5,
PAY_0 = X6, PAY_2 = X7, PAY_3 = X8, PAY_4 = X9, PAY_5 = X10, PAY_6 = X11,
BILL_AMT1 = X12, BILL_AMT2 = X13, BILL_AMT3 = X14, BILL_AMT4 = X15,
BILL_AMT5 = X16, BILL_AMT6 = X17, PAY_AMT1 = X18, PAY_AMT2 = X19,
PAY_AMT3 = X20, PAY_AMT4 = X21, PAY_AMT5 = X22, PAY_AMT6 = X23,
default_payment_next_month = Y)
head(data_credit)
## # A tibble: 6 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## 2 1 20000 2 2 1 24 2 2 -1 -1 -2
## 3 2 120000 2 2 2 26 -1 2 0 0 0
## 4 3 90000 2 2 2 34 0 0 0 0 0
## 5 4 50000 2 2 1 37 0 0 0 0 0
## 6 5 50000 1 2 1 57 -1 0 -1 0 0
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## # BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## # PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## # PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Menghapus kolom pertama
data_credit <- data_credit[-1, ]
head(data_credit)
## # A tibble: 6 × 25
## ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 20000 2 2 1 24 2 2 -1 -1 -2
## 2 2 120000 2 2 2 26 -1 2 0 0 0
## 3 3 90000 2 2 2 34 0 0 0 0 0
## 4 4 50000 2 2 1 37 0 0 0 0 0
## 5 5 50000 1 2 1 57 -1 0 -1 0 0
## 6 6 50000 1 1 2 37 0 0 0 0 0
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## # BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## # PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## # PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Mengubah kolom AGE, LIMIT_BAL, BILL_AMT1 - BILL_AMT6, PAY_AMT1 - PAY_AMT6 menjadi numerik
data_credit <- data_credit %>%
mutate(across(c(AGE, LIMIT_BAL, BILL_AMT1:BILL_AMT6, PAY_0:PAY_6, PAY_AMT1:PAY_AMT6),
~ as.numeric(as.character(.))))
str(data_credit)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
## $ ID : chr [1:30000] "1" "2" "3" "4" ...
## $ LIMIT_BAL : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : chr [1:30000] "2" "2" "2" "2" ...
## $ EDUCATION : chr [1:30000] "2" "2" "2" "2" ...
## $ MARRIAGE : chr [1:30000] "1" "2" "2" "1" ...
## $ AGE : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : num [1:30000] 3913 2682 29239 46990 8617 ...
## $ BILL_AMT2 : num [1:30000] 3102 1725 14027 48233 5670 ...
## $ BILL_AMT3 : num [1:30000] 689 2682 13559 49291 35835 ...
## $ BILL_AMT4 : num [1:30000] 0 3272 14331 28314 20940 ...
## $ BILL_AMT5 : num [1:30000] 0 3455 14948 28959 19146 ...
## $ BILL_AMT6 : num [1:30000] 0 3261 15549 29547 19131 ...
## $ PAY_AMT1 : num [1:30000] 0 0 1518 2000 2000 ...
## $ PAY_AMT2 : num [1:30000] 689 1000 1500 2019 36681 ...
## $ PAY_AMT3 : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : num [1:30000] 0 1000 1000 1100 9000 ...
## $ PAY_AMT5 : num [1:30000] 0 0 1000 1069 689 ...
## $ PAY_AMT6 : num [1:30000] 0 2000 5000 1000 679 ...
## $ default_payment_next_month: chr [1:30000] "1" "1" "0" "0" ...
# Distribusi target
ggplot(data_credit, aes(x = factor(default_payment_next_month), fill = factor(default_payment_next_month))) +
geom_bar() +
scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Distribusi Default Payment Next Month",
x = "Default (0 = Tidak, 1 = Ya)",
y = "Jumlah") +
theme_minimal()
# Persentase Distribusi
persen_dist <- data_credit %>%
group_by(default_payment_next_month) %>%
summarise(count = n()) %>%
mutate(percentage = count / sum(count) * 100)
print(persen_dist)
## # A tibble: 2 × 3
## default_payment_next_month count percentage
## <chr> <int> <dbl>
## 1 0 23364 77.9
## 2 1 6636 22.1
# Perbandingan Pembayaran bulan berikutnya berdasarkan jenis kelamin
data_credit %>%
group_by(SEX, default_payment_next_month) %>%
summarise(count = n()) %>%
mutate(SEX = ifelse(SEX == 1, "Male", "Female")) %>%
ggplot(aes(x = SEX, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Proporsi berdasarkan Jenis Kelamin", y = "Proporsi")
## `summarise()` has grouped output by 'SEX'. You can override using the `.groups`
## argument.
# Perbandingan Pembayaran bulan berikutnya berdasarkan jenis kelamin
data_credit %>%
filter(default_payment_next_month == 0) %>%
group_by(SEX) %>%
summarise(count = n()) %>%
mutate(prop = count / sum(count),
SEX = ifelse(SEX == 1, "Male", "Female")) %>%
ggplot(aes(x = SEX, y = prop)) +
geom_bar(stat = "identity", fill = "blue") +
labs(title = "Proporsi yang Berhasil Membayar berdasarkan Jenis Kelamin", y = "Proporsi")
- Proporsi klien perempuan yang berhasil membayar bulan berikutnya lebih
banyak dibandingkan laki-laki. Insight : Perempuan
cenderung lebih disiplin dalam pembayaran kredit dibandingkan
laki-laki.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Tingkat Pendidikan
data_credit %>%
mutate(EDUCATION = case_when(
EDUCATION %in% c(1) ~ "graduate school",
EDUCATION %in% c(2) ~ "university",
EDUCATION %in% c(3) ~ "high school",
TRUE ~ "Others"
)) %>%
mutate(EDUCATION = factor(EDUCATION, levels = c("graduate school", "university", "high school", "Others"))) %>%
group_by(EDUCATION, default_payment_next_month) %>%
summarise(count = n()) %>%
ggplot(aes(x = EDUCATION, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
labs(title = "Proporsi berdasarkan Pendidikan", x = "Tingkat Pendidikan")
## `summarise()` has grouped output by 'EDUCATION'. You can override using the
## `.groups` argument.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Tingkat Pendidikan
data_credit %>%
mutate(EDUCATION = case_when(
EDUCATION %in% c(1) ~ "graduate school",
EDUCATION %in% c(2) ~ "university",
EDUCATION %in% c(3) ~ "high school",
TRUE ~ "Others"
)) %>%
mutate(EDUCATION = factor(EDUCATION, levels = c("graduate school", "university", "high school", "Others"))) %>%
filter(default_payment_next_month == 0) %>%
count(EDUCATION) %>%
ggplot(aes(x = EDUCATION, y = n, fill = EDUCATION)) +
geom_bar(stat = "identity") +
labs(title = "Distribusi Klien yang Tidak Gagal Bayar berdasarkan Pendidikan",
x = "Tingkat Pendidikan", y = "Jumlah") +
theme(legend.position = "none")
- Klien dengan pendidikan university mendominasi kelompok yang berhasil
membayar, menandakan hubungan positif antara tingkat pendidikan dan
kedisiplinan pembayaran kredit. - Pendidikan tinggi biasanya dikaitkan
dengan literasi keuangan yang lebih baik, pendapatan lebih stabil, dan
risiko gagal bayar lebih kecil.
Insight : Tingkat pendidikan adalah indikator penting dalam menilai risiko kredit, di mana semakin tinggi pendidikan, semakin rendah risiko gagal bayar.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Status Pernikahan
data_credit %>%
filter(MARRIAGE %in% 1:3) %>%
mutate(MARRIAGE = factor(MARRIAGE,
levels = 1:3,
labels = c("married", "single", "Others"))) %>%
group_by(MARRIAGE, default_payment_next_month) %>%
summarise(count = n()) %>%
ggplot(aes(x = MARRIAGE, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
labs(title = "Proporsi berdasarkan Status Pernikahan", x = "Status")
## `summarise()` has grouped output by 'MARRIAGE'. You can override using the
## `.groups` argument.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Status Pernikahan
data_credit %>%
filter(MARRIAGE %in% 1:3) %>%
mutate(MARRIAGE = factor(MARRIAGE,
levels = 1:3,
labels = c("married", "single", "Others"))) %>%
filter(default_payment_next_month == 0) %>%
count(MARRIAGE) %>%
ggplot(aes(x = MARRIAGE, y = n, fill = MARRIAGE)) +
geom_bar(stat = "identity") +
labs(title = "Distribusi Klien yang Tidak Gagal Bayar berdasarkan Status Pernikahan",
x = "Status Pernikahan", y = "Jumlah") +
theme(legend.position = "none")
Insight : Status pernikahan memengaruhi keberhasilan pembayaran kartu kredit bulan berikutnya.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Umur
data_credit %>%
mutate(Age_group = cut(AGE,
breaks = c(20, 30, 40, 50, 60, 70, 80),
right = FALSE,
labels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"))) %>%
group_by(Age_group, default_payment_next_month) %>%
summarise(count = n()) %>%
ggplot(aes(x = Age_group, y = count, fill = factor(default_payment_next_month))) +
geom_bar(stat = "identity", position = "fill") +
labs(title = "Proporsi berdasarkan Kelompok Umur", x = "Kelompok Umur", y = "Proporsi")
## `summarise()` has grouped output by 'Age_group'. You can override using the
## `.groups` argument.
# Perbandingan Pembayaran bulan berikutnya berdasarkan Umur
data_credit %>%
mutate(Age_group = cut(AGE,
breaks = c(20, 30, 40, 50, 60, 70, 80),
right = FALSE,
labels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"))) %>%
filter(default_payment_next_month == 0) %>%
count(Age_group) %>%
ggplot(aes(x = Age_group, y = n, fill = Age_group)) +
geom_bar(stat = "identity") +
labs(title = "Distribusi Klien yang Tidak Gagal Bayar berdasarkan Kelompok Umur",
x = "Kelompok Umur", y = "Jumlah") +
theme(legend.position = "none")
- Klien dengan tingkat keberhasilan pembayaran berada pada usia
produktif (30-39 tahun) karena stabilitas pendapatan dan pengalaman
finansial yang lebih baik. Insight : Faktor umur secara
signifikan memengaruhi tingkat keberhasilan pembayaran seorang
klien.
# Korelasi Numerik
# Load library yang diperlukan
library(ggplot2)
library(reshape2) # Untuk fungsi melt()
# Asumsikan data_credit sudah dimuat (misalnya: data_credit <- read.csv("path/to/your/data.csv"))
# Definisi kolom numerik
num_cols <- c("AGE", "LIMIT_BAL", paste0("BILL_AMT", 1:6), paste0("PAY_AMT", 1:6), paste0("PAY_", c(0, 2:6)))
# Hitung matriks korelasi (gunakan complete.obs untuk menangani NA)
corr_matrix <- cor(data_credit[num_cols], use = "complete.obs")
# Melt matriks untuk ggplot
melted_corr <- melt(corr_matrix)
# Buat heatmap
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), name = "Korelasi") +
geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
labs(title = "Korelasi antar Fitur Numerik", x = "", y = "")
Kesimpulan Akhir EDA - Jenis Kelamin : Perempuan
cenderung lebih disiplin dalam pembayaran kredit dibanding laki-laki.
Proporsi perempuan yang berhasil membayar lebih tinggi, menunjukkan
jenis kelamin sebagai prediktor potensial risiko default. - Pendidikan :
Klien dengan pendidikan tinggi (terutama university dan graduate school)
lebih dominan dalam kelompok yang berhasil membayar. Tingkat pendidikan
berkorelasi positif dengan kedisiplinan pembayaran, kemungkinan karena
literasi keuangan dan stabilitas pendapatan yang lebih baik. - Status
Pernikahan : Klien single lebih disiplin membayar tepat waktu dibanding
yang menikah, karena tanggungan keuangan yang lebih sedikit. Status
pernikahan memengaruhi risiko default. - Usia : Klien usia produktif
(30-39 tahun) memiliki tingkat keberhasilan pembayaran tertinggi,
didukung oleh stabilitas pendapatan dan pengalaman finansial. Usia muda
(<30) atau tua (>60) cenderung lebih berisiko.
X <- data_credit %>% select(-default_payment_next_month) # Semua kecuali target
y <- data_credit$default_payment_next_month # Target saja
X_dummy <- dummy_cols(X,
select_columns = c("SEX", "EDUCATION", "MARRIAGE"),
remove_first_dummy = TRUE)