GOALS

Memprediksi apakah seorang klien akan gagal bayar (default payment) di bulan berikutnya.

Variabel Target

default.payment.next.month - 1 (gagal bayar) - 0 (tidak gagal bayar)

Variabel Prediktor

Step 1 — Import Library & Load Dataset

# Import Library
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readxl)
## Warning: package 'readxl' was built under R version 4.5.2
library(ggplot2)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.5.2
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.5.2
library(caret)
## Warning: package 'caret' was built under R version 4.5.2
## Loading required package: lattice
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.5.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(pROC)
## Warning: package 'pROC' was built under R version 4.5.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
data_credit <- read_excel("default of credit card clients.xls", sheet = 1)
## New names:
## • `` -> `...1`
head(data_credit)
## # A tibble: 6 × 25
##   ...1  X1     X2    X3    X4    X5    X6    X7    X8    X9    X10   X11   X12  
##   <chr> <chr>  <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID    LIMIT… SEX   EDUC… MARR… AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL…
## 2 1     20000  2     2     1     24    2     2     -1    -1    -2    -2    3913 
## 3 2     120000 2     2     2     26    -1    2     0     0     0     2     2682 
## 4 3     90000  2     2     2     34    0     0     0     0     0     0     29239
## 5 4     50000  2     2     1     37    0     0     0     0     0     0     46990
## 6 5     50000  1     2     1     57    -1    0     -1    0     0     0     8617 
## # ℹ 12 more variables: X13 <chr>, X14 <chr>, X15 <chr>, X16 <chr>, X17 <chr>,
## #   X18 <chr>, X19 <chr>, X20 <chr>, X21 <chr>, X22 <chr>, X23 <chr>, Y <chr>

Step 2 — Data Cleaning

# Cek Info data
str(data_credit)
## tibble [30,001 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ...1: chr [1:30001] "ID" "1" "2" "3" ...
##  $ X1  : chr [1:30001] "LIMIT_BAL" "20000" "120000" "90000" ...
##  $ X2  : chr [1:30001] "SEX" "2" "2" "2" ...
##  $ X3  : chr [1:30001] "EDUCATION" "2" "2" "2" ...
##  $ X4  : chr [1:30001] "MARRIAGE" "1" "2" "2" ...
##  $ X5  : chr [1:30001] "AGE" "24" "26" "34" ...
##  $ X6  : chr [1:30001] "PAY_0" "2" "-1" "0" ...
##  $ X7  : chr [1:30001] "PAY_2" "2" "2" "0" ...
##  $ X8  : chr [1:30001] "PAY_3" "-1" "0" "0" ...
##  $ X9  : chr [1:30001] "PAY_4" "-1" "0" "0" ...
##  $ X10 : chr [1:30001] "PAY_5" "-2" "0" "0" ...
##  $ X11 : chr [1:30001] "PAY_6" "-2" "2" "0" ...
##  $ X12 : chr [1:30001] "BILL_AMT1" "3913" "2682" "29239" ...
##  $ X13 : chr [1:30001] "BILL_AMT2" "3102" "1725" "14027" ...
##  $ X14 : chr [1:30001] "BILL_AMT3" "689" "2682" "13559" ...
##  $ X15 : chr [1:30001] "BILL_AMT4" "0" "3272" "14331" ...
##  $ X16 : chr [1:30001] "BILL_AMT5" "0" "3455" "14948" ...
##  $ X17 : chr [1:30001] "BILL_AMT6" "0" "3261" "15549" ...
##  $ X18 : chr [1:30001] "PAY_AMT1" "0" "0" "1518" ...
##  $ X19 : chr [1:30001] "PAY_AMT2" "689" "1000" "1500" ...
##  $ X20 : chr [1:30001] "PAY_AMT3" "0" "1000" "1000" ...
##  $ X21 : chr [1:30001] "PAY_AMT4" "0" "1000" "1000" ...
##  $ X22 : chr [1:30001] "PAY_AMT5" "0" "0" "1000" ...
##  $ X23 : chr [1:30001] "PAY_AMT6" "0" "2000" "5000" ...
##  $ Y   : chr [1:30001] "default payment next month" "1" "1" "0" ...
summary(data_credit)
##      ...1                X1                 X2                 X3           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       X4                 X5                 X6                 X7           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       X8                 X9                X10                X11           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      X12                X13                X14                X15           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      X16                X17                X18                X19           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##      X20                X21                X22                X23           
##  Length:30001       Length:30001       Length:30001       Length:30001      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##       Y            
##  Length:30001      
##  Class :character  
##  Mode  :character
# Cek Missing Value/Data Null
colSums(is.na(data_credit))
## ...1   X1   X2   X3   X4   X5   X6   X7   X8   X9  X10  X11  X12  X13  X14  X15 
##    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0 
##  X16  X17  X18  X19  X20  X21  X22  X23    Y 
##    0    0    0    0    0    0    0    0    0
# Cek Duplikasi
data_credit %>% 
  summarise(duplikat = n() - n_distinct(across(.cols = everything())))
## # A tibble: 1 × 1
##   duplikat
##      <int>
## 1        0
# Mengganti nama kolom agar mudah dibaca
library(dplyr)
data_credit <- rename(data_credit,
    ID = `...1`,
    LIMIT_BAL = X1, SEX = X2, EDUCATION = X3, MARRIAGE = X4, AGE = X5,
    PAY_0 = X6, PAY_2 = X7, PAY_3 = X8, PAY_4 = X9, PAY_5 = X10, PAY_6 = X11,
    BILL_AMT1 = X12, BILL_AMT2 = X13, BILL_AMT3 = X14, BILL_AMT4 = X15,
    BILL_AMT5 = X16, BILL_AMT6 = X17, PAY_AMT1 = X18, PAY_AMT2 = X19,
    PAY_AMT3 = X20, PAY_AMT4 = X21, PAY_AMT5 = X22, PAY_AMT6 = X23,
    default_payment_next_month = Y)
head(data_credit)
## # A tibble: 6 × 25
##   ID    LIMIT_BAL SEX   EDUCATION MARRIAGE AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <chr> <chr>     <chr> <chr>     <chr>    <chr> <chr> <chr> <chr> <chr> <chr>
## 1 ID    LIMIT_BAL SEX   EDUCATION MARRIAGE AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
## 2 1     20000     2     2         1        24    2     2     -1    -1    -2   
## 3 2     120000    2     2         2        26    -1    2     0     0     0    
## 4 3     90000     2     2         2        34    0     0     0     0     0    
## 5 4     50000     2     2         1        37    0     0     0     0     0    
## 6 5     50000     1     2         1        57    -1    0     -1    0     0    
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## #   BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## #   PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## #   PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Menghapus kolom pertama
data_credit <- data_credit[-1, ]
head(data_credit)
## # A tibble: 6 × 25
##   ID    LIMIT_BAL SEX   EDUCATION MARRIAGE AGE   PAY_0 PAY_2 PAY_3 PAY_4 PAY_5
##   <chr> <chr>     <chr> <chr>     <chr>    <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1     20000     2     2         1        24    2     2     -1    -1    -2   
## 2 2     120000    2     2         2        26    -1    2     0     0     0    
## 3 3     90000     2     2         2        34    0     0     0     0     0    
## 4 4     50000     2     2         1        37    0     0     0     0     0    
## 5 5     50000     1     2         1        57    -1    0     -1    0     0    
## 6 6     50000     1     1         2        37    0     0     0     0     0    
## # ℹ 14 more variables: PAY_6 <chr>, BILL_AMT1 <chr>, BILL_AMT2 <chr>,
## #   BILL_AMT3 <chr>, BILL_AMT4 <chr>, BILL_AMT5 <chr>, BILL_AMT6 <chr>,
## #   PAY_AMT1 <chr>, PAY_AMT2 <chr>, PAY_AMT3 <chr>, PAY_AMT4 <chr>,
## #   PAY_AMT5 <chr>, PAY_AMT6 <chr>, default_payment_next_month <chr>
# Mengubah kolom AGE, LIMIT_BAL, BILL_AMT1 - BILL_AMT6, PAY_AMT1 - PAY_AMT6 menjadi numerik
data_credit <- data_credit %>%
  mutate(across(c(AGE, LIMIT_BAL, BILL_AMT1:BILL_AMT6, PAY_0:PAY_6, PAY_AMT1:PAY_AMT6),
                ~ as.numeric(as.character(.))))
str(data_credit)
## tibble [30,000 × 25] (S3: tbl_df/tbl/data.frame)
##  $ ID                        : chr [1:30000] "1" "2" "3" "4" ...
##  $ LIMIT_BAL                 : num [1:30000] 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : chr [1:30000] "2" "2" "2" "2" ...
##  $ EDUCATION                 : chr [1:30000] "2" "2" "2" "2" ...
##  $ MARRIAGE                  : chr [1:30000] "1" "2" "2" "1" ...
##  $ AGE                       : num [1:30000] 24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : num [1:30000] 2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : num [1:30000] 2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : num [1:30000] -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : num [1:30000] -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : num [1:30000] -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : num [1:30000] -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : num [1:30000] 3913 2682 29239 46990 8617 ...
##  $ BILL_AMT2                 : num [1:30000] 3102 1725 14027 48233 5670 ...
##  $ BILL_AMT3                 : num [1:30000] 689 2682 13559 49291 35835 ...
##  $ BILL_AMT4                 : num [1:30000] 0 3272 14331 28314 20940 ...
##  $ BILL_AMT5                 : num [1:30000] 0 3455 14948 28959 19146 ...
##  $ BILL_AMT6                 : num [1:30000] 0 3261 15549 29547 19131 ...
##  $ PAY_AMT1                  : num [1:30000] 0 0 1518 2000 2000 ...
##  $ PAY_AMT2                  : num [1:30000] 689 1000 1500 2019 36681 ...
##  $ PAY_AMT3                  : num [1:30000] 0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : num [1:30000] 0 1000 1000 1100 9000 ...
##  $ PAY_AMT5                  : num [1:30000] 0 0 1000 1069 689 ...
##  $ PAY_AMT6                  : num [1:30000] 0 2000 5000 1000 679 ...
##  $ default_payment_next_month: chr [1:30000] "1" "1" "0" "0" ...

Step 3 - Exploratory Data Analysis (EDA)

# Distribusi target
ggplot(data_credit, aes(x = factor(default_payment_next_month), fill = factor(default_payment_next_month))) + 
  geom_bar() + 
  scale_fill_manual(values = c("0" = "blue", "1" = "red")) +
labs(title = "Distribusi Default Payment Next Month",
x = "Default (0 = Tidak, 1 = Ya)",
y = "Jumlah") +
theme_minimal()

# Persentase Distribusi
persen_dist <- data_credit %>%
  group_by(default_payment_next_month) %>%
  summarise(count = n()) %>%
  mutate(percentage = count / sum(count) * 100)
print(persen_dist)
## # A tibble: 2 × 3
##   default_payment_next_month count percentage
##   <chr>                      <int>      <dbl>
## 1 0                          23364       77.9
## 2 1                           6636       22.1
# Perbandingan Pembayaran bulan berikutnya berdasarkan jenis kelamin
data_credit %>% 
  group_by(SEX, default_payment_next_month) %>% 
  summarise(count = n()) %>% 
  mutate(SEX = ifelse(SEX == 1, "Male", "Female")) %>% 
  ggplot(aes(x = SEX, y = count, fill = factor(default_payment_next_month))) + 
  geom_bar(stat = "identity", position = "fill") + 
  scale_fill_manual(values = c("0" = "blue", "1" = "red")) + 
  labs(title = "Proporsi berdasarkan Jenis Kelamin", y = "Proporsi")
## `summarise()` has grouped output by 'SEX'. You can override using the `.groups`
## argument.

# Perbandingan Pembayaran bulan berikutnya berdasarkan jenis kelamin
data_credit %>% 
  filter(default_payment_next_month == 0) %>% 
  group_by(SEX) %>% 
  summarise(count = n()) %>% 
  mutate(prop = count / sum(count), 
         SEX = ifelse(SEX == 1, "Male", "Female")) %>% 
  ggplot(aes(x = SEX, y = prop)) + 
  geom_bar(stat = "identity", fill = "blue") + 
  labs(title = "Proporsi yang Berhasil Membayar berdasarkan Jenis Kelamin", y = "Proporsi")

- Proporsi klien perempuan yang berhasil membayar bulan berikutnya lebih banyak dibandingkan laki-laki. Insight : Perempuan cenderung lebih disiplin dalam pembayaran kredit dibandingkan laki-laki.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Tingkat Pendidikan
data_credit %>%
  mutate(EDUCATION = case_when(
    EDUCATION %in% c(1) ~ "graduate school",
    EDUCATION %in% c(2) ~ "university",
    EDUCATION %in% c(3) ~ "high school",
    TRUE ~ "Others"
  )) %>%
  mutate(EDUCATION = factor(EDUCATION, levels = c("graduate school", "university", "high school", "Others"))) %>%
  group_by(EDUCATION, default_payment_next_month) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = EDUCATION, y = count, fill = factor(default_payment_next_month))) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Proporsi berdasarkan Pendidikan", x = "Tingkat Pendidikan")
## `summarise()` has grouped output by 'EDUCATION'. You can override using the
## `.groups` argument.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Tingkat Pendidikan
data_credit %>%
  mutate(EDUCATION = case_when(
    EDUCATION %in% c(1) ~ "graduate school",
    EDUCATION %in% c(2) ~ "university",
    EDUCATION %in% c(3) ~ "high school",
    TRUE ~ "Others"
  )) %>%
  mutate(EDUCATION = factor(EDUCATION, levels = c("graduate school", "university", "high school", "Others"))) %>%
  filter(default_payment_next_month == 0) %>% 
  count(EDUCATION) %>%
  ggplot(aes(x = EDUCATION, y = n, fill = EDUCATION)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribusi Klien yang Tidak Gagal Bayar berdasarkan Pendidikan", 
       x = "Tingkat Pendidikan", y = "Jumlah") +
  theme(legend.position = "none")

- Klien dengan pendidikan university mendominasi kelompok yang berhasil membayar, menandakan hubungan positif antara tingkat pendidikan dan kedisiplinan pembayaran kredit. - Pendidikan tinggi biasanya dikaitkan dengan literasi keuangan yang lebih baik, pendapatan lebih stabil, dan risiko gagal bayar lebih kecil.

Insight : Tingkat pendidikan adalah indikator penting dalam menilai risiko kredit, di mana semakin tinggi pendidikan, semakin rendah risiko gagal bayar.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Status Pernikahan
data_credit %>% 
  filter(MARRIAGE %in% 1:3) %>%
  mutate(MARRIAGE = factor(MARRIAGE,
                           levels = 1:3,
                           labels = c("married", "single", "Others"))) %>%
  group_by(MARRIAGE, default_payment_next_month) %>% 
  summarise(count = n()) %>% 
  ggplot(aes(x = MARRIAGE, y = count, fill = factor(default_payment_next_month))) + 
  geom_bar(stat = "identity", position = "fill") + 
  labs(title = "Proporsi berdasarkan Status Pernikahan", x = "Status")
## `summarise()` has grouped output by 'MARRIAGE'. You can override using the
## `.groups` argument.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Status Pernikahan
data_credit %>% 
  filter(MARRIAGE %in% 1:3) %>%
  mutate(MARRIAGE = factor(MARRIAGE,
                           levels = 1:3,
                           labels = c("married", "single", "Others"))) %>%
  filter(default_payment_next_month == 0) %>% 
  count(MARRIAGE) %>%
  ggplot(aes(x = MARRIAGE, y = n, fill = MARRIAGE)) + 
  geom_bar(stat = "identity") + 
  labs(title = "Distribusi Klien yang Tidak Gagal Bayar berdasarkan Status Pernikahan", 
       x = "Status Pernikahan", y = "Jumlah") +
  theme(legend.position = "none")

Insight : Status pernikahan memengaruhi keberhasilan pembayaran kartu kredit bulan berikutnya.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Umur
data_credit %>%
  mutate(Age_group = cut(AGE,
                         breaks = c(20, 30, 40, 50, 60, 70, 80),
                         right = FALSE,
                         labels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"))) %>%
  group_by(Age_group, default_payment_next_month) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = Age_group, y = count, fill = factor(default_payment_next_month))) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Proporsi berdasarkan Kelompok Umur", x = "Kelompok Umur", y = "Proporsi")
## `summarise()` has grouped output by 'Age_group'. You can override using the
## `.groups` argument.

# Perbandingan Pembayaran bulan berikutnya berdasarkan Umur
data_credit %>%
  mutate(Age_group = cut(AGE,
                         breaks = c(20, 30, 40, 50, 60, 70, 80),
                         right = FALSE,
                         labels = c("20-29", "30-39", "40-49", "50-59", "60-69", "70-79"))) %>%
  filter(default_payment_next_month == 0) %>% 
  count(Age_group) %>%
  ggplot(aes(x = Age_group, y = n, fill = Age_group)) +
  geom_bar(stat = "identity") +
  labs(title = "Distribusi Klien yang Tidak Gagal Bayar berdasarkan Kelompok Umur", 
       x = "Kelompok Umur", y = "Jumlah") +
  theme(legend.position = "none")

- Klien dengan tingkat keberhasilan pembayaran berada pada usia produktif (30-39 tahun) karena stabilitas pendapatan dan pengalaman finansial yang lebih baik. Insight : Faktor umur secara signifikan memengaruhi tingkat keberhasilan pembayaran seorang klien.

# Korelasi Numerik
# Load library yang diperlukan
library(ggplot2)
library(reshape2)  # Untuk fungsi melt()

# Asumsikan data_credit sudah dimuat (misalnya: data_credit <- read.csv("path/to/your/data.csv"))

# Definisi kolom numerik
num_cols <- c("AGE", "LIMIT_BAL", paste0("BILL_AMT", 1:6), paste0("PAY_AMT", 1:6), paste0("PAY_", c(0, 2:6)))

# Hitung matriks korelasi (gunakan complete.obs untuk menangani NA)
corr_matrix <- cor(data_credit[num_cols], use = "complete.obs")

# Melt matriks untuk ggplot
melted_corr <- melt(corr_matrix)

# Buat heatmap
ggplot(data = melted_corr, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white",
                       midpoint = 0, limit = c(-1, 1), name = "Korelasi") +
  geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)) +
  labs(title = "Korelasi antar Fitur Numerik", x = "", y = "")

Kesimpulan Akhir EDA - Jenis Kelamin : Perempuan cenderung lebih disiplin dalam pembayaran kredit dibanding laki-laki. Proporsi perempuan yang berhasil membayar lebih tinggi, menunjukkan jenis kelamin sebagai prediktor potensial risiko default. - Pendidikan : Klien dengan pendidikan tinggi (terutama university dan graduate school) lebih dominan dalam kelompok yang berhasil membayar. Tingkat pendidikan berkorelasi positif dengan kedisiplinan pembayaran, kemungkinan karena literasi keuangan dan stabilitas pendapatan yang lebih baik. - Status Pernikahan : Klien single lebih disiplin membayar tepat waktu dibanding yang menikah, karena tanggungan keuangan yang lebih sedikit. Status pernikahan memengaruhi risiko default. - Usia : Klien usia produktif (30-39 tahun) memiliki tingkat keberhasilan pembayaran tertinggi, didukung oleh stabilitas pendapatan dan pengalaman finansial. Usia muda (<30) atau tua (>60) cenderung lebih berisiko.

Step 4 - Modeling & Evaluasi Model

X <- data_credit %>% select(-default_payment_next_month) # Semua kecuali target
y <- data_credit$default_payment_next_month # Target saja

X_dummy <- dummy_cols(X,
select_columns = c("SEX", "EDUCATION", "MARRIAGE"), 
remove_first_dummy = TRUE)