Credit Risk–Taiwan

Credit Risk

Libraries

# Data preprocessing
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(moments)
library(DescTools)
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE

library(tidyr)

# Binning
library(smbinning)

## Loading required package: sqldf

## Loading required package: gsubfn

## Loading required package: proto

## Loading required package: RSQLite

## Loading required package: partykit

## Loading required package: grid

## Loading required package: libcoin

## Loading required package: mvtnorm

## Loading required package: Formula

library(scorecard)

## 
## Attaching package: 'scorecard'

## The following object is masked from 'package:tidyr':
## 
##     replace_na

# Modeling
library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-8

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

# model evaluation
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:randomForest':
## 
##     combine

## The following object is masked from 'package:dplyr':
## 
##     combine

library(DT) 
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

Inspect data

credit <- read.csv("~/Desktop/dsba/spring semester/credit risk/default of credit card clients.csv")
str(credit)

## 'data.frame':    30000 obs. of  25 variables:
##  $ ID                        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ LIMIT_BAL                 : int  20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
##  $ SEX                       : int  2 2 2 2 1 1 1 2 2 1 ...
##  $ EDUCATION                 : int  2 2 2 2 2 1 1 2 3 3 ...
##  $ MARRIAGE                  : int  1 2 2 1 1 2 2 2 1 2 ...
##  $ AGE                       : int  24 26 34 37 57 37 29 23 28 35 ...
##  $ PAY_0                     : int  2 -1 0 0 -1 0 0 0 0 -2 ...
##  $ PAY_2                     : int  2 2 0 0 0 0 0 -1 0 -2 ...
##  $ PAY_3                     : int  -1 0 0 0 -1 0 0 -1 2 -2 ...
##  $ PAY_4                     : int  -1 0 0 0 0 0 0 0 0 -2 ...
##  $ PAY_5                     : int  -2 0 0 0 0 0 0 0 0 -1 ...
##  $ PAY_6                     : int  -2 2 0 0 0 0 0 -1 0 -1 ...
##  $ BILL_AMT1                 : int  3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
##  $ BILL_AMT2                 : int  3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
##  $ BILL_AMT3                 : int  689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
##  $ BILL_AMT4                 : int  0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
##  $ BILL_AMT5                 : int  0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
##  $ BILL_AMT6                 : int  0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
##  $ PAY_AMT1                  : int  0 0 1518 2000 2000 2500 55000 380 3329 0 ...
##  $ PAY_AMT2                  : int  689 1000 1500 2019 36681 1815 40000 601 0 0 ...
##  $ PAY_AMT3                  : int  0 1000 1000 1200 10000 657 38000 0 432 0 ...
##  $ PAY_AMT4                  : int  0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
##  $ PAY_AMT5                  : int  0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
##  $ PAY_AMT6                  : int  0 2000 5000 1000 679 800 13770 1542 1000 0 ...
##  $ default.payment.next.month: int  1 1 0 0 0 0 0 0 0 0 ...

Data

The dataset used in this project is titled “Default of Credit Card Clients” and was obtained from the UCI Machine Learning Repository. It contains data on 30,000 credit card holders in Taiwan and aims to capture customer behavior related to default payments. The target variable indicates whether a client defaulted on the next month’s payment. The data set includes demographic features such as age, gender, education, and marital status, as well as financial attributes including credit limit, past bill amounts, and repayment records over a six-month period.

##  [1] "ID"                         "LIMIT_BAL"                 
##  [3] "SEX"                        "EDUCATION"                 
##  [5] "MARRIAGE"                   "AGE"                       
##  [7] "PAY_0"                      "PAY_2"                     
##  [9] "PAY_3"                      "PAY_4"                     
## [11] "PAY_5"                      "PAY_6"                     
## [13] "BILL_AMT1"                  "BILL_AMT2"                 
## [15] "BILL_AMT3"                  "BILL_AMT4"                 
## [17] "BILL_AMT5"                  "BILL_AMT6"                 
## [19] "PAY_AMT1"                   "PAY_AMT2"                  
## [21] "PAY_AMT3"                   "PAY_AMT4"                  
## [23] "PAY_AMT5"                   "PAY_AMT6"                  
## [25] "default.payment.next.month"

# A quick visualization on each variable
#check if the margins are large enough
library(DescTools)
Desc(credit)

## ────────────────────────────────────────────────────────────────────────────── 
## Describe credit (data.frame):
## 
## data frame:  30000 obs. of  25 variables
##      30000 complete cases (100.0%)
## 
##   Nr  Class  ColName                     NAs  Levels
##   1   int    ID                          .          
##   2   int    LIMIT_BAL                   .          
##   3   int    SEX                         .          
##   4   int    EDUCATION                   .          
##   5   int    MARRIAGE                    .          
##   6   int    AGE                         .          
##   7   int    PAY_0                       .          
##   8   int    PAY_2                       .          
##   9   int    PAY_3                       .          
##   10  int    PAY_4                       .          
##   11  int    PAY_5                       .          
##   12  int    PAY_6                       .          
##   13  int    BILL_AMT1                   .          
##   14  int    BILL_AMT2                   .          
##   15  int    BILL_AMT3                   .          
##   16  int    BILL_AMT4                   .          
##   17  int    BILL_AMT5                   .          
##   18  int    BILL_AMT6                   .          
##   19  int    PAY_AMT1                    .          
##   20  int    PAY_AMT2                    .          
##   21  int    PAY_AMT3                    .          
##   22  int    PAY_AMT4                    .          
##   23  int    PAY_AMT5                    .          
##   24  int    PAY_AMT6                    .          
##   25  int    default.payment.next.month  .          
## 
## 
## ────────────────────────────────────────────────────────────────────────────── 
## 1 - ID (integer)
## 
##      length         n       NAs     unique         0s       mean     meanCI'
##      30'000    30'000         0        = n          0  15'000.50  14'902.50
##                100.0%      0.0%                  0.0%             15'098.50
##                                                                            
##         .05       .10       .25     median        .75        .90        .95
##    1'500.95  3'000.90  7'500.75  15'000.50  22'500.25  27'000.10  28'500.05
##                                                                            
##       range        sd     vcoef        mad        IQR       skew       kurt
##   29'999.00  8'660.40      0.58  11'119.50  14'999.50      -0.00      -1.20
##                                                                            
## lowest : 1, 2, 3, 4, 5
## highest: 29'996, 29'997, 29'998, 29'999, 30'000
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 2 - LIMIT_BAL (integer)
## 
##       length           n        NAs      unique          0s        mean'
##       30'000      30'000          0          81           0  167'484.32
##                   100.0%       0.0%                    0.0%            
##                                                                        
##          .05         .10        .25      median         .75         .90
##    20'000.00   30'000.00  50'000.00  140'000.00  240'000.00  360'000.00
##                                                                        
##        range          sd      vcoef         mad         IQR        skew
##   990'000.00  129'747.66       0.77  133'434.00  190'000.00        0.99
##                                                                        
##       meanCI
##   166'016.06
##   168'952.59
##             
##          .95
##   430'000.00
##             
##         kurt
##         0.54
##             
## lowest : 10'000 (493), 16'000 (2), 20'000 (1'976), 30'000 (1'610), 40'000 (230)
## highest: 750'000 (4), 760'000, 780'000 (2), 800'000 (2), 1'000'000
## 
## heap(?): remarkable frequency (11.2%) for the mode(s) (= 50000)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 3 - SEX (integer - dichotomous)
## 
##   length      n    NAs unique
##   30'000 30'000      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 1  11'888  39.6%   39.1%   40.2%
## 2  18'112  60.4%   59.8%   60.9%
## 
## ' 95%-CI (Wilson)

## ────────────────────────────────────────────────────────────────────────────── 
## 4 - EDUCATION (integer)
## 
##   length       n    NAs  unique    0s  mean  meanCI'
##   30'000  30'000      0       7    14  1.85    1.84
##           100.0%   0.0%          0.0%          1.86
##                                                    
##      .05     .10    .25  median   .75   .90     .95
##     1.00    1.00   1.00    2.00  2.00  3.00    3.00
##                                                    
##    range      sd  vcoef     mad   IQR  skew    kurt
##     6.00    0.79   0.43    1.48  1.00  0.97    2.08
##                                                    
## 
##    value    freq   perc  cumfreq  cumperc
## 1      0      14   0.0%       14     0.0%
## 2      1  10'585  35.3%   10'599    35.3%
## 3      2  14'030  46.8%   24'629    82.1%
## 4      3   4'917  16.4%   29'546    98.5%
## 5      4     123   0.4%   29'669    98.9%
## 6      5     280   0.9%   29'949    99.8%
## 7      6      51   0.2%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 5 - MARRIAGE (integer)
## 
##   length       n    NAs  unique    0s   mean  meanCI'
##   30'000  30'000      0       4    54   1.55    1.55
##           100.0%   0.0%          0.2%           1.56
##                                                     
##      .05     .10    .25  median   .75    .90     .95
##     1.00    1.00   1.00    2.00  2.00   2.00    2.00
##                                                     
##    range      sd  vcoef     mad   IQR   skew    kurt
##     3.00    0.52   0.34    0.00  1.00  -0.02   -1.36
##                                                     
## 
##    value    freq   perc  cumfreq  cumperc
## 1      0      54   0.2%       54     0.2%
## 2      1  13'659  45.5%   13'713    45.7%
## 3      2  15'964  53.2%   29'677    98.9%
## 4      3     323   1.1%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 6 - AGE (integer)
## 
##   length       n    NAs  unique     0s   mean  meanCI'
##   30'000  30'000      0      56      0  35.49   35.38
##           100.0%   0.0%           0.0%          35.59
##                                                      
##      .05     .10    .25  median    .75    .90     .95
##    23.00   25.00  28.00   34.00  41.00  49.00   53.00
##                                                      
##    range      sd  vcoef     mad    IQR   skew    kurt
##    58.00    9.22   0.26    8.90  13.00   0.73    0.04
##                                                      
## lowest : 21 (67), 22 (560), 23 (931), 24 (1'127), 25 (1'186)
## highest: 72 (3), 73 (4), 74, 75 (3), 79
## 
## heap(?): remarkable frequency (5.3%) for the mode(s) (= 29)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 7 - PAY_0 (integer)
## 
##   length       n     NAs  unique      0s   mean  meanCI'
##   30'000  30'000       0      11  14'737  -0.02   -0.03
##           100.0%    0.0%           49.1%          -0.00
##                                                        
##      .05     .10     .25  median     .75    .90     .95
##    -2.00   -1.00   -1.00    0.00    0.00   2.00    2.00
##                                                        
##    range      sd   vcoef     mad     IQR   skew    kurt
##    10.00    1.12  -67.29    1.48    1.00   0.73    2.72
##                                                        
## 
##     value    freq   perc  cumfreq  cumperc
## 1      -2   2'759   9.2%    2'759     9.2%
## 2      -1   5'686  19.0%    8'445    28.1%
## 3       0  14'737  49.1%   23'182    77.3%
## 4       1   3'688  12.3%   26'870    89.6%
## 5       2   2'667   8.9%   29'537    98.5%
## 6       3     322   1.1%   29'859    99.5%
## 7       4      76   0.3%   29'935    99.8%
## 8       5      26   0.1%   29'961    99.9%
## 9       6      11   0.0%   29'972    99.9%
## 10      7       9   0.0%   29'981    99.9%
## 11      8      19   0.1%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 8 - PAY_2 (integer)
## 
##   length       n    NAs  unique      0s   mean  meanCI'
##   30'000  30'000      0      11  15'730  -0.13   -0.15
##           100.0%   0.0%           52.4%          -0.12
##                                                       
##      .05     .10    .25  median     .75    .90     .95
##    -2.00   -2.00  -1.00    0.00    0.00   2.00    2.00
##                                                       
##    range      sd  vcoef     mad     IQR   skew    kurt
##    10.00    1.20  -8.95    0.00    1.00   0.79    1.57
##                                                       
## 
##     value    freq   perc  cumfreq  cumperc
## 1      -2   3'782  12.6%    3'782    12.6%
## 2      -1   6'050  20.2%    9'832    32.8%
## 3       0  15'730  52.4%   25'562    85.2%
## 4       1      28   0.1%   25'590    85.3%
## 5       2   3'927  13.1%   29'517    98.4%
## 6       3     326   1.1%   29'843    99.5%
## 7       4      99   0.3%   29'942    99.8%
## 8       5      25   0.1%   29'967    99.9%
## 9       6      12   0.0%   29'979    99.9%
## 10      7      20   0.1%   29'999   100.0%
## 11      8       1   0.0%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 9 - PAY_3 (integer)
## 
##   length       n    NAs  unique      0s   mean  meanCI'
##   30'000  30'000      0      11  15'764  -0.17   -0.18
##           100.0%   0.0%           52.5%          -0.15
##                                                       
##      .05     .10    .25  median     .75    .90     .95
##    -2.00   -2.00  -1.00    0.00    0.00   2.00    2.00
##                                                       
##    range      sd  vcoef     mad     IQR   skew    kurt
##    10.00    1.20  -7.20    0.00    1.00   0.84    2.08
##                                                       
## 
##     value    freq   perc  cumfreq  cumperc
## 1      -2   4'085  13.6%    4'085    13.6%
## 2      -1   5'938  19.8%   10'023    33.4%
## 3       0  15'764  52.5%   25'787    86.0%
## 4       1       4   0.0%   25'791    86.0%
## 5       2   3'819  12.7%   29'610    98.7%
## 6       3     240   0.8%   29'850    99.5%
## 7       4      76   0.3%   29'926    99.8%
## 8       5      21   0.1%   29'947    99.8%
## 9       6      23   0.1%   29'970    99.9%
## 10      7      27   0.1%   29'997   100.0%
## 11      8       3   0.0%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 10 - PAY_4 (integer)
## 
##   length       n    NAs  unique      0s   mean  meanCI'
##   30'000  30'000      0      11  16'455  -0.22   -0.23
##           100.0%   0.0%           54.9%          -0.21
##                                                       
##      .05     .10    .25  median     .75    .90     .95
##    -2.00   -2.00  -1.00    0.00    0.00   2.00    2.00
##                                                       
##    range      sd  vcoef     mad     IQR   skew    kurt
##    10.00    1.17  -5.30    0.00    1.00   1.00    3.50
##                                                       
## 
##     value    freq   perc  cumfreq  cumperc
## 1      -2   4'348  14.5%    4'348    14.5%
## 2      -1   5'687  19.0%   10'035    33.5%
## 3       0  16'455  54.9%   26'490    88.3%
## 4       1       2   0.0%   26'492    88.3%
## 5       2   3'159  10.5%   29'651    98.8%
## 6       3     180   0.6%   29'831    99.4%
## 7       4      69   0.2%   29'900    99.7%
## 8       5      35   0.1%   29'935    99.8%
## 9       6       5   0.0%   29'940    99.8%
## 10      7      58   0.2%   29'998   100.0%
## 11      8       2   0.0%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 11 - PAY_5 (integer)
## 
##   length       n    NAs  unique      0s   mean  meanCI'
##   30'000  30'000      0      10  16'947  -0.27   -0.28
##           100.0%   0.0%           56.5%          -0.25
##                                                       
##      .05     .10    .25  median     .75    .90     .95
##    -2.00   -2.00  -1.00    0.00    0.00   0.00    2.00
##                                                       
##    range      sd  vcoef     mad     IQR   skew    kurt
##    10.00    1.13  -4.26    0.00    1.00   1.01    3.99
##                                                       
## 
##     value    freq   perc  cumfreq  cumperc
## 1      -2   4'546  15.2%    4'546    15.2%
## 2      -1   5'539  18.5%   10'085    33.6%
## 3       0  16'947  56.5%   27'032    90.1%
## 4       2   2'626   8.8%   29'658    98.9%
## 5       3     178   0.6%   29'836    99.5%
## 6       4      84   0.3%   29'920    99.7%
## 7       5      17   0.1%   29'937    99.8%
## 8       6       4   0.0%   29'941    99.8%
## 9       7      58   0.2%   29'999   100.0%
## 10      8       1   0.0%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 12 - PAY_6 (integer)
## 
##   length       n    NAs  unique      0s   mean  meanCI'
##   30'000  30'000      0      10  16'286  -0.29   -0.30
##           100.0%   0.0%           54.3%          -0.28
##                                                       
##      .05     .10    .25  median     .75    .90     .95
##    -2.00   -2.00  -1.00    0.00    0.00   2.00    2.00
##                                                       
##    range      sd  vcoef     mad     IQR   skew    kurt
##    10.00    1.15  -3.95    0.00    1.00   0.95    3.43
##                                                       
## 
##     value    freq   perc  cumfreq  cumperc
## 1      -2   4'895  16.3%    4'895    16.3%
## 2      -1   5'740  19.1%   10'635    35.4%
## 3       0  16'286  54.3%   26'921    89.7%
## 4       2   2'766   9.2%   29'687    99.0%
## 5       3     184   0.6%   29'871    99.6%
## 6       4      49   0.2%   29'920    99.7%
## 7       5      13   0.0%   29'933    99.8%
## 8       6      19   0.1%   29'952    99.8%
## 9       7      46   0.2%   29'998   100.0%
## 10      8       2   0.0%   30'000   100.0%
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 13 - BILL_AMT1 (integer)
## 
##         length          n       NAs     unique         0s        mean'
##         30'000     30'000         0     22'723      2'008   51'223.33
##                    100.0%      0.0%                  6.7%            
##                                                                      
##            .05        .10       .25     median        .75         .90
##           0.00     278.90  3'558.75  22'381.50  67'091.00  142'133.70
##                                                                      
##          range         sd     vcoef        mad        IQR        skew
##   1'130'091.00  73'635.86      1.44  32'321.42  63'532.25        2.66
##                                                                      
##       meanCI
##    50'390.04
##    52'056.62
##             
##          .95
##   201'203.05
##             
##         kurt
##         9.80
##             
## lowest : -165'580, -154'973, -15'308, -14'386, -11'545
## highest: 626'648, 630'458, 653'062, 746'814, 964'511
## 
## heap(?): remarkable frequency (6.7%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 14 - BILL_AMT2 (integer)
## 
##         length          n       NAs     unique         0s        mean'
##         30'000     30'000         0     22'346      2'506   49'179.08
##                    100.0%      0.0%                  8.4%            
##                                                                      
##            .05        .10       .25     median        .75         .90
##           0.00       0.00  2'984.75  21'200.00  64'006.25  136'905.50
##                                                                      
##          range         sd     vcoef        mad        IQR        skew
##   1'053'708.00  71'173.77      1.45  30'852.91  61'021.50        2.70
##                                                                      
##       meanCI
##    48'373.65
##    49'984.50
##             
##          .95
##   194'792.20
##             
##         kurt
##        10.30
##             
## lowest : -69'777, -67'526, -33'350, -30'000, -26'214
## highest: 624'475, 646'770, 671'563, 743'970, 983'931
## 
## heap(?): remarkable frequency (8.4%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 15 - BILL_AMT3 (integer)
## 
##         length          n       NAs     unique         0s        mean'
##         30'000     30'000         0     22'026      2'870   47'013.15
##                    100.0%      0.0%                  9.6%            
##                                                                      
##            .05        .10       .25     median        .75         .90
##           0.00       0.00  2'666.25  20'088.50  60'164.75  132'051.30
##                                                                      
##          range         sd     vcoef        mad        IQR        skew
##   1'821'353.00  69'349.39      1.48  29'219.82  57'498.50        3.09
##                                                                      
##       meanCI
##    46'228.38
##    47'797.93
##             
##          .95
##   187'821.05
##             
##         kurt
##        19.78
##             
## lowest : -157'264, -61'506, -46'127, -34'041, -25'443
## highest: 689'627, 689'643, 693'131, 855'086, 1'664'089
## 
## heap(?): remarkable frequency (9.6%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 16 - BILL_AMT4 (integer)
## 
##         length          n       NAs     unique         0s        mean'
##         30'000     30'000         0     21'548      3'195   43'262.95
##                    100.0%      0.0%                 10.7%            
##                                                                      
##            .05        .10       .25     median        .75         .90
##           0.00       0.00  2'326.75  19'052.00  54'506.00  122'418.70
##                                                                      
##          range         sd     vcoef        mad        IQR        skew
##   1'061'586.00  64'332.86      1.49  27'659.39  52'179.25        2.82
##                                                                      
##       meanCI
##    42'534.94
##    43'990.96
##             
##          .95
##   174'333.35
##             
##         kurt
##        11.31
##             
## lowest : -170'000, -81'334, -65'167, -50'616, -46'627
## highest: 572'805, 616'836, 628'699, 706'864, 891'586
## 
## heap(?): remarkable frequency (10.7%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 17 - BILL_AMT5 (integer)
## 
##         length          n       NAs     unique         0s        mean'
##         30'000     30'000         0     21'010      3'506   40'311.40
##                    100.0%      0.0%                 11.7%            
##                                                                      
##            .05        .10       .25     median        .75         .90
##           0.00       0.00  1'763.00  18'104.50  50'190.50  115'883.00
##                                                                      
##          range         sd     vcoef        mad        IQR        skew
##   1'008'505.00  60'797.16      1.51  26'224.97  48'427.50        2.88
##                                                                      
##       meanCI
##    39'623.40
##    40'999.40
##             
##          .95
##   165'794.30
##             
##         kurt
##        12.30
##             
## lowest : -81'334, -61'372, -53'007, -46'627, -37'594
## highest: 547'880, 551'702, 587'067, 823'540, 927'171
## 
## heap(?): remarkable frequency (11.7%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 18 - BILL_AMT6 (integer)
## 
##         length          n       NAs     unique         0s        mean'
##         30'000     30'000         0     20'604      4'020   38'871.76
##                    100.0%      0.0%                 13.4%            
##                                                                      
##            .05        .10       .25     median        .75         .90
##           0.00       0.00  1'256.00  17'071.00  49'198.25  112'110.40
##                                                                      
##          range         sd     vcoef        mad        IQR        skew
##   1'301'267.00  59'554.11      1.53  24'840.96  47'942.25        2.85
##                                                                      
##       meanCI
##    38'197.83
##    39'545.69
##             
##          .95
##   161'912.00
##             
##         kurt
##        12.27
##             
## lowest : -339'603, -209'051, -150'953, -94'625, -73'895
## highest: 527'566, 527'711, 568'638, 699'944, 961'664
## 
## heap(?): remarkable frequency (13.4%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 19 - PAY_AMT1 (integer)
## 
##       length          n       NAs    unique        0s       mean     meanCI'
##       30'000     30'000         0     7'943     5'249   5'663.58   5'476.15
##                  100.0%      0.0%               17.5%              5'851.02
##                                                                            
##          .05        .10       .25    median       .75        .90        .95
##         0.00       0.00  1'000.00  2'100.00  5'006.00  10'300.00  18'428.20
##                                                                            
##        range         sd     vcoef       mad       IQR       skew       kurt
##   873'552.00  16'563.28      2.92  2'864.38  4'006.00      14.67     415.16
##                                                                            
## lowest : 0 (5'249), 1 (9), 2 (14), 3 (15), 4 (18)
## highest: 405'016, 423'903, 493'358, 505'000, 873'552
## 
## heap(?): remarkable frequency (17.5%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 20 - PAY_AMT2 (integer)
## 
##         length          n     NAs    unique        0s       mean     meanCI'
##         30'000     30'000       0     7'899     5'396   5'921.16   5'660.43
##                    100.0%    0.0%               18.0%              6'181.90
##                                                                            
##            .05        .10     .25    median       .75        .90        .95
##           0.00       0.00  833.00  2'009.00  5'000.00  10'401.10  19'004.35
##                                                                            
##          range         sd   vcoef       mad       IQR       skew       kurt
##   1'684'259.00  23'040.87    3.89  2'951.86  4'167.00      30.45   1'641.25
##                                                                            
## lowest : 0 (5'396), 1 (15), 2 (20), 3 (18), 4 (11)
## highest: 580'464, 1'024'516, 1'215'471, 1'227'082, 1'684'259
## 
## heap(?): remarkable frequency (18.0%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 21 - PAY_AMT3 (integer)
## 
##       length          n     NAs    unique        0s       mean     meanCI'
##       30'000     30'000       0     7'518     5'968   5'225.68   5'026.44
##                  100.0%    0.0%               19.9%              5'424.93
##                                                                          
##          .05        .10     .25    median       .75        .90        .95
##         0.00       0.00  390.00  1'800.00  4'505.00  10'000.00  17'589.40
##                                                                          
##        range         sd   vcoef       mad       IQR       skew       kurt
##   896'040.00  17'606.96    3.37  2'661.27  4'115.00      17.21     564.18
##                                                                          
## lowest : 0 (5'968), 1 (13), 2 (19), 3 (14), 4 (15)
## highest: 400'972, 417'588, 508'229, 889'043, 896'040
## 
## heap(?): remarkable frequency (19.9%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 22 - PAY_AMT4 (integer)
## 
##       length          n     NAs    unique        0s      mean     meanCI'
##       30'000     30'000       0     6'937     6'408  4'826.08   4'648.79
##                  100.0%    0.0%               21.4%             5'003.36
##                                                                         
##          .05        .10     .25    median       .75       .90        .95
##         0.00       0.00  296.00  1'500.00  4'013.25  9'570.60  16'014.95
##                                                                         
##        range         sd   vcoef       mad       IQR      skew       kurt
##   621'000.00  15'666.16    3.25  2'223.90  3'717.25     12.90     277.27
##                                                                         
## lowest : 0 (6'408), 1 (22), 2 (22), 3 (13), 4 (20)
## highest: 400'046, 432'130, 497'000, 528'897, 621'000
## 
## heap(?): remarkable frequency (21.4%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 23 - PAY_AMT5 (integer)
## 
##       length          n     NAs    unique        0s      mean     meanCI'
##       30'000     30'000       0     6'897     6'703  4'799.39   4'626.49
##                  100.0%    0.0%               22.3%             4'972.28
##                                                                         
##          .05        .10     .25    median       .75       .90        .95
##         0.00       0.00  252.50  1'500.00  4'031.50  9'500.00  16'000.00
##                                                                         
##        range         sd   vcoef       mad       IQR      skew       kurt
##   426'529.00  15'278.31    3.18  2'223.90  3'779.00     11.13     180.02
##                                                                         
## lowest : 0 (6'703), 1 (21), 2 (13), 3 (13), 4 (12)
## highest: 332'000, 379'267, 388'071, 417'990, 426'529
## 
## heap(?): remarkable frequency (22.3%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 24 - PAY_AMT6 (integer)
## 
##       length          n     NAs    unique        0s      mean     meanCI'
##       30'000     30'000       0     6'939     7'173  5'215.50   5'014.33
##                  100.0%    0.0%               23.9%             5'416.68
##                                                                         
##          .05        .10     .25    median       .75       .90        .95
##         0.00       0.00  117.75  1'500.00  4'000.00  9'600.00  17'343.80
##                                                                         
##        range         sd   vcoef       mad       IQR      skew       kurt
##   528'666.00  17'777.47    3.41  2'223.90  3'882.25     10.64     167.12
##                                                                         
## lowest : 0 (7'173), 1 (20), 2 (9), 3 (14), 4 (12)
## highest: 403'500, 422'000, 443'001, 527'143, 528'666
## 
## heap(?): remarkable frequency (23.9%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ────────────────────────────────────────────────────────────────────────────── 
## 25 - default.payment.next.month (integer - dichotomous)
## 
##   length      n    NAs unique
##   30'000 30'000      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  23'364  77.9%   77.4%   78.3%
## 1   6'636  22.1%   21.7%   22.6%
## 
## ' 95%-CI (Wilson)

Data Preprocessing

Factor

## Change integer to factor
credit$EDUCATION <- ifelse(credit$EDUCATION %in% c(0, 4:6), 0, credit$EDUCATION)

credit <- credit |>
  mutate(
    SEXF       = factor(SEX,       levels = c(1, 2),           labels = c("male", "female")),
    EDUCATIONF = factor(EDUCATION, levels = c(0, 1, 2, 3),     labels = c("others", "graduate school", "high school", "university")),
    MARRIAGEF  = factor(MARRIAGE,  levels = c(0, 1, 2, 3),     labels = c("others", "married", "single", "divorce"))
  )

pay_vars <- grep("^PAY_[0-6]$", names(credit), value = TRUE)   
for (var in pay_vars) {
  credit[[paste0(var,"F")]] <- factor(
    credit[[var]],
    levels = c(-2,-1,0,1:9),
    labels = c("no consumption","fully paid","revolving",
               paste0(1:9,"d"))
  )
}

Visualization

ggplot(credit, aes(x = EDUCATIONF, fill = factor(default.payment.next.month))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(
    values = c("0" = "#D5DBB3", "1" = "#BCAAA4"),
    labels = c("No Default", "Default")
  ) +
  labs(
    title = "Default Rate by Education Level",
    x = "Education",
    y = "Proportion",
    fill = "Default Status"
  ) +
  theme_minimal()

##### The university group exhibits the highest default rate among all education levels, followed by high school and graduate school.

ggplot(credit, aes(x = SEXF, fill = factor(default.payment.next.month))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(
    values = c("0" = "#D5DBB3", "1" = "#BCAAA4"),
    labels = c("No Default", "Default")
  ) +
  labs(
    title = "Default Rate by Gender",
    x = "Gender",
    y = "Proportion",
    fill = "Default Status"
  ) +
  theme_minimal()

##### A similar tendency toward gender.

ggplot(credit, aes(x = MARRIAGEF, fill = factor(default.payment.next.month))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_manual(
    values = c("0" = "#D5DBB3", "1" = "#BCAAA4"),
    labels = c("No Default", "Default")
  ) +
  labs(
    title = "Default Rate by Marital Status",
    x = "Marital Status",
    y = "Proportion",
    fill = "Default Status"
  ) +
  theme_minimal()

##### The chart illustrates the default distribution across marital status groups. Individuals categorized as divorced exhibit the highest default rate, followed by married and single clients. The ‘others’ group shows the lowest proportion of default, though this may be influenced by its undefined classification.

pay_status <- credit %>%
  select(ID = 1, matches("PAY_[0-6]F")) %>%
  reshape2::melt(id.vars = "ID", variable.name = "Period", value.name = "Status")

ggplot(pay_status, aes(x = Status)) +
  geom_bar(fill = "#BCAAA4") +
  facet_wrap(~ Period, scales = "free") +
  labs(title = "Payment Status Distribution over 7 Periods", x = "Status", y = "Count") +
  theme_minimal()

##### The charts display the distribution of payment statuses over the past seven periods (PAY_0F to PAY_6F). The majority of users consistently fall into either the ‘fully paid’ or ‘revolving’ categories, indicating stable repayment behavior. Delinquency rates beyond one or two months are relatively rare, while the number of users with ‘no consumption’ remains minimal across all periods.

Separate numeric and factor variable

columns <- c(
  "SEXF","EDUCATIONF","MARRIAGEF","LIMIT_BAL",
  paste0(pay_vars,"F"),                             
  paste0("BILL_AMT",1:6), paste0("PAY_AMT",1:6),
  "default.payment.next.month"
)
columns  <- intersect(columns, names(credit))          
base     <- credit[, columns]

base_numeric <- base[, sapply(base, is.numeric)]
base_factor  <- base[, sapply(base, is.factor)]

target variable

# Set target variable
base_numeric$def      <- base$`default.payment.next.month`
base_numeric$def_woe  <- 1 - base_numeric$def

p1 <- ggplot(base_numeric, aes(x = factor(def))) +
  geom_bar(fill = c("#D5DBB3", "#BCAAA4")) +
  labs(
    title = "Default Variable Distribution",
    x = "def (0 = No Default, 1 = Default)",
    y = "Count"
  ) +
  theme_minimal()

p2 <- ggplot(base_numeric, aes(x = factor(def_woe))) +
  geom_bar(fill = c("#BCAAA4", "#D5DBB3")) +
  labs(
    title = "def_woe (for smbinning)",
    x = "def_woe (0 = Default, 1 = No Default)",
    y = "Count"
  ) +
  theme_minimal()

gridExtra::grid.arrange(p1, p2, ncol = 2)

##### The figure illustrates the distribution of the target variable used for credit default prediction. The original variable (def) reveals a class imbalance, with approximately 22% of clients defaulting on their next payment. To comply with the requirements of the WoE-based binning procedure, a transformed variable (def_woe) is created, where 0 indicates ‘bad’ (default) and 1 indicates ‘good’ (no default). This transformation is essential for scorecard modeling and IV analysis.

Log transformation

vars_to_log <- c("LIMIT_BAL", paste0("BILL_AMT",1:6), paste0("PAY_AMT",1:6))
vars_to_log <- intersect(vars_to_log, names(base_numeric))
base_numeric[vars_to_log] <- lapply(base_numeric[vars_to_log], log1p)

## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced

Variable binning & Categorical conversion

percentile  <- apply(base_numeric, 2, function(x) quantile(x, seq(0.1,1,0.1), na.rm = TRUE) |> unique() |> round(2))
unique_vals <- apply(base_numeric, 2, function(x) length(unique(x)))

vars_to_bin    <- names(unique_vals[unique_vals >= 10]) 
vars_as_factor <- names(unique_vals[unique_vals < 10 & unique_vals > 1])

unique_vals   <- sapply(base_numeric, function(x) length(unique(x)))
vars_to_bin    <- names(unique_vals[unique_vals >= 10])
vars_as_factor <- names(unique_vals[unique_vals < 10 & unique_vals > 1])

## continuous variables
for (m in vars_to_bin) {
  cuts <- unique(quantile(base_numeric[[m]],
                          probs = seq(0.1, 1, 0.1),
                          na.rm = TRUE))
  if (length(cuts) < 2) next
  
  base_numeric[[paste0(m, "_fine")]] <- cut(
    base_numeric[[m]],
    breaks = c(-Inf, cuts),
    labels = paste0("≤", cuts),
    include.lowest = TRUE
  )
}

for (m in vars_as_factor) {
  base_numeric[[paste0(m, "_fine")]] <- factor(base_numeric[[m]])
}

bill_vars <- paste0("BILL_AMT", 1:6)

base_numeric[bill_vars] <- lapply(base_numeric[bill_vars], function(x){
  x <- ifelse(x < 0, 0, x)     
  log1p(x)                        
})

for (m in bill_vars){
  cuts <- unique(quantile(base_numeric[[m]], probs = seq(0.1,1,0.1), na.rm = TRUE))
  if (length(cuts) < 2) next
  base_numeric[[paste0(m,"_fine")]] <- cut(
    base_numeric[[m]],
    breaks = c(-Inf, cuts),
    labels = paste0("≤", cuts),
    include.lowest = TRUE
  )
}

Binning EDA

smbinning.eda(base_numeric, rounding = 3, pbar = 1)

##  
##   |                                                          |                                                  |   0%  |                                                          |--                                                |   3%  |                                                          |---                                               |   6%  |                                                          |-----                                             |   9%  |                                                          |------                                            |  12%  |                                                          |--------                                          |  16%  |                                                          |---------                                         |  19%  |                                                          |-----------                                       |  22%  |                                                          |------------                                      |  25%  |                                                          |--------------                                    |  28%  |                                                          |----------------                                  |  31%  |                                                          |-----------------                                 |  34%  |                                                          |-------------------                               |  38%  |                                                          |--------------------                              |  41%  |                                                          |----------------------                            |  44%  |                                                          |-----------------------                           |  47%  |                                                          |-------------------------                         |  50%  |                                                          |---------------------------                       |  53%  |                                                          |----------------------------                      |  56%  |                                                          |------------------------------                    |  59%  |                                                          |-------------------------------                   |  62%  |                                                          |---------------------------------                 |  66%  |                                                          |----------------------------------                |  69%  |                                                          |------------------------------------              |  72%  |                                                          |--------------------------------------            |  75%  |                                                          |---------------------------------------           |  78%  |                                                          |-----------------------------------------         |  81%  |                                                          |------------------------------------------        |  84%  |                                                          |--------------------------------------------      |  88%  |                                                          |---------------------------------------------     |  91%  |                                                          |-----------------------------------------------   |  94%  |                                                          |------------------------------------------------  |  97%  |                                                          |--------------------------------------------------| 100%

## $eda
##                              Field    Type  Recs Miss Unique  Min    Q25    Q50
## 1                        LIMIT_BAL Num/Int 30000    0     81 9.21 10.820 11.849
## 2                        BILL_AMT1 Num/Int 30000  574  22417 0.00  2.234  2.405
## 3                        BILL_AMT2 Num/Int 30000  651  21999 0.00  2.221  2.401
## 4                        BILL_AMT3 Num/Int 30000  631  21662 0.00  2.211  2.394
## 5                        BILL_AMT4 Num/Int 30000  650  21171 0.00  2.194  2.387
## 6                        BILL_AMT5 Num/Int 30000  631  20626 0.00  2.166  2.383
## 7                        BILL_AMT6 Num/Int 30000  663  20190 0.00  2.128  2.379
## 8                         PAY_AMT1 Num/Int 30000    0   7943 0.00  6.909  7.650
## 9                         PAY_AMT2 Num/Int 30000    0   7899 0.00  6.726  7.606
## 10                        PAY_AMT3 Num/Int 30000    0   7518 0.00  5.969  7.496
## 11                        PAY_AMT4 Num/Int 30000    0   6937 0.00  5.694  7.314
## 12                        PAY_AMT5 Num/Int 30000    0   6897 0.00  5.535  7.314
## 13                        PAY_AMT6 Num/Int 30000    0   6939 0.00  4.777  7.314
## 14      default.payment.next.month Num/Int 30000    0      2 0.00  0.000  0.000
## 15                             def Num/Int 30000    0      2 0.00  0.000  0.000
## 16                         def_woe Num/Int 30000    0      2 0.00  1.000  1.000
## 17                  LIMIT_BAL_fine  Factor 30000    0     10   NA     NA     NA
## 18                  BILL_AMT1_fine  Factor 30000  574     10   NA     NA     NA
## 19                  BILL_AMT2_fine  Factor 30000  651     10   NA     NA     NA
## 20                  BILL_AMT3_fine  Factor 30000  631     10   NA     NA     NA
## 21                  BILL_AMT4_fine  Factor 30000  650     10   NA     NA     NA
## 22                  BILL_AMT5_fine  Factor 30000  631     10   NA     NA     NA
## 23                  BILL_AMT6_fine  Factor 30000  663     10   NA     NA     NA
## 24                   PAY_AMT1_fine  Factor 30000    0     10   NA     NA     NA
## 25                   PAY_AMT2_fine  Factor 30000    0     10   NA     NA     NA
## 26                   PAY_AMT3_fine  Factor 30000    0     10   NA     NA     NA
## 27                   PAY_AMT4_fine  Factor 30000    0      9   NA     NA     NA
## 28                   PAY_AMT5_fine  Factor 30000    0      9   NA     NA     NA
## 29                   PAY_AMT6_fine  Factor 30000    0      9   NA     NA     NA
## 30 default.payment.next.month_fine  Factor 30000    0      2   NA     NA     NA
## 31                        def_fine  Factor 30000    0      2   NA     NA     NA
## 32                    def_woe_fine  Factor 30000    0      2   NA     NA     NA
##       Avg    Q75    Max  StDv Neg  Zero   Pos OutLo OutHi
## 1  11.663 12.388 13.816 0.941   0     0 30000     0     0
## 2   2.210  2.496  2.693 0.625   0  2024 27402  2283     0
## 3   2.169  2.492  2.695 0.687   0  2524 26825  2724     0
## 4   2.138  2.488  2.729 0.727   0  2894 26475  3035     0
## 5   2.108  2.480  2.688 0.758   0  3220 26130  3308     0
## 6   2.076  2.472  2.691 0.786   0  3530 25839  3590     0
## 7   2.032  2.470  2.693 0.830   0  4045 25292  4072     0
## 8   6.630  8.519 13.680 3.250   0  5249 24751  5498   386
## 9   6.563  8.517 14.337 3.279   0  5396 24604  5638   314
## 10  6.283  8.413 13.706 3.350   0  5968 24032  6089    49
## 11  6.078  8.298 13.339 3.397   0  6408 23592  6485    33
## 12  6.032  8.302 12.963 3.444   0  6703 23297  6737    17
## 13  5.933  8.294 13.178 3.528   0  7173 22827     0     0
## 14  0.221  0.000  1.000 0.415   0 23364  6636     0  6636
## 15  0.221  0.000  1.000 0.415   0 23364  6636     0  6636
## 16  0.779  1.000  1.000 0.415   0  6636 23364  6636     0
## 17     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 18     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 19     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 20     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 21     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 22     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 23     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 24     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 25     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 26     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 27     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 28     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 29     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 30     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 31     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 32     NA     NA     NA    NA  NA    NA    NA    NA    NA
## 
## $edapct
##                              Field    Type  Recs  Miss Neg  Zero   Pos OutLo
## 1                        LIMIT_BAL Num/Int 30000 0.000   0 0.000 1.000 0.000
## 2                        BILL_AMT1 Num/Int 30000 0.019   0 0.067 0.913 0.076
## 3                        BILL_AMT2 Num/Int 30000 0.022   0 0.084 0.894 0.091
## 4                        BILL_AMT3 Num/Int 30000 0.021   0 0.096 0.882 0.101
## 5                        BILL_AMT4 Num/Int 30000 0.022   0 0.107 0.871 0.110
## 6                        BILL_AMT5 Num/Int 30000 0.021   0 0.118 0.861 0.120
## 7                        BILL_AMT6 Num/Int 30000 0.022   0 0.135 0.843 0.136
## 8                         PAY_AMT1 Num/Int 30000 0.000   0 0.175 0.825 0.183
## 9                         PAY_AMT2 Num/Int 30000 0.000   0 0.180 0.820 0.188
## 10                        PAY_AMT3 Num/Int 30000 0.000   0 0.199 0.801 0.203
## 11                        PAY_AMT4 Num/Int 30000 0.000   0 0.214 0.786 0.216
## 12                        PAY_AMT5 Num/Int 30000 0.000   0 0.223 0.777 0.225
## 13                        PAY_AMT6 Num/Int 30000 0.000   0 0.239 0.761 0.000
## 14      default.payment.next.month Num/Int 30000 0.000   0 0.779 0.221 0.000
## 15                             def Num/Int 30000 0.000   0 0.779 0.221 0.000
## 16                         def_woe Num/Int 30000 0.000   0 0.221 0.779 0.221
## 17                  LIMIT_BAL_fine  Factor 30000 0.000  NA    NA    NA    NA
## 18                  BILL_AMT1_fine  Factor 30000 0.019  NA    NA    NA    NA
## 19                  BILL_AMT2_fine  Factor 30000 0.022  NA    NA    NA    NA
## 20                  BILL_AMT3_fine  Factor 30000 0.021  NA    NA    NA    NA
## 21                  BILL_AMT4_fine  Factor 30000 0.022  NA    NA    NA    NA
## 22                  BILL_AMT5_fine  Factor 30000 0.021  NA    NA    NA    NA
## 23                  BILL_AMT6_fine  Factor 30000 0.022  NA    NA    NA    NA
## 24                   PAY_AMT1_fine  Factor 30000 0.000  NA    NA    NA    NA
## 25                   PAY_AMT2_fine  Factor 30000 0.000  NA    NA    NA    NA
## 26                   PAY_AMT3_fine  Factor 30000 0.000  NA    NA    NA    NA
## 27                   PAY_AMT4_fine  Factor 30000 0.000  NA    NA    NA    NA
## 28                   PAY_AMT5_fine  Factor 30000 0.000  NA    NA    NA    NA
## 29                   PAY_AMT6_fine  Factor 30000 0.000  NA    NA    NA    NA
## 30 default.payment.next.month_fine  Factor 30000 0.000  NA    NA    NA    NA
## 31                        def_fine  Factor 30000 0.000  NA    NA    NA    NA
## 32                    def_woe_fine  Factor 30000 0.000  NA    NA    NA    NA
##    OutHi
## 1  0.000
## 2  0.000
## 3  0.000
## 4  0.000
## 5  0.000
## 6  0.000
## 7  0.000
## 8  0.013
## 9  0.010
## 10 0.002
## 11 0.001
## 12 0.001
## 13 0.000
## 14 0.221
## 15 0.221
## 16 0.000
## 17    NA
## 18    NA
## 19    NA
## 20    NA
## 21    NA
## 22    NA
## 23    NA
## 24    NA
## 25    NA
## 26    NA
## 27    NA
## 28    NA
## 29    NA
## 30    NA
## 31    NA
## 32    NA

numeric_vars <- base_numeric %>%
  summarise(across(everything(), ~n_distinct(.))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "distinct_count") %>%
  filter(distinct_count > 10) %>%
  pull(variable)

for (v in numeric_vars) {
  print(
    ggplot(base_numeric, aes(x = as.numeric(.data[[v]]))) +
      geom_histogram(fill = "#D5DBB3", bins = 50) +
      labs(title = paste("Distribution of", v), x = v) +
      theme_minimal()
  )
}

## Warning: Removed 574 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 651 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 650 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 663 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 574 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 651 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 650 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 663 rows containing non-finite outside the scale range
## (`stat_bin()`).

Weight of Evidence & IV Information Value

pdf("WoE_numeric.pdf", paper = "a4")                   

IV           <- data.frame(VAR = character(), IV = numeric())
WOE          <- list()
error_vars   <- c()
success_vars <- c()

vars_fine <- grep("_fine$", names(base_numeric), value = TRUE)
vars_loop <- setdiff(names(base_numeric), c(vars_fine, "def", "def_woe", "default.payment.next.month"))

pb <- txtProgressBar(min = 0, max = length(vars_loop), style = 3)

##   |                                                                              |                                                                      |   0%

for (v in vars_loop) {
  
  par(mfrow = c(2,2))
  cuts <- unique(quantile(base_numeric[[v]],
                        probs = seq(0.1, 1, 0.1),
                        na.rm = TRUE))

  if (length(cuts) < 2) { error_vars <- c(error_vars, v); next }
  
  res <- try(
    smbinning.custom(df = base_numeric, y = "def_woe", x = v, cuts = cuts),
    silent = TRUE
  )
  
  if (!inherits(res, "try-error") && is.list(res)) {
    success_vars <- c(success_vars, v)
    
    boxplot(base_numeric[[v]] ~ base_numeric$def, horizontal = TRUE, col = "lightgray",
            frame = FALSE, main = "Distribution"); mtext(v,3)
    
    smbinning.plot(res, option = "dist",    sub = v)
    smbinning.plot(res, option = "badrate", sub = v)
    smbinning.plot(res, option = "WoE",     sub = v)
    

    iv_val <- res$ivtable[res$ivtable$Cutpoint == "Total", "IV"]
    IV     <- rbind(IV, data.frame(VAR = v, IV = as.numeric(iv_val)))
    

    w <- res$ivtable |> 
         dplyr::filter(Cutpoint != "Total") |> 
         dplyr::select(Cutpoint, WoE, PctRec) |>
         dplyr::arrange(WoE)
    w$numer <- 11:(nrow(w) + 10)
    WOE[[v]] <- w
  } else {
    error_vars <- c(error_vars, v)
  }
  
  setTxtProgressBar(pb, which(vars_loop == v))
}

## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2

##   |                                                                              |=====                                                                 |   8%

##   |                                                                              |===========                                                           |  15%

##   |                                                                              |================                                                      |  23%

##   |                                                                              |======================                                                |  31%

##   |                                                                              |===========================                                           |  38%

##   |                                                                              |================================                                      |  46%

##   |                                                                              |======================================                                |  54%

## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2

##   |                                                                              |===========================================                           |  62%

## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2

##   |                                                                              |================================================                      |  69%

## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2

##   |                                                                              |======================================================                |  77%

## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2

##   |                                                                              |===========================================================           |  85%

## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2

##   |                                                                              |=================================================================     |  92%

## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2

##   |                                                                              |======================================================================| 100%

close(pb)

dev.off()

## quartz_off_screen 
##                 2

cat("Success：", length(success_vars), "\n")

## Success： 13

cat("Fail：", paste(error_vars, collapse = ", "), "\n")

## Fail：

IV <- IV[order(-IV$IV), ]
print(IV)

##          VAR     IV
## 1  LIMIT_BAL 0.1782
## 8   PAY_AMT1 0.1717
## 9   PAY_AMT2 0.1495
## 10  PAY_AMT3 0.1228
## 11  PAY_AMT4 0.1067
## 13  PAY_AMT6 0.0947
## 12  PAY_AMT5 0.0879
## 6  BILL_AMT5 0.0165
## 7  BILL_AMT6 0.0164
## 5  BILL_AMT4 0.0155
## 2  BILL_AMT1 0.0144
## 3  BILL_AMT2 0.0118
## 4  BILL_AMT3 0.0106

Feature Selection

IV <- IV[order(-IV$IV), ]         
selected <- IV$VAR[IV$IV > 0.02]          
print(IV)

##          VAR     IV
## 1  LIMIT_BAL 0.1782
## 8   PAY_AMT1 0.1717
## 9   PAY_AMT2 0.1495
## 10  PAY_AMT3 0.1228
## 11  PAY_AMT4 0.1067
## 13  PAY_AMT6 0.0947
## 12  PAY_AMT5 0.0879
## 6  BILL_AMT5 0.0165
## 7  BILL_AMT6 0.0164
## 5  BILL_AMT4 0.0155
## 2  BILL_AMT1 0.0144
## 3  BILL_AMT2 0.0118
## 4  BILL_AMT3 0.0106

length(selected)

## [1] 7

Preprocessing for modeling purpose

# WOE encoded
bins <- woebin(
  dt        = credit,                          
  y         = "default.payment.next.month",
  x         = selected,                      
  positive  = "1"                              
)

## ℹ Creating woe binning ...

## ✔ Binning on 30000 rows and 8 columns in 00:00:05

dt_woe <- woebin_ply(credit, bins)

## ℹ Converting into woe values ...

## ✔ Woe transformating on 30000 rows and 7 columns in 00:00:11

head(dt_woe[, c(paste0(selected, "_woe"), "default.payment.next.month")])

## [1] "LIMIT_BAL_woe" "PAY_AMT1_woe"  "PAY_AMT2_woe"  "PAY_AMT3_woe" 
## [5] "PAY_AMT4_woe"  "PAY_AMT6_woe"

Train and test split

set.seed(2025)
idx   <- sample(seq_len(nrow(dt_woe)), 0.7*nrow(dt_woe))
train <- dt_woe[idx, ]
test  <- dt_woe[-idx, ]

form  <- as.formula(
  paste("default.payment.next.month ~", 
        paste(paste0(selected, "_woe"), collapse = " + "))
)

Modeling

#
features <- paste0(selected, "_woe")

evaluate_model <- function(actual, predicted_prob, cutoff = 0.5, model_name = "") {
  predicted <- ifelse(predicted_prob >= cutoff, 1, 0)
  cm <- confusionMatrix(factor(predicted), factor(actual), positive = "1")
  auc_val <- auc(actual, predicted_prob)
  f1_val <- 2 * cm$byClass["Sensitivity"] * cm$byClass["Precision"] /
            (cm$byClass["Sensitivity"] + cm$byClass["Precision"])
  data.frame(
    Model    = model_name,
    AUC      = round(auc_val, 4),
    Accuracy = round(cm$overall["Accuracy"], 4),
    Recall   = round(cm$byClass["Sensitivity"], 4),
    F1       = round(f1_val, 4)
  )
}

Logistic regression

form <- as.formula(paste("default.payment.next.month ~", paste(features, collapse = " + ")))
model_glm <- glm(form, data = train, family = binomial)
prob_glm  <- predict(model_glm, newdata = test, type = "response")
res_glm   <- evaluate_model(test$default.payment.next.month, prob_glm, model_name = "GLM")

## Warning in confusionMatrix.default(factor(predicted), factor(actual), positive
## = "1"): Levels are not in the same order for reference and data. Refactoring
## data to match.

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

Random Forest

train$default.payment.next.month <- as.factor(train$default.payment.next.month)
test$default.payment.next.month  <- as.factor(test$default.payment.next.month)

model_rf <- randomForest(formula = form, data = train, ntree = 100)
prob_rf  <- predict(model_rf, newdata = test, type = "prob")[, "1"]
res_rf   <- evaluate_model(test$default.payment.next.month, prob_rf, model_name = "Random Forest")

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

XGBoost

y_train <- as.numeric(as.character(train$default.payment.next.month))
y_test  <- as.numeric(as.character(test$default.payment.next.month))

train_matrix <- xgb.DMatrix(data = as.matrix(train[, ..features]), label = y_train)
test_matrix  <- xgb.DMatrix(data = as.matrix(test[, ..features]),  label = y_test)

model_xgb <- xgboost(
  data      = train_matrix,
  nrounds   = 100,
  objective = "binary:logistic",
  verbose   = 0
)
prob_xgb <- predict(model_xgb, newdata = test_matrix)
res_xgb  <- evaluate_model(y_test, prob_xgb, model_name = "XGBoost")

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

Modeling result

results <- rbind(res_glm, res_rf, res_xgb)
datatable(results, caption = "Model Performance")

card <- scorecard(bins, model_glm)
score <- scorecard_ply(credit, card)
head(score)

##    score
##    <num>
## 1:   408
## 2:   441
## 3:   462
## 4:   469
## 5:   527
## 6:   465

label_glm <- as.numeric(as.character(test$default.payment.next.month))
# KS plot
library(scorecard)
perf_eva(
  pred = prob_glm,
  label = label_glm,
  title = "GLM",
  show_plot = TRUE
)

## $binomial_metric
## $binomial_metric$GLM
##          MSE      RMSE   LogLoss         R2        KS       AUC      Gini
##        <num>     <num>     <num>      <num>     <num>     <num>     <num>
## 1: 0.1642983 0.4053373 0.5046303 0.05279292 0.2409348 0.6567201 0.3134402

gini_glm <- 2 * auc(label_glm, prob_glm) - 1

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

print(gini_glm)

## [1] 0.3134402

label_glm <- as.numeric(as.character(test$default.payment.next.month))

roc_obj <- roc(label_glm, prob_glm)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

opt_cut <- coords(roc_obj, "best", ret = "threshold", best.method = "youden")
print(opt_cut)

##   threshold
## 1 0.2328704

label_glm <- as.numeric(as.character(test$default.payment.next.month))

thresholds <- seq(0, 1, by = 0.01)

metrics <- data.frame(
  Threshold = thresholds,
  Precision = NA,
  Recall    = NA,
  F1        = NA
)

for (i in seq_along(thresholds)) {
  t <- thresholds[i]
  pred_class <- ifelse(prob_glm >= t, 1, 0)
  cm <- confusionMatrix(factor(pred_class), factor(label_glm), positive = "1")
  prec <- cm$byClass["Precision"]
  rec  <- cm$byClass["Sensitivity"]
  f1   <- ifelse(prec + rec == 0, 0, 2 * prec * rec / (prec + rec))
  metrics[i, 2:4] <- c(prec, rec, f1)
}

## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.

metrics_long <- melt(metrics, id.vars = "Threshold")

ggplot(metrics_long, aes(x = Threshold, y = value, color = variable)) +
  geom_line(size = 1) +
  labs(
    title = "Precision / Recall / F1 vs. Cut-off Threshold",
    x = "Threshold",
    y = "Metric Value",
    color = "Metric"
  ) +
  scale_color_manual(values = c(
    "Precision" = "#D5DBB3",
    "Recall"    = "#BCAAA4",
    "F1"        = "#5D7290"
  )) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 102 rows containing missing values or values outside the scale range
## (`geom_line()`).

References

https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients