# Data preprocessing
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(moments)
library(DescTools)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
library(tidyr)
# Binning
library(smbinning)
## Loading required package: sqldf
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
## Loading required package: partykit
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
## Loading required package: Formula
library(scorecard)
##
## Attaching package: 'scorecard'
## The following object is masked from 'package:tidyr':
##
## replace_na
# Modeling
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# model evaluation
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(DT)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
credit <- read.csv("~/Desktop/dsba/spring semester/credit risk/default of credit card clients.csv")
str(credit)
## 'data.frame': 30000 obs. of 25 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ LIMIT_BAL : int 20000 120000 90000 50000 50000 50000 500000 100000 140000 20000 ...
## $ SEX : int 2 2 2 2 1 1 1 2 2 1 ...
## $ EDUCATION : int 2 2 2 2 2 1 1 2 3 3 ...
## $ MARRIAGE : int 1 2 2 1 1 2 2 2 1 2 ...
## $ AGE : int 24 26 34 37 57 37 29 23 28 35 ...
## $ PAY_0 : int 2 -1 0 0 -1 0 0 0 0 -2 ...
## $ PAY_2 : int 2 2 0 0 0 0 0 -1 0 -2 ...
## $ PAY_3 : int -1 0 0 0 -1 0 0 -1 2 -2 ...
## $ PAY_4 : int -1 0 0 0 0 0 0 0 0 -2 ...
## $ PAY_5 : int -2 0 0 0 0 0 0 0 0 -1 ...
## $ PAY_6 : int -2 2 0 0 0 0 0 -1 0 -1 ...
## $ BILL_AMT1 : int 3913 2682 29239 46990 8617 64400 367965 11876 11285 0 ...
## $ BILL_AMT2 : int 3102 1725 14027 48233 5670 57069 412023 380 14096 0 ...
## $ BILL_AMT3 : int 689 2682 13559 49291 35835 57608 445007 601 12108 0 ...
## $ BILL_AMT4 : int 0 3272 14331 28314 20940 19394 542653 221 12211 0 ...
## $ BILL_AMT5 : int 0 3455 14948 28959 19146 19619 483003 -159 11793 13007 ...
## $ BILL_AMT6 : int 0 3261 15549 29547 19131 20024 473944 567 3719 13912 ...
## $ PAY_AMT1 : int 0 0 1518 2000 2000 2500 55000 380 3329 0 ...
## $ PAY_AMT2 : int 689 1000 1500 2019 36681 1815 40000 601 0 0 ...
## $ PAY_AMT3 : int 0 1000 1000 1200 10000 657 38000 0 432 0 ...
## $ PAY_AMT4 : int 0 1000 1000 1100 9000 1000 20239 581 1000 13007 ...
## $ PAY_AMT5 : int 0 0 1000 1069 689 1000 13750 1687 1000 1122 ...
## $ PAY_AMT6 : int 0 2000 5000 1000 679 800 13770 1542 1000 0 ...
## $ default.payment.next.month: int 1 1 0 0 0 0 0 0 0 0 ...
The dataset used in this project is titled “Default of Credit Card Clients” and was obtained from the UCI Machine Learning Repository. It contains data on 30,000 credit card holders in Taiwan and aims to capture customer behavior related to default payments. The target variable indicates whether a client defaulted on the next month’s payment. The data set includes demographic features such as age, gender, education, and marital status, as well as financial attributes including credit limit, past bill amounts, and repayment records over a six-month period.
## [1] "ID" "LIMIT_BAL"
## [3] "SEX" "EDUCATION"
## [5] "MARRIAGE" "AGE"
## [7] "PAY_0" "PAY_2"
## [9] "PAY_3" "PAY_4"
## [11] "PAY_5" "PAY_6"
## [13] "BILL_AMT1" "BILL_AMT2"
## [15] "BILL_AMT3" "BILL_AMT4"
## [17] "BILL_AMT5" "BILL_AMT6"
## [19] "PAY_AMT1" "PAY_AMT2"
## [21] "PAY_AMT3" "PAY_AMT4"
## [23] "PAY_AMT5" "PAY_AMT6"
## [25] "default.payment.next.month"
# A quick visualization on each variable
#check if the margins are large enough
library(DescTools)
Desc(credit)
## ──────────────────────────────────────────────────────────────────────────────
## Describe credit (data.frame):
##
## data frame: 30000 obs. of 25 variables
## 30000 complete cases (100.0%)
##
## Nr Class ColName NAs Levels
## 1 int ID .
## 2 int LIMIT_BAL .
## 3 int SEX .
## 4 int EDUCATION .
## 5 int MARRIAGE .
## 6 int AGE .
## 7 int PAY_0 .
## 8 int PAY_2 .
## 9 int PAY_3 .
## 10 int PAY_4 .
## 11 int PAY_5 .
## 12 int PAY_6 .
## 13 int BILL_AMT1 .
## 14 int BILL_AMT2 .
## 15 int BILL_AMT3 .
## 16 int BILL_AMT4 .
## 17 int BILL_AMT5 .
## 18 int BILL_AMT6 .
## 19 int PAY_AMT1 .
## 20 int PAY_AMT2 .
## 21 int PAY_AMT3 .
## 22 int PAY_AMT4 .
## 23 int PAY_AMT5 .
## 24 int PAY_AMT6 .
## 25 int default.payment.next.month .
##
##
## ──────────────────────────────────────────────────────────────────────────────
## 1 - ID (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 = n 0 15'000.50 14'902.50
## 100.0% 0.0% 0.0% 15'098.50
##
## .05 .10 .25 median .75 .90 .95
## 1'500.95 3'000.90 7'500.75 15'000.50 22'500.25 27'000.10 28'500.05
##
## range sd vcoef mad IQR skew kurt
## 29'999.00 8'660.40 0.58 11'119.50 14'999.50 -0.00 -1.20
##
## lowest : 1, 2, 3, 4, 5
## highest: 29'996, 29'997, 29'998, 29'999, 30'000
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 2 - LIMIT_BAL (integer)
##
## length n NAs unique 0s mean'
## 30'000 30'000 0 81 0 167'484.32
## 100.0% 0.0% 0.0%
##
## .05 .10 .25 median .75 .90
## 20'000.00 30'000.00 50'000.00 140'000.00 240'000.00 360'000.00
##
## range sd vcoef mad IQR skew
## 990'000.00 129'747.66 0.77 133'434.00 190'000.00 0.99
##
## meanCI
## 166'016.06
## 168'952.59
##
## .95
## 430'000.00
##
## kurt
## 0.54
##
## lowest : 10'000 (493), 16'000 (2), 20'000 (1'976), 30'000 (1'610), 40'000 (230)
## highest: 750'000 (4), 760'000, 780'000 (2), 800'000 (2), 1'000'000
##
## heap(?): remarkable frequency (11.2%) for the mode(s) (= 50000)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 3 - SEX (integer - dichotomous)
##
## length n NAs unique
## 30'000 30'000 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 1 11'888 39.6% 39.1% 40.2%
## 2 18'112 60.4% 59.8% 60.9%
##
## ' 95%-CI (Wilson)
## ──────────────────────────────────────────────────────────────────────────────
## 4 - EDUCATION (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 7 14 1.85 1.84
## 100.0% 0.0% 0.0% 1.86
##
## .05 .10 .25 median .75 .90 .95
## 1.00 1.00 1.00 2.00 2.00 3.00 3.00
##
## range sd vcoef mad IQR skew kurt
## 6.00 0.79 0.43 1.48 1.00 0.97 2.08
##
##
## value freq perc cumfreq cumperc
## 1 0 14 0.0% 14 0.0%
## 2 1 10'585 35.3% 10'599 35.3%
## 3 2 14'030 46.8% 24'629 82.1%
## 4 3 4'917 16.4% 29'546 98.5%
## 5 4 123 0.4% 29'669 98.9%
## 6 5 280 0.9% 29'949 99.8%
## 7 6 51 0.2% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 5 - MARRIAGE (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 4 54 1.55 1.55
## 100.0% 0.0% 0.2% 1.56
##
## .05 .10 .25 median .75 .90 .95
## 1.00 1.00 1.00 2.00 2.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 3.00 0.52 0.34 0.00 1.00 -0.02 -1.36
##
##
## value freq perc cumfreq cumperc
## 1 0 54 0.2% 54 0.2%
## 2 1 13'659 45.5% 13'713 45.7%
## 3 2 15'964 53.2% 29'677 98.9%
## 4 3 323 1.1% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 6 - AGE (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 56 0 35.49 35.38
## 100.0% 0.0% 0.0% 35.59
##
## .05 .10 .25 median .75 .90 .95
## 23.00 25.00 28.00 34.00 41.00 49.00 53.00
##
## range sd vcoef mad IQR skew kurt
## 58.00 9.22 0.26 8.90 13.00 0.73 0.04
##
## lowest : 21 (67), 22 (560), 23 (931), 24 (1'127), 25 (1'186)
## highest: 72 (3), 73 (4), 74, 75 (3), 79
##
## heap(?): remarkable frequency (5.3%) for the mode(s) (= 29)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 7 - PAY_0 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 11 14'737 -0.02 -0.03
## 100.0% 0.0% 49.1% -0.00
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -1.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 10.00 1.12 -67.29 1.48 1.00 0.73 2.72
##
##
## value freq perc cumfreq cumperc
## 1 -2 2'759 9.2% 2'759 9.2%
## 2 -1 5'686 19.0% 8'445 28.1%
## 3 0 14'737 49.1% 23'182 77.3%
## 4 1 3'688 12.3% 26'870 89.6%
## 5 2 2'667 8.9% 29'537 98.5%
## 6 3 322 1.1% 29'859 99.5%
## 7 4 76 0.3% 29'935 99.8%
## 8 5 26 0.1% 29'961 99.9%
## 9 6 11 0.0% 29'972 99.9%
## 10 7 9 0.0% 29'981 99.9%
## 11 8 19 0.1% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 8 - PAY_2 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 11 15'730 -0.13 -0.15
## 100.0% 0.0% 52.4% -0.12
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 10.00 1.20 -8.95 0.00 1.00 0.79 1.57
##
##
## value freq perc cumfreq cumperc
## 1 -2 3'782 12.6% 3'782 12.6%
## 2 -1 6'050 20.2% 9'832 32.8%
## 3 0 15'730 52.4% 25'562 85.2%
## 4 1 28 0.1% 25'590 85.3%
## 5 2 3'927 13.1% 29'517 98.4%
## 6 3 326 1.1% 29'843 99.5%
## 7 4 99 0.3% 29'942 99.8%
## 8 5 25 0.1% 29'967 99.9%
## 9 6 12 0.0% 29'979 99.9%
## 10 7 20 0.1% 29'999 100.0%
## 11 8 1 0.0% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 9 - PAY_3 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 11 15'764 -0.17 -0.18
## 100.0% 0.0% 52.5% -0.15
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 10.00 1.20 -7.20 0.00 1.00 0.84 2.08
##
##
## value freq perc cumfreq cumperc
## 1 -2 4'085 13.6% 4'085 13.6%
## 2 -1 5'938 19.8% 10'023 33.4%
## 3 0 15'764 52.5% 25'787 86.0%
## 4 1 4 0.0% 25'791 86.0%
## 5 2 3'819 12.7% 29'610 98.7%
## 6 3 240 0.8% 29'850 99.5%
## 7 4 76 0.3% 29'926 99.8%
## 8 5 21 0.1% 29'947 99.8%
## 9 6 23 0.1% 29'970 99.9%
## 10 7 27 0.1% 29'997 100.0%
## 11 8 3 0.0% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 10 - PAY_4 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 11 16'455 -0.22 -0.23
## 100.0% 0.0% 54.9% -0.21
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 10.00 1.17 -5.30 0.00 1.00 1.00 3.50
##
##
## value freq perc cumfreq cumperc
## 1 -2 4'348 14.5% 4'348 14.5%
## 2 -1 5'687 19.0% 10'035 33.5%
## 3 0 16'455 54.9% 26'490 88.3%
## 4 1 2 0.0% 26'492 88.3%
## 5 2 3'159 10.5% 29'651 98.8%
## 6 3 180 0.6% 29'831 99.4%
## 7 4 69 0.2% 29'900 99.7%
## 8 5 35 0.1% 29'935 99.8%
## 9 6 5 0.0% 29'940 99.8%
## 10 7 58 0.2% 29'998 100.0%
## 11 8 2 0.0% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 11 - PAY_5 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 10 16'947 -0.27 -0.28
## 100.0% 0.0% 56.5% -0.25
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 0.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 10.00 1.13 -4.26 0.00 1.00 1.01 3.99
##
##
## value freq perc cumfreq cumperc
## 1 -2 4'546 15.2% 4'546 15.2%
## 2 -1 5'539 18.5% 10'085 33.6%
## 3 0 16'947 56.5% 27'032 90.1%
## 4 2 2'626 8.8% 29'658 98.9%
## 5 3 178 0.6% 29'836 99.5%
## 6 4 84 0.3% 29'920 99.7%
## 7 5 17 0.1% 29'937 99.8%
## 8 6 4 0.0% 29'941 99.8%
## 9 7 58 0.2% 29'999 100.0%
## 10 8 1 0.0% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 12 - PAY_6 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 10 16'286 -0.29 -0.30
## 100.0% 0.0% 54.3% -0.28
##
## .05 .10 .25 median .75 .90 .95
## -2.00 -2.00 -1.00 0.00 0.00 2.00 2.00
##
## range sd vcoef mad IQR skew kurt
## 10.00 1.15 -3.95 0.00 1.00 0.95 3.43
##
##
## value freq perc cumfreq cumperc
## 1 -2 4'895 16.3% 4'895 16.3%
## 2 -1 5'740 19.1% 10'635 35.4%
## 3 0 16'286 54.3% 26'921 89.7%
## 4 2 2'766 9.2% 29'687 99.0%
## 5 3 184 0.6% 29'871 99.6%
## 6 4 49 0.2% 29'920 99.7%
## 7 5 13 0.0% 29'933 99.8%
## 8 6 19 0.1% 29'952 99.8%
## 9 7 46 0.2% 29'998 100.0%
## 10 8 2 0.0% 30'000 100.0%
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 13 - BILL_AMT1 (integer)
##
## length n NAs unique 0s mean'
## 30'000 30'000 0 22'723 2'008 51'223.33
## 100.0% 0.0% 6.7%
##
## .05 .10 .25 median .75 .90
## 0.00 278.90 3'558.75 22'381.50 67'091.00 142'133.70
##
## range sd vcoef mad IQR skew
## 1'130'091.00 73'635.86 1.44 32'321.42 63'532.25 2.66
##
## meanCI
## 50'390.04
## 52'056.62
##
## .95
## 201'203.05
##
## kurt
## 9.80
##
## lowest : -165'580, -154'973, -15'308, -14'386, -11'545
## highest: 626'648, 630'458, 653'062, 746'814, 964'511
##
## heap(?): remarkable frequency (6.7%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 14 - BILL_AMT2 (integer)
##
## length n NAs unique 0s mean'
## 30'000 30'000 0 22'346 2'506 49'179.08
## 100.0% 0.0% 8.4%
##
## .05 .10 .25 median .75 .90
## 0.00 0.00 2'984.75 21'200.00 64'006.25 136'905.50
##
## range sd vcoef mad IQR skew
## 1'053'708.00 71'173.77 1.45 30'852.91 61'021.50 2.70
##
## meanCI
## 48'373.65
## 49'984.50
##
## .95
## 194'792.20
##
## kurt
## 10.30
##
## lowest : -69'777, -67'526, -33'350, -30'000, -26'214
## highest: 624'475, 646'770, 671'563, 743'970, 983'931
##
## heap(?): remarkable frequency (8.4%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 15 - BILL_AMT3 (integer)
##
## length n NAs unique 0s mean'
## 30'000 30'000 0 22'026 2'870 47'013.15
## 100.0% 0.0% 9.6%
##
## .05 .10 .25 median .75 .90
## 0.00 0.00 2'666.25 20'088.50 60'164.75 132'051.30
##
## range sd vcoef mad IQR skew
## 1'821'353.00 69'349.39 1.48 29'219.82 57'498.50 3.09
##
## meanCI
## 46'228.38
## 47'797.93
##
## .95
## 187'821.05
##
## kurt
## 19.78
##
## lowest : -157'264, -61'506, -46'127, -34'041, -25'443
## highest: 689'627, 689'643, 693'131, 855'086, 1'664'089
##
## heap(?): remarkable frequency (9.6%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 16 - BILL_AMT4 (integer)
##
## length n NAs unique 0s mean'
## 30'000 30'000 0 21'548 3'195 43'262.95
## 100.0% 0.0% 10.7%
##
## .05 .10 .25 median .75 .90
## 0.00 0.00 2'326.75 19'052.00 54'506.00 122'418.70
##
## range sd vcoef mad IQR skew
## 1'061'586.00 64'332.86 1.49 27'659.39 52'179.25 2.82
##
## meanCI
## 42'534.94
## 43'990.96
##
## .95
## 174'333.35
##
## kurt
## 11.31
##
## lowest : -170'000, -81'334, -65'167, -50'616, -46'627
## highest: 572'805, 616'836, 628'699, 706'864, 891'586
##
## heap(?): remarkable frequency (10.7%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 17 - BILL_AMT5 (integer)
##
## length n NAs unique 0s mean'
## 30'000 30'000 0 21'010 3'506 40'311.40
## 100.0% 0.0% 11.7%
##
## .05 .10 .25 median .75 .90
## 0.00 0.00 1'763.00 18'104.50 50'190.50 115'883.00
##
## range sd vcoef mad IQR skew
## 1'008'505.00 60'797.16 1.51 26'224.97 48'427.50 2.88
##
## meanCI
## 39'623.40
## 40'999.40
##
## .95
## 165'794.30
##
## kurt
## 12.30
##
## lowest : -81'334, -61'372, -53'007, -46'627, -37'594
## highest: 547'880, 551'702, 587'067, 823'540, 927'171
##
## heap(?): remarkable frequency (11.7%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 18 - BILL_AMT6 (integer)
##
## length n NAs unique 0s mean'
## 30'000 30'000 0 20'604 4'020 38'871.76
## 100.0% 0.0% 13.4%
##
## .05 .10 .25 median .75 .90
## 0.00 0.00 1'256.00 17'071.00 49'198.25 112'110.40
##
## range sd vcoef mad IQR skew
## 1'301'267.00 59'554.11 1.53 24'840.96 47'942.25 2.85
##
## meanCI
## 38'197.83
## 39'545.69
##
## .95
## 161'912.00
##
## kurt
## 12.27
##
## lowest : -339'603, -209'051, -150'953, -94'625, -73'895
## highest: 527'566, 527'711, 568'638, 699'944, 961'664
##
## heap(?): remarkable frequency (13.4%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 19 - PAY_AMT1 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 7'943 5'249 5'663.58 5'476.15
## 100.0% 0.0% 17.5% 5'851.02
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 1'000.00 2'100.00 5'006.00 10'300.00 18'428.20
##
## range sd vcoef mad IQR skew kurt
## 873'552.00 16'563.28 2.92 2'864.38 4'006.00 14.67 415.16
##
## lowest : 0 (5'249), 1 (9), 2 (14), 3 (15), 4 (18)
## highest: 405'016, 423'903, 493'358, 505'000, 873'552
##
## heap(?): remarkable frequency (17.5%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 20 - PAY_AMT2 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 7'899 5'396 5'921.16 5'660.43
## 100.0% 0.0% 18.0% 6'181.90
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 833.00 2'009.00 5'000.00 10'401.10 19'004.35
##
## range sd vcoef mad IQR skew kurt
## 1'684'259.00 23'040.87 3.89 2'951.86 4'167.00 30.45 1'641.25
##
## lowest : 0 (5'396), 1 (15), 2 (20), 3 (18), 4 (11)
## highest: 580'464, 1'024'516, 1'215'471, 1'227'082, 1'684'259
##
## heap(?): remarkable frequency (18.0%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 21 - PAY_AMT3 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 7'518 5'968 5'225.68 5'026.44
## 100.0% 0.0% 19.9% 5'424.93
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 390.00 1'800.00 4'505.00 10'000.00 17'589.40
##
## range sd vcoef mad IQR skew kurt
## 896'040.00 17'606.96 3.37 2'661.27 4'115.00 17.21 564.18
##
## lowest : 0 (5'968), 1 (13), 2 (19), 3 (14), 4 (15)
## highest: 400'972, 417'588, 508'229, 889'043, 896'040
##
## heap(?): remarkable frequency (19.9%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 22 - PAY_AMT4 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 6'937 6'408 4'826.08 4'648.79
## 100.0% 0.0% 21.4% 5'003.36
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 296.00 1'500.00 4'013.25 9'570.60 16'014.95
##
## range sd vcoef mad IQR skew kurt
## 621'000.00 15'666.16 3.25 2'223.90 3'717.25 12.90 277.27
##
## lowest : 0 (6'408), 1 (22), 2 (22), 3 (13), 4 (20)
## highest: 400'046, 432'130, 497'000, 528'897, 621'000
##
## heap(?): remarkable frequency (21.4%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 23 - PAY_AMT5 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 6'897 6'703 4'799.39 4'626.49
## 100.0% 0.0% 22.3% 4'972.28
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 252.50 1'500.00 4'031.50 9'500.00 16'000.00
##
## range sd vcoef mad IQR skew kurt
## 426'529.00 15'278.31 3.18 2'223.90 3'779.00 11.13 180.02
##
## lowest : 0 (6'703), 1 (21), 2 (13), 3 (13), 4 (12)
## highest: 332'000, 379'267, 388'071, 417'990, 426'529
##
## heap(?): remarkable frequency (22.3%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 24 - PAY_AMT6 (integer)
##
## length n NAs unique 0s mean meanCI'
## 30'000 30'000 0 6'939 7'173 5'215.50 5'014.33
## 100.0% 0.0% 23.9% 5'416.68
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 117.75 1'500.00 4'000.00 9'600.00 17'343.80
##
## range sd vcoef mad IQR skew kurt
## 528'666.00 17'777.47 3.41 2'223.90 3'882.25 10.64 167.12
##
## lowest : 0 (7'173), 1 (20), 2 (9), 3 (14), 4 (12)
## highest: 403'500, 422'000, 443'001, 527'143, 528'666
##
## heap(?): remarkable frequency (23.9%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ──────────────────────────────────────────────────────────────────────────────
## 25 - default.payment.next.month (integer - dichotomous)
##
## length n NAs unique
## 30'000 30'000 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 23'364 77.9% 77.4% 78.3%
## 1 6'636 22.1% 21.7% 22.6%
##
## ' 95%-CI (Wilson)
## Change integer to factor
credit$EDUCATION <- ifelse(credit$EDUCATION %in% c(0, 4:6), 0, credit$EDUCATION)
credit <- credit |>
mutate(
SEXF = factor(SEX, levels = c(1, 2), labels = c("male", "female")),
EDUCATIONF = factor(EDUCATION, levels = c(0, 1, 2, 3), labels = c("others", "graduate school", "high school", "university")),
MARRIAGEF = factor(MARRIAGE, levels = c(0, 1, 2, 3), labels = c("others", "married", "single", "divorce"))
)
pay_vars <- grep("^PAY_[0-6]$", names(credit), value = TRUE)
for (var in pay_vars) {
credit[[paste0(var,"F")]] <- factor(
credit[[var]],
levels = c(-2,-1,0,1:9),
labels = c("no consumption","fully paid","revolving",
paste0(1:9,"d"))
)
}
ggplot(credit, aes(x = EDUCATIONF, fill = factor(default.payment.next.month))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
scale_fill_manual(
values = c("0" = "#D5DBB3", "1" = "#BCAAA4"),
labels = c("No Default", "Default")
) +
labs(
title = "Default Rate by Education Level",
x = "Education",
y = "Proportion",
fill = "Default Status"
) +
theme_minimal()
##### The university group exhibits the highest default rate among all
education levels, followed by high school and graduate school.
ggplot(credit, aes(x = SEXF, fill = factor(default.payment.next.month))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
scale_fill_manual(
values = c("0" = "#D5DBB3", "1" = "#BCAAA4"),
labels = c("No Default", "Default")
) +
labs(
title = "Default Rate by Gender",
x = "Gender",
y = "Proportion",
fill = "Default Status"
) +
theme_minimal()
##### A similar tendency toward gender.
ggplot(credit, aes(x = MARRIAGEF, fill = factor(default.payment.next.month))) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
scale_fill_manual(
values = c("0" = "#D5DBB3", "1" = "#BCAAA4"),
labels = c("No Default", "Default")
) +
labs(
title = "Default Rate by Marital Status",
x = "Marital Status",
y = "Proportion",
fill = "Default Status"
) +
theme_minimal()
##### The chart illustrates the default distribution across marital
status groups. Individuals categorized as divorced exhibit the highest
default rate, followed by married and single clients. The ‘others’ group
shows the lowest proportion of default, though this may be influenced by
its undefined classification.
pay_status <- credit %>%
select(ID = 1, matches("PAY_[0-6]F")) %>%
reshape2::melt(id.vars = "ID", variable.name = "Period", value.name = "Status")
ggplot(pay_status, aes(x = Status)) +
geom_bar(fill = "#BCAAA4") +
facet_wrap(~ Period, scales = "free") +
labs(title = "Payment Status Distribution over 7 Periods", x = "Status", y = "Count") +
theme_minimal()
##### The charts display the distribution of payment statuses over the
past seven periods (PAY_0F to PAY_6F). The majority of users
consistently fall into either the ‘fully paid’ or ‘revolving’
categories, indicating stable repayment behavior. Delinquency rates
beyond one or two months are relatively rare, while the number of users
with ‘no consumption’ remains minimal across all periods.
columns <- c(
"SEXF","EDUCATIONF","MARRIAGEF","LIMIT_BAL",
paste0(pay_vars,"F"),
paste0("BILL_AMT",1:6), paste0("PAY_AMT",1:6),
"default.payment.next.month"
)
columns <- intersect(columns, names(credit))
base <- credit[, columns]
base_numeric <- base[, sapply(base, is.numeric)]
base_factor <- base[, sapply(base, is.factor)]
# Set target variable
base_numeric$def <- base$`default.payment.next.month`
base_numeric$def_woe <- 1 - base_numeric$def
p1 <- ggplot(base_numeric, aes(x = factor(def))) +
geom_bar(fill = c("#D5DBB3", "#BCAAA4")) +
labs(
title = "Default Variable Distribution",
x = "def (0 = No Default, 1 = Default)",
y = "Count"
) +
theme_minimal()
p2 <- ggplot(base_numeric, aes(x = factor(def_woe))) +
geom_bar(fill = c("#BCAAA4", "#D5DBB3")) +
labs(
title = "def_woe (for smbinning)",
x = "def_woe (0 = Default, 1 = No Default)",
y = "Count"
) +
theme_minimal()
gridExtra::grid.arrange(p1, p2, ncol = 2)
##### The figure illustrates the distribution of the target variable
used for credit default prediction. The original variable (def) reveals
a class imbalance, with approximately 22% of clients defaulting on their
next payment. To comply with the requirements of the WoE-based binning
procedure, a transformed variable (def_woe) is created, where 0
indicates ‘bad’ (default) and 1 indicates ‘good’ (no default). This
transformation is essential for scorecard modeling and IV analysis.
vars_to_log <- c("LIMIT_BAL", paste0("BILL_AMT",1:6), paste0("PAY_AMT",1:6))
vars_to_log <- intersect(vars_to_log, names(base_numeric))
base_numeric[vars_to_log] <- lapply(base_numeric[vars_to_log], log1p)
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
percentile <- apply(base_numeric, 2, function(x) quantile(x, seq(0.1,1,0.1), na.rm = TRUE) |> unique() |> round(2))
unique_vals <- apply(base_numeric, 2, function(x) length(unique(x)))
vars_to_bin <- names(unique_vals[unique_vals >= 10])
vars_as_factor <- names(unique_vals[unique_vals < 10 & unique_vals > 1])
unique_vals <- sapply(base_numeric, function(x) length(unique(x)))
vars_to_bin <- names(unique_vals[unique_vals >= 10])
vars_as_factor <- names(unique_vals[unique_vals < 10 & unique_vals > 1])
## continuous variables
for (m in vars_to_bin) {
cuts <- unique(quantile(base_numeric[[m]],
probs = seq(0.1, 1, 0.1),
na.rm = TRUE))
if (length(cuts) < 2) next
base_numeric[[paste0(m, "_fine")]] <- cut(
base_numeric[[m]],
breaks = c(-Inf, cuts),
labels = paste0("≤", cuts),
include.lowest = TRUE
)
}
for (m in vars_as_factor) {
base_numeric[[paste0(m, "_fine")]] <- factor(base_numeric[[m]])
}
bill_vars <- paste0("BILL_AMT", 1:6)
base_numeric[bill_vars] <- lapply(base_numeric[bill_vars], function(x){
x <- ifelse(x < 0, 0, x)
log1p(x)
})
for (m in bill_vars){
cuts <- unique(quantile(base_numeric[[m]], probs = seq(0.1,1,0.1), na.rm = TRUE))
if (length(cuts) < 2) next
base_numeric[[paste0(m,"_fine")]] <- cut(
base_numeric[[m]],
breaks = c(-Inf, cuts),
labels = paste0("≤", cuts),
include.lowest = TRUE
)
}
smbinning.eda(base_numeric, rounding = 3, pbar = 1)
##
## | | | 0% | |-- | 3% | |--- | 6% | |----- | 9% | |------ | 12% | |-------- | 16% | |--------- | 19% | |----------- | 22% | |------------ | 25% | |-------------- | 28% | |---------------- | 31% | |----------------- | 34% | |------------------- | 38% | |-------------------- | 41% | |---------------------- | 44% | |----------------------- | 47% | |------------------------- | 50% | |--------------------------- | 53% | |---------------------------- | 56% | |------------------------------ | 59% | |------------------------------- | 62% | |--------------------------------- | 66% | |---------------------------------- | 69% | |------------------------------------ | 72% | |-------------------------------------- | 75% | |--------------------------------------- | 78% | |----------------------------------------- | 81% | |------------------------------------------ | 84% | |-------------------------------------------- | 88% | |--------------------------------------------- | 91% | |----------------------------------------------- | 94% | |------------------------------------------------ | 97% | |--------------------------------------------------| 100%
## $eda
## Field Type Recs Miss Unique Min Q25 Q50
## 1 LIMIT_BAL Num/Int 30000 0 81 9.21 10.820 11.849
## 2 BILL_AMT1 Num/Int 30000 574 22417 0.00 2.234 2.405
## 3 BILL_AMT2 Num/Int 30000 651 21999 0.00 2.221 2.401
## 4 BILL_AMT3 Num/Int 30000 631 21662 0.00 2.211 2.394
## 5 BILL_AMT4 Num/Int 30000 650 21171 0.00 2.194 2.387
## 6 BILL_AMT5 Num/Int 30000 631 20626 0.00 2.166 2.383
## 7 BILL_AMT6 Num/Int 30000 663 20190 0.00 2.128 2.379
## 8 PAY_AMT1 Num/Int 30000 0 7943 0.00 6.909 7.650
## 9 PAY_AMT2 Num/Int 30000 0 7899 0.00 6.726 7.606
## 10 PAY_AMT3 Num/Int 30000 0 7518 0.00 5.969 7.496
## 11 PAY_AMT4 Num/Int 30000 0 6937 0.00 5.694 7.314
## 12 PAY_AMT5 Num/Int 30000 0 6897 0.00 5.535 7.314
## 13 PAY_AMT6 Num/Int 30000 0 6939 0.00 4.777 7.314
## 14 default.payment.next.month Num/Int 30000 0 2 0.00 0.000 0.000
## 15 def Num/Int 30000 0 2 0.00 0.000 0.000
## 16 def_woe Num/Int 30000 0 2 0.00 1.000 1.000
## 17 LIMIT_BAL_fine Factor 30000 0 10 NA NA NA
## 18 BILL_AMT1_fine Factor 30000 574 10 NA NA NA
## 19 BILL_AMT2_fine Factor 30000 651 10 NA NA NA
## 20 BILL_AMT3_fine Factor 30000 631 10 NA NA NA
## 21 BILL_AMT4_fine Factor 30000 650 10 NA NA NA
## 22 BILL_AMT5_fine Factor 30000 631 10 NA NA NA
## 23 BILL_AMT6_fine Factor 30000 663 10 NA NA NA
## 24 PAY_AMT1_fine Factor 30000 0 10 NA NA NA
## 25 PAY_AMT2_fine Factor 30000 0 10 NA NA NA
## 26 PAY_AMT3_fine Factor 30000 0 10 NA NA NA
## 27 PAY_AMT4_fine Factor 30000 0 9 NA NA NA
## 28 PAY_AMT5_fine Factor 30000 0 9 NA NA NA
## 29 PAY_AMT6_fine Factor 30000 0 9 NA NA NA
## 30 default.payment.next.month_fine Factor 30000 0 2 NA NA NA
## 31 def_fine Factor 30000 0 2 NA NA NA
## 32 def_woe_fine Factor 30000 0 2 NA NA NA
## Avg Q75 Max StDv Neg Zero Pos OutLo OutHi
## 1 11.663 12.388 13.816 0.941 0 0 30000 0 0
## 2 2.210 2.496 2.693 0.625 0 2024 27402 2283 0
## 3 2.169 2.492 2.695 0.687 0 2524 26825 2724 0
## 4 2.138 2.488 2.729 0.727 0 2894 26475 3035 0
## 5 2.108 2.480 2.688 0.758 0 3220 26130 3308 0
## 6 2.076 2.472 2.691 0.786 0 3530 25839 3590 0
## 7 2.032 2.470 2.693 0.830 0 4045 25292 4072 0
## 8 6.630 8.519 13.680 3.250 0 5249 24751 5498 386
## 9 6.563 8.517 14.337 3.279 0 5396 24604 5638 314
## 10 6.283 8.413 13.706 3.350 0 5968 24032 6089 49
## 11 6.078 8.298 13.339 3.397 0 6408 23592 6485 33
## 12 6.032 8.302 12.963 3.444 0 6703 23297 6737 17
## 13 5.933 8.294 13.178 3.528 0 7173 22827 0 0
## 14 0.221 0.000 1.000 0.415 0 23364 6636 0 6636
## 15 0.221 0.000 1.000 0.415 0 23364 6636 0 6636
## 16 0.779 1.000 1.000 0.415 0 6636 23364 6636 0
## 17 NA NA NA NA NA NA NA NA NA
## 18 NA NA NA NA NA NA NA NA NA
## 19 NA NA NA NA NA NA NA NA NA
## 20 NA NA NA NA NA NA NA NA NA
## 21 NA NA NA NA NA NA NA NA NA
## 22 NA NA NA NA NA NA NA NA NA
## 23 NA NA NA NA NA NA NA NA NA
## 24 NA NA NA NA NA NA NA NA NA
## 25 NA NA NA NA NA NA NA NA NA
## 26 NA NA NA NA NA NA NA NA NA
## 27 NA NA NA NA NA NA NA NA NA
## 28 NA NA NA NA NA NA NA NA NA
## 29 NA NA NA NA NA NA NA NA NA
## 30 NA NA NA NA NA NA NA NA NA
## 31 NA NA NA NA NA NA NA NA NA
## 32 NA NA NA NA NA NA NA NA NA
##
## $edapct
## Field Type Recs Miss Neg Zero Pos OutLo
## 1 LIMIT_BAL Num/Int 30000 0.000 0 0.000 1.000 0.000
## 2 BILL_AMT1 Num/Int 30000 0.019 0 0.067 0.913 0.076
## 3 BILL_AMT2 Num/Int 30000 0.022 0 0.084 0.894 0.091
## 4 BILL_AMT3 Num/Int 30000 0.021 0 0.096 0.882 0.101
## 5 BILL_AMT4 Num/Int 30000 0.022 0 0.107 0.871 0.110
## 6 BILL_AMT5 Num/Int 30000 0.021 0 0.118 0.861 0.120
## 7 BILL_AMT6 Num/Int 30000 0.022 0 0.135 0.843 0.136
## 8 PAY_AMT1 Num/Int 30000 0.000 0 0.175 0.825 0.183
## 9 PAY_AMT2 Num/Int 30000 0.000 0 0.180 0.820 0.188
## 10 PAY_AMT3 Num/Int 30000 0.000 0 0.199 0.801 0.203
## 11 PAY_AMT4 Num/Int 30000 0.000 0 0.214 0.786 0.216
## 12 PAY_AMT5 Num/Int 30000 0.000 0 0.223 0.777 0.225
## 13 PAY_AMT6 Num/Int 30000 0.000 0 0.239 0.761 0.000
## 14 default.payment.next.month Num/Int 30000 0.000 0 0.779 0.221 0.000
## 15 def Num/Int 30000 0.000 0 0.779 0.221 0.000
## 16 def_woe Num/Int 30000 0.000 0 0.221 0.779 0.221
## 17 LIMIT_BAL_fine Factor 30000 0.000 NA NA NA NA
## 18 BILL_AMT1_fine Factor 30000 0.019 NA NA NA NA
## 19 BILL_AMT2_fine Factor 30000 0.022 NA NA NA NA
## 20 BILL_AMT3_fine Factor 30000 0.021 NA NA NA NA
## 21 BILL_AMT4_fine Factor 30000 0.022 NA NA NA NA
## 22 BILL_AMT5_fine Factor 30000 0.021 NA NA NA NA
## 23 BILL_AMT6_fine Factor 30000 0.022 NA NA NA NA
## 24 PAY_AMT1_fine Factor 30000 0.000 NA NA NA NA
## 25 PAY_AMT2_fine Factor 30000 0.000 NA NA NA NA
## 26 PAY_AMT3_fine Factor 30000 0.000 NA NA NA NA
## 27 PAY_AMT4_fine Factor 30000 0.000 NA NA NA NA
## 28 PAY_AMT5_fine Factor 30000 0.000 NA NA NA NA
## 29 PAY_AMT6_fine Factor 30000 0.000 NA NA NA NA
## 30 default.payment.next.month_fine Factor 30000 0.000 NA NA NA NA
## 31 def_fine Factor 30000 0.000 NA NA NA NA
## 32 def_woe_fine Factor 30000 0.000 NA NA NA NA
## OutHi
## 1 0.000
## 2 0.000
## 3 0.000
## 4 0.000
## 5 0.000
## 6 0.000
## 7 0.000
## 8 0.013
## 9 0.010
## 10 0.002
## 11 0.001
## 12 0.001
## 13 0.000
## 14 0.221
## 15 0.221
## 16 0.000
## 17 NA
## 18 NA
## 19 NA
## 20 NA
## 21 NA
## 22 NA
## 23 NA
## 24 NA
## 25 NA
## 26 NA
## 27 NA
## 28 NA
## 29 NA
## 30 NA
## 31 NA
## 32 NA
numeric_vars <- base_numeric %>%
summarise(across(everything(), ~n_distinct(.))) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "distinct_count") %>%
filter(distinct_count > 10) %>%
pull(variable)
for (v in numeric_vars) {
print(
ggplot(base_numeric, aes(x = as.numeric(.data[[v]]))) +
geom_histogram(fill = "#D5DBB3", bins = 50) +
labs(title = paste("Distribution of", v), x = v) +
theme_minimal()
)
}
## Warning: Removed 574 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 651 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 650 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 663 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 574 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 651 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 650 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 631 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 663 rows containing non-finite outside the scale range
## (`stat_bin()`).
pdf("WoE_numeric.pdf", paper = "a4")
IV <- data.frame(VAR = character(), IV = numeric())
WOE <- list()
error_vars <- c()
success_vars <- c()
vars_fine <- grep("_fine$", names(base_numeric), value = TRUE)
vars_loop <- setdiff(names(base_numeric), c(vars_fine, "def", "def_woe", "default.payment.next.month"))
pb <- txtProgressBar(min = 0, max = length(vars_loop), style = 3)
## | | | 0%
for (v in vars_loop) {
par(mfrow = c(2,2))
cuts <- unique(quantile(base_numeric[[v]],
probs = seq(0.1, 1, 0.1),
na.rm = TRUE))
if (length(cuts) < 2) { error_vars <- c(error_vars, v); next }
res <- try(
smbinning.custom(df = base_numeric, y = "def_woe", x = v, cuts = cuts),
silent = TRUE
)
if (!inherits(res, "try-error") && is.list(res)) {
success_vars <- c(success_vars, v)
boxplot(base_numeric[[v]] ~ base_numeric$def, horizontal = TRUE, col = "lightgray",
frame = FALSE, main = "Distribution"); mtext(v,3)
smbinning.plot(res, option = "dist", sub = v)
smbinning.plot(res, option = "badrate", sub = v)
smbinning.plot(res, option = "WoE", sub = v)
iv_val <- res$ivtable[res$ivtable$Cutpoint == "Total", "IV"]
IV <- rbind(IV, data.frame(VAR = v, IV = as.numeric(iv_val)))
w <- res$ivtable |>
dplyr::filter(Cutpoint != "Total") |>
dplyr::select(Cutpoint, WoE, PctRec) |>
dplyr::arrange(WoE)
w$numer <- 11:(nrow(w) + 10)
WOE[[v]] <- w
} else {
error_vars <- c(error_vars, v)
}
setTxtProgressBar(pb, which(vars_loop == v))
}
## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2
## | |===== | 8%
## | |=========== | 15%
## | |================ | 23%
## | |====================== | 31%
## | |=========================== | 38%
## | |================================ | 46%
## | |====================================== | 54%
## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2
## | |=========================================== | 62%
## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2
## | |================================================ | 69%
## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2
## | |====================================================== | 77%
## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2
## | |=========================================================== | 85%
## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2
## | |================================================================= | 92%
## Warning in rbind(deparse.level, ...): number of columns of result, 14, is not a
## multiple of vector length 13 of arg 2
## | |======================================================================| 100%
close(pb)
dev.off()
## quartz_off_screen
## 2
cat("Success:", length(success_vars), "\n")
## Success: 13
cat("Fail:", paste(error_vars, collapse = ", "), "\n")
## Fail:
IV <- IV[order(-IV$IV), ]
print(IV)
## VAR IV
## 1 LIMIT_BAL 0.1782
## 8 PAY_AMT1 0.1717
## 9 PAY_AMT2 0.1495
## 10 PAY_AMT3 0.1228
## 11 PAY_AMT4 0.1067
## 13 PAY_AMT6 0.0947
## 12 PAY_AMT5 0.0879
## 6 BILL_AMT5 0.0165
## 7 BILL_AMT6 0.0164
## 5 BILL_AMT4 0.0155
## 2 BILL_AMT1 0.0144
## 3 BILL_AMT2 0.0118
## 4 BILL_AMT3 0.0106
IV <- IV[order(-IV$IV), ]
selected <- IV$VAR[IV$IV > 0.02]
print(IV)
## VAR IV
## 1 LIMIT_BAL 0.1782
## 8 PAY_AMT1 0.1717
## 9 PAY_AMT2 0.1495
## 10 PAY_AMT3 0.1228
## 11 PAY_AMT4 0.1067
## 13 PAY_AMT6 0.0947
## 12 PAY_AMT5 0.0879
## 6 BILL_AMT5 0.0165
## 7 BILL_AMT6 0.0164
## 5 BILL_AMT4 0.0155
## 2 BILL_AMT1 0.0144
## 3 BILL_AMT2 0.0118
## 4 BILL_AMT3 0.0106
length(selected)
## [1] 7
# WOE encoded
bins <- woebin(
dt = credit,
y = "default.payment.next.month",
x = selected,
positive = "1"
)
## ℹ Creating woe binning ...
## ✔ Binning on 30000 rows and 8 columns in 00:00:05
dt_woe <- woebin_ply(credit, bins)
## ℹ Converting into woe values ...
## ✔ Woe transformating on 30000 rows and 7 columns in 00:00:11
head(dt_woe[, c(paste0(selected, "_woe"), "default.payment.next.month")])
## [1] "LIMIT_BAL_woe" "PAY_AMT1_woe" "PAY_AMT2_woe" "PAY_AMT3_woe"
## [5] "PAY_AMT4_woe" "PAY_AMT6_woe"
set.seed(2025)
idx <- sample(seq_len(nrow(dt_woe)), 0.7*nrow(dt_woe))
train <- dt_woe[idx, ]
test <- dt_woe[-idx, ]
form <- as.formula(
paste("default.payment.next.month ~",
paste(paste0(selected, "_woe"), collapse = " + "))
)
#
features <- paste0(selected, "_woe")
evaluate_model <- function(actual, predicted_prob, cutoff = 0.5, model_name = "") {
predicted <- ifelse(predicted_prob >= cutoff, 1, 0)
cm <- confusionMatrix(factor(predicted), factor(actual), positive = "1")
auc_val <- auc(actual, predicted_prob)
f1_val <- 2 * cm$byClass["Sensitivity"] * cm$byClass["Precision"] /
(cm$byClass["Sensitivity"] + cm$byClass["Precision"])
data.frame(
Model = model_name,
AUC = round(auc_val, 4),
Accuracy = round(cm$overall["Accuracy"], 4),
Recall = round(cm$byClass["Sensitivity"], 4),
F1 = round(f1_val, 4)
)
}
form <- as.formula(paste("default.payment.next.month ~", paste(features, collapse = " + ")))
model_glm <- glm(form, data = train, family = binomial)
prob_glm <- predict(model_glm, newdata = test, type = "response")
res_glm <- evaluate_model(test$default.payment.next.month, prob_glm, model_name = "GLM")
## Warning in confusionMatrix.default(factor(predicted), factor(actual), positive
## = "1"): Levels are not in the same order for reference and data. Refactoring
## data to match.
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
train$default.payment.next.month <- as.factor(train$default.payment.next.month)
test$default.payment.next.month <- as.factor(test$default.payment.next.month)
model_rf <- randomForest(formula = form, data = train, ntree = 100)
prob_rf <- predict(model_rf, newdata = test, type = "prob")[, "1"]
res_rf <- evaluate_model(test$default.payment.next.month, prob_rf, model_name = "Random Forest")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
y_train <- as.numeric(as.character(train$default.payment.next.month))
y_test <- as.numeric(as.character(test$default.payment.next.month))
train_matrix <- xgb.DMatrix(data = as.matrix(train[, ..features]), label = y_train)
test_matrix <- xgb.DMatrix(data = as.matrix(test[, ..features]), label = y_test)
model_xgb <- xgboost(
data = train_matrix,
nrounds = 100,
objective = "binary:logistic",
verbose = 0
)
prob_xgb <- predict(model_xgb, newdata = test_matrix)
res_xgb <- evaluate_model(y_test, prob_xgb, model_name = "XGBoost")
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
results <- rbind(res_glm, res_rf, res_xgb)
datatable(results, caption = "Model Performance")
card <- scorecard(bins, model_glm)
score <- scorecard_ply(credit, card)
head(score)
## score
## <num>
## 1: 408
## 2: 441
## 3: 462
## 4: 469
## 5: 527
## 6: 465
label_glm <- as.numeric(as.character(test$default.payment.next.month))
# KS plot
library(scorecard)
perf_eva(
pred = prob_glm,
label = label_glm,
title = "GLM",
show_plot = TRUE
)
## $binomial_metric
## $binomial_metric$GLM
## MSE RMSE LogLoss R2 KS AUC Gini
## <num> <num> <num> <num> <num> <num> <num>
## 1: 0.1642983 0.4053373 0.5046303 0.05279292 0.2409348 0.6567201 0.3134402
gini_glm <- 2 * auc(label_glm, prob_glm) - 1
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
print(gini_glm)
## [1] 0.3134402
label_glm <- as.numeric(as.character(test$default.payment.next.month))
roc_obj <- roc(label_glm, prob_glm)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
opt_cut <- coords(roc_obj, "best", ret = "threshold", best.method = "youden")
print(opt_cut)
## threshold
## 1 0.2328704
label_glm <- as.numeric(as.character(test$default.payment.next.month))
thresholds <- seq(0, 1, by = 0.01)
metrics <- data.frame(
Threshold = thresholds,
Precision = NA,
Recall = NA,
F1 = NA
)
for (i in seq_along(thresholds)) {
t <- thresholds[i]
pred_class <- ifelse(prob_glm >= t, 1, 0)
cm <- confusionMatrix(factor(pred_class), factor(label_glm), positive = "1")
prec <- cm$byClass["Precision"]
rec <- cm$byClass["Sensitivity"]
f1 <- ifelse(prec + rec == 0, 0, 2 * prec * rec / (prec + rec))
metrics[i, 2:4] <- c(prec, rec, f1)
}
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Warning in confusionMatrix.default(factor(pred_class), factor(label_glm), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
metrics_long <- melt(metrics, id.vars = "Threshold")
ggplot(metrics_long, aes(x = Threshold, y = value, color = variable)) +
geom_line(size = 1) +
labs(
title = "Precision / Recall / F1 vs. Cut-off Threshold",
x = "Threshold",
y = "Metric Value",
color = "Metric"
) +
scale_color_manual(values = c(
"Precision" = "#D5DBB3",
"Recall" = "#BCAAA4",
"F1" = "#5D7290"
)) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 102 rows containing missing values or values outside the scale range
## (`geom_line()`).