CREDIT SCORING
# Load some packages for data manipulation:
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(readxl)
I. Statistical summary
Import the data, then summarize it with descriptive statistics:
# Import data:
### Import data from a CSV file
setwd("C:/Users/PC/Downloads")
data <- read.csv("credit_risk_dataset.csv")
### Descriptive statistic
summary(data)
## person_age person_income person_home_ownership person_emp_length
## Min. : 20.00 Min. : 4000 Length:32581 Min. : 0.00
## 1st Qu.: 23.00 1st Qu.: 38500 Class :character 1st Qu.: 2.00
## Median : 26.00 Median : 55000 Mode :character Median : 4.00
## Mean : 27.73 Mean : 66075 Mean : 4.79
## 3rd Qu.: 30.00 3rd Qu.: 79200 3rd Qu.: 7.00
## Max. :144.00 Max. :6000000 Max. :123.00
## NA's :895
## loan_intent loan_grade loan_amnt loan_int_rate
## Length:32581 Length:32581 Min. : 500 Min. : 5.42
## Class :character Class :character 1st Qu.: 5000 1st Qu.: 7.90
## Mode :character Mode :character Median : 8000 Median :10.99
## Mean : 9589 Mean :11.01
## 3rd Qu.:12200 3rd Qu.:13.47
## Max. :35000 Max. :23.22
## NA's :3116
## loan_status loan_percent_income cb_person_default_on_file
## Min. :0.0000 Min. :0.0000 Length:32581
## 1st Qu.:0.0000 1st Qu.:0.0900 Class :character
## Median :0.0000 Median :0.1500 Mode :character
## Mean :0.2182 Mean :0.1702
## 3rd Qu.:0.0000 3rd Qu.:0.2300
## Max. :1.0000 Max. :0.8300
##
## cb_person_cred_hist_length
## Min. : 2.000
## 1st Qu.: 3.000
## Median : 4.000
## Mean : 5.804
## 3rd Qu.: 8.000
## Max. :30.000
##
Rename all variables in data:
names(data) <- c("age","income","ownership","employ_length","loan_intent","loan_grade",
                 "loan_amount","interest_rate",
                 "BAD","per_income","hist_default","credit_length")
table(data$BAD)
##
## 0 1
## 25473 7108
Convert the qualitative variables to factors:
data <- data %>%
mutate(across(
c("ownership",
"loan_intent",
"loan_grade",
"hist_default",
"credit_length"), as.factor))Check missing value:
# Check NA
# sapply(data, function(x) sum(is.na(x)))
colSums(is.na(data))
## age income ownership employ_length loan_intent
## 0 0 0 895 0
## loan_grade loan_amount interest_rate BAD per_income
## 0 0 3116 0 0
## hist_default credit_length
## 0 0
Option 1: we can simply drop all rows that contain NA values.
### drop them with na.omit()
data.rm <- na.omit(data) # delete missing value
summary(data.rm)
## age income ownership employ_length
## Min. : 20.00 Min. : 4000 MORTGAGE:11801 Min. : 0.000
## 1st Qu.: 23.00 1st Qu.: 39480 OTHER : 94 1st Qu.: 2.000
## Median : 26.00 Median : 55956 OWN : 2192 Median : 4.000
## Mean : 27.73 Mean : 66649 RENT :14551 Mean : 4.789
## 3rd Qu.: 30.00 3rd Qu.: 80000 3rd Qu.: 7.000
## Max. :144.00 Max. :6000000 Max. :123.000
##
## loan_intent loan_grade loan_amount interest_rate
## DEBTCONSOLIDATION:4565 A:9402 Min. : 500 Min. : 5.42
## EDUCATION :5704 B:9151 1st Qu.: 5000 1st Qu.: 7.90
## HOMEIMPROVEMENT :3198 C:5699 Median : 8000 Median :10.99
## MEDICAL :5293 D:3248 Mean : 9656 Mean :11.04
## PERSONAL :4877 E: 870 3rd Qu.:12500 3rd Qu.:13.48
## VENTURE :5001 F: 209 Max. :35000 Max. :23.22
## G: 59
## BAD per_income hist_default credit_length
## Min. :0.0000 Min. :0.0000 N:23535 3 :5233
## 1st Qu.:0.0000 1st Qu.:0.0900 Y: 5103 4 :5228
## Median :0.0000 Median :0.1500 2 :5224
## Mean :0.2166 Mean :0.1695 7 :1673
## 3rd Qu.:0.0000 3rd Qu.:0.2300 8 :1665
## Max. :1.0000 Max. :0.8300 9 :1664
## (Other):7951
sum(is.na(data.rm))
## [1] 0
Option 2: impute the NA values with the mean, median, or mode.
Here we write functions that can be applied to multiple variables.
# Function replaces NA by mean:
replace_by_mean <- function(x) {
x[is.na(x)] <- mean(x, na.rm = TRUE)
return(x)
}
# A function that imputes NA observations of a categorical variable by sampling
# from its observed categories, weighted by their frequencies:
replace_na_categorical <- function(x) {
  my_df <- x %>%
    table() %>%
    as.data.frame() %>%
    arrange(-Freq)
  pop <- my_df[[1]] %>% as.character()
  set.seed(29)
  x[is.na(x)] <- sample(pop, sum(is.na(x)), replace = TRUE, prob = my_df$Freq)
  return(x)
}
Apply the functions created above to replace the missing values:
# replace missing value
data.mean <- data %>%
mutate_if(is.numeric, replace_by_mean) %>%
mutate(across(where(is.factor) & !BAD, replace_na_categorical))
Review the data after handling missing values:
#glimpse(data)
sum(is.na(data.mean))
## [1] 0
# install.packages("ggpubr")
library(ggpubr)
library(patchwork)
plot_Bad_Intent <- ggplot(data, aes(x = loan_intent, y = BAD)) +
  stat_summary(fun = "sum", geom = "bar", width = 0.5) +
  theme_pubclean()
plot_Bad_Ownership <- ggplot(data, aes(x = ownership, y = BAD)) +
  stat_summary(fun = "sum", geom = "bar", width = 0.5) +
  theme_bw()
plot_Bad_Intent + plot_Bad_Ownership
a <- ggplot(data, aes(x = loan_amount))
a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(loan_amount)),
             linetype = "dashed", size = 0.6)
# Histogram with density plot
a + geom_histogram(aes(y = after_stat(density)),
                   colour = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Color by groups
a + geom_histogram(aes(y = after_stat(density), color = factor(BAD)),
                   fill = "white",
                   position = "identity") +
  geom_density(aes(color = factor(BAD)), size = 1) +
  scale_color_manual(values = c("#868686FF", "#EFC000FF"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Inspect and handle outliers using box plots:
boxplot(data$loan_amount)
summary(data$loan_amount)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##     500    5000    8000    9589   12200   35000
ggplot(data, aes(x = factor(BAD), y = loan_amount)) +
  geom_boxplot() + theme_bw()
min_loan_amount <- quantile(data$loan_amount, probs = 0.25) - 1.5 * IQR(data$loan_amount)
max_loan_amount <- quantile(data$loan_amount, probs = 0.75) + 1.5 * IQR(data$loan_amount)
data$loan_amount[data$loan_amount > max_loan_amount | data$loan_amount < min_loan_amount] <- mean(data$loan_amount)
# Replace values outside 1.5 * IQR of a numeric vector with its mean:
outlier_by_mean <- function(x){
Q <- quantile(x, probs=c(.25, .75), na.rm = FALSE)
Iqr = IQR(x)
above = Q[2] + 1.5*Iqr
below = Q[1] - 1.5*Iqr
x[x > above | x < below] <- mean(x, na.rm = TRUE)
return(x)
}
data <- data.rm %>% mutate_if(is.numeric, outlier_by_mean)
glimpse(data)
## Rows: 28,638
## Columns: 12
## $ age <dbl> 22, 21, 25, 23, 24, 21, 26, 24, 24, 21, 22, 21, 23, 26, …
## $ income <dbl> 59000.00, 9600.00, 9600.00, 65500.00, 54400.00, 9900.00,…
## $ ownership <fct> RENT, OWN, MORTGAGE, RENT, RENT, OWN, RENT, RENT, RENT, …
## $ employ_length <dbl> 4.788672, 5.000000, 1.000000, 4.000000, 8.000000, 2.0000…
## $ loan_intent <fct> PERSONAL, EDUCATION, MEDICAL, MEDICAL, MEDICAL, VENTURE,…
## $ loan_grade <fct> D, B, C, C, C, A, B, B, A, D, B, A, A, E, A, B, A, F, D,…
## $ loan_amount <dbl> 9656.493, 1000.000, 5500.000, 9656.493, 9656.493, 2500.0…
## $ interest_rate <dbl> 16.02, 11.14, 12.87, 15.23, 14.27, 7.14, 12.42, 11.11, 8…
## $ BAD <dbl> 0.2166003, 0.0000000, 0.2166003, 0.2166003, 0.2166003, 0…
## $ per_income <dbl> 0.1694881, 0.1000000, 0.1694881, 0.1694881, 0.1694881, 0…
## $ hist_default <fct> Y, N, N, N, Y, N, N, N, N, N, N, N, N, N, N, N, N, N, N,…
## $ credit_length <fct> 3, 2, 3, 2, 4, 2, 3, 4, 2, 3, 4, 2, 2, 4, 4, 3, 4, 4, 4,…
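Note that mutate_if(is.numeric, outlier_by_mean) also runs over BAD and per_income, which is why BAD shows values such as 0.2166 in the glimpse above; the models in section II are fitted on data.rm and are not affected. A minimal sketch of the same outlier treatment that leaves the 0/1 target untouched (data_clean is a hypothetical name, not used later):
# Sketch: apply the outlier rule to numeric predictors only, keeping BAD as 0/1.
data_clean <- data.rm %>%
  mutate(across(where(is.numeric) & !BAD, outlier_by_mean))
table(data_clean$BAD)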
plot1 <- ggplot(data, aes(x = factor(BAD), y = loan_amount)) +
  geom_boxplot() + theme_bw()
plot2 <- ggplot(data, aes(x = loan_amount, fill = factor(BAD))) +
  geom_density(alpha = 0.7) + theme_bw() +
  theme(legend.position = c(0.8, 0.8))
plot1 + plot2
# normalize
a <- data$loan_amount
scaled.a <- scale(a)
head(a); head(scaled.a)
## [1] 9656.493 1000.000 5500.000 9656.493 9656.493 2500.000
## [,1]
## [1,] 0.1782634
## [2,] -1.5894659
## [3,] -0.6705278
## [4,] 0.1782634
## [5,] 0.1782634
## [6,] -1.2831532
# check that we get mean of 0 and sd of 1
summary(scaled.a); sd(scaled.a)
##        V1
## Min. :-1.6916
## 1st Qu.:-0.7726
## Median :-0.1600
## Mean : 0.0000
## 3rd Qu.: 0.6568
## Max. : 3.0563
## [1] 1
Min-max scale a variable to the interval [0, 1]:
# convert to 0 - 1
ZeroOne <- function(x) {
m <- min(x)
M <- max(x)
x <- (x - m)/(M - m)
return(x)
}
dt <- data %>% mutate(across("loan_amount", ZeroOne))
library(ggcorrplot)
## Correlation Matrix
numericVarName <- names(which(sapply(data, is.numeric)))
corr <- cor(data[, numericVarName], use = "pairwise.complete.obs")
corr
## age income employ_length loan_amount interest_rate
## age 1.00000000 0.09726199 0.07380566 0.04519055 0.01285543
## income 0.09726199 1.00000000 0.18805872 0.31437718 -0.03785079
## employ_length 0.07380566 0.18805872 1.00000000 0.09518757 -0.05810191
## loan_amount 0.04519055 0.31437718 0.09518757 1.00000000 0.09057596
## interest_rate 0.01285543 -0.03785079 -0.05810191 0.09057596 1.00000000
## BAD -0.02957202 -0.24704496 -0.09241622 0.08294037 0.33867997
## per_income -0.04143451 -0.30445250 -0.05374364 0.56472135 0.12632277
## BAD per_income
## age -0.02957202 -0.04143451
## income -0.24704496 -0.30445250
## employ_length -0.09241622 -0.05374364
## loan_amount 0.08294037 0.56472135
## interest_rate 0.33867997 0.12632277
## BAD 1.00000000 0.33487138
## per_income 0.33487138 1.00000000
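Since ggcorrplot is already loaded, the matrix can also be displayed as a heatmap; a minimal sketch:
# Correlation heatmap of the numeric variables (lower triangle, labelled cells).
ggcorrplot(corr, type = "lower", lab = TRUE, lab_size = 3)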
#--- Oversampling method ----
library('InformationValue')
library('ROSE')
## Loaded ROSE 0.0-4
data$BAD <- as.numeric(as.character(data$BAD))
data.over <- ovun.sample(BAD ~., data = data, p = 0.5, seed = 1, method="over")$data
table(data.over$BAD)
##
## 0 0.216600321251484
## 22435 22409
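Because BAD in data was altered by the outlier step, the oversampled classes above are 0 and 0.2166 rather than 0 and 1. A hedged sketch of the same ROSE call applied to the clean data.rm instead (data.bal is a hypothetical name):
# Sketch: oversample the minority class on data.rm, where BAD is still 0/1.
data.bal <- ovun.sample(BAD ~ ., data = data.rm, method = "over", p = 0.5, seed = 1)$data
table(data.bal$BAD)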
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:patchwork':
##
## area
## The following object is masked from 'package:dplyr':
##
## select
chi.square <- vector()
p.value <- vector()
cateVar <- data %>%
dplyr::select(-BAD) %>%
keep(is.factor)
for (i in seq_along(cateVar)) {
  tbl <- table(data$BAD, unname(unlist(cateVar[i])))
  csq.test <- chisq.test(tbl, correct = FALSE)
  p.value[i] <- csq.test$p.value
  chi.square[i] <- unname(csq.test$statistic)
}
## Warning in chisq.test(tbl, correct = FALSE): Chi-squared approximation may be
## incorrect
chi_square_test <- tibble(variable = names(cateVar)) %>%
add_column(chi.square = chi.square) %>%
add_column(p.value = p.value)
# knitr::kable(chi_square_test)
chi_square_test
## # A tibble: 5 × 3
## variable chi.square p.value
## <chr> <dbl> <dbl>
## 1 ownership 1653. 0
## 2 loan_intent 465. 2.71e- 98
## 3 loan_grade 5124. 0
## 4 hist_default 949. 2.42e-208
## 5 credit_length 41.0 5.32e- 2
II. Using a logit model for credit scoring
2.1. Handle imbalanced data
#--- Oversampling method ----
library('InformationValue')
library('ROSE')
##------ Information Value ------
IV <- Information::create_infotables(data = data.rm, y = "BAD", parallel = FALSE)
print(IV$Summary)
## Variable IV
## 6 loan_grade 0.920441446
## 9 per_income 0.895575946
## 8 interest_rate 0.760001887
## 2 income 0.446352176
## 3 ownership 0.378352930
## 10 hist_default 0.169988195
## 5 loan_intent 0.097885868
## 7 loan_amount 0.086659339
## 4 employ_length 0.062232498
## 1 age 0.013022162
## 11 credit_length 0.008451255
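For reference, the IV of a binned variable is the sum over its bins of (share of goods - share of bads) * WOE, with WOE = ln(share of goods / share of bads). A minimal sketch recomputing it by hand for ownership (iv_by_hand is a hypothetical helper; the total should be close to the 0.378 reported above):
# Sketch: manual WOE/IV computation for one categorical variable.
iv_by_hand <- data.rm %>%
  count(bin = ownership, BAD) %>%
  tidyr::pivot_wider(names_from = BAD, values_from = n, values_fill = 0) %>%
  rename(good = `0`, bad = `1`) %>%
  mutate(dist_good = good / sum(good),
         dist_bad  = bad / sum(bad),
         woe = log(dist_good / dist_bad),
         iv  = (dist_good - dist_bad) * woe)
sum(iv_by_hand$iv)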
Remove variables whose IV is smaller than 0.02:
# select variables with IV < 0.02
vars_removed <- IV$Summary %>% as.data.frame() %>%
  subset(IV < 0.02) %>% pull(1)
vars_removed
## [1] "age"           "credit_length"
2.2. Remove variables with IV < 0.02
data.rm <- data.rm %>% dplyr::select(-all_of(vars_removed))
2.3. Split the data set into training and test sets
# train 70% - test 30%
set.seed(1230000)
ind <- sample(2, nrow(data.rm), replace = TRUE, prob = c(0.7, 0.3))
train.data <- data.rm[ind == 1, ]
test.data <- data.rm[ind == 2, ]
2.4. Bin variables by WOE
library("scorecard")##
## Attaching package: 'scorecard'
## The following object is masked from 'package:tidyr':
##
## replace_na
bins <- woebin(train.data, y = "BAD")
## [INFO] creating woe binning ...
## [INFO] Binning on 20022 rows and 10 columns in 00:00:13
woebin_plot(bins)
## (WOE bin plots for income, ownership, employ_length, loan_intent, loan_grade,
##  loan_amount, interest_rate, per_income and hist_default)
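To look at a single variable rather than the whole plot list, the bins object can be indexed by name; a minimal sketch:
# Binning table (counts, bad rate, WOE, bin IV) and plot for one variable.
bins$loan_grade
woebin_plot(bins$loan_grade)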
2.5. Run logit model
2.5.1. Transform the data to WOE values
f.train.data_woe <- woebin_ply(train.data, bins)
## [INFO] converting into woe values ...
head(f.train.data_woe)
##    BAD income_woe ownership_woe employ_length_woe loan_intent_woe
## 1: 1 0.03946256 0.4924999 -0.3197698 -0.09466018
## 2: 0 1.08616470 -1.4094052 -0.1074637 -0.31703620
## 3: 1 -0.41519899 0.4924999 -0.1074637 0.29205121
## 4: 1 0.03946256 0.4924999 -0.3197698 0.29205121
## 5: 1 1.08616470 -1.4094052 0.1753984 -0.48846535
## 6: 1 -0.41519899 0.4924999 -0.3197698 -0.31703620
## loan_grade_woe loan_amount_woe interest_rate_woe per_income_woe
## 1: 1.7651725 0.7393396 1.7209517 2.1836615
## 2: -0.2668789 -0.1517727 -0.2344020 -0.6821879
## 3: -0.2668789 0.7393396 1.0190431 2.1836615
## 4: -0.2668789 0.7393396 1.0190431 2.1836615
## 5: -0.9257900 -0.1517727 -0.9266569 -0.1094767
## 6: -0.2668789 0.7393396 -0.2344020 2.1836615
## hist_default_woe
## 1: 0.7803670
## 2: -0.2138119
## 3: -0.2138119
## 4: 0.7803670
## 5: -0.2138119
## 6: -0.2138119
- Fit the logit model on the training set:
logit.model <- glm(BAD ~., family = binomial(link = 'logit'), data = train.data)
summary(logit.model)
##
## Call:
## glm(formula = BAD ~ ., family = binomial(link = "logit"), data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2320 -0.5222 -0.3022 -0.1225 3.4076
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.272e+00 1.867e-01 -22.887 < 2e-16 ***
## income 6.611e-07 3.209e-07 2.060 0.039376 *
## ownershipOTHER 5.435e-01 3.561e-01 1.526 0.127013
## ownershipOWN -1.800e+00 1.357e-01 -13.264 < 2e-16 ***
## ownershipRENT 8.106e-01 5.093e-02 15.917 < 2e-16 ***
## employ_length -1.742e-02 5.863e-03 -2.971 0.002968 **
## loan_intentEDUCATION -9.037e-01 7.277e-02 -12.419 < 2e-16 ***
## loan_intentHOMEIMPROVEMENT 3.755e-03 8.100e-02 0.046 0.963029
## loan_intentMEDICAL -1.833e-01 6.835e-02 -2.681 0.007331 **
## loan_intentPERSONAL -6.391e-01 7.421e-02 -8.612 < 2e-16 ***
## loan_intentVENTURE -1.212e+00 8.006e-02 -15.142 < 2e-16 ***
## loan_gradeB 9.927e-02 9.841e-02 1.009 0.313103
## loan_gradeC 2.336e-01 1.484e-01 1.574 0.115493
## loan_gradeD 2.379e+00 1.866e-01 12.746 < 2e-16 ***
## loan_gradeE 2.527e+00 2.351e-01 10.747 < 2e-16 ***
## loan_gradeF 2.814e+00 3.213e-01 8.759 < 2e-16 ***
## loan_gradeG 5.766e+00 1.098e+00 5.254 1.49e-07 ***
## loan_amount -9.817e-05 4.988e-06 -19.681 < 2e-16 ***
## interest_rate 8.334e-02 2.162e-02 3.855 0.000116 ***
## per_income 1.305e+01 2.949e-01 44.257 < 2e-16 ***
## hist_defaultY 1.943e-02 6.378e-02 0.305 0.760650
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20950 on 20021 degrees of freedom
## Residual deviance: 13488 on 20001 degrees of freedom
## AIC: 13530
##
## Number of Fisher Scoring iterations: 6
logit.model_woe <- glm(BAD ~., family = binomial(link = 'logit'), data = f.train.data_woe)
summary(logit.model_woe)
##
## Call:
## glm(formula = BAD ~ ., family = binomial(link = "logit"), data = f.train.data_woe)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5624 -0.4957 -0.2648 -0.1185 3.2524
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.36743 0.02389 -57.248 < 2e-16 ***
## income_woe 0.77187 0.04004 19.278 < 2e-16 ***
## ownership_woe 0.93531 0.03912 23.906 < 2e-16 ***
## employ_length_woe 0.50018 0.09575 5.224 1.75e-07 ***
## loan_intent_woe 1.47610 0.07365 20.042 < 2e-16 ***
## loan_grade_woe 1.07504 0.04897 21.954 < 2e-16 ***
## loan_amount_woe 0.19396 0.10499 1.847 0.0647 .
## interest_rate_woe 0.17912 0.05491 3.262 0.0011 **
## per_income_woe 1.06183 0.02818 37.676 < 2e-16 ***
## hist_default_woe 0.12107 0.05841 2.073 0.0382 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20950 on 20021 degrees of freedom
## Residual deviance: 12742 on 20012 degrees of freedom
## AIC: 12762
##
## Number of Fisher Scoring iterations: 6
2.5.2. Select variables by backward stepwise elimination
train.step <- step(logit.model, direction = "backward", trace = 0)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(train.step)
##
## Call:
## glm(formula = BAD ~ income + ownership + employ_length + loan_intent +
## loan_grade + loan_amount + interest_rate + per_income, family = binomial(link = "logit"),
## data = train.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2317 -0.5218 -0.3023 -0.1226 3.4077
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.272e+00 1.867e-01 -22.886 < 2e-16 ***
## income 6.599e-07 3.208e-07 2.057 0.039687 *
## ownershipOTHER 5.450e-01 3.561e-01 1.530 0.125962
## ownershipOWN -1.799e+00 1.357e-01 -13.262 < 2e-16 ***
## ownershipRENT 8.107e-01 5.093e-02 15.919 < 2e-16 ***
## employ_length -1.743e-02 5.861e-03 -2.973 0.002949 **
## loan_intentEDUCATION -9.039e-01 7.277e-02 -12.422 < 2e-16 ***
## loan_intentHOMEIMPROVEMENT 3.907e-03 8.100e-02 0.048 0.961528
## loan_intentMEDICAL -1.832e-01 6.834e-02 -2.681 0.007348 **
## loan_intentPERSONAL -6.390e-01 7.421e-02 -8.610 < 2e-16 ***
## loan_intentVENTURE -1.212e+00 8.006e-02 -15.140 < 2e-16 ***
## loan_gradeB 9.923e-02 9.840e-02 1.008 0.313265
## loan_gradeC 2.430e-01 1.451e-01 1.675 0.093909 .
## loan_gradeD 2.389e+00 1.839e-01 12.988 < 2e-16 ***
## loan_gradeE 2.536e+00 2.331e-01 10.879 < 2e-16 ***
## loan_gradeF 2.822e+00 3.203e-01 8.809 < 2e-16 ***
## loan_gradeG 5.779e+00 1.097e+00 5.268 1.38e-07 ***
## loan_amount -9.815e-05 4.988e-06 -19.679 < 2e-16 ***
## interest_rate 8.335e-02 2.162e-02 3.855 0.000116 ***
## per_income 1.305e+01 2.948e-01 44.263 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20950 on 20021 degrees of freedom
## Residual deviance: 13488 on 20002 degrees of freedom
## AIC: 13528
##
## Number of Fisher Scoring iterations: 6
train.step_woe <- step(logit.model_woe, direction = "backward", trace = 0)
- Check model on train set:
train.prob <- predict(train.step, type = "response")
train.pred <- ifelse(train.prob > .5, "1", "0")
table(train.pred, train.data$BAD)
##
## train.pred 0 1
## 0 14924 1910
## 1 752 2436
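A quick accuracy figure from the table above, as a rough check only (the 0.5 cut-off and the class imbalance both matter):
# Share of correctly classified training observations: (14924 + 2436) / 20022, about 0.867.
mean(train.pred == as.character(train.data$BAD))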
- Check model on test set:
logit.pred.prob<- predict(train.step, test.data, type = 'response')
logit.pred <- as.factor(ifelse(logit.pred.prob > 0.5, 1, 0))
test.data$BAD <- as.factor(test.data$BAD)
### validation
caret::confusionMatrix(logit.pred, test.data$BAD, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6421 779
## 1 338 1078
##
## Accuracy : 0.8704
## 95% CI : (0.8631, 0.8774)
## No Information Rate : 0.7845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5805
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.5805
## Specificity : 0.9500
## Pos Pred Value : 0.7613
## Neg Pred Value : 0.8918
## Prevalence : 0.2155
## Detection Rate : 0.1251
## Detection Prevalence : 0.1643
## Balanced Accuracy : 0.7652
##
## 'Positive' Class : 1
##
- Apply the WOE transformation to the test set and run the WOE model on it:
test.data_woe <- woebin_ply(test.data, bins)
## [INFO] converting into woe values ...
logit.pred.prob_woe <- predict(train.step_woe, test.data_woe, type = 'response')
logit.pred_woe <- as.factor(ifelse(logit.pred.prob_woe > 0.5, 1, 0))
test.data_woe$BAD <- as.factor(test.data_woe$BAD)
### validation
caret::confusionMatrix(logit.pred_woe, test.data_woe$BAD, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6434 691
## 1 325 1166
##
## Accuracy : 0.8821
## 95% CI : (0.8751, 0.8888)
## No Information Rate : 0.7845
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6244
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6279
## Specificity : 0.9519
## Pos Pred Value : 0.7820
## Neg Pred Value : 0.9030
## Prevalence : 0.2155
## Detection Rate : 0.1353
## Detection Prevalence : 0.1731
## Balanced Accuracy : 0.7899
##
## 'Positive' Class : 1
##
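Accuracy alone can be misleading on imbalanced data; KS and AUC are the usual scorecard metrics. A hedged sketch using perf_eva() from the already loaded scorecard package (argument names assumed from that package's documentation):
# KS / ROC evaluation of the WOE-based model on the test set.
perf_eva(pred = logit.pred.prob_woe,
         label = as.numeric(as.character(test.data_woe$BAD)),
         show_plot = c("ks", "roc"))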
III. Conduct Scoring
# Calculate scorecard scores for variables based on the results from woebin and glm:
my_card <- scorecard(bins, train.step, points0 = 600, odds0 = 1/19, pdo = 50)
head(my_card)
## $basepoints
## variable bin woe points
## 1: basepoints NA NA 696
##
## $income
## variable bin count count_distr neg pos posprob woe
## 1: income [-Inf,35000) 3621 0.1808511 1988 1633 0.45098039 1.08616470
## 2: income [35000,60000) 7063 0.3527620 5482 1581 0.22384256 0.03946256
## 3: income [60000,80000) 4240 0.2117671 3584 656 0.15471698 -0.41519899
## 4: income [80000, Inf) 5098 0.2546199 4622 476 0.09336995 -0.99028994
## bin_iv total_iv breaks is_special_values points
## 1: 0.2703787064 0.4867096 35000 FALSE 0
## 2: 0.0005554841 0.4867096 60000 FALSE 0
## 3: 0.0322552964 0.4867096 80000 FALSE 0
## 4: 0.1835201580 0.4867096 Inf FALSE 0
##
## $ownershipOTHER
## Empty data.table (0 rows and 13 cols): variable,bin,count,count_distr,neg,pos...
##
## $ownershipOWN
## Empty data.table (0 rows and 13 cols): variable,bin,count,count_distr,neg,pos...
##
## $ownershipRENT
## Empty data.table (0 rows and 13 cols): variable,bin,count,count_distr,neg,pos...
##
## $employ_length
## variable bin count count_distr neg pos posprob woe
## 1: employ_length [-Inf,2) 4406 0.2200579 3157 1249 0.2834771 0.3555960
## 2: employ_length [2,3) 2412 0.1204675 1813 599 0.2483416 0.1753984
## 3: employ_length [3,8) 8974 0.4482070 7185 1789 0.1993537 -0.1074637
## 4: employ_length [8, Inf) 4230 0.2112676 3521 709 0.1676123 -0.3197698
## bin_iv total_iv breaks is_special_values points
## 1: 0.030581273 0.05914612 2 FALSE 0
## 2: 0.003889176 0.05914612 3 FALSE 0
## 3: 0.005018671 0.05914612 8 FALSE 0
## 4: 0.019657001 0.05914612 Inf FALSE 0
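The card above was built from train.step, whose dummy-coefficient names (ownershipOTHER, ownershipOWN, ...) do not match the bin names, which is why those entries come back as empty data.tables. The more usual route is to build the card from the model fitted on WOE-transformed data and score raw data with scorecard_ply(); a hedged sketch (card_woe and test_scores are hypothetical names):
# Sketch: scorecard from the WOE-based stepwise model, then total points for the raw test set.
card_woe <- scorecard(bins, train.step_woe, points0 = 600, odds0 = 1/19, pdo = 50)
test_scores <- scorecard_ply(test.data, card_woe)
head(test_scores)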
IV. Show Results
# Log-odds (z) of the predicted default probabilities on the test set:
z <- log(logit.pred.prob_woe / (1 - logit.pred.prob_woe))
head(z, 10)
## 1 2 3 4 5 6 7
## 1.4309873 -0.7134508 2.4492096 -3.2113908 -3.1697155 3.3459632 0.3567064
## 8 9 10
## 2.7720986 -3.7500900 1.1310879
#write.csv(credit_score,"diemtindung.csv")
#write.csv(z,"diemz.csv")