CREDIT SCORING

# Load some packages for data manipulation: 
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(dplyr)
library(readxl)

I. Statistical summary

Get data then summarize data by statistical method:

# Import data: 
### Import data from excel file
setwd("C:/Users/PC/Downloads")
data <-read.csv("credit_risk_dataset.csv")
### Descriptive statistic
  summary(data)

##    person_age     person_income     person_home_ownership person_emp_length
##  Min.   : 20.00   Min.   :   4000   Length:32581          Min.   :  0.00   
##  1st Qu.: 23.00   1st Qu.:  38500   Class :character      1st Qu.:  2.00   
##  Median : 26.00   Median :  55000   Mode  :character      Median :  4.00   
##  Mean   : 27.73   Mean   :  66075                         Mean   :  4.79   
##  3rd Qu.: 30.00   3rd Qu.:  79200                         3rd Qu.:  7.00   
##  Max.   :144.00   Max.   :6000000                         Max.   :123.00   
##                                                           NA's   :895      
##  loan_intent         loan_grade          loan_amnt     loan_int_rate  
##  Length:32581       Length:32581       Min.   :  500   Min.   : 5.42  
##  Class :character   Class :character   1st Qu.: 5000   1st Qu.: 7.90  
##  Mode  :character   Mode  :character   Median : 8000   Median :10.99  
##                                        Mean   : 9589   Mean   :11.01  
##                                        3rd Qu.:12200   3rd Qu.:13.47  
##                                        Max.   :35000   Max.   :23.22  
##                                                        NA's   :3116   
##   loan_status     loan_percent_income cb_person_default_on_file
##  Min.   :0.0000   Min.   :0.0000      Length:32581             
##  1st Qu.:0.0000   1st Qu.:0.0900      Class :character         
##  Median :0.0000   Median :0.1500      Mode  :character         
##  Mean   :0.2182   Mean   :0.1702                               
##  3rd Qu.:0.0000   3rd Qu.:0.2300                               
##  Max.   :1.0000   Max.   :0.8300                               
##                                                                
##  cb_person_cred_hist_length
##  Min.   : 2.000            
##  1st Qu.: 3.000            
##  Median : 4.000            
##  Mean   : 5.804            
##  3rd Qu.: 8.000            
##  Max.   :30.000            
##

Rename all variables in data:

names(data)=c("age","income","ownership","employ_length","loan_intent","loan_grade",
              "loan_amount","interest_rate",
              "BAD","per_income","hist_default","credit_length")
table(data$BAD)

## 
##     0     1 
## 25473  7108

Transfer some qualitative variables in data as factor:

data <- data %>% 
  mutate(across(
       c("ownership",
         "loan_intent",
         "loan_grade",
         "hist_default",
         "credit_length"), as.factor))

Check missing value:

# Check NA
# sapply(data, function(x) sum(is.na(x)))
  colSums(is.na(data))

##           age        income     ownership employ_length   loan_intent 
##             0             0             0           895             0 
##    loan_grade   loan_amount interest_rate           BAD    per_income 
##             0             0          3116             0             0 
##  hist_default credit_length 
##             0             0

C1: We can omit all NA values from data range.

### can drop them with the na.omit().
  data.rm <- na.omit(data) # delete missing value
  summary(data.rm)

##       age             income           ownership     employ_length    
##  Min.   : 20.00   Min.   :   4000   MORTGAGE:11801   Min.   :  0.000  
##  1st Qu.: 23.00   1st Qu.:  39480   OTHER   :   94   1st Qu.:  2.000  
##  Median : 26.00   Median :  55956   OWN     : 2192   Median :  4.000  
##  Mean   : 27.73   Mean   :  66649   RENT    :14551   Mean   :  4.789  
##  3rd Qu.: 30.00   3rd Qu.:  80000                    3rd Qu.:  7.000  
##  Max.   :144.00   Max.   :6000000                    Max.   :123.000  
##                                                                       
##             loan_intent   loan_grade  loan_amount    interest_rate  
##  DEBTCONSOLIDATION:4565   A:9402     Min.   :  500   Min.   : 5.42  
##  EDUCATION        :5704   B:9151     1st Qu.: 5000   1st Qu.: 7.90  
##  HOMEIMPROVEMENT  :3198   C:5699     Median : 8000   Median :10.99  
##  MEDICAL          :5293   D:3248     Mean   : 9656   Mean   :11.04  
##  PERSONAL         :4877   E: 870     3rd Qu.:12500   3rd Qu.:13.48  
##  VENTURE          :5001   F: 209     Max.   :35000   Max.   :23.22  
##                           G:  59                                    
##       BAD           per_income     hist_default credit_length 
##  Min.   :0.0000   Min.   :0.0000   N:23535      3      :5233  
##  1st Qu.:0.0000   1st Qu.:0.0900   Y: 5103      4      :5228  
##  Median :0.0000   Median :0.1500                2      :5224  
##  Mean   :0.2166   Mean   :0.1695                7      :1673  
##  3rd Qu.:0.0000   3rd Qu.:0.2300                8      :1665  
##  Max.   :1.0000   Max.   :0.8300                9      :1664  
##                                                 (Other):7951

  sum(is.na(data.rm))

## [1] 0

C2: We replace NA values by replacing them by mean, median or mode.

In here, we will make a function to use for multiple variables

# Function replaces NA by mean: 
replace_by_mean <- function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  return(x)
}
 #A function imputes NA observations for categorical variables: 
replace_na_categorical <- function(x) {
  x %>%
    table()%>%
    as.data.frame() %>%
    arrange(-Freq) ->> my_df
  
 n_obs <- sum(my_df$Freq)
 pop <- my_df$. %>% as.character()
  set.seed(29)
x[is.na(x)] <- sample(pop, sum(is.na(x)), replace = TRUE, prob = my_df$Freq)
return(x)
}

Applying function which we created above to replace missing values.

# replace missing value         
data.mean <- data %>%
        mutate_if(is.numeric, replace_by_mean) %>%
        mutate(across(where(is.factor) & !BAD, replace_na_categorical))

Review data after processing missing:

#glimpse(data)
sum(is.na(data.mean))

## [1] 0

# install.packages("ggpubr")
library(ggpubr)
library(patchwork)
plot_Bad_Intent <- ggplot(data, aes(x = loan_intent, y = BAD)) +
    stat_summary(fun.y = "sum", geom="bar", aes(width=0.5)) +
    theme_pubclean()

## Warning: `fun.y` is deprecated. Use `fun` instead.

## Warning: Ignoring unknown aesthetics: width

plot_Bad_Ownership <- ggplot(data, aes(x = ownership, y = BAD)) +
    stat_summary(fun.y = "sum", geom="bar", aes(width=0.5)) +
    theme_bw()

## Warning: `fun.y` is deprecated. Use `fun` instead.
## Ignoring unknown aesthetics: width

plot_Bad_Intent  + plot_Bad_Ownership

a <- ggplot(data, aes(x = loan_amount))
a + geom_histogram(bins = 30, color = "black", fill = "gray") +
  geom_vline(aes(xintercept = mean(loan_amount)), 
             linetype = "dashed", size = 0.6)

# Histogram with density plot
a + geom_histogram(aes(y = ..density..), 
                   colour="black", fill="white") +
  geom_density(alpha = 0.2, fill = "#FF6666")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Color by groups
a + geom_histogram(aes(y = ..density.., color = BAD),
                   fill = "white",
                   position = "identity")+
  geom_density(aes(color = BAD), size = 1) +
  scale_color_manual(values = c("#868686FF", "#EFC000FF"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Handle the outliers by drawing box plot:

boxplot(data$loan_amount)

summary(data$loan_amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     500    5000    8000    9589   12200   35000

ggplot(data, aes(x = BAD, y = loan_amount)) +
  geom_boxplot() + theme_bw()

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

min_loan_amount = quantile(data$loan_amount, probs = 0.25) - 1.5*IQR(data$loan_amount)
max_loan_amount = quantile(data$loan_amount, probs = 0.75) + 1.5*IQR(data$loan_amount)
data$loan_amount[data$loan_amount > max_loan_amount | data$loan_amount < min_loan_amount] = mean(data$loan_amount)

outlier_by_mean <- function(x){
  Q <- quantile(x, probs=c(.25, .75), na.rm = FALSE)
  Iqr = IQR(x)

  above = Q[2] + 1.5*Iqr
  below = Q[1] - 1.5*Iqr
  x[x > above | x < below] <- mean(x, na.rm = TRUE)
  return(x)
}
data <- data.rm %>% mutate_if(is.numeric, outlier_by_mean)
glimpse(data)

## Rows: 28,638
## Columns: 12
## $ age           <dbl> 22, 21, 25, 23, 24, 21, 26, 24, 24, 21, 22, 21, 23, 26, …
## $ income        <dbl> 59000.00, 9600.00, 9600.00, 65500.00, 54400.00, 9900.00,…
## $ ownership     <fct> RENT, OWN, MORTGAGE, RENT, RENT, OWN, RENT, RENT, RENT, …
## $ employ_length <dbl> 4.788672, 5.000000, 1.000000, 4.000000, 8.000000, 2.0000…
## $ loan_intent   <fct> PERSONAL, EDUCATION, MEDICAL, MEDICAL, MEDICAL, VENTURE,…
## $ loan_grade    <fct> D, B, C, C, C, A, B, B, A, D, B, A, A, E, A, B, A, F, D,…
## $ loan_amount   <dbl> 9656.493, 1000.000, 5500.000, 9656.493, 9656.493, 2500.0…
## $ interest_rate <dbl> 16.02, 11.14, 12.87, 15.23, 14.27, 7.14, 12.42, 11.11, 8…
## $ BAD           <dbl> 0.2166003, 0.0000000, 0.2166003, 0.2166003, 0.2166003, 0…
## $ per_income    <dbl> 0.1694881, 0.1000000, 0.1694881, 0.1694881, 0.1694881, 0…
## $ hist_default  <fct> Y, N, N, N, Y, N, N, N, N, N, N, N, N, N, N, N, N, N, N,…
## $ credit_length <fct> 3, 2, 3, 2, 4, 2, 3, 4, 2, 3, 4, 2, 2, 4, 4, 3, 4, 4, 4,…

plot1 <- ggplot(data, aes(x = BAD, y = loan_amount)) +
  geom_boxplot() + theme_bw()

plot2 <- ggplot(data, aes(x = loan_amount, fill = BAD)) +
  geom_density(alpha = 0.7) + theme_bw() +
  theme(legend.position = c(0.8, 0.8))

plot1 + plot2

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

# normalize 
a <- data$loan_amount
scaled.a <- scale(a)

head(a); head(scaled.a)

## [1] 9656.493 1000.000 5500.000 9656.493 9656.493 2500.000

##            [,1]
## [1,]  0.1782634
## [2,] -1.5894659
## [3,] -0.6705278
## [4,]  0.1782634
## [5,]  0.1782634
## [6,] -1.2831532

# check that we get mean of 0 and sd of 1
summary(scaled.a); sd(scaled.a)

##        V1         
##  Min.   :-1.6916  
##  1st Qu.:-0.7726  
##  Median :-0.1600  
##  Mean   : 0.0000  
##  3rd Qu.: 0.6568  
##  Max.   : 3.0563

## [1] 1

Convert value of variable to [0;1]

# convert to 0 - 1
ZeroOne <- function(x) {
  m <- min(x)
  M <- max(x)
  x <- (x - m)/(M - m)
  return(x)
}

dt <- data %>% mutate(across("loan_amount", ZeroOne))

library(ggcorrplot)
##Correlation Matrix
numericVarName <- names(which(sapply(data, is.numeric)))
corr <- cor(data[,numericVarName], use = 'pairwise.complete.obs',)
corr

##                       age      income employ_length loan_amount interest_rate
## age            1.00000000  0.09726199    0.07380566  0.04519055    0.01285543
## income         0.09726199  1.00000000    0.18805872  0.31437718   -0.03785079
## employ_length  0.07380566  0.18805872    1.00000000  0.09518757   -0.05810191
## loan_amount    0.04519055  0.31437718    0.09518757  1.00000000    0.09057596
## interest_rate  0.01285543 -0.03785079   -0.05810191  0.09057596    1.00000000
## BAD           -0.02957202 -0.24704496   -0.09241622  0.08294037    0.33867997
## per_income    -0.04143451 -0.30445250   -0.05374364  0.56472135    0.12632277
##                       BAD  per_income
## age           -0.02957202 -0.04143451
## income        -0.24704496 -0.30445250
## employ_length -0.09241622 -0.05374364
## loan_amount    0.08294037  0.56472135
## interest_rate  0.33867997  0.12632277
## BAD            1.00000000  0.33487138
## per_income     0.33487138  1.00000000

#--- pp Oversampling----
library('InformationValue')
library('ROSE')

## Loaded ROSE 0.0-4

data$BAD <- as.numeric(as.character(data$BAD))
data.over <- ovun.sample(BAD ~., data = data, p = 0.5, seed = 1, method="over")$data
table(data.over$BAD)

## 
##                 0 0.216600321251484 
##             22435             22409

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:patchwork':
## 
##     area

## The following object is masked from 'package:dplyr':
## 
##     select

chi.square <- vector()
p.value <- vector()
cateVar <- data %>% 
  dplyr::select(-BAD) %>% 
  keep(is.factor)

  for (i in 1:length(cateVar)) {
    tbl <- table(data$BAD, unname(unlist(cateVar[i])))
    csq.test <- chisq.test(tbl, correct = FALSE)
    p.value[i] <- csq.test[3]$p.value
    chi.square[i] <- unname(csq.test[1]$statistic)
  }

## Warning in chisq.test(tbl, correct = FALSE): Chi-squared approximation may be
## incorrect

chi_square_test <- tibble(variable = names(cateVar)) %>% 
  add_column(chi.square = chi.square) %>% 
  add_column(p.value = p.value)
# knitr::kable(chi_square_test)
chi_square_test

## # A tibble: 5 × 3
##   variable      chi.square   p.value
##   <chr>              <dbl>     <dbl>
## 1 ownership         1653.  0        
## 2 loan_intent        465.  2.71e- 98
## 3 loan_grade        5124.  0        
## 4 hist_default       949.  2.42e-208
## 5 credit_length       41.0 5.32e-  2

II. Using logit model in scoring credit

2.1. Handle imbalanced data

#--- pp Oversampling----
library('InformationValue')
library('ROSE')

##------Information Value
IV <- Information::create_infotables(data = data.rm, y = "BAD", parallel = FALSE)
print(IV$Summary)

##         Variable          IV
## 6     loan_grade 0.920441446
## 9     per_income 0.895575946
## 8  interest_rate 0.760001887
## 2         income 0.446352176
## 3      ownership 0.378352930
## 10  hist_default 0.169988195
## 5    loan_intent 0.097885868
## 7    loan_amount 0.086659339
## 4  employ_length 0.062232498
## 1            age 0.013022162
## 11 credit_length 0.008451255

Remove variables which have IV is smaller than 0.02

# select vars of IV < 0.02
vars_removed <- IV$Summary %>% as.data.frame %>% 
                                    subset(IV < 0.02) %>% pull(1)
vars_removed

## [1] "age"           "credit_length"

2.2. Remove variable with IV < 0.02

data.rm<- data.rm %>% dplyr::select(-all_of(vars_removed))

2.3. Divide data set into train and test

# train 70% - test 30%
set.seed(1230000)
ind <- sample(2, nrow(data.rm), replace = TRUE, prob = c(0.7, 0.3))
train.data <- data.rm [ind == 1, ]
test.data<- data.rm [ind == 2, ]

2.4. Bin variables by WOE

library("scorecard")

## 
## Attaching package: 'scorecard'

## The following object is masked from 'package:tidyr':
## 
##     replace_na

bins <- woebin(train.data, y = "BAD")

## [INFO] creating woe binning ... 
## [INFO] Binning on 20022 rows and 10 columns in 00:00:13

woebin_plot(bins)

## $income

## 
## $ownership

## 
## $employ_length

## 
## $loan_intent

## 
## $loan_grade

## 
## $loan_amount

## 
## $interest_rate

## 
## $per_income

## 
## $hist_default

2.5. Run logit model

2.5.1. Transfer data to WOE

f.train.data_woe <- woebin_ply(train.data, bins)

## [INFO] converting into woe values ...

head(f.train.data_woe)

##    BAD  income_woe ownership_woe employ_length_woe loan_intent_woe
## 1:   1  0.03946256     0.4924999        -0.3197698     -0.09466018
## 2:   0  1.08616470    -1.4094052        -0.1074637     -0.31703620
## 3:   1 -0.41519899     0.4924999        -0.1074637      0.29205121
## 4:   1  0.03946256     0.4924999        -0.3197698      0.29205121
## 5:   1  1.08616470    -1.4094052         0.1753984     -0.48846535
## 6:   1 -0.41519899     0.4924999        -0.3197698     -0.31703620
##    loan_grade_woe loan_amount_woe interest_rate_woe per_income_woe
## 1:      1.7651725       0.7393396         1.7209517      2.1836615
## 2:     -0.2668789      -0.1517727        -0.2344020     -0.6821879
## 3:     -0.2668789       0.7393396         1.0190431      2.1836615
## 4:     -0.2668789       0.7393396         1.0190431      2.1836615
## 5:     -0.9257900      -0.1517727        -0.9266569     -0.1094767
## 6:     -0.2668789       0.7393396        -0.2344020      2.1836615
##    hist_default_woe
## 1:        0.7803670
## 2:       -0.2138119
## 3:       -0.2138119
## 4:        0.7803670
## 5:       -0.2138119
## 6:       -0.2138119

Run on train set:

logit.model <- glm(BAD ~., family = binomial(link = 'logit'), data = train.data)
summary(logit.model)

## 
## Call:
## glm(formula = BAD ~ ., family = binomial(link = "logit"), data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2320  -0.5222  -0.3022  -0.1225   3.4076  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -4.272e+00  1.867e-01 -22.887  < 2e-16 ***
## income                      6.611e-07  3.209e-07   2.060 0.039376 *  
## ownershipOTHER              5.435e-01  3.561e-01   1.526 0.127013    
## ownershipOWN               -1.800e+00  1.357e-01 -13.264  < 2e-16 ***
## ownershipRENT               8.106e-01  5.093e-02  15.917  < 2e-16 ***
## employ_length              -1.742e-02  5.863e-03  -2.971 0.002968 ** 
## loan_intentEDUCATION       -9.037e-01  7.277e-02 -12.419  < 2e-16 ***
## loan_intentHOMEIMPROVEMENT  3.755e-03  8.100e-02   0.046 0.963029    
## loan_intentMEDICAL         -1.833e-01  6.835e-02  -2.681 0.007331 ** 
## loan_intentPERSONAL        -6.391e-01  7.421e-02  -8.612  < 2e-16 ***
## loan_intentVENTURE         -1.212e+00  8.006e-02 -15.142  < 2e-16 ***
## loan_gradeB                 9.927e-02  9.841e-02   1.009 0.313103    
## loan_gradeC                 2.336e-01  1.484e-01   1.574 0.115493    
## loan_gradeD                 2.379e+00  1.866e-01  12.746  < 2e-16 ***
## loan_gradeE                 2.527e+00  2.351e-01  10.747  < 2e-16 ***
## loan_gradeF                 2.814e+00  3.213e-01   8.759  < 2e-16 ***
## loan_gradeG                 5.766e+00  1.098e+00   5.254 1.49e-07 ***
## loan_amount                -9.817e-05  4.988e-06 -19.681  < 2e-16 ***
## interest_rate               8.334e-02  2.162e-02   3.855 0.000116 ***
## per_income                  1.305e+01  2.949e-01  44.257  < 2e-16 ***
## hist_defaultY               1.943e-02  6.378e-02   0.305 0.760650    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20950  on 20021  degrees of freedom
## Residual deviance: 13488  on 20001  degrees of freedom
## AIC: 13530
## 
## Number of Fisher Scoring iterations: 6

logit.model_woe <- glm(BAD ~., family = binomial(link = 'logit'), data = f.train.data_woe)
summary(logit.model_woe)

## 
## Call:
## glm(formula = BAD ~ ., family = binomial(link = "logit"), data = f.train.data_woe)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5624  -0.4957  -0.2648  -0.1185   3.2524  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.36743    0.02389 -57.248  < 2e-16 ***
## income_woe         0.77187    0.04004  19.278  < 2e-16 ***
## ownership_woe      0.93531    0.03912  23.906  < 2e-16 ***
## employ_length_woe  0.50018    0.09575   5.224 1.75e-07 ***
## loan_intent_woe    1.47610    0.07365  20.042  < 2e-16 ***
## loan_grade_woe     1.07504    0.04897  21.954  < 2e-16 ***
## loan_amount_woe    0.19396    0.10499   1.847   0.0647 .  
## interest_rate_woe  0.17912    0.05491   3.262   0.0011 ** 
## per_income_woe     1.06183    0.02818  37.676  < 2e-16 ***
## hist_default_woe   0.12107    0.05841   2.073   0.0382 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20950  on 20021  degrees of freedom
## Residual deviance: 12742  on 20012  degrees of freedom
## AIC: 12762
## 
## Number of Fisher Scoring iterations: 6

2.5.2. Fill variables by stepwise

train.step <- step(logit.model, direction = "backward", trace = 0)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(train.step)

## 
## Call:
## glm(formula = BAD ~ income + ownership + employ_length + loan_intent + 
##     loan_grade + loan_amount + interest_rate + per_income, family = binomial(link = "logit"), 
##     data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2317  -0.5218  -0.3023  -0.1226   3.4077  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -4.272e+00  1.867e-01 -22.886  < 2e-16 ***
## income                      6.599e-07  3.208e-07   2.057 0.039687 *  
## ownershipOTHER              5.450e-01  3.561e-01   1.530 0.125962    
## ownershipOWN               -1.799e+00  1.357e-01 -13.262  < 2e-16 ***
## ownershipRENT               8.107e-01  5.093e-02  15.919  < 2e-16 ***
## employ_length              -1.743e-02  5.861e-03  -2.973 0.002949 ** 
## loan_intentEDUCATION       -9.039e-01  7.277e-02 -12.422  < 2e-16 ***
## loan_intentHOMEIMPROVEMENT  3.907e-03  8.100e-02   0.048 0.961528    
## loan_intentMEDICAL         -1.832e-01  6.834e-02  -2.681 0.007348 ** 
## loan_intentPERSONAL        -6.390e-01  7.421e-02  -8.610  < 2e-16 ***
## loan_intentVENTURE         -1.212e+00  8.006e-02 -15.140  < 2e-16 ***
## loan_gradeB                 9.923e-02  9.840e-02   1.008 0.313265    
## loan_gradeC                 2.430e-01  1.451e-01   1.675 0.093909 .  
## loan_gradeD                 2.389e+00  1.839e-01  12.988  < 2e-16 ***
## loan_gradeE                 2.536e+00  2.331e-01  10.879  < 2e-16 ***
## loan_gradeF                 2.822e+00  3.203e-01   8.809  < 2e-16 ***
## loan_gradeG                 5.779e+00  1.097e+00   5.268 1.38e-07 ***
## loan_amount                -9.815e-05  4.988e-06 -19.679  < 2e-16 ***
## interest_rate               8.335e-02  2.162e-02   3.855 0.000116 ***
## per_income                  1.305e+01  2.948e-01  44.263  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20950  on 20021  degrees of freedom
## Residual deviance: 13488  on 20002  degrees of freedom
## AIC: 13528
## 
## Number of Fisher Scoring iterations: 6

train.step_woe <- step(logit.model_woe, direction = "backward", trace = 0)
summary(train.step)

## 
## Call:
## glm(formula = BAD ~ income + ownership + employ_length + loan_intent + 
##     loan_grade + loan_amount + interest_rate + per_income, family = binomial(link = "logit"), 
##     data = train.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2317  -0.5218  -0.3023  -0.1226   3.4077  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -4.272e+00  1.867e-01 -22.886  < 2e-16 ***
## income                      6.599e-07  3.208e-07   2.057 0.039687 *  
## ownershipOTHER              5.450e-01  3.561e-01   1.530 0.125962    
## ownershipOWN               -1.799e+00  1.357e-01 -13.262  < 2e-16 ***
## ownershipRENT               8.107e-01  5.093e-02  15.919  < 2e-16 ***
## employ_length              -1.743e-02  5.861e-03  -2.973 0.002949 ** 
## loan_intentEDUCATION       -9.039e-01  7.277e-02 -12.422  < 2e-16 ***
## loan_intentHOMEIMPROVEMENT  3.907e-03  8.100e-02   0.048 0.961528    
## loan_intentMEDICAL         -1.832e-01  6.834e-02  -2.681 0.007348 ** 
## loan_intentPERSONAL        -6.390e-01  7.421e-02  -8.610  < 2e-16 ***
## loan_intentVENTURE         -1.212e+00  8.006e-02 -15.140  < 2e-16 ***
## loan_gradeB                 9.923e-02  9.840e-02   1.008 0.313265    
## loan_gradeC                 2.430e-01  1.451e-01   1.675 0.093909 .  
## loan_gradeD                 2.389e+00  1.839e-01  12.988  < 2e-16 ***
## loan_gradeE                 2.536e+00  2.331e-01  10.879  < 2e-16 ***
## loan_gradeF                 2.822e+00  3.203e-01   8.809  < 2e-16 ***
## loan_gradeG                 5.779e+00  1.097e+00   5.268 1.38e-07 ***
## loan_amount                -9.815e-05  4.988e-06 -19.679  < 2e-16 ***
## interest_rate               8.335e-02  2.162e-02   3.855 0.000116 ***
## per_income                  1.305e+01  2.948e-01  44.263  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20950  on 20021  degrees of freedom
## Residual deviance: 13488  on 20002  degrees of freedom
## AIC: 13528
## 
## Number of Fisher Scoring iterations: 6

Check model on train set:

train.prob <- predict(train.step, type = "response")
train.pred <- ifelse(train.prob > .5, "1", "0")
table(train.pred, train.data$BAD)

##           
## train.pred     0     1
##          0 14924  1910
##          1   752  2436

Check model on test set:

logit.pred.prob<- predict(train.step, test.data, type = 'response')
logit.pred <- as.factor(ifelse(logit.pred.prob > 0.5, 1, 0))
test.data$BAD <- as.factor(test.data$BAD)
### validation
caret::confusionMatrix(logit.pred, test.data$BAD, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6421  779
##          1  338 1078
##                                           
##                Accuracy : 0.8704          
##                  95% CI : (0.8631, 0.8774)
##     No Information Rate : 0.7845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5805          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5805          
##             Specificity : 0.9500          
##          Pos Pred Value : 0.7613          
##          Neg Pred Value : 0.8918          
##              Prevalence : 0.2155          
##          Detection Rate : 0.1251          
##    Detection Prevalence : 0.1643          
##       Balanced Accuracy : 0.7652          
##                                           
##        'Positive' Class : 1               
##

Thực hiện mô hình trên tập test:

test.data_woe <- woebin_ply(test.data, bins)

## [INFO] converting into woe values ...

Thực hiện mô hình trên tập test:

logit.pred.prob_woe <- predict(train.step_woe, test.data_woe, type = 'response')
logit.pred_woe <- as.factor(ifelse(logit.pred.prob_woe > 0.5, 1, 0))
test.data_woe$BAD <- as.factor(test.data_woe$BAD)
### validation
caret::confusionMatrix(logit.pred_woe, test.data_woe$BAD, positive = "1")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6434  691
##          1  325 1166
##                                           
##                Accuracy : 0.8821          
##                  95% CI : (0.8751, 0.8888)
##     No Information Rate : 0.7845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6244          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6279          
##             Specificity : 0.9519          
##          Pos Pred Value : 0.7820          
##          Neg Pred Value : 0.9030          
##              Prevalence : 0.2155          
##          Detection Rate : 0.1353          
##    Detection Prevalence : 0.1731          
##       Balanced Accuracy : 0.7899          
##                                           
##        'Positive' Class : 1               
##

III. Conduct Scoring

# Calculate scorecard scores for variables based on the results from woebin and glm: 
my_card <- scorecard(bins, train.step, points0 = 600, odds0 = 1/19, pdo = 50)
head(my_card)

## $basepoints
##      variable bin woe points
## 1: basepoints  NA  NA    696
## 
## $income
##    variable           bin count count_distr  neg  pos    posprob         woe
## 1:   income  [-Inf,35000)  3621   0.1808511 1988 1633 0.45098039  1.08616470
## 2:   income [35000,60000)  7063   0.3527620 5482 1581 0.22384256  0.03946256
## 3:   income [60000,80000)  4240   0.2117671 3584  656 0.15471698 -0.41519899
## 4:   income  [80000, Inf)  5098   0.2546199 4622  476 0.09336995 -0.99028994
##          bin_iv  total_iv breaks is_special_values points
## 1: 0.2703787064 0.4867096  35000             FALSE      0
## 2: 0.0005554841 0.4867096  60000             FALSE      0
## 3: 0.0322552964 0.4867096  80000             FALSE      0
## 4: 0.1835201580 0.4867096    Inf             FALSE      0
## 
## $ownershipOTHER
## Empty data.table (0 rows and 13 cols): variable,bin,count,count_distr,neg,pos...
## 
## $ownershipOWN
## Empty data.table (0 rows and 13 cols): variable,bin,count,count_distr,neg,pos...
## 
## $ownershipRENT
## Empty data.table (0 rows and 13 cols): variable,bin,count,count_distr,neg,pos...
## 
## $employ_length
##         variable      bin count count_distr  neg  pos   posprob        woe
## 1: employ_length [-Inf,2)  4406   0.2200579 3157 1249 0.2834771  0.3555960
## 2: employ_length    [2,3)  2412   0.1204675 1813  599 0.2483416  0.1753984
## 3: employ_length    [3,8)  8974   0.4482070 7185 1789 0.1993537 -0.1074637
## 4: employ_length [8, Inf)  4230   0.2112676 3521  709 0.1676123 -0.3197698
##         bin_iv   total_iv breaks is_special_values points
## 1: 0.030581273 0.05914612      2             FALSE      0
## 2: 0.003889176 0.05914612      3             FALSE      0
## 3: 0.005018671 0.05914612      8             FALSE      0
## 4: 0.019657001 0.05914612    Inf             FALSE      0

IV. Show Results

# Calculate scorecard scores
z<-log(logit.pred.prob_woe/(1-logit.pred.prob_woe))
head(z,10)

##          1          2          3          4          5          6          7 
##  1.4309873 -0.7134508  2.4492096 -3.2113908 -3.1697155  3.3459632  0.3567064 
##          8          9         10 
##  2.7720986 -3.7500900  1.1310879

#write.csv(credit_score,"diemtindung.csv")
#write.csv(z,"diemz.csv")