Library import

library(scorecard)
## Warning: package 'scorecard' was built under R version 3.5.3
library(oetteR)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0       v purrr   0.3.0  
## v tibble  2.0.1       v dplyr   0.8.0.1
## v tidyr   0.8.2       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:scorecard':
## 
##     vif
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(ROSE)
## Warning: package 'ROSE' was built under R version 3.5.3
## Loaded ROSE 0.0-3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(mice)
## Warning: package 'mice' was built under R version 3.5.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.5.3
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(CatEncoders)
## Warning: package 'CatEncoders' was built under R version 3.5.3
## 
## Attaching package: 'CatEncoders'
## The following object is masked from 'package:base':
## 
##     transform
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Data import and data manipulation

df <- read.csv("C:/Users/BRAINWORK/Downloads/Sample data for modelling.csv", header = T)
df2 <- df
df2 <- na.omit(df2)
# Convert the target Good_Bad from 'Bad'/'Good' labels to a numeric 0/1 flag (1 = Bad) for modelling

df <- df %>%
  mutate(Good_Bad = ifelse(`Good_Bad`== 'Bad',1,0),
         Good_Bad = as.numeric(Good_Bad))

df <- var_filter(df ,y='Good_Bad')
## [INFO] filtering variables ...
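
var_filter() drops predictors with a low information value, a high missing rate, or a near-constant value. Below is a minimal sketch of the same call with the thresholds spelled out; the argument names and default values follow my reading of the scorecard documentation and should be treated as assumptions:

# Hedged sketch: the filtering step above with its thresholds made explicit
var_filter(df, y = 'Good_Bad'
           , iv_limit = 0.02        # drop predictors with information value below 0.02
           , missing_limit = 0.95   # drop predictors that are almost entirely missing
           , identical_limit = 0.95 # drop near-constant predictors
)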

Select variables using IV

We can use scorecard::iv() to calculate the information value (IV) of each remaining predictor.

iv = iv(df, y = 'Good_Bad') %>%
  as_tibble() %>%
  mutate( info_value = round(info_value, 3) ) %>%
  arrange( desc(info_value) )

iv %>%
  knitr::kable()
variable info_value
Total_expenses 2.470
Other_Than_Home_Loan_max_DPD_of_all_period 1.724
Age 1.659
Other_Than_Home_Loan_max_DPD_of_all_accounts_for_lat_period 1.578
Risk_Category 1.457
Credit_Score 1.001
alternate_channel 0.727
Field_of_work 0.605
Location 0.487
Employement_Type 0.297
ROI_band 0.270
Income_Band 0.195
NO_OF_INQUIRIES 0.155
Number_of_family_members 0.121
Sub_Product_Category 0.113
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.103
state 0.091
NEW_ACCTS_IN_LAST_SIX_MONTHS 0.070
Income_type 0.066
Age_Band 0.060
FOIR_Band 0.033
IAR_Band 0.030
Loan_Amount_Band 0.020
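
A common rule of thumb from the credit-scoring literature reads an IV below 0.02 as useless, 0.02 to 0.1 as weak, 0.1 to 0.3 as medium, 0.3 to 0.5 as strong, and anything above 0.5 as suspiciously strong (often a hint of leakage). A minimal sketch applying that convention to the table above; the thresholds are a convention, not something computed from this data:

# Label each predictor with a rule-of-thumb strength band
iv %>%
  mutate( strength = cut( info_value
                          , breaks = c(-Inf, 0.02, 0.1, 0.3, 0.5, Inf)
                          , labels = c('useless', 'weak', 'medium', 'strong', 'suspicious')
                          , right = FALSE ) ) %>%
  knitr::kable()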

Removing categorical variables with more than 10 levels

df$Location <- NULL
df$state <- NULL
df$alternate_channel <- NULL
df$Field_of_work <- NULL

Weight of evidence binning

bins = woebin(df, y = 'Good_Bad')
## [INFO] creating woe binning ... 
## [INFO] Binning on 14032 rows and 20 columns in 00:00:16
bins$DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS %>%
  knitr::kable()
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS missing 168 0.0119726 163 5 0.0297619 0.4659876 0.0032750 0.052483 missing TRUE
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS [-Inf,1) 12237 0.8720781 12028 209 0.0170793 -0.1023584 0.0087009 0.052483 1 FALSE
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS [1, Inf) 1627 0.1159493 1576 51 0.0313460 0.5194802 0.0405071 0.052483 Inf FALSE
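
The woe and bin_iv columns above can be reproduced from the raw counts. The sketch below assumes scorecard defines the WoE of a bin as the log of the share of bads in the bin over the share of goods in the bin, which is consistent with the numbers in the table:

# Recompute WoE and bin-level IV for one variable from its bin counts
b <- bins$DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS
dist_bad  <- b$bad  / sum(b$bad)    # share of all bads falling into each bin
dist_good <- b$good / sum(b$good)   # share of all goods falling into each bin
woe_check    <- log(dist_bad / dist_good)            # matches the woe column
bin_iv_check <- (dist_bad - dist_good) * woe_check    # matches the bin_iv column
sum(bin_iv_check)                                     # matches total_iv (about 0.052)
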
woebin_plot(bins$Employement_Type)
## $Employement_Type

woebin_plot(bins$Age)
## $Age

woebin_plot(bins$Income_Band)
## $Income_Band

df <- na.omit(df)
sum(is.na(df))
## [1] 0

Apply bins

data_woe = woebin_ply( df, bins ) %>%
  as_tibble()
## [INFO] converting into woe values ...
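
woebin_ply() replaces each raw predictor with the WoE of the bin it falls into. A minimal sketch comparing one variable before and after the transformation (rows align because both data frames derive from the same df after na.omit):

# Raw value vs. its WoE encoding for one predictor
head( data.frame( raw = df$Income_Band, woe = data_woe$Income_Band_woe ) )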

Feature Selection

GLM with lasso and cross-validation

set.seed(1)

vars = names(data_woe)
vars = vars[ vars != 'Good_Bad']

formula = as.formula( paste( 'Good_Bad ~', paste( vars , collapse = '+') ) )


lasso = oetteR::f_train_lasso( data = data_woe
                               , p = NULL
                               , formula = formula
                               , k = 50
                               , family = 'binomial'
)
## [1] "Progress: 1 / 1 ; 100 % ; ETA: 0 min"

Lasso Visualization

ggplotly(lasso$plot_mse)
p = lasso$plot_coef +
  theme(legend.position = 'none')

ggplotly(p,tooltip =c('x','y','color'))
lasso$tib_all %>%
  filter(lambda == lambda_1se) %>%
  select( lambda_1se, auc, n_coeff_before_lasso, n_coeff_after_lasso) %>%
  knitr::kable()
lambda_1se auc n_coeff_before_lasso n_coeff_after_lasso
0.0013918 0.7180441 19 5

Building and interpreting the Model

formula = as.formula(lasso$formula_str_lambda_1se)

model = glm(formula,data_woe,family = 'binomial')

broom::tidy( model ) %>%
  mutate( star = oetteR::f_stat_stars(p.value) ) %>%
  oetteR::f_datatable_universal( round_other_nums = 2, page_length = nrow(.) )

Predictions

predictions <- predict(model,data_woe,type = 'response')
model_performance <- perf_eva(predictions,df$Good_Bad,title = 'train')
## [INFO] The threshold of confusion matrix is 0.0352.
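
perf_eva() already reports KS, AUC and a confusion-matrix threshold for the training data. As an optional cross-check, the training AUC can also be computed with ROCR, which is loaded above; a minimal sketch:

# Optional cross-check of the training AUC using ROCR
rocr_pred = ROCR::prediction(predictions, df$Good_Bad)
ROCR::performance(rocr_pred, measure = 'auc')@y.values[[1]]   # AUC
plot( ROCR::performance(rocr_pred, 'tpr', 'fpr') )            # ROC curve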

pred = predict(model)
resp = predict(model, type = 'response')

res = tibble( logit = pred
              , odds = exp(pred)
              , prob = odds / (odds + 1)
              , prob_ctrl = resp )


res %>%
  f_datatable_universal( page_length =  10, round_other_nums = 5 )

Convert Odds to Score

points0 = 600  # Target score
odds0 = 20     # Target good:bad odds at the target score; scorecard itself expects bad odds p/(1-p), default 1/19, hence the inverse below
pdo = 50       # Points to Double the Odds, default 50

card = scorecard( bins , model
                  , points0 = points0 
                  , odds0 = 1/odds0 # scorecard wants the inverse
                  , pdo = pdo 
)

sc = scorecard_ply( df, card )
res$score = sc[[1]]
card[[2]]

Calculate the score manually

factor = pdo / log(2)
offset = points0 - factor * log( odds0 )

res$score_ctrl = offset - factor * res$logit
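
Plugging in the values chosen above gives a quick numeric sanity check of the scaling:

# Quick numeric check of the scaling constants defined above
factor                   # 50 / log(2), about 72.13 points per unit of log-odds
offset                   # 600 - 72.13 * log(20), about 383.9
offset - factor * (-3)   # a logit of -3 (bad odds ~ 0.05, i.e. good:bad ~ 20:1) scores about 600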


res %>%
  arrange( desc(score) ) %>%
  f_datatable_universal( page_length =  10, round_other_nums = 5 )

Logit vs. Odds, Probabilities and Score

res = res %>%
  select( - ends_with('_ctrl') )

res %>%
  gather( key = 'key', value = 'value', - logit ) %>%
  ggplot( aes( logit, value, color = key) ) +
  geom_point() +
  geom_line() +
  facet_wrap(~key, scales = 'free_y')

res %>%
  mutate( score = score * - 1 ) %>%
  gather( key = 'key', value = 'value', - odds ) %>%
  ggplot( aes( odds, value, color = key) ) +
  geom_point() +
  geom_line() +
  facet_wrap(~key, scales = 'free_y')

res %>%
  mutate( score = score * - 1 ) %>%
  mutate_at( vars(logit, prob, score), scale ) %>%
  gather( key = 'key', value = 'value', - odds ) %>%
  ggplot( aes( odds, value, color = key) ) +
  geom_point( alpha = 0.5 ) +
  geom_line()
## Warning: attributes are not identical across measure variables;
## they will be dropped

res %>%
  gather( key = 'key', value = 'value' ) %>%
  ggplot( aes(value) ) +
  geom_histogram( bins = 50
                  , fill = 'aquamarine3'
                  , color = 'black' ) +
  geom_rug()+
  facet_wrap(~key, scales = 'free')

res %>%
  select( logit, score ) %>%
  mutate_all( scale, center = T ) %>%
  mutate_all( as.vector ) %>%
  gather( key = 'key', value = 'value' ) %>%
  ggplot( )+
  geom_histogram( aes( x = value, fill = key )
                  , bins = 50
                  , position="identity"
                  , alpha = 0.5 )

Assigning variable contributions

imp = tibble( variable = names( coef(model) )
              , coef = coef(model) ) %>%
  mutate( variable = map_chr( variable, function(x) unlist( str_split(x, '_woe') )[[1]]  ) ) %>%
  left_join( iv ) %>%
  mutate( imp = abs(coef) * info_value ) %>%
  arrange( desc(imp) ) 
## Joining, by = "variable"
knitr::kable( imp, align = 'lccc', digits = 2 )
variable coef info_value imp
Risk_Category 0.76 1.46 1.11
Credit_Score 0.53 1.00 0.53
Employement_Type 0.62 0.30 0.18
ROI_band 0.67 0.27 0.18
Income_Band 0.54 0.20 0.11
(Intercept) -3.97 NA NA

Interpreting individual predictions

data_relevant = data_woe[, names( coef(model) )[-1] ]

data_mult_logit = as_tibble( data_relevant * coef(model)[-1] ) 

Dataframe with individual score values

data_mult_score = data_mult_logit %>%
  mutate_all( function(x) - factor * x ) %>%
  mutate( intercept = coef(model)[1]
          , intercept = offset - factor * intercept )

score = apply( data_mult_score, 1, sum ) 

data_mult_score$score = score

data_mult_score$score_ctrl = res$score

data_mult_score %>%
  select( score, score_ctrl, intercept, everything() ) %>%
  head(10) %>%
  knitr::kable()
score score_ctrl intercept Employement_Type_woe Income_Band_woe ROI_band_woe Credit_Score_woe Risk_Category_woe
639.2450 654 670.2 -18.23941 -33.355612 1.930999 0.2814268 18.427578
695.1538 697 670.2 -15.97447 -6.120831 45.523380 -0.4204271 1.946120
615.1043 629 670.2 -19.60352 -38.119926 1.376865 -0.3335193 1.584394
796.1484 805 670.2 28.69328 29.003243 48.928026 0.3946901 18.929119
583.1584 582 670.2 -22.40357 10.788730 -19.470342 -29.9772957 -25.979129
661.6285 658 670.2 -18.23941 -7.715785 1.930999 26.4630879 -11.010380
782.5932 791 670.2 29.47422 10.502875 45.523380 0.3453608 26.547348
590.1615 586 670.2 -19.60352 -8.817861 -13.374748 -14.0709865 -24.171377
729.1560 714 670.2 28.69328 29.003243 -24.543885 37.1134440 -11.310048
698.0892 706 670.2 -22.40357 25.401664 1.340384 0.3213290 23.229400
data_mult_score$UniqueId <- df2$Unique_ID
data_mult_score$Good_Bad <- df2$Good_Bad

new_names_score = names(data_mult_score) %>%
  str_replace_all( '_woe', '')

new_names_data_relevant = names(data_relevant) %>%
  str_replace_all( '_woe', '')


names(data_mult_score) <- new_names_score

names(data_relevant) <- new_names_data_relevant


data_mult_score <- data_mult_score[,-c(5,7)]   # drop the 5th and 7th columns before exporting
write.csv(data_mult_score,'Individual_score.csv')

Observation score

obs1_woe = data_relevant[1,] %>%
  mutate( rwn = row_number() ) %>%
  select( rwn, everything() ) %>%
  f_manip_transpose_tibble()

obs1_values = df[1,] %>%
  mutate( rwn = row_number() ) %>%
  select( rwn, everything() ) %>%
  f_manip_transpose_tibble()
## Warning: attributes are not identical across measure variables;
## they will be dropped
obs1_score = data_mult_score[1,] %>%
  mutate( rwn = row_number() ) %>%
  select( rwn, everything() ) %>%
  f_manip_transpose_tibble() 
## Warning: attributes are not identical across measure variables;
## they will be dropped
 obs1_score %>%
  left_join( obs1_woe , by = 'row_names' ) %>%
  left_join( obs1_values, by = 'row_names') %>%
  left_join( iv, by = c('row_names' = 'variable') ) %>%
  arrange( desc(info_value) ) %>%
  rename( variable = row_names
          , `score` = `1.x`
          , `woe` = `1.y`
          , `value` = `1`) %>%
  knitr::kable( digits = 2, align = 'lcccc' )
variable score woe value info_value
Credit_Score 0.281426823813058 -0.01 15 1.00
Employement_Type -18.2394118862113 0.41 Self Employed 0.30
ROI_band 1.93099940068185 -0.04 20 - 22 0.27
Income_Band -33.3556117675996 0.69 >50k 0.20
Good_Bad Good NA 0 NA
intercept 670.199999126996 NA NA NA
score_ctrl 654 NA NA NA
UniqueId 5005351 NA NA NA