Library import
library(scorecard)
## Warning: package 'scorecard' was built under R version 3.5.3
library(oetteR)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.3.0
## v tibble 2.0.1 v dplyr 0.8.0.1
## v tidyr 0.8.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:scorecard':
##
## vif
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(ROSE)
## Warning: package 'ROSE' was built under R version 3.5.3
## Loaded ROSE 0.0-3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(mice)
## Warning: package 'mice' was built under R version 3.5.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.5.3
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(CatEncoders)
## Warning: package 'CatEncoders' was built under R version 3.5.3
##
## Attaching package: 'CatEncoders'
## The following object is masked from 'package:base':
##
## transform
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Data import and data manipulation
df <- read.csv("C:/Users/BRAINWORK/Downloads/Sample data for modelling.csv", header = T)
df2 <- df
df2 <- na.omit(df2)
# Converting categorical variables to numerical variables for further analysis
df <- df %>%
mutate(Good_Bad = ifelse(`Good_Bad`== 'Bad',1,0),
Good_Bad = as.numeric(Good_Bad))
df <- var_filter(df ,y='Good_Bad')
## [INFO] filtering variables ...
Select variables using IV
Removing Categorical values with more than 10 levels
df$Location <- NULL
df$state <- NULL
df$alternate_channel <- NULL
df$Field_of_work <- NULL
Weight of evidence binning
bins = woebin(df, y = 'Good_Bad')
## [INFO] creating woe binning ...
## [INFO] Binning on 14032 rows and 20 columns in 00:00:16
bins$DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS %>%
knitr::kable()
| DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS |
missing |
168 |
0.0119726 |
163 |
5 |
0.0297619 |
0.4659876 |
0.0032750 |
0.052483 |
missing |
TRUE |
| DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS |
[-Inf,1) |
12237 |
0.8720781 |
12028 |
209 |
0.0170793 |
-0.1023584 |
0.0087009 |
0.052483 |
1 |
FALSE |
| DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS |
[1, Inf) |
1627 |
0.1159493 |
1576 |
51 |
0.0313460 |
0.5194802 |
0.0405071 |
0.052483 |
Inf |
FALSE |
woebin_plot(bins$Employement_Type)
## $Employement_Type

woebin_plot(bins$Age)
## $Age

woebin_plot(bins$Income_Band)
## $Income_Band

df <- na.omit(df)
sum(is.na(df))
## [1] 0
Apply bins
data_woe = woebin_ply( df, bins ) %>%
as_tibble()
## [INFO] converting into woe values ...
Feature Selection
glm with lasso and crossvalidataion
set.seed(1)
vars = names(data_woe)
vars = vars[ vars != 'Good_Bad']
formula = as.formula( paste( 'Good_Bad ~', paste( vars , collapse = '+') ) )
lasso = oetteR::f_train_lasso( data = data_woe
, p = NULL
, formula = formula
, k = 50
, family = 'binomial'
)
## [1] "Progress: 1 / 1 ; 100 % ; ETA: 0 min"
Lasso Visualization
ggplotly(lasso$plot_mse)
p = lasso$plot_coef +
theme(legend.position = 'none')
ggplotly(p,tooltip =c('x','y','color'))
lasso$tib_all %>%
filter(lambda == lambda_1se) %>%
select( lambda_1se, auc, n_coeff_before_lasso, n_coeff_after_lasso) %>%
knitr::kable()
| 0.0013918 |
0.7180441 |
19 |
5 |
| # Building an |
d interpreti |
ng the Model |
|
formula = as.formula(lasso$formula_str_lambda_1se)
model = glm(formula,data_woe,family = 'binomial')
broom::tidy( model ) %>%
mutate( star = oetteR::f_stat_stars(p.value) ) %>%
oetteR::f_datatable_universal( round_other_nums = 2, page_length = nrow(.) )
Predictions
predictions <- predict(model,data_woe,type = 'response')
model_performance <- perf_eva(predictions,df$Good_Bad,title = 'train')
## [INFO] The threshold of confusion matrix is 0.0352.

pred = predict(model)
resp = predict(model, type = 'response')
res = tibble( logit = pred
, odds = exp(pred)
, prob = odds / (odds + 1)
, prob_ctrl = resp )
res %>%
f_datatable_universal( page_length = 10, round_other_nums = 5 )
Convert Odds to Score
points0 = 600 # Target points
odds0 = 20 # Target odds , default 1/19 .odds = p/(1-p)
pdo = 50 # Points to Double the Odds, default 50
card = scorecard( bins , model
, points0 = points0
, odds0 = 1/odds0 # scorecard wants the inverse
, pdo = pdo
)
sc = scorecard_ply( df, card )
res$score = sc[[1]]
card[[2]]
Calculate the score manually
factor = pdo / log(2)
offset = points0 - factor * log( odds0 )
res$score_ctrl = offset - factor * res$logit
res %>%
arrange( desc(score) ) %>%
f_datatable_universal( page_length = 10, round_other_nums = 5 )
Logit vs. Odds, Probabilities and Score
res = res %>%
select( - ends_with('_ctrl') )
res %>%
gather( key = 'key', value = 'value', - logit ) %>%
ggplot( aes( logit, value, color = key) ) +
geom_point() +
geom_line() +
facet_wrap(~key, scales = 'free_y')

res %>%
mutate( score = score * - 1 ) %>%
gather( key = 'key', value = 'value', - odds ) %>%
ggplot( aes( odds, value, color = key) ) +
geom_point() +
geom_line() +
facet_wrap(~key, scales = 'free_y')

res %>%
mutate( score = score * - 1 ) %>%
mutate_at( vars(logit, prob, score), scale ) %>%
gather( key = 'key', value = 'value', - odds ) %>%
ggplot( aes( odds, value, color = key) ) +
geom_point( alpha = 0.5 ) +
geom_line()
## Warning: attributes are not identical across measure variables;
## they will be dropped

res %>%
gather( key = 'key', value = 'value' ) %>%
ggplot( aes(value) ) +
geom_histogram( bins = 50
, fill = 'aquamarine3'
, color = 'black' ) +
geom_rug()+
facet_wrap(~key, scales = 'free')

res %>%
select( logit, score ) %>%
mutate_all( scale, center = T ) %>%
mutate_all( as.vector ) %>%
gather( key = 'key', value = 'value' ) %>%
ggplot( )+
geom_histogram( aes( x = value, fill = key )
, bins = 50
, position="identity"
, alpha = 0.5 )

Assigning variable contributions
imp = tibble( variable = names( coef(model) )
, coef = coef(model) ) %>%
mutate( variable = map_chr( variable, function(x) unlist( str_split(x, '_woe') )[[1]] ) ) %>%
left_join( iv ) %>%
mutate( imp = abs(coef) * info_value ) %>%
arrange( desc(imp) )
## Joining, by = "variable"
knitr::kable( imp, align = 'lccc', digits = 2 )
| Risk_Category |
0.76 |
1.46 |
1.11 |
| Credit_Score |
0.53 |
1.00 |
0.53 |
| Employement_Type |
0.62 |
0.30 |
0.18 |
| ROI_band |
0.67 |
0.27 |
0.18 |
| Income_Band |
0.54 |
0.20 |
0.11 |
| (Intercept) |
-3.97 |
NA |
NA |
Interpreting individual predictions
data_relevant = data_woe[, names( coef(model) )[-1] ]
data_mult_logit = as_tibble( data_relevant * coef(model)[-1] )
Dataframe with individual score values
data_mult_score = data_mult_logit %>%
mutate_all( function(x) - factor * x ) %>%
mutate( intercept = coef(model)[1]
, intercept = offset - factor * intercept )
score = apply( data_mult_score, 1, sum )
data_mult_score$score = score
data_mult_score$score_ctrl = res$score
data_mult_score %>%
select( score, score_ctrl, intercept, everything() ) %>%
head(10) %>%
knitr::kable()
| 639.2450 |
654 |
670.2 |
-18.23941 |
-33.355612 |
1.930999 |
0.2814268 |
18.427578 |
| 695.1538 |
697 |
670.2 |
-15.97447 |
-6.120831 |
45.523380 |
-0.4204271 |
1.946120 |
| 615.1043 |
629 |
670.2 |
-19.60352 |
-38.119926 |
1.376865 |
-0.3335193 |
1.584394 |
| 796.1484 |
805 |
670.2 |
28.69328 |
29.003243 |
48.928026 |
0.3946901 |
18.929119 |
| 583.1584 |
582 |
670.2 |
-22.40357 |
10.788730 |
-19.470342 |
-29.9772957 |
-25.979129 |
| 661.6285 |
658 |
670.2 |
-18.23941 |
-7.715785 |
1.930999 |
26.4630879 |
-11.010380 |
| 782.5932 |
791 |
670.2 |
29.47422 |
10.502875 |
45.523380 |
0.3453608 |
26.547348 |
| 590.1615 |
586 |
670.2 |
-19.60352 |
-8.817861 |
-13.374748 |
-14.0709865 |
-24.171377 |
| 729.1560 |
714 |
670.2 |
28.69328 |
29.003243 |
-24.543885 |
37.1134440 |
-11.310048 |
| 698.0892 |
706 |
670.2 |
-22.40357 |
25.401664 |
1.340384 |
0.3213290 |
23.229400 |
data_mult_score$UniqueId <- df2$Unique_ID
data_mult_score$Good_Bad <- df2$Good_Bad
new_names_score = names(data_mult_score) %>%
str_replace_all( '_woe', '')
new_names_data_relevant = names(data_relevant) %>%
str_replace_all( '_woe', '')
names(data_mult_score) <- new_names_score
names(data_relevant) <- new_names_data_relevant
data_mult_score <- data_mult_score[,-c(5,7)]
write.csv(data_mult_score,'Individual_score.csv')
Observation score
obs1_woe = data_relevant[1,] %>%
mutate( rwn = row_number() ) %>%
select( rwn, everything() ) %>%
f_manip_transpose_tibble()
obs1_values = df[1,] %>%
mutate( rwn = row_number() ) %>%
select( rwn, everything() ) %>%
f_manip_transpose_tibble()
## Warning: attributes are not identical across measure variables;
## they will be dropped
obs1_score = data_mult_score[1,] %>%
mutate( rwn = row_number() ) %>%
select( rwn, everything() ) %>%
f_manip_transpose_tibble()
## Warning: attributes are not identical across measure variables;
## they will be dropped
obs1_score %>%
left_join( obs1_woe , by = 'row_names' ) %>%
left_join( obs1_values, by = 'row_names') %>%
left_join( iv, by = c('row_names' = 'variable') ) %>%
arrange( desc(info_value) ) %>%
rename( variable = row_names
, `score` = `1.x`
, `woe` = `1.y`
, `value` = `1`) %>%
knitr::kable( digits = 2, align = 'lcccc' )
| Credit_Score |
0.281426823813058 |
-0.01 |
15 |
1.00 |
| Employement_Type |
-18.2394118862113 |
0.41 |
Self Employed |
0.30 |
| ROI_band |
1.93099940068185 |
-0.04 |
20 - 22 |
0.27 |
| Income_Band |
-33.3556117675996 |
0.69 |
>50k |
0.20 |
| Good_Bad |
Good |
NA |
0 |
NA |
| intercept |
670.199999126996 |
NA |
NA |
NA |
| score_ctrl |
654 |
NA |
NA |
NA |
| UniqueId |
5005351 |
NA |
NA |
NA |