Library import

library(scorecard)
## Warning: package 'scorecard' was built under R version 3.5.3
library(oetteR)
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0       v purrr   0.3.0  
## v tibble  2.0.1       v dplyr   0.8.0.1
## v tidyr   0.8.2       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:scorecard':
## 
##     vif
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(ROSE)
## Warning: package 'ROSE' was built under R version 3.5.3
## Loaded ROSE 0.0-3
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(mice)
## Warning: package 'mice' was built under R version 3.5.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
## 
##     complete
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(Amelia)
## Warning: package 'Amelia' was built under R version 3.5.3
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(CatEncoders)
## Warning: package 'CatEncoders' was built under R version 3.5.3
## 
## Attaching package: 'CatEncoders'
## The following object is masked from 'package:base':
## 
##     transform
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Data import and data manipulation

df <- read.csv("C:/Users/BRAINWORK/Downloads/Sample data for modelling.csv", header = T)
df2 <- df
df2 <- na.omit(df2)
# Convert the target Good_Bad from 'Bad'/'Good' labels to a numeric 0/1 flag (1 = Bad) for modelling

df <- df %>%
  mutate(Good_Bad = ifelse(`Good_Bad`== 'Bad',1,0),
         Good_Bad = as.numeric(Good_Bad))

df <- var_filter(df ,y='Good_Bad')
## [INFO] filtering variables ...
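
var_filter() drops predictors with a low information value, a high missing rate, or a near-constant value. Below is a minimal sketch of the same call with the thresholds spelled out; the argument names and default values follow my reading of the scorecard documentation and should be treated as assumptions:

# Hedged sketch: the filtering step above with its thresholds made explicit
var_filter(df, y = 'Good_Bad'
           , iv_limit = 0.02        # drop predictors with information value below 0.02
           , missing_limit = 0.95   # drop predictors that are almost entirely missing
           , identical_limit = 0.95 # drop near-constant predictors
)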

Select variables using IV

We can use scorecard::iv() to calculate the information value (IV) of each remaining predictor.

iv = iv(df, y = 'Good_Bad') %>%
  as_tibble() %>%
  mutate( info_value = round(info_value, 3) ) %>%
  arrange( desc(info_value) )

iv %>%
  knitr::kable()
variable info_value
Total_expenses 2.470
Other_Than_Home_Loan_max_DPD_of_all_period 1.724
Age 1.659
Other_Than_Home_Loan_max_DPD_of_all_accounts_for_lat_period 1.578
Risk_Category 1.457
Credit_Score 1.001
alternate_channel 0.727
Field_of_work 0.605
Location 0.487
Employement_Type 0.297
ROI_band 0.270
Income_Band 0.195
NO_OF_INQUIRIES 0.155
Number_of_family_members 0.121
Sub_Product_Category 0.113
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS 0.103
state 0.091
NEW_ACCTS_IN_LAST_SIX_MONTHS 0.070
Income_type 0.066
Age_Band 0.060
FOIR_Band 0.033
IAR_Band 0.030
Loan_Amount_Band 0.020
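
A common rule of thumb from the credit-scoring literature reads an IV below 0.02 as useless, 0.02 to 0.1 as weak, 0.1 to 0.3 as medium, 0.3 to 0.5 as strong, and anything above 0.5 as suspiciously strong (often a hint of leakage). A minimal sketch applying that convention to the table above; the thresholds are a convention, not something computed from this data:

# Label each predictor with a rule-of-thumb strength band
iv %>%
  mutate( strength = cut( info_value
                          , breaks = c(-Inf, 0.02, 0.1, 0.3, 0.5, Inf)
                          , labels = c('useless', 'weak', 'medium', 'strong', 'suspicious')
                          , right = FALSE ) ) %>%
  knitr::kable()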

Removing categorical variables with more than 10 levels

df$Location <- NULL
df$state <- NULL
df$alternate_channel <- NULL
df$Field_of_work <- NULL

Weight of evidence binning

bins = woebin(df, y = 'Good_Bad')
## [INFO] creating woe binning ... 
## [INFO] Binning on 14032 rows and 20 columns in 00:00:16
bins$DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS %>%
  knitr::kable()
variable bin count count_distr good bad badprob woe bin_iv total_iv breaks is_special_values
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS missing 168 0.0119726 163 5 0.0297619 0.4659876 0.0032750 0.052483 missing TRUE
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS [-Inf,1) 12237 0.8720781 12028 209 0.0170793 -0.1023584 0.0087009 0.052483 1 FALSE
DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS [1, Inf) 1627 0.1159493 1576 51 0.0313460 0.5194802 0.0405071 0.052483 Inf FALSE
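
The woe and bin_iv columns above can be reproduced from the raw counts. The sketch below assumes scorecard defines the WoE of a bin as the log of the share of bads in the bin over the share of goods in the bin, which is consistent with the numbers in the table:

# Recompute WoE and bin-level IV for one variable from its bin counts
b <- bins$DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS
dist_bad  <- b$bad  / sum(b$bad)    # share of all bads falling into each bin
dist_good <- b$good / sum(b$good)   # share of all goods falling into each bin
woe_check    <- log(dist_bad / dist_good)            # matches the woe column
bin_iv_check <- (dist_bad - dist_good) * woe_check    # matches the bin_iv column
sum(bin_iv_check)                                     # matches total_iv (about 0.052)
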
woebin_plot(bins$Employement_Type)
## $Employement_Type

woebin_plot(bins$Age)
## $Age

woebin_plot(bins$Income_Band)
## $Income_Band

df <- na.omit(df)
sum(is.na(df))
## [1] 0

Apply bins

data_woe = woebin_ply( df, bins ) %>%
  as_tibble()
## [INFO] converting into woe values ...
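
woebin_ply() replaces each raw predictor with the WoE of the bin it falls into. A minimal sketch comparing one variable before and after the transformation (rows align because both data frames derive from the same df after na.omit):

# Raw value vs. its WoE encoding for one predictor
head( data.frame( raw = df$Income_Band, woe = data_woe$Income_Band_woe ) )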

Feature Selection

GLM with lasso and cross-validation

set.seed(1)

vars = names(data_woe)
vars = vars[ vars != 'Good_Bad']

formula = as.formula( paste( 'Good_Bad ~', paste( vars , collapse = '+') ) )


lasso = oetteR::f_train_lasso( data = data_woe
                               , p = NULL
                               , formula = formula
                               , k = 50
                               , family = 'binomial'
)
## [1] "Progress: 1 / 1 ; 100 % ; ETA: 0 min"

Lasso Visualization

ggplotly(lasso$plot_mse)
p = lasso$plot_coef +
  theme(legend.position = 'none')

ggplotly(p,tooltip =c('x','y','color'))
lasso$tib_all %>%
  filter(lambda == lambda_1se) %>%
  select( lambda_1se, auc, n_coeff_before_lasso, n_coeff_after_lasso) %>%
  knitr::kable()
lambda_1se auc n_coeff_before_lasso n_coeff_after_lasso
0.0013918 0.7180441 19 5

Building and interpreting the Model

formula = as.formula(lasso$formula_str_lambda_1se)

model = glm(formula,data_woe,family = 'binomial')

broom::tidy( model ) %>%
  mutate( star = oetteR::f_stat_stars(p.value) ) %>%
  oetteR::f_datatable_universal( round_other_nums = 2, page_length = nrow(.) )

Predictions

predictions <- predict(model,data_woe,type = 'response')
model_performance <- perf_eva(predictions,df$Good_Bad,title = 'train')
## [INFO] The threshold of confusion matrix is 0.0352.
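
perf_eva() already reports KS, AUC and a confusion-matrix threshold for the training data. As an optional cross-check, the training AUC can also be computed with ROCR, which is loaded above; a minimal sketch:

# Optional cross-check of the training AUC using ROCR
rocr_pred = ROCR::prediction(predictions, df$Good_Bad)
ROCR::performance(rocr_pred, measure = 'auc')@y.values[[1]]   # AUC
plot( ROCR::performance(rocr_pred, 'tpr', 'fpr') )            # ROC curve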

pred = predict(model)
resp = predict(model, type = 'response')

res = tibble( logit = pred
              , odds = exp(pred)
              , prob = odds / (odds + 1)
              , prob_ctrl = resp )


res %>%
  f_datatable_universal( page_length =  10, round_other_nums = 5 )

Convert Odds to Score

points0 = 600  # Target score
odds0 = 20     # Target good:bad odds at the target score; scorecard itself expects bad odds p/(1-p), default 1/19, hence the inverse below
pdo = 50       # Points to Double the Odds, default 50

card = scorecard( bins , model
                  , points0 = points0 
                  , odds0 = 1/odds0 # scorecard wants the inverse
                  , pdo = pdo 
)

sc = scorecard_ply( df, card )
res$score = sc[[1]]
card[[2]]

Calculate the score manually

factor = pdo / log(2)
offset = points0 - factor * log( odds0 )

res$score_ctrl = offset - factor * res$logit
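
Plugging in the values chosen above gives a quick numeric sanity check of the scaling:

# Quick numeric check of the scaling constants defined above
factor                   # 50 / log(2), about 72.13 points per unit of log-odds
offset                   # 600 - 72.13 * log(20), about 383.9
offset - factor * (-3)   # a logit of -3 (bad odds ~ 0.05, i.e. good:bad ~ 20:1) scores about 600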


res %>%
  arrange( desc(score) ) %>%
  f_datatable_universal( page_length =  10, round_other_nums = 5 )

Logit vs. Odds, Probabilities and Score

res = res %>%
  select( - ends_with('_ctrl') )

res %>%
  gather( key = 'key', value = 'value', - logit ) %>%
  ggplot( aes( logit, value, color = key) ) +
  geom_point() +
  geom_line() +
  facet_wrap(~key, scales = 'free_y')

res %>%
  mutate( score = score * - 1 ) %>%
  gather( key = 'key', value = 'value', - odds ) %>%
  ggplot( aes( odds, value, color = key) ) +
  geom_point() +
  geom_line() +
  facet_wrap(~key, scales = 'free_y')

res %>%
  mutate( score = score * - 1 ) %>%
  mutate_at( vars(logit, prob, score), scale ) %>%
  gather( key = 'key', value = 'value', - odds ) %>%
  ggplot( aes( odds, value, color = key) ) +
  geom_point( alpha = 0.5 ) +
  geom_line()
## Warning: attributes are not identical across measure variables;
## they will be dropped

res %>%
  gather( key = 'key', value = 'value' ) %>%
  ggplot( aes(value) ) +
  geom_histogram( bins = 50
                  , fill = 'aquamarine3'
                  , color = 'black' ) +
  geom_rug()+
  facet_wrap(~key, scales = 'free')

res %>%
  select( logit, score ) %>%
  mutate_all( scale, center = T ) %>%
  mutate_all( as.vector ) %>%
  gather( key = 'key', value = 'value' ) %>%
  ggplot( )+
  geom_histogram( aes( x = value, fill = key )
                  , bins = 50
                  , position="identity"
                  , alpha = 0.5 )

Assigning variable contributions

imp = tibble( variable = names( coef(model) )
              , coef = coef(model) ) %>%
  mutate( variable = map_chr( variable, function(x) unlist( str_split(x, '_woe') )[[1]]  ) ) %>%
  left_join( iv ) %>%
  mutate( imp = abs(coef) * info_value ) %>%
  arrange( desc(imp) ) 
## Joining, by = "variable"
knitr::kable( imp, align = 'lccc', digits = 2 )
variable coef info_value imp
Risk_Category 0.76 1.46 1.11
Credit_Score 0.53 1.00 0.53
Employement_Type 0.62 0.30 0.18
ROI_band 0.67 0.27 0.18
Income_Band 0.54 0.20 0.11
(Intercept) -3.97 NA NA

Interpreting individual predictions

data_relevant = data_woe[, names( coef(model) )[-1] ]

data_mult_logit = as_tibble( data_relevant * coef(model)[-1] ) 

Dataframe with individual score values

data_mult_score = data_mult_logit %>%
  mutate_all( function(x) - factor * x ) %>%
  mutate( intercept = coef(model)[1]
          , intercept = offset - factor * intercept )

score = apply( data_mult_score, 1, sum ) 

data_mult_score$score = score

data_mult_score$score_ctrl = res$score

data_mult_score %>%
  select( score, score_ctrl, intercept, everything() ) %>%
  head(10) %>%
  knitr::kable()
score score_ctrl intercept Employement_Type_woe Income_Band_woe ROI_band_woe Credit_Score_woe Risk_Category_woe
639.2450 654 670.2 -18.23941 -33.355612 1.930999 0.2814268 18.427578
695.1538 697 670.2 -15.97447 -6.120831 45.523380 -0.4204271 1.946120
615.1043 629 670.2 -19.60352 -38.119926 1.376865 -0.3335193 1.584394
796.1484 805 670.2 28.69328 29.003243 48.928026 0.3946901 18.929119
583.1584 582 670.2 -22.40357 10.788730 -19.470342 -29.9772957 -25.979129
661.6285 658 670.2 -18.23941 -7.715785 1.930999 26.4630879 -11.010380
782.5932 791 670.2 29.47422 10.502875 45.523380 0.3453608 26.547348
590.1615 586 670.2 -19.60352 -8.817861 -13.374748 -14.0709865 -24.171377
729.1560 714 670.2 28.69328 29.003243 -24.543885 37.1134440 -11.310048
698.0892 706 670.2 -22.40357 25.401664 1.340384 0.3213290 23.229400
data_mult_score$UniqueId <- df2$Unique_ID
data_mult_score$Good_Bad <- df2$Good_Bad

new_names_score = names(data_mult_score) %>%
  str_replace_all( '_woe', '')

new_names_data_relevant = names(data_relevant) %>%
  str_replace_all( '_woe', '')


names(data_mult_score) <- new_names_score

names(data_relevant) <- new_names_data_relevant


data_mult_score <- data_mult_score[,-c(5,7)]   # drop the 5th and 7th columns before exporting
write.csv(data_mult_score,'Individual_score.csv')

Observation score

obs1_woe = data_relevant[1,] %>%
  mutate( rwn = row_number() ) %>%
  select( rwn, everything() ) %>%
  f_manip_transpose_tibble()

obs1_values = df[1,] %>%
  mutate( rwn = row_number() ) %>%
  select( rwn, everything() ) %>%
  f_manip_transpose_tibble()
## Warning: attributes are not identical across measure variables;
## they will be dropped
obs1_score = data_mult_score[1,] %>%
  mutate( rwn = row_number() ) %>%
  select( rwn, everything() ) %>%
  f_manip_transpose_tibble() 
## Warning: attributes are not identical across measure variables;
## they will be dropped
 obs1_score %>%
  left_join( obs1_woe , by = 'row_names' ) %>%
  left_join( obs1_values, by = 'row_names') %>%
  left_join( iv, by = c('row_names' = 'variable') ) %>%
  arrange( desc(info_value) ) %>%
  rename( variable = row_names
          , `score` = `1.x`
          , `woe` = `1.y`
          , `value` = `1`) %>%
  knitr::kable( digits = 2, align = 'lcccc' )
variable score woe value info_value
Credit_Score 0.281426823813058 -0.01 15 1.00
Employement_Type -18.2394118862113 0.41 Self Employed 0.30
ROI_band 1.93099940068185 -0.04 20 - 22 0.27
Income_Band -33.3556117675996 0.69 >50k 0.20
Good_Bad Good NA 0 NA
intercept 670.199999126996 NA NA NA
score_ctrl 654 NA NA NA
UniqueId 5005351 NA NA NA