1 Connecting to the DB:

library(DBI)      # dbConnect(), dbListTables(), dbListFields(), dbReadTable()
library(RSQLite)  # SQLite() driver

con <- dbConnect(SQLite(), 'dataset.db')
dbListTables(con) # List the tables in the database
## [1] "dataset"  "metadata"
dbListFields(con, "dataset") # List the fields of a given table
##  [1] "case_id" "keydate" "ct090"   "ap090"   "c0001"   "c0039"   "c0044"  
##  [8] "d0031"   "b0007"   "d0009"   "c0031"   "a8404"   "c0019"   "d0027"  
## [15] "c9008"   "d2112"   "d0012"   "d1205"   "c0015"
dataset  <- dbReadTable(con, "dataset")  # Read a table and assign it to a data frame
metadata <- dbReadTable(con, "metadata")

2 Attributes Management:

datatable(metadata)  # interactive view of the metadata table
preds <- metadata %>%
  filter(var_class %in% c("variable")) %>%  # keep the rows describing predictors
  select(varcode) %>%
  unlist %>% unname                         # character vector of predictor names

3 DB Analysis:

Here we analyze the DB by doing the following:

  1. Identify the variables that have missing values.
  2. Review the format of each variable; my intention is to keep only numeric or factor attributes.
  3. Analyze the dependent variable, by date and globally.
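As a quick, hedged illustration of points 1 and 2 (a sketch, not part of the original analysis), the storage class and the share of missing values of every column can be listed like this:

# Sketch: storage class and NA share per column of the dataset
data.frame(
  class    = sapply(dataset, class),
  na_share = round(colMeans(is.na(dataset)), 3)
)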

# Distribution check for ap090 (mostly zeros, with a long right tail)
quantile(dataset$ap090, probs = seq(0, 1, 0.05))
##        0%        5%       10%       15%       20%       25%       30%       35% 
##     0.000     0.000     0.000     0.000     0.000     0.000     0.000     0.000 
##       40%       45%       50%       55%       60%       65%       70%       75% 
##     0.000     0.000     0.000     0.000     0.000     0.000     0.000     0.000 
##       80%       85%       90%       95%      100% 
##     0.000    52.643   142.304   293.826 25000.000
# Attributes missing count (only columns with at least one NA)
colSums(is.na(dataset))[colSums(is.na(dataset)) > 0]
## c0001 d0031 b0007 a8404 c9008 d2112 d1205 
##    25 12629 16926 10401  2984 16952 21238
# Attributes type Analysis
BI <- dataset %>%
  mutate(DATE_NEW = as.numeric(substr(keydate, 1, 4)) * 100 +
                    as.numeric(substr(keydate, 6, 7))) %>%   # year * 100 + month -> numeric YYYYMM
  mutate(d0012 = as.numeric(d0012),
         c0044 = as.numeric(c0044),
         c0039 = as.factor(c0039)) %>%
  select(-keydate)

colnames(BI)
##  [1] "case_id"  "ct090"    "ap090"    "c0001"    "c0039"    "c0044"   
##  [7] "d0031"    "b0007"    "d0009"    "c0031"    "a8404"    "c0019"   
## [13] "d0027"    "c9008"    "d2112"    "d0012"    "d1205"    "c0015"   
## [19] "DATE_NEW"
# Target Analysis
freq(dataset$ct090)

## dataset$ct090 
##       Frequency Percent
## 0         24938   83.13
## 1          5062   16.87
## Total     30000  100.00
freq(dataset$c0039)  

## dataset$c0039 
##       Frequency Percent
## K6419       909   3.030
## K6420        15   0.050
## K6491        24   0.080
## K6499       410   1.367
## K6511         6   0.020
## K6512     17329  57.763
## K6619      1391   4.637
## K6622      9916  33.053
## Total     30000 100.000
table(BI$DATE_NEW,BI$ct090)
##         
##             0    1
##   201701 1708  441
##   201702 3493  566
##   201703 2710  537
##   201704 2579  448
##   201705 3299  632
##   201706 2855  630
##   201707 3058  667
##   201708 2637  564
##   201709 2599  577
BR_GR <- BI %>%
  group_by(DATE_NEW) %>%
  summarise(Tot      = n(),
            Good     = sum(if_else(ct090 == 0, 1, 0)),
            Bad      = sum(if_else(ct090 == 1, 1, 0)),
            GoodRate = Good / Tot,
            BadRate  = Bad / Tot)

BR_GR %>%
  flextable() %>%
  autofit() %>%
  color(color = "black", part = "header") %>%
  align(align = "center", part = "all")

4 Variable Reduction

The objective of this stage is to reduce the dimensionality of the data through statistical analysis, selecting the best group of variables to build the score. For the choice of variables, a univariate analysis is performed, followed by a bivariate analysis and finally a multivariate analysis.

4.1 Training and testing samples

I use a 70/30 split for the development and validation data.

set.seed(222) # seed for reproducibility

index <- sample(nrow(BI), nrow(BI) * 0.7)  # 70% of the rows for training
train <- BI[index, ]
test  <- BI[-index, ]

## Convert to DMatrix (xgboost's internal data format)
library(xgboost)

dtrain <- xgb.DMatrix(data.matrix(train[, preds]), label = train$ct090)
dtest  <- xgb.DMatrix(data.matrix(test[, preds]),  label = test$ct090)

### Create watch list (evaluation sets monitored during training)
watchlist <- list(test = dtest, train = dtrain)

4.2 Variable Reduction - Xgboost

The bivariate analysis measures how predictive each variable is with respect to the target and keeps the most predictive ones. In this stage we use XGBoost (Extreme Gradient Boosting), a supervised learning algorithm based on the boosting principle. The idea is to generate multiple “weak” prediction models sequentially: each one takes the results of the previous model and fits a new model that explains the error of the old one, and this result is added to the existing models to create a newer, stronger model. To obtain the “strongest” model, an optimization procedure is used that iteratively minimizes an objective function evaluated with a metric such as AUC or RMSE. Each model is compared with the previous one, repeating the process until the difference between consecutive models is insignificant, which tells us that we have found the best possible model.
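To make the boosting idea concrete, below is a minimal, hedged sketch of the principle described above; it is a toy illustration, not the internals of xgboost (rpart, the simulated data x/y and the learning rate eta are assumptions made only for this example). Depth-1 trees are fitted sequentially to the residuals of the current ensemble and added back with a shrinkage factor:

library(rpart)   # used only for this illustrative sketch

set.seed(1)
n <- 500
x <- runif(n)
y <- as.numeric(x + rnorm(n, sd = 0.2) > 0.5)   # toy binary target

pred <- rep(mean(y), n)   # start from the global mean
eta  <- 0.1               # learning rate (shrinkage)

for (m in 1:50) {
  resid_m <- y - pred                                        # error left unexplained so far
  stump   <- rpart(resid_m ~ x, data = data.frame(resid_m, x),
                   control = rpart.control(maxdepth = 1))    # a "weak" depth-1 tree
  pred    <- pred + eta * predict(stump)                     # add it to the ensemble
}

mean((y - pred)^2)   # training error decreases as the rounds accumulate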

param <- list(
  max_depth = 1                                        # Depth-1 trees (stumps)
  , gamma = 0.5                                        # Regularization term
  , eval_metric = "rmse"
  , objective = "binary:logistic"                      # Binary target
  )

lm1 <- xgb.train(
  param                                                # Parameters from above
  , dtrain                                             # Training data
  , nrounds = 500                                      # Number of boosting rounds
  , early_stopping_rounds = 10                         # Stop early if no improvement
  , watchlist                                          # Data for which to watch the error
  , verbose = 0                                        # 0: xgboost won't print info
  )
lm1
## ##### xgb.Booster
## raw: 270.7 Kb 
## call:
##   xgb.train(params = param, data = dtrain, nrounds = 500, watchlist = watchlist, 
##     verbose = 0, early_stopping_rounds = 10)
## params (as set within xgb.train):
##   max_depth = "1", gamma = "0.5", eval_metric = "rmse", objective = "binary:logistic", validate_parameters = "TRUE"
## xgb.attributes:
##   best_iteration, best_msg, best_ntreelimit, best_score, niter
## callbacks:
##   cb.evaluation.log()
##   cb.early.stop(stopping_rounds = early_stopping_rounds, maximize = maximize, 
##     verbose = verbose)
## # of features: 15 
## niter: 500
## best_iteration : 500 
## best_ntreelimit : 500 
## best_score : 0.35366 
## best_msg : [500] test-rmse:0.353169  train-rmse:0.353658 
## nfeatures : 15 
## evaluation_log:
##     iter test_rmse train_rmse
##        1   0.43792    0.43880
##        2   0.40422    0.40560
## ---                          
##      499   0.35319    0.35366
##      500   0.35317    0.35366

4.2.1 Selected Variables

We observe the resulting variables in order of importance (Gain).

importance_matrix <- xgb.importance(preds, lm1)

# Top 10 variables by Gain, joined with their metadata descriptions
top <- importance_matrix[1:10] %>%
  left_join(metadata, by = c("Feature" = "varcode")) %>%
  select(-Cover, -Frequency)

datatable(top) %>%
  formatRound(c("Gain"
                #, "Cover"
                ), 4)

xgb.plot.importance(importance_matrix)

preds_xgboost <- top %>% select(Feature)

# Cumulative gain of the selected variables
preds_xgboost1 <- importance_matrix %>%
  inner_join(preds_xgboost, by = c("Feature")) %>%
  mutate(cumul_gain = cumsum(Gain))

topVarsTable <- preds_xgboost1 %>%
  select(Feature) %>%
  inner_join(metadata, by = c("Feature" = "varcode")) %>%
  inner_join(preds_xgboost1, by = c("Feature"))

# Final character vector of selected variable names
topVars <- topVarsTable %>%
  select(Feature) %>%
  #filter(!(Feature=="c0039"))%>%
  unlist %>% unname

5 Univariate and Binning Process

A basic rejection check is performed first; after the missing-value imputation process, some attributes may be rejected due to:

a. a high % of negative or missing values.
b. zero variance.
c. extreme clumping (most observations concentrated in a single value).

For this exercise I am going to be very strict and exclude all variables that have more than 50% missing values (see the quick check below).
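A hedged sketch of that check (the 0.5 threshold and the use of colMeans are just an illustration; the actual exclusion is applied to the binned columns further below):

# Share of missing values per predictor; anything above 0.5 is a candidate for exclusion
miss_share <- colMeans(is.na(train[, preds]))
sort(round(miss_share[miss_share > 0.5], 2), decreasing = TRUE)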

In the binning process we want to transform a continuous attribute into a number of intervals (bins). This helps us get a better understanding of its distribution with respect to the target. It should also be noted that the binned attribute must be monotone, either ascending or descending, depending on whether the variable has a direct or inverse relationship with the risk.
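For reference, the WOE and IV figures reported in the tables further below can be reproduced by hand. This is a hedged sketch assuming the convention used in those tables, WOE = 100 * ln(share of 1s / share of 0s), with a bin's IV contribution being (share of 1s - share of 0s) * ln(share of 1s / share of 0s); woe_iv is just an illustrative helper name:

# Hedged helper: reproduce a bin's WOE / IV from its counts
woe_iv <- function(n1, n0, tot1, tot0) {
  d1 <- n1 / tot1                  # share of target = 1 falling in this bin
  d0 <- n0 / tot0                  # share of target = 0 falling in this bin
  c(WOE = round(100 * log(d1 / d0), 1),
    IV  = round((d1 - d0) * log(d1 / d0), 3))
}
woe_iv(53, 405, 3557, 17443)   # first bin of b0007: WOE ~ -44.4, IV ~ 0.004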

Depending on the meaning of each attribute, I carry out the missing-value imputation process. I usually work with two methodologies:

  1. Replace the missing value with the closest value that makes business sense.
  2. Replace the missing values with negative values (e.g. -999). Then, during the “binning”, these negative values may be neutralized or the sense of the “woe” may be set according to business sense (see the sketch after this list).
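A minimal sketch of methodology 2, not applied in this exercise; the sentinel value -999, the column selection and the name train_sentinel are assumptions made only for illustration:

# Push missing numeric values to a sentinel so the binning step can isolate them in a dedicated bin
num_vars <- setdiff(topVars, "c0039")   # c0039 is a factor; leave it untouched
train_sentinel <- train
train_sentinel[num_vars] <- lapply(train_sentinel[num_vars],
                                   function(x) ifelse(is.na(x), -999, x))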

I’m going to skip both stages (“supervised binning” and “missing imputation”), since they would require a better understanding of the meaning and business sense of the attributes.

### Here we run the binning process
library(woeBinning)
bins2 <- woe.binning(df = train, target.var = "ct090", pred.var = c(topVars))

###For "c0039" we can build a tree to check the cuts of this attribute. For this case, I just implemented a factor woe binning.
 
woe.binning.table(bins2)
## $`WOE Table for b0007`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1   <= 8.91         458         2.2%      53     405     1.5%     2.3%  88.4%
## 2    <= Inf        8699        41.4%    2165    6534    60.9%    37.5%  75.1%
## 3   Missing       11843        56.4%    1339   10504    37.6%    60.2%  88.7%
## 4     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1 -44.4 0.004
## 2  48.5 0.114
## 3 -47.0 0.106
## 4    NA 0.223
## 
## $`WOE Table for d1205`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1      <= 0        1752         8.3%     437    1315    12.3%     7.5%  75.1%
## 2    <= Inf        4371        20.8%    1260    3111    35.4%    17.8%  71.2%
## 3   Missing       14877        70.8%    1860   13017    52.3%    74.6%  87.5%
## 4     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1  48.8 0.023
## 2  68.6 0.121
## 3 -35.6 0.079
## 4    NA 0.223
## 
## $`WOE Table for d2112`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1      <= 0        5039        24.0%    1130    3909    31.8%    22.4%  77.6%
## 2    <= Inf        4100        19.5%    1076    3024    30.3%    17.3%  73.8%
## 3   Missing       11861        56.5%    1351   10510    38.0%    60.3%  88.6%
## 4     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1  34.9 0.033
## 2  55.7 0.072
## 3 -46.1 0.103
## 4    NA 0.207
## 
## $`WOE Table for d0031`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1      <= 2       10135        48.3%    1379    8756    38.8%    50.2%  86.4%
## 2    <= Inf        2071         9.9%     128    1943     3.6%    11.1%  93.8%
## 3   Missing        8794        41.9%    2050    6744    57.6%    38.7%  76.7%
## 4     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##      WOE    IV
## 1  -25.8 0.030
## 2 -113.0 0.085
## 3   39.9 0.076
## 4     NA 0.190
## 
## $`WOE Table for c0039`
##                  Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr.
## 1                    K6622        6922        33.0%     658    6264    18.5%
## 2         misc. level neg.          10         0.0%       1       9     0.0%
## 3 K6512 + misc. level pos.       14068        67.0%    2898   11170    81.5%
## 4                    Total       21000       100.0%    3557   17443   100.0%
##   0.Distr. 0.Rate   WOE    IV
## 1    35.9%  90.5% -66.3 0.116
## 2     0.1%  90.0% -60.7 0.000
## 3    64.0%  79.4%  24.1 0.042
## 4   100.0%  83.1%    NA 0.158
## 
## $`WOE Table for d0009`
##    Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1 <= 142.234       12600        60.0%    2591   10009    72.8%    57.4%  79.4%
## 2     <= Inf        8400        40.0%     966    7434    27.2%    42.6%  88.5%
## 4      Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1  23.9 0.037
## 2 -45.1 0.070
## 4    NA 0.107
## 
## $`WOE Table for c0031`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1      <= 3       14290        68.0%    2711   11579    76.2%    66.4%  81.0%
## 2      <= 5        4334        20.6%     624    3710    17.5%    21.3%  85.6%
## 3    <= Inf        2376        11.3%     222    2154     6.2%    12.3%  90.7%
## 5     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1  13.8 0.014
## 2 -19.3 0.007
## 3 -68.2 0.042
## 5    NA 0.062
## 
## $`WOE Table for a8404`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1   <= 0.33        9363        44.6%    1253    8110    35.2%    46.5%  86.6%
## 2    <= Inf        4352        20.7%     815    3537    22.9%    20.3%  81.3%
## 3   Missing        7285        34.7%    1489    5796    41.9%    33.2%  79.6%
## 4     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1 -27.8 0.031
## 2  12.2 0.003
## 3  23.1 0.020
## 4    NA 0.054
## 
## $`WOE Table for d0027`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1      <= 0       16365        77.9%    2548   13817    71.6%    79.2%  84.4%
## 2    <= Inf        4635        22.1%    1009    3626    28.4%    20.8%  78.2%
## 4     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1 -10.1 0.008
## 2  31.1 0.024
## 4    NA 0.031
## 
## $`WOE Table for c9008`
##   Final.Bin Total.Count Total.Distr. 1.Count 0.Count 1.Distr. 0.Distr. 0.Rate
## 1     <= 27        2366        11.3%     391    1975    11.0%    11.3%  83.5%
## 2     <= 37        5461        26.0%     751    4710    21.1%    27.0%  86.2%
## 3    <= Inf       11083        52.8%    1972    9111    55.4%    52.2%  82.2%
## 4   Missing        2090        10.0%     443    1647    12.5%     9.4%  78.8%
## 5     Total       21000       100.0%    3557   17443   100.0%   100.0%  83.1%
##     WOE    IV
## 1  -3.0 0.000
## 2 -24.6 0.014
## 3   6.0 0.002
## 4  27.7 0.008
## 5    NA 0.025
woe.binning.plot(bins2)

### Here we deploy the binning defined in the step above on the training data.
train_binn <- woe.binning.deploy(train, bins2
                                 #,add.woe.or.dum.var='woe'
                                 )

#table(train_binn$b0007.binned,train_binn$woe.b0007.binned )

### Finally we create a new DB with the id attributes, the target and the binned attributes.
colnames(train_binn)
##  [1] "case_id"      "ct090"        "ap090"        "c0001"        "c0039"       
##  [6] "c0044"        "d0031"        "b0007"        "d0009"        "c0031"       
## [11] "a8404"        "c0019"        "d0027"        "c9008"        "d2112"       
## [16] "d0012"        "d1205"        "c0015"        "DATE_NEW"     "b0007.binned"
## [21] "d1205.binned" "d2112.binned" "d0031.binned" "c0039.binned" "d0009.binned"
## [26] "c0031.binned" "a8404.binned" "d0027.binned" "c9008.binned"
BF_0 <- train_binn %>%
  select(c("case_id", "ct090", "DATE_NEW",
           "b0007.binned", "d1205.binned", "d2112.binned", "d0031.binned",
           "c0039.binned", "d0009.binned", "c0031.binned", "a8404.binned",
           "d0027.binned", "c9008.binned"))


# As I mentioned before, I'm going to exclude attributes with >50% missing values.
BF_0 <- BF_0 %>% select(-"b0007.binned", -"d1205.binned", -"d2112.binned")

6 Model- GLM

In this stage we build the model that will allow the probability of customer default to be measured. The included variables come from the variable-selection step; ideally, business sense and risk-management criteria would also be taken into account. After this, we proceed with the fitting of the model and, immediately afterwards, we review the multicollinearity of its final variables, obtaining the final statistical formula.

fmla <- paste0("ct090~",
               paste(c("d0031.binned", "c0039.binned", "d0009.binned", "c0031.binned",
                       "a8404.binned", "d0027.binned", "c9008.binned"),
                     collapse = "+"))

mod1 <- glm(fmla, family = binomial(link = "logit"), data = BF_0)
mod1
## 
## Call:  glm(formula = fmla, family = binomial(link = "logit"), data = BF_0)
## 
## Coefficients:
##                  (Intercept)          d0031.binned(2, Inf]  
##                      -1.7831                       -0.6807  
##          d0031.binnedMissing  c0039.binnedmisc. level neg.  
##                       0.6738                       -0.5311  
##            c0039.binnedK6622    d0009.binned(142.234, Inf]  
##                      -0.7437                       -0.6915  
##            c0031.binned(3,5]          c0031.binned(5, Inf]  
##                      -0.0655                       -0.2476  
##      a8404.binned(0.33, Inf]           a8404.binnedMissing  
##                       0.2740                        0.3837  
##         d0027.binned(0, Inf]           c9008.binned(27,37]  
##                       0.5220                       -0.1182  
##        c9008.binned(37, Inf]           c9008.binnedMissing  
##                       0.1299                        0.1266  
## 
## Degrees of Freedom: 20999 Total (i.e. Null);  20986 Residual
## Null Deviance:       19100 
## Residual Deviance: 17600     AIC: 17700
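Since the model works on the log-odds (logit) scale, here is a hedged one-liner to turn its fitted values into default probabilities; prob_default is only an illustrative name:

# Convert the fitted log-odds into default probabilities
prob_default <- plogis(predict(mod1, newdata = BF_0))   # same as predict(mod1, BF_0, type = "response")
summary(prob_default)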

7 Correlation Check

7.1 Variance Inflation Factor

VIF numerically shows the degree of correlation between a predictor and the other attributes in a model. A general guideline is to exclude predictors with a VIF larger than 5; since none of the attributes exceed that threshold here, we don't exclude any of them.
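As a reminder of what is behind the number, VIF_j = 1 / (1 - R_j^2), where R_j^2 is obtained by regressing predictor j on all the other predictors. Below is a crude, hedged sketch for one binned attribute, treating its factor codes as numeric; the vif() call in the next chunk (presumably car::vif()) handles factor terms properly via the generalized VIF:

# Rough manual VIF for d0031.binned (illustration only)
r2_d0031 <- summary(
  lm(as.numeric(d0031.binned) ~ c0039.binned + d0009.binned + c0031.binned +
       a8404.binned + d0027.binned + c9008.binned, data = BF_0)
)$r.squared
1 / (1 - r2_d0031)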

corResults1 <- vif(mod1)   # vif() presumably from the car package (reports the GVIF for factor terms)

VIF <- as.data.frame(corResults1) %>%
  rownames_to_column("bin_attr") %>%
  flextable() %>%
  autofit() %>%
  bg(bg = "black", part = "header") %>%
  color(color = "white", part = "header") %>%
  align(align = "center", part = "all")
VIF

7.2 Correlation Matrix

# Pairwise correlations among a set of raw (unbinned) numeric predictors
train2 <- train %>% select(d0031, d0009, c0031, a8404, d0027, c0001)

corMatrix <- rcorr(as.matrix(train2))   # rcorr() from the Hmisc package

cor <- corMatrix$r

data.frame(cor) %>%
  rownames_to_column("attr") %>%
  flextable() %>%
  autofit() %>%
  bg(bg = "black", part = "header") %>%
  color(color = "white", part = "header") %>%
  align(align = "center", part = "all")

8 Model Report

Finally, we score the DB using the resulting model score. Then we create score ranges to build the behaviour table of the score according to the cuts, and we compute the GINI and KS values on the training database.

I attach an Excel file with the train behaviour table.

We should also score the test DB in order to evaluate the stability between both tables (train and test). For the purposes of this exercise, I will only work with the training data.
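The GINI and KS computation itself is not shown in the chunks that follow; as a hedged sketch, the two figures could be derived from the scored training data along these lines (Gini = 2 * AUC - 1, KS = maximum gap between the cumulative distributions of goods and bads; ks_gini is only an illustrative helper):

# Hedged sketch: KS and Gini from a score where higher values mean higher risk
ks_gini <- function(score, target) {
  ord  <- order(score, decreasing = TRUE)                    # riskiest cases first
  bad  <- c(0, cumsum(target[ord] == 1) / sum(target == 1))  # cumulative capture of bads
  good <- c(0, cumsum(target[ord] == 0) / sum(target == 0))  # cumulative capture of goods
  ks   <- max(abs(bad - good))
  auc  <- sum((head(bad, -1) + tail(bad, -1)) / 2 * diff(good))   # trapezoid rule on the ROC
  c(KS = ks, Gini = 2 * auc - 1)
}
# ks_gini(scoredData$Score, scoredData$ct090)   # scoredData is created just below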

# Score the training DB; predict() on a glm returns the linear predictor (log-odds) by default
scoredData_0 <- BF_0 %>%
  cbind(tibble(Score = predict(mod1, BF_0)))

# Split the score into 10 equal-sized ranges (deciles)
RANGE  <- ntile(scoredData_0$Score, 10)
RANGE1 <- as.data.frame(RANGE)
colnames(RANGE1) <- "RangoScore"

scoredData <- cbind(scoredData_0, RANGE1)
table(scoredData$RangoScore, scoredData$ct090)
##     
##         0    1
##   1  1993  107
##   2  1961  139
##   3  1923  177
##   4  1872  228
##   5  1802  298
##   6  1779  321
##   7  1707  393
##   8  1640  460
##   9  1469  631
##   10 1297  803
datatable(scoredData %>% 
  group_by(RangoScore) %>% 
  summarise(
    n = n()
    ,min=min(Score)
    ,max=max(Score)
    , nBad = sum(if_else(ct090 == 1 ,1,0))
    , nGood = sum(if_else(ct090 == 0 ,1,0))
  ) %>%
  mutate(
    pBad = nBad/n
    , pGood = nGood/n
  ) %>%
  arrange(RangoScore)) %>%
  formatRound(c("min","max","pBad", "pGood"), 3)
## Stability (PSI)

# The population stability index (PSI; in Spanish, Índice de Estabilidad Poblacional, IEP) measures how much a sample or variable changes between two samples or over time, based on its distribution.
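# Worked sanity check of the formula applied in the loop further below: each score
# range contributes (p_old - p_new) * ln(p_old / p_new) to the PSI of a pair of
# consecutive periods (hedged illustration; psi_bin is not part of the original code).
psi_bin <- function(p_old, p_new) (p_old - p_new) * log(p_old / p_new)
psi_bin(0.10, 0.12)   # ~ 0.0036 contribution from a single score range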

# Distribution of score ranges per month (share of cases in each range)
PSI_train_table <- scoredData %>%
  group_by(DATE_NEW, RangoScore) %>%
  summarise(n = n()) %>%
  mutate(pct = n / sum(n))
## `summarise()` has grouped output by 'DATE_NEW'. You can override using the `.groups` argument.
PSI_train_table_0 <- PSI_train_table %>%
  pivot_wider(id_cols = RangoScore,
              names_from = DATE_NEW,
              names_sort = TRUE,
              values_from = pct)

PSI_train_table_1 <- PSI_train_table_0[, -1]   # drop RangoScore, keep one column of shares per month

# With the results we can check whether there is any reason for the movement from January to February in the lower-risk profiles (ranges 1 and 2).

a <- matrix(NA, 10, 8)   # 10 score ranges x 8 pairs of consecutive months
dat_f_0 <- PSI_train_table_1

for (i in 1:8) {

  pct_old <- dat_f_0[, i]       # distribution in the earlier month
  pct     <- dat_f_0[, i + 1]   # distribution in the following month

  # Per-bin PSI contribution: (p_old - p_new) * ln(p_old / p_new)
  resultado <- ((pct_old - pct) * (log(pct_old / pct))) %>% as.matrix()
  a[, i] <- round(resultado, 4)
}

cortes <- unique(PSI_train_table$DATE_NEW)   # months present in the data

IEP <- as_tibble(a)
COLUMNAS <- paste0(cortes[1:(length(cortes) - 1)], "_", cortes[2:length(cortes)])  # e.g. 201701_201702
names(IEP) <- COLUMNAS
IEP_1 <- colSums(IEP)            # total PSI per pair of consecutive months
IEP <- rbind(IEP, IEP_1 * 100)   # append the totals, expressed as a percentage
rows <- t(data_frame("1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "IEP%"))   # data_frame() is deprecated; tibble() is the modern equivalent
IEP <- cbind(rows, IEP) %>% rename(RANGO_SC = rows)

IEP %>%
  flextable() %>%
  autofit() %>%
  bg(bg = "black", part = "header") %>%
  color(color = "white", part = "header") %>%
  align(align = "center", part = "all")