Predicting Credit Worthiness of a customer

This is an R Markdown document built to predict credit worthiness of a customer in some Financial Services. This model uses the famous “XgBoost” Ensemble technique to predict credit worthiness of a customer.

Removing stale objects from the environment

rm(list = ls())

Loading Required Libraries for data analysis and Modelling

library(ggplot2)
library(e1071)
library(gridExtra)
library(Boruta)
## Warning: package 'Boruta' was built under R version 3.3.2
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.3.2
library(Matrix)
library(caTools)
library(magrittr)
library(data.table)
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.3.2
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess

Reading Data from the git hub repository.

data=read.csv("https://raw.githubusercontent.com/imtiazBDSgit/TextAnalytics/master/ModelingData.txt",sep="\t",header=TRUE)

Structure of the data

dim(data)
## [1] 1002   32
str(data)
## 'data.frame':    1002 obs. of  32 variables:
##  $ OBS.            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CHK_ACCT        : int  0 1 3 0 0 3 3 1 3 1 ...
##  $ DURATION        : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ HISTORY         : int  4 2 4 2 3 2 2 2 2 4 ...
##  $ NEW_CAR         : int  0 0 0 0 1 0 0 0 0 1 ...
##  $ USED_CAR        : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ FURNITURE       : int  0 0 0 1 0 0 1 0 0 0 ...
##  $ RADIO.TV        : int  1 1 0 0 0 0 0 0 1 0 ...
##  $ EDUCATION       : int  0 0 1 0 0 1 0 0 0 0 ...
##  $ RETRAINING      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ AMOUNT          : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ SAV_ACCT        : int  4 0 0 0 0 4 2 0 3 0 ...
##  $ EMPLOYMENT      : int  4 2 3 3 2 2 4 2 3 0 ...
##  $ INSTALL_RATE    : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ MALE_DIV        : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ MALE_SINGLE     : int  1 0 1 1 1 1 1 1 0 0 ...
##  $ MALE_MAR_or_WID : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ CO.APPLICANT    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GUARANTOR       : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ PRESENT_RESIDENT: int  4 2 3 4 4 4 4 2 4 2 ...
##  $ REAL_ESTATE     : int  1 1 1 0 0 0 0 0 1 0 ...
##  $ PROP_UNKN_NONE  : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ AGE             : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ OTHER_INSTALL   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RENT            : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ OWN_RES         : int  1 1 1 0 0 0 1 0 1 1 ...
##  $ NUM_CREDITS     : int  2 1 1 1 2 1 1 1 1 2 ...
##  $ JOB             : int  2 2 1 2 2 1 2 3 1 3 ...
##  $ NUM_DEPENDENTS  : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ TELEPHONE       : int  1 0 0 0 0 1 0 1 0 0 ...
##  $ FOREIGN         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RESPONSE        : int  1 0 1 1 0 1 1 1 1 0 ...
head(data)
##   OBS. CHK_ACCT DURATION HISTORY NEW_CAR USED_CAR FURNITURE RADIO.TV
## 1    1        0        6       4       0        0         0        1
## 2    2        1       48       2       0        0         0        1
## 3    3        3       12       4       0        0         0        0
## 4    4        0       42       2       0        0         1        0
## 5    5        0       24       3       1        0         0        0
## 6    6        3       36       2       0        0         0        0
##   EDUCATION RETRAINING AMOUNT SAV_ACCT EMPLOYMENT INSTALL_RATE MALE_DIV
## 1         0          0   1169        4          4            4        0
## 2         0          0   5951        0          2            2        0
## 3         1          0   2096        0          3            2        0
## 4         0          0   7882        0          3            2        0
## 5         0          0   4870        0          2            3        0
## 6         1          0   9055        4          2            2        0
##   MALE_SINGLE MALE_MAR_or_WID CO.APPLICANT GUARANTOR PRESENT_RESIDENT
## 1           1               0            0         0                4
## 2           0               0            0         0                2
## 3           1               0            0         0                3
## 4           1               0            0         1                4
## 5           1               0            0         0                4
## 6           1               0            0         0                4
##   REAL_ESTATE PROP_UNKN_NONE AGE OTHER_INSTALL RENT OWN_RES NUM_CREDITS
## 1           1              0  67             0    0       1           2
## 2           1              0  22             0    0       1           1
## 3           1              0  49             0    0       1           1
## 4           0              0  45             0    0       0           1
## 5           0              1  53             0    0       0           2
## 6           0              1  35             0    0       0           1
##   JOB NUM_DEPENDENTS TELEPHONE FOREIGN RESPONSE
## 1   2              1         1       0        1
## 2   2              1         0       0        0
## 3   1              2         0       0        1
## 4   2              2         0       0        1
## 5   2              2         0       0        0
## 6   1              2         1       0        1

Summarizing the data , objective here is to remove NA values.

summary(data)
##       OBS.           CHK_ACCT        DURATION       HISTORY     
##  Min.   :   1.0   Min.   :0.000   Min.   : 4.0   Min.   :0.000  
##  1st Qu.: 250.8   1st Qu.:0.000   1st Qu.:12.0   1st Qu.:2.000  
##  Median : 500.5   Median :1.000   Median :18.0   Median :2.000  
##  Mean   : 500.5   Mean   :1.577   Mean   :20.9   Mean   :2.545  
##  3rd Qu.: 750.2   3rd Qu.:3.000   3rd Qu.:24.0   3rd Qu.:4.000  
##  Max.   :1000.0   Max.   :3.000   Max.   :72.0   Max.   :4.000  
##  NA's   :2        NA's   :2       NA's   :2      NA's   :2      
##     NEW_CAR         USED_CAR       FURNITURE        RADIO.TV   
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :0.00  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.00  
##  Median :0.000   Median :0.000   Median :0.000   Median :0.00  
##  Mean   :0.234   Mean   :0.103   Mean   :0.181   Mean   :0.28  
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:1.00  
##  Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :1.00  
##  NA's   :2       NA's   :2       NA's   :2       NA's   :2     
##    EDUCATION      RETRAINING        AMOUNT         SAV_ACCT    
##  Min.   :0.00   Min.   :0.000   Min.   :  250   Min.   :0.000  
##  1st Qu.:0.00   1st Qu.:0.000   1st Qu.: 1366   1st Qu.:0.000  
##  Median :0.00   Median :0.000   Median : 2320   Median :0.000  
##  Mean   :0.05   Mean   :0.097   Mean   : 3271   Mean   :1.105  
##  3rd Qu.:0.00   3rd Qu.:0.000   3rd Qu.: 3972   3rd Qu.:2.000  
##  Max.   :1.00   Max.   :1.000   Max.   :18424   Max.   :4.000  
##  NA's   :2      NA's   :2       NA's   :2       NA's   :2      
##    EMPLOYMENT     INSTALL_RATE      MALE_DIV     MALE_SINGLE   
##  Min.   :0.000   Min.   :1.000   Min.   :0.00   Min.   :0.000  
##  1st Qu.:2.000   1st Qu.:2.000   1st Qu.:0.00   1st Qu.:0.000  
##  Median :2.000   Median :3.000   Median :0.00   Median :1.000  
##  Mean   :2.384   Mean   :2.973   Mean   :0.05   Mean   :0.548  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:0.00   3rd Qu.:1.000  
##  Max.   :4.000   Max.   :4.000   Max.   :1.00   Max.   :1.000  
##  NA's   :2       NA's   :2       NA's   :2      NA's   :2      
##  MALE_MAR_or_WID  CO.APPLICANT     GUARANTOR     PRESENT_RESIDENT
##  Min.   :0.000   Min.   :0.000   Min.   :0.000   Min.   :1.000   
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:2.000   
##  Median :0.000   Median :0.000   Median :0.000   Median :3.000   
##  Mean   :0.092   Mean   :0.041   Mean   :0.052   Mean   :2.845   
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:4.000   
##  Max.   :1.000   Max.   :1.000   Max.   :1.000   Max.   :4.000   
##  NA's   :2       NA's   :2       NA's   :2       NA's   :2       
##   REAL_ESTATE    PROP_UNKN_NONE       AGE        OTHER_INSTALL  
##  Min.   :0.000   Min.   :0.000   Min.   :19.00   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:27.00   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :33.00   Median :0.000  
##  Mean   :0.282   Mean   :0.154   Mean   :35.55   Mean   :0.186  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:42.00   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.000   Max.   :75.00   Max.   :1.000  
##  NA's   :2       NA's   :2       NA's   :2       NA's   :2      
##       RENT          OWN_RES       NUM_CREDITS         JOB       
##  Min.   :0.000   Min.   :0.000   Min.   :1.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :0.000   Median :1.000   Median :1.000   Median :2.000  
##  Mean   :0.179   Mean   :0.713   Mean   :1.407   Mean   :1.904  
##  3rd Qu.:0.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :1.000   Max.   :1.000   Max.   :4.000   Max.   :3.000  
##  NA's   :2       NA's   :2       NA's   :2       NA's   :2      
##  NUM_DEPENDENTS    TELEPHONE        FOREIGN         RESPONSE  
##  Min.   :1.000   Min.   :0.000   Min.   :0.000   Min.   :0.0  
##  1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0  
##  Median :1.000   Median :0.000   Median :0.000   Median :1.0  
##  Mean   :1.155   Mean   :0.404   Mean   :0.037   Mean   :0.7  
##  3rd Qu.:1.000   3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:1.0  
##  Max.   :2.000   Max.   :1.000   Max.   :1.000   Max.   :1.0  
##  NA's   :2       NA's   :2       NA's   :2       NA's   :2

Since NA’s are only two in number in the whole data, it has little impact so we can safely remove them.

data= na.omit(data)

Categorical Features as mentioned in the code book.

categoricalFeatures=c('CHK_ACCT','HISTORY','SAV_ACCT','EMPLOYMENT','PRESENT_RESIDENT','JOB','NEW_CAR',  'USED_CAR', 'FURNITURE',    'RADIO.TV', 'EDUCATION',    'RETRAINING',   'MALE_DIV', 'MALE_SINGLE',  'MALE_MAR_or_WID',  'CO.APPLICANT', 'GUARANTOR',    'REAL_ESTATE',  'PROP_UNKN_NONE',   'OTHER_INSTALL',    'RENT', 'OWN_RES',  'TELEPHONE',    'FOREIGN',  'RESPONSE')

Making those values as factor values,removing observation column as it is mere index value.

data[,categoricalFeatures]=lapply(data[,categoricalFeatures],as.factor)
data=data[,-1]
data_numeric=data[ , !(names(data) %in% categoricalFeatures)]
data_categorical=data[ , (names(data) %in% categoricalFeatures)]

Some Reusable functions to do exploratory visual analysis.

#Plots histograms
plotHist <- function(data_in, i) {
  data <- data.frame(x=data_in[[i]])
  p <- ggplot(data=data, aes(x=factor(x))) + stat_count() + xlab(colnames(data_in)[i]) + theme_light() + 
    theme(axis.text.x = element_text(angle = 90, hjust =1))
  return (p)
}

doPlots <- function(data_in, fun, ii, ncol=3) {
  pp <- list()
  for (i in ii) {
    p <- fun(data_in=data_in, i=i)
    pp <- c(pp, list(p))
  }
  do.call("grid.arrange", c(pp, ncol=ncol))
}

#Plots density plots for numeric variables
plotDen <- function(data_in, i){
  data=data.frame(x=data_in[[i]])
  p <- ggplot(data= data) + geom_line(aes(x = x), stat = 'density', size = 1,alpha = 1.0) +
    xlab(paste0((colnames(data_in)[i]), '\n', 'Skewness: ',round(skewness(data_in[[i]], na.rm = TRUE), 2))) + theme_light() 
  return(p)
  
}

Barplots of categorical features.

doPlots(data_categorical, fun = plotHist, ii = 1:25, ncol = 5)

Insights

People with chk_acct status are very less compared other categories People were fairly compliant in paying back the bills , we can infer that from the credit History category 2 New cars are fairly less Used cars are fairly more. People have less furniture at their homes People having radio/tv are also pretty less Education levels is also low purpose of credit is also not retraining in most cases For most customers Account balances are low. People with 1-4 years experienced employees dominate male_divorced or male_married_widowed are less people dont have co-applicants and guarantor mostly Present residence fairly distributed own houses are more fairly a quarter of them own real estate people dont have telephones in great deal nature of the job is mostly skilled employees Most people dont rent. workers are local , foreign workers are less Credit rating is good dominated.

Numerical variable density plots

doPlots(data_numeric, fun = plotDen, ii = 1:6, ncol = 2)

#Insights Amount is right skewed distributed Age is right skewed distributed Rest all features dont have a particular pattern , mostly decreasing exception is install_rate

Feature Engineering Using Boruta package to reduce unimportant features and to retain most important

## Boruta performed 99 iterations in 2.230423 mins.
##  15 attributes confirmed important: AGE, AMOUNT, CHK_ACCT,
## DURATION, EMPLOYMENT and 10 more.
##  13 attributes confirmed unimportant: CO.APPLICANT, EDUCATION,
## FOREIGN, FURNITURE, MALE_DIV and 8 more.
##  2 tentative attributes left: JOB, NUM_CREDITS.

plotting feature importance

plot(boruta.train, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(boruta.train$ImpHistory),function(i)
  boruta.train$ImpHistory[is.finite(boruta.train$ImpHistory[,i]),i])
names(lz) <- colnames(boruta.train$ImpHistory)
Labels <- sort(sapply(lz,median))
 axis(side = 1,las=2,labels = names(Labels),
       at = 1:ncol(boruta.train$ImpHistory), cex.axis = 0.7)

Model Building using Xgboost as the algorithm is seen to perform the best because ofEnsembling techniques.

Filtering out important features which we got from boruta package.

TotalData=data[ , (names(data) %in% importantFeatures)]
 str(TotalData)
## 'data.frame':    1000 obs. of  15 variables:
##  $ CHK_ACCT      : Factor w/ 4 levels "0","1","2","3": 1 2 4 1 1 4 4 2 4 2 ...
##  $ DURATION      : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ HISTORY       : Factor w/ 5 levels "0","1","2","3",..: 5 3 5 3 4 3 3 3 3 5 ...
##  $ NEW_CAR       : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 2 ...
##  $ USED_CAR      : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
##  $ AMOUNT        : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ SAV_ACCT      : Factor w/ 5 levels "0","1","2","3",..: 5 1 1 1 1 5 3 1 4 1 ...
##  $ EMPLOYMENT    : Factor w/ 5 levels "0","1","2","3",..: 5 3 4 4 3 3 5 3 4 1 ...
##  $ INSTALL_RATE  : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ GUARANTOR     : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
##  $ REAL_ESTATE   : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 2 1 ...
##  $ PROP_UNKN_NONE: Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 1 1 1 ...
##  $ AGE           : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ OTHER_INSTALL : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OWN_RES       : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 2 1 2 2 ...
 TotalData$RESPONSE=data$RESPONSE

Data preparation for Xgboost Model, Since the model accepts only numeric data.

 set.seed(101) 
 sparse_matrix <- sparse.model.matrix(RESPONSE~.-1, data = TotalData)
 colnames(sparse_matrix)
##  [1] "CHK_ACCT0"       "CHK_ACCT1"       "CHK_ACCT2"      
##  [4] "CHK_ACCT3"       "DURATION"        "HISTORY1"       
##  [7] "HISTORY2"        "HISTORY3"        "HISTORY4"       
## [10] "NEW_CAR1"        "USED_CAR1"       "AMOUNT"         
## [13] "SAV_ACCT1"       "SAV_ACCT2"       "SAV_ACCT3"      
## [16] "SAV_ACCT4"       "EMPLOYMENT1"     "EMPLOYMENT2"    
## [19] "EMPLOYMENT3"     "EMPLOYMENT4"     "INSTALL_RATE"   
## [22] "GUARANTOR1"      "REAL_ESTATE1"    "PROP_UNKN_NONE1"
## [25] "AGE"             "OTHER_INSTALL1"  "OWN_RES1"
 dim(sparse_matrix)
## [1] 1000   27
 sample = sample.split(TotalData, SplitRatio = .75)
 trainLabel =  subset(TotalData, sample == TRUE)$RESPONSE
 testLabel  =  subset(TotalData, sample == FALSE)$RESPONSE
 trainMatrix =sparse_matrix[sample == TRUE,]
 testMatrix  =sparse_matrix[sample == FALSE,]
 trainLabel = trainLabel==1
 testLabel=testLabel==1

Training the Model with prepared Data

bst <- xgboost(data = trainMatrix, label=trainLabel,max_depth = 8,
                eta = 1, nthread = 2, nrounds = 20,objective = "binary:logistic")
## [0]  train-error:0.138482
## [1]  train-error:0.107856
## [2]  train-error:0.081225
## [3]  train-error:0.057257
## [4]  train-error:0.041278
## [5]  train-error:0.026631
## [6]  train-error:0.018642
## [7]  train-error:0.005326
## [8]  train-error:0.002663
## [9]  train-error:0.003995
## [10] train-error:0.002663
## [11] train-error:0.000000
## [12] train-error:0.000000
## [13] train-error:0.000000
## [14] train-error:0.000000
## [15] train-error:0.000000
## [16] train-error:0.000000
## [17] train-error:0.000000
## [18] train-error:0.000000
## [19] train-error:0.000000

Model Evaluation on test set.

y_pred <- predict(bst, testMatrix)
 
 table(testLabel,y_pred>0.5)
##          
## testLabel FALSE TRUE
##     FALSE    42   39
##     TRUE     36  132
 pred <- prediction(y_pred, testLabel)
 perf <- performance(pred, measure = "tpr", x.measure = "fpr") 
 plot(perf, col=rainbow(10))

Auc Calculations

auc = as.numeric(performance(pred, "auc")@y.values)
 auc
## [1] 0.739565

Conclusion

The credit worthiness model gives an accuracy of 74 percent with a good AUC. This model can be used to predict credit worthiness for the streaming data with the parameters mentioned.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.