This is an R Markdown document built to predict credit worthiness of a customer in some Financial Services. This model uses the famous “XgBoost” Ensemble technique to predict credit worthiness of a customer.
rm(list = ls())
library(ggplot2)
library(e1071)
library(gridExtra)
library(Boruta)
## Warning: package 'Boruta' was built under R version 3.3.2
## Loading required package: ranger
## Warning: package 'ranger' was built under R version 3.3.2
library(Matrix)
library(caTools)
library(magrittr)
library(data.table)
library(xgboost)
## Warning: package 'xgboost' was built under R version 3.3.2
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
data=read.csv("https://raw.githubusercontent.com/imtiazBDSgit/TextAnalytics/master/ModelingData.txt",sep="\t",header=TRUE)
dim(data)
## [1] 1002 32
str(data)
## 'data.frame': 1002 obs. of 32 variables:
## $ OBS. : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CHK_ACCT : int 0 1 3 0 0 3 3 1 3 1 ...
## $ DURATION : int 6 48 12 42 24 36 24 36 12 30 ...
## $ HISTORY : int 4 2 4 2 3 2 2 2 2 4 ...
## $ NEW_CAR : int 0 0 0 0 1 0 0 0 0 1 ...
## $ USED_CAR : int 0 0 0 0 0 0 0 1 0 0 ...
## $ FURNITURE : int 0 0 0 1 0 0 1 0 0 0 ...
## $ RADIO.TV : int 1 1 0 0 0 0 0 0 1 0 ...
## $ EDUCATION : int 0 0 1 0 0 1 0 0 0 0 ...
## $ RETRAINING : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AMOUNT : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ SAV_ACCT : int 4 0 0 0 0 4 2 0 3 0 ...
## $ EMPLOYMENT : int 4 2 3 3 2 2 4 2 3 0 ...
## $ INSTALL_RATE : int 4 2 2 2 3 2 3 2 2 4 ...
## $ MALE_DIV : int 0 0 0 0 0 0 0 0 1 0 ...
## $ MALE_SINGLE : int 1 0 1 1 1 1 1 1 0 0 ...
## $ MALE_MAR_or_WID : int 0 0 0 0 0 0 0 0 0 1 ...
## $ CO.APPLICANT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ GUARANTOR : int 0 0 0 1 0 0 0 0 0 0 ...
## $ PRESENT_RESIDENT: int 4 2 3 4 4 4 4 2 4 2 ...
## $ REAL_ESTATE : int 1 1 1 0 0 0 0 0 1 0 ...
## $ PROP_UNKN_NONE : int 0 0 0 0 1 1 0 0 0 0 ...
## $ AGE : int 67 22 49 45 53 35 53 35 61 28 ...
## $ OTHER_INSTALL : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RENT : int 0 0 0 0 0 0 0 1 0 0 ...
## $ OWN_RES : int 1 1 1 0 0 0 1 0 1 1 ...
## $ NUM_CREDITS : int 2 1 1 1 2 1 1 1 1 2 ...
## $ JOB : int 2 2 1 2 2 1 2 3 1 3 ...
## $ NUM_DEPENDENTS : int 1 1 2 2 2 2 1 1 1 1 ...
## $ TELEPHONE : int 1 0 0 0 0 1 0 1 0 0 ...
## $ FOREIGN : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RESPONSE : int 1 0 1 1 0 1 1 1 1 0 ...
head(data)
## OBS. CHK_ACCT DURATION HISTORY NEW_CAR USED_CAR FURNITURE RADIO.TV
## 1 1 0 6 4 0 0 0 1
## 2 2 1 48 2 0 0 0 1
## 3 3 3 12 4 0 0 0 0
## 4 4 0 42 2 0 0 1 0
## 5 5 0 24 3 1 0 0 0
## 6 6 3 36 2 0 0 0 0
## EDUCATION RETRAINING AMOUNT SAV_ACCT EMPLOYMENT INSTALL_RATE MALE_DIV
## 1 0 0 1169 4 4 4 0
## 2 0 0 5951 0 2 2 0
## 3 1 0 2096 0 3 2 0
## 4 0 0 7882 0 3 2 0
## 5 0 0 4870 0 2 3 0
## 6 1 0 9055 4 2 2 0
## MALE_SINGLE MALE_MAR_or_WID CO.APPLICANT GUARANTOR PRESENT_RESIDENT
## 1 1 0 0 0 4
## 2 0 0 0 0 2
## 3 1 0 0 0 3
## 4 1 0 0 1 4
## 5 1 0 0 0 4
## 6 1 0 0 0 4
## REAL_ESTATE PROP_UNKN_NONE AGE OTHER_INSTALL RENT OWN_RES NUM_CREDITS
## 1 1 0 67 0 0 1 2
## 2 1 0 22 0 0 1 1
## 3 1 0 49 0 0 1 1
## 4 0 0 45 0 0 0 1
## 5 0 1 53 0 0 0 2
## 6 0 1 35 0 0 0 1
## JOB NUM_DEPENDENTS TELEPHONE FOREIGN RESPONSE
## 1 2 1 1 0 1
## 2 2 1 0 0 0
## 3 1 2 0 0 1
## 4 2 2 0 0 1
## 5 2 2 0 0 0
## 6 1 2 1 0 1
summary(data)
## OBS. CHK_ACCT DURATION HISTORY
## Min. : 1.0 Min. :0.000 Min. : 4.0 Min. :0.000
## 1st Qu.: 250.8 1st Qu.:0.000 1st Qu.:12.0 1st Qu.:2.000
## Median : 500.5 Median :1.000 Median :18.0 Median :2.000
## Mean : 500.5 Mean :1.577 Mean :20.9 Mean :2.545
## 3rd Qu.: 750.2 3rd Qu.:3.000 3rd Qu.:24.0 3rd Qu.:4.000
## Max. :1000.0 Max. :3.000 Max. :72.0 Max. :4.000
## NA's :2 NA's :2 NA's :2 NA's :2
## NEW_CAR USED_CAR FURNITURE RADIO.TV
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :0.00
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00
## Median :0.000 Median :0.000 Median :0.000 Median :0.00
## Mean :0.234 Mean :0.103 Mean :0.181 Mean :0.28
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:1.00
## Max. :1.000 Max. :1.000 Max. :1.000 Max. :1.00
## NA's :2 NA's :2 NA's :2 NA's :2
## EDUCATION RETRAINING AMOUNT SAV_ACCT
## Min. :0.00 Min. :0.000 Min. : 250 Min. :0.000
## 1st Qu.:0.00 1st Qu.:0.000 1st Qu.: 1366 1st Qu.:0.000
## Median :0.00 Median :0.000 Median : 2320 Median :0.000
## Mean :0.05 Mean :0.097 Mean : 3271 Mean :1.105
## 3rd Qu.:0.00 3rd Qu.:0.000 3rd Qu.: 3972 3rd Qu.:2.000
## Max. :1.00 Max. :1.000 Max. :18424 Max. :4.000
## NA's :2 NA's :2 NA's :2 NA's :2
## EMPLOYMENT INSTALL_RATE MALE_DIV MALE_SINGLE
## Min. :0.000 Min. :1.000 Min. :0.00 Min. :0.000
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:0.00 1st Qu.:0.000
## Median :2.000 Median :3.000 Median :0.00 Median :1.000
## Mean :2.384 Mean :2.973 Mean :0.05 Mean :0.548
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:0.00 3rd Qu.:1.000
## Max. :4.000 Max. :4.000 Max. :1.00 Max. :1.000
## NA's :2 NA's :2 NA's :2 NA's :2
## MALE_MAR_or_WID CO.APPLICANT GUARANTOR PRESENT_RESIDENT
## Min. :0.000 Min. :0.000 Min. :0.000 Min. :1.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:2.000
## Median :0.000 Median :0.000 Median :0.000 Median :3.000
## Mean :0.092 Mean :0.041 Mean :0.052 Mean :2.845
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:4.000
## Max. :1.000 Max. :1.000 Max. :1.000 Max. :4.000
## NA's :2 NA's :2 NA's :2 NA's :2
## REAL_ESTATE PROP_UNKN_NONE AGE OTHER_INSTALL
## Min. :0.000 Min. :0.000 Min. :19.00 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:27.00 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :33.00 Median :0.000
## Mean :0.282 Mean :0.154 Mean :35.55 Mean :0.186
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:42.00 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :75.00 Max. :1.000
## NA's :2 NA's :2 NA's :2 NA's :2
## RENT OWN_RES NUM_CREDITS JOB
## Min. :0.000 Min. :0.000 Min. :1.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:2.000
## Median :0.000 Median :1.000 Median :1.000 Median :2.000
## Mean :0.179 Mean :0.713 Mean :1.407 Mean :1.904
## 3rd Qu.:0.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :1.000 Max. :1.000 Max. :4.000 Max. :3.000
## NA's :2 NA's :2 NA's :2 NA's :2
## NUM_DEPENDENTS TELEPHONE FOREIGN RESPONSE
## Min. :1.000 Min. :0.000 Min. :0.000 Min. :0.0
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0
## Median :1.000 Median :0.000 Median :0.000 Median :1.0
## Mean :1.155 Mean :0.404 Mean :0.037 Mean :0.7
## 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:1.0
## Max. :2.000 Max. :1.000 Max. :1.000 Max. :1.0
## NA's :2 NA's :2 NA's :2 NA's :2
data= na.omit(data)
categoricalFeatures=c('CHK_ACCT','HISTORY','SAV_ACCT','EMPLOYMENT','PRESENT_RESIDENT','JOB','NEW_CAR', 'USED_CAR', 'FURNITURE', 'RADIO.TV', 'EDUCATION', 'RETRAINING', 'MALE_DIV', 'MALE_SINGLE', 'MALE_MAR_or_WID', 'CO.APPLICANT', 'GUARANTOR', 'REAL_ESTATE', 'PROP_UNKN_NONE', 'OTHER_INSTALL', 'RENT', 'OWN_RES', 'TELEPHONE', 'FOREIGN', 'RESPONSE')
data[,categoricalFeatures]=lapply(data[,categoricalFeatures],as.factor)
data=data[,-1]
data_numeric=data[ , !(names(data) %in% categoricalFeatures)]
data_categorical=data[ , (names(data) %in% categoricalFeatures)]
#Plots histograms
plotHist <- function(data_in, i) {
data <- data.frame(x=data_in[[i]])
p <- ggplot(data=data, aes(x=factor(x))) + stat_count() + xlab(colnames(data_in)[i]) + theme_light() +
theme(axis.text.x = element_text(angle = 90, hjust =1))
return (p)
}
doPlots <- function(data_in, fun, ii, ncol=3) {
pp <- list()
for (i in ii) {
p <- fun(data_in=data_in, i=i)
pp <- c(pp, list(p))
}
do.call("grid.arrange", c(pp, ncol=ncol))
}
#Plots density plots for numeric variables
plotDen <- function(data_in, i){
data=data.frame(x=data_in[[i]])
p <- ggplot(data= data) + geom_line(aes(x = x), stat = 'density', size = 1,alpha = 1.0) +
xlab(paste0((colnames(data_in)[i]), '\n', 'Skewness: ',round(skewness(data_in[[i]], na.rm = TRUE), 2))) + theme_light()
return(p)
}
doPlots(data_categorical, fun = plotHist, ii = 1:25, ncol = 5)
People with chk_acct status are very less compared other categories People were fairly compliant in paying back the bills , we can infer that from the credit History category 2 New cars are fairly less Used cars are fairly more. People have less furniture at their homes People having radio/tv are also pretty less Education levels is also low purpose of credit is also not retraining in most cases For most customers Account balances are low. People with 1-4 years experienced employees dominate male_divorced or male_married_widowed are less people dont have co-applicants and guarantor mostly Present residence fairly distributed own houses are more fairly a quarter of them own real estate people dont have telephones in great deal nature of the job is mostly skilled employees Most people dont rent. workers are local , foreign workers are less Credit rating is good dominated.
doPlots(data_numeric, fun = plotDen, ii = 1:6, ncol = 2)
#Insights Amount is right skewed distributed Age is right skewed distributed Rest all features dont have a particular pattern , mostly decreasing exception is install_rate
## Boruta performed 99 iterations in 2.230423 mins.
## 15 attributes confirmed important: AGE, AMOUNT, CHK_ACCT,
## DURATION, EMPLOYMENT and 10 more.
## 13 attributes confirmed unimportant: CO.APPLICANT, EDUCATION,
## FOREIGN, FURNITURE, MALE_DIV and 8 more.
## 2 tentative attributes left: JOB, NUM_CREDITS.
plot(boruta.train, xlab = "", xaxt = "n")
lz<-lapply(1:ncol(boruta.train$ImpHistory),function(i)
boruta.train$ImpHistory[is.finite(boruta.train$ImpHistory[,i]),i])
names(lz) <- colnames(boruta.train$ImpHistory)
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(boruta.train$ImpHistory), cex.axis = 0.7)
Filtering out important features which we got from boruta package.
TotalData=data[ , (names(data) %in% importantFeatures)]
str(TotalData)
## 'data.frame': 1000 obs. of 15 variables:
## $ CHK_ACCT : Factor w/ 4 levels "0","1","2","3": 1 2 4 1 1 4 4 2 4 2 ...
## $ DURATION : int 6 48 12 42 24 36 24 36 12 30 ...
## $ HISTORY : Factor w/ 5 levels "0","1","2","3",..: 5 3 5 3 4 3 3 3 3 5 ...
## $ NEW_CAR : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 2 ...
## $ USED_CAR : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## $ AMOUNT : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ SAV_ACCT : Factor w/ 5 levels "0","1","2","3",..: 5 1 1 1 1 5 3 1 4 1 ...
## $ EMPLOYMENT : Factor w/ 5 levels "0","1","2","3",..: 5 3 4 4 3 3 5 3 4 1 ...
## $ INSTALL_RATE : int 4 2 2 2 3 2 3 2 2 4 ...
## $ GUARANTOR : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ REAL_ESTATE : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 2 1 ...
## $ PROP_UNKN_NONE: Factor w/ 2 levels "0","1": 1 1 1 1 2 2 1 1 1 1 ...
## $ AGE : int 67 22 49 45 53 35 53 35 61 28 ...
## $ OTHER_INSTALL : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ OWN_RES : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 2 1 2 2 ...
TotalData$RESPONSE=data$RESPONSE
set.seed(101)
sparse_matrix <- sparse.model.matrix(RESPONSE~.-1, data = TotalData)
colnames(sparse_matrix)
## [1] "CHK_ACCT0" "CHK_ACCT1" "CHK_ACCT2"
## [4] "CHK_ACCT3" "DURATION" "HISTORY1"
## [7] "HISTORY2" "HISTORY3" "HISTORY4"
## [10] "NEW_CAR1" "USED_CAR1" "AMOUNT"
## [13] "SAV_ACCT1" "SAV_ACCT2" "SAV_ACCT3"
## [16] "SAV_ACCT4" "EMPLOYMENT1" "EMPLOYMENT2"
## [19] "EMPLOYMENT3" "EMPLOYMENT4" "INSTALL_RATE"
## [22] "GUARANTOR1" "REAL_ESTATE1" "PROP_UNKN_NONE1"
## [25] "AGE" "OTHER_INSTALL1" "OWN_RES1"
dim(sparse_matrix)
## [1] 1000 27
sample = sample.split(TotalData, SplitRatio = .75)
trainLabel = subset(TotalData, sample == TRUE)$RESPONSE
testLabel = subset(TotalData, sample == FALSE)$RESPONSE
trainMatrix =sparse_matrix[sample == TRUE,]
testMatrix =sparse_matrix[sample == FALSE,]
trainLabel = trainLabel==1
testLabel=testLabel==1
bst <- xgboost(data = trainMatrix, label=trainLabel,max_depth = 8,
eta = 1, nthread = 2, nrounds = 20,objective = "binary:logistic")
## [0] train-error:0.138482
## [1] train-error:0.107856
## [2] train-error:0.081225
## [3] train-error:0.057257
## [4] train-error:0.041278
## [5] train-error:0.026631
## [6] train-error:0.018642
## [7] train-error:0.005326
## [8] train-error:0.002663
## [9] train-error:0.003995
## [10] train-error:0.002663
## [11] train-error:0.000000
## [12] train-error:0.000000
## [13] train-error:0.000000
## [14] train-error:0.000000
## [15] train-error:0.000000
## [16] train-error:0.000000
## [17] train-error:0.000000
## [18] train-error:0.000000
## [19] train-error:0.000000
y_pred <- predict(bst, testMatrix)
table(testLabel,y_pred>0.5)
##
## testLabel FALSE TRUE
## FALSE 42 39
## TRUE 36 132
pred <- prediction(y_pred, testLabel)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, col=rainbow(10))
auc = as.numeric(performance(pred, "auc")@y.values)
auc
## [1] 0.739565
The credit worthiness model gives an accuracy of 74 percent with a good AUC. This model can be used to predict credit worthiness for the streaming data with the parameters mentioned.
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.