Introduction

This document is an example of using multiple R packages to build models for a credit-screening problem. We will use logistic regression, GBM, KNN, and XGBoost, and compare the performance of the different algorithms.

The dataset is from the UCI Machine Learning Repository.

Load and clean the dataset

library(data.table)
dataset = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",sep=",", header=F, na.strings="?")
head(dataset)
##   V1    V2    V3 V4 V5 V6 V7   V8 V9 V10 V11 V12 V13 V14 V15 V16
## 1  b 30.83 0.000  u  g  w  v 1.25  t   t   1   f   g 202   0   +
## 2  a 58.67 4.460  u  g  q  h 3.04  t   t   6   f   g  43 560   +
## 3  a 24.50 0.500  u  g  q  h 1.50  t   f   0   f   g 280 824   +
## 4  b 27.83 1.540  u  g  w  v 3.75  t   t   5   t   g 100   3   +
## 5  b 20.17 5.625  u  g  w  v 1.71  t   f   0   f   s 120   0   +
## 6  b 32.08 4.000  u  g  m  v 2.50  t   f   0   t   g 360   0   +

As the dataset page shows, there are 15 attributes, 690 observations, and one target variable: V16.
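
As a quick sanity check (not part of the original output), we can also look at how balanced the target variable is before modeling:

table(dataset$V16)              # counts of "-" and "+"
prop.table(table(dataset$V16))  # class proportions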

Then we check the number of missing values for each variable and the type of each variable, and keep only the complete observations.

sapply(dataset, function(x) sum(is.na(x)))
##  V1  V2  V3  V4  V5  V6  V7  V8  V9 V10 V11 V12 V13 V14 V15 V16 
##  12  12   0   6   6   9   9   0   0   0   0   0   0  13   0   0
sapply(dataset, class)
##        V1        V2        V3        V4        V5        V6        V7 
##  "factor" "numeric" "numeric"  "factor"  "factor"  "factor"  "factor" 
##        V8        V9       V10       V11       V12       V13       V14 
## "numeric"  "factor"  "factor" "integer"  "factor"  "factor" "integer" 
##       V15       V16 
## "integer"  "factor"
dataset=na.omit(dataset)
dim(dataset)
## [1] 653  16
library(knitr)
kable(dataset[1:5,], digits = 2, align = 'c', caption = "Cleaned Data")
Cleaned Data

V1    V2     V3    V4   V5   V6   V7    V8    V9   V10   V11   V12   V13   V14   V15   V16
b    30.83  0.00   u    g    w    v    1.25   t     t     1     f     g    202    0     +
a    58.67  4.46   u    g    q    h    3.04   t     t     6     f     g     43   560    +
a    24.50  0.50   u    g    q    h    1.50   t     f     0     f     g    280   824    +
b    27.83  1.54   u    g    w    v    3.75   t     t     5     t     g    100    3     +
b    20.17  5.62   u    g    w    v    1.71   t     f     0     f     s    120    0     +

Prepare the training and testing subsets from the dataset.

set.seed(123)
n = dim(dataset)[1]
index = sample(n, round(0.7*n))
train = dataset[index,]
test=dataset[-index,]
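
It can also be worth confirming that the random split left both subsets with a similar class balance; a minimal check, assuming the train and test objects created above:

dim(train); dim(test)           # sizes of the two subsets
prop.table(table(train$V16))    # class proportions in the training set
prop.table(table(test$V16))     # class proportions in the testing set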

Prepare a dataset with numeric dummy variables

As XGBoost only works with numeric vectors (see the XGBoost example here), we need to transform the original dataset into a new dataset with dummy variables corresponding to each categorical variable.

First, we set up a function that converts a categorical variable into numeric dummy variables based on its factor levels.

dataset2 = dataset
library(plyr)

into_factor = function(x){
    if(class(x) == "factor"){
        # convert the factor into a dummy-variable matrix and drop the intercept column
        n = length(x)
        data.fac = data.frame(x=x, y=1:n)
        output = model.matrix(y~x, data.fac)[,-1]
    } else {
        output = x
    }
    output
}

For example:

levels(dataset$V4)
## [1] "l" "u" "y"
into_factor(dataset$V4)[1:5, ]
##   xu xy
## 1  1  0
## 2  1  0
## 3  1  0
## 4  1  0
## 5  1  0

We apply this function to each column of the original dataset.

dataset2 = colwise(into_factor)(dataset2)
dataset2 = do.call(cbind, dataset2)
dataset2 = as.data.frame(dataset2)
head(dataset2)
##   V1    V2    V3 xu xy xgg xp xc xcc xd xe xff xi xj xk xm xq xr xw xx xdd
## 1  1 30.83 0.000  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0
## 2  0 58.67 4.460  1  0   0  0  0   0  0  0   0  0  0  0  0  1  0  0  0   0
## 3  0 24.50 0.500  1  0   0  0  0   0  0  0   0  0  0  0  0  1  0  0  0   0
## 4  1 27.83 1.540  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0
## 5  1 20.17 5.625  1  0   0  0  0   0  0  0   0  0  0  0  0  0  0  1  0   0
## 6  1 32.08 4.000  1  0   0  0  0   0  0  0   0  0  0  0  1  0  0  0  0   0
##   xff xh xj xn xo xv xz   V8 V9 V10 V11 V12 xp xs V14 V15 V16
## 1   0  0  0  0  0  1  0 1.25  1   1   1   0  0  0 202   0   1
## 2   0  1  0  0  0  0  0 3.04  1   1   6   0  0  0  43 560   1
## 3   0  1  0  0  0  0  0 1.50  1   0   0   0  0  0 280 824   1
## 4   0  0  0  0  0  1  0 3.75  1   1   5   1  0  0 100   3   1
## 5   0  0  0  0  0  1  0 1.71  1   0   0   0  0  1 120   0   1
## 6   0  0  0  0  0  1  0 2.50  1   0   0   1  0  0 360   0   1
dim(dataset2)
## [1] 653  38

We can see that after converting all categorical variables into dummy variables, dataset2 has 37 predictor columns plus the target V16, i.e. 38 columns in total.
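
As an aside, caret's dummyVars offers a more compact way to build the same kind of design matrix; this is only a sketch of the idea and is not used in the rest of this document (fullRank = TRUE drops the baseline level of each factor, mirroring the model.matrix approach above):

library(caret)
# encode every categorical predictor as dummy variables, keeping V16 as the outcome
dummies = dummyVars(V16 ~ ., data = dataset, fullRank = TRUE)
dataset2.alt = data.frame(predict(dummies, newdata = dataset), V16 = dataset$V16)
dim(dataset2.alt)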

Logistic regression

logit.model = glm(V16~., data=train, family="binomial")
logit.response = predict(logit.model, test, type="response")

logit.predict = ifelse(logit.response>0.5, "+", "-")
table(logit.predict, test$V16)
##              
## logit.predict  -  +
##             - 90 24
##             + 13 69
logit.accuracy = mean(logit.predict == test$V16)
logit.accuracy
## [1] 0.8112245

For the logistic regression model, we use the glm function with all variables included. The accuracy on the test set is about 81%.
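
Accuracy at a fixed 0.5 cutoff is only one view of performance. If the pROC package is available, we could also compute the AUC of the predicted probabilities (a sketch; the output is not shown here):

library(pROC)
# ROC curve and AUC for the logistic regression probabilities on the test set
logit.roc = roc(test$V16, logit.response)
auc(logit.roc)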

GBM

library(caret)

ctrl = trainControl(method = "repeatedcv", number=5, repeats=5, summaryFunction=defaultSummary)
set.seed(300)
m_gbm = train(V16~., data=train,method="gbm", metric="Kappa", trControl=ctrl, verbose=FALSE)

summary(m_gbm)

##       var     rel.inf
## V9t   V9t 59.65369396
## V15   V15  8.47256012
## V3     V3  6.46331973
## V8     V8  6.37989139
## V14   V14  5.89377765
## V10t V10t  3.90581554
## V2     V2  2.30351402
## V4u   V4u  2.12921196
## V11   V11  1.56929456
## V6x   V6x  1.10322874
## V6i   V6i  0.39393513
## V6k   V6k  0.31773365
## V6cc V6cc  0.29549175
## V6w   V6w  0.21387594
## V7h   V7h  0.19600019
## V1b   V1b  0.14602200
## V6q   V6q  0.13569187
## V6ff V6ff  0.11990625
## V4y   V4y  0.11207611
## V6c   V6c  0.10480060
## V7ff V7ff  0.09015884
## V5gg V5gg  0.00000000
## V5p   V5p  0.00000000
## V6d   V6d  0.00000000
## V6e   V6e  0.00000000
## V6j   V6j  0.00000000
## V6m   V6m  0.00000000
## V6r   V6r  0.00000000
## V7dd V7dd  0.00000000
## V7j   V7j  0.00000000
## V7n   V7n  0.00000000
## V7o   V7o  0.00000000
## V7v   V7v  0.00000000
## V7z   V7z  0.00000000
## V12t V12t  0.00000000
## V13p V13p  0.00000000
## V13s V13s  0.00000000
plot(m_gbm)

gbm.predict = predict(m_gbm, test)
table(gbm.predict, test$V16)
##            
## gbm.predict  -  +
##           - 92 15
##           + 11 78
gbm.accuracy = mean(gbm.predict == test$V16)
gbm.accuracy
## [1] 0.8673469

For the GBM model, we used the gbm method in the caret package with 5-fold cross-validation repeated 5 times. The accuracy on the test set is about 87%.
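
To see which boosting hyperparameters caret actually selected, the fitted train object stores them; for example (assuming the m_gbm object from above):

m_gbm$bestTune   # tuning parameters chosen by the repeated cross-validation
m_gbm$results    # resampled performance for every candidate setting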

KNN

Version 1.

We first apply the knn3 method directly, with no cross-validation, no scaling, and no dummy coding of the categorical variables. We set k = 5.

knn.model = knn3(V16~., data=train, k=5)
knn.response1 = predict(knn.model, test, type="prob")   # class probability matrix
knn.predict1 = ifelse(knn.response1[,1]<0.5, "+", "-")  # column 1 is the "-" class

table(knn.predict1, test$V16)
##             
## knn.predict1  -  +
##            - 78 48
##            + 25 45
mean(knn.predict1 == test$V16)
## [1] 0.627551

Its accuracy is only about 63%.

Version 2.

The second version applies KNN after scaling and converting the categorical variables into dummy variables. Here, we use dataset2.

knn.dataset = cbind(colwise(scale)(dataset2[,-38]), V16=as.factor(dataset2$V16))
set.seed(123)

index = sample(n, round(0.7*n))
train.knn = knn.dataset[index,]
test.knn = knn.dataset[-index,]

knn.model2 = knn3(V16~., data=train.knn, k=5)

knn.predict2 = predict(knn.model2, test.knn, type="class")
table(knn.predict2, test.knn$V16)
##             
## knn.predict2  0  1
##            0 89 32
##            1 14 61
mean(knn.predict2 == test.knn$V16)
## [1] 0.7653061

Its accuracy is about 77%.

Version 3.

We tune k by searching over a range of values, evaluating each choice directly on the training and testing datasets.

knn_train_test = function(train, test, k=5){
    knn.model.f = knn3(V16~., data=train, k=k)
    knn.predict.f = predict(knn.model.f, test, type="class")
    mean(knn.predict.f == test$V16)
}

x = 1:20
result = sapply(x, function(x) knn_train_test(train.knn, test.knn, k=x))

library(ggplot2)  # attached by caret, but loaded explicitly here
temp = data.frame("k"=x,"accuracy"=result)
p = ggplot(temp, aes(x = k, y=accuracy)) + geom_point() + geom_smooth(method = "lm", formula = y ~ splines::bs(x, 3))
p 

#plot(x,result,type="b")

k.final = which.max(result)   # k with the highest test-set accuracy
knn.accuracy = knn_train_test(train.knn, test.knn, k = k.final)
knn.accuracy
## [1] 0.75

The best k is around 4 to 5, and the corresponding accuracy is about 75%.
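
Note that picking k on the test set risks overfitting to it. A cross-validated alternative (only a sketch, not used for the results above) would tune k on the training data alone, reusing the ctrl object from the GBM section:

set.seed(300)
# tune k by repeated cross-validation on the training data only
m_knn = train(V16 ~ ., data = train.knn, method = "knn",
              trControl = ctrl, tuneGrid = data.frame(k = 1:20))
m_knn$bestTune
mean(predict(m_knn, test.knn) == test.knn$V16)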

XGBoost

Load the libraries.

library(xgboost)
library(methods)

Prepare the datasets.

set.seed(123)
index = sample(n, round(0.7*n))
train.xg = dataset2[index,]
test.xg = dataset2[-index,]
label = as.matrix(train.xg[,38, drop=F])
data = as.matrix(train.xg[,-38, drop=F])
data2 = as.matrix(test.xg[,-38, drop=F])
label2 = as.matrix(test.xg[,38,drop=F])

Prepare the DMatrix and the parameter list.

xgmat = xgb.DMatrix(data, label = label, missing = -10000)
param = list("objective" = "binary:logistic", "eta" = 1, "max_depth" = 2, "eval_metric" = "logloss", "silent" = 1, "nthread" = 16, "min_child_weight" = 1.45)
nround = 500
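
The number of boosting rounds is fixed at 500 here; a more careful choice could use xgb.cv to monitor the cross-validated logloss over the rounds (a sketch only, not used for the results below):

# 5-fold cross-validation with the same parameters (output not shown)
cv.res = xgb.cv(params = param, data = xgmat, nrounds = nround, nfold = 5)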

Train the model.

bst = xgb.train(param, xgmat, nround)
res1 = predict(bst, data2)

Make predictions on the testing dataset.

pre1 = ifelse(res1>0.5, 1, 0)
table(pre1, label2)
##     label2
## pre1  0  1
##    0 93 18
##    1 10 75
xgb.accuracy = mean(pre1 == label2)
xgb.accuracy
## [1] 0.8571429

Its accuracy on the test set is about 86%.
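
Finally, xgboost can report which features the trees actually use; a short sketch using the fitted bst model and the column names of the training matrix:

# gain-based feature importance of the fitted boosting model
importance = xgb.importance(feature_names = colnames(data), model = bst)
head(importance)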

Last updated: 2016-02-16