Machine Learning (for Credit Scoring): Parallel computing in R

Pour TT

Nguyen Chi Dung

#-------------------------------
#  Artificial Neural Network
#-------------------------------

# Import and pre-process the data.
# NOTE: the script no longer calls rm(list = ls()); that wipes the caller's
# workspace without resetting loaded packages or options, so it does not give
# a clean session anyway. Run the script in a fresh R session instead.
library(caret)
library(tidyverse)
library(foreign)

# Read the SPSS file straight into a data frame.
dung <- read.spss("F:/tkdb/bankloan.sav", to.data.frame = TRUE)

# Keep only rows with a known outcome and drop the columns that are not used
# as predictors (the three preddef* columns and ed).
sub_data <- dung %>%
  filter(!is.na(default)) %>%
  select(-preddef1, -preddef2, -preddef3, -ed)

# Inspect the structure of the modelling data set.
str(sub_data)

## 'data.frame':    700 obs. of  8 variables:
##  $ age     : num  41 27 40 41 24 41 39 43 24 36 ...
##  $ employ  : num  17 10 15 15 2 5 20 12 3 0 ...
##  $ address : num  12 6 14 14 0 5 9 11 4 13 ...
##  $ income  : num  176 31 55 120 28 25 67 38 19 25 ...
##  $ debtinc : num  9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
##  $ creddebt: num  11.359 1.362 0.856 2.659 1.787 ...
##  $ othdebt : num  5.009 4.001 2.169 0.821 3.057 ...
##  $ default : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 1 1 1 2 1 ...
##  - attr(*, "variable.labels")= Named chr  "Age in years" "Level of education" "Years with current employer" "Years at current address" ...
##   ..- attr(*, "names")= chr  "age" "ed" "employ" "address" ...
##  - attr(*, "codepage")= int 65001

# Splitting data: half of the rows (p = 0.5) go to `train`, the rest to `test`.
# The seed is fixed so the partition is reproducible.
set.seed(123)
id <- createDataPartition(y = sub_data[["default"]], p = 0.5, list = FALSE)
train <- sub_data[id, ]   # rows selected by the partition
test <- sub_data[-id, ]   # all remaining rows


# Grid of tuning parameters to try (every size x decay combination):
fitGrid <- expand.grid(.size = c(5, 10, 15, 20), 
                       .decay = c(0.001, 0.01, 0.1))

# Seeds for reproducible parallel processing.
# One integer vector per resample (one seed per tuning-parameter combination)
# plus a single seed for the final model fit.
nn.n.resamples <- 10L              # 5 folds x 2 repeats, see trainControl() below
nn.n.candidates <- nrow(fitGrid)   # derived from the grid instead of hard-coding 12

set.seed(1)
seeds <- vector(mode = "list", length = nn.n.resamples + 1L)
for (i in seq_len(nn.n.resamples)) {
  seeds[[i]] <- sample.int(n = 1000, size = nn.n.candidates)
}
seeds[[nn.n.resamples + 1L]] <- 1 # for the last model
seeds

## [[1]]
##  [1] 266 372 572 906 201 894 940 657 625  62 204 175
## 
## [[2]]
##  [1] 688 384 769 497 715 987 378 773 928 211 646 125
## 
## [[3]]
##  [1] 268 386  14 382 867 339 480 596 490 185 820 662
## 
## [[4]]
##  [1] 795 108 723 411 818 644 779 550 526 783  24 472
## 
## [[5]]
##  [1] 733 693 477 859 437 244  71  99 314 514 656 403
## 
## [[6]]
##  [1] 913 294 459 332 649 257 476 761  84 868 336 831
## 
## [[7]]
##  [1] 347 334 476 890 861 389 773 954 432 707 396 322
## 
## [[8]]
##  [1] 758 203 710 122 245 143 239  59 638 869 772 789
## 
## [[9]]
##  [1] 456 410 810 604 653 352 269 986 629 212 129 473
## 
## [[10]]
##  [1] 925 599 975 730 356 430 148  13 710 103 442 634
## 
## [[11]]
## [1] 1

# Resampling scheme for the neural network: 5-fold cross-validation repeated
# twice. Class probabilities are requested because twoClassSummary is used as
# the summary function; `seeds` supplies the per-resample seeds built above.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  seeds = seeds,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

library(doParallel)
# Find out how many cores are available: 
detectCores()

## [1] 4

# Create cluster with desired number of cores: 
cl <- makeCluster(4)
# Register cluster: 
registerDoParallel(cl)
# Find out how many cores are being used
getDoParWorkers()

## [1] 4

# Fit model using ANN.
# The seed is set immediately before train() so the resampling folds are
# reproducible.
# train() is wrapped in tryCatch(finally = ...) so the worker processes are
# always shut down and the sequential backend restored, even when the fit
# fails; otherwise an error would leave the cluster running.
# `allowParallel` is an argument of trainControl() (default TRUE), not of
# train(); passed to train() it is only forwarded to the model function, so it
# is not passed here.
set.seed(1)
model.ct.nn <- tryCatch(
  train(default ~ .,
        data = train,
        method = "nnet",
        maxit = 1000,
        linout = FALSE,
        trControl = fitControl,
        tuneGrid = fitGrid,
        #metric = "Sens", # maximize sensitivity to "Yes" values
        trace = FALSE),
  finally = {
    stopCluster(cl)
    registerDoSEQ()
  }
)

model.ct.nn

## Neural Network 
## 
## 351 samples
##   7 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 281, 280, 281, 280, 282, 281, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  ROC        Sens       Spec     
##    5    0.001  0.7741418  0.8901207  0.4885965
##    5    0.010  0.7612014  0.8572021  0.5497076
##    5    0.100  0.7979662  0.8860860  0.5225146
##   10    0.001  0.7067269  0.8322021  0.4444444
##   10    0.010  0.7305784  0.8397059  0.4672515
##   10    0.100  0.7471033  0.8688160  0.4687135
##   15    0.001  0.6989334  0.8205128  0.4836257
##   15    0.010  0.7410848  0.8107466  0.5163743
##   15    0.100  0.7713312  0.8782805  0.5052632
##   20    0.001  0.7430296  0.7914027  0.5520468
##   20    0.010  0.7466214  0.8185143  0.4833333
##   20    0.100  0.7579181  0.8591629  0.4461988
## 
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.1.

# Tuning profile of the neural network, shown on the "Sens" metric.
plot(x = model.ct.nn, metric = "Sens")

# Diagram of the fitted network.
library(NeuralNetTools)
plotnet(model.ct.nn)

#----------------------
#    Random Forest
#----------------------

# Candidate values for mtry (number of predictors sampled at each split).
# The model has only 7 predictors, so mtry must stay within 1..7: larger
# values are invalid and randomForest resets them to the valid range, which
# makes those candidates duplicates of one another.
# For classification the randomForest default is floor(sqrt(p)), i.e. 2 here.
rf.Grid <- expand.grid(mtry = seq(from = 2, to = 7, by = 1))
nrow(rf.Grid)

## [1] 6

# Seeds for reproducible parallel processing: one vector per resample (one
# seed per mtry candidate) plus a single seed for the final model.
rf.n.resamples <- 10L             # 5 folds x 2 repeats, see rf.Control below
rf.n.candidates <- nrow(rf.Grid)  # derived from the grid instead of hard-coding 6

set.seed(1)
rf.seeds <- vector(mode = "list", length = rf.n.resamples + 1L)
for (i in seq_len(rf.n.resamples)) {
  rf.seeds[[i]] <- sample.int(n = 1000, size = rf.n.candidates)
}
rf.seeds[[rf.n.resamples + 1L]] <- 1 # for the last model
rf.seeds

## [[1]]
## [1] 266 372 572 906 201 894
## 
## [[2]]
## [1] 945 661 628  62 206 176
## 
## [[3]]
## [1] 688 384 769 497 715 987
## 
## [[4]]
## [1] 381 777 933 212 650 125
## 
## [[5]]
## [1] 268 386  14 382 867 339
## 
## [[6]]
## [1] 483 599 493 186 825 666
## 
## [[7]]
## [1] 795 108 723 411 818 644
## 
## [[8]]
## [1] 783 553 529 787  24 475
## 
## [[9]]
## [1] 733 693 477 859 437 244
## 
## [[10]]
## [1]  71 100 316 518 660 405
## 
## [[11]]
## [1] 1

# Resampling scheme for the random forest: 5-fold cross-validation repeated
# twice, summarised with twoClassSummary (hence classProbs = TRUE), using the
# per-resample seeds stored in rf.seeds.
rf.Control <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  seeds = rf.seeds,
  classProbs = TRUE,
  summaryFunction = twoClassSummary
)

library(doParallel)
cl <- makeCluster(4)
registerDoParallel(cl)

# Fit the random forest.
# The seed is set to the same value used right before the neural network fit,
# so that train() draws the same cross-validation folds for both models; the
# resamples() comparison further down is only a paired comparison when the
# folds match.
# train() is wrapped in tryCatch(finally = ...) so the cluster is always
# stopped and the sequential backend restored, even if the fit fails.
# `allowParallel` belongs to trainControl() (default TRUE), not train(), so it
# is not passed here.
set.seed(1)
model.rf <- tryCatch(
  train(default ~ .,
        data = train, 
        method = "rf",
        ntree = 100,
        importance = TRUE,
        na.action = na.omit,
        tuneGrid = rf.Grid,
        #metric = "Sens",
        trControl = rf.Control),
  finally = {
    stopCluster(cl)
    registerDoSEQ()
  }
)
plot(model.rf, metric = "Sens")

model.rf

## Random Forest 
## 
## 351 samples
##   7 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times) 
## Summary of sample sizes: 282, 281, 281, 280, 280, 281, ... 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens       Spec     
##    3    0.8083433  0.9091252  0.4505848
##    6    0.8030074  0.8916667  0.4836257
##    9    0.8053768  0.9032428  0.4885965
##   12    0.8076065  0.8877074  0.4885965
##   15    0.7992546  0.9032805  0.4932749
##   18    0.7945646  0.8993967  0.4932749
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.

#--------------------------------
#    Compare the two models
#--------------------------------

# Collect the resampling results of both fitted models side by side.
model.list <- list(RandomForest = model.rf, 
                   NeuralNetwork = model.ct.nn)
results <- resamples(model.list)
# Per-resample metric values for each model.
results[["values"]]

##      Resample RandomForest~ROC RandomForest~Sens RandomForest~Spec
## 1  Fold1.Rep1        0.7761438         0.8823529         0.4444444
## 2  Fold1.Rep2        0.8135684         0.9230769         0.3888889
## 3  Fold2.Rep1        0.7852564         0.9423077         0.4444444
## 4  Fold2.Rep2        0.8410931         0.8461538         0.6315789
## 5  Fold3.Rep1        0.8392094         0.9807692         0.2777778
## 6  Fold3.Rep2        0.7407407         0.8627451         0.5555556
## 7  Fold4.Rep1        0.8122470         0.8846154         0.4736842
## 8  Fold4.Rep2        0.7591093         0.9230769         0.1578947
## 9  Fold5.Rep1        0.8608300         0.8653846         0.6315789
## 10 Fold5.Rep2        0.8552350         0.9807692         0.5000000
##    NeuralNetwork~ROC NeuralNetwork~Sens NeuralNetwork~Spec
## 1          0.7585470          0.8846154          0.5555556
## 2          0.8183761          0.8461538          0.5000000
## 3          0.8238866          0.9230769          0.5263158
## 4          0.7352941          0.8823529          0.5555556
## 5          0.8258547          0.9038462          0.3888889
## 6          0.8147773          0.9807692          0.4210526
## 7          0.8016194          0.8461538          0.5263158
## 8          0.8472222          0.9038462          0.6666667
## 9          0.7919390          0.8823529          0.6111111
## 10         0.7621457          0.8076923          0.4736842

# Summarize the distribution of each metric across resamples, per model.
summary(object = results)

## 
## Call:
## summary.resamples(object = results)
## 
## Models: RandomForest, NeuralNetwork 
## Number of resamples: 10 
## 
## ROC 
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## RandomForest  0.7407407 0.7784219 0.8129077 0.8083433 0.8406222 0.8608300
## NeuralNetwork 0.7352941 0.7695941 0.8081984 0.7979662 0.8225090 0.8472222
##               NA's
## RandomForest     0
## NeuralNetwork    0
## 
## Sens 
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## RandomForest  0.8461538 0.8696267 0.9038462 0.9091252 0.9375000 0.9807692
## NeuralNetwork 0.8076923 0.8552036 0.8834842 0.8860860 0.9038462 0.9807692
##               NA's
## RandomForest     0
## NeuralNetwork    0
## 
## Spec 
##                    Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## RandomForest  0.1578947 0.4027778 0.4590643 0.4505848 0.5416667 0.6315789
## NeuralNetwork 0.3888889 0.4802632 0.5263158 0.5225146 0.5555556 0.6666667
##               NA's
## RandomForest     0
## NeuralNetwork    0

# Box-and-whisker plot of the resampled "Sens" values for both models.
bwplot(x = results, metric = "Sens")