Machine Learning (for Credit Scoring): Parallel computing in R
Pour TT
Nguyen Chi Dung
#-------------------------------
# Artificial Neural Network
#-------------------------------
# Import and pre-process the data ----
# NOTE: the former `rm(list = ls())` call was removed. It only deletes objects
# in the global environment (attached packages and options persist), so it does
# not provide a clean session, and it silently destroys the caller's workspace
# when this script is sourced. Restart R to get a clean run instead.
library(caret)
library(tidyverse)
library(foreign)

# Read the SPSS file as a data frame.
dung <- read.spss("F:/tkdb/bankloan.sav", to.data.frame = TRUE)

# Keep only the rows with a known `default` outcome and drop the
# `preddef1`-`preddef3` and `ed` columns, which are not used as predictors.
sub_data <- dung %>%
  filter(!is.na(default)) %>%
  select(-preddef1, -preddef2, -preddef3, -ed)

# Inspect the structure of the modelling data set.
str(sub_data)
## 'data.frame': 700 obs. of 8 variables:
## $ age : num 41 27 40 41 24 41 39 43 24 36 ...
## $ employ : num 17 10 15 15 2 5 20 12 3 0 ...
## $ address : num 12 6 14 14 0 5 9 11 4 13 ...
## $ income : num 176 31 55 120 28 25 67 38 19 25 ...
## $ debtinc : num 9.3 17.3 5.5 2.9 17.3 10.2 30.6 3.6 24.4 19.7 ...
## $ creddebt: num 11.359 1.362 0.856 2.659 1.787 ...
## $ othdebt : num 5.009 4.001 2.169 0.821 3.057 ...
## $ default : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 1 1 1 2 1 ...
## - attr(*, "variable.labels")= Named chr "Age in years" "Level of education" "Years with current employer" "Years at current address" ...
## ..- attr(*, "names")= chr "age" "ed" "employ" "address" ...
## - attr(*, "codepage")= int 65001
# Splitting the data: a 50/50 partition drawn on the outcome `default`.
# The seed is fixed so the same rows land in each half on every run.
set.seed(123)
id <- createDataPartition(y = sub_data[["default"]], p = 0.5, list = FALSE)
train <- sub_data[id, ]   # rows selected by the partition
test <- sub_data[-id, ]   # all remaining rows
# Grid of tuning parameters to try (4 sizes x 3 decays = 12 combinations):
fitGrid <- expand.grid(
  .size = c(5, 10, 15, 20),
  .decay = c(0.001, 0.01, 0.1)
)

# Seeds for reproducible parallel resampling. The list holds one integer
# vector per resample (one seed per tuning-parameter combination) followed by
# a single seed for the final model.
# The number of seeds per resample is derived from the grid so that the two
# cannot drift apart when the grid is edited.
n_resamples <- 10L # 5 folds x 2 repeats; must match the trainControl() setup
n_combos <- nrow(fitGrid)

set.seed(1)
seeds <- vector(mode = "list", length = n_resamples + 1L)
for (i in seq_len(n_resamples)) {
  seeds[[i]] <- sample.int(n = 1000, size = n_combos)
}
seeds[[n_resamples + 1L]] <- 1 # for the last model
seeds
## [[1]]
## [1] 266 372 572 906 201 894 940 657 625 62 204 175
##
## [[2]]
## [1] 688 384 769 497 715 987 378 773 928 211 646 125
##
## [[3]]
## [1] 268 386 14 382 867 339 480 596 490 185 820 662
##
## [[4]]
## [1] 795 108 723 411 818 644 779 550 526 783 24 472
##
## [[5]]
## [1] 733 693 477 859 437 244 71 99 314 514 656 403
##
## [[6]]
## [1] 913 294 459 332 649 257 476 761 84 868 336 831
##
## [[7]]
## [1] 347 334 476 890 861 389 773 954 432 707 396 322
##
## [[8]]
## [1] 758 203 710 122 245 143 239 59 638 869 772 789
##
## [[9]]
## [1] 456 410 810 604 653 352 269 986 629 212 129 473
##
## [[10]]
## [1] 925 599 975 730 356 430 148 13 710 103 442 634
##
## [[11]]
## [1] 1
# Resampling scheme for the neural network: 5-fold cross-validation repeated
# twice, scored with twoClassSummary (ROC, Sens, Spec), which needs class
# probabilities. The pre-generated `seeds` make the parallel fits reproducible.
fitControl <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  seeds = seeds,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
library(doParallel)
# Find out how many cores are available:
detectCores()
## [1] 4
# Create the cluster with at most 4 workers, but never more than the machine
# reports. detectCores() may return NA; na.rm = TRUE then falls back to 4.
cl <- makeCluster(min(4L, detectCores(), na.rm = TRUE))
# Register cluster:
registerDoParallel(cl)
# Find out how many cores are being used
getDoParWorkers()
## [1] 4
# Fit model using ANN (nnet), tuning size/decay over `fitGrid` and selecting
# the best combination by cross-validated ROC.
#
# Changes from the earlier version of this call:
# * `metric = "ROC"` is explicit. Without it train() asks for "Accuracy",
#   which twoClassSummary does not compute, and falls back to ROC with a
#   warning.
# * `allowParallel = TRUE` was dropped. It is an argument of trainControl()
#   (where it already defaults to TRUE); given to train() it is only forwarded
#   to the model function and ignored.
# * The cluster is released in `finally`, so workers are not left running if
#   train() fails.
#
# NOTE: `Sens` is computed for the first factor level ("No"), so using
# metric = "Sens" would favour detecting non-defaulters, not "Yes" cases.
# NOTE(review): predictors are passed unscaled ("No pre-processing" in the
# output); consider preProcess = c("center", "scale") for nnet.
set.seed(1)
model.ct.nn <- tryCatch(
  train(default ~ .,
    data = train,
    method = "nnet",
    maxit = 1000,
    linout = FALSE,
    trControl = fitControl,
    tuneGrid = fitGrid,
    trace = FALSE,
    metric = "ROC"
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ()
  }
)
model.ct.nn
## Neural Network
##
## 351 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 281, 280, 281, 280, 282, 281, ...
## Resampling results across tuning parameters:
##
## size decay ROC Sens Spec
## 5 0.001 0.7741418 0.8901207 0.4885965
## 5 0.010 0.7612014 0.8572021 0.5497076
## 5 0.100 0.7979662 0.8860860 0.5225146
## 10 0.001 0.7067269 0.8322021 0.4444444
## 10 0.010 0.7305784 0.8397059 0.4672515
## 10 0.100 0.7471033 0.8688160 0.4687135
## 15 0.001 0.6989334 0.8205128 0.4836257
## 15 0.010 0.7410848 0.8107466 0.5163743
## 15 0.100 0.7713312 0.8782805 0.5052632
## 20 0.001 0.7430296 0.7914027 0.5520468
## 20 0.010 0.7466214 0.8185143 0.4833333
## 20 0.100 0.7579181 0.8591629 0.4461988
##
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.1.
# Network-plotting helpers.
library(NeuralNetTools)

# Tuning profile: cross-validated `Sens` for each size/decay combination.
plot(x = model.ct.nn, metric = "Sens")

# Diagram of the fitted neural network.
plotnet(model.ct.nn)

#----------------------
# Random Forest
#----------------------
# mtry is the number of predictors sampled at each split, so it cannot exceed
# the number of predictors (7 here). randomForest resets larger values to the
# valid range with a warning, so a grid running up to 18 only refits the same
# model several times. For classification the default is floor(sqrt(p)), i.e.
# 2 for these data (p/3 is the regression default).
# NOTE: results printed further down were produced with the old grid
# (3, 6, ..., 18); re-run the script to refresh them.
n_predictors <- ncol(train) - 1L # every column except the outcome `default`
rf.Grid <- expand.grid(
  mtry = seq(from = min(2L, n_predictors), to = n_predictors)
)
nrow(rf.Grid)
## [1] 6
# One seed per mtry candidate for each resample, plus one for the final model.
# The per-resample length follows the grid so the two cannot drift apart.
n_rf_resamples <- 10L # 5 folds x 2 repeats; must match the trainControl() setup
set.seed(1)
rf.seeds <- vector(mode = "list", length = n_rf_resamples + 1L)
for (i in seq_len(n_rf_resamples)) {
  rf.seeds[[i]] <- sample.int(n = 1000, size = nrow(rf.Grid))
}
rf.seeds[[n_rf_resamples + 1L]] <- 1 # for the last model
rf.seeds
## [[1]]
## [1] 266 372 572 906 201 894
##
## [[2]]
## [1] 945 661 628 62 206 176
##
## [[3]]
## [1] 688 384 769 497 715 987
##
## [[4]]
## [1] 381 777 933 212 650 125
##
## [[5]]
## [1] 268 386 14 382 867 339
##
## [[6]]
## [1] 483 599 493 186 825 666
##
## [[7]]
## [1] 795 108 723 411 818 644
##
## [[8]]
## [1] 783 553 529 787 24 475
##
## [[9]]
## [1] 733 693 477 859 437 244
##
## [[10]]
## [1] 71 100 316 518 660 405
##
## [[11]]
## [1] 1
# Resampling scheme for the random forest: 5-fold cross-validation repeated
# twice, scored with twoClassSummary (ROC, Sens, Spec) on class probabilities,
# using the pre-generated `rf.seeds` for reproducible parallel fits.
rf.Control <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 2,
  seeds = rf.seeds,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)
# Fit the random forest, tuning mtry over `rf.Grid` and selecting by
# cross-validated ROC.
library(doParallel)
# At most 4 workers, never more than the machine reports (NA-safe).
cl <- makeCluster(min(4L, detectCores(), na.rm = TRUE))
registerDoParallel(cl)

# Same seed as the neural-network fit, set immediately before train(), so both
# models are resampled on identical folds. resamples() compares the models
# fold by fold, which is only meaningful when the folds match.
set.seed(1)
# `metric = "ROC"` is explicit (otherwise train() warns and falls back to it),
# and `allowParallel` was dropped: it belongs to trainControl(), where it
# already defaults to TRUE, and is ignored when passed to train().
# `finally` releases the cluster even if train() fails.
model.rf <- tryCatch(
  train(default ~ .,
    data = train,
    method = "rf",
    ntree = 100,
    importance = TRUE,
    na.action = na.omit,
    tuneGrid = rf.Grid,
    trControl = rf.Control,
    metric = "ROC"
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ()
  }
)
# Tuning profile: cross-validated `Sens` for each mtry candidate.
plot(model.rf, metric = "Sens")

model.rf
## Random Forest
##
## 351 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 2 times)
## Summary of sample sizes: 282, 281, 281, 280, 280, 281, ...
## Resampling results across tuning parameters:
##
## mtry ROC Sens Spec
## 3 0.8083433 0.9091252 0.4505848
## 6 0.8030074 0.8916667 0.4836257
## 9 0.8053768 0.9032428 0.4885965
## 12 0.8076065 0.8877074 0.4885965
## 15 0.7992546 0.9032805 0.4932749
## 18 0.7945646 0.8993967 0.4932749
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
#--------------------------------
# Compare the two models
#--------------------------------
# Gather the per-resample ROC / Sens / Spec of both fitted models into one
# object, then show the raw values (one row per fold and repeat).
results <- resamples(
  x = list(
    RandomForest = model.rf,
    NeuralNetwork = model.ct.nn
  )
)
results[["values"]]
## Resample RandomForest~ROC RandomForest~Sens RandomForest~Spec
## 1 Fold1.Rep1 0.7761438 0.8823529 0.4444444
## 2 Fold1.Rep2 0.8135684 0.9230769 0.3888889
## 3 Fold2.Rep1 0.7852564 0.9423077 0.4444444
## 4 Fold2.Rep2 0.8410931 0.8461538 0.6315789
## 5 Fold3.Rep1 0.8392094 0.9807692 0.2777778
## 6 Fold3.Rep2 0.7407407 0.8627451 0.5555556
## 7 Fold4.Rep1 0.8122470 0.8846154 0.4736842
## 8 Fold4.Rep2 0.7591093 0.9230769 0.1578947
## 9 Fold5.Rep1 0.8608300 0.8653846 0.6315789
## 10 Fold5.Rep2 0.8552350 0.9807692 0.5000000
## NeuralNetwork~ROC NeuralNetwork~Sens NeuralNetwork~Spec
## 1 0.7585470 0.8846154 0.5555556
## 2 0.8183761 0.8461538 0.5000000
## 3 0.8238866 0.9230769 0.5263158
## 4 0.7352941 0.8823529 0.5555556
## 5 0.8258547 0.9038462 0.3888889
## 6 0.8147773 0.9807692 0.4210526
## 7 0.8016194 0.8461538 0.5263158
## 8 0.8472222 0.9038462 0.6666667
## 9 0.7919390 0.8823529 0.6111111
## 10 0.7621457 0.8076923 0.4736842
# Summarize the distribution (min, quartiles, mean, max and NA count) of each
# metric across the 10 resamples, separately for each model:
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: RandomForest, NeuralNetwork
## Number of resamples: 10
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## RandomForest 0.7407407 0.7784219 0.8129077 0.8083433 0.8406222 0.8608300
## NeuralNetwork 0.7352941 0.7695941 0.8081984 0.7979662 0.8225090 0.8472222
## NA's
## RandomForest 0
## NeuralNetwork 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## RandomForest 0.8461538 0.8696267 0.9038462 0.9091252 0.9375000 0.9807692
## NeuralNetwork 0.8076923 0.8552036 0.8834842 0.8860860 0.9038462 0.9807692
## NA's
## RandomForest 0
## NeuralNetwork 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## RandomForest 0.1578947 0.4027778 0.4590643 0.4505848 0.5416667 0.6315789
## NeuralNetwork 0.3888889 0.4802632 0.5263158 0.5225146 0.5555556 0.6666667
## NA's
## RandomForest 0
## NeuralNetwork 0
# Box-and-whisker plot of the per-resample `Sens` values, one box per model
bwplot(results, metric = "Sens")
