## load required packages and data
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(caret)
## Loading required package: lattice
library(e1071)
install.packages("psych")
library(psych)  # for pairs.panels() function
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# load data from our dropbox shared folder and preliminary check
pokemon <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
names(pokemon)
##  [1] "abilities"         "against_bug"       "against_dark"     
##  [4] "against_dragon"    "against_electric"  "against_fairy"    
##  [7] "against_fight"     "against_fire"      "against_flying"   
## [10] "against_ghost"     "against_grass"     "against_ground"   
## [13] "against_ice"       "against_normal"    "against_poison"   
## [16] "against_psychic"   "against_rock"      "against_steel"    
## [19] "against_water"     "attack"            "base_egg_steps"   
## [22] "base_happiness"    "base_total"        "capture_rate"     
## [25] "classfication"     "defense"           "experience_growth"
## [28] "height_m"          "hp"                "japanese_name"    
## [31] "name"              "percentage_male"   "pokedex_number"   
## [34] "sp_attack"         "sp_defense"        "speed"            
## [37] "type1"             "type2"             "weight_kg"        
## [40] "generation"        "is_legendary"
names(pokemon)[25] <- "classification"

# create a new column "type" to denote three main "Gosanke(s)"
pokemon$type <- NA
pokemon$type[pokemon$type1 == "grass" | pokemon$type2 == "grass"] <- "grass"
pokemon$type[pokemon$type1 == "water" | pokemon$type2 == "water"] <- "water"
pokemon$type[pokemon$type1 == "fire" | pokemon$type2 == "fire"] <- "fire"
pokemon$type <- as.factor(pokemon$type)       # convert to factor so that naiveBayes() treats it as a categorical class label

# remove rows containing NA in any of the variables and remove irrelevant and text features
pokemon <- pokemon[complete.cases(pokemon), -c(1, 30, 31, 33, 36, 37, 39, 40, 41)]

# temporarily move out type and assign it to y
y <- pokemon$type  # move out type as y

# convert capture_rate to numeric
# note: read.csv() imports it as a factor, and as.numeric() on a factor returns the underlying level codes,
# so we go through as.character() first to recover the actual values
pokemon$capture_rate <- as.numeric(as.character(pokemon$capture_rate))

## preprocess the data, "range" scales the data to the interval between zero and one
pokemon_normalize <- preProcess(pokemon, method = "range")
pokemon <- predict(pokemon_normalize, newdata = pokemon)

pokemon$type <- y    
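# quick sanity check (an optional aside): after "range" preprocessing, every numeric column should lie in [0, 1]
summary(pokemon$attack)
range(pokemon$hp)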

## check the correlation between each pair of explanatory variables; for Naive Bayes, the predictors should not be highly correlated (this may take quite some time to run, please be patient)
# par(mar=c(1,1,1,1))
# pairs.panels(pokemon[-33])


## 75-25 train-test split on Y before testing the model
train_row_numbers <- createDataPartition(pokemon$type, p=0.75, list=FALSE)
training <- pokemon[train_row_numbers, ]
testing <- pokemon[-train_row_numbers, ]

## fit a two-variable Naive Bayes model on the training data
nb.fit <- naiveBayes(type ~ against_fire + against_water,
                       data = training)

## you can also inspect the fitted distributions manually
## "tables" inside the naiveBayes object stores, for each feature, the class-conditional mean and standard deviation (one row per outcome class)

plot(0:3, xlim=c(-0.8,1), ylim=c(0,15), col="red", ylab="density",type="n", xlab="Against fire",main="Feature distribution for each type")
curve(dnorm(x, nb.fit$tables$against_fire[1,1], nb.fit$tables$against_fire[1,2]), add=TRUE, col="red")
curve(dnorm(x, nb.fit$tables$against_fire[2,1], nb.fit$tables$against_fire[2,2]), add=TRUE, col="blue")
curve(dnorm(x, nb.fit$tables$against_fire[3,1], nb.fit$tables$against_fire[3,2]), add=TRUE, col ="green")
legend("topleft", c("fire", "grass", "water"), col = c("red", "blue", "green"), lwd=1)  # colors match the curves above: row 1 = fire (red), row 2 = grass (blue), row 3 = water (green)

# Note: dnorm() takes the vector of quantiles as its first argument, the mean as its second, and the s.d. as its third. The class-conditional means are stored in the first column of nb.fit$tables$against_fire and the s.d.'s in the second column (one row per class).
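# as an optional aside, here is a minimal sketch of what predict() does under the hood for a single
# observation, using the class priors (nb.fit$apriori) and the class-conditional normals in nb.fit$tables;
# new_obs is just an illustrative pick from the training data
new_obs <- training[1, c("against_fire", "against_water")]
priors <- nb.fit$apriori / sum(nb.fit$apriori)   # prior P(type), estimated from class frequencies
lik <- sapply(levels(training$type), function(cl) {
  dnorm(new_obs$against_fire, nb.fit$tables$against_fire[cl, 1], nb.fit$tables$against_fire[cl, 2]) *
  dnorm(new_obs$against_water, nb.fit$tables$against_water[cl, 1], nb.fit$tables$against_water[cl, 2])
})
posterior <- priors * lik / sum(priors * lik)    # Bayes rule, then normalize
round(posterior, 4)
# compare with the packaged routine
predict(nb.fit, new_obs, type = "raw")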

## or, using the more handy function from klaR package
install.packages("klaR")
library(klaR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# interactive plot
nb.fit1 <- NaiveBayes(type ~ against_fire + against_water, data = training)  # note that klaR's version of the function is capitalized: NaiveBayes()
plot(nb.fit1)

## predict on training
predict_train_nb <- predict(nb.fit, training)

## generate confusion matrix
confmat_train_nb <- table(predict_train_nb, training$type)
confmat_train_nb
##                 
## predict_train_nb fire grass water
##            fire    41     0     0
##            grass    0    65    10
##            water    1     1    81
## calculate accuracy
(confmat_train_nb[1, 1] + confmat_train_nb[2, 2] + confmat_train_nb[3, 3])/ sum(confmat_train_nb) * 100
## [1] 93.96985
## predict on testing
predict_test_nb <- predict(nb.fit, testing)

## generate confusion matrix                           
confmat_test_nb <- table(predict_test_nb, testing$type)
confmat_test_nb
##                
## predict_test_nb fire grass water
##           fire    13     0     0
##           grass    0    20     2
##           water    0     1    28
## calculate accuracy
(confmat_test_nb[1, 1] + confmat_test_nb[2, 2] + confmat_test_nb[3, 3])/ sum(confmat_test_nb) * 100
## [1] 95.3125
## check the result of a naive Bayes model with Laplace smoothing
nb_laplace.fit <- naiveBayes(type ~ against_fire + against_water, data = training, laplace = 0.5)

# the larger the laplace value (a.k.a. alpha), the stronger the smoothing applied to categorical feature counts
## predict on testing data
predict_test_nb_laplace <- predict(nb_laplace.fit, testing)


## generate confusion matrix                           
confmat_test_nb_laplace <- table(predict_test_nb_laplace, testing$type)
confmat_test_nb_laplace
##                        
## predict_test_nb_laplace fire grass water
##                   fire    13     0     0
##                   grass    0    20     2
##                   water    0     1    28
## calculate accuracy
(confmat_test_nb_laplace[1, 1] + confmat_test_nb_laplace[2, 2] + confmat_test_nb_laplace[3, 3])/ sum(confmat_test_nb_laplace) * 100
## [1] 95.3125
## Laplace smoothing does not seem to improve model fit here, at least for our data and current specification. This suggests that we do not have many empty (zero-count) categories; in fact, with purely numeric predictors the smoother has little to act on. You can tweak the laplace value and try again.
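# as a quick illustrative check, loop over a few laplace values and compare test accuracy;
# since our two predictors are numeric (Gaussian), we expect little to no change
for (a in c(0, 0.5, 1, 2)) {
  fit  <- naiveBayes(type ~ against_fire + against_water, data = training, laplace = a)
  pred <- predict(fit, testing)
  cat("laplace =", a, "accuracy =", round(mean(pred == testing$type) * 100, 2), "%\n")
}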

Regression tree on HMDA data

Identifying important variables in mortgage denial

## load required packages

install.packages(c("rsample", "dplyr", "rpart", "rpart.plot", "ipred", "caret"))
library(rsample)     # data splitting 
## 
## Attaching package: 'rsample'
## The following object is masked from 'package:e1071':
## 
##     permutations
library(dplyr)       # data wrangling
library(rpart)       # performing regression trees
library(rpart.plot)  # plotting regression trees
library(ipred)       # bagging
library(caret)       # bagging
library(AER)         # HMDA data
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
# load and examine data
data(HMDA)
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
# subset and convert the dependent variable to numeric
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]
data$deny <- ifelse(data$deny == "yes", 1, 0) 

# do an 80-20 train-test split
set.seed(123)
train_row_numbers <- createDataPartition(data$deny, p=0.8, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]


# fit the model and examine the output
# we first convert "deny" back to factor class; note that with method = "anova" rpart() works with the
# factor's underlying numeric codes (1 = "0", 2 = "1"), so the tree below predicts the mean code in each leaf
training$deny <- as.factor(training$deny)

tree.fit1 <- rpart(deny ~ ., data = training, method  = "anova")

# reading the model output (from left to right: node number, split criterion, n, deviance, yval; "*" denotes a terminal node)
tree.fit1
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1904 202.97430 1.121324  
##   2) hirat< 0.3657 1837 178.22540 1.108873  
##     4) lvrat< 0.8034951 1244  87.74518 1.076367 *
##     5) lvrat>=0.8034951 593  86.40809 1.177066 *
##   3) hirat>=0.3657 67  16.65672 1.462687  
##     6) lvrat< 0.8823142 51  10.98039 1.313725 *
##     7) lvrat>=0.8823142 16   0.93750 1.937500 *
######## reading the output of cptable (from rpart) #########
# The model summary lists the variables that were used to construct the model. For this tree, only two of the four right-hand-side variables provided were actually used.
# Root node error is the error at the first (root) node, i.e., before any splits are made. It is combined with rel error and xerror (both reported in the pruning table) to obtain two measures of predictive performance.
# rel error is the resubstitution error relative to the root, i.e., the SSE of the tree divided by the SSE at the root (equivalently 1 - R^2); xerror is the cross-validation error (rpart has built-in CV).
# Root node error x rel error is the "resubstitution error rate" (the error rate computed on the training sample).
# Root node error x xerror is the cross-validated error rate, which is a more objective measure of predictive accuracy.
# n is the number of records used to construct the tree.
# CP (the first column) is the complexity parameter (alpha).
# Each row of the pruning table corresponds to a tree of a given size (nsplit), which helps you decide where to prune.
# Deeper trees have lower training (rel) error, but they come with the risk of overfitting: the cross-validation error typically starts to grow again once the tree passes its 'optimal' size.
# A common rule of thumb is the "1-SE rule": choose the smallest tree whose xerror is within one xstd of the minimum xerror (see the sketch after plotcp() below).
# Finally, the leaf summary lists the variables and split thresholds at each node, as well as how the target variable records were split by percentages.

# view how the SSE changes with each split

tree.fit1$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.03986801      0 1.0000000 1.0022832 0.05327719
## 2 0.02334692      1 0.9601320 0.9755685 0.05235941
## 3 0.02006213      2 0.9367851 0.9867728 0.05287234
## 4 0.01000000      3 0.9167229 0.9726986 0.05240960
# alternatively, you may use
printcp(tree.fit1)
## 
## Regression tree:
## rpart(formula = deny ~ ., data = training, method = "anova")
## 
## Variables actually used in tree construction:
## [1] hirat lvrat
## 
## Root node error: 202.97/1904 = 0.1066
## 
## n= 1904 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.039868      0   1.00000 1.00228 0.053277
## 2 0.023347      1   0.96013 0.97557 0.052359
## 3 0.020062      2   0.93679 0.98677 0.052872
## 4 0.010000      3   0.91672 0.97270 0.052410
# plot it and compare with your model output, tree.fit1
rpart.plot(tree.fit1)

# how many terminal nodes are enough? Let's plot it
plotcp(tree.fit1)
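# as a sketch (not part of the original workflow): the dotted horizontal line drawn by plotcp() corresponds
# to the "1-SE rule" mentioned above -- pick the smallest tree whose xerror falls within one xstd of the
# minimum xerror, then prune to that cp
cp_tab <- tree.fit1$cptable
best <- which.min(cp_tab[, "xerror"])
thresh <- cp_tab[best, "xerror"] + cp_tab[best, "xstd"]
cp_1se <- cp_tab[which(cp_tab[, "xerror"] <= thresh)[1], "CP"]
pruned.fit <- prune(tree.fit1, cp = cp_1se)
pruned.fit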

# would more terminal nodes be better? Let's grow a full tree by setting cp = 0 (no penalty, which results in a fully grown tree)
tree.fit2 <- rpart(deny ~ ., data = training, method  = "anova", control = list(cp = 0, xval = 10))  # xval is the number of cross-validation folds

plotcp(tree.fit2)
abline(v = 12, lty = "dashed")  # "v" gives the x position at which to draw a vertical line

## hyperparameter tuning: find the optimal splits and depth

# we first do it manually
tree.fit3 <- rpart(deny ~ ., data = training, method  = "anova", control = list(minsplit = 10, maxdepth = 12, xval = 10))

# check out the result
tree.fit3$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.03986801      0 1.0000000 1.0007827 0.05319907
## 2 0.02334692      1 0.9601320 0.9886714 0.05293369
## 3 0.02006213      2 0.9367851 0.9828285 0.05321602
## 4 0.01000000      3 0.9167229 0.9652715 0.05267972
## tuning via grid search

# how many different combinations do we have?
hyper_grid <- expand.grid(
  minsplit = seq(5, 20, 1),  # from 5 to 20, with an increment of 1
  maxdepth = seq(8, 15, 1)
)

# insert the hyper_grid into the model and run through a loop

trees <- list()


for (i in 1:nrow(hyper_grid)) {
  
  # get minsplit, maxdepth values at row i
  minsplit <- hyper_grid$minsplit[i]
  maxdepth <- hyper_grid$maxdepth[i]
  # train a model and store in the list
  # R feeds the i-th minsplit and maxdepth values from hyper_grid into rpart()'s control argument as hyperparameters
  trees[[i]] <- rpart(
    deny ~ .,
    data = training,
    method  = "anova",
    control = list(minsplit = minsplit, maxdepth = maxdepth)
    )
}

# view the first six of the 128 fitted models (head() prints six by default)
head(trees)
## [[1]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1904 202.97430 1.121324  
##   2) hirat< 0.3657 1837 178.22540 1.108873  
##     4) lvrat< 0.8034951 1244  87.74518 1.076367 *
##     5) lvrat>=0.8034951 593  86.40809 1.177066 *
##   3) hirat>=0.3657 67  16.65672 1.462687  
##     6) lvrat< 0.8823142 51  10.98039 1.313725 *
##     7) lvrat>=0.8823142 16   0.93750 1.937500 *
## 
## [[2]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1904 202.97430 1.121324  
##   2) hirat< 0.3657 1837 178.22540 1.108873  
##     4) lvrat< 0.8034951 1244  87.74518 1.076367 *
##     5) lvrat>=0.8034951 593  86.40809 1.177066 *
##   3) hirat>=0.3657 67  16.65672 1.462687  
##     6) lvrat< 0.8823142 51  10.98039 1.313725 *
##     7) lvrat>=0.8823142 16   0.93750 1.937500 *
## 
## [[3]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1904 202.97430 1.121324  
##   2) hirat< 0.3657 1837 178.22540 1.108873  
##     4) lvrat< 0.8034951 1244  87.74518 1.076367 *
##     5) lvrat>=0.8034951 593  86.40809 1.177066 *
##   3) hirat>=0.3657 67  16.65672 1.462687  
##     6) lvrat< 0.8823142 51  10.98039 1.313725 *
##     7) lvrat>=0.8823142 16   0.93750 1.937500 *
## 
## [[4]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1904 202.97430 1.121324  
##   2) hirat< 0.3657 1837 178.22540 1.108873  
##     4) lvrat< 0.8034951 1244  87.74518 1.076367 *
##     5) lvrat>=0.8034951 593  86.40809 1.177066 *
##   3) hirat>=0.3657 67  16.65672 1.462687  
##     6) lvrat< 0.8823142 51  10.98039 1.313725 *
##     7) lvrat>=0.8823142 16   0.93750 1.937500 *
## 
## [[5]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1904 202.97430 1.121324  
##   2) hirat< 0.3657 1837 178.22540 1.108873  
##     4) lvrat< 0.8034951 1244  87.74518 1.076367 *
##     5) lvrat>=0.8034951 593  86.40809 1.177066 *
##   3) hirat>=0.3657 67  16.65672 1.462687  
##     6) lvrat< 0.8823142 51  10.98039 1.313725 *
##     7) lvrat>=0.8823142 16   0.93750 1.937500 *
## 
## [[6]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 1904 202.97430 1.121324  
##   2) hirat< 0.3657 1837 178.22540 1.108873  
##     4) lvrat< 0.8034951 1244  87.74518 1.076367 *
##     5) lvrat>=0.8034951 593  86.40809 1.177066 *
##   3) hirat>=0.3657 67  16.65672 1.462687  
##     6) lvrat< 0.8823142 51  10.98039 1.313725 *
##     7) lvrat>=0.8823142 16   0.93750 1.937500 *
# extract, for each model, the cp associated with the smallest cross-validation error

# define function to get the optimal cp
get_cp <- function(x) {
  min <- which.min(x$cptable[, "xerror"]) # find the row with the smallest cross-validation error
  cp <- x$cptable[min, "CP"]  # use the index to return the cp value of that row
}

# define function to get minimum error
get_min_error <- function(x) {
  min    <- which.min(x$cptable[, "xerror"]) # to select from the column named "xerror" and return an index
  xerror <- x$cptable[min, "xerror"]  # using the index to subset the desired value
}

# select the top 5 models with the smallest cross-validation error
hyper_grid %>%
  mutate(
    cp    = purrr::map_dbl(trees, get_cp),  # purrr (from the tidyverse) provides functional-programming helpers; map_dbl() applies a function to each element and returns a double (i.e., numeric) vector; mutate() adds new variables while preserving existing ones
    error = purrr::map_dbl(trees, get_min_error)
    ) %>%
  arrange(error) %>%
  top_n(-5, wt = error)  # the negative 5 keeps the five smallest errors; combined with arrange(error), the result is displayed in ascending order
##   minsplit maxdepth   cp     error
## 1       16       14 0.01 0.9395929
## 2       14        9 0.01 0.9421587
## 3       16       10 0.01 0.9443737
## 4       20       12 0.01 0.9459529
## 5       20       11 0.01 0.9460676
# train the optimal_tree using the parameter values from the previous step
optimal_tree <- rpart(
    deny ~ .,
    data = training,
    method  = "anova",
    control = list(minsplit = 16, maxdepth = 14, cp = 0.01)
    )
# predict on testing data
# also, convert "deny" in the testing data back to factor class
testing$deny <- as.factor(testing$deny)
pred <- predict(optimal_tree, newdata = testing)
# tabulate predictions against the observed classes and compute an accuracy score
# (keep in mind that an anova tree returns numeric predictions, so the rows of this table are predicted leaf means, not class labels)

confmat <- table(pred, testing$deny)

(confmat[1, 1] + confmat[2, 2])/ sum(confmat) * 100
## [1] 66.38655
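# since method = "anova" returns numeric leaf means rather than class labels, an arguably cleaner accuracy
# (shown here as an optional sketch, not the original metric) is obtained by thresholding the predictions;
# the codes are 1 = "0" (not denied) and 2 = "1" (denied), so we cut at 1.5
pred_class <- ifelse(pred >= 1.5, 1, 0)
mean(pred_class == as.numeric(as.character(testing$deny))) * 100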

Bagging: an alternative approach to reduce model variance

Using the bootstrap aggregating (bagging) approach to reduce model variance.

library(rpart)
library(ipred)       # bagging() function
library(caret)       # createDataPartition() and train()
library(AER)         # HMDA data

# load and examine data
data(HMDA)
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
# subset and convert the dependent variable to numeric
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]
data$deny <- ifelse(data$deny == "yes", 1, 0) 

set.seed(123)

# 80-20 train-test split
train_row_numbers <- createDataPartition(data$deny, p=0.8, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]

# convert "deny" to factor class for classification task
training$deny <- as.factor(training$deny)
# train bagged model

bagged.fit <- bagging(
  deny ~ .,
  data = training,
  coob = TRUE
)

# display model output
bagged.fit
## 
## Bagging classification trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = deny ~ ., data = training, coob = TRUE)
## 
## Out-of-bag estimate of misclassification error:  0.1497
# how many trees are enough to stabilize model variance? let's check the out-of-bag error for 10 to 50 trees

ntree <- 10:50

# create an empty vector to store the OOB error values, with length equal to the number of tree settings
# (for a classification bagging model, $err is the OOB misclassification rate; we keep the variable name rmse)
rmse <- vector(mode = "numeric", length = length(ntree)) # we tell R the mode (class) of this vector and the length of the output we want

for (i in seq_along(ntree)) {
  
  # perform bagged model
  model <- bagging(
  deny ~ .,
  data = training,
  coob = TRUE,
  nbagg = ntree[i]
)
  # get OOB error
  rmse[i] <- model$err
}

plot(ntree, rmse, type = 'l', lwd = 2)   # type = 'l' draws a line plot, "lwd" sets the line width
abline(v = 25, col = "red", lty = "dashed")
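# optionally, read off the number of bagged trees with the lowest OOB error from the loop above
ntree[which.min(rmse)]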

## estimate using caret
# Specify 10-fold cross validation
control <- trainControl(method = "cv",  number = 10) 

# CV bagged model
bagged_cv.fit <- train(
  deny ~ .,
  data = training,
  method = "treebag",  # bagged CART (ipred::bagging under the hood)
  trControl = control,
  importance = TRUE,
  metric = "Accuracy"
  )

# display results
bagged_cv.fit
## Bagged CART 
## 
## 1904 samples
##    4 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1714, 1714, 1713, 1714, 1714, 1714, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8603114  0.1189729
# plot most important variables (note: we only have 4 variables)
plot(varImp(bagged_cv.fit), 4) 

# predict on the testing data, compare with the observed outcome, and calculate accuracy
# also convert deny in the testing data to factor class so that it can be tabulated in a confusion matrix

testing$deny <- as.factor(testing$deny)
pred <- predict(bagged_cv.fit, testing)

confmat <- table(pred, testing$deny)

(confmat[1, 1] + confmat[2, 2])/ sum(confmat) * 100
## [1] 86.34454
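# optionally, caret::confusionMatrix() reports accuracy together with kappa, sensitivity, and specificity in one call
confusionMatrix(pred, testing$deny, positive = "1")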

Random Forests

Averaging over an ensemble of de-correlated trees, each grown on a bootstrap sample with a random subset of features considered at each split

## install and load required packages
install.packages(c("randomForest", "ranger", "ggplot2"))
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(rsample)     # data splitting 
library(dplyr)       # data wrangling
library(rpart)       # performing regression trees
library(rpart.plot)  # plotting regression trees
library(ipred)       # bagging
library(caret)  
library(ranger)
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(ggplot2)
library(h2o)         # an efficient java-based platform
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
set.seed(123)  # for reproducibility 

library(AER)         # contains the HMDA data

# load and examine data
data(HMDA)
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
# subset the data; this time we keep "deny" as a factor so that randomForest() runs a classification forest
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]

# do an 80-20 train-test split
train_row_numbers <- createDataPartition(data$deny, p=0.8, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]


# default RF model
rf.fit <- randomForest(
  deny ~ .,
  data = training
)

rf.fit
## 
## Call:
##  randomForest(formula = deny ~ ., data = training) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 12.29%
## Confusion matrix:
##       no yes class.error
## no  1634  42  0.02505967
## yes  192  36  0.84210526
# plot the model
par(mar=c(2,2,2,6))  # par() sets or queries graphical parameters
# mar = c(bottom, left, top, right); here we leave a large margin on the right to make room for the legend
plot(rf.fit, main = "Random Forests model performance and the number of trees needed")
legend("right", colnames(rf.fit$err.rate), col=1:3, cex=0.8, fill=1:3)

# note: the "yes" and "no" lines are the class-specific prediction error rates, and OOB (the first column in rf.fit$err.rate) is their weighted average. As the number of trees increases, the OOB error typically stabilizes at a lower level because predictions are averaged over more trees.


# understanding "err.rate"
head(rf.fit$err.rate)
##            OOB         no       yes
## [1,] 0.1940086 0.10638298 0.7888889
## [2,] 0.1840069 0.09794319 0.8028169
## [3,] 0.1857542 0.09474522 0.8352273
## [4,] 0.1778191 0.08468596 0.8477157
## [5,] 0.1754794 0.07906977 0.8472222
## [6,] 0.1564778 0.06457801 0.8127854
# Each tree votes on the class of the observations that were left out of its bootstrap sample (its OOB observations). The forest's OOB prediction for an observation is the class with the most votes among the trees for which that observation was out-of-bag, and the share of "yes" votes can be read as a predicted probability; in the regression case it is the average prediction instead. In other words, the OOB error reported at each number of trees is NOT computed by comparing a single tree's prediction to its own OOB sample, but by comparing each observation to the prediction aggregated across all trees for which it was out-of-bag.
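# the OOB vote shares described above are stored in the fitted object: each row of rf.fit$votes gives an
# observation's fraction of OOB votes for each class, and rf.fit$predicted holds the corresponding OOB class predictions
head(rf.fit$votes)
head(rf.fit$predicted)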


# find the number of trees with the lowest OOB error
# the err.rate is measured as 1 - accuracy

which.min(rf.fit$err.rate[,1])
## [1] 231
# square root of the OOB error rate at that point (not a true RMSE, since this is a classification forest)
sqrt(rf.fit$err.rate[which.min(rf.fit$err.rate[,1])])
## [1] 0.3483153
# note that for classification tasks (factor outcomes), the model errors are stored as "err.rate" inside the randomForest object; an example with a continuous outcome, where the errors are stored as "mse", is illustrated below

# create ames data from AmesHousing
ames <- AmesHousing::make_ames()

set.seed(123)
train_row_numbers <- createDataPartition(ames$Sale_Price, p=0.75, list=FALSE)
ames_training <- ames[train_row_numbers, ]
ames_testing <- ames[-train_row_numbers, ]

# fit a random forest model of continuous outcome

# default RF model
ames_rf.fit <- randomForest(
  formula = Sale_Price ~ .,
  data    = ames_training
)

# check out the elements inside ames_rf.fit object
names(ames_rf.fit)
##  [1] "call"            "type"            "predicted"       "mse"            
##  [5] "rsq"             "oob.times"       "importance"      "importanceSD"   
##  [9] "localImportance" "proximity"       "ntree"           "mtry"           
## [13] "forest"          "coefs"           "y"               "test"           
## [17] "inbag"           "terms"
# number of trees with the lowest MSE
which.min(ames_rf.fit$mse)
## [1] 139
# RMSE of this optimal random forest
sqrt(ames_rf.fit$mse[which.min(ames_rf.fit$mse)])
## [1] 25632.45
## hyperparameter tuning for random forests

features <- training[-1]  # minus the dependent variable

set.seed(123)

rf.fit2 <- tuneRF(
           x = features,
           y = training$deny,
           ntreeTry = 500,  #num. of trees
          mtryStart = 5,    #starting num. of features tried at each split (we only have 4 predictors, so this is reset, hence the warning below)
         stepFactor = 1.5,  #at each iteration, mtry is inflated (or deflated) by this value
            improve = 0.01, # each iteration must improve the OOB error by this much in order to continue
              trace = FALSE    # to hide real-time progress, if you want it to be displayed, then select TRUE 
)
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## invalid mtry: reset to within valid range
## 0.008064516 0.01 
## -0.01209677 0.01
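# tuneRF() returns a matrix of the mtry values it tried and their OOB errors; assuming that structure,
# a quick way to pull out the best value is:
rf.fit2
best_mtry <- rf.fit2[which.min(rf.fit2[, "OOBError"]), "mtry"]
best_mtry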

## comparing run times of randomForest and ranger on the same model

system.time(
  ames_randomForest <- randomForest(
    deny ~ ., 
    data = training, 
    ntree = 500,
    mtry = floor(length(features) / 4)
  )
)
##    user  system elapsed 
##    0.38    0.03    0.42
system.time(
  ames_ranger <- ranger(
    deny ~ ., 
    data = training, 
    num.trees = 500,
    mtry = floor(length(features) / 4)
  )
)
##    user  system elapsed 
##    0.47    0.01    0.19
# ranger is a bit faster!
# set up hyperparameter tuning grid

hyper_grid_2 <- expand.grid(
  mtry = seq(1, 4, by = 1),
  node_size = seq(3, 9, by = 2),
  sampe_size = c(.5, .6, .75, .80),
  OOB_RMSE = 0
)

# check out the number of unique combination of hyperparameters
nrow(hyper_grid_2)
## [1] 64
# tuning over hyper_grid_2

for(i in 1:nrow(hyper_grid_2)) {
  
  # train model
  model <- ranger(
    deny ~ ., 
    data = training, 
    num.trees = 100,
    mtry = hyper_grid_2$mtry[i],
    min.node.size = hyper_grid_2$node_size[i],
    sample.fraction = hyper_grid_2$sampe_size[i],
    seed = 123
  )
  
  # add the OOB error to the grid; note how we index into the pre-allocated OOB_RMSE column
  hyper_grid_2$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

# display the 10 best performing models
hyper_grid_2 %>% 
  dplyr::arrange(OOB_RMSE) %>%
  head(10)
##    mtry node_size sampe_size  OOB_RMSE
## 1     1         9       0.60 0.3368165
## 2     1         7       0.80 0.3375953
## 3     2         7       0.75 0.3383723
## 4     1         9       0.80 0.3391475
## 5     1         3       0.50 0.3399209
## 6     1         7       0.50 0.3399209
## 7     1         9       0.50 0.3399209
## 8     3         9       0.50 0.3399209
## 9     1         3       0.60 0.3399209
## 10    1         5       0.60 0.3399209
# refit (one of) the best performing models repeatedly to see how stable its OOB error is
# generate an empty vector to hold 100 OOB RMSE values
OOB_RMSE <- vector(mode = "numeric", length = 100)

# refit the model 100 times (each run draws different bootstrap samples)
for(i in seq_along(OOB_RMSE)) {

  optimal_ranger <- ranger(
    deny ~ ., 
    data = training, 
    num.trees = 100,
    mtry = 1,
    min.node.size = 7,
    sample.fraction = .5,
    importance = 'impurity'
  )
  
  OOB_RMSE[i] <- sqrt(optimal_ranger$prediction.error)
}

# display the distribution of the OOB RMSE across the 100 refits

hist(OOB_RMSE, breaks = 10)   # breaks sets the (approximate) number of bins

## visualizing variable importance in descending order

df <- data.frame(vname = names(data[-1]), importance = optimal_ranger$variable.importance)

ggplot(df, aes(x = reorder(vname, -importance), y = importance, fill = vname, label = round(importance, digits = 2))) +
  geom_bar(stat = "identity") +
  ggtitle("Most important variables (in the order of 1 to 4)") +
  xlab("Variable") + 
  ylab("Importance") +
  geom_text(size = 3, position = position_stack(vjust = 0.5), col = "white")

## A built-in advantage of the out-of-bag (OOB) error estimate: it often works like a charm as an internal validation measure, even without hyperparameter tuning. We will illustrate this by comparing the OOB error with a held-out test error on the AmesHousing data we just created.

# create training and validation data, again 75-25 split
set.seed(123)
valid_split <- initial_split(ames_training, .75)

# training data
ames_train_v2 <- analysis(valid_split)

# validation data
ames_valid <- assessment(valid_split)
x_test <- ames_valid[setdiff(names(ames_valid), "Sale_Price")]   # assign every column except "Sale_Price" to x_test
y_test <- ames_valid$Sale_Price

rf_oob_comp <- randomForest(
  formula = Sale_Price ~ .,
  data    = ames_train_v2,
  xtest   = x_test,
  ytest   = y_test
)

# extract OOB & validation errors
oob <- sqrt(rf_oob_comp$mse)
validation <- sqrt(rf_oob_comp$test$mse)  # mse in the test set

# compare error rates
tibble::tibble(
  `Out of Bag Error` = oob,
  `Test error` = validation,
  ntrees = 1:rf_oob_comp$ntree
) %>%
  gather(Metric, RMSE, -ntrees) %>%   # gather columns (from a dataframe-compatible object) into key-value pairs
  ggplot(aes(ntrees, RMSE, color = Metric)) +
  geom_line() +
  scale_y_continuous(labels = scales::dollar) +
  xlab("Number of trees")
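# as a small numeric follow-up to the plot, compare the OOB and test RMSE at the final (500th) tree
sqrt(rf_oob_comp$mse[rf_oob_comp$ntree])
sqrt(rf_oob_comp$test$mse[rf_oob_comp$ntree])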

Gradient Boosting Machine (GBM)

Boosting and Gradient Descent

# load required packages
install.packages(c("gbm", "xgboost", "pdp", "ggplot2"))
library(rsample)      # data splitting 
library(gbm)          # basic implementation
## Loaded gbm 2.1.8
library(xgboost)      # a faster implementation of gbm
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(caret)        # an aggregator package for performing many machine learning models
library(pdp)          # model visualization: partial plot
library(purrr)        # model visualization
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:pdp':
## 
##     partial
## The following object is masked from 'package:car':
## 
##     some
## The following object is masked from 'package:caret':
## 
##     lift
library(ggplot2)      # model visualization


# load the data and do train-test split

CBI <- read.csv("https://www.dropbox.com/s/426fnyypna3h0bo/CBI.csv?dl=1")
names(CBI)[23] <- "cbi"
df <- CBI[complete.cases(CBI), c("polity", "cbi", "pol_polarization", "gdppc", "nkaopen", "cpi")]

set.seed(123)
train_row_numbers <- createDataPartition(df$polity, p=0.75, list=FALSE)
training <- df[train_row_numbers, ]
testing <- df[-train_row_numbers, ]

# fit a GBM with 10000 trees, tree depth of 1 (stumps), learning rate = 0.001, using 5-fold CV

gbm.fit <- gbm(
  cbi ~ .,
  distribution = "gaussian",
  data = training,
  n.trees = 10000,  # num. of trees
  interaction.depth = 1,  # tree depth
  shrinkage = 0.001,   # learning rate
  cv.folds = 5,     # n-fold cross-validation
  n.cores = NULL, # will use all cores by default
  verbose = FALSE  # display log
  )  


# find the minimum cross-validation error (MSE under squared-error loss) and compute the corresponding RMSE
min_MSE <- which.min(gbm.fit$cv.error)  # get the index of the minimum CV error
sqrt(gbm.fit$cv.error[min_MSE])
## [1] 0.1839064
# plot the loss function as a function of the number of trees added to the ensemble
par("mar")
## [1] 5.1 4.1 4.1 2.1
par(mar=c(1,1,1,1))
gbm.perf(gbm.fit, method = "cv")

## [1] 10000
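# as a sketch, use the CV-selected number of trees to predict on the held-out testing set and compute the test RMSE
best_iter <- gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)
gbm_pred <- predict(gbm.fit, newdata = testing, n.trees = best_iter)
sqrt(mean((gbm_pred - testing$cbi)^2))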
## you can also do the same thing using the train() function from caret

set.seed(123)

control <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)

gbm_caret.fit1 <- train(cbi ~ ., data = training, 
                 method = "gbm", 
                 trControl = control,
                 ## This last option is actually one
                 ## for gbm() that passes through
                 verbose = FALSE)
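# caret stores the winning hyperparameter combination in bestTune; inspecting it (and the held-out test RMSE)
# is a quick sanity check -- shown here as an optional aside
gbm_caret.fit1$bestTune
caret_pred <- predict(gbm_caret.fit1, newdata = testing)
sqrt(mean((caret_pred - testing$cbi)^2))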



## tuning over four hyperparameters: learning rate (shrinkage), tree depth (interaction.depth), minimum observations per terminal node (n.minobsinnode), and the fraction of the sample used for subsampling (bag.fraction)

hyper_grid1 <- expand.grid(
  shrinkage = c(.01, .1, .3),
  interaction.depth = c(1, 3, 5),
  n.minobsinnode = c(5, 10, 15),
  bag.fraction = c(.65, .8, 1), 
  optimal_trees = 0,               
  min_RMSE = 0                     
)

# how many unique combinations do we have?
nrow(hyper_grid1)
## [1] 81
random_index <- sample(1:nrow(training), nrow(training))
random_training <- training[random_index, ]

# grid search 
for(i in 1:nrow(hyper_grid1)) {
  
  # train model
  gbm.tune <- gbm(
    cbi ~ .,
    distribution = "gaussian",
    data = random_training,
    n.trees = 5000,
    interaction.depth = hyper_grid1$interaction.depth[i],
    shrinkage = hyper_grid1$shrinkage[i],
    n.minobsinnode = hyper_grid1$n.minobsinnode[i],
    bag.fraction = hyper_grid1$bag.fraction[i],
    train.fraction = .75,
    n.cores = NULL, # will use all cores by default
    verbose = FALSE
  )
  
  # add the minimum validation error and the corresponding number of trees to the grid
  hyper_grid1$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid1$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}

# display the top 10 performing models, in ascending order of validation RMSE

hyper_grid1 %>% 
  dplyr::arrange(min_RMSE) %>%
  head(10)
##    shrinkage interaction.depth n.minobsinnode bag.fraction optimal_trees
## 1       0.01                 5             10         1.00           661
## 2       0.30                 5              5         1.00            14
## 3       0.30                 5              5         0.80            21
## 4       0.30                 5             15         0.80            30
## 5       0.30                 5             10         0.80            10
## 6       0.01                 5              5         1.00           679
## 7       0.10                 5             10         1.00            60
## 8       0.10                 5              5         1.00            70
## 9       0.10                 5             10         0.65           103
## 10      0.30                 3             10         0.65            80
##     min_RMSE
## 1  0.1240678
## 2  0.1248653
## 3  0.1258153
## 4  0.1262704
## 5  0.1265303
## 6  0.1269341
## 7  0.1272763
## 8  0.1275816
## 9  0.1281732
## 10 0.1288596
## do this again over a more refined hyperparameter space

hyper_grid2 <- expand.grid(
  shrinkage = c(0.1, 0.3),
  interaction.depth = c(3, 5),
  n.minobsinnode = c(10, 15),
  bag.fraction = c(0.8, 1), 
  optimal_trees = 0,               
  min_RMSE = 0                     
)

# total number of combinations
nrow(hyper_grid2)
## [1] 16
# grid search 
for(i in 1:nrow(hyper_grid2)) {
  
  # reproducibility
  set.seed(123)
  
  # train model
  gbm.tune <- gbm(
    cbi ~ .,
    distribution = "gaussian",
    data = random_training,
    n.trees = 5000,
    interaction.depth = hyper_grid2$interaction.depth[i],
    shrinkage = hyper_grid2$shrinkage[i],
    n.minobsinnode = hyper_grid2$n.minobsinnode[i],
    bag.fraction = hyper_grid2$bag.fraction[i],
    train.fraction = .75,
    n.cores = NULL, # will use all cores by default
    verbose = FALSE
  )
  
  # add the minimum validation error and the corresponding number of trees to the grid
  hyper_grid2$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid2$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}

# display top 10 performing models
hyper_grid2 %>% 
  dplyr::arrange(min_RMSE) %>%
  head(10)
##    shrinkage interaction.depth n.minobsinnode bag.fraction optimal_trees
## 1        0.3                 5             10          0.8            16
## 2        0.1                 5             10          0.8            84
## 3        0.1                 5             10          1.0            60
## 4        0.1                 5             15          0.8           110
## 5        0.3                 5             10          1.0            28
## 6        0.3                 3             10          0.8           120
## 7        0.1                 5             15          1.0           227
## 8        0.3                 5             15          1.0            68
## 9        0.3                 3             15          0.8            84
## 10       0.3                 3             15          1.0            48
##     min_RMSE
## 1  0.1217854
## 2  0.1268901
## 3  0.1272763
## 4  0.1305541
## 5  0.1308355
## 6  0.1310061
## 7  0.1314758
## 8  0.1320263
## 9  0.1323270
## 10 0.1328844
# train final GBM model
gbm.fit.final <- gbm(
  cbi ~ .,
  distribution = "gaussian",
  data = training,
  n.trees = 81,
  interaction.depth = 5,
  shrinkage = 0.3,
  n.minobsinnode = 10,
  bag.fraction = 0.8, 
  train.fraction = 0.63,
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
  )  

# examine the result
gbm.fit.final
## gbm(formula = cbi ~ ., distribution = "gaussian", data = training, 
##     n.trees = 81, interaction.depth = 5, n.minobsinnode = 10, 
##     shrinkage = 0.3, bag.fraction = 0.8, train.fraction = 0.63, 
##     verbose = FALSE, n.cores = NULL)
## A gradient boosted model with gaussian loss function.
## 81 iterations were performed.
## The best test-set iteration was 24.
## There were 5 predictors of which 5 had non-zero influence.
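# as an optional check before looking at variable importance, evaluate the final model on the held-out testing set
final_pred <- predict(gbm.fit.final, newdata = testing, n.trees = gbm.fit.final$n.trees)
sqrt(mean((final_pred - testing$cbi)^2))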
# visualize variable importance

par(mar = c(5, 8, 1, 1))
summary(
  gbm.fit.final, 
  cBars = 5,
  method = relative.influence, # you can also use permutation.test.gbm
  las = 2
  )

##                               var   rel.inf
## cpi                           cpi 44.590056
## gdppc                       gdppc 32.134923
## nkaopen                   nkaopen 11.961609
## polity                     polity  8.849158
## pol_polarization pol_polarization  2.464254
# the "las" argument inside summary() sets the orientation of the axis tick labels: 0 = always parallel to the axis (the default), 1 = always horizontal, 2 = always perpendicular to the axis, 3 = always vertical.


# a more handy way is to simply do this 
summary(gbm.fit.final)

##                               var   rel.inf
## cpi                           cpi 44.590056
## gdppc                       gdppc 32.134923
## nkaopen                   nkaopen 11.961609
## polity                     polity  8.849158
## pol_polarization pol_polarization  2.464254
## Partial dependence plot (PDP): how the model's predicted outcome changes as one explanatory variable varies, averaging over the others (with ice = TRUE we also get the individual conditional expectation curves)
## we estimate the uncentered and centered versions and plot them side by side

ice1 <- gbm.fit.final %>%
  pdp::partial(
    pred.var = "cpi", 
    n.trees = gbm.fit.final$n.trees, 
    grid.resolution = 100,
    ice = TRUE
    ) %>%
  autoplot(rug = TRUE, train = training, alpha = .1) +
  xlab("CPI") +
  ylab(expression(hat(y))) +
  ggtitle("Non-centered")
## Warning in partial.default(., pred.var = "cpi", n.trees =
## gbm.fit.final$n.trees, : Recursive method not available for "gbm" objects when
## `ice = TRUE`. Using brute force method instead.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Ignoring unknown parameters: csides
ice2 <- gbm.fit.final %>%
  pdp::partial(
    pred.var = "cpi", 
    n.trees = gbm.fit.final$n.trees, 
    grid.resolution = 100,
    ice = TRUE
    ) %>%
  autoplot(rug = TRUE, train = training, alpha = .1, center = TRUE) +
  xlab("CPI") +
  ylab(expression(hat(y))) +
  ggtitle("Centered")
## Warning in partial.default(., pred.var = "cpi", n.trees =
## gbm.fit.final$n.trees, : Recursive method not available for "gbm" objects when
## `ice = TRUE`. Using brute force method instead.
## Warning: `fun.y` is deprecated. Use `fun` instead.
## Warning: Ignoring unknown parameters: csides
par("mar")
## [1] 5 8 1 1
par(mar=c(1,1,1,1))
gridExtra::grid.arrange(ice1, ice2, nrow = 1)
## Warning: Use of `object[["yhat.id"]]` is discouraged. Use `.data[["yhat.id"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat.id"]]` is discouraged. Use `.data[["yhat.id"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `object[[1L]]` is discouraged. Use `.data[[1L]]` instead.
## Warning: Use of `object[["yhat"]]` is discouraged. Use `.data[["yhat"]]`
## instead.
## Warning: Use of `x.rug[[1L]]` is discouraged. Use `.data[[1L]]` instead.

Supplementary materials

GBM on data of binary outcomes

library(rsample)      # data splitting 
library(gbm)          # basic implementation
library(xgboost)      # a faster implementation of gbm
library(caret)        # createDataPartition()
library(AER)          # HMDA data

# load and subset data
data(HMDA)
HMDA$deny <- ifelse(HMDA$deny == "yes", 1, 0)
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]

# do a 75-25 train-test split
set.seed(123)
train_row_numbers <- createDataPartition(data$deny, p=0.75, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]


# fit a GBM with a Bernoulli distribution (logistic loss) for the binary outcome
gbm.logit.fit <- gbm(deny ~ .,
                data = training,
                interaction.depth = 2,
                distribution = "bernoulli",  # for binary outcome
                cv.folds = 5,
                shrinkage = .1,
                n.trees = 50)

# examine output and model performance
summary(gbm.logit.fit)

##         var   rel.inf
## lvrat lvrat 49.306624
## hirat hirat 39.715339
## mhist mhist  7.950341
## unemp unemp  3.027695
gbm.perf(gbm.logit.fit, method = "cv")

## [1] 38
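# as a sketch, predict denial probabilities on the testing set with the CV-chosen number of trees,
# threshold at 0.5 (an arbitrary, untuned cutoff), and compute accuracy
best_iter <- gbm.perf(gbm.logit.fit, method = "cv", plot.it = FALSE)
prob <- predict(gbm.logit.fit, newdata = testing, n.trees = best_iter, type = "response")
pred_class <- ifelse(prob > 0.5, 1, 0)
mean(pred_class == testing$deny) * 100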