Naïve Bayes

## Load required packages and data
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
library(e1071)
## Warning: package 'e1071' was built under R version 4.4.3
# install.packages("psych") if you haven't installed this package
library(psych)  # for pairs.panels() function
## Warning: package 'psych' was built under R version 4.4.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Load data from our dropbox shared folder and preliminary check
pokemon <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
names(pokemon)
##  [1] "abilities"         "against_bug"       "against_dark"     
##  [4] "against_dragon"    "against_electric"  "against_fairy"    
##  [7] "against_fight"     "against_fire"      "against_flying"   
## [10] "against_ghost"     "against_grass"     "against_ground"   
## [13] "against_ice"       "against_normal"    "against_poison"   
## [16] "against_psychic"   "against_rock"      "against_steel"    
## [19] "against_water"     "attack"            "base_egg_steps"   
## [22] "base_happiness"    "base_total"        "capture_rate"     
## [25] "classfication"     "defense"           "experience_growth"
## [28] "height_m"          "hp"                "japanese_name"    
## [31] "name"              "percentage_male"   "pokedex_number"   
## [34] "sp_attack"         "sp_defense"        "speed"            
## [37] "type1"             "type2"             "weight_kg"        
## [40] "generation"        "is_legendary"
names(pokemon)[25] <- "classification"

# Create a new column "type" to denote three main "Gosanke(s)"
pokemon$type <- NA
pokemon$type[pokemon$type1 == "grass" | pokemon$type2 == "grass"] <- "grass"
pokemon$type[pokemon$type1 == "water" | pokemon$type2 == "water"] <- "water"
pokemon$type[pokemon$type1 == "fire" | pokemon$type2 == "fire"] <- "fire"
pokemon$type <- as.factor(pokemon$type)       # convert the outcome to factor class, as naiveBayes() expects a factor for classification

# Remove rows containing NA in any of the variables and remove irrelevant and text features
pokemon <- pokemon[complete.cases(pokemon), -c(1, 30, 31, 33, 37, 38, 40, 41)]

# Temporarily assign type column to y
y <- pokemon$type  # move out type as y

# Convert capture_rate (stored as text) to numeric
pokemon$capture_rate <- as.numeric(pokemon$capture_rate)

## Preprocess the data, "range" scales the data to the interval between zero and one
pokemon_normalize <- preProcess(pokemon, method = "range")
pokemon <- predict(pokemon_normalize, newdata = pokemon)

# Add the original type value (y) back in
pokemon$type <- y    

## Check the correlation between each pair of explanatory variables. Naive Bayes assumes the predictors are conditionally independent, so they should not be highly correlated. (This may take quite some time to run, be patient.)
# par(mar=c(1,1,1,1))
# pairs.panels(pokemon[-34])  # Remove the dependent variable "type"


## 75-25 train-test split on Y before testing the model
train_row_numbers <- createDataPartition(pokemon$type, p=0.75, list=FALSE)
training <- pokemon[train_row_numbers, ]
testing <- pokemon[-train_row_numbers, ]

## Fit a two-variable Naive Bayes model on the training data
nb.fit <- naiveBayes(type ~ against_fire + against_water,
                       data = training)

## You can also do it manually
# "tables" is an element of your naiveBayes object, with one entry per predictor. Within each entry, the rows are the outcome classes and the columns give the conditional mean and standard deviation of that predictor.
nb.fit$tables
## $against_fire
##        against_fire
## Y             [,1]       [,2]
##   fire  0.07142857 0.03086067
##   grass 0.52323232 0.18424897
##   water 0.07692308 0.04653216
## 
## $against_water
##        against_water
## Y             [,1]       [,2]
##   fire  0.46031746 0.04114756
##   grass 0.07272727 0.02798601
##   water 0.08351648 0.04928467
# For each element inside nb.fit$tables, the first column is the "mean", the second column is the "s.d."
nb.fit$tables$against_fire
##        against_fire
## Y             [,1]       [,2]
##   fire  0.07142857 0.03086067
##   grass 0.52323232 0.18424897
##   water 0.07692308 0.04653216
# Plot each predictor (x-axis) against outcome (type)
plot(0:3, xlim=c(-0.8,1), ylim=c(0,15), col="red", ylab="density", type="n", xlab="Against fire", main="Type distribution for each type")   # type = "n" is used when drawing multiple graphs (e.g., lines) on the same plot
curve(dnorm(x, nb.fit$tables$against_fire[1,1], nb.fit$tables$against_fire[1,2]), add=TRUE, col="red")   # Density for "fire" (row 1)
curve(dnorm(x, nb.fit$tables$against_fire[2,1], nb.fit$tables$against_fire[2,2]), add=TRUE, col="green") # Density for "grass" (row 2)
curve(dnorm(x, nb.fit$tables$against_fire[3,1], nb.fit$tables$against_fire[3,2]), add=TRUE, col="blue")  # Density for "water" (row 3)
# dnorm() returns the value of the probability density function for the normal distribution given parameters x, μ (mean), and σ (s.d.)
legend("topleft", c("fire", "grass", "water"), col = c("red","green","blue"), lwd=1) # Place the legend on the top-left corner and mark by different colors

# against_fire seems to be a good predictor of the "grass" type: its conditional mean is much higher for grass than for fire or water.

# Note: dnorm() takes the vector of quantiles first, then the mean, then the s.d. The means are stored in the first column of nb.fit$tables$against_fire and the s.d. values in the second column.
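# As a minimal sketch (not part of the original workflow), the class posteriors can be reproduced by hand
# from the priors and the Gaussian parameters stored in nb.fit; x_new below is a hypothetical observation
# on the normalized [0, 1] scale.
x_new <- data.frame(against_fire = 0.5, against_water = 0.1)
priors <- nb.fit$apriori / sum(nb.fit$apriori)   # class priors P(type) from the training data
lik_fire  <- dnorm(x_new$against_fire,  nb.fit$tables$against_fire[, 1],  nb.fit$tables$against_fire[, 2])
lik_water <- dnorm(x_new$against_water, nb.fit$tables$against_water[, 1], nb.fit$tables$against_water[, 2])
posterior <- priors * lik_fire * lik_water       # numerator of Bayes' rule for each class
posterior / sum(posterior)                       # should agree with predict(nb.fit, x_new, type = "raw")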

## or, you can use the more handy function from klaR package
# install.packages("klaR") if you haven't installed it
library(klaR)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Plot the conditional densities of each predictor (one plot per variable)
nb.fit1 <- NaiveBayes(type ~ against_fire + against_water, data = training)  # Note that the NB function from the klaR package begins with a capital letter
plot(nb.fit1)

## Predict on training data
predict_train_nb <- predict(nb.fit, training)

## Generate confusion matrix
confmat_train_nb <- table(predict_train_nb, training$type)
confmat_train_nb
##                 
## predict_train_nb fire grass water
##            fire    41     0     0
##            grass    0    64     7
##            water    1     2    84
## Calculate accuracy: sum the diagonal of the 3-by-3 confusion matrix and divide by the total count
(confmat_train_nb[1, 1] + confmat_train_nb[2, 2] + confmat_train_nb[3, 3])/ sum(confmat_train_nb) * 100
## [1] 94.97487
## Predict on testing
predict_test_nb <- predict(nb.fit, testing)

## Generate confusion matrix                           
confmat_test_nb <- table(predict_test_nb, testing$type)
confmat_test_nb
##                
## predict_test_nb fire grass water
##           fire    13     0     0
##           grass    0    21     5
##           water    0     0    25
## Calculate accuracy
(confmat_test_nb[1, 1] + confmat_test_nb[2, 2] + confmat_test_nb[3, 3])/ sum(confmat_test_nb) * 100
## [1] 92.1875
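# As a quick aside (a sketch using caret, which is already loaded), confusionMatrix() reports the same
# accuracy together with per-class sensitivity and specificity for the three types.
confusionMatrix(predict_test_nb, testing$type)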
## Refit the Naive Bayes model with Laplace smoothing, setting the smoothing parameter to 0.5
nb_laplace.fit <- naiveBayes(type ~ against_fire + against_water, data = training, laplace = 0.5)

# The higher the laplace value (a.k.a. the alpha value), the stronger the smoothing
## Predict on testing data
predict_test_nb_laplace <- predict(nb_laplace.fit, testing)


## Generate confusion matrix                           
confmat_test_nb_laplace <- table(predict_test_nb_laplace, testing$type)
confmat_test_nb_laplace
##                        
## predict_test_nb_laplace fire grass water
##                   fire    13     0     0
##                   grass    0    21     5
##                   water    0     0    25
## Calculate accuracy
(confmat_test_nb_laplace[1, 1] + confmat_test_nb_laplace[2, 2] + confmat_test_nb_laplace[3, 3])/ sum(confmat_test_nb_laplace) * 100
## [1] 92.1875
## Laplace smoothing does not change the results here. That is expected: in e1071, the laplace argument only affects categorical (factor) predictors, and both of our predictors are numeric, so the Gaussian estimates are untouched. With categorical features and sparse cells, tweaking the Laplace value could make a difference.

Regression tree on HMDA data

Identifying important variables in mortgage denial

## load required packages

# install.packages(c("rsample", "dplyr", "rpart", "rpart.plot", "ipred", "caret"))
library(rsample)     # data splitting 
## 
## Attaching package: 'rsample'
## The following object is masked from 'package:e1071':
## 
##     permutations
library(dplyr)       # data wrangling
library(rpart)       # perform regression trees
library(rpart.plot)  # plot regression trees
library(ipred)       # bagging
library(caret)       # bagging
library(AER)         # HMDA data
## Warning: package 'AER' was built under R version 4.4.3
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
# Load and examine data
data(HMDA)
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
# Subset variables
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]
# Convert the dependent variable to factor class
data$deny = as.factor(ifelse(data$deny == "no", 0, 1))

# Do an 80-20 train-test split
set.seed(123)
train_row_numbers <- createDataPartition(data$deny, p=0.8, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]

# Fit the model and examine the output
# method = "anova" fits a regression tree, treating the outcome as numeric; this is fine for a binary (0, 1) dependent variable. For a genuine classification task with a categorical label (e.g., {1, 2, 3}) that is not bounded between 0 and 1, use method = "class" rather than "anova"
tree.fit1 <- rpart(deny ~ ., data = training, method  = "anova")

# Read the model output (from left to right: node number, split rule, n (obs in the node), deviance, fitted value yval)

tree.fit1
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1904 200.697500 1.119748  
##    2) lvrat< 0.9479871 1825 171.800500 1.105205  
##      4) hirat< 0.3657 1771 151.252400 1.094297 *
##      5) hirat>=0.3657 54  13.425930 1.462963  
##       10) lvrat< 0.8812374 45  10.311110 1.355556 *
##       11) lvrat>=0.8812374 9   0.000000 2.000000 *
##    3) lvrat>=0.9479871 79  19.594940 1.455696  
##      6) hirat< 0.323 66  15.530300 1.378788 *
##      7) hirat>=0.323 13   1.692308 1.846154 *
# The Model Summary lists the variables that were used to construct the model. We can see that for this tree model, only two of the four right-hand-side variables were actually used for splitting. What are they?

# Root node error is the error at the root node before any splits (here, the root-node MSE: 200.7/1904 = 0.105). Multiplying it by rel error or xerror, both reported in the pruning table along with CP (the complexity parameter: the minimum improvement required for a split), converts those relative measures into absolute error rates. Let's check it out.

######## Reading the output of cptable (from rpart) #########
tree.fit1$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.04634834      0 1.0000000 1.0009326 0.05373051
## 2 0.03548735      1 0.9536517 1.0009418 0.05369544
## 3 0.01551995      2 0.9181643 0.9571625 0.05180856
## 4 0.01182041      3 0.9026444 0.9445115 0.05207255
## 5 0.01000000      4 0.8908240 0.9488196 0.05245405
# rel error is the relative error: the tree's RSS divided by the RSS at the root, i.e., 1 - R^2; xerror is the cross-validation error (note that rpart has built-in CV)
# Root Node Error * rel error is the "resubstitution error rate" (the error rate computed on the training sample)
# Root Node Error * xerror is the cross-validated error rate, which is a more objective measure of predictive accuracy
# nsplit is the number of splits used to construct the tree
# CP (the first column) is the complexity parameter (alpha)
# Each row of the pruning table corresponds to a tree of a given size (depth), with the statistics computed at that size; this helps you decide where to prune
# A deeper tree has lower error on the training data, but it risks overfitting: the cross-validation error typically starts growing again once the tree passes its 'optimal' size (compare the first two rows)
# The rule of thumb used here is to pick the smallest number of splits where rel error + xstd < xerror, which the following code retrieves:
CPtable <- as.data.frame(tree.fit1$cptable) # Convert the cptable to a dataframe object for the ease of data retrieval
CPtable$nsplit[CPtable$`rel error` + CPtable$`xstd` < CPtable$`xerror`][1] # Conditional subset and display the first "nsplit" value that satisfies this criterion
## [1] 4
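# If you want to act on that choice, rpart's prune() cuts the tree back at the cp associated with the
# selected number of splits. A minimal sketch, assuming the nsplit value of 4 found above:
tree.pruned <- prune(tree.fit1, cp = CPtable$CP[CPtable$nsplit == 4])
tree.pruned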
# Finally, summary() lists, node by node, the split variables and thresholds, together with how the observations were divided at each split.
summary(tree.fit1)
## Call:
## rpart(formula = deny ~ ., data = training, method = "anova")
##   n= 1904 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.04634834      0 1.0000000 1.0009326 0.05373051
## 2 0.03548735      1 0.9536517 1.0009418 0.05369544
## 3 0.01551995      2 0.9181643 0.9571625 0.05180856
## 4 0.01182041      3 0.9026444 0.9445115 0.05207255
## 5 0.01000000      4 0.8908240 0.9488196 0.05245405
## 
## Variable importance
## lvrat hirat 
##    56    44 
## 
## Node number 1: 1904 observations,    complexity param=0.04634834
##   mean=1.119748, MSE=0.1054083 
##   left son=2 (1825 obs) right son=3 (79 obs)
##   Primary splits:
##       lvrat < 0.9479871 to the left,  improve=0.046348340, (0 missing)
##       hirat < 0.3657    to the left,  improve=0.045842610, (0 missing)
##       mhist splits as  LRRR, improve=0.007291801, (0 missing)
##       unemp < 3.4       to the left,  improve=0.003667915, (0 missing)
##   Surrogate splits:
##       hirat < 0.92      to the left,  agree=0.959, adj=0.013, (0 split)
## 
## Node number 2: 1825 observations,    complexity param=0.03548735
##   mean=1.105205, MSE=0.09413729 
##   left son=4 (1771 obs) right son=5 (54 obs)
##   Primary splits:
##       hirat < 0.3657    to the left,  improve=0.041456340, (0 missing)
##       lvrat < 0.7481748 to the left,  improve=0.014679240, (0 missing)
##       mhist splits as  LRRR, improve=0.006066354, (0 missing)
##       unemp < 3.4       to the left,  improve=0.005166927, (0 missing)
## 
## Node number 3: 79 observations,    complexity param=0.01182041
##   mean=1.455696, MSE=0.2480372 
##   left son=6 (66 obs) right son=7 (13 obs)
##   Primary splits:
##       hirat < 0.323     to the left,  improve=0.121068300, (0 missing)
##       lvrat < 0.9532624 to the left,  improve=0.023732580, (0 missing)
##       unemp < 3.4       to the right, improve=0.007657016, (0 missing)
##       mhist splits as  LRLR, improve=0.002931202, (0 missing)
## 
## Node number 4: 1771 observations
##   mean=1.094297, MSE=0.08540508 
## 
## Node number 5: 54 observations,    complexity param=0.01551995
##   mean=1.462963, MSE=0.2486283 
##   left son=10 (45 obs) right son=11 (9 obs)
##   Primary splits:
##       lvrat < 0.8812374 to the left,  improve=0.23200000, (0 missing)
##       mhist splits as  LRR-, improve=0.15167810, (0 missing)
##       hirat < 0.379     to the right, improve=0.04555665, (0 missing)
##       unemp < 3.4       to the left,  improve=0.03955893, (0 missing)
## 
## Node number 6: 66 observations
##   mean=1.378788, MSE=0.2353076 
## 
## Node number 7: 13 observations
##   mean=1.846154, MSE=0.1301775 
## 
## Node number 10: 45 observations
##   mean=1.355556, MSE=0.2291358 
## 
## Node number 11: 9 observations
##   mean=2, MSE=0
# Note how the MSE changes at each split
# A surrogate split is an alternative split rule that mimics the primary split; it is used when the primary splitting variable is missing for an observation.

# Alternatively, you may use printcp() to get cptable
printcp(tree.fit1)
## 
## Regression tree:
## rpart(formula = deny ~ ., data = training, method = "anova")
## 
## Variables actually used in tree construction:
## [1] hirat lvrat
## 
## Root node error: 200.7/1904 = 0.10541
## 
## n= 1904 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.046348      0   1.00000 1.00093 0.053731
## 2 0.035487      1   0.95365 1.00094 0.053695
## 3 0.015520      2   0.91816 0.95716 0.051809
## 4 0.011820      3   0.90264 0.94451 0.052073
## 5 0.010000      4   0.89082 0.94882 0.052454
# Plot it and compare with your model output, tree.fit1
rpart.plot(tree.fit1)

# How many terminal nodes are enough? Let's plot it
plotcp(tree.fit1)

# Are more terminal nodes always better? Let's grow a full tree by setting cp = 0 (no complexity penalty, which results in a fully grown tree).
tree.fit2 <- rpart(deny ~ ., data = training, method  = "anova", control = list(cp = 0, xval = 10))  # xval is the number of cross-validation folds

plotcp(tree.fit2)
abline(v = 4, lty = "dashed")  # "v" adds a vertical line; here it marks the tree size around which the cross-validation error bottoms out

## Hyperparameter tuning: find the optimal splits and depth

# We first do it manually
tree.fit3 <- rpart(deny ~ ., data = training, method  = "anova", control = list(
                  minsplit = 10,  # minimum number of observations in a single node
                  maxdepth = 12,  # maximum depth (node #) of the final tree, root node is counted as depth 0
                  xval = 10))  # number of cross-validation (CV) folds

# Check out the result
tree.fit3$cptable
##           CP nsplit rel error    xerror       xstd
## 1 0.04634834      0 1.0000000 1.0010384 0.05373607
## 2 0.03548735      1 0.9536517 0.9900219 0.05336842
## 3 0.01551995      2 0.9181643 0.9477492 0.05161866
## 4 0.01182041      3 0.9026444 0.9383437 0.05218819
## 5 0.01000000      4 0.8908240 0.9375020 0.05249234
## Tuning via grid search

# How many different combinations do we have?
hyper_grid <- expand.grid(
  minsplit = seq(5, 20, 1),  # from 5 to 20, with an increment of 1
  maxdepth = seq(8, 15, 1)
)

nrow(hyper_grid) 
## [1] 128
# Check out the first 5 rows
head(hyper_grid, 5)
##   minsplit maxdepth
## 1        5        8
## 2        6        8
## 3        7        8
## 4        8        8
## 5        9        8
# Insert the hyper_grid into the model and iterate through a loop

# Create an empty list to append estimation result from each iteration of the loop
trees <- list()


for (i in 1:nrow(hyper_grid)) {
  
  # Get minsplit, maxdepth values at row i of hyper_grid
  minsplit <- hyper_grid$minsplit[i]
  maxdepth <- hyper_grid$maxdepth[i]
  # Train a model and store in the list
  # R will feed i-th minsplit and maxdepth values from the i-th row of hyper_grid into rpart()'s control function as hyperparameters
  trees[[i]] <- rpart(
    deny ~ .,
    data = training,
    method  = "anova",
    control = list(minsplit = minsplit, maxdepth = maxdepth)
    )
}

# View the first 5 fitted models in the list
head(trees, 5)  # "trees" is stored as a list object
## [[1]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1904 200.697500 1.119748  
##    2) lvrat< 0.9479871 1825 171.800500 1.105205  
##      4) hirat< 0.3657 1771 151.252400 1.094297 *
##      5) hirat>=0.3657 54  13.425930 1.462963  
##       10) lvrat< 0.8812374 45  10.311110 1.355556 *
##       11) lvrat>=0.8812374 9   0.000000 2.000000 *
##    3) lvrat>=0.9479871 79  19.594940 1.455696  
##      6) hirat< 0.323 66  15.530300 1.378788 *
##      7) hirat>=0.323 13   1.692308 1.846154 *
## 
## [[2]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1904 200.697500 1.119748  
##    2) lvrat< 0.9479871 1825 171.800500 1.105205  
##      4) hirat< 0.3657 1771 151.252400 1.094297 *
##      5) hirat>=0.3657 54  13.425930 1.462963  
##       10) lvrat< 0.8812374 45  10.311110 1.355556 *
##       11) lvrat>=0.8812374 9   0.000000 2.000000 *
##    3) lvrat>=0.9479871 79  19.594940 1.455696  
##      6) hirat< 0.323 66  15.530300 1.378788 *
##      7) hirat>=0.323 13   1.692308 1.846154 *
## 
## [[3]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1904 200.697500 1.119748  
##    2) lvrat< 0.9479871 1825 171.800500 1.105205  
##      4) hirat< 0.3657 1771 151.252400 1.094297 *
##      5) hirat>=0.3657 54  13.425930 1.462963  
##       10) lvrat< 0.8812374 45  10.311110 1.355556 *
##       11) lvrat>=0.8812374 9   0.000000 2.000000 *
##    3) lvrat>=0.9479871 79  19.594940 1.455696  
##      6) hirat< 0.323 66  15.530300 1.378788 *
##      7) hirat>=0.323 13   1.692308 1.846154 *
## 
## [[4]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1904 200.697500 1.119748  
##    2) lvrat< 0.9479871 1825 171.800500 1.105205  
##      4) hirat< 0.3657 1771 151.252400 1.094297 *
##      5) hirat>=0.3657 54  13.425930 1.462963  
##       10) lvrat< 0.8812374 45  10.311110 1.355556 *
##       11) lvrat>=0.8812374 9   0.000000 2.000000 *
##    3) lvrat>=0.9479871 79  19.594940 1.455696  
##      6) hirat< 0.323 66  15.530300 1.378788 *
##      7) hirat>=0.323 13   1.692308 1.846154 *
## 
## [[5]]
## n= 1904 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 1904 200.697500 1.119748  
##    2) lvrat< 0.9479871 1825 171.800500 1.105205  
##      4) hirat< 0.3657 1771 151.252400 1.094297 *
##      5) hirat>=0.3657 54  13.425930 1.462963  
##       10) lvrat< 0.8812374 45  10.311110 1.355556 *
##       11) lvrat>=0.8812374 9   0.000000 2.000000 *
##    3) lvrat>=0.9479871 79  19.594940 1.455696  
##      6) hirat< 0.323 66  15.530300 1.378788 *
##      7) hirat>=0.323 13   1.692308 1.846154 *
# For each model in the list, extract its cp value and its minimum cross-validation error

# Define function to get the cp value (here, the smallest cp in the cptable, i.e., the most complex tree grown)
get_cp <- function(x) {
  min <- which.min(x$cptable[, "CP"]) # Select from the column named "CP" and return a row index
  cp <- x$cptable[min, "CP"]  # Using this index to subset the desired value
}

# Define function to get minimum xerror (cross-validation error)
get_min_error <- function(x) {
  min    <- which.min(x$cptable[, "xerror"]) # Select from the column named "xerror" and return a row index
  xerror <- x$cptable[min, "xerror"]  # Using this index to subset the desired value
}

# Select the 5 best-performing models, i.e., those with the smallest cross-validation error
hyper_grid <- hyper_grid %>%
  mutate(
    cp    = purrr::map_dbl(trees, get_cp),  # purrr::map_dbl() (from the tidyverse) applies a function to each element of a list and returns a numeric (double) vector; mutate() adds new columns while preserving existing ones
    error = purrr::map_dbl(trees, get_min_error)
    ) %>%
  arrange(error) %>%
  top_n(-5, wt = error)  # A negative n keeps the 5 rows with the smallest error (a positive n would keep the largest)
# The first row (model) has the lowest error, so on and so forth

hyper_grid
##   minsplit maxdepth   cp     error
## 1        8        8 0.01 0.9146128
## 2       18       12 0.01 0.9151306
## 3       16       13 0.01 0.9167266
## 4        7        8 0.01 0.9177339
## 5       15       11 0.01 0.9191623
# Get the estimated minsplit, maxdepth, and complexity parameter
# They are stored in the three columns of the first row 
min_split <- hyper_grid[1, 1]
max_depth <- hyper_grid[1, 2]
complexity_para <- hyper_grid[1, 3]

# Train the optimal_tree using relevant parameter values from the previous step
optimal_tree <- rpart(
    deny ~ .,
    data = training,
    # method = "anova" is omitted here: with a factor outcome, rpart fits a classification tree, which lets us obtain predicted classes
    control = list(minsplit = min_split, maxdepth = max_depth, cp = complexity_para)
    )

# Predict on testing data
pred <- predict(optimal_tree, newdata = testing, type = "class") # set type to "class"

# Create confusion matrix and generate accuracy score
confmat <- table(Predicted = pred, Actual = testing$deny)

# View the confusion table (note the many false negatives: actual denials predicted as 0)
confmat
##          Actual
## Predicted   0   1
##         0 412  52
##         1   7   5
# Calculate accuracy score
(confmat[1, 1] + confmat[2, 2])/ sum(confmat) * 100
## [1] 87.60504

Bagging: an alternative approach to reduce model variance

Using the bootstrap aggregating (bagging) approach to reduce model variance.

library(rpart)
library(ipred)       # bagging() function
library(caret)       # data splitting and cross-validation
library(AER)         # HMDA data

# Load and examine data
data(HMDA)
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
# Subset variables and convert the dependent variable to factor class
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]
data$deny <- as.factor(ifelse(data$deny == "yes", 1, 0))

set.seed(123)

# 80-20 train-test split
train_row_numbers <- createDataPartition(data$deny, p=0.8, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]


# Train bagged model

bagged.fit <- bagging(
  deny ~ .,
  data = training,
  coob = TRUE     # Use OOB sample to estimate test error
)

# Display model output
bagged.fit
## 
## Bagging classification trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = deny ~ ., data = training, coob = TRUE)
## 
## Out-of-bag estimate of misclassification error:  0.1444
# How many trees are enough to minimize model variance? Let's cross-validate the result up to 50 trees, beginning from 10

ntree <- 10:50

# Create an empty vector to store the OOB error estimate for each number of trees
rmse <- vector(mode = "numeric", length = length(ntree)) # Tell R the mode (class) of this vector and the length of the output we want

for (i in seq_along(ntree)) {
  
  # Perform bagged model
  model <- bagging(
  deny ~ .,
  data = training,
  coob = TRUE, # coob means to use the oob error rate
  nbagg = ntree[i]  # Set nbagg to n-th tree
)
  # Get OOB error
  rmse[i] <- model$err  # OOB error is stored in `err` inside your model output 
}

plot(ntree, rmse, type = 'l', lwd = 2)   # type = 'l' draws a line; "lwd" sets the line width
abline(v = 44, col = "red", lty = "dashed") # Mark the number of trees with the lowest OOB error
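# Rather than hard-coding v = 44, the tree count with the smallest OOB error can be looked up directly
# from the vectors built above (a quick sketch):
ntree[which.min(rmse)]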

## Estimate using caret
# Specify 10-fold cross validation
control <- trainControl(method = "cv",  number = 10) 

# CV bagged model
bagged_cv.fit <- train(
  deny ~ .,
  data = training,
  method = "treebag",  # Use "bagging"
  trControl = control,  # Insert the control function defined earlier
  importance = TRUE,
  metric = "Accuracy"
  )

# Display results
bagged_cv.fit
## Bagged CART 
## 
## 1904 samples
##    4 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1713, 1714, 1714, 1713, 1713, 1714, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8602948  0.1559178
# Plot most important variables (note: we only have 4 variables), rank them by importance
plot(varImp(bagged_cv.fit), 4) 

# Compare with the actual dependent variable of the testing data and calculate accuracy
pred <- predict(bagged_cv.fit, testing)

# Generate confusion table
confmat <- table(Predicted = pred, Actual = testing$deny)

confmat
##          Actual
## Predicted   0   1
##         0 402  50
##         1  17   7
# Calculate accuracy score
(confmat[1, 1] + confmat[2, 2])/ sum(confmat) * 100
## [1] 85.92437
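# The fold-by-fold results behind the accuracy reported by train() are kept inside the fitted object;
# a quick sketch of how to inspect them:
bagged_cv.fit$resample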

Random Forests

Build an ensemble of de-correlated trees, each grown on a bootstrap sample with a random subset of features, and aggregate their predictions

## install and load required packages
# install.packages(c("randomForest", "ranger", "ggplot2"))
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(rsample)     # data splitting 
library(dplyr)       # data wrangling
library(rpart)       # build tree models
library(rpart.plot)  # plot regression trees
library(ipred)       # bagging
library(caret)  
library(ranger)
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(ggplot2)
library(h2o)         # An efficient java-based platform
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
set.seed(123)  # For reproducibility 

library(AER)         # where HMDA data is attached

# Load and examine data
data(HMDA)
names(HMDA)
##  [1] "deny"      "pirat"     "hirat"     "lvrat"     "chist"     "mhist"    
##  [7] "phist"     "unemp"     "selfemp"   "insurance" "condomin"  "afam"     
## [13] "single"    "hschool"
# Subset variables; deny is already a factor ("no"/"yes"), so no conversion is needed
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]

# You could instead recode the dependent variable as a 0/1 factor this way:
# data$deny <- as.factor(ifelse(data$deny == "yes", 1, 0))

# Do an 80-20 train-test split
train_row_numbers <- createDataPartition(data$deny, p=0.8, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]


# Default RF model
rf.fit <- randomForest(
  deny ~ .,
  data = training,
  importance = TRUE, # Allow us to get feature importance statistics
  ntree = 500  # Number of trees, the default is 500
)

rf.fit
## 
## Call:
##  randomForest(formula = deny ~ ., data = training, importance = TRUE,      ntree = 500) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 12.29%
## Confusion matrix:
##       no yes class.error
## no  1636  40  0.02386635
## yes  194  34  0.85087719
# Plot the model
par(mar=c(2, 2, 2, 6))  # par() is used to set graphical parameters
# mar = c(bottom, left, top, right); we leave a large margin on the right for the legend
plot(rf.fit, main = "Random Forests model performance and the number of trees needed")
legend("right", colnames(rf.fit$err.rate), col=1:3, cex=0.9, fill=1:3)  # Place legend on the right, set and fill color by the column values inside rf.fit$err.rate, set size to 0.9 

## Understanding "err.rate"
head(rf.fit$err.rate)
##            OOB         no       yes
## [1,] 0.1940086 0.10638298 0.7888889
## [2,] 0.1933216 0.11122244 0.7785714
## [3,] 0.1874560 0.10305958 0.7796610
## [4,] 0.1780564 0.09103943 0.7850000
## [5,] 0.1758882 0.08788282 0.7906977
## [6,] 0.1751825 0.08776425 0.7954545
# Note: the "yes" and "no" columns are the class-specific OOB error rates, i.e., the share of "yes" (and of "no") observations misclassified by the forest so far.
# OOB (the first column in rf.fit$err.rate) is the overall OOB error rate of the forest built from the first i trees (row i). As the number of trees increases, the OOB error generally falls because the aggregated prediction improves.

Error rate (\(\textsf{err.rate}\)) is a non-monotonic function of the number of trees. The expected error rate (error rate = 1 − accuracy) as a function of \(T\) (the number of trees) is:

\[\begin{equation*} E(e_{i}(T)) = P\left(\sum^{T}_{t=1}e_{it} > 0.5\cdot{T}\right), \end{equation*}\]

where \(e_{it}\) is a binomial (0/1) random variable indicating whether tree \(t\) errs on observation \(i\), with \(E(e_{it}) = \epsilon\).

Each tree gives a classification on the leftover data (OOB), meaning that the tree “votes” for the class of these OOB data points (like the testing set). We can verify this with a bit of hand-calculation:

pred <- predict(rf.fit, testing)
confmat <- table(Predicted = pred, Actual = testing$deny)
accuracy <- sum(diag(confmat)) / sum(confmat) # Sum the diagonal elements and divide by the total of the confusion matrix
1 - accuracy
## [1] 0.1407563
avg_OOB_error <- sum(rf.fit$err.rate[,1]) / nrow(rf.fit$err.rate)
avg_OOB_error
## [1] 0.1264018
# The value of avg_OOB_error is indeed very close to 1 - accuracy


# The RF model chooses the class with the most votes across all trees in the forest. For a binary dependent variable each tree votes YES or NO, and the share of YES votes is the predicted probability (in the regression case, i.e., a continuous Y, the trees' predictions are averaged instead).
# As trees are added, the OOB error is recomputed at each step. The OOB error for an observation is NOT based on a single tree; it aggregates the votes of all trees for which that observation was left out of the bootstrap sample.
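# A quick sketch of the voting idea (assuming rf.fit and testing from above): type = "prob" returns, for
# each observation, the share of the 500 trees voting for each class, while the default type = "response"
# returns the majority-vote class.
head(predict(rf.fit, testing, type = "prob"))
head(predict(rf.fit, testing, type = "response"))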


# Find the number of trees with the lowest OOB error
# The err.rate is measured as 1 - accuracy

# Find the row index of the number of trees giving the lowest OOB error
which.min(rf.fit$err.rate[,1])
## [1] 322
# Square root of the OOB error rate at that optimal number of trees
sqrt(rf.fit$err.rate[which.min(rf.fit$err.rate[,1])])
## [1] 0.3475605

Note that for binary-outcome or classification models (i.e., the outcome is of factor class), the model errors are stored as \(\textsf{err.rate}\) inside the \(\textsf{randomForest}\) object. An example with a continuous outcome is illustrated below.

# Create ames data from AmesHousing
ames <- AmesHousing::make_ames()

# Do a 75-25 sample split
set.seed(123)
train_row_numbers <- createDataPartition(ames$Sale_Price, p=0.75, list=FALSE) # Sale_Price is a continuous variable
ames_training <- ames[train_row_numbers, ]
ames_testing <- ames[-train_row_numbers, ]

# Fit a random forest model on continuous outcome

# Default RF model
ames_rf.fit <- randomForest(
  formula = Sale_Price ~ .,
     data = ames_training
)

# Check out the elements inside ames_rf.fit object
names(ames_rf.fit)
##  [1] "call"            "type"            "predicted"       "mse"            
##  [5] "rsq"             "oob.times"       "importance"      "importanceSD"   
##  [9] "localImportance" "proximity"       "ntree"           "mtry"           
## [13] "forest"          "coefs"           "y"               "test"           
## [17] "inbag"           "terms"
# Number of trees with the lowest MSE
which.min(ames_rf.fit$mse)
## [1] 139
# Get the RMSE of the random forest model estimated at the optimal number of trees
sqrt(ames_rf.fit$mse[which.min(ames_rf.fit$mse)])
## [1] 25632.45
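# To see how the OOB MSE declines as trees are added, the fitted object can be plotted directly
# (a quick sketch):
plot(ames_rf.fit, main = "OOB MSE by number of trees (Ames data)")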

Hyperparameter tuning for random forests using \(\textsf{tuneRF()}\) function

# We will be using the HMDA training data we previously generated
features <- training[-1]  # Remove the dependent variable

set.seed(123)

rf.fit2 <- tuneRF(
  x = features,
  y = training$deny,
  ntreeTry = 500,  # Number of trees
  mtryStart = 3,    # Number of features (variables, note that we only have 4 variables, so mtryStart must be < 4)
  stepFactor = 1.5,  # At each iteration, mtry is inflated (or deflated) by this value
  improve = 0.01, # Each iteration must improve (reduce) the OOB error by this much in order to continue
  trace = FALSE    # Hide real-time progress, if you want it to be displayed, then set TRUE 
)
## 0.02904564 0.01 
## -0.07692308 0.01

# tuneRF inflates/deflates mtry by stepFactor at each iteration and keeps searching only while the OOB error improves by at least "improve"; the negative value above indicates where the search stopped.

## Comparing the run time of randomForest & ranger
## Use system.time() to record run time
system.time(
  ames_randomForest <- randomForest(
    deny ~ ., 
    data = training, 
    ntree = 500,
    mtry = floor(length(features) / 3)
  )
)
##    user  system elapsed 
##    0.47    0.11    0.59
system.time(
  ames_ranger <- ranger(
    deny ~ ., 
    data = training, 
    num.trees = 500,
    mtry = floor(length(features) / 3)
  )
)
##    user  system elapsed 
##    0.44    0.05    0.27
# ranger is a bit faster! (Rangers lead the way!)
# Set up hyperparameter tuning grid

hyper_grid_2 <- expand.grid(
  mtry = seq(1, 4, by = 1),
  node_size = seq(3, 9, by = 2),
  sampe_size = c(.5, .6, .75, .80),
  OOB_RMSE = 0  # We will fill in this vector later, for now, we just pad it with 0
)

# Check out the number of unique combination of hyperparameters
nrow(hyper_grid_2)
## [1] 64
# Tuning over hyper_grid_2

for(i in 1:nrow(hyper_grid_2)) {
  
  # Train model
  model <- ranger(
    deny ~ ., 
    data = training, 
    num.trees = 100,
    mtry = hyper_grid_2$mtry[i],
    min.node.size = hyper_grid_2$node_size[i],
    sample.fraction = hyper_grid_2$sampe_size[i],
    seed = 123
  )
  
  # Add the square root of OOB error to grid. Note how we append the list of values to the vector and index them (fill in and replace the 0 inside the OOB_RMSE column)
  hyper_grid_2$OOB_RMSE[i] <- sqrt(model$prediction.error)
}

# Display the 10 best-performing models
hyper_grid_2 %>% 
  dplyr::arrange(OOB_RMSE) %>%
  head(10)
##    mtry node_size sampe_size  OOB_RMSE
## 1     1         9       0.60 0.3368165
## 2     1         7       0.80 0.3375953
## 3     2         7       0.75 0.3383723
## 4     1         9       0.80 0.3391475
## 5     1         3       0.50 0.3399209
## 6     1         7       0.50 0.3399209
## 7     1         9       0.50 0.3399209
## 8     3         9       0.50 0.3399209
## 9     1         3       0.60 0.3399209
## 10    1         5       0.60 0.3399209
# Retrieve optimal set of hyperparameters: the top 1 
best_para <- hyper_grid_2 %>% 
  dplyr::arrange(OOB_RMSE) %>%
  head(1)

m <- best_para$mtry
node_size <- best_para$node_size
p <- best_para$sampe_size

# Fit the best performing model
# Generate an empty vector of length 100
OOB_RMSE <- vector(mode = "numeric", length = 100)

# Refit the best-performing model 100 times to see how the OOB RMSE varies across bootstrap draws
for(i in seq_along(OOB_RMSE)) {
  
  optimal_ranger <- ranger(
    deny ~ ., 
    data = training, 
    num.trees = 100,
    mtry = m, # Number of features
    min.node.size = node_size, # Node size
    sample.fraction = p, # Fraction of sample used in each tree
    importance = 'impurity'
  )
  
  OOB_RMSE[i] <- sqrt(optimal_ranger$prediction.error) # Get RMSE for each OOB estimate
}

# Display the distribution of the OOB RMSE across the 100 refits

hist(OOB_RMSE, breaks = 10)   # breaks sets the (approximate) number of bins

## Visualizing variable importance in descending order 
# We first arrange the output into a data.frame object

df <- data.frame(vname = names(data[-1]), importance = optimal_ranger$variable.importance)  # data[-1] removes the dependent variable

ggplot(df, aes(x = reorder(vname, -importance), y = importance, fill = vname, label = round(importance, digits = 2))) +  # fill colors the bars by variable name; label supplies the text printed on each bar
  geom_bar(stat = "identity") +
  ggtitle("Most important variables (in the order of 1 to 4)") +
  xlab("Variable") + 
  ylab("Importance") +
  geom_text(size = 3, position = position_stack(vjust = 0.5), col = "white")  # vjust = 0.5 means placing text in the middle of the stack
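# An out-of-sample check for the tuned ranger model (a brief sketch; predict() for ranger objects stores
# the predicted classes in the $predictions element):
ranger_pred <- predict(optimal_ranger, data = testing)$predictions
table(Predicted = ranger_pred, Actual = testing$deny)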

## A built-in advantage of the out-of-bag (OOB) error estimate: it often works like a charm even without resorting to hyperparameter tuning. We will illustrate this using the AmesHousing data we loaded earlier.

# Create training and validation data, again using 75-25 split
set.seed(123)
valid_split <- initial_split(ames, .75)

# Training data: we will do this a bit differently here
ames_train_v2 <- analysis(valid_split) # The analysis() function automatically generates the training data by the proportion you set earlier (.75)


ames_valid <- assessment(valid_split) # The assessment() function generates the testing data from the remaining 1 - .75 sample

# Get variables
x_test <- ames_valid[setdiff(names(ames_valid), "Sale_Price")]   # Keep every column except "Sale_Price" (this creates the set of x variables)

y_test <- ames_valid$Sale_Price # Assign Sale_Price to y_test (the DV)

# Note that supplying xtest and ytest makes randomForest() evaluate the trained model on the test data, while still computing the OOB error from the training data
rf_oob_comp <- randomForest(
  formula = Sale_Price ~ .,
  data    = ames_train_v2,
  xtest   = x_test,
  ytest   = y_test
)

# Extract OOB & validation errors (as RMSE)
oob <- sqrt(rf_oob_comp$mse)  # OOB RMSE for forests of 1 to 500 trees
validation <- sqrt(rf_oob_comp$test$mse)  # RMSE estimated on the test set

# Compare error rates
# We first arrange the output into tibble format
# Take a look at the cross-section of this beast
error_rates <- tibble::tibble(
  `Out of Bag Error` = oob,
  `Test error` = validation,
  ntrees = 1:rf_oob_comp$ntree
) %>% 
  gather(Metric, RMSE, -ntrees)
# gather() collapses the two error columns into key-value pairs: the column names go into "Metric" and their values into "RMSE", while ntrees is kept as an identifier

error_rates
## # A tibble: 1,000 × 3
##    ntrees Metric             RMSE
##     <int> <chr>             <dbl>
##  1      1 Out of Bag Error 42610.
##  2      2 Out of Bag Error 40286.
##  3      3 Out of Bag Error 39016.
##  4      4 Out of Bag Error 35857.
##  5      5 Out of Bag Error 35165.
##  6      6 Out of Bag Error 33045.
##  7      7 Out of Bag Error 31940.
##  8      8 Out of Bag Error 31038.
##  9      9 Out of Bag Error 30742.
## 10     10 Out of Bag Error 30131.
## # ℹ 990 more rows
# Now rearrange this tibble object into a plot
error_rates %>% 
  ggplot(aes(ntrees, RMSE, color = Metric)) + # ntrees on the x-axis, RMSE on the y-axis, colored by Metric type
  geom_line() +
  scale_y_continuous(labels = scales::dollar) +  # Use dollar labels because Sale_Price is measured in dollars
  xlab("Number of trees")  # x-axis label

# OOB and test errors are pretty close!

Gradient Boosting Machine (GBM)

Boosting and Gradient Descent

# load required packages
# install.packages(c("gbm", "xgboost", "pdp", "ggplot2"))
library(rsample)      # data splitting 
library(gbm)          # basic implementation
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
library(xgboost)      # a faster implementation of gbm
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
library(caret)        # an aggregator package for performing many machine learning models
library(pdp)          # model visualization: partial plot
## Warning: package 'pdp' was built under R version 4.4.3
library(purrr)        # model visualization
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:pdp':
## 
##     partial
## The following object is masked from 'package:car':
## 
##     some
## The following object is masked from 'package:caret':
## 
##     lift
library(ggplot2)      # model visualization


# Load the data and do train-test split

CBI <- read.csv("https://www.dropbox.com/s/426fnyypna3h0bo/CBI.csv?dl=1")
names(CBI)[23] <- "cbi"
df <- CBI[complete.cases(CBI), c("polity", "cbi", "pol_polarization", "gdppc", "nkaopen", "cpi")]

set.seed(123)
train_row_numbers <- createDataPartition(df$polity, p=0.75, list=FALSE)
training <- df[train_row_numbers, ]
testing <- df[-train_row_numbers, ]

# Fit a GBM with 10,000 trees, an interaction depth of 1 (stumps), a learning rate of 0.001, and 5-fold CV

gbm.fit <- gbm(
  cbi ~ .,
  distribution = "gaussian",
  data = training,
  n.trees = 10000,  # num. of trees
  interaction.depth = 1,  # tree depth
  shrinkage = 0.001,   # learning rate
  cv.folds = 5,     # n-fold cross-validation
  n.cores = NULL, # Will use all CPUs by default
  verbose = FALSE  # Do not print the fitting log
)  


# Get MSE and compute RMSE
min_MSE <- which.min(gbm.fit$cv.error)  # Get the index of the iteration with the minimum CV error
# Calculate RMSE
sqrt(gbm.fit$cv.error[min_MSE])
## [1] 0.1839064
# Plot loss function as a function of n trees added to the ensemble
par("mar")
## [1] 5.1 4.1 4.1 2.1
par(mar=c(1,1,1,1))
gbm.perf(gbm.fit, method = "cv")

## [1] 10000
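# A minimal sketch of out-of-sample prediction with the CV-chosen number of trees (best_iter is just an
# illustrative name; gbm.fit and testing come from above):
best_iter <- gbm.perf(gbm.fit, method = "cv", plot.it = FALSE)
pred_cbi <- predict(gbm.fit, newdata = testing, n.trees = best_iter)
sqrt(mean((testing$cbi - pred_cbi)^2))   # test-set RMSE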
## You can also do the same using the train() function from the caret package

set.seed(123)

# First define the control parameters
control <- trainControl(## 10-fold CV
  method = "repeatedcv",
  number = 10,
  ## Repeated ten times
  repeats = 10)

gbm_caret.fit1 <- train(cbi ~ ., data = training, 
                        method = "gbm", 
                        trControl = control,
                        verbose = FALSE)


## Tune over four hyperparameters: learning rate (shrinkage), tree depth (interaction.depth), minimum number of observations per terminal node, and the fraction of the sample used for each subsampling step

hyper_grid1 <- expand.grid(
  shrinkage = c(.01, .1, .3),
  interaction.depth = c(1, 3, 5),
  n.minobsinnode = c(5, 10, 15),
  bag.fraction = c(.65, .8, 1), 
  optimal_trees = 0,  # Temporarily fill in 0, we will fill each row with tuned values later             
  min_RMSE = 0        # Temporarily fill in 0, we will fill each row with tuned values later   
)

# How many unique combinations do we have?
nrow(hyper_grid1)
## [1] 81
# Re-sample data to get new training set
random_index <- sample(1:nrow(df), nrow(training))
random_training <- df[random_index, ]

# Grid search 
for(i in 1:nrow(hyper_grid1)) {
  
  # Train model
  gbm.tune <- gbm(
    cbi ~ .,
    distribution = "gaussian",
    data = random_training,
    n.trees = 5000, # number of trees
    interaction.depth = hyper_grid1$interaction.depth[i], # tree depth
    shrinkage = hyper_grid1$shrinkage[i], # learning rate (the rate at which the gradient descends)
    n.minobsinnode = hyper_grid1$n.minobsinnode[i], # Number of observation(s) in each terminal node
    bag.fraction = hyper_grid1$bag.fraction[i], # Fraction of sample for sub-sampling
    train.fraction = .75,
    n.cores = NULL, # Will use all CPUs by default
    verbose = FALSE  # Do not print out progress
  )
  
  # Locate the iteration with the minimum validation error and add it to the grid
  hyper_grid1$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid1$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}

# Display the 10 best-performing models, sorted by RMSE in ascending order

hyper_grid1 %>% 
  dplyr::arrange(min_RMSE) %>%
  head(10)
##    shrinkage interaction.depth n.minobsinnode bag.fraction optimal_trees
## 1       0.30                 3              5         0.80            48
## 2       0.10                 5              5         0.80            52
## 3       0.10                 5              5         1.00            94
## 4       0.10                 3             10         1.00            82
## 5       0.01                 5              5         0.80           490
## 6       0.01                 5             10         0.80           616
## 7       0.10                 5              5         0.65            43
## 8       0.01                 5              5         0.65           591
## 9       0.01                 5             10         0.65           883
## 10      0.30                 5              5         0.80            26
##     min_RMSE
## 1  0.1335817
## 2  0.1337356
## 3  0.1346350
## 4  0.1348126
## 5  0.1349345
## 6  0.1355182
## 7  0.1356389
## 8  0.1359518
## 9  0.1362342
## 10 0.1363685
## Repeat the search over a more refined hyperparameter space

hyper_grid2 <- expand.grid(
  shrinkage = c(0.1, 0.3),
  interaction.depth = c(3, 5),
  n.minobsinnode = c(10, 15),
  bag.fraction = c(0.8, 1), 
  optimal_trees = 0,               
  min_RMSE = 0                     
)

# Total number of combinations
nrow(hyper_grid2)
## [1] 16
# Grid search 
for(i in 1:nrow(hyper_grid2)) {
  
  # Set seed for reproducibility
  set.seed(123)
  
  # Train model
  gbm.tune <- gbm(
    cbi ~ .,
    data = random_training,
    distribution = "gaussian",
    cv.folds = 5,
    n.trees = 5000,
    interaction.depth = hyper_grid2$interaction.depth[i],
    shrinkage = hyper_grid2$shrinkage[i],
    n.minobsinnode = hyper_grid2$n.minobsinnode[i],
    bag.fraction = hyper_grid2$bag.fraction[i],
    train.fraction = .75,
    n.cores = NULL, # will use all cores by default
    verbose = FALSE
  )
  
  # Locate the iteration with the minimum validation error and add it to the grid
  hyper_grid2$optimal_trees[i] <- which.min(gbm.tune$valid.error)
  hyper_grid2$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}

# Display top 10 performing models
hyper_grid2 %>% 
  dplyr::arrange(min_RMSE) %>%
  head(10)
##    shrinkage interaction.depth n.minobsinnode bag.fraction optimal_trees
## 1        0.1                 3             10          1.0            82
## 2        0.1                 5             10          0.8            37
## 3        0.1                 5             15          0.8            84
## 4        0.3                 3             15          0.8            22
## 5        0.3                 3             10          1.0            29
## 6        0.1                 5             10          1.0           102
## 7        0.3                 5             10          0.8            11
## 8        0.1                 3             10          0.8            72
## 9        0.3                 3             10          0.8            14
## 10       0.1                 3             15          0.8            92
##     min_RMSE
## 1  0.1348126
## 2  0.1359682
## 3  0.1375543
## 4  0.1384468
## 5  0.1386144
## 6  0.1389080
## 7  0.1394285
## 8  0.1394570
## 9  0.1396065
## 10 0.1404738
# Get the list of best-performing hyperparameters
optimal_hyperparameters <- hyper_grid2 %>% 
  dplyr::arrange(min_RMSE) %>%
  head(1) # the best one

# Define the list of optimal hyperparameters
shrink <- optimal_hyperparameters$shrinkage
ntrees <- optimal_hyperparameters$optimal_trees
minobs <- optimal_hyperparameters$n.minobsinnode
bagfrac <- optimal_hyperparameters$bag.fraction
depth <- optimal_hyperparameters$interaction.depth

# Train final GBM model
gbm.fit.final <- gbm(
  cbi ~ .,
  distribution = "gaussian",
  data = training,
  n.trees = ntrees,
  interaction.depth = depth,
  shrinkage = shrink,
  n.minobsinnode = minobs,
  bag.fraction = bagfrac, 
  train.fraction = 0.63,
  n.cores = NULL, # will use all cores by default
  verbose = FALSE
)  

# Examine the result
gbm.fit.final
## gbm(formula = cbi ~ ., distribution = "gaussian", data = training, 
##     n.trees = ntrees, interaction.depth = depth, n.minobsinnode = minobs, 
##     shrinkage = shrink, bag.fraction = bagfrac, train.fraction = 0.63, 
##     verbose = FALSE, n.cores = NULL)
## A gradient boosted model with gaussian loss function.
## 82 iterations were performed.
## The best test-set iteration was 78.
## There were 5 predictors of which 5 had non-zero influence.
# Visualize variable importance

par(mar = c(5, 8, 1, 1)) 

summary(
  gbm.fit.final, 
  cBars = 5,  # Since we have 5 variables to display
  method = relative.influence, # You can also use permutation.test.gbm
  las = 2
)

##                               var    rel.inf
## cpi                           cpi 51.5136786
## gdppc                       gdppc 20.9317551
## polity                     polity 13.7666878
## nkaopen                   nkaopen 13.2030997
## pol_polarization pol_polarization  0.5847788
# The "las" argument inside summary() sets the orientation of the axis tick labels: parallel to the axis (0, the default), always horizontal (1), perpendicular to the axis (2), or always vertical (3).


# An even simpler way is to do this 
summary(gbm.fit.final)

##                               var    rel.inf
## cpi                           cpi 51.5136786
## gdppc                       gdppc 20.9317551
## polity                     polity 13.7666878
## nkaopen                   nkaopen 13.2030997
## pol_polarization pol_polarization  0.5847788
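# The marginal effect of a single predictor can be visualized with gbm's built-in partial-dependence plot
# (a quick sketch; cpi is chosen only as an illustration, and pdp::partial(), loaded above, offers a
# ggplot2-friendly alternative):
plot(gbm.fit.final, i.var = "cpi", n.trees = gbm.fit.final$n.trees)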

Supplementary materials

GBM on data of binary outcomes

library(rsample)      # data splitting 
library(gbm)          # basic implementation
library(xgboost)
library(AER)

# Load and subset data
data(HMDA)
HMDA$deny <- ifelse(HMDA$deny == "yes", 1, 0)
data <- HMDA[, c("deny", "hirat", "lvrat", "mhist", "unemp")]

# Do a 75-25 train-test split
set.seed(123)
train_row_numbers <- createDataPartition(data$deny, p=0.75, list=FALSE)
training <- data[train_row_numbers, ]
testing <- data[-train_row_numbers, ]


# Fit a GBM with a Bernoulli distribution (for a binary outcome)
gbm.logit.fit <- gbm(deny ~ .,
                     data = training,
                     interaction.depth = 2,
                     distribution = "bernoulli",  # for binary outcome
                     cv.folds = 5,
                     shrinkage = .1,
                     n.trees = 50)

# Examine output and model performance
summary(gbm.logit.fit)

##         var   rel.inf
## lvrat lvrat 49.306624
## hirat hirat 39.715339
## mhist mhist  7.950341
## unemp unemp  3.027695
gbm.perf(gbm.logit.fit, method = "cv")

## [1] 38
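# A minimal sketch of test-set prediction for the Bernoulli GBM (assuming gbm.logit.fit and testing from
# above; type = "response" returns the predicted probability of denial):
best_iter <- gbm.perf(gbm.logit.fit, method = "cv", plot.it = FALSE)
p_hat <- predict(gbm.logit.fit, newdata = testing, n.trees = best_iter, type = "response")
table(Predicted = ifelse(p_hat > 0.5, 1, 0), Actual = testing$deny)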