This markdown demonstrates the difference between bagged trees and random forests.
Reminder:
Bagged trees build \(B\) different decision trees on bootstrap samples of the training data and aggregate (average) their predictions (hence bagged: Bootstrap AGGregated).
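In symbols, if \(\hat{f}^{*b}(x)\) denotes the prediction from the tree fit to the \(b\)-th bootstrap sample, the bagged prediction for a regression problem is the average
\[
\hat{f}_{\text{bag}}(x) = \frac{1}{B}\sum_{b=1}^{B} \hat{f}^{*b}(x)
\]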
We’ll be using a generated data set so we know which features are useful and which are not.
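The code in this section assumes the tidyverse and rpart packages are loaded, e.g.:
library(tidyverse)  # dplyr, tidyr, tibble, and ggplot2 for wrangling and plotting
library(rpart)      # recursive partitioning (decision trees)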
RNGversion('4.1.0'); set.seed(2870)
N <- 510
ex_data <-
  data.frame(
    # These will be the useful predictors
    x1 = round(rnorm(N, 20, 3)),
    x2 = round(rnorm(N, 40, 5)),
    x3 = round(runif(N, 10, 20)),
    x4 = rbinom(N, 1, 0.75)
  )
# We'll use a loop to add 6 useless columns:
useless_x <- 6
for (i in (ncol(ex_data) + 1):(ncol(ex_data) + useless_x)){
  ex_data[, paste0('x', i)] <- round(rnorm(N, mean = runif(1, 20, 40), sd = runif(1, 3, 6)))
}
# Creating the response variable as a combination of the first four (useful) predictors
ex_data <-
  ex_data |>
  mutate(
    .before = 1,
    # Structure of the relationship
    y = -30 + 3*x1 + 1*x2 + 5*x3 + 30*x4,
    # Adding random noise
    y = round(y + rnorm(N, 0, sd = 20))
  )
as_tibble(ex_data)
## # A tibble: 510 × 11
## y x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 174 19 42 13 1 22 35 33 26 43 32
## 2 151 18 35 12 1 19 38 39 25 40 33
## 3 156 18 47 14 1 16 41 38 22 34 26
## 4 145 16 39 12 1 26 37 31 31 40 31
## 5 119 19 40 10 1 17 31 39 28 37 27
## 6 126 18 32 18 0 24 40 36 14 38 26
## 7 164 24 36 16 0 23 39 41 10 39 32
## 8 163 23 30 13 1 21 41 31 28 31 33
## 9 194 18 42 16 1 23 38 33 16 41 28
## 10 143 15 44 19 0 19 35 38 13 34 32
## # ℹ 500 more rows
We’ll have the first 10 rows be the observations we want to predict and the remaining 500 rows be the training data used to build the trees.
pred_size <- 10
test_ex <- ex_data[1:pred_size, ]
train_ex <- ex_data[-(1:pred_size), ]
Let’s make a collection of 300 bootstrap trees and examine the variance of the 300 predictions for each test case.
We start by creating the 300 trees:
RNGversion('4.1.0'); set.seed(2870)
B <- 300 # B for bags
# Vector of lists to store the results
tree_vec <- vector(mode = 'list', B)
# For loop to find our B trees:
for (i in 1:B){
  tree_vec[[i]] <-
    rpart(
      formula = y ~ .,
      data = train_ex |> slice_sample(prop = 1, replace = T),  # bootstrap sample
      method = 'anova',
      minsplit = 2,   # allow splits on nodes with as few as 2 rows
      minbucket = 1,  # allow terminal nodes with a single row
      cp = 0          # no complexity penalty: grow the tree fully
    )
}
Now we’ll make a prediction for the 10 test cases for each tree
# Making a data frame to save the results
test_pred_tree <-
  data.frame(
    matrix(-1, nrow = B, ncol = pred_size)
  )
# Changing column names
colnames(test_pred_tree) <- paste0('y', 0:(pred_size - 1))
# Looping through each tree and predicting the 10 test cases
for (i in 1:B){
  test_pred_tree[i, ] <-
    predict(
      object = tree_vec[[i]],
      newdata = test_ex
    )
}
as_tibble(test_pred_tree)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 202 147 203 131 142 154 146 186 140 200
## 2 178 147 202 163 179 144 170 178 211 134
## 3 133 174 203 185 135 154 146 130 204 200
## 4 224 143 177 161 144 200 146 157 184 155
## 5 166 120 129 131 170 189 110 170 140 158
## 6 144 135 203 153 135 158 184 157 157 138
## 7 170 140 203 140 165 174 170 150 140 158
## 8 134 185 167 174 144 143 150 130 170 200
## 9 224 144 190 135 135 175 176 184 199 181
## 10 166 147 129 135 159 174 170 154 164 142
## # ℹ 290 more rows
Now let’s look at the standard deviation of the 300 predictions for each of the 10 test cases:
test_pred_tree |>
  pivot_longer(
    cols = everything(),
    values_to = 'y_hat',
    names_to = 'obs'
  ) |>
  summarize(
    .by = obs,
    sd_yhat = sd(y_hat)
  )
## # A tibble: 10 × 2
## obs sd_yhat
## <chr> <dbl>
## 1 y0 24.5
## 2 y1 20.9
## 3 y2 22.4
## 4 y3 19.1
## 5 y4 22.7
## 6 y5 21.4
## 7 y6 26.4
## 8 y7 22.7
## 9 y8 25.1
## 10 y9 20.8
And to visualize the spread of those predictions:
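One way to draw this (a minimal sketch, assuming ggplot2 has been loaded via the tidyverse):
# Boxplots of the 300 single-tree predictions for each test case
test_pred_tree |>
  pivot_longer(
    cols = everything(),
    values_to = 'y_hat',
    names_to = 'obs'
  ) |>
  ggplot(
    mapping = aes(x = obs, y = y_hat)
  ) +
  geom_boxplot() +
  labs(
    x = 'Test observation',
    y = 'Predicted y'
  )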
Now let’s try making a “bag” of 100 trees and averaging their predictions. Let’s do that a total of 300 times (just as we had 300 individual trees, we’ll simulate the bagging process 300 times).
To simplify the process, let’s first write a function that predicts the response using a set of bagged trees:
bagging_trees <- function(train, test, bags){
  # Data frame to save the predictions:
  test_pred_df <-
    data.frame(
      matrix(-1, nrow = bags, ncol = nrow(test))
    )
  # Changing column names (one column per test row)
  colnames(test_pred_df) <- paste0('y', 0:(nrow(test) - 1))
  # Loop to find the predictions using bagged trees
  for (i in 1:bags){
    # Tree using a bootstrap sample of the training data
    bagged_tree <-
      rpart(
        formula = y ~ .,
        data = train |> slice_sample(prop = 1, replace = T),
        method = 'anova',
        minsplit = 2,   # allow splits on nodes with as few as 2 rows
        minbucket = 1,  # allow terminal nodes with a single row
        cp = 0,         # no complexity penalty: grow the tree fully
        xval = 0        # no pruning, so skip cross-validation
      )
    # Saving the predictions for the test set in the data frame
    test_pred_df[i, ] <-
      predict(
        object = bagged_tree,
        newdata = test
      )
  }
  # Average the bags predictions for each test row
  return(colMeans(test_pred_df))
}
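A single call such as bagging_trees(train = train_ex, test = test_ex, bags = 100) returns one vector of 10 averaged predictions (one per test row); below we repeat that call 300 times so we can study the spread of the bagged predictions.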
Now let’s use the function to create a prediction from 100 bagged trees a total of 300 times
B <- 300
# Data frame to save the results of each set of bagged trees
bagged_preds <-
  data.frame(
    matrix(-1, nrow = B, ncol = nrow(test_ex))
  )
# Changing column names
colnames(bagged_preds) <- paste0('y', 0:(pred_size - 1))
# Conducting the simulation
for (i in 1:B){
  bagged_preds[i, ] <-
    bagging_trees(train = train_ex, test = test_ex, bags = 100)
}
as_tibble(bagged_preds)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 166. 144. 175. 152. 163. 161. 158. 155. 174. 160.
## 2 163. 146. 170. 150. 156. 160. 158. 155. 173. 160.
## 3 164. 144. 172. 150. 158. 162. 157. 156. 171. 159.
## 4 166. 145. 167 149. 158. 160. 159. 162. 176 159.
## 5 163. 144. 174. 153. 156. 159. 160. 159. 173. 156.
## 6 163. 143. 174. 148. 159. 161. 157. 159. 170. 158.
## 7 160. 146. 170. 148. 154. 158. 159. 155. 170. 161.
## 8 164. 146. 172. 149. 162. 156. 156. 158. 169. 161.
## 9 165. 144. 171. 151. 159. 157. 160. 160. 167. 162.
## 10 165. 142. 171. 149. 164. 163. 160. 156. 172. 160.
## # ℹ 290 more rows
Let’s compare the variance of the 300 individual trees vs the 300 sets of bagged trees:
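A minimal sketch of one way to make the comparison, again assuming ggplot2 is loaded:
# Stacking the single-tree and bagged predictions, then comparing their spread
bind_rows(
  'single tree' = pivot_longer(test_pred_tree, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  'bagged trees' = pivot_longer(bagged_preds, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  .id = 'method'
) |>
  ggplot(
    mapping = aes(x = obs, y = y_hat, fill = method)
  ) +
  geom_boxplot() +
  labs(
    x = 'Test observation',
    y = 'Predicted y',
    fill = 'Method'
  )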
The spread decreased dramatically! And all it took was doing 100 times the work!
But can we do better?
While the trees are fit to different bootstrap data sets, they are likely using the same predictors to form the splits at each opportunity:
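One way to check this (a minimal sketch using the 300 single trees stored earlier in tree_vec; the root split of an rpart fit is the first row of its frame component):
# Tabulating which predictor forms the root (first) split of each stored tree
root_split <- sapply(tree_vec, function(tree) as.character(tree$frame$var[1]))
table(root_split)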
For each of the 100 bagged trees, x3 and x4 were always the first two splits: one formed the root split and the other formed the second split. This causes the predictions made by the trees to be very similar (highly correlated) with one another, which won’t reduce the variance by as much as we’d like.
So what can we do? How can we force the bagged trees to be more dissimilar to one another?
By limiting the variables they can use to form the splits!
Random forests are just like bagged trees with one exception: each tree is only allowed to use a random subset of the predictors to form its splits. (In the standard algorithm a fresh random subset is drawn at every split; to keep our code simple, we’ll draw one subset per tree.)
Just like bootstrapping picks random rows to build each tree, a random forest also picks random columns to use to form the splits!
This forces the trees to be more distinct from one another, which helps the averaging reduce the variance even further!
Why? The math is a little beyond the scope of the course, but trust that it works!
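For the curious, the key fact is this: if the \(B\) trees produce identically distributed predictions, each with variance \(\sigma^2\) and pairwise correlation \(\rho\), then the variance of their average is
\[
\operatorname{Var}\left(\frac{1}{B}\sum_{b=1}^{B} \hat{f}^{*b}(x)\right) = \rho\,\sigma^2 + \frac{1-\rho}{B}\,\sigma^2
\]
Increasing \(B\) only shrinks the second term, so the correlation between the trees is what limits bagging; randomizing the predictors lowers \(\rho\) and attacks the first term. (For reference, the randomForest package defaults to considering roughly \(p/3\) of the predictors for regression and \(\sqrt{p}\) for classification.)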
The code below will create a function to form the random trees:
random_trees <- function(train, test, bags = 100, npred = 3){
  # Data frame to save the predictions:
  test_pred_df <-
    data.frame(
      matrix(-1, nrow = bags, ncol = nrow(test))
    )
  # Changing column names (one column per test row)
  colnames(test_pred_df) <- paste0('y', 0:(nrow(test) - 1))
  # Loop to find the predictions using the random-predictor trees
  for (i in 1:bags){
    # Picking the column numbers of the npred randomly chosen predictors
    # (column 1 is y, so the predictors live in columns 2 through ncol(train))
    pred_loc <- sample(2:ncol(train), size = npred)
    # Tree using a bootstrap sample and only the sampled predictor columns
    bagged_tree <-
      rpart(
        formula = y ~ .,
        data = train |>
          slice_sample(prop = 1, replace = T) |>
          dplyr::select(1, all_of(pred_loc)),
        method = 'anova',
        minsplit = 2,   # allow splits on nodes with as few as 2 rows
        minbucket = 1,  # allow terminal nodes with a single row
        cp = 0,         # no complexity penalty: grow the tree fully
        xval = 0        # no pruning, so skip cross-validation
      )
    # Saving the predictions for the test set in the data frame
    test_pred_df[i, ] <-
      predict(
        object = bagged_tree,
        newdata = test
      )
  }
  # Average the trees' predictions for each test row
  return(colMeans(test_pred_df))
}
Again, let’s simulate fitting 300 random forests by only allowing each tree to have 5 predictors, chosen at random:
RNGversion('4.1.0'); set.seed(2870)
B <- 300
# Data frame to save the results of each set of random trees
rf_preds <-
  data.frame(
    matrix(-1, nrow = B, ncol = nrow(test_ex))
  )
# Changing column names
colnames(rf_preds) <- paste0('y', 0:(pred_size - 1))
# Conducting the simulation
for (i in 1:B){
  rf_preds[i, ] <-
    random_trees(train = train_ex, test = test_ex, bags = 100, npred = 5)
}
as_tibble(rf_preds)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 164. 149. 161. 159. 160. 157. 164. 162. 174. 156.
## 2 167. 152. 163. 155. 164. 157 169. 162. 169. 154.
## 3 168. 158. 167. 158. 162. 160. 167. 160. 170. 155.
## 4 166. 154. 161. 158. 165. 159. 161. 160. 169. 153
## 5 167. 149. 161. 159. 166. 160. 162 164. 171. 158.
## 6 171. 151. 167. 155. 166. 167. 163. 164. 169. 161.
## 7 161. 161. 163. 157. 162. 163. 168. 155. 170. 158.
## 8 163. 157. 165. 157. 163. 163. 164. 158. 172. 163.
## 9 169. 147. 163. 160. 162. 159. 168. 161. 168. 151.
## 10 161. 152. 161. 153. 163. 167. 166. 162. 169. 161.
## # ℹ 290 more rows
Let’s compare the predictions for single trees, bagged trees, and random forests!
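A minimal sketch of one way to draw the comparison, assuming ggplot2 is loaded:
# Standard deviation of the 300 predictions for each test case, by method
bind_rows(
  'single tree' = pivot_longer(test_pred_tree, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  'bagged trees' = pivot_longer(bagged_preds, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  'random forest' = pivot_longer(rf_preds, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  .id = 'method'
) |>
  summarize(
    .by = c(method, obs),
    sd_yhat = sd(y_hat)
  ) |>
  ggplot(
    mapping = aes(x = obs, y = sd_yhat, color = method, group = method)
  ) +
  geom_point() +
  geom_line() +
  labs(
    x = 'Test observation',
    y = 'SD of the predictions',
    color = 'Method'
  )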
The purpose of the random selection of predictors is to make the predictions of the \(B\) different trees less correlated. We make them less correlated by making worse individual trees!
But when the individual twigs are put together, they become a mighty predictor!