This markdown will demonstrate the difference of bagged trees vs random forest.

Reminder:

Bagged trees build \(B\) different decision trees using bootstrap samples and aggregate the predictions (hence bagged - Bootstrap AGGregated)

We’ll be using a generated data set so we know which variables are useful and which are not.

RNGversion('4.1.0'); set.seed(2870)

pred_size <- 10
N  <-  500 + pred_size

# Creating the mean vector of the predictors
mu_vec <- round(runif(10, 100, 250))

## Creating the covariance matrix
# Starting with the correlation matrix
cor_mat <- 
  matrix(runif(100, -1, 1), ncol = 10) |> 
  crossprod() |> 
  cov2cor() 

# Vector of standard deviations:
sd_vec <- round(runif(10, 20, 40))

# Converting correlation matrix to covariance matrix
cov_mat <- diag(sd_vec) %*% cor_mat %*% diag(sd_vec)

### Making the data set:
ex_data <- 
  MASS::mvrnorm(n = N, mu = mu_vec, Sigma = cov_mat) |> 
  data.frame() |> 
  janitor::clean_names() |> 
  # Creating the response variable that only depends on x1 - x4:
  mutate(
    .before = 1,
    # Structur of relationship
    y = 100 + 3*x1 + 1*x2 + 5*x3 + 3*x4,
    # Adding random noise
    y = round(y + rnorm(N, 0, sd = 20))
  )

tibble(ex_data)
## # A tibble: 510 × 11
##        y    x1    x2    x3    x4    x5    x6    x7    x8    x9   x10
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1943 221.   79.4  137.  137. 116.   142. 143.   189.  181. 112. 
##  2  1939 131.  157.   188.  123. 232.   288.  97.7  147.  107. 176. 
##  3  1733 128.   89.3  141.  145. 175.   234. 150.   131.  149. 146. 
##  4  1783  88.8 126.   155.  163. 135.   277. 117.   143.  119. 147. 
##  5  1945 147.   94.2  156.  172. 119.   234. 106.   165.  144. 133. 
##  6  1613 106.  135.   126.  131. 105.   238.  88.9  166.  146. 105. 
##  7  1551 124.  114.   127.  109. 177.   231. 172.   150.  179. 145. 
##  8  1923 152.   87.7  153.  176.  98.0  183. 152.   141.  138. 123. 
##  9  1929 164.  130.   123.  208.  93.0  158.  85.8  140.  109.  92.4
## 10  1949 151.  127.   172.  145. 210.   226. 126.   128.  140. 163. 
## # ℹ 500 more rows

We’ll have the first 10 rows be the observations we want to predict and the last 500 be the data used to make predictions

test_ex <- ex_data[1:pred_size, ]
train_ex <- ex_data[-(1:pred_size), ]

Let’s look at a correlation plot of the data:

GGally::ggpairs(train_ex)

Looking at single trees

Let’s make a collection of 300 bootstrap trees and examine the variance of the 300 predictions per test set

Simulation of individual trees

Starting by creating the 300 trees

RNGversion('4.1.0'); set.seed(2870)
B <- 300 # B for bags

# Vector of lists to store the results
tree_vec <- vector(mode = 'list', B)

# For loop to find our B trees:
for (i in 1:B){
  tree_vec[[i]] <- 
    rpart(
      formula = y ~ .,
      data = train_ex |> slice_sample(prop = 1, replace = T),
      method = 'anova',
      minsplit = 2,
      minbucket = 1,
      cp = 0
    )
}

Now we’ll make a prediction for the 10 test cases for each tree

# Making a data frame to save the results
test_pred_tree <- 
  data.frame(
    matrix(-1, nrow = B, ncol = pred_size)
  )

# Changing column names
colnames(test_pred_tree) <- paste0('y', 0:(pred_size - 1))

# Looping through each tree and predicting the 10 test cases
for (i in 1:B){
  test_pred_tree[i, ] <- 
    predict(
      object = tree_vec[[i]],
      newdata = test_ex
    )
}

tibble(test_pred_tree)
## # A tibble: 300 × 10
##       y0    y1    y2    y3    y4    y5    y6    y7    y8    y9
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1  1991  1812  1692  1767  1826  1526  1576  1920  1840  1859
##  2  1935  1812  1762  1729  1867  1696  1472  1867  1840  1974
##  3  1986  1845  1635  1643  1822  1538  1537  1839  1978  1997
##  4  1941  1812  1822  1806  1970  1575  1537  1920  1783  1847
##  5  1855  1812  1360  1652  1822  1538  1575  1906  1840  1971
##  6  1968  1845  1738  1643  1830  1575  1588  2018  1984  1953
##  7  1986  1822  1704  1675  1942  1635  1537  1839  1902  1898
##  8  1986  1822  1698  1643  1862  1513  1720  1894  1731  1957
##  9  1966  1860  1853  1993  1846  1698  1698  1970  1840  1856
## 10  1966  1860  1704  1832  1937  1575  1545  1894  1840  1946
## # ℹ 290 more rows

Now let’s look at the variance of each of the 10 predictions:

test_pred_tree |> 
  pivot_longer(
    cols = everything(),
    values_to = 'y_hat',
    names_to = 'obs'
  ) |> 
  summarize(
    .by = obs,
    sd_yhat = sd(y_hat)
  )
## # A tibble: 10 × 2
##    obs   sd_yhat
##    <chr>   <dbl>
##  1 y0       69.3
##  2 y1       65.7
##  3 y2       63.7
##  4 y3       95.7
##  5 y4       66.1
##  6 y5       53.4
##  7 y6       53.8
##  8 y7       65.5
##  9 y8       77.2
## 10 y9       68.2

and to visualize it:

Bagged Trees

Now let’s try making a “bag” of 100 trees and predict the results. Let’s do that a total of 300 times (just like we had 300 individual trees, we simulate bagging 300 times)

We can use the bagging() function in the ipred package to perform bagging for us. To use bagging(), it works the same as rpart(), but you just need to add nbagg = ... to tell it how many trees to create. We’ll use nbagg = 100

bag_ex <- 
  ipred::bagging(
    formula = y ~ .,
    data = train_ex,
    nbagg = 100
  )

# Predicting the 10 test cases using the bagged trees:
data.frame(
  actual = test_ex$y,
  predicted = predict(bag_ex, newdata = test_ex)
)
##    actual predicted
## 1    1943  1835.553
## 2    1939  1940.297
## 3    1733  1694.618
## 4    1783  1796.263
## 5    1945  1882.186
## 6    1613  1605.142
## 7    1551  1594.833
## 8    1923  1921.948
## 9    1929  1836.822
## 10   1949  1899.440

Now let’s repeat it 300 times like we did with the trees and save the prediction for each of them:

#set.seed(2870)
# Making a data frame to save the results
bagged_preds <- 
  data.frame(
    matrix(-1, nrow = B, ncol = pred_size)
  )

# Changing column names
colnames(bagged_preds) <- paste0('y', 0:(pred_size - 1))

for (i in 1:B){
  # Creating the bagged trees
  bag_loop <- 
    ipred::bagging(
      formula = y ~ .,
      data = train_ex,
      nbagg = 500
    )
  
  # Predicting the test cases
  bagged_preds[i, ] <- predict(bag_loop, newdata = test_ex)
  
}

head(bagged_preds, 10)
##          y0       y1       y2       y3       y4       y5       y6       y7
## 1  1844.519 1939.642 1677.220 1800.783 1879.666 1615.373 1595.325 1914.984
## 2  1842.020 1934.948 1679.307 1807.569 1886.715 1616.917 1601.645 1916.014
## 3  1842.364 1940.769 1678.051 1802.335 1879.768 1611.831 1600.264 1913.339
## 4  1843.853 1939.564 1676.822 1800.385 1880.451 1610.927 1600.333 1912.796
## 5  1842.349 1939.592 1682.604 1804.328 1885.537 1611.164 1593.710 1916.537
## 6  1847.003 1943.956 1676.981 1803.065 1884.821 1614.595 1601.940 1918.196
## 7  1844.704 1936.209 1680.070 1798.483 1884.940 1613.420 1593.811 1915.725
## 8  1844.972 1939.225 1679.433 1803.193 1882.138 1620.820 1599.465 1915.971
## 9  1845.466 1942.484 1675.456 1809.570 1885.616 1613.945 1597.321 1915.106
## 10 1844.203 1936.370 1678.784 1804.421 1880.354 1611.509 1594.641 1910.541
##          y8       y9
## 1  1840.672 1907.308
## 2  1845.077 1902.870
## 3  1843.402 1910.229
## 4  1841.906 1905.589
## 5  1843.671 1906.038
## 6  1840.428 1905.746
## 7  1845.810 1908.855
## 8  1840.663 1906.172
## 9  1838.007 1905.288
## 10 1843.918 1903.712

Spread: Regression Tree vs Bagged Trees

Let’s compare the variance of the individual 300 trees vs the 300 sets of bagged trees:

The spread decreased dramatically! And all it took was doing 100 times the work!

But can we do better?

While the trees are given different data sets, they are likely using the same predictors to form the split at each chance.

Let’s look at the last set of 100 bagged trees from our previous simulation:

For each of the 100 bagged trees, X3 was always either the first split, with X1 being the second split 77% of the time. This will cause the predictions made by each tree to be very similar to each other, which won’t reduce the variance by as much as we’d like.

So what can we do? How can we force each of the bagged trees to be more dissimilar to one another?

By limiting the variables they can use to form the splits!

Random forests

Random forests are just like bagged trees with one exception: Each time a node is split, a random subset of the predictors are used instead of all possible predictors

Just like bootstrapping picked random rows to build the tree, random forests also picks random columns to use to form the splits!

This forces the trees to be more unique, which will help improve the variance reduction!

Why? The math is a little beyond the scope of the course, but trust that it works!

To create a random forest, we can use the randomForest() function in the randomForest package. It works like bagging(), but we need to specify the mtry = ... argument, which represents the number of variables available at each split and ntree in place of nbagg

randomForest::randomForest(
  formula = y ~ .,
  data    = train_ex,
  ntree   = 100,     # 100 random trees
  mtry    = 3 
) 
## 
## Call:
##  randomForest(formula = y ~ ., data = train_ex, ntree = 100, mtry = 3) 
##                Type of random forest: regression
##                      Number of trees: 100
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 3796.019
##                     % Var explained: 89.97

Simulation

Let’s see how a random forest compares to individual trees and bagged tree sets:

#RNGversion('4.1.0'); set.seed(2870)

B <- 300
# Data frame to save the results of each set of random trees
rf_preds <- 
  data.frame(
    matrix(-1, nrow = B, ncol = nrow(test_ex))
  )

# Changing column names
colnames(rf_preds) <- paste0('y', 0:(pred_size - 1))

# Conducting the simulation
for (i in 1:B){
  forest_loop <- 
    randomForest::randomForest(
      formula = y ~ .,
      data    = train_ex,
      ntree   = 500,     # 500 random trees
      mtry    = 3 
    ) 
    
  rf_preds[i, ] <- predict(forest_loop, newdata = test_ex)
    
}

tibble(rf_preds)
## # A tibble: 300 × 10
##       y0    y1    y2    y3    y4    y5    y6    y7    y8    y9
##    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 1853. 1876. 1720. 1798. 1920. 1602. 1579. 1917. 1875. 1903.
##  2 1858. 1870. 1715. 1806. 1902. 1602. 1567. 1914. 1872. 1910.
##  3 1857. 1868. 1714. 1798. 1917. 1594. 1581. 1910. 1876. 1899.
##  4 1857. 1857. 1711. 1787. 1910. 1597. 1573. 1911. 1870. 1904.
##  5 1848. 1865. 1706. 1799. 1912. 1593. 1576. 1924. 1871. 1898.
##  6 1856. 1872. 1707. 1805. 1907. 1594. 1585. 1916. 1865. 1897.
##  7 1862. 1867. 1707. 1800. 1916. 1602. 1580. 1910. 1872. 1910.
##  8 1861. 1861. 1718. 1784. 1911. 1598. 1580. 1913. 1876. 1906.
##  9 1859. 1866. 1709. 1802. 1915. 1602. 1582. 1917. 1870. 1902.
## 10 1861. 1860. 1700. 1795. 1909. 1600. 1573. 1920. 1871. 1904.
## # ℹ 290 more rows

Let’s compare the predictions for single trees, bagged trees, and random trees!

The purpose of the random selection of predictors is to make the predictions less correlated for the \(B\) different trees. We make them less correlated by making worse trees!

But when the individual twigs are put together, they become a mighty predictor!

model_comparison <- 
  bind_rows(
    .id = 'model',
    'tree' = scale(test_pred_tree,  center = test_ex$y, scale = F) |> data.frame(),
    'bagging' = scale(bagged_preds, center = test_ex$y, scale = F) |> data.frame(),
    'forest'  = scale(rf_preds,     center = test_ex$y, scale = F) |> data.frame(),
  ) |> 
  pivot_longer(
    cols = -model,
    names_to = 'obs',
    values_to = 'y_hat'
  ) |> 
  summarize(
    .by = c(model, obs),
    bias = mean(y_hat),
    var_pred = var(y_hat)
  ) |> 
  mutate(
    mse_pred = bias^2 + var_pred
  ) 
  # pivot_wider(
  #   id_cols = obs,
  #   values_from = mse_pred,
  #   names_from = model
  # ) 
ggplot(
  data = model_comparison,
  mapping = aes(
    x = obs,
    y = mse_pred,
    fill = fct_inorder(model)
  )
) + 
  geom_col(
    position = 'dodge2',
    color = 'black'
  )  + 
  geom_hline(
    data = model_comparison |> summarize(.by = model, mse = mean(mse_pred)),
    mapping = aes(
      yintercept = mse,
      color = fct_inorder(model)
    ),
    linetype = 'dashed',
    show.legend = F
  ) +
  labs(
    x = NULL,
    y = 'MSE',
    fill = 'Model',
    title = 'Comparing MSE for single trees, bagging, and random forests',
    subtitle = 'dashed lines indicate the average MSE across the 10 predictions'
  ) + 
  scale_y_continuous(
    expand = c(0, 0, 0.05, 0)
  ) + 
  theme_bw() + 
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  )