This markdown will demonstrate the difference of bagged trees vs random forest.
Reminder:
Bagged trees build \(B\) different decision trees using bootstrap samples and aggregate the predictions (hence bagged - Bootstrap AGGregated)
We’ll be using a generated data set so we know which variables are useful and which are not.
RNGversion('4.1.0'); set.seed(2870)
pred_size <- 10
N <- 500 + pred_size
# Creating the mean vector of the predictors
mu_vec <- round(runif(10, 100, 250))
## Creating the covariance matrix
# Starting with the correlation matrix
cor_mat <-
matrix(runif(100, -1, 1), ncol = 10) |>
crossprod() |>
cov2cor()
# Vector of standard deviations:
sd_vec <- round(runif(10, 20, 40))
# Converting correlation matrix to covariance matrix
cov_mat <- diag(sd_vec) %*% cor_mat %*% diag(sd_vec)
### Making the data set:
ex_data <-
MASS::mvrnorm(n = N, mu = mu_vec, Sigma = cov_mat) |>
data.frame() |>
janitor::clean_names() |>
# Creating the response variable that only depends on x1 - x4:
mutate(
.before = 1,
# Structur of relationship
y = 100 + 3*x1 + 1*x2 + 5*x3 + 3*x4,
# Adding random noise
y = round(y + rnorm(N, 0, sd = 20))
)
tibble(ex_data)
## # A tibble: 510 × 11
## y x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1943 221. 79.4 137. 137. 116. 142. 143. 189. 181. 112.
## 2 1939 131. 157. 188. 123. 232. 288. 97.7 147. 107. 176.
## 3 1733 128. 89.3 141. 145. 175. 234. 150. 131. 149. 146.
## 4 1783 88.8 126. 155. 163. 135. 277. 117. 143. 119. 147.
## 5 1945 147. 94.2 156. 172. 119. 234. 106. 165. 144. 133.
## 6 1613 106. 135. 126. 131. 105. 238. 88.9 166. 146. 105.
## 7 1551 124. 114. 127. 109. 177. 231. 172. 150. 179. 145.
## 8 1923 152. 87.7 153. 176. 98.0 183. 152. 141. 138. 123.
## 9 1929 164. 130. 123. 208. 93.0 158. 85.8 140. 109. 92.4
## 10 1949 151. 127. 172. 145. 210. 226. 126. 128. 140. 163.
## # ℹ 500 more rows
We’ll have the first 10 rows be the observations we want to predict and the last 500 be the data used to make predictions
test_ex <- ex_data[1:pred_size, ]
train_ex <- ex_data[-(1:pred_size), ]
Let’s look at a correlation plot of the data:
GGally::ggpairs(train_ex)
Let’s make a collection of 300 bootstrap trees and examine the variance of the 300 predictions per test set
Starting by creating the 300 trees
RNGversion('4.1.0'); set.seed(2870)
B <- 300 # B for bags
# Vector of lists to store the results
tree_vec <- vector(mode = 'list', B)
# For loop to find our B trees:
for (i in 1:B){
tree_vec[[i]] <-
rpart(
formula = y ~ .,
data = train_ex |> slice_sample(prop = 1, replace = T),
method = 'anova',
minsplit = 2,
minbucket = 1,
cp = 0
)
}
Now we’ll make a prediction for the 10 test cases for each tree
# Making a data frame to save the results
test_pred_tree <-
data.frame(
matrix(-1, nrow = B, ncol = pred_size)
)
# Changing column names
colnames(test_pred_tree) <- paste0('y', 0:(pred_size - 1))
# Looping through each tree and predicting the 10 test cases
for (i in 1:B){
test_pred_tree[i, ] <-
predict(
object = tree_vec[[i]],
newdata = test_ex
)
}
tibble(test_pred_tree)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1991 1812 1692 1767 1826 1526 1576 1920 1840 1859
## 2 1935 1812 1762 1729 1867 1696 1472 1867 1840 1974
## 3 1986 1845 1635 1643 1822 1538 1537 1839 1978 1997
## 4 1941 1812 1822 1806 1970 1575 1537 1920 1783 1847
## 5 1855 1812 1360 1652 1822 1538 1575 1906 1840 1971
## 6 1968 1845 1738 1643 1830 1575 1588 2018 1984 1953
## 7 1986 1822 1704 1675 1942 1635 1537 1839 1902 1898
## 8 1986 1822 1698 1643 1862 1513 1720 1894 1731 1957
## 9 1966 1860 1853 1993 1846 1698 1698 1970 1840 1856
## 10 1966 1860 1704 1832 1937 1575 1545 1894 1840 1946
## # ℹ 290 more rows
Now let’s look at the variance of each of the 10 predictions:
test_pred_tree |>
pivot_longer(
cols = everything(),
values_to = 'y_hat',
names_to = 'obs'
) |>
summarize(
.by = obs,
sd_yhat = sd(y_hat)
)
## # A tibble: 10 × 2
## obs sd_yhat
## <chr> <dbl>
## 1 y0 69.3
## 2 y1 65.7
## 3 y2 63.7
## 4 y3 95.7
## 5 y4 66.1
## 6 y5 53.4
## 7 y6 53.8
## 8 y7 65.5
## 9 y8 77.2
## 10 y9 68.2
and to visualize it:
Now let’s try making a “bag” of 100 trees and predict the results. Let’s do that a total of 300 times (just like we had 300 individual trees, we simulate bagging 300 times)
We can use the bagging() function in the
ipred package to perform bagging for us. To use
bagging(), it works the same as rpart(), but
you just need to add nbagg = ... to tell it how many trees
to create. We’ll use nbagg = 100
bag_ex <-
ipred::bagging(
formula = y ~ .,
data = train_ex,
nbagg = 100
)
# Predicting the 10 test cases using the bagged trees:
data.frame(
actual = test_ex$y,
predicted = predict(bag_ex, newdata = test_ex)
)
## actual predicted
## 1 1943 1835.553
## 2 1939 1940.297
## 3 1733 1694.618
## 4 1783 1796.263
## 5 1945 1882.186
## 6 1613 1605.142
## 7 1551 1594.833
## 8 1923 1921.948
## 9 1929 1836.822
## 10 1949 1899.440
Now let’s repeat it 300 times like we did with the trees and save the prediction for each of them:
#set.seed(2870)
# Making a data frame to save the results
bagged_preds <-
data.frame(
matrix(-1, nrow = B, ncol = pred_size)
)
# Changing column names
colnames(bagged_preds) <- paste0('y', 0:(pred_size - 1))
for (i in 1:B){
# Creating the bagged trees
bag_loop <-
ipred::bagging(
formula = y ~ .,
data = train_ex,
nbagg = 500
)
# Predicting the test cases
bagged_preds[i, ] <- predict(bag_loop, newdata = test_ex)
}
head(bagged_preds, 10)
## y0 y1 y2 y3 y4 y5 y6 y7
## 1 1844.519 1939.642 1677.220 1800.783 1879.666 1615.373 1595.325 1914.984
## 2 1842.020 1934.948 1679.307 1807.569 1886.715 1616.917 1601.645 1916.014
## 3 1842.364 1940.769 1678.051 1802.335 1879.768 1611.831 1600.264 1913.339
## 4 1843.853 1939.564 1676.822 1800.385 1880.451 1610.927 1600.333 1912.796
## 5 1842.349 1939.592 1682.604 1804.328 1885.537 1611.164 1593.710 1916.537
## 6 1847.003 1943.956 1676.981 1803.065 1884.821 1614.595 1601.940 1918.196
## 7 1844.704 1936.209 1680.070 1798.483 1884.940 1613.420 1593.811 1915.725
## 8 1844.972 1939.225 1679.433 1803.193 1882.138 1620.820 1599.465 1915.971
## 9 1845.466 1942.484 1675.456 1809.570 1885.616 1613.945 1597.321 1915.106
## 10 1844.203 1936.370 1678.784 1804.421 1880.354 1611.509 1594.641 1910.541
## y8 y9
## 1 1840.672 1907.308
## 2 1845.077 1902.870
## 3 1843.402 1910.229
## 4 1841.906 1905.589
## 5 1843.671 1906.038
## 6 1840.428 1905.746
## 7 1845.810 1908.855
## 8 1840.663 1906.172
## 9 1838.007 1905.288
## 10 1843.918 1903.712
Let’s compare the variance of the individual 300 trees vs the 300 sets of bagged trees:
The spread decreased dramatically! And all it took was doing 100 times the work!
But can we do better?
While the trees are given different data sets, they are likely using the same predictors to form the split at each chance.
Let’s look at the last set of 100 bagged trees from our previous simulation:
For each of the 100 bagged trees, X3 was always either the first split, with X1 being the second split 77% of the time. This will cause the predictions made by each tree to be very similar to each other, which won’t reduce the variance by as much as we’d like.
So what can we do? How can we force each of the bagged trees to be more dissimilar to one another?
By limiting the variables they can use to form the splits!
Random forests are just like bagged trees with one exception: Each time a node is split, a random subset of the predictors are used instead of all possible predictors
Just like bootstrapping picked random rows to build the tree, random forests also picks random columns to use to form the splits!
This forces the trees to be more unique, which will help improve the variance reduction!
Why? The math is a little beyond the scope of the course, but trust that it works!
To create a random forest, we can use the randomForest()
function in the randomForest package. It works like
bagging(), but we need to specify the
mtry = ... argument, which represents the number of
variables available at each split and ntree in place of
nbagg
randomForest::randomForest(
formula = y ~ .,
data = train_ex,
ntree = 100, # 100 random trees
mtry = 3
)
##
## Call:
## randomForest(formula = y ~ ., data = train_ex, ntree = 100, mtry = 3)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 3796.019
## % Var explained: 89.97
Let’s see how a random forest compares to individual trees and bagged tree sets:
#RNGversion('4.1.0'); set.seed(2870)
B <- 300
# Data frame to save the results of each set of random trees
rf_preds <-
data.frame(
matrix(-1, nrow = B, ncol = nrow(test_ex))
)
# Changing column names
colnames(rf_preds) <- paste0('y', 0:(pred_size - 1))
# Conducting the simulation
for (i in 1:B){
forest_loop <-
randomForest::randomForest(
formula = y ~ .,
data = train_ex,
ntree = 500, # 500 random trees
mtry = 3
)
rf_preds[i, ] <- predict(forest_loop, newdata = test_ex)
}
tibble(rf_preds)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1853. 1876. 1720. 1798. 1920. 1602. 1579. 1917. 1875. 1903.
## 2 1858. 1870. 1715. 1806. 1902. 1602. 1567. 1914. 1872. 1910.
## 3 1857. 1868. 1714. 1798. 1917. 1594. 1581. 1910. 1876. 1899.
## 4 1857. 1857. 1711. 1787. 1910. 1597. 1573. 1911. 1870. 1904.
## 5 1848. 1865. 1706. 1799. 1912. 1593. 1576. 1924. 1871. 1898.
## 6 1856. 1872. 1707. 1805. 1907. 1594. 1585. 1916. 1865. 1897.
## 7 1862. 1867. 1707. 1800. 1916. 1602. 1580. 1910. 1872. 1910.
## 8 1861. 1861. 1718. 1784. 1911. 1598. 1580. 1913. 1876. 1906.
## 9 1859. 1866. 1709. 1802. 1915. 1602. 1582. 1917. 1870. 1902.
## 10 1861. 1860. 1700. 1795. 1909. 1600. 1573. 1920. 1871. 1904.
## # ℹ 290 more rows
Let’s compare the predictions for single trees, bagged trees, and random trees!
The purpose of the random selection of predictors is to make the predictions less correlated for the \(B\) different trees. We make them less correlated by making worse trees!
But when the individual twigs are put together, they become a mighty predictor!
model_comparison <-
bind_rows(
.id = 'model',
'tree' = scale(test_pred_tree, center = test_ex$y, scale = F) |> data.frame(),
'bagging' = scale(bagged_preds, center = test_ex$y, scale = F) |> data.frame(),
'forest' = scale(rf_preds, center = test_ex$y, scale = F) |> data.frame(),
) |>
pivot_longer(
cols = -model,
names_to = 'obs',
values_to = 'y_hat'
) |>
summarize(
.by = c(model, obs),
bias = mean(y_hat),
var_pred = var(y_hat)
) |>
mutate(
mse_pred = bias^2 + var_pred
)
# pivot_wider(
# id_cols = obs,
# values_from = mse_pred,
# names_from = model
# )
ggplot(
data = model_comparison,
mapping = aes(
x = obs,
y = mse_pred,
fill = fct_inorder(model)
)
) +
geom_col(
position = 'dodge2',
color = 'black'
) +
geom_hline(
data = model_comparison |> summarize(.by = model, mse = mean(mse_pred)),
mapping = aes(
yintercept = mse,
color = fct_inorder(model)
),
linetype = 'dashed',
show.legend = F
) +
labs(
x = NULL,
y = 'MSE',
fill = 'Model',
title = 'Comparing MSE for single trees, bagging, and random forests',
subtitle = 'dashed lines indicate the average MSE across the 10 predictions'
) +
scale_y_continuous(
expand = c(0, 0, 0.05, 0)
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)
)