This markdown demonstrates the difference between bagged trees and random forests.
Reminder:
Bagged trees build \(B\) different decision trees on bootstrap samples of the training data and aggregate (average) their predictions (hence bagged: Bootstrap AGGregated).
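In symbols, if \(\hat{f}^{*b}(x)\) denotes the prediction from the tree fit to the \(b\)-th bootstrap sample, the bagged prediction for a regression problem is the average
\[
\hat{f}_{\text{bag}}(x) = \frac{1}{B}\sum_{b=1}^{B} \hat{f}^{*b}(x)
\]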
We’ll be using a generated data set so we know which features are useful and which are not.
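The code in this section assumes the tidyverse and rpart packages are loaded, e.g.:
library(tidyverse)  # dplyr, tidyr, tibble, and ggplot2 for wrangling and plotting
library(rpart)      # recursive partitioning (decision trees)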
RNGversion('4.1.0'); set.seed(2870)
N <- 510
ex_data <-
  data.frame(
    # These will be the useful predictors
    x1 = round(rnorm(N, 20, 3)),
    x2 = round(rnorm(N, 40, 5)),
    x3 = round(runif(N, 10, 20)),
    x4 = rbinom(N, 1, 0.75)
  )
# We'll use a loop to add 6 useless columns:
useless_x <- 6
for (i in (ncol(ex_data) + 1):(ncol(ex_data) + useless_x)){
  ex_data[, paste0('x', i)] <- round(rnorm(N, mean = runif(1, 20, 40), sd = runif(1, 3, 6)))
}
# Creating the response variable as a combination of the first four (useful) predictors
ex_data <-
  ex_data |>
  mutate(
    .before = 1,
    # Structure of the relationship
    y = -30 + 3*x1 + 1*x2 + 5*x3 + 30*x4,
    # Adding random noise
    y = round(y + rnorm(N, 0, sd = 20))
  )
as_tibble(ex_data)
## # A tibble: 510 × 11
## y x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 174 19 42 13 1 22 35 33 26 43 32
## 2 151 18 35 12 1 19 38 39 25 40 33
## 3 156 18 47 14 1 16 41 38 22 34 26
## 4 145 16 39 12 1 26 37 31 31 40 31
## 5 119 19 40 10 1 17 31 39 28 37 27
## 6 126 18 32 18 0 24 40 36 14 38 26
## 7 164 24 36 16 0 23 39 41 10 39 32
## 8 163 23 30 13 1 21 41 31 28 31 33
## 9 194 18 42 16 1 23 38 33 16 41 28
## 10 143 15 44 19 0 19 35 38 13 34 32
## # ℹ 500 more rows
We’ll have the first 10 rows be the observations we want to predict and the remaining 500 rows be the training data used to build the trees.
pred_size <- 10
test_ex <- ex_data[1:pred_size, ]
train_ex <- ex_data[-(1:pred_size), ]
Let’s make a collection of 300 bootstrap trees and examine the variance of the 300 predictions for each test case.
We start by creating the 300 trees:
RNGversion('4.1.0'); set.seed(2870)
B <- 300 # B for bags
# Vector of lists to store the results
tree_vec <- vector(mode = 'list', B)
# For loop to find our B trees:
for (i in 1:B){
  tree_vec[[i]] <-
    rpart(
      formula = y ~ .,
      data = train_ex |> slice_sample(prop = 1, replace = T),  # bootstrap sample
      method = 'anova',
      minsplit = 2,   # allow splits on nodes with as few as 2 rows
      minbucket = 1,  # allow terminal nodes with a single row
      cp = 0          # no complexity penalty: grow the tree fully
    )
}
Now we’ll make a prediction for the 10 test cases for each tree
# Making a data frame to save the results
test_pred_tree <-
  data.frame(
    matrix(-1, nrow = B, ncol = pred_size)
  )
# Changing column names
colnames(test_pred_tree) <- paste0('y', 0:(pred_size - 1))
# Looping through each tree and predicting the 10 test cases
for (i in 1:B){
  test_pred_tree[i, ] <-
    predict(
      object = tree_vec[[i]],
      newdata = test_ex
    )
}
as_tibble(test_pred_tree)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 202 147 203 131 142 154 146 186 140 200
## 2 178 147 202 163 179 144 170 178 211 134
## 3 133 174 203 185 135 154 146 130 204 200
## 4 224 143 177 161 144 200 146 157 184 155
## 5 166 120 129 131 170 189 110 170 140 158
## 6 144 135 203 153 135 158 184 157 157 138
## 7 170 140 203 140 165 174 170 150 140 158
## 8 134 185 167 174 144 143 150 130 170 200
## 9 224 144 190 135 135 175 176 184 199 181
## 10 166 147 129 135 159 174 170 154 164 142
## # ℹ 290 more rows
Now let’s look at the standard deviation of the 300 predictions for each of the 10 test cases:
test_pred_tree |>
  pivot_longer(
    cols = everything(),
    values_to = 'y_hat',
    names_to = 'obs'
  ) |>
  summarize(
    .by = obs,
    sd_yhat = sd(y_hat)
  )
## # A tibble: 10 × 2
## obs sd_yhat
## <chr> <dbl>
## 1 y0 24.5
## 2 y1 20.9
## 3 y2 22.4
## 4 y3 19.1
## 5 y4 22.7
## 6 y5 21.4
## 7 y6 26.4
## 8 y7 22.7
## 9 y8 25.1
## 10 y9 20.8
And to visualize the spread of those predictions:
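One way to draw this (a minimal sketch, assuming ggplot2 has been loaded via the tidyverse):
# Boxplots of the 300 single-tree predictions for each test case
test_pred_tree |>
  pivot_longer(
    cols = everything(),
    values_to = 'y_hat',
    names_to = 'obs'
  ) |>
  ggplot(
    mapping = aes(x = obs, y = y_hat)
  ) +
  geom_boxplot() +
  labs(
    x = 'Test observation',
    y = 'Predicted y'
  )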
Now let’s try making a “bag” of 100 trees and averaging their predictions. Let’s do that a total of 300 times (just as we had 300 individual trees, we’ll simulate the bagging process 300 times).
To simplify the process, let’s first write a function that predicts the response using a set of bagged trees:
bagging_trees <- function(train, test, bags){
  # Data frame to save the predictions:
  test_pred_df <-
    data.frame(
      matrix(-1, nrow = bags, ncol = nrow(test))
    )
  # Changing column names (one column per test row)
  colnames(test_pred_df) <- paste0('y', 0:(nrow(test) - 1))
  # Loop to find the predictions using bagged trees
  for (i in 1:bags){
    # Tree using a bootstrap sample of the training data
    bagged_tree <-
      rpart(
        formula = y ~ .,
        data = train |> slice_sample(prop = 1, replace = T),
        method = 'anova',
        minsplit = 2,   # allow splits on nodes with as few as 2 rows
        minbucket = 1,  # allow terminal nodes with a single row
        cp = 0,         # no complexity penalty: grow the tree fully
        xval = 0        # no pruning, so skip cross-validation
      )
    # Saving the predictions for the test set in the data frame
    test_pred_df[i, ] <-
      predict(
        object = bagged_tree,
        newdata = test
      )
  }
  # Average the bags predictions for each test row
  return(colMeans(test_pred_df))
}
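A single call such as bagging_trees(train = train_ex, test = test_ex, bags = 100) returns one vector of 10 averaged predictions (one per test row); below we repeat that call 300 times so we can study the spread of the bagged predictions.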
Now let’s use the function to create a prediction from 100 bagged trees a total of 300 times
B <- 300
# Data frame to save the results of each set of bagged trees
bagged_preds <-
  data.frame(
    matrix(-1, nrow = B, ncol = nrow(test_ex))
  )
# Changing column names
colnames(bagged_preds) <- paste0('y', 0:(pred_size - 1))
# Conducting the simulation
for (i in 1:B){
  bagged_preds[i, ] <-
    bagging_trees(train = train_ex, test = test_ex, bags = 100)
}
as_tibble(bagged_preds)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 166. 144. 175. 152. 163. 161. 158. 155. 174. 160.
## 2 163. 146. 170. 150. 156. 160. 158. 155. 173. 160.
## 3 164. 144. 172. 150. 158. 162. 157. 156. 171. 159.
## 4 166. 145. 167 149. 158. 160. 159. 162. 176 159.
## 5 163. 144. 174. 153. 156. 159. 160. 159. 173. 156.
## 6 163. 143. 174. 148. 159. 161. 157. 159. 170. 158.
## 7 160. 146. 170. 148. 154. 158. 159. 155. 170. 161.
## 8 164. 146. 172. 149. 162. 156. 156. 158. 169. 161.
## 9 165. 144. 171. 151. 159. 157. 160. 160. 167. 162.
## 10 165. 142. 171. 149. 164. 163. 160. 156. 172. 160.
## # ℹ 290 more rows
Let’s compare the variance of the 300 individual trees vs the 300 sets of bagged trees:
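A minimal sketch of one way to make the comparison, again assuming ggplot2 is loaded:
# Stacking the single-tree and bagged predictions, then comparing their spread
bind_rows(
  'single tree' = pivot_longer(test_pred_tree, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  'bagged trees' = pivot_longer(bagged_preds, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  .id = 'method'
) |>
  ggplot(
    mapping = aes(x = obs, y = y_hat, fill = method)
  ) +
  geom_boxplot() +
  labs(
    x = 'Test observation',
    y = 'Predicted y',
    fill = 'Method'
  )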
The spread decreased dramatically! And all it took was doing 100 times the work!
But can we do better?
While the trees are fit to different bootstrap data sets, they are likely using the same predictors to form the splits at each opportunity:
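One way to check this (a minimal sketch using the 300 single trees stored earlier in tree_vec; the root split of an rpart fit is the first row of its frame component):
# Tabulating which predictor forms the root (first) split of each stored tree
root_split <- sapply(tree_vec, function(tree) as.character(tree$frame$var[1]))
table(root_split)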
For each of the 100 bagged trees, x3 and x4 were always the first two splits: one formed the root split and the other formed the second split. This causes the predictions made by the trees to be very similar (highly correlated) with one another, which won’t reduce the variance by as much as we’d like.
So what can we do? How can we force the bagged trees to be more dissimilar to one another?
By limiting the variables they can use to form the splits!
Random forests are just like bagged trees with one exception: each tree is only allowed to use a random subset of the predictors to form its splits. (In the standard algorithm a fresh random subset is drawn at every split; to keep our code simple, we’ll draw one subset per tree.)
Just like bootstrapping picks random rows to build each tree, a random forest also picks random columns to use to form the splits!
This forces the trees to be more distinct from one another, which helps the averaging reduce the variance even further!
Why? The math is a little beyond the scope of the course, but trust that it works!
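For the curious, the key fact is this: if the \(B\) trees produce identically distributed predictions, each with variance \(\sigma^2\) and pairwise correlation \(\rho\), then the variance of their average is
\[
\operatorname{Var}\left(\frac{1}{B}\sum_{b=1}^{B} \hat{f}^{*b}(x)\right) = \rho\,\sigma^2 + \frac{1-\rho}{B}\,\sigma^2
\]
Increasing \(B\) only shrinks the second term, so the correlation between the trees is what limits bagging; randomizing the predictors lowers \(\rho\) and attacks the first term. (For reference, the randomForest package defaults to considering roughly \(p/3\) of the predictors for regression and \(\sqrt{p}\) for classification.)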
The code below will create a function to form the random trees:
random_trees <- function(train, test, bags = 100, npred = 3){
  # Data frame to save the predictions:
  test_pred_df <-
    data.frame(
      matrix(-1, nrow = bags, ncol = nrow(test))
    )
  # Changing column names (one column per test row)
  colnames(test_pred_df) <- paste0('y', 0:(nrow(test) - 1))
  # Loop to find the predictions using the random-predictor trees
  for (i in 1:bags){
    # Picking the column numbers of the npred randomly chosen predictors
    # (column 1 is y, so the predictors live in columns 2 through ncol(train))
    pred_loc <- sample(2:ncol(train), size = npred)
    # Tree using a bootstrap sample and only the sampled predictor columns
    bagged_tree <-
      rpart(
        formula = y ~ .,
        data = train |>
          slice_sample(prop = 1, replace = T) |>
          dplyr::select(1, all_of(pred_loc)),
        method = 'anova',
        minsplit = 2,   # allow splits on nodes with as few as 2 rows
        minbucket = 1,  # allow terminal nodes with a single row
        cp = 0,         # no complexity penalty: grow the tree fully
        xval = 0        # no pruning, so skip cross-validation
      )
    # Saving the predictions for the test set in the data frame
    test_pred_df[i, ] <-
      predict(
        object = bagged_tree,
        newdata = test
      )
  }
  # Average the trees' predictions for each test row
  return(colMeans(test_pred_df))
}
Again, let’s simulate fitting 300 random forests by only allowing each tree to have 5 predictors, chosen at random:
RNGversion('4.1.0'); set.seed(2870)
B <- 300
# Data frame to save the results of each set of random trees
rf_preds <-
  data.frame(
    matrix(-1, nrow = B, ncol = nrow(test_ex))
  )
# Changing column names
colnames(rf_preds) <- paste0('y', 0:(pred_size - 1))
# Conducting the simulation
for (i in 1:B){
  rf_preds[i, ] <-
    random_trees(train = train_ex, test = test_ex, bags = 100, npred = 5)
}
as_tibble(rf_preds)
## # A tibble: 300 × 10
## y0 y1 y2 y3 y4 y5 y6 y7 y8 y9
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 164. 149. 161. 159. 160. 157. 164. 162. 174. 156.
## 2 167. 152. 163. 155. 164. 157 169. 162. 169. 154.
## 3 168. 158. 167. 158. 162. 160. 167. 160. 170. 155.
## 4 166. 154. 161. 158. 165. 159. 161. 160. 169. 153
## 5 167. 149. 161. 159. 166. 160. 162 164. 171. 158.
## 6 171. 151. 167. 155. 166. 167. 163. 164. 169. 161.
## 7 161. 161. 163. 157. 162. 163. 168. 155. 170. 158.
## 8 163. 157. 165. 157. 163. 163. 164. 158. 172. 163.
## 9 169. 147. 163. 160. 162. 159. 168. 161. 168. 151.
## 10 161. 152. 161. 153. 163. 167. 166. 162. 169. 161.
## # ℹ 290 more rows
Let’s compare the predictions for single trees, bagged trees, and random forests!
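A minimal sketch of one way to draw the comparison, assuming ggplot2 is loaded:
# Standard deviation of the 300 predictions for each test case, by method
bind_rows(
  'single tree' = pivot_longer(test_pred_tree, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  'bagged trees' = pivot_longer(bagged_preds, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  'random forest' = pivot_longer(rf_preds, cols = everything(), names_to = 'obs', values_to = 'y_hat'),
  .id = 'method'
) |>
  summarize(
    .by = c(method, obs),
    sd_yhat = sd(y_hat)
  ) |>
  ggplot(
    mapping = aes(x = obs, y = sd_yhat, color = method, group = method)
  ) +
  geom_point() +
  geom_line() +
  labs(
    x = 'Test observation',
    y = 'SD of the predictions',
    color = 'Method'
  )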
The purpose of the random selection of predictors is to make the predictions of the \(B\) different trees less correlated. We make them less correlated by making worse individual trees!
But when the individual twigs are put together, they become a mighty predictor!