Random forests for binary classification: predicting legal status of trees in San Francisco

Author

Lu Mao

Published

December 13, 2024

Data

  • Data source: San Francisco Department of Public Works (DPW) Bureau of Urban Forestry
  • Outcome: legal status of trees (DPW Maintained vs. Other)
  • Features: plot size, species, site info, etc.
# load needed libraries
library(tidyverse)
library(gtsummary)
library(tidymodels)
library(themis)
sf_trees <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-01-28/sf_trees.csv")

trees_df <- sf_trees %>%
  mutate(
    legal_status = case_when(
      legal_status == "DPW Maintained" ~ legal_status,
      TRUE ~ "Other"
    ),
    plot_size = parse_number(plot_size)
  ) %>%
  select(-address) %>%
  na.omit() %>%
  mutate_if(is.character, factor) # convert categorical variables to factors
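
As a quick check on the recoded outcome, we can tabulate the two classes; this is a minimal sketch, and the resulting counts match the Ns in the summary table below.

# class balance: many more DPW Maintained trees than Other
trees_df %>%
  count(legal_status)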

Descriptive analysis

Location map of trees by legal status:

trees_df %>%
  ggplot(aes(longitude, latitude, color = legal_status)) +
  geom_point(size = 0.5, alpha = 0.4) +
  labs(color = NULL) +
  theme_minimal()

# summary table by legal status (year extracted from date on the fly,
# so trees_df itself is left unchanged for modeling)
trees_df %>%
  mutate(year = year(date)) %>%
  select(!c(tree_id, date, species, site_info, caretaker)) %>%
  tbl_summary(by = legal_status) %>%
  add_p()
Characteristic  DPW Maintained, N = 18,098¹  Other, N = 5,744¹           p-value²
site_order      1.0 (1.0, 3.0)               2.0 (1.0, 4.0)              <0.001
dbh             3.0 (3.0, 6.0)               3.0 (3.0, 8.0)              <0.001
plot_size       3.00 (3.00, 3.00)            3.00 (3.00, 3.00)           <0.001
latitude        37.763 (37.738, 37.779)      37.761 (37.741, 37.780)     0.4
longitude       -122.43 (-122.47, -122.41)   -122.42 (-122.44, -122.41)  <0.001
year            2,008 (2,000, 2,013)         2,008 (1,999, 2,016)        <0.001
¹ Median (IQR)
² Wilcoxon rank sum test
trees_df %>%
  count(legal_status, caretaker) %>%
  add_count(caretaker, wt = n, name = "caretaker_count") %>%
  filter(caretaker_count > 50) %>%
  group_by(legal_status) %>%
  mutate(percent_legal = n / sum(n)) %>%
  ggplot(aes(percent_legal, caretaker, fill = legal_status)) +
  geom_col(position = "dodge") +
  labs(
    fill = NULL,
    x = "% of trees in each category"
  ) +
  theme_classic()

Random forests

Model building

# data splitting
set.seed(123)
trees_split <- initial_split(trees_df, strata = legal_status)
trees_train <- training(trees_split)
trees_test <- testing(trees_split)
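
By default, initial_split() reserves three quarters of the rows for training; a quick sketch to verify the split sizes:

# row counts of the 3:1 training/test split (default prop = 3/4)
c(train = nrow(trees_train), test = nrow(trees_test))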

Data preprocessing (recipe):

  • Specify the model formula.
  • Give tree_id an "ID" role so it is not used as a predictor.
  • step_other() to collapse rare levels of species, caretaker, and site_info.
  • step_dummy() to turn the remaining categorical predictors into indicator variables.
  • step_date() to extract the year from date, then step_rm() to drop date itself.
  • There are many more DPW Maintained trees than not, so downsample the majority class for training.
tree_rec <- recipe(legal_status ~ ., data = trees_train) %>%
  update_role(tree_id, new_role = "ID") %>%
  step_other(species, caretaker, threshold = 0.01) %>%
  step_other(site_info, threshold = 0.005) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_date(date, features = c("year")) %>%
  step_rm(date) |> 
  step_downsample(legal_status)


# estimate the recipe steps on the training data
tree_prep <- prep(tree_rec)
# extract the preprocessed (downsampled) training data
juiced <- juice(tree_prep)
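
To confirm that step_downsample() balanced the classes for training, count the outcome in the juiced data; a minimal sketch, assuming the default under_ratio = 1.

# both classes should now have equal counts
juiced %>% count(legal_status)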

Model specification:

  • trees: 1000;
  • mtry (number of variables randomly sampled at each split): tuned;
  • min_n (minimum number of observations in terminal nodes): tuned.
tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) %>%
  set_mode("classification") %>%
  set_engine("ranger", importance = "permutation")

Workflow:

tree_wf <- workflow() %>%
  add_recipe(tree_rec) %>%
  add_model(tune_spec)

Model tuning

Set up cross-validation (CV) resamples (10-fold CV):

set.seed(234)
trees_folds <- vfold_cv(trees_train, v = 10, strata = legal_status)

Tune models on 20 automatically chosen combinations of mtry and min_n:

doParallel::registerDoParallel()

set.seed(345)
tune_res <- tree_wf |>
  tune_grid(
    resamples = trees_folds,
    grid = 20
  )
saveRDS(tune_res, "tune_res.rds")
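
Because tuning is slow, later sessions can reload the cached results instead of re-tuning; a small sketch:

# reload cached tuning results in a fresh session
tune_res <- readRDS("tune_res.rds")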

Model evaluation and finalization

Select best model by AUC:

# CV metrics, sorted by AUC
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  arrange(desc(mean))
# A tibble: 20 × 8
    mtry min_n .metric .estimator  mean     n  std_err .config              
   <int> <int> <chr>   <chr>      <dbl> <int>    <dbl> <chr>                
 1    14     3 roc_auc binary     0.943    10 0.00131  Preprocessor1_Model14
 2    19     8 roc_auc binary     0.941    10 0.00149  Preprocessor1_Model19
 3    36     5 roc_auc binary     0.941    10 0.00174  Preprocessor1_Model18
 4    18    11 roc_auc binary     0.940    10 0.00144  Preprocessor1_Model13
 5    36    12 roc_auc binary     0.939    10 0.00174  Preprocessor1_Model20
 6    10    17 roc_auc binary     0.938    10 0.00113  Preprocessor1_Model06
 7     8    15 roc_auc binary     0.938    10 0.00104  Preprocessor1_Model17
 8    28    19 roc_auc binary     0.937    10 0.00163  Preprocessor1_Model02
 9    32    20 roc_auc binary     0.937    10 0.00171  Preprocessor1_Model04
10    26    26 roc_auc binary     0.936    10 0.00160  Preprocessor1_Model12
11    32    24 roc_auc binary     0.936    10 0.00166  Preprocessor1_Model15
12    40    22 roc_auc binary     0.935    10 0.00182  Preprocessor1_Model16
13    22    28 roc_auc binary     0.935    10 0.00160  Preprocessor1_Model09
14    12    30 roc_auc binary     0.935    10 0.00129  Preprocessor1_Model11
15    15    32 roc_auc binary     0.935    10 0.00142  Preprocessor1_Model05
16    23    31 roc_auc binary     0.935    10 0.00161  Preprocessor1_Model10
17    28    37 roc_auc binary     0.933    10 0.00167  Preprocessor1_Model03
18     7    35 roc_auc binary     0.933    10 0.000951 Preprocessor1_Model01
19     4    39 roc_auc binary     0.926    10 0.000856 Preprocessor1_Model07
20     2     6 roc_auc binary     0.904    10 0.00125  Preprocessor1_Model08
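
Before selecting a single winner, it can help to visualize how CV AUC varies with each tuned parameter; a sketch (not part of the original output) using only the packages loaded above.

# CV AUC against each tuned hyperparameter
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, mtry, min_n) %>%
  pivot_longer(c(mtry, min_n), names_to = "parameter", values_to = "value") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~ parameter, scales = "free_x") +
  labs(x = NULL, y = "CV AUC") +
  theme_minimal()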
# best model  
best_rf <- tune_res %>%
  select_best(metric = "roc_auc")

Finalize model workflow and fit:

# finalize model workflow
final_rf <- tree_wf |>
  finalize_workflow(best_rf)

final_rf
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()

── Preprocessor ────────────────────────────────────────────────────────────────
6 Recipe Steps

• step_other()
• step_other()
• step_dummy()
• step_date()
• step_rm()
• step_downsample()

── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (classification)

Main Arguments:
  mtry = 14
  trees = 1000
  min_n = 3

Engine-Specific Arguments:
  importance = permutation

Computational engine: ranger 
# final fit
final_fit <- final_rf |> 
  last_fit(trees_split)

Performance on test set:

final_fit %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc")
# A tibble: 1 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 roc_auc binary         0.946 Preprocessor1_Model1
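
For a direct comparison with the classification tree below, the test-set ROC curve can be plotted the same way; a sketch mirroring the code used in the tree section.

# ROC curve on the test set
final_fit %>%
  collect_predictions() %>%
  roc_curve(truth = legal_status, `.pred_DPW Maintained`) %>%
  autoplot()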

Assess variable importance:

library(vip)

# trained model workflow
trained_rf <- extract_workflow(final_fit)

vip_rf <- trained_rf |> extract_fit_parsnip() |> vip() +
  theme_classic()

vip_rf
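
The scores behind the plot can also be extracted as a tibble with vip::vi(); a sketch on the same fitted model:

# permutation importance scores as a tibble
trained_rf |>
  extract_fit_parsnip() |>
  vi()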

Classification trees

Specify workflow (tune cost_complexity):

ct_mod_spec <- decision_tree(
  cost_complexity = tune()
) %>%
  set_mode("classification") %>%
  set_engine("rpart")

ct_wf <- workflow() %>%
  add_recipe(tree_rec) %>%
  add_model(ct_mod_spec)

Set up a regular grid for cost_complexity and tune the model:

set.seed(456)

# regular grid of 20 cost_complexity values (log scale by default)
cost_complexity_grid <- grid_regular(
  cost_complexity(),
  levels = 20
)

ct_tune_res <- ct_wf |>
  tune_grid(
    resamples = trees_folds,
    grid = cost_complexity_grid
  )

Select best model workflow:

# plot CV AUC as a function of cost_complexity
ct_tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") |> 
  ggplot(aes(cost_complexity, mean)) +
  geom_point() +
  geom_line() +
  scale_x_log10() +
  theme_minimal()

# select best model
best_ct <- ct_tune_res %>%
  select_best(metric = "roc_auc")

Final model fit and test results:

# finalize workflow
final_ct <- ct_wf |> 
  finalize_workflow(best_ct)

# final fit
final_ct_fit <- final_ct |> 
  last_fit(trees_split)

# performance on test set
final_ct_fit %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc")
# A tibble: 1 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 roc_auc binary         0.892 Preprocessor1_Model1
# plot roc curve
final_ct_fit %>%
  collect_predictions() %>%
  roc_curve(truth = legal_status, `.pred_DPW Maintained`) %>%
  autoplot()

Visualize the fitted tree and assess variable importance:

library(rpart.plot)  # for visualizing a decision tree

# trained tree workflow
trained_tr <- extract_workflow(final_ct_fit)
# trained_tr

# Plot tree structure
trained_tr %>%
  extract_fit_engine() %>%
  rpart.plot(roundint = FALSE, fallen.leaves = FALSE)

# Variable importance
vip_tr <- trained_tr |> extract_fit_parsnip() |> vip() +
  theme_classic()

Plot VIP graphs for random forests and classification trees side by side:

library(patchwork)

vip_rf + vip_tr
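
To put the comparison in numbers, the test metrics from the two final fits can be stacked; a sketch, with model labels added for readability.

# side-by-side test metrics for the two models
bind_rows(
  collect_metrics(final_fit) %>% mutate(model = "random forest"),
  collect_metrics(final_ct_fit) %>% mutate(model = "classification tree")
) %>%
  select(model, .metric, .estimate)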