This version uses a more presentation-friendly target for the Random Forest section:
Can we predict whether a squirrel is foraging?
This is a stronger example than predicting approaches because:
approaches, so model
evaluation is more meaningfulThis file is organized to support Slides 5–9:
url <- "https://raw.githubusercontent.com/JaydeeJan/Data-624-Group-Project/refs/heads/main/2018_Central_Park_Squirrel_Census_-_Squirrel_Data_20260407.csv"
squirrels_raw <- read_csv(url, show_col_types = FALSE) |>
clean_names()
tibble(
rows = nrow(squirrels_raw),
columns = ncol(squirrels_raw)
)
## # A tibble: 1 × 2
## rows columns
## <int> <int>
## 1 3023 31
We use foraging as the target variable:
This is a meaningful real-world behavior question that is easy to explain in a presentation.
These predictors are simple and interpretable:
squirrels <- squirrels_raw |>
select(
foraging,
shift,
age,
primary_fur_color,
location,
above_ground_sighter_measurement,
running,
chasing,
climbing,
eating,
approaches,
indifferent,
runs_from,
x,
y
)
glimpse(squirrels)
## Rows: 3,023
## Columns: 15
## $ foraging <lgl> FALSE, FALSE, FALSE, TRUE, TRUE, TRUE…
## $ shift <chr> "PM", "AM", "PM", "PM", "AM", "AM", "…
## $ age <chr> NA, NA, NA, "Adult", "Adult", "Adult"…
## $ primary_fur_color <chr> NA, NA, "Gray", "Gray", "Gray", "Cinn…
## $ location <chr> NA, NA, "Above Ground", NA, "Above Gr…
## $ above_ground_sighter_measurement <chr> NA, NA, "10", NA, NA, NA, "FALSE", "F…
## $ running <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ chasing <lgl> FALSE, FALSE, TRUE, FALSE, FALSE, FAL…
## $ climbing <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ eating <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FAL…
## $ approaches <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FA…
## $ indifferent <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TR…
## $ runs_from <lgl> FALSE, FALSE, FALSE, TRUE, FALSE, FAL…
## $ x <dbl> -73.95613, -73.96886, -73.97428, -73.…
## $ y <dbl> 40.79408, 40.78378, 40.77553, 40.7903…
squirrels |>
count(foraging) |>
mutate(percent = percent(n / sum(n)))
## # A tibble: 2 × 3
## foraging n percent
## <lgl> <int> <chr>
## 1 FALSE 1588 52.5%
## 2 TRUE 1435 47.5%
list(
shift = squirrels |> count(shift),
age = squirrels |> count(age),
primary_fur_color = squirrels |> count(primary_fur_color),
location = squirrels |> count(location)
)
## $shift
## # A tibble: 2 × 2
## shift n
## <chr> <int>
## 1 AM 1347
## 2 PM 1676
##
## $age
## # A tibble: 4 × 2
## age n
## <chr> <int>
## 1 ? 4
## 2 Adult 2568
## 3 Juvenile 330
## 4 <NA> 121
##
## $primary_fur_color
## # A tibble: 4 × 2
## primary_fur_color n
## <chr> <int>
## 1 Black 103
## 2 Cinnamon 392
## 3 Gray 2473
## 4 <NA> 55
##
## $location
## # A tibble: 3 × 2
## location n
## <chr> <int>
## 1 Above Ground 843
## 2 Ground Plane 2116
## 3 <NA> 64
squirrels |>
mutate(
foraging_label = case_when(
is.na(foraging) ~ "Missing",
foraging == TRUE ~ "Yes",
foraging == FALSE ~ "No"
)
) |>
count(foraging_label) |>
ggplot(aes(x = foraging_label, y = n)) +
geom_col() +
labs(
title = "Was the squirrel foraging?",
x = "Foraging",
y = "Count"
)
# helper for columns stored as logical or text-like yes/no
to_yes_no <- function(x) {
case_when(
str_to_lower(as.character(x)) == "true" ~ "Yes",
str_to_lower(as.character(x)) == "false" ~ "No",
str_to_lower(as.character(x)) == "yes" ~ "Yes",
str_to_lower(as.character(x)) == "no" ~ "No",
TRUE ~ NA_character_
)
}
squirrels_clean <- squirrels |>
mutate(
foraging = factor(to_yes_no(foraging), levels = c("No", "Yes")),
shift = fct_explicit_na(factor(shift), na_level = "Unknown"),
age = fct_explicit_na(factor(age), na_level = "Unknown"),
primary_fur_color = fct_explicit_na(factor(primary_fur_color), na_level = "Unknown"),
location = fct_explicit_na(factor(location), na_level = "Unknown"),
above_ground_sighter_measurement = na_if(above_ground_sighter_measurement, "FALSE"),
above_ground_sighter_measurement = readr::parse_number(as.character(above_ground_sighter_measurement)),
across(
c(running, chasing, climbing, eating, approaches, indifferent, runs_from),
~ factor(to_yes_no(.x), levels = c("No", "Yes"))
)
) |>
filter(!is.na(foraging))
summary(squirrels_clean)
## foraging shift age primary_fur_color location
## No :1588 AM:1347 ? : 4 Black : 103 Above Ground: 843
## Yes:1435 PM:1676 Adult :2568 Cinnamon: 392 Ground Plane:2116
## Juvenile: 330 Gray :2473 Unknown : 64
## Unknown : 121 Unknown : 55
##
##
##
## above_ground_sighter_measurement running chasing climbing eating
## Min. : 0.00 No :2293 No :2744 No :2365 No :2263
## 1st Qu.: 5.00 Yes: 730 Yes: 279 Yes: 658 Yes: 760
## Median : 10.00
## Mean : 15.21
## 3rd Qu.: 20.00
## Max. :180.00
## NA's :2230
## approaches indifferent runs_from x y
## No :2845 No :1569 No :2345 Min. :-73.98 Min. :40.76
## Yes: 178 Yes:1454 Yes: 678 1st Qu.:-73.97 1st Qu.:40.77
## Median :-73.97 Median :40.78
## Mean :-73.97 Mean :40.78
## 3rd Qu.:-73.96 3rd Qu.:40.79
## Max. :-73.95 Max. :40.80
##
set.seed(624)
squirrel_split <- initial_split(squirrels_clean, prop = 0.80, strata = foraging)
train_data <- training(squirrel_split)
test_data <- testing(squirrel_split)
tibble(
set = c("Training", "Test"),
n = c(nrow(train_data), nrow(test_data))
)
## # A tibble: 2 × 2
## set n
## <chr> <int>
## 1 Training 2418
## 2 Test 605
squirrel_recipe <- recipe(foraging ~ ., data = train_data) |>
step_impute_median(all_numeric_predictors()) |>
step_unknown(all_nominal_predictors()) |>
step_novel(all_nominal_predictors())
squirrel_recipe
rf_spec <- rand_forest(
mtry = tune(),
min_n = tune(),
trees = 500
) |>
set_mode("classification") |>
set_engine("ranger", importance = "impurity", probability = TRUE)
rf_spec
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 500
## min_n = tune()
##
## Engine-Specific Arguments:
## importance = impurity
## probability = TRUE
##
## Computational engine: ranger
rf_workflow <- workflow() |>
add_recipe(squirrel_recipe) |>
add_model(rf_spec)
set.seed(624)
folds <- vfold_cv(train_data, v = 5, strata = foraging)
folds
## # 5-fold cross-validation using stratification
## # A tibble: 5 × 2
## splits id
## <list> <chr>
## 1 <split [1934/484]> Fold1
## 2 <split [1934/484]> Fold2
## 3 <split [1934/484]> Fold3
## 4 <split [1935/483]> Fold4
## 5 <split [1935/483]> Fold5
set.seed(624)
rf_grid <- grid_regular(
mtry(range = c(2, 8)),
min_n(range = c(2, 20)),
levels = 5
)
rf_tuned <- tune_grid(
rf_workflow,
resamples = folds,
grid = rf_grid,
metrics = metric_set(accuracy, bal_accuracy, roc_auc)
)
collect_metrics(rf_tuned)
## # A tibble: 75 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 2 2 accuracy binary 0.775 5 0.00774 pre0_mod01_post0
## 2 2 2 bal_accuracy binary 0.774 5 0.00784 pre0_mod01_post0
## 3 2 2 roc_auc binary 0.852 5 0.00957 pre0_mod01_post0
## 4 2 6 accuracy binary 0.775 5 0.00686 pre0_mod02_post0
## 5 2 6 bal_accuracy binary 0.773 5 0.00694 pre0_mod02_post0
## 6 2 6 roc_auc binary 0.851 5 0.00987 pre0_mod02_post0
## 7 2 11 accuracy binary 0.774 5 0.00806 pre0_mod03_post0
## 8 2 11 bal_accuracy binary 0.773 5 0.00802 pre0_mod03_post0
## 9 2 11 roc_auc binary 0.850 5 0.00963 pre0_mod03_post0
## 10 2 15 accuracy binary 0.768 5 0.00974 pre0_mod04_post0
## # ℹ 65 more rows
best_params <- select_best(rf_tuned, metric = "roc_auc")
best_params
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 3 2 pre0_mod06_post0
final_rf_workflow <- finalize_workflow(rf_workflow, best_params)
final_rf_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_impute_median()
## • step_unknown()
## • step_novel()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = 3
## trees = 500
## min_n = 2
##
## Engine-Specific Arguments:
## importance = impurity
## probability = TRUE
##
## Computational engine: ranger
final_rf_fit <- fit(final_rf_workflow, data = train_data)
final_rf_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_impute_median()
## • step_unknown()
## • step_novel()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~3L, x), num.trees = ~500, min.node.size = min_rows(~2L, x), importance = ~"impurity", probability = ~TRUE, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 2418
## Number of independent variables: 14
## Mtry: 3
## Target node size: 2
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1496759
rf_test_preds <- predict(final_rf_fit, test_data, type = "prob") |>
bind_cols(predict(final_rf_fit, test_data)) |>
bind_cols(test_data |> select(foraging))
head(rf_test_preds)
## # A tibble: 6 × 4
## .pred_No .pred_Yes .pred_class foraging
## <dbl> <dbl> <fct> <fct>
## 1 0.848 0.152 No No
## 2 0.713 0.287 No Yes
## 3 0.118 0.882 Yes Yes
## 4 0.375 0.625 Yes No
## 5 0.581 0.419 No Yes
## 6 0.761 0.239 No Yes
rf_accuracy <- accuracy(rf_test_preds, truth = foraging, estimate = .pred_class)
rf_bal_accuracy <- bal_accuracy(rf_test_preds, truth = foraging, estimate = .pred_class)
rf_roc_auc <- roc_auc(
rf_test_preds,
truth = foraging,
.pred_Yes,
event_level = "second"
)
rf_summary <- bind_rows(
rf_accuracy,
rf_bal_accuracy,
rf_roc_auc
) |>
select(.metric, .estimate)
rf_summary
## # A tibble: 3 × 2
## .metric .estimate
## <chr> <dbl>
## 1 accuracy 0.792
## 2 bal_accuracy 0.789
## 3 roc_auc 0.872
rf_conf_mat <- conf_mat(rf_test_preds, truth = foraging, estimate = .pred_class)
rf_conf_mat
## Truth
## Prediction No Yes
## No 266 74
## Yes 52 213
rf_engine <- extract_fit_engine(final_rf_fit)
vip(rf_engine, num_features = 12) +
labs(title = "Random Forest Variable Importance: Predicting Foraging")
This section helps show whether Random Forest performs better than one tree.
tree_spec <- decision_tree(
cost_complexity = tune(),
tree_depth = tune(),
min_n = tune()
) |>
set_mode("classification") |>
set_engine("rpart")
tree_workflow <- workflow() |>
add_recipe(squirrel_recipe) |>
add_model(tree_spec)
set.seed(624)
tree_tuned <- tune_grid(
tree_workflow,
resamples = folds,
grid = 15,
metrics = metric_set(accuracy, bal_accuracy, roc_auc)
)
best_tree <- select_best(tree_tuned, metric = "roc_auc")
final_tree_workflow <- finalize_workflow(tree_workflow, best_tree)
final_tree_fit <- fit(final_tree_workflow, data = train_data)
tree_test_preds <- predict(final_tree_fit, test_data, type = "prob") |>
bind_cols(predict(final_tree_fit, test_data)) |>
bind_cols(test_data |> select(foraging))
rf_compare <- bind_rows(
tibble(
model = "Random Forest",
metric = c("accuracy", "bal_accuracy", "roc_auc"),
estimate = c(
accuracy(rf_test_preds, truth = foraging, estimate = .pred_class)$.estimate,
bal_accuracy(rf_test_preds, truth = foraging, estimate = .pred_class)$.estimate,
roc_auc(rf_test_preds, truth = foraging, .pred_Yes, event_level = "second")$.estimate
)
),
tibble(
model = "Single Tree",
metric = c("accuracy", "bal_accuracy", "roc_auc"),
estimate = c(
accuracy(tree_test_preds, truth = foraging, estimate = .pred_class)$.estimate,
bal_accuracy(tree_test_preds, truth = foraging, estimate = .pred_class)$.estimate,
roc_auc(tree_test_preds, truth = foraging, .pred_Yes, event_level = "second")$.estimate
)
)
)
rf_compare
## # A tibble: 6 × 3
## model metric estimate
## <chr> <chr> <dbl>
## 1 Random Forest accuracy 0.792
## 2 Random Forest bal_accuracy 0.789
## 3 Random Forest roc_auc 0.872
## 4 Single Tree accuracy 0.798
## 5 Single Tree bal_accuracy 0.796
## 6 Single Tree roc_auc 0.846
In this squirrel example, Random Forest provides a practical way to
predict whether a squirrel is foraging using time,
location, appearance, and behavior variables. Compared with the earlier
approaches target, foraging is a more balanced
and more presentation-friendly outcome, so the model evaluation is
easier to interpret. This makes it a stronger teaching example for
showing how Random Forest works on real-world observational data.
model_summary <- rf_summary
model_summary
## # A tibble: 3 × 2
## .metric .estimate
## <chr> <dbl>
## 1 accuracy 0.792
## 2 bal_accuracy 0.789
## 3 roc_auc 0.872