Jeffrey Pine Beetle Outbreak

---
title: "Jeffrey Pine Beetle Outbreak"
output: 
  flexdashboard::flex_dashboard:
    favicon: "img/pine_beetle.png"
    logo: "img/pine_beetle.png"
    social: "menu"
    source_code: "embed"
    theme:
      version: 4
      bg: "#0A1628"
      fg: "#E1F1FF" 
      primary: "#364055"
      base_font:
        google: "EB Garamond"
      code_font:
        google: "JetBrains Mono"
---

```{r setup, include=FALSE}

# =========================================================
# LOAD LIBRARIES
# =========================================================
library(conflicted)
library(DT)
library(flexdashboard)
library(labelled)
library(plotly)
library(tidymodels)
library(tidyverse)
library(vip)
library(performance)
library(see)

# =========================================================
# RESOLVE CONFLICTS
# =========================================================
conflicted::conflicts_prefer(dplyr::filter)
conflicted::conflicts_prefer(dplyr::select)
conflicted::conflicts_prefer(plotly::layout)

# =========================================================
# COLOR PALETTE
# =========================================================
nav_color       <- "#364055"
font_color      <- "#E1F1FF"
bg_color        <- "#0A1628"

# SCATTER PLOT COLORS
scatter_plot_colors <- c(
  "healthy" = "#006600",
  "infested" = "#7B2D2D"
)
healthy_color   <- "#006600"
infested_color  <- "#7B2D2D"

# BAR PLOT COLORS
bar_plot_colors <- c(
  "Ridge"  = "#2E6A9E",
  "Lasso"  = "#2A7A5E"
)


# =========================================================
# LOAD DATA
# =========================================================
pine_tbl <- readr::read_csv(
  file      = "data/Data_1993.csv",
  col_types = "ifddiidfdddddddddddddfffff"
) |>
  labelled::set_variable_labels(
    # =====================================================
    # Tree Characteristics 
    # =====================================================
    TreeNum      = "unique tree identifier",
    Response     = "1 = infested, 0 = not infested",
    Easting      = "horizontal (east-west) position",
    Northing     = "vertical (north-south) position",
    TreeDiam     = "tree diameter/size",
    Infest_Serv1 = "infestation severity",
    Infest_Serv2 = "infestation severity",
    DeadDist     = "minimum linear distance to nearest brood",
    
    # =====================================================
    # Neighborhood Characteristics 
    # =====================================================
    SDI_20th          = "Stand Density Index at .05 acre",
    BA_20th           = "basal area at .05 acre",
    `Neigh_SDI_1/4th` = "Stand Density Index at .25 acre",
    `Neigh_1/4th`     = "basal area within .25 acre",
    `Neigh_1/2th`     = "basal area within .50 acre",
    Neigh_1           = "basal area within 1 acre",
    Neigh_1.5         = "basal area within 1.5 acres",
    
    # =====================================================
    # Basal Area (Infested Trees)
    # =====================================================
    BA_Inf_20th       = "infested basal area within .05 acre",
    `BA_Infest_1/4th` = "infested basal area within .25 acre",
    `BA_Infest_1/2th` = "infested basal area within .50 acre",
    BA_Infest_1       = "infested basal area within 1 acre",
    BA_Infest_1.5     = "infested basal area within 1.5 acres",
    
    # =====================================================
    # Indicators (Boolean variables)
    # =====================================================
    Ind_DeadDist          = "nearest brood tree within 50 m",
    IND_BA_Infest_20th    = "infested tree in neighborhood",
    `IND_BA_Infest_1/4th` = "infested tree within .25 acre",
    `IND_BA_Infest_1/2th` = "infested tree within .50 acre",
    IND_BA_Infest_1       = "infested tree within 1 acre",
    IND_BA_Infest_1.5     = "infested tree within 1.5 acres"
  )
```

```{r helper_functions, include=FALSE}
# ========================================================
# HELPER FUNCTIONS
# ========================================================

# ---------------------------------------------------------
# Predicted vs. Actual scatter plot
# ---------------------------------------------------------
make_pred_scatter <- function(.preds_tbl) {
  ref_range <- range(c(.preds_tbl$actual, .preds_tbl$predicted), na.rm = TRUE)

  plot_ly() |>
    add_trace(
      data      = .preds_tbl,
      x         = ~actual,
      y         = ~predicted,
      type      = "scatter",
      mode      = "markers",
      color     = ~response_label,
      colors    = scatter_plot_colors,
      text      = ~tooltip,
      hoverinfo = "text",
      marker    = list(size = 7, opacity = 0.55)
    ) |>
    add_trace(
      x          = ref_range,
      y          = ref_range,
      type       = "scatter",
      mode       = "lines",
      line       = list(dash = "dot", color = font_color, width = 1.5),
      showlegend = FALSE,
      hoverinfo  = "skip",
      inherit    = FALSE
    ) |>
    plotly::layout(
      paper_bgcolor = bg_color,
      plot_bgcolor  = bg_color,
      xaxis  = list(
        title     = "Actual DeadDist (m)",
        color     = font_color,
        gridcolor = nav_color
      ),
      yaxis  = list(
        title     = "Predicted DeadDist (m)",
        color     = font_color,
        gridcolor = nav_color
      ),
      legend = list(
        orientation = "h",
        y           = 1,
        font        = list(color = font_color),
        bgcolor     = nav_color
      ),
      hovermode = "closest"
    )
}

# ---------------------------------------------------------
# Create a gauge
# ---------------------------------------------------------
create_gauge <- function(value, label) {
  flexdashboard::gauge(
  value = value,
  min = 0,
  max = 100,
  symbol = "%",
  label = label,
  )
}

# ---------------------------------------------------------
# Create a RMSE value box
# ---------------------------------------------------------
create_valuebox <- function(value) {
  flexdashboard::valueBox(
    value   = value,
    caption = str_glue("On average, predicted values are {value} from the actual value."),
    icon    = "fa-ruler-horizontal",
    color   = nav_color
  )
}

# ---------------------------------------------------------
# Look up a variable label by column name
# ---------------------------------------------------------
get_var_label <- function(var_name) {
  labelled::var_label(pine_tbl[[var_name]]) %||% var_name
}
```


Outbreak Tracker {data-orientation="rows"}
============================================================================

## Row

### Total Trees

```{r valueBox_total}
# total number of trees in the study
total_trees   <- nrow(pine_tbl)

# UI: statistic value box
flexdashboard::valueBox(
  value   = str_glue("{scales::comma(total_trees)}"),
  caption = "Trees Observed",
  icon    = "fa-tree",
  color   = nav_color
)
```

### Infested Trees

```{r valueBox_infested}
# proportion of trees that are infested
pct_infested  <- round(mean(pine_tbl$Response == "1") * 100, 1)

# UI: statistic value box
flexdashboard::valueBox(
  value   = str_glue("{pct_infested} %"),
  caption = "Trees Infested",
  icon    = "fa-bug",
  color   = infested_color
)
```

### Mean Brood Distance

```{r valueBox_brood}
# average distance (m) to nearest brood tree
mean_brood <- round(mean(pine_tbl$DeadDist, na.rm = TRUE), 1)

# UI: statistic value box
flexdashboard::valueBox(
  value   = str_glue("{mean_brood} m"),
  caption = "Mean Distance to Nearest Brood Tree",
  icon    = "fa-ruler-horizontal",
  color   = nav_color
)
```

### Median Tree Diameter

```{r valueBox_diameter}
# median tree diameter (cm) in the study
median_diam <- median(pine_tbl$TreeDiam, na.rm = TRUE)

# UI: statistic value box
flexdashboard::valueBox(
  value   = str_glue("{median_diam} cm"),
  caption = "Median Tree Diameter",
  icon    = "fa-circle-notch",
  color   = nav_color
)
```

## Row

### Outbreak Tracker 📍 {data-padding=20}

```{r outbreak_tracker}

# ========================================================
# FILTER & TIDY: Prepare the dataset for plotting.
# - Keep only columns needed for the spatial scatter.
# - Round nearest-brood distance to one decimal place.
# ========================================================
outbreak_tbl <- pine_tbl |>
  select(
    TreeNum,  # tree id
    TreeDiam, # tree diameter 
    DeadDist, # distance to nearest brood tree 
    Easting,  # UTM easting coordinate
    Northing, # UTM northing coordinate 
    Response  # infested (1) vs. not infested (0)
  ) |>
  mutate(
    DeadDist = round(DeadDist, 1),
    Response = factor(Response, 
                      levels = c(0, 1), 
                      labels = c("Healthy", "Infested"))
  )

# ========================================================
# UI PLOT: Spatial Scatter
# ========================================================
plot_ly(
  data        = outbreak_tbl,
  x           = ~Easting,
  y           = ~Northing,
  type        = "scatter",
  mode        = "markers",
  color       = ~Response,
  colors      = c("Healthy" = healthy_color, "Infested" = infested_color),
  hoverinfo   = "text",
  showlegend  = TRUE,
  text        = ~stringr::str_glue("<b>Tree #{TreeNum}</b><br>",
                                   "Diameter: {TreeDiam} cm<br>",
                                   "Nearest Brood: {DeadDist} m<br>",
                                   "Easting: {Easting}<br>",
                                   "Northing: {Northing}<br>"),
  marker      = list(
    size      = ~TreeDiam,
    sizemode  = "diameter",
    sizemin   = 1,
    sizeref   = 2.5
  )
) |>
plotly::layout(
  hovermode     = "closest",
  dragmode      = "pan",
  paper_bgcolor = bg_color,
  plot_bgcolor  = bg_color,
  legend        = list(
    orientation = "h",
    y           = 1,
    font        = list(color = font_color),
    bgcolor     = nav_color
  ),
  xaxis = list(title = "UTM Easting",color = font_color,tickformat = "d"),
  yaxis = list(title = "UTM Northing", color = font_color, tickformat = "d")
)
```

> This spatial scatter plot shows the location of each tree in the study, colored by infestation status. The size of each point corresponds to tree diameter. Hover over points to see detailed information about each tree.

Linear Regression {data-navmenu="Regression" data-orientation=rows .hidden}
============================================================================

```{r split_data, include=FALSE, cache = TRUE}
# --- Data Split ------------------------------------------
set.seed(1234)
pine_data_split <- initial_split(pine_tbl, prop = .8)
pine_train_data <- training(pine_data_split)
pine_test_data  <- testing(pine_data_split)
```

```{r linear_regression, include=FALSE, cache = TRUE}
# ========================================================
# LINEAR REGRESSION MODEL
# ========================================================
# --- Model Specification ---------------------------------
linear_model_spec <- 
  linear_reg() |>
  set_engine("lm")

# --- Preprocessing Recipe --------------------------------
linear_recipe <- pine_train_data |>
  recipe(DeadDist ~ Neigh_1.5 + BA_Infest_1.5 + `BA_Infest_1/4th`) |>
  step_sqrt(all_outcomes()) |> 
  step_corr(all_predictors())

# --- Workflow --------------------------------------------
linear_workflow <- 
  workflow() |>
  add_model(linear_model_spec) |>
  add_recipe(linear_recipe)
  
# --- Fit on Training Data --------------------------------
linear_fitted <- linear_workflow |>
  fit(data = pine_train_data)

# --- Evaluate on Test Data -------------------------------
linear_last_fit <- linear_workflow |>
  last_fit(pine_data_split)

# --- Variable Importance ---------------------------------
linear_vip_data <- linear_fitted |>
  extract_fit_parsnip() |>
  vip::vi() |>
  mutate(
    Variable   = str_remove_all(Variable, "`"),
    Importance = round(Importance / sum(Importance) * 100, 0)
  )

# --- Metrics (back-transformed to original units) --------
linear_rsq <- linear_last_fit |>
  collect_metrics() |>
  filter(.metric == "rsq") |>
  pull(.estimate) |>
  round(2)

linear_rmse <- linear_last_fit |>
  collect_predictions() |>
  mutate(
    pred_orig   = .pred^2,
    actual_orig = DeadDist^2
  ) |>
  summarise(rmse = sqrt(mean((pred_orig - actual_orig)^2))) |>
  pull(rmse) |>
  round(2)

# --- Predictions Tibble ----------------------------------
linear_preds_tbl <- linear_last_fit |>
  collect_predictions() |>
  mutate(
    actual         = DeadDist^2,
    predicted      = .pred^2,
    response_label = dplyr::case_when(
      pine_tbl$Response[.row] == "0" ~ "healthy",
      pine_tbl$Response[.row] == "1" ~ "infested",
      TRUE ~ "unknown"
    ),
    tooltip = str_glue(
      "<b>Actual</b>: {round(actual, 2)} m<br>",
      "<b>Predicted</b>: {round(predicted, 2)} m"
    )
  )
```

## Row

### VIP 1 {.no-title}

```{r linear_gauges_1}
create_gauge(linear_vip_data$Importance[1], label = linear_vip_data$Variable[1])
```

> Variable importance of `r get_var_label(linear_vip_data$Variable[1])`.

### VIP 2 {.no-title}

```{r linear_gauges_2}
create_gauge(linear_vip_data$Importance[2], label = linear_vip_data$Variable[2])
```

> Variable importance of `r get_var_label(linear_vip_data$Variable[2])`.

### VIP 3 {.no-title}

```{r linear_gauges_3}
create_gauge(linear_vip_data$Importance[3], label = linear_vip_data$Variable[3])
```

> Variable importance of `r get_var_label(linear_vip_data$Variable[3])`.

### R² {.no-title}

```{r linear_gauge_rsq}
create_gauge(linear_rsq*100, label = str_glue("R²: {linear_rsq}"))
```

> The amount of variability accounted for in `DeadDist`.

### RMSE indicator {.no-title}

```{r linear_rmse}
create_valuebox(str_glue("{linear_rmse} m"))
```

## Row

### Predicted vs. Actual

```{r linear_scatter}
make_pred_scatter(linear_preds_tbl)
```

> Points near the dashed line indicate accurate predictions. Greater uncertainty at larger distances is expected given the right-skewed distribution of `DeadDist`.

Ridge Regression {data-navmenu="Regression" data-orientation=rows .hidden}
============================================================================

```{r ridge_regression, include=FALSE, cache = TRUE}
# ========================================================
# RIDGE REGULARIZATION MODEL
# ========================================================

# --- Model Specification ---------------------------------
# penalty controls degree of shrinkage (lambda); estimated by Dr. Smirnova
# mixture = 0 specifies pure ridge (vs. mixture = 1 for lasso)
ridge_model_spec <-
  linear_reg(mixture = 0, penalty = 0.1629751) |>
  set_engine("glmnet")

# --- Preprocessing Recipe --------------------------------
pine_recipe <- pine_train_data |>
  recipe(DeadDist ~ Neigh_1.5 + BA_Infest_1.5 + `BA_Infest_1/4th`) |>
  step_sqrt(all_outcomes()) |>
  step_corr(all_predictors()) |>
  step_normalize(all_numeric(), -all_outcomes()) |>
  step_zv(all_numeric(), -all_outcomes())


# --- Workflow --------------------------------------------
pine_ridge_workflow <-
  workflow() |>
  add_model(ridge_model_spec) |>
  add_recipe(pine_recipe)

# --- Fit on Training Data --------------------------------
pine_ridge_fitted <-
  pine_ridge_workflow |>
  fit(data = pine_train_data)

# --- Evaluate on Test Data -------------------------------
ridge_last_fit <- pine_ridge_workflow |>
  last_fit(pine_data_split)

# --- Variable Importance ---------------------------------
ridge_vip_data <- pine_ridge_fitted |>
  extract_fit_parsnip() |>
  vip::vi() |>
  mutate(Importance = round(Importance / sum(Importance) * 100, 0))

# --- Metrics (back-transformed to original units) --------
ridge_rsq <- ridge_last_fit |>
  collect_metrics() |>
  filter(.metric == "rsq") |>
  pull(.estimate) |>
  round(2)

ridge_rmse <- ridge_last_fit |>
  collect_predictions() |>
  mutate(
    pred_orig   = .pred^2,
    actual_orig = DeadDist^2
  ) |>
  summarise(rmse = sqrt(mean((pred_orig - actual_orig)^2))) |>
  pull(rmse) |>
  round(2)


# --- Predictions Tibble ----------------------------------
ridge_preds_tbl <- ridge_last_fit |>
  collect_predictions() |>
  mutate(
    actual         = DeadDist^2,
    predicted      = .pred^2,
    response_label = dplyr::case_when(
      pine_tbl$Response[.row] == "0" ~ "healthy",
      pine_tbl$Response[.row] == "1" ~ "infested",
      TRUE ~ "unknown"
    ),
    tooltip = str_glue(
      "<b>Actual</b>: {round(actual, 2)} m<br>",
      "<b>Predicted</b>: {round(predicted, 2)} m"
    )
  )
```

## Row

### VIP 1 {.no-title}

```{r ridge_gauges_1}
create_gauge(ridge_vip_data$Importance[1], label = ridge_vip_data$Variable[1])
```

> Variable importance of `r get_var_label(ridge_vip_data$Variable[1])`.

### VIP 2 {.no-title}

```{r ridge_gauges_2}
create_gauge(ridge_vip_data$Importance[2], label = ridge_vip_data$Variable[2])
```

> Variable importance of `r get_var_label(ridge_vip_data$Variable[2])`.

### VIP 3 {.no-title}

```{r ridge_gauges_3}
create_gauge(ridge_vip_data$Importance[3], label = ridge_vip_data$Variable[3])
```

> Variable importance of `r get_var_label(ridge_vip_data$Variable[3])`.

### R²  {.no-title}

```{r ridge_gauge_rsq}
create_gauge(ridge_rsq*100, label = str_glue("R²: {ridge_rsq}"))
```

> The amount of variability accounted for in `DeadDist`.

### RMSE indicator {.no-title}

```{r ridge_rmse}
create_valuebox(str_glue("{ridge_rmse} m"))
```

## Row

### Predicted vs. Actual

```{r ridge_scatter}
make_pred_scatter(ridge_preds_tbl)
```

> Points near the dashed line indicate accurate predictions. This model shows linear regression with Ridge regularization, so predictions reflect the raw least-squares fit on the test dataset. Greater uncertainty at larger distances is expected given the right-skewed distribution of `DeadDist`.


LASSO Regression {data-navmenu="Regression" data-orientation=rows .hidden}
============================================================================

```{r lasso_regression, include=FALSE, cache = TRUE}
# =========================================================
# LASSO REGRESSION MODEL
# =========================================================

# --- Bootstrap Resamples & Lambda Grid -------------------
# Use resampling + tuning to find the optimal penalty (lambda)
# rather than specifying a single value manually
set.seed(1234)
pine_boot_resamples <- bootstraps(pine_train_data)
lambda_candidate_grid <- grid_regular(penalty(), levels = 50)


# --- Model Specification ---------------------------------
# mixture = 1 specifies pure lasso (vs. mixture = 0 for ridge)
# penalty = tune() defers the lambda value to the tuning step
lasso_model_spec <-
  linear_reg(mixture = 1, penalty = tune()) |>
  set_engine("glmnet")

lasso_model_spec |> translate()


# --- Workflow --------------------------------------------
pine_lasso_workflow <-
  workflow() |>
  add_model(lasso_model_spec) |>
  add_recipe(pine_recipe)


# --- Tune Penalty Over Bootstrap Resamples ---------------
lasso_tuning_results <- tune_grid(
  pine_lasso_workflow,
  resamples = pine_boot_resamples,
  grid = lambda_candidate_grid
)

lasso_tuning_results |> collect_metrics()


# --- Select Best Penalty & Finalize Workflow -------------
# Select the penalty value that yields the lowest RMSE across resamples
# Alternatives: select_by_pct_loss() or select_by_one_std_err()
# see https://juliasilge.com/blog/lasso-the-office/
best_lambda <- lasso_tuning_results |>
  select_best(metric = "rmse")

# Substitute the chosen penalty back into the workflow
final_lasso_workflow <- finalize_workflow(
  pine_lasso_workflow,
  best_lambda
)


# --- Fit on Training Data --------------------------------
pine_lasso_fitted <-
  final_lasso_workflow |>
  fit(data = pine_train_data)

# Inspect fitted coefficients
pine_lasso_fitted |> extract_fit_parsnip() |> tidy()


# --- Evaluate on Test Data -------------------------------
# Refit finalized workflow on full training set
lasso_last_fit <- final_lasso_workflow |>
  last_fit(pine_data_split)



# --- Variable Importance ---------------------------------
lasso_vip_data <- pine_lasso_fitted |>
  extract_fit_parsnip() |>
  vip::vi() |>
  mutate(Importance = round(Importance / sum(Importance) * 100, 0))


# --- Metrics (back-transformed to original units) --------
lasso_rsq <- lasso_last_fit |>
  collect_metrics() |>
  filter(.metric == "rsq") |>
  pull(.estimate) |>
  round(2)

lasso_rmse <- lasso_last_fit |>
  collect_predictions() |>
  mutate(
    pred_orig   = .pred^2,
    actual_orig = DeadDist^2
  ) |>
  summarise(rmse = sqrt(mean((pred_orig - actual_orig)^2))) |>
  pull(rmse) |>
  round(2)


# --- Predictions Tibble ----------------------------------
lasso_preds_tbl <- lasso_last_fit |>
  collect_predictions() |>
  mutate(
    actual         = DeadDist^2,
    predicted      = .pred^2,
    response_label = dplyr::case_when(
      pine_tbl$Response[.row] == "0" ~ "healthy",
      pine_tbl$Response[.row] == "1" ~ "infested",
      TRUE ~ "unknown"
    ),
    tooltip = str_glue(
      "<b>Actual</b>: {round(actual, 2)} m<br>",
      "<b>Predicted</b>: {round(predicted, 2)} m"
    )
  )
```

## Row

### VIP 1 {.no-title}

```{r lasso_gauges_1}
create_gauge(lasso_vip_data$Importance[1], label = lasso_vip_data$Variable[1])
```

> Variable importance of `r get_var_label(lasso_vip_data$Variable[1])`.

### VIP 2 {.no-title}

```{r lasso_gauges_2}
create_gauge(lasso_vip_data$Importance[2], label = lasso_vip_data$Variable[2])
```

> Variable importance of `r get_var_label(lasso_vip_data$Variable[2])`.

### VIP 3 {.no-title}

```{r lasso_gauges_3}
create_gauge(lasso_vip_data$Importance[3], label = lasso_vip_data$Variable[3])
```

> Variable importance of `r get_var_label(lasso_vip_data$Variable[3])`.

### R² {.no-title}

```{r lasso_gauge_rsq}
create_gauge(lasso_rsq*100, label = str_glue("R²: {lasso_rsq}"))
```

> The amount of variability accounted for in `DeadDist`.

### RMSE indicator {.no-title}

```{r lasso_rmse}
create_valuebox(str_glue("{lasso_rmse} m"))
```

## Row

### Predicted vs. Actual

```{r lasso_scatter}
make_pred_scatter(lasso_preds_tbl)
```

> Points near the dashed line indicate accurate predictions. This model applies LASSO (L1) regularization, which can shrink some coefficients to exactly zero, performing variable selection. Greater uncertainty at larger distances is expected given the right-skewed distribution of `DeadDist`.


Data Dictionary 📚 {data-navmenu="Data" .hidden}
=============================================================================

```{r data_dictionary}
dict_tbl <- pine_tbl |>
  labelled::generate_dictionary() |>
  dplyr::select(variable, label) |>
  dplyr::arrange(variable)

# UI
DT::datatable(
  data      = dict_tbl,
  colnames  = NULL,
  rownames  = FALSE,
  options  = list(pageLength = 20, scrollX = TRUE)
)
```

Data Explorer 🧭 {data-navmenu="Data" .hidden}
=============================================================================

```{r data_explorer}
# Round all doubles except coordinates
dbl_cols <- pine_tbl |>
  dplyr::select(where(is.double)) |>
  dplyr::select(-Easting, -Northing) |>
  names()

# UI
DT::datatable(
  data     = pine_tbl,
  rownames = FALSE,
  options  = list(pageLength = 20, scrollX = TRUE)
) |>
  formatRound(columns = dbl_cols, digits = 1)
```