Predicting Pine Beetle Brood Tree Distance From the Pine Beetle Data Set

model	rmse	rsq	mae
Linear Regression	5.266969	0.9272151	3.004681
Random Forest	2.219862	0.9870917	1.527648

---
title: "Predicting Pine Beetle Brood Tree Distance From the Pine Beetle Data Set"
output:
  flexdashboard::flex_dashboard:
    orientation: rows
    vertical_layout: scroll
    theme: 
      version: 4
      bootswatch: minty
    source_code: embed
    #https://rstudio.github.io/flexdashboard/articles/theme.html
---

```{r setup, include=FALSE}
library(flexdashboard)
library(tidymodels)
library(tidyverse)
library(vip)
library(plotly)
library(gt)
library(readxl)
library(bslib)
library(ranger)
theme_set(theme_minimal())

pine_beetle <- read_xlsx(
  "~/Desktop/Spring Semester 2026/DataScience2/hgen-612/data/Data_1993.xlsx")
#pine_beetle <- read_xlsx("data/Data_1993.xlsx") # For reproducibility purposes
```

```{r data-prep}
pine_data <- pine_beetle %>%
  drop_na(DeadDist)

set.seed(123)

split <- initial_split(pine_data, prop = 0.8)

train <- training(split)
test  <- testing(split)
```

```{r recipe}
# make a recipe
pine_recipe <- recipe(DeadDist ~ ., data = train) %>%
  step_zv(all_predictors()) %>% # remove vars with no variation
  step_impute_mean(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())
```

```{r models}
lm_model <- linear_reg() %>%
  set_engine("lm")

rf_model <- rand_forest( #using a random forest model due to complexity of data, IDK much about ecology and I expect the var relationships to not all be linear.
  trees = 500,
  min_n = 5
) %>%
  set_engine("ranger") %>%
  set_mode("regression")
```

```{r workflows}
# compare lm and rf model
lm_workflow <- workflow() %>%
  add_recipe(pine_recipe) %>%
  add_model(lm_model)

rf_workflow <- workflow() %>%
  add_recipe(pine_recipe) %>%
  add_model(rf_model)

lm_fit <- fit(lm_workflow, train)
rf_fit <- fit(rf_workflow, train)
```

```{r predictions}
lm_preds <- predict(lm_fit, test) %>%
  bind_cols(test)

rf_preds <- predict(rf_fit, test) %>%
  bind_cols(test)
```

Page 1
=======================================================================

Column
-----------------------------------------------------------------------

### Dataset Overview
*Sample of predictors used to generate the brood tree distance model.*

```{r}
pine_data %>%
  sample_n(10) %>%
  gt()
```

Column
-----------------------------------------------------------------------

### Distribution of Distance to Brood Tree
*Understanding the response variable.*

```{r}
dist_plot <- pine_data %>%
  ggplot(aes(x = DeadDist)) + 
  geom_histogram(bins = 30, fill = "#2C7FB8", alpha = 0.8) +
  labs(
    title = "Distribution of Distance to Nearest Brood Tree",
    x = "Distance (DeadDist)",
    y = "Count"
  )

ggplotly(dist_plot) %>%
  layout(margin = list(l = 70, r = 30, t = 60, b = 100)) # created bigger margins b/c redering was cutting off my axis
```

Column
-----------------------------------------------------------------------

### Tree Diameter vs Distance
*Larger trees potentially influence brood tree spacing.*

```{r}
diam_plot <- pine_data %>%
  ggplot(aes(TreeDiam, DeadDist)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm") +
  labs(
    title = "Tree Diameter vs Distance to Brood Tree",
    x = "Diameter of Tree",
    y = "Distance to Brood Tree"
  )

ggplotly(diam_plot) %>%
  ggplotly(dist_plot) %>%
  layout(margin = list(l = 70, r = 30, t = 60, b = 100))
```

Page 2
=======================================================================

Column
-----------------------------------------------------------------------

### Linear Regression Model Results
*Predicted vs observed distances.*

```{r}
lm_plot <- lm_preds %>%
  ggplot(aes(DeadDist, .pred)) +
  geom_point(alpha = 0.4) +
  geom_abline(linetype = 2, color = "red") +
  labs(
    title = "Linear Regression Predictions",
    x = "Observed Distance",
    y = "Predicted Distance"
  )

ggplotly(lm_plot) %>%
  ggplotly(dist_plot) %>%
  layout(margin = list(l = 70, r = 30, t = 60, b = 100))
```

Column
-----------------------------------------------------------------------

### Random Forest Model Results
*Comparing nonlinear prediction and its performance.*

```{r}
rf_plot <- rf_preds %>%
  ggplot(aes(DeadDist, .pred)) +
  geom_point(alpha = 0.4) +
  geom_abline(linetype = 2, color = "red") +
  labs(
    title = "Random Forest Predictions",
    x = "Observed Distance",
    y = "Predicted Distance"
  )

ggplotly(rf_plot) %>%
  ggplotly(dist_plot) %>%
  layout(margin = list(l = 70, r = 30, t = 60, b = 100))
```

Column
-----------------------------------------------------------------------

### Comparing Performance of the Models 
*Random forest model improves predictive accuracy.*

```{r}
lm_metrics <- lm_preds %>%
  metrics(truth = DeadDist, estimate = .pred) %>%
  mutate(model = "Linear Regression")

rf_metrics <- rf_preds %>%
  metrics(truth = DeadDist, estimate = .pred) %>%
  mutate(model = "Random Forest")

bind_rows(lm_metrics, rf_metrics) %>%
  select(model, .metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%
  gt()
```

Page 3
=======================================================================

Column
-----------------------------------------------------------------------

### Residual Diagnostics
*Evaluating model errors.*

```{r}
res_plot <- lm_preds %>%
  mutate(residual = DeadDist - .pred) %>%
  ggplot(aes(.pred, residual)) +
  geom_point(alpha = 0.4) +
  geom_hline(yintercept = 0, linetype = 2) +
  labs(
    title = "Residual Diagnostics",
    x = "Predicted Distance",
    y = "Residual"
  )

ggplotly(res_plot) %>%
  layout(margin = list(l = 70, r = 30, t = 60, b = 100))
```

Column
-----------------------------------------------------------------------

### QQ Plot of Residuals
*Assessing normality.*

``` {r}
qq_plot <- lm_preds %>%
  mutate(residual = DeadDist - .pred) %>%
  ggplot(aes(sample = residual)) +
  stat_qq(alpha = 0.5) +
  stat_qq_line(color = "red", linetype = 2) +
  labs(
    title = "QQ Plot of Residuals",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  )

qq_plot 
```

TreeNum	Response	Easting	Northing	TreeDiam	Infest_Serv1	Infest_Serv2	Ind_DeadDist	DeadDist	SDI_20th	Neigh_SDI_1/4th	BA_20th	Neigh_1/4th	Neigh_1/2th	Neigh_1	Neigh_1.5	BA_Inf_20th	BA_Infest_1/4th	BA_Infest_1/2th	BA_Infest_1	BA_Infest_1.5	IND_BA_Infest_20th	IND_BA_Infest_1/4th	IND_BA_Infest_1/2th	IND_BA_Infest_1	IND_BA_Infest_1.5
7719	0	766033.4	4333971	9	11	9.440874	1	26.587816	40.54258	122.12179	27.160920	83.48983	140.88773	233.2239	304.0278	0.000000	0.000000	0.000000	9.440874	9.440874	0	0	0	1	1
6102	0	766081.2	4334437	6	1	0.545400	1	11.383459	23.76008	99.13796	13.395024	59.38861	125.05477	254.7127	410.1790	0.000000	0.545400	0.545400	1.876176	3.430566	0	1	1	1	1
7071	0	766183.9	4334292	23	23	11.786094	1	1.486691	36.97867	176.81597	24.401196	125.31656	194.07514	321.6606	445.6136	2.721546	7.041114	14.027688	31.431402	34.676532	1	1	1	1	1
6579	0	766198.8	4334411	11	1	0.349056	1	44.193197	24.05113	105.57015	13.607730	58.25963	103.73508	186.2268	253.9710	0.000000	0.000000	0.000000	0.000000	0.000000	0	0	0	0	0
6702	0	766213.6	4334072	9	12	6.337548	1	44.207482	44.10163	105.32468	28.562598	70.79837	158.23690	277.5486	366.5033	0.000000	0.000000	0.000000	0.000000	0.000000	0	0	0	0	0
1682	0	765975.3	4334312	9	1	1.396224	1	32.804310	10.68820	54.87602	6.812046	34.57836	69.81665	182.5890	264.6281	0.000000	0.000000	0.000000	1.745280	8.497332	0	0	0	1	1
2757	0	766201.6	4334115	6	12	6.337548	1	39.707111	23.13676	86.56978	15.914772	54.12550	93.90697	215.5421	332.7104	0.000000	0.000000	0.000000	0.000000	2.339766	0	0	0	0	1
9285	0	766045.2	4334370	11	1	0.441774	1	18.888134	39.60248	139.96822	21.325140	76.10512	159.37133	267.1806	386.7759	0.000000	0.000000	0.441774	2.023434	6.735690	0	0	1	1	1
8939	1	766005.5	4334310	13	11	9.451782	1	4.884396	23.79555	136.32957	14.093136	84.84242	144.55282	280.3029	394.7496	0.349056	4.417740	7.624692	14.736708	16.623792	1	1	1	1	1
887	0	766155.6	4333960	18	2	2.361582	1	4.714548	33.47555	141.50770	24.439374	103.86598	151.95389	303.0079	433.9148	2.361582	2.361582	2.906982	43.495650	91.272690	1	1	1	1	1