Hypothesis 2: Higher DBH will be correlated with a higher percentage of canker on the trunk and base

Data Import

Importing 2024 & 2025 Data

See the full code for the break down for the importing and cleaning open the documents of the same name. Or here: - 2024 cleaning code and - 2025 cleaning code

Create the R files if needed.

Purl (or stitch) together R code from the markdown cleaning files. These stitched files are stored in ‘purl’ folder with the date of the code purled.

# library(knitr)
# 
# purl("cleaning_2024_health_form_data.Rmd", 
#      output = 
#        paste("purl/cleaning_2024_health_form_data",
#        Sys.Date(),".R"))
# 
# purl("cleaning_2025_health_form_data.Rmd", 
#      output = 
#        paste("purl/cleaning_2025_health_form_data",
#        Sys.Date(),".R"))

Run the R files to get cleaned data

Next, we can run those extracted R files to actually import and clean the data.

## 2024 data
source(paste("purl/cleaning_2024_health_form_data", Sys.Date(), ".R"))

## 2025 data
source(paste("purl/cleaning_2025_health_form_data", Sys.Date(), ".R"))

Combining 2024 and 2025 data along canker area & DBH

library(dplyr)

# Add a 'year' column to each dataset to allow grouping/plotting by year
health_assess_2024 <- health_assess_2024 %>%
  mutate(year = 2024)

health_assess_2025 <- health_assess_2025 %>%
  mutate(year = 2025)

# Keep only the necessary columns for this hypothesis
sliced_2024 <- health_assess_2024 %>%
  select(year, dbh_cm, base_canker_area, trunk_canker_area) 

sliced_2025 <- health_assess_2025 %>%
  select(year, dbh_cm, base_canker_area, trunk_canker_area) 

# Combine the two data frames
combined_2024_2025 <- bind_rows(sliced_2024, sliced_2025) 

Setup of Analysis

This chunk defines a re-usable function to run a regression across multiple input datas.

do_hypoth2_analysis <- function(data, predictor, response, xlabel, ylabel, color = "black") {
  # Define linear model
  model <- lm(reformulate(predictor, response), data = data)
  coefs <- coef(model)
  r_squared <- summary(model)$r.squared
  
  # Create equation and R^2 text
  equation <- paste0("y = ", round(coefs[2], 2), "x + ", round(coefs[1], 2))
  r2_text <- paste0("italic(R)^2 == ", round(r_squared, 4))
  
  # Get plotting ranges
  max_x <- max(data[[predictor]], na.rm = TRUE)
  
  max_y <- max(data[[response]], na.rm = TRUE)
  min_y <- min(data[[response]], na.rm = TRUE)
  range_y <- max_y - min_y
  
  # Annotate sample size near the bottom-right
  sample_size_y_position <- min_y + 0.08 * range_y
  
  # Annotate equation and r2 near the top-right
  eq_y_position <- max_y - 0.05 * range_y
  r2_y_position <- max_y - 0.12 * range_y
  
  # PLOT ------------------------------------------------------
  ggplot(data, aes_string(x = predictor, y = response)) +
    # Points
    geom_point(color = color) +
    
    # Line of best fit
    geom_smooth(method = "lm", color = color) +
    
    # Axes Label & Theme
    labs(x = xlabel, y = ylabel) +  # Set the y-axis label
    theme(
      axis.title.y = element_text(color = color) # Change y-axis label color
    ) + 
    
    # Samples size text
    annotate(
      "text", 
      x = max_x,
      y = sample_size_y_position,
      label = paste("n = ", count(data)),
      hjust = 1,
      size = 6,
      color = "black"
    ) +
    
    # Equation and R^2 text
    annotate(
      "text",
      x = max_x,
      y = eq_y_position,
      label = equation,
      hjust = 1,
      size = 4,
      color = color
    ) +
    annotate(
      "text",
      x = max_x,
      y = r2_y_position,
      label = r2_text,
      hjust = 1,
      size = 4,
      color = color,
      parse = TRUE # Ensures that italics styling works
    )
}

Data Analysis

Here are the major delineations of questions across this hypotheses: * Done ? In-progress

Notably: * I’d like to include both base and trunk canker area on the same graph; I think that could be very nice

Patterns overall– across all individuals (all sites, years, and age classes)

library(scales)

hex <- hue_pal()(3)

patterns_across_all_base <- do_hypoth2_analysis(
  data = combined_2024_2025, 
  predictor = "dbh_cm", 
  response = "base_canker_area",
  xlabel = "All Individuals DBH (cm)",
  ylabel = "% of Base Canker Area",
  color = hex[1]
)
patterns_across_all_base

patterns_across_all_trunk <- do_hypoth2_analysis(
  data = combined_2024_2025, 
  predictor = "dbh_cm", 
  response = "trunk_canker_area",
  xlabel = "All Individuals DBH (cm)",
  ylabel = "% of Trunk Canker Area",
  color = hex[2]
)
patterns_across_all_trunk

Patterns across years

Patterns across 2025

library(scales)

hex <- hue_pal()(3)

patterns_across_2025_base <- do_hypoth2_analysis(
  data = health_assess_2025, 
  predictor = "dbh_cm", 
  response = "base_canker_area",
  xlabel = "2025 Individuals DBH (cm)",
  ylabel = "% of Base Canker Area",
  color = hex[1]
)
patterns_across_2025_base

patterns_across_2025_trunk <- do_hypoth2_analysis(
  data = health_assess_2025, 
  predictor = "dbh_cm", 
  response = "trunk_canker_area",
  xlabel = "2025 Individuals DBH (cm)",
  ylabel = "% of Trunk Canker Area",
  color = hex[2]
)
patterns_across_2025_trunk

patterns_across_2025_girdled <- do_hypoth2_analysis(
  data = health_assess_2025, 
  predictor = "dbh_cm", 
  response = "girdled_canker_circum",
  xlabel = "2025 Individuals DBH (cm)",
  ylabel = "% of Circumference Girdled",
  color = hex[3]
)
patterns_across_2025_girdled

Patterns across 2024

library(scales)

hex <- hue_pal()(3)

patterns_across_2024_base <- do_hypoth2_analysis(
  data = health_assess_2024, 
  predictor = "dbh_cm", 
  response = "base_canker_area",
  xlabel = "All 2024 Individuals DBH (cm)",
  ylabel = "% of Base Canker Area",
  color = hex[1]
)
patterns_across_2024_base

patterns_across_2024_trunk <- do_hypoth2_analysis(
  data = health_assess_2024, 
  predictor = "dbh_cm", 
  response = "trunk_canker_area",
  xlabel = "All 2024 Individuals DBH (cm)",
  ylabel = "% of Trunk Canker Area",
  color = hex[2]
)
patterns_across_2024_trunk

Patterns across sites ?

  • Need to clean the data for the other sites before we can do this sections (07-21-2025)

Results

Patterns Overall: Positive correlation between base canker area and DBH, but no correlation with trunk canker area.

library(patchwork)

patterns_across_all_base + patterns_across_all_trunk 

2024 Canker areas: No correlation between DBH and % canker areas in 2024.

library(patchwork)

patterns_across_2024_base + patterns_across_2024_trunk

2025 Base canker area: Positive correlation between DBH and % base canker area

I would interpret this as older trees correlate with larger base canker reas.

patterns_across_2025_base

2025 Girdled circumference: Negative correlation between DBH and % circumference girdled

This result would indicate that younger trees have slightly higher girdling.

We may be seeing this trend due to an inaccuracy with attempting to estimate circumference on smaller trees.

patterns_across_2025_girdled

Across years: Appears to be an increase in DBH ~ Canker Area over 2024 and 2025.

(patterns_across_2024_base + patterns_across_2025_base) + plot_annotation("% Base Canker Area 2024 versus 2025")

(patterns_across_2024_trunk + patterns_across_2025_trunk) + plot_annotation("% Trunk Canker Area 2024 versus 2025")

Future work

Comparing base and trunk areas on same graph

Two response variables (base/trunk compared together)

# 
# # This ensures adds a DBH measure to each of the canker entries, such that they can be plotted against each other. 
# data <- health_assess_2025
# predictor <- "dbh_cm"
# response1 <- "base_canker_area"
# 
# long_data <- data %>%
#   select(all_of(c(predictor, response1, response2))) %>%
#   pivot_longer(
#     cols = c(all_of(response1), all_of(response2)),
#     names_to = "response_type",
#     values_to = "response_value"
#   )
# 
# # Fit models and extract stats for annotation
# annotations <- long_data %>%
#   group_by(response_type) %>% # Collects all of the base/trunk canker area readings together
#   group_modify(~ {
#     model <- lm(response_value ~ get(predictor), data = long_health_data)
#     coefs <- coef(model)
#     r_squared <- summary(model)$r.squared
#     eq <- paste0("y == ", round(coefs[2], 2), " * x + ", round(coefs[1], 2))
#     r2 <- paste0("italic(R)^2 == ", round(r_squared, 4))
#     data.frame(equation = eq, r2 = r2)
#   })
# 
# # Determine top-right positions for annotation
#   max_x <- max(long_data[[predictor]], na.rm = TRUE)
#   max_y <- max(long_data$response_value, na.rm = TRUE)
#   min_y <- min(long_data$response_value, na.rm = TRUE)
#   range_y <- max_y - min(long_data$response_value, na.rm = TRUE)
# 
#   # Add y positions for annotations
#   # annotations["base_canker_area"]$eq_y <- 
#   annotations$eq_y <- min_y + 0.05 * range_y
#   annotations$r2_y <- min_y + 0.12 * range_y
#   annotations$x <- max_x
# 
#   # Plot
#   ggplot(long_data, aes_string(x = predictor, y = "response_value", color = "response_type")) +
#     geom_point() +
#     geom_smooth(method = "lm", se = FALSE) +
#     # xlab(xlabel) +
#     # ylab(ylabel) +
#     # Add annotations for each response type
#     geom_text(data = annotations,
#               aes(x = x, y = eq_y, label = equation, color = response_type),
#               hjust = 1, size = 4, parse = TRUE, inherit.aes = FALSE) +
#     geom_text(data = annotations,
#               aes(x = x, y = r2_y, label = r2, color = response_type),
#               hjust = 1, size = 4, parse = TRUE, inherit.aes = FALSE) +
#     theme_minimal()
# 
# view(annotations)