Hypothesis 3: Percent Live canopy will positively correlate with percent canker areas

Data Import

Click for data import info

Importing 2024 & 2025 Data

See the full code for the break down for the importing and cleaning open the documents of the same name. Or here: - 2024 cleaning code and - 2025 cleaning code

Create the R files if needed.

Purl (or stitch) together R code from the markdown cleaning files. These stitched files are stored in ‘purl’ folder with the date of the code purled.

# library(knitr)
# 
# purl("cleaning_2024_health_form_data.Rmd",
#      output =
#        paste("purl/cleaning_2024_health_form_data",
#        Sys.Date(),".R"))
# 
# purl("cleaning_2025_health_form_data.Rmd",
#      output =
#        paste("purl/cleaning_2025_health_form_data",
#        Sys.Date(),".R"))

Run the R files to get cleaned data

Next, we can run those extracted R files to actually import and clean the data.

## 2024 data
source(paste("purl/cleaning_2024_health_form_data", Sys.Date(), ".R"))

## 2025 data
source(paste("purl/cleaning_2025_health_form_data", Sys.Date(), ".R"))

Combining 2024 and 2025 data along canker area & DBH

library(dplyr)

# Add a 'year' column to each dataset to allow grouping/plotting by year
health_assess_2024 <- health_assess_2024 %>%
  mutate(year = 2024)

health_assess_2025 <- health_assess_2025 %>%
  mutate(year = 2025)

# Keep only the necessary columns for this hypothesis
sliced_2024 <- health_assess_2024 %>%
  select(year, dbh_cm, percent_live_canopy, base_canker_area, trunk_canker_area, seedling_y_n) 

sliced_2025 <- health_assess_2025 %>%
  select(year, dbh_cm, percent_live_canopy, base_canker_area, trunk_canker_area, seedling_y_n) 

# Combine the two data frames
combined_2024_2025 <- bind_rows(sliced_2024, sliced_2025) 

# Fixing the Y, N, Yes, No discrepancies in combined
combined_2024_2025 <- combined_2024_2025 %>% mutate(seedling_y_n = if_else(seedling_y_n == "N", "No", seedling_y_n))
combined_2024_2025 <- combined_2024_2025 %>% mutate(seedling_y_n = if_else(seedling_y_n == "Y", "Yes", seedling_y_n))

Setup of Analysis

This chunk defines a re-usable function to run a regression across multiple input datas.

do_hypoth3_analysis <- function(data = combined_2024_2025, predictor, response, xlabel, ylabel, color = "black", color_points = "black") {
  # Model: Define linear model
  model <- lm(reformulate(predictor, response), data = data)
  coefs <- coef(model)
  r_squared <- summary(model)$r.squared
  p_value <- summary(model)$coefficients[2,4]   # pull p.value from linear regression model
  
  # Text: Create equation and R^2 text
  equation <- paste0("y = ", round(coefs[2], 2), "x + ", round(coefs[1], 2))
  r2_text <- paste0("italic(R)^2 == ", round(r_squared, 4))
  p_text <- paste0("p-value = ", round(p_value, 10))
  
  # Position: Get plotting ranges
  max_x <- max(data[[predictor]], na.rm = TRUE)
  
  max_y <- max(data[[response]], na.rm = TRUE)
  min_y <- min(data[[response]], na.rm = TRUE)
  range_y <- max_y - min_y
  
  # Annotate sample size near the bottom-right
  sample_size_y_position <- min_y + 0.08 * range_y
  
  # Annotate equation and r2 near the top-right
  p_y_position  <- max_y - 0.05 * range_y
  eq_y_position <- max_y - 0.12 * range_y
  r2_y_position <- max_y - 0.19 * range_y
  
  # PLOT ------------------------------------------------------
  ggplot(data, aes_string(x = predictor, y = response)) +
    # Points
    geom_point(aes_string(color = color_points)) +
    
    # Line of best fit
    geom_smooth(method = "lm", color = color) +
    
    # Axes Label & Theme
    labs(x = xlabel, y = ylabel) +  # Set the y-axis label
    theme(
      axis.title.x = element_text(color = color) # Change y-axis label color
    ) + 
    
    # Samples size text
    annotate(
      "text", 
      x = max_x,
      y = sample_size_y_position,
      label = paste("n = ", count(data)),
      hjust = 1,
      size = 6,
      color = "black"
    ) +
    
    # Equation and R^2 text
    annotate(
      "text",
      x = max_x,
      y = eq_y_position,
      label = equation,
      hjust = 1,
      size = 4,
      color = color
    ) +
    annotate(
      "text",
      x = max_x,
      y = r2_y_position,
      label = r2_text,
      hjust = 1,
      size = 4,
      color = color,
      parse = TRUE # Ensures that italics styling works
    ) + 
    # P value text
    annotate(
      "text",
      x = max_x,
      y = p_y_position,
      label = p_text,
      hjust = 1,
      size = 4,
      color = color
    )
}

Data Analysis

Here are the major delineations of questions across this hypotheses: * Done ? In-progress

Hypothesis 3: Live canopy/severity of infection based on canopy will correlate with canker area

Patterns overall in 2025

library(scales)

hex <- hue_pal()(5)

patterns_across_2025_base <- do_hypoth3_analysis(
  data = health_assess_2025, 
  predictor = "base_canker_area", 
  response = "percent_live_canopy",
  xlabel = "% of Base Canker Area",
  ylabel = "% Live Canopy",
  color = hex[3],
  color_points = "seedling_y_n"
)
patterns_across_2025_base

patterns_across_2025_trunk <- do_hypoth3_analysis(
  data = health_assess_2025, 
  predictor = "trunk_canker_area", 
  response = "percent_live_canopy",
  xlabel = "% of Trunk Canker Area",
  ylabel = "% Live Canopy",
  color = hex[4],
  color_points = "seedling_y_n"
)
patterns_across_2025_trunk

library(patchwork)

(patterns_across_2025_base + patterns_across_2025_trunk) + plot_layout(guide = "collect")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Patterns across seedlings versus adults

Seedlings

library(scales)

hex <- hue_pal()(5)

seedlings_combined <- health_assess_2025 %>% filter(is.na(dbh_cm))

patterns_across_seedlings_base <- do_hypoth3_analysis(
  data = seedlings_combined, 
  predictor = "base_canker_area", 
  response = "percent_live_canopy",
  xlabel = "% of Base Canker Area in Seedlings",
  ylabel = "% Live Canopy",
  color = hex[3],
  color_points = "seedling_y_n"
)
patterns_across_seedlings_base
## `geom_smooth()` using formula = 'y ~ x'

patterns_across_seedlings_trunk <- do_hypoth3_analysis(
  data = seedlings_combined, 
  predictor = "trunk_canker_area", 
  response = "percent_live_canopy",
  xlabel = "% of Trunk Canker Area in Seedlings",
  ylabel = "% Live Canopy",
  color = hex[4],
  color_points = "seedling_y_n"
)
patterns_across_seedlings_trunk
## `geom_smooth()` using formula = 'y ~ x'

library(patchwork)

seedlings_2025 <- (patterns_across_seedlings_base + patterns_across_seedlings_trunk) + plot_layout(guide = "collect")
seedlings_2025
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Adults

library(scales)

hex <- hue_pal()(5)

adults_combined <- health_assess_2025 %>% filter(!is.na(dbh_cm))

patterns_across_adults_base <- do_hypoth3_analysis(
  data = adults_combined, 
  predictor = "base_canker_area", 
  response = "percent_live_canopy",
  xlabel = "% of Base Canker Area in 2025 Adults",
  ylabel = "% Live Canopy",
  color = hex[3],
  color_points = "seedling_y_n"
)
patterns_across_adults_base
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

patterns_across_adults_trunk <- do_hypoth3_analysis(
  data = adults_combined, 
  predictor = "trunk_canker_area", 
  response = "percent_live_canopy",
  xlabel = "% of Trunk Canker Area in 2025 Adults",
  ylabel = "% Live Canopy",
  color = hex[4],
  color_points = "seedling_y_n"
)
patterns_across_adults_trunk
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

library(patchwork)

adults_2025 <- (patterns_across_adults_base + patterns_across_adults_trunk) + plot_layout(guide = "collect")
adults_2025
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Results

In 2025, seedlings show no relationship between either canker areas and canopy.

However, 2025 adults show a week negative relationship between both canker areas and percent live canopy.

Discussion as of 07-22-2025

On July 22nd, 2025, Sean, Emma and Hank discussed these results during this meeting, these are the highlights: