See the full code for the break down for the importing and cleaning open the documents of the same name. Or here: - 2024 cleaning code and - 2025 cleaning code
Purl (or stitch) together R code from the markdown cleaning files. These stitched files are stored in ‘purl’ folder with the date of the code purled.
# library(knitr)
#
# purl("cleaning_2024_health_form_data.Rmd",
# output =
# paste("purl/cleaning_2024_health_form_data",
# Sys.Date(),".R"))
#
# purl("cleaning_2025_health_form_data.Rmd",
# output =
# paste("purl/cleaning_2025_health_form_data",
# Sys.Date(),".R"))
Next, we can run those extracted R files to actually import and clean the data.
## 2024 data
source(paste("purl/cleaning_2024_health_form_data", Sys.Date(), ".R"))
## 2025 data
source(paste("purl/cleaning_2025_health_form_data", Sys.Date(), ".R"))
library(dplyr)
# Add a 'year' column to each dataset to allow grouping/plotting by year
health_assess_2024 <- health_assess_2024 %>%
mutate(year = 2024)
health_assess_2025 <- health_assess_2025 %>%
mutate(year = 2025)
# Keep only the necessary columns for this hypothesis
sliced_2024 <- health_assess_2024 %>%
select(year, dbh_cm, base_canker_area, trunk_canker_area)
sliced_2025 <- health_assess_2025 %>%
select(year, dbh_cm, base_canker_area, trunk_canker_area)
# Combine the two data frames
combined_2024_2025 <- bind_rows(sliced_2024, sliced_2025)
This chunk defines a re-usable function to run a regression across multiple input datas.
do_hypoth2_analysis <- function(data, predictor, response, xlabel, ylabel, color = "black") {
# Define linear model
model <- lm(reformulate(predictor, response), data = data)
coefs <- coef(model)
r_squared <- summary(model)$r.squared
# Create equation and R^2 text
equation <- paste0("y = ", round(coefs[2], 2), "x + ", round(coefs[1], 2))
r2_text <- paste0("italic(R)^2 == ", round(r_squared, 4))
# Get plotting ranges
max_x <- max(data[[predictor]], na.rm = TRUE)
max_y <- max(data[[response]], na.rm = TRUE)
min_y <- min(data[[response]], na.rm = TRUE)
range_y <- max_y - min_y
# Annotate sample size near the bottom-right
sample_size_y_position <- min_y + 0.08 * range_y
# Annotate equation and r2 near the top-right
eq_y_position <- max_y - 0.05 * range_y
r2_y_position <- max_y - 0.12 * range_y
# PLOT ------------------------------------------------------
ggplot(data, aes_string(x = predictor, y = response)) +
# Points
geom_point(color = color) +
# Line of best fit
geom_smooth(method = "lm", color = color) +
# Axes Label & Theme
labs(x = xlabel, y = ylabel) + # Set the y-axis label
theme(
axis.title.y = element_text(color = color) # Change y-axis label color
) +
# Samples size text
annotate(
"text",
x = max_x,
y = sample_size_y_position,
label = paste("n = ", count(data)),
hjust = 1,
size = 6,
color = "black"
) +
# Equation and R^2 text
annotate(
"text",
x = max_x,
y = eq_y_position,
label = equation,
hjust = 1,
size = 4,
color = color
) +
annotate(
"text",
x = max_x,
y = r2_y_position,
label = r2_text,
hjust = 1,
size = 4,
color = color,
parse = TRUE # Ensures that italics styling works
)
}
Here are the major delineations of questions across this hypotheses: * Done ? In-progress
Notably: * I’d like to include both base and trunk canker area on the same graph; I think that could be very nice
library(scales)
hex <- hue_pal()(3)
patterns_across_all_base <- do_hypoth2_analysis(
data = combined_2024_2025,
predictor = "dbh_cm",
response = "base_canker_area",
xlabel = "All Individuals DBH (cm)",
ylabel = "% of Base Canker Area",
color = hex[1]
)
patterns_across_all_base
patterns_across_all_trunk <- do_hypoth2_analysis(
data = combined_2024_2025,
predictor = "dbh_cm",
response = "trunk_canker_area",
xlabel = "All Individuals DBH (cm)",
ylabel = "% of Trunk Canker Area",
color = hex[2]
)
patterns_across_all_trunk
library(scales)
hex <- hue_pal()(3)
patterns_across_2025_base <- do_hypoth2_analysis(
data = health_assess_2025,
predictor = "dbh_cm",
response = "base_canker_area",
xlabel = "2025 Individuals DBH (cm)",
ylabel = "% of Base Canker Area",
color = hex[1]
)
patterns_across_2025_base
patterns_across_2025_trunk <- do_hypoth2_analysis(
data = health_assess_2025,
predictor = "dbh_cm",
response = "trunk_canker_area",
xlabel = "2025 Individuals DBH (cm)",
ylabel = "% of Trunk Canker Area",
color = hex[2]
)
patterns_across_2025_trunk
patterns_across_2025_girdled <- do_hypoth2_analysis(
data = health_assess_2025,
predictor = "dbh_cm",
response = "girdled_canker_circum",
xlabel = "2025 Individuals DBH (cm)",
ylabel = "% of Circumference Girdled",
color = hex[3]
)
patterns_across_2025_girdled
library(scales)
hex <- hue_pal()(3)
patterns_across_2024_base <- do_hypoth2_analysis(
data = health_assess_2024,
predictor = "dbh_cm",
response = "base_canker_area",
xlabel = "All 2024 Individuals DBH (cm)",
ylabel = "% of Base Canker Area",
color = hex[1]
)
patterns_across_2024_base
patterns_across_2024_trunk <- do_hypoth2_analysis(
data = health_assess_2024,
predictor = "dbh_cm",
response = "trunk_canker_area",
xlabel = "All 2024 Individuals DBH (cm)",
ylabel = "% of Trunk Canker Area",
color = hex[2]
)
patterns_across_2024_trunk
library(patchwork)
patterns_across_all_base + patterns_across_all_trunk
library(patchwork)
patterns_across_2024_base + patterns_across_2024_trunk
I would interpret this as older trees correlate with larger base canker reas.
patterns_across_2025_base
This result would indicate that younger trees have slightly higher girdling.
We may be seeing this trend due to an inaccuracy with attempting to estimate circumference on smaller trees.
patterns_across_2025_girdled
(patterns_across_2024_base + patterns_across_2025_base) + plot_annotation("% Base Canker Area 2024 versus 2025")
(patterns_across_2024_trunk + patterns_across_2025_trunk) + plot_annotation("% Trunk Canker Area 2024 versus 2025")
#
# # This ensures adds a DBH measure to each of the canker entries, such that they can be plotted against each other.
# data <- health_assess_2025
# predictor <- "dbh_cm"
# response1 <- "base_canker_area"
#
# long_data <- data %>%
# select(all_of(c(predictor, response1, response2))) %>%
# pivot_longer(
# cols = c(all_of(response1), all_of(response2)),
# names_to = "response_type",
# values_to = "response_value"
# )
#
# # Fit models and extract stats for annotation
# annotations <- long_data %>%
# group_by(response_type) %>% # Collects all of the base/trunk canker area readings together
# group_modify(~ {
# model <- lm(response_value ~ get(predictor), data = long_health_data)
# coefs <- coef(model)
# r_squared <- summary(model)$r.squared
# eq <- paste0("y == ", round(coefs[2], 2), " * x + ", round(coefs[1], 2))
# r2 <- paste0("italic(R)^2 == ", round(r_squared, 4))
# data.frame(equation = eq, r2 = r2)
# })
#
# # Determine top-right positions for annotation
# max_x <- max(long_data[[predictor]], na.rm = TRUE)
# max_y <- max(long_data$response_value, na.rm = TRUE)
# min_y <- min(long_data$response_value, na.rm = TRUE)
# range_y <- max_y - min(long_data$response_value, na.rm = TRUE)
#
# # Add y positions for annotations
# # annotations["base_canker_area"]$eq_y <-
# annotations$eq_y <- min_y + 0.05 * range_y
# annotations$r2_y <- min_y + 0.12 * range_y
# annotations$x <- max_x
#
# # Plot
# ggplot(long_data, aes_string(x = predictor, y = "response_value", color = "response_type")) +
# geom_point() +
# geom_smooth(method = "lm", se = FALSE) +
# # xlab(xlabel) +
# # ylab(ylabel) +
# # Add annotations for each response type
# geom_text(data = annotations,
# aes(x = x, y = eq_y, label = equation, color = response_type),
# hjust = 1, size = 4, parse = TRUE, inherit.aes = FALSE) +
# geom_text(data = annotations,
# aes(x = x, y = r2_y, label = r2, color = response_type),
# hjust = 1, size = 4, parse = TRUE, inherit.aes = FALSE) +
# theme_minimal()
#
# view(annotations)