pacman::p_load(tidyverse, readxl, raster, vegan, ggplot2,
tigris, sf, sp, plotly, ggrepel, rstanarm, mgcv,
brms, lme4, caret, MuMIn, VSURF, loo, patchwork)
EnvironmentalOutputs <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Environmental_Outputs.xlsx")
# Tree PCQ Data
tree_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
sheet = "Tree_PCQ")
# Veg Data
Veg_Cover <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
sheet = "Veg_Cover")
# Shrub Cover Data
shrub_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
sheet = "Shrub_Cover")
# Site Data
CogonSites <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/CogonSites_FL_AL_MS.xlsx")
# Count the total number of quadrats per plot
quadrat_count <- Veg_Cover %>%
group_by(Plot) %>%
summarize(total_quadrats = n_distinct(Quadrat), .groups = "drop")
#Filter tree data to only include trees with "tree" in the growth column
tree_data <- dplyr::filter(tree_data, Growth == "Tree")
#Filter Veg Cover to exclude Shrubs and Trees
Veg_Cover <- dplyr::filter(Veg_Cover, Growth != "Shrub" & Growth != "Tree")
#Filter Shrub Cover to only include Shrubs and Trees
shrub_data <- dplyr::filter(shrub_data, Growth == "Shrub" | Growth == "Tree")
# Total length of Shrub cover at a site
shrub_cover <- shrub_data %>%
mutate(Cover = Line_End - Line_Start) %>%
group_by(Species_Name, Plot, InvStatus) %>%
summarise(Total_Cover = sum(Cover, na.rm = TRUE), .groups = "drop") %>%
mutate(Percent_Cover = Total_Cover / 3000 * 100)
# Combine Plot and Quadrat columns
Veg_Cover <- Veg_Cover %>%
mutate(Plot_Quadrat = paste(Plot, Quadrat, sep = '_'))
# Join with CogonSites to get site information
Veg_Cover <- Veg_Cover %>%
left_join(CogonSites, by = "Plot")
# Sum species cover across quadrats for each species at each plot
veg_cover_summed <- Veg_Cover %>%
group_by(Plot, Species_Name, InvStatus) %>%
summarize(total_cover = sum(Cover_Per, na.rm = TRUE), .groups = "drop")
# Calculate average herbaceous species cover
avg_species_cover <- veg_cover_summed %>%
left_join(quadrat_count, by = "Plot") %>%
mutate(avg_cover = total_cover / total_quadrats)
This species matrix includes herbaceous and shrub species
# Merge shrub cover with herbaceous average cover
combined_cover <- avg_species_cover %>%
full_join(
shrub_cover %>%
select(Plot, Species_Name, Percent_Cover, InvStatus),
by = c("Plot", "Species_Name")
) %>%
mutate(
overlap_flag = ifelse(!is.na(avg_cover) & !is.na(Percent_Cover), TRUE, FALSE), # Flag overlaps
final_cover = case_when(
!is.na(avg_cover) & is.na(Percent_Cover) ~ avg_cover, # Use herbaceous cover if no shrub data
is.na(avg_cover) & !is.na(Percent_Cover) ~ Percent_Cover, # Use shrub cover if no herbaceous data
TRUE ~ NA_real_ # Leave as NA where overlaps exist
)
)
# Extract cogongrass cover
cogongrass_cover <- combined_cover %>%
filter(Species_Name == "Imperata_cylindrica") %>%
select(Plot, final_cover) %>%
rename(Cogongrass_Cover = final_cover)
combined_cover <- combined_cover %>%
filter(Species_Name != "Imperata_cylindrica" , Species_Name != "Imperata_cylindrica_Live") # Remove cogongrass from species matrix
## Remove any non_native species
combined_cover <- combined_cover %>%
filter(InvStatus.x != "Non_Native") # Remove non-native species from species matrix
species_matrix <- combined_cover %>%
select(Plot, Species_Name, final_cover) %>%
pivot_wider(
names_from = Species_Name,
values_from = final_cover,
values_fill = 0
)
# Calculate Shannon diversity index for each site
shannon_diversity <- species_matrix %>%
select(-Plot) %>%
vegan::diversity(index = "shannon") %>%
as.data.frame() %>%
setNames("Shannon_Diversity") %>%
mutate(Plot = species_matrix$Plot)
# Merge Shannon diversity with cogongrass cover, tree canopy cover, and environmental outputs
model_data <- shannon_diversity %>%
left_join(cogongrass_cover, by = "Plot") %>%
left_join(EnvironmentalOutputs, by = "Plot") %>%
mutate(Cogongrass_Cover = ifelse(is.na(Cogongrass_Cover), 0, Cogongrass_Cover))
# Inspect to see if there are any columns with only one level
summary(model_data)
## Shannon_Diversity Plot Cogongrass_Cover Site
## Min. :0.000 Length:206 Min. : 0.00 Length:206
## 1st Qu.:2.065 Class :character 1st Qu.: 0.00 Class :character
## Median :2.407 Mode :character Median : 0.00 Mode :character
## Mean :2.333 Mean :17.08
## 3rd Qu.:2.720 3rd Qu.:29.82
## Max. :3.552 Max. :92.14
##
## BurnYear Camera Latitude Longitude
## Min. :2000 Length:206 Min. :28.48 Min. :-89.80
## 1st Qu.:2014 Class :character 1st Qu.:29.56 1st Qu.:-88.91
## Median :2022 Mode :character Median :30.77 Median :-87.14
## Mean :2018 Mean :30.38 Mean :-86.72
## 3rd Qu.:2022 3rd Qu.:30.98 3rd Qu.:-83.58
## Max. :2024 Max. :31.59 Max. :-81.94
## NA's :42
## Muname NLCD_LandCover Region SoilType
## Length:206 Length:206 Length:206 Length:206
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Status TreeCanopy_NLCD aspect elevation
## Length:206 Min. : 0.00 Min. : 0.0 Min. : 17.00
## Class :character 1st Qu.:51.50 1st Qu.: 90.0 1st Qu.: 38.00
## Mode :character Median :75.50 Median :180.0 Median : 59.00
## Mean :66.22 Mean :171.7 Mean : 57.68
## 3rd Qu.:86.75 3rd Qu.:270.0 3rd Qu.: 75.00
## Max. :98.00 Max. :350.6 Max. :111.00
##
## ndvi pr slope sph
## Min. :0.2728 Min. :3.850 Min. : 0.000 Min. :0.01051
## 1st Qu.:0.4303 1st Qu.:4.345 1st Qu.: 2.149 1st Qu.:0.01074
## Median :0.4624 Median :4.592 Median : 2.984 Median :0.01091
## Mean :0.4503 Mean :4.566 Mean : 3.601 Mean :0.01145
## 3rd Qu.:0.4846 3rd Qu.:4.796 3rd Qu.: 4.688 3rd Qu.:0.01183
## Max. :0.5357 Max. :5.294 Max. :18.429 Max. :0.01325
##
## srad tmmn agb
## Min. :204.7 Min. :285.7 Min. : 5.00
## 1st Qu.:206.8 1st Qu.:286.6 1st Qu.: 74.00
## Median :207.0 Median :287.2 Median : 96.00
## Mean :210.6 Mean :287.4 Mean : 99.92
## 3rd Qu.:216.6 3rd Qu.:288.6 3rd Qu.:133.00
## Max. :224.2 Max. :290.3 Max. :211.00
##
Model Fitting
# Ensure Site is a factor
model_data$Site <- as.factor(model_data$Site)
# 1) Refit under ML for fair AIC comparison
shannon_lmer_ml <- lmer(Shannon_Diversity ~ Cogongrass_Cover +
(1 | Site),
data = model_data, REML = FALSE)
shannon_lmer_quad_ml <- lmer(Shannon_Diversity ~ Cogongrass_Cover + I(Cogongrass_Cover^2) +
(1 | Site),
data = model_data, REML = FALSE)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
shannon_gam_ml <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
s(Site, bs = "re"),
data = model_data, method = "ML")
# AIC table and ΔAIC
aic_tab <- AIC(shannon_lmer_ml, shannon_lmer_quad_ml, shannon_gam_ml)
aic_tab$delta <- aic_tab$AIC - min(aic_tab$AIC)
print(aic_tab)
## df AIC delta
## shannon_lmer_ml 4.00000 313.5426 30.98326
## shannon_lmer_quad_ml 5.00000 302.0660 19.50660
## shannon_gam_ml 14.33328 282.5594 0.00000
# Check smooth complexity (EDF ~ 1 suggests near-linear)
gam_summ <- summary(shannon_gam_ml)
edf_cover <- gam_summ$s.table[rownames(gam_summ$s.table) == "s(Cogongrass_Cover)", "edf"]
cat("EDF for s(Cogongrass_Cover):", round(edf_cover, 2), "\n")
## EDF for s(Cogongrass_Cover): 3.57
summary(shannon_gam_ml)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.26314 0.09581 23.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Cogongrass_Cover) 3.573 4.366 6.476 4.5e-05 ***
## s(Site) 7.366 9.000 5.104 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.262 Deviance explained = 30.2%
## -ML = 147.37 Scale est. = 0.21317 n = 206
# 1. Initialize prediction vectors for all three models
sites <- levels(model_data$Site)
obs_all <- c()
pred_lmer_all <- c()
pred_gam_all <- c()
pred_quad_all <- c() # New vector for quadratic model predictions
for (s in sites) {
train <- subset(model_data, Site != s)
test <- subset(model_data, Site == s)
# Fit on training data (ML)
fit_lmer <- lmer(Shannon_Diversity ~ Cogongrass_Cover + (1 | Site),
data = train, REML = FALSE)
fit_gam <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
data = train, method = "ML")
# New Model Fit: LMER with Quadratic Term
fit_quad <- lmer(Shannon_Diversity ~ Cogongrass_Cover + I(Cogongrass_Cover^2) + (1 | Site),
data = train, REML = FALSE)
# Predict on held-out site using population-level effects (exclude site RE)
# re.form = NA and exclude = "s(Site)" ensure no random effects are used,
# making the prediction purely a population-level (fixed effects) estimate.
p_lmer <- predict(fit_lmer, newdata = test, re.form = NA)
p_gam <- predict(fit_gam, newdata = test, exclude = "s(Site)")
# New Prediction Step: Quadratic Model
p_quad <- predict(fit_quad, newdata = test, re.form = NA)
# 2. Collect observations and predictions
obs_all <- c(obs_all, test$Shannon_Diversity)
pred_lmer_all <- c(pred_lmer_all, p_lmer)
pred_gam_all <- c(pred_gam_all, p_gam)
pred_quad_all <- c(pred_quad_all, p_quad) # Collect quadratic predictions
}
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels ONF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels WSF not in original fit
# Define metric functions (as they were in the original code)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
# 3. Calculate metrics for all three models
metrics <- list(
LMM_Linear = c( # Renamed for clarity
RMSE = rmse(obs_all, pred_lmer_all),
MAE = mae(obs_all, pred_lmer_all),
R2 = r2_oos(obs_all, pred_lmer_all)
),
LMM_Quadratic = c( # NEW: Quadratic model metrics
RMSE = rmse(obs_all, pred_quad_all),
MAE = mae(obs_all, pred_quad_all),
R2 = r2_oos(obs_all, pred_quad_all)
),
GAM_Smooth = c( # Renamed for clarity
RMSE = rmse(obs_all, pred_gam_all),
MAE = mae(obs_all, pred_gam_all),
R2 = r2_oos(obs_all, pred_gam_all)
)
)
print(metrics)
## $LMM_Linear
## RMSE MAE R2
## 0.5650397 0.4485244 -0.1101912
##
## $LMM_Quadratic
## RMSE MAE R2
## 0.55213009 0.44305887 -0.06004089
##
## $GAM_Smooth
## RMSE MAE R2
## 0.54653551 0.44203332 -0.03866752
plot_data <- data.frame(
Observed = obs_all,
LMM_Predicted = pred_lmer_all,
LMM_Quadratic_Predicted = pred_quad_all,
GAM_Predicted = pred_gam_all
)
p <- ggplot(plot_data, aes(x = Observed)) +
geom_point(aes(y = LMM_Predicted, color = "LMM"), alpha = 0.6) +
geom_point(aes(y = LMM_Quadratic_Predicted, color = "LMM Quadratic"), alpha = 0.6) +
geom_point(aes(y = GAM_Predicted, color = "GAM"), alpha = 0.6) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
labs(x = "Observed Shannon Diversity", y = "Predicted Shannon Diversity") +
scale_color_manual(name = "Model", values = c("LMM" = "blue", "LMM Quadratic" = "orange", "GAM" = "red")) +
theme_minimal()
print(p)
## GAM Model
# Leave-One-Site-Out GAM with LOOCV
library(mgcv)
# Initialize vectors to store observations and predictions
sites <- levels(model_data$Site)
obs_all <- c()
pred_gam_all <- c()
# Determine if we have enough sites for a random effect
use_mixed <- length(sites) >= 3
n_unique <- length(unique(model_data$Cogongrass_Cover))
k_val <- min(5, n_unique - 1)
# Fit GAM on all data
if (use_mixed) {
cat("Fitting GAM with site random effect.\n")
fit_gam_full <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
data = model_data, method = "ML")
} else {
cat("Fitting GAM without site random effect.\n")
fit_gam_full <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
data = model_data, method = "ML")
}
## Fitting GAM with site random effect.
print(summary(fit_gam_full))
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.26314 0.09581 23.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Cogongrass_Cover) 3.573 4.366 6.476 4.5e-05 ***
## s(Site) 7.366 9.000 5.104 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.262 Deviance explained = 30.2%
## -ML = 147.37 Scale est. = 0.21317 n = 206
# ------------------------------------------
# Leave-One-Site-Out Cross-Validation
# ------------------------------------------
for (s in sites) {
train <- subset(model_data, Site != s)
test <- subset(model_data, Site == s)
if (use_mixed) {
fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
data = train, method = "ML")
p_gam <- predict(fit_cv, newdata = test, exclude = "s(Site)")
} else {
fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
data = train, method = "ML")
p_gam <- predict(fit_cv, newdata = test)
}
obs_all <- c(obs_all, test$Shannon_Diversity)
pred_gam_all <- c(pred_gam_all, p_gam)
}
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels ONF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels WSF not in original fit
# ------------------------------------------
# Calculate LOOCV Metrics
# ------------------------------------------
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
metrics <- c(RMSE = rmse(obs_all, pred_gam_all),
MAE = mae(obs_all, pred_gam_all),
R2 = r2_oos(obs_all, pred_gam_all))
cat("\nLOOCV Metrics:\n")
##
## LOOCV Metrics:
print(metrics)
## RMSE MAE R2
## 0.54653551 0.44203332 -0.03866752
# Return the full model and CV metrics
list(model = fit_gam_full, cv_metrics = metrics)
## $model
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
##
## Estimated degrees of freedom:
## 3.57 7.37 total = 11.94
##
## ML score: 147.3724
##
## $cv_metrics
## RMSE MAE R2
## 0.54653551 0.44203332 -0.03866752
plot(shannon_gam_ml, pages = 1, rug = TRUE, se = TRUE)
# Assuming shannon_gam_ml is already fitted from your previous steps
# shannon_gam_ml <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
# data = model_data, method = "ML")
# --- 1. Extract data for the smooth term s(Cogongrass_Cover) ---
# We need to create a new data frame to predict from, covering the range of Cogongrass_Cover
# We only need to vary Cogongrass_Cover and keep other variables at a constant or mean value
# (or just not include them if predicting only specific smooth terms).
# The 'exclude' argument in predict.gam helps isolate the smooth term.
# Get one valid site level
site_level <- levels(model_data$Site)[1]
# Create a sequence of Cogongrass_Cover values
cogongrass_range <- range(model_data$Cogongrass_Cover)
pred_data_gam <- data.frame(Cogongrass_Cover = seq(cogongrass_range[1], cogongrass_range[2], length.out = 200))
pred_data_gam$Site <- factor(site_level, levels = levels(model_data$Site))
# Predict the smooth term.
# type="terms" gives the contribution of each term.
# se.fit=TRUE gives standard errors for confidence intervals.
# exclude="s(Site)" tells predict to fix the random effect at 0 (its population average),
# so we only see the smooth effect of Cogongrass_Cover.
gam_pred <- predict(shannon_gam_ml,
newdata = pred_data_gam,
type = "terms",
se.fit = TRUE,
exclude = "s(Site)")
# Combine predictions with the original predictor values
smooth_df <- data.frame(
Cogongrass_Cover = pred_data_gam$Cogongrass_Cover,
fit = gam_pred$fit[, "s(Cogongrass_Cover)"], # Extract the specific smooth term
se = gam_pred$se.fit[, "s(Cogongrass_Cover)"] # Extract its SE
)
# Calculate confidence intervals
smooth_df$upper <- smooth_df$fit + (2 * smooth_df$se) # Approx 95% CI
smooth_df$lower <- smooth_df$fit - (2 * smooth_df$se) # Approx 95% CI
# --- 2. Get data for the rug plot (actual data points) ---
rug_data <- data.frame(Cogongrass_Cover = model_data$Cogongrass_Cover)
# --- 3. Create the ggplot ---
smooth_plot <- ggplot(smooth_df, aes(x = Cogongrass_Cover, y = fit)) +
# Add the filled confidence interval band
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey80", alpha = 0.5) +
# Add the smooth line
geom_line(color = "steelblue", linewidth = 1) + # Use linewidth instead of size for geom_line
# Add the rug plot (data point density)
geom_rug(data = rug_data, aes(x = Cogongrass_Cover), inherit.aes = FALSE,
sides = "b", length = unit(0.03, "npc"), color = "black", alpha = 0.7) +
# Labels and Title
labs(
x = "Cogongrass Cover (%)",
y = "Effect on Shannon Diversity"
) +
# Customize theme for publication quality
theme_minimal(base_size = 14) + # Start with a clean theme, set base font size
theme(
plot.title = element_text(hjust = 0.5, face = "bold", margin = margin(b = 10)),
axis.title.x = element_text(margin = margin(t = 10)),
axis.title.y = element_text(margin = margin(r = 10)),
panel.grid.minor = element_blank(), # Remove minor grid lines
panel.grid.major = element_line(color = "grey70", linetype = "dotted"), # Lighter, dotted major grid lines
axis.line = element_line(color = "black") # Add axis lines for definition
) +
# Set y-axis limits to match the default plot if desired, or let ggplot handle it
# coord_cartesian(ylim = c(-1.2, 0.4)) # Adjust based on your actual data and desired visual
NULL # Placeholder for potential further customizations
# Save plot
ggsave("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/03_Figure/GAM_Smooth_Cogon_Diversity.png", plot = smooth_plot,
width = 8, height = 6, dpi = 300)
central_fl <- c("ONF", "WSF")
north_fl <- c("BRSF", "Escambia", "Jay", "Howell", "Hooper")
south_ms <- c("CNF", "DSNF", "BCNWR")
run_region_models <- function(region_sites, data, min_sites = 3) {
region_data <- subset(data, Site %in% region_sites)
region_data$Site <- droplevels(region_data$Site)
n_sites <- length(unique(region_data$Site))
cat("\n=============================\n")
cat("Region Sites:", region_sites, "\n")
cat("Number of Sites:", n_sites, "\n")
cat("=============================\n")
use_mixed <- n_sites >= min_sites
n_unique <- length(unique(region_data$Cogongrass_Cover))
k_val <- min(5, n_unique - 1)
if (use_mixed) {
cat("Using mixed-effects GAM.\n")
fit_gam <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
s(Site, bs = "re"),
data = region_data, method = "ML")
} else {
cat("Fewer than", min_sites, "sites detected. Removing random effect.\n")
fit_gam <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
data = region_data, method = "ML")
}
print(summary(fit_gam))
# --------------------------------------------------
# Leave-One-Site-Out CV
# --------------------------------------------------
sites <- levels(region_data$Site)
obs_all <- pred_gam <- c()
for (s in sites) {
train <- subset(region_data, Site != s)
test <- subset(region_data, Site == s)
if (use_mixed) {
fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
s(Site, bs = "re"),
data = train, method = "ML")
p_gam <- predict(fit_cv, newdata = test, exclude = "s(Site)")
} else {
fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
data = train, method = "ML")
p_gam <- predict(fit_cv, newdata = test)
}
obs_all <- c(obs_all, test$Shannon_Diversity)
pred_gam <- c(pred_gam, p_gam)
}
# --------------------------------------------------
# Metrics
# --------------------------------------------------
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae <- function(y, yhat) mean(abs(y - yhat))
r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
metrics <- list(
GAM = c(RMSE = rmse(obs_all, pred_gam),
MAE = mae(obs_all, pred_gam),
R2 = r2_oos(obs_all, pred_gam))
)
print(metrics)
return(list(model = fit_gam, cv_metrics = metrics))
}
# Run the function and save output
central_mod <- run_region_models(central_fl, model_data)
##
## =============================
## Region Sites: ONF WSF
## Number of Sites: 2
## =============================
## Fewer than 3 sites detected. Removing random effect.
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.25021 0.07696 29.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Cogongrass_Cover) 1 1 15.2 0.000289 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.218 Deviance explained = 23.3%
## -ML = 42.148 Scale est. = 0.30802 n = 52
## $GAM
## RMSE MAE R2
## 1.1106755 0.9834191 -2.1940763
north_mod <- run_region_models(north_fl, model_data)
##
## =============================
## Region Sites: BRSF Escambia Jay Howell Hooper
## Number of Sites: 5
## =============================
## Using mixed-effects GAM.
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2183 0.1098 20.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Cogongrass_Cover) 1.000 1 0.00 0.9829
## s(Site) 2.656 4 2.34 0.0122 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.111 Deviance explained = 15.5%
## -ML = 46.728 Scale est. = 0.19069 n = 75
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit
## $GAM
## RMSE MAE R2
## 0.4951541 0.3910516 -0.1587838
south_mod <- run_region_models(south_ms, model_data)
##
## =============================
## Region Sites: CNF DSNF BCNWR
## Number of Sites: 3
## =============================
## Using mixed-effects GAM.
##
## Family: gaussian
## Link function: identity
##
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.4831 0.0823 30.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Cogongrass_Cover) 2.520 3.069 3.696 0.0158 *
## s(Site) 1.156 2.000 1.947 0.0387 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.168 Deviance explained = 20.8%
## -ML = 55.602 Scale est. = 0.21648 n = 79
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit
## $GAM
## RMSE MAE R2
## 0.5682273 0.4246825 -0.2560978
plot_region_model <- function(mod_obj, region_data, region_label, color = "steelblue") {
fit_gam <- mod_obj$model
# Generate prediction grid over observed range
cov_range <- range(region_data$Cogongrass_Cover, na.rm = TRUE)
pred_df <- data.frame(
Cogongrass_Cover = seq(cov_range[1], cov_range[2], length.out = 200),
Site = region_data$Site[1] # placeholder site for re term; excluded below
)
# Predict excluding random effect so we get the population-level smooth
preds <- predict(fit_gam, newdata = pred_df,
exclude = "s(Site)", se.fit = TRUE)
pred_df$fit <- preds$fit
pred_df$lwr <- preds$fit - 1.96 * preds$se.fit
pred_df$upr <- preds$fit + 1.96 * preds$se.fit
# Pull CV metrics for subtitle
m <- mod_obj$cv_metrics$GAM
subtitle_text <- sprintf("LOOCV — RMSE: %.3f | MAE: %.3f | R²: %.3f",
m["RMSE"], m["MAE"], m["R2"])
p <- ggplot() +
# Raw observations
geom_point(data = region_data,
aes(x = Cogongrass_Cover, y = Shannon_Diversity, color = Site),
alpha = 0.6, size = 2) +
# 95% CI ribbon
geom_ribbon(data = pred_df,
aes(x = Cogongrass_Cover, ymin = lwr, ymax = upr),
fill = color, alpha = 0.2) +
# Population-level smooth
geom_line(data = pred_df,
aes(x = Cogongrass_Cover, y = fit),
color = color, linewidth = 1.2) +
labs(
title = paste("Shannon Diversity ~ Cogongrass Cover |", region_label),
subtitle = subtitle_text,
x = "Cogongrass Cover (%)",
y = "Shannon Diversity",
color = "Site"
) +
theme_bw(base_size = 13) +
theme(
plot.title = element_text(face = "bold"),
plot.subtitle = element_text(size = 10, color = "grey40"),
legend.position = "right"
)
return(p)
}
# Subset data per region (mirrors what run_region_models does internally)
central_data <- subset(model_data, Site %in% central_fl)
north_data <- subset(model_data, Site %in% north_fl)
south_data <- subset(model_data, Site %in% south_ms)
# Generate plots
p_central <- plot_region_model(central_mod, central_data, "Central Florida", color = "steelblue")
p_north <- plot_region_model(north_mod, north_data, "North Florida", color = "darkorange")
p_south <- plot_region_model(south_mod, south_data, "South Mississippi",color = "forestgreen")
# View individually
p_central
p_north
p_south
# arrange together
p_central / p_north / p_south