pacman::p_load(tidyverse, readxl, raster, vegan, ggplot2,
               tigris, sf, sp, plotly, ggrepel, rstanarm, mgcv,
               brms, lme4, caret, MuMIn, VSURF, loo, patchwork)

EnvironmentalOutputs <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Environmental_Outputs.xlsx")

# Tree PCQ Data
tree_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Tree_PCQ")

# Veg Data
Veg_Cover <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Veg_Cover")

# Shrub Cover Data
shrub_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                         sheet = "Shrub_Cover")

# Site Data
CogonSites <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/CogonSites_FL_AL_MS.xlsx")

Number of quadrats sampled per plot

# Count the total number of quadrats per plot
quadrat_count <- Veg_Cover %>%
  group_by(Plot) %>%
  summarize(total_quadrats = n_distinct(Quadrat), .groups = "drop")

Filter All data to only include specified species (Per PLANTS database)

#Filter tree data to only include trees with "tree" in the growth column
tree_data <- dplyr::filter(tree_data, Growth == "Tree")

#Filter Veg Cover to exclude Shrubs and Trees
Veg_Cover <- dplyr::filter(Veg_Cover, Growth != "Shrub" & Growth != "Tree")

#Filter Shrub Cover to only include Shrubs and Trees
shrub_data <- dplyr::filter(shrub_data, Growth == "Shrub" | Growth == "Tree")

Shrub Cover Conversion

# Total length of Shrub cover at a site
shrub_cover <- shrub_data %>%
  mutate(Cover = Line_End - Line_Start) %>%
  group_by(Species_Name, Plot, InvStatus) %>%
  summarise(Total_Cover = sum(Cover, na.rm = TRUE), .groups = "drop") %>%
  mutate(Percent_Cover = Total_Cover / 3000 * 100)

Herbacous Cover Conversion

# Combine Plot and Quadrat columns
Veg_Cover <- Veg_Cover %>%
  mutate(Plot_Quadrat = paste(Plot, Quadrat, sep = '_'))

# Join with CogonSites to get site information
Veg_Cover <- Veg_Cover %>%
  left_join(CogonSites, by = "Plot")

# Sum species cover across quadrats for each species at each plot
veg_cover_summed <- Veg_Cover %>%
  group_by(Plot, Species_Name, InvStatus) %>%
  summarize(total_cover = sum(Cover_Per, na.rm = TRUE), .groups = "drop")

# Calculate average herbaceous species cover
avg_species_cover <- veg_cover_summed %>%
  left_join(quadrat_count, by = "Plot") %>%
  mutate(avg_cover = total_cover / total_quadrats)

Merging Herb cover with Shrub

This species matrix includes herbaceous and shrub species

# Merge shrub cover with herbaceous average cover
combined_cover <- avg_species_cover %>%
  full_join(
    shrub_cover %>%
      select(Plot, Species_Name, Percent_Cover, InvStatus),
    by = c("Plot", "Species_Name")
  ) %>%
  mutate(
    overlap_flag = ifelse(!is.na(avg_cover) & !is.na(Percent_Cover), TRUE, FALSE), # Flag overlaps
    final_cover = case_when(
      !is.na(avg_cover) & is.na(Percent_Cover) ~ avg_cover,  # Use herbaceous cover if no shrub data
      is.na(avg_cover) & !is.na(Percent_Cover) ~ Percent_Cover, # Use shrub cover if no herbaceous data
      TRUE ~ NA_real_ # Leave as NA where overlaps exist
    )
  )

Extract Cogongrass Cover

# Extract cogongrass cover
cogongrass_cover <- combined_cover %>%
  filter(Species_Name == "Imperata_cylindrica") %>%
  select(Plot, final_cover) %>%
  rename(Cogongrass_Cover = final_cover)

Species Matrix

combined_cover <- combined_cover %>%
  filter(Species_Name != "Imperata_cylindrica" , Species_Name != "Imperata_cylindrica_Live") # Remove cogongrass from species matrix

## Remove any non_native species
combined_cover <- combined_cover %>%
  filter(InvStatus.x != "Non_Native") # Remove non-native species from species matrix

species_matrix <- combined_cover %>%
  select(Plot, Species_Name, final_cover) %>%
  pivot_wider(
    names_from = Species_Name,
    values_from = final_cover,
    values_fill = 0
  )

Shannon Diversity

# Calculate Shannon diversity index for each site
shannon_diversity <- species_matrix %>%
  select(-Plot) %>%
  vegan::diversity(index = "shannon") %>%
  as.data.frame() %>%
  setNames("Shannon_Diversity") %>%
  mutate(Plot = species_matrix$Plot)

Merge Shannon Diversity with Cogongrass Cover and Environmental Data

# Merge Shannon diversity with cogongrass cover, tree canopy cover, and environmental outputs
model_data <- shannon_diversity %>%
  left_join(cogongrass_cover, by = "Plot") %>%
  left_join(EnvironmentalOutputs, by = "Plot") %>%
  mutate(Cogongrass_Cover = ifelse(is.na(Cogongrass_Cover), 0, Cogongrass_Cover))

# Inspect to see if there are any columns with only one level
summary(model_data)
##  Shannon_Diversity     Plot           Cogongrass_Cover     Site          
##  Min.   :0.000     Length:206         Min.   : 0.00    Length:206        
##  1st Qu.:2.065     Class :character   1st Qu.: 0.00    Class :character  
##  Median :2.407     Mode  :character   Median : 0.00    Mode  :character  
##  Mean   :2.333                        Mean   :17.08                      
##  3rd Qu.:2.720                        3rd Qu.:29.82                      
##  Max.   :3.552                        Max.   :92.14                      
##                                                                          
##     BurnYear       Camera             Latitude       Longitude     
##  Min.   :2000   Length:206         Min.   :28.48   Min.   :-89.80  
##  1st Qu.:2014   Class :character   1st Qu.:29.56   1st Qu.:-88.91  
##  Median :2022   Mode  :character   Median :30.77   Median :-87.14  
##  Mean   :2018                      Mean   :30.38   Mean   :-86.72  
##  3rd Qu.:2022                      3rd Qu.:30.98   3rd Qu.:-83.58  
##  Max.   :2024                      Max.   :31.59   Max.   :-81.94  
##  NA's   :42                                                        
##     Muname          NLCD_LandCover        Region            SoilType        
##  Length:206         Length:206         Length:206         Length:206        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Status          TreeCanopy_NLCD     aspect        elevation     
##  Length:206         Min.   : 0.00   Min.   :  0.0   Min.   : 17.00  
##  Class :character   1st Qu.:51.50   1st Qu.: 90.0   1st Qu.: 38.00  
##  Mode  :character   Median :75.50   Median :180.0   Median : 59.00  
##                     Mean   :66.22   Mean   :171.7   Mean   : 57.68  
##                     3rd Qu.:86.75   3rd Qu.:270.0   3rd Qu.: 75.00  
##                     Max.   :98.00   Max.   :350.6   Max.   :111.00  
##                                                                     
##       ndvi              pr            slope             sph         
##  Min.   :0.2728   Min.   :3.850   Min.   : 0.000   Min.   :0.01051  
##  1st Qu.:0.4303   1st Qu.:4.345   1st Qu.: 2.149   1st Qu.:0.01074  
##  Median :0.4624   Median :4.592   Median : 2.984   Median :0.01091  
##  Mean   :0.4503   Mean   :4.566   Mean   : 3.601   Mean   :0.01145  
##  3rd Qu.:0.4846   3rd Qu.:4.796   3rd Qu.: 4.688   3rd Qu.:0.01183  
##  Max.   :0.5357   Max.   :5.294   Max.   :18.429   Max.   :0.01325  
##                                                                     
##       srad            tmmn            agb        
##  Min.   :204.7   Min.   :285.7   Min.   :  5.00  
##  1st Qu.:206.8   1st Qu.:286.6   1st Qu.: 74.00  
##  Median :207.0   Median :287.2   Median : 96.00  
##  Mean   :210.6   Mean   :287.4   Mean   : 99.92  
##  3rd Qu.:216.6   3rd Qu.:288.6   3rd Qu.:133.00  
##  Max.   :224.2   Max.   :290.3   Max.   :211.00  
## 

Model Fitting

# Ensure Site is a factor
model_data$Site <- as.factor(model_data$Site)

# 1) Refit under ML for fair AIC comparison
shannon_lmer_ml <- lmer(Shannon_Diversity ~ Cogongrass_Cover +
                          (1 | Site),
                        data = model_data, REML = FALSE)

shannon_lmer_quad_ml <- lmer(Shannon_Diversity ~ Cogongrass_Cover + I(Cogongrass_Cover^2) +
                               (1 | Site),
                             data = model_data, REML = FALSE)
## Warning: Some predictor variables are on very different scales: consider
## rescaling
shannon_gam_ml <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
                        s(Site, bs = "re"),
                      data = model_data, method = "ML")

Model Comparison

# AIC table and ΔAIC
aic_tab <- AIC(shannon_lmer_ml, shannon_lmer_quad_ml, shannon_gam_ml)
aic_tab$delta <- aic_tab$AIC - min(aic_tab$AIC)
print(aic_tab)
##                            df      AIC    delta
## shannon_lmer_ml       4.00000 313.5426 30.98326
## shannon_lmer_quad_ml  5.00000 302.0660 19.50660
## shannon_gam_ml       14.33328 282.5594  0.00000
# Check smooth complexity (EDF ~ 1 suggests near-linear)
gam_summ <- summary(shannon_gam_ml)
edf_cover <- gam_summ$s.table[rownames(gam_summ$s.table) == "s(Cogongrass_Cover)", "edf"]
cat("EDF for s(Cogongrass_Cover):", round(edf_cover, 2), "\n")
## EDF for s(Cogongrass_Cover): 3.57
summary(shannon_gam_ml)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.26314    0.09581   23.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value    
## s(Cogongrass_Cover) 3.573  4.366 6.476 4.5e-05 ***
## s(Site)             7.366  9.000 5.104 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.262   Deviance explained = 30.2%
## -ML = 147.37  Scale est. = 0.21317   n = 206

Leave One Out Cross Validation

# 1. Initialize prediction vectors for all three models
sites <- levels(model_data$Site)

obs_all <- c()
pred_lmer_all <- c()
pred_gam_all  <- c()
pred_quad_all <- c() # New vector for quadratic model predictions

for (s in sites) {
  train <- subset(model_data, Site != s)
  test  <- subset(model_data, Site == s)

  # Fit on training data (ML)
  fit_lmer <- lmer(Shannon_Diversity ~ Cogongrass_Cover + (1 | Site),
                   data = train, REML = FALSE)
  fit_gam  <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
                  data = train, method = "ML")
  # New Model Fit: LMER with Quadratic Term
  fit_quad <- lmer(Shannon_Diversity ~ Cogongrass_Cover + I(Cogongrass_Cover^2) + (1 | Site),
                   data = train, REML = FALSE)

  # Predict on held-out site using population-level effects (exclude site RE)
  # re.form = NA and exclude = "s(Site)" ensure no random effects are used,
  # making the prediction purely a population-level (fixed effects) estimate.
  p_lmer <- predict(fit_lmer, newdata = test, re.form = NA)
  p_gam  <- predict(fit_gam,  newdata = test, exclude = "s(Site)")
  # New Prediction Step: Quadratic Model
  p_quad <- predict(fit_quad, newdata = test, re.form = NA)

  # 2. Collect observations and predictions
  obs_all        <- c(obs_all, test$Shannon_Diversity)
  pred_lmer_all  <- c(pred_lmer_all, p_lmer)
  pred_gam_all   <- c(pred_gam_all, p_gam)
  pred_quad_all  <- c(pred_quad_all, p_quad) # Collect quadratic predictions
}
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels ONF not in original fit
## Warning: Some predictor variables are on very different scales: consider
## rescaling
## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels WSF not in original fit
# Define metric functions (as they were in the original code)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

# 3. Calculate metrics for all three models
metrics <- list(
  LMM_Linear = c( # Renamed for clarity
    RMSE = rmse(obs_all, pred_lmer_all),
    MAE  = mae(obs_all, pred_lmer_all),
    R2   = r2_oos(obs_all, pred_lmer_all)
  ),
  LMM_Quadratic = c( # NEW: Quadratic model metrics
    RMSE = rmse(obs_all, pred_quad_all),
    MAE  = mae(obs_all, pred_quad_all),
    R2   = r2_oos(obs_all, pred_quad_all)
  ),
  GAM_Smooth = c( # Renamed for clarity
    RMSE = rmse(obs_all, pred_gam_all),
    MAE  = mae(obs_all, pred_gam_all),
    R2   = r2_oos(obs_all, pred_gam_all)
  )
)

print(metrics)
## $LMM_Linear
##       RMSE        MAE         R2 
##  0.5650397  0.4485244 -0.1101912 
## 
## $LMM_Quadratic
##        RMSE         MAE          R2 
##  0.55213009  0.44305887 -0.06004089 
## 
## $GAM_Smooth
##        RMSE         MAE          R2 
##  0.54653551  0.44203332 -0.03866752

Plot Results

plot_data <- data.frame(
  Observed = obs_all,
  LMM_Predicted = pred_lmer_all,
  LMM_Quadratic_Predicted = pred_quad_all,
  GAM_Predicted = pred_gam_all
)

p <- ggplot(plot_data, aes(x = Observed)) +
  geom_point(aes(y = LMM_Predicted, color = "LMM"), alpha = 0.6) +
  geom_point(aes(y = LMM_Quadratic_Predicted, color = "LMM Quadratic"), alpha = 0.6) +
  geom_point(aes(y = GAM_Predicted, color = "GAM"), alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(x = "Observed Shannon Diversity", y = "Predicted Shannon Diversity") +
  scale_color_manual(name = "Model", values = c("LMM" = "blue", "LMM Quadratic" = "orange", "GAM" = "red")) +
  theme_minimal()

print(p)

## GAM Model

# Leave-One-Site-Out GAM with LOOCV
library(mgcv)

# Initialize vectors to store observations and predictions
sites <- levels(model_data$Site)
obs_all <- c()
pred_gam_all <- c()

# Determine if we have enough sites for a random effect
use_mixed <- length(sites) >= 3
n_unique <- length(unique(model_data$Cogongrass_Cover))
k_val <- min(5, n_unique - 1)

# Fit GAM on all data
if (use_mixed) {
  cat("Fitting GAM with site random effect.\n")
  fit_gam_full <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
                      data = model_data, method = "ML")
} else {
  cat("Fitting GAM without site random effect.\n")
  fit_gam_full <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                      data = model_data, method = "ML")
}
## Fitting GAM with site random effect.
print(summary(fit_gam_full))
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.26314    0.09581   23.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value    
## s(Cogongrass_Cover) 3.573  4.366 6.476 4.5e-05 ***
## s(Site)             7.366  9.000 5.104 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.262   Deviance explained = 30.2%
## -ML = 147.37  Scale est. = 0.21317   n = 206
# ------------------------------------------
# Leave-One-Site-Out Cross-Validation
# ------------------------------------------
for (s in sites) {
  
  train <- subset(model_data, Site != s)
  test  <- subset(model_data, Site == s)
  
  if (use_mixed) {
    fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
                  data = train, method = "ML")
    p_gam <- predict(fit_cv, newdata = test, exclude = "s(Site)")
  } else {
    fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                  data = train, method = "ML")
    p_gam <- predict(fit_cv, newdata = test)
  }
  
  obs_all <- c(obs_all, test$Shannon_Diversity)
  pred_gam_all <- c(pred_gam_all, p_gam)
}
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels ONF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels WSF not in original fit
# ------------------------------------------
# Calculate LOOCV Metrics
# ------------------------------------------
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

metrics <- c(RMSE = rmse(obs_all, pred_gam_all),
             MAE  = mae(obs_all, pred_gam_all),
             R2   = r2_oos(obs_all, pred_gam_all))

cat("\nLOOCV Metrics:\n")
## 
## LOOCV Metrics:
print(metrics)
##        RMSE         MAE          R2 
##  0.54653551  0.44203332 -0.03866752
# Return the full model and CV metrics
list(model = fit_gam_full, cv_metrics = metrics)
## $model
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Estimated degrees of freedom:
## 3.57 7.37  total = 11.94 
## 
## ML score: 147.3724     
## 
## $cv_metrics
##        RMSE         MAE          R2 
##  0.54653551  0.44203332 -0.03866752

Plot GAM Smooth

plot(shannon_gam_ml, pages = 1, rug = TRUE, se = TRUE)

# Assuming shannon_gam_ml is already fitted from your previous steps
# shannon_gam_ml <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
#                       data = model_data, method = "ML")


# --- 1. Extract data for the smooth term s(Cogongrass_Cover) ---
# We need to create a new data frame to predict from, covering the range of Cogongrass_Cover
# We only need to vary Cogongrass_Cover and keep other variables at a constant or mean value
# (or just not include them if predicting only specific smooth terms).
# The 'exclude' argument in predict.gam helps isolate the smooth term.
# Get one valid site level
site_level <- levels(model_data$Site)[1]

# Create a sequence of Cogongrass_Cover values
cogongrass_range <- range(model_data$Cogongrass_Cover)
pred_data_gam <- data.frame(Cogongrass_Cover = seq(cogongrass_range[1], cogongrass_range[2], length.out = 200))

pred_data_gam$Site <- factor(site_level, levels = levels(model_data$Site))

# Predict the smooth term.
# type="terms" gives the contribution of each term.
# se.fit=TRUE gives standard errors for confidence intervals.
# exclude="s(Site)" tells predict to fix the random effect at 0 (its population average),
# so we only see the smooth effect of Cogongrass_Cover.
gam_pred <- predict(shannon_gam_ml, 
                    newdata = pred_data_gam, 
                    type = "terms", 
                    se.fit = TRUE, 
                    exclude = "s(Site)")

# Combine predictions with the original predictor values
smooth_df <- data.frame(
  Cogongrass_Cover = pred_data_gam$Cogongrass_Cover,
  fit = gam_pred$fit[, "s(Cogongrass_Cover)"], # Extract the specific smooth term
  se = gam_pred$se.fit[, "s(Cogongrass_Cover)"]  # Extract its SE
)

# Calculate confidence intervals
smooth_df$upper <- smooth_df$fit + (2 * smooth_df$se) # Approx 95% CI
smooth_df$lower <- smooth_df$fit - (2 * smooth_df$se) # Approx 95% CI

# --- 2. Get data for the rug plot (actual data points) ---
rug_data <- data.frame(Cogongrass_Cover = model_data$Cogongrass_Cover)


# --- 3. Create the ggplot ---
smooth_plot <- ggplot(smooth_df, aes(x = Cogongrass_Cover, y = fit)) +
  # Add the filled confidence interval band
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey80", alpha = 0.5) +
  # Add the smooth line
  geom_line(color = "steelblue", linewidth = 1) + # Use linewidth instead of size for geom_line
  # Add the rug plot (data point density)
  geom_rug(data = rug_data, aes(x = Cogongrass_Cover), inherit.aes = FALSE,
           sides = "b", length = unit(0.03, "npc"), color = "black", alpha = 0.7) +
  # Labels and Title
  labs(
    x = "Cogongrass Cover (%)",
    y = "Effect on Shannon Diversity"
  ) +
  # Customize theme for publication quality
  theme_minimal(base_size = 14) + # Start with a clean theme, set base font size
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", margin = margin(b = 10)),
    axis.title.x = element_text(margin = margin(t = 10)),
    axis.title.y = element_text(margin = margin(r = 10)),
    panel.grid.minor = element_blank(), # Remove minor grid lines
    panel.grid.major = element_line(color = "grey70", linetype = "dotted"), # Lighter, dotted major grid lines
    axis.line = element_line(color = "black") # Add axis lines for definition
  ) +
  # Set y-axis limits to match the default plot if desired, or let ggplot handle it
  # coord_cartesian(ylim = c(-1.2, 0.4)) # Adjust based on your actual data and desired visual
  NULL # Placeholder for potential further customizations

# Save plot
ggsave("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/03_Figure/GAM_Smooth_Cogon_Diversity.png", plot = smooth_plot,
       width = 8, height = 6, dpi = 300)

Region Specific Analysis

Regional Groups

central_fl <- c("ONF", "WSF")
north_fl   <- c("BRSF", "Escambia", "Jay", "Howell", "Hooper")
south_ms   <- c("CNF", "DSNF", "BCNWR")

Regional Modeling

run_region_models <- function(region_sites, data, min_sites = 3) {
  
  region_data <- subset(data, Site %in% region_sites)
  region_data$Site <- droplevels(region_data$Site)
  
  n_sites <- length(unique(region_data$Site))
  
  cat("\n=============================\n")
  cat("Region Sites:", region_sites, "\n")
  cat("Number of Sites:", n_sites, "\n")
  cat("=============================\n")
  
  use_mixed <- n_sites >= min_sites
  n_unique <- length(unique(region_data$Cogongrass_Cover))
  k_val <- min(5, n_unique - 1)
  
  if (use_mixed) {
    cat("Using mixed-effects GAM.\n")
    fit_gam <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
                     s(Site, bs = "re"),
                   data = region_data, method = "ML")
  } else {
    cat("Fewer than", min_sites, "sites detected. Removing random effect.\n")
    fit_gam <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                   data = region_data, method = "ML")
  }
  
  print(summary(fit_gam))
  
  # --------------------------------------------------
  # Leave-One-Site-Out CV
  # --------------------------------------------------
  
  sites <- levels(region_data$Site)
  obs_all <- pred_gam <- c()
  
  for (s in sites) {
    
    train <- subset(region_data, Site != s)
    test  <- subset(region_data, Site == s)
    
    if (use_mixed) {
      fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
                      s(Site, bs = "re"),
                    data = train, method = "ML")
      p_gam <- predict(fit_cv, newdata = test, exclude = "s(Site)")
    } else {
      fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                    data = train, method = "ML")
      p_gam <- predict(fit_cv, newdata = test)
    }
    
    obs_all <- c(obs_all, test$Shannon_Diversity)
    pred_gam <- c(pred_gam, p_gam)
  }
  
  # --------------------------------------------------
  # Metrics
  # --------------------------------------------------
  
  rmse   <- function(y, yhat) sqrt(mean((y - yhat)^2))
  mae    <- function(y, yhat) mean(abs(y - yhat))
  r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
  
  metrics <- list(
    GAM = c(RMSE = rmse(obs_all, pred_gam),
            MAE  = mae(obs_all, pred_gam),
            R2   = r2_oos(obs_all, pred_gam))
  )
  
  print(metrics)
  
  return(list(model = fit_gam, cv_metrics = metrics))
}

Run Each Model

# Run the function and save output
central_mod <- run_region_models(central_fl, model_data)
## 
## =============================
## Region Sites: ONF WSF 
## Number of Sites: 2 
## =============================
## Fewer than 3 sites detected. Removing random effect.
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.25021    0.07696   29.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df    F  p-value    
## s(Cogongrass_Cover)   1      1 15.2 0.000289 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.218   Deviance explained = 23.3%
## -ML = 42.148  Scale est. = 0.30802   n = 52
## $GAM
##       RMSE        MAE         R2 
##  1.1106755  0.9834191 -2.1940763
north_mod   <- run_region_models(north_fl, model_data)
## 
## =============================
## Region Sites: BRSF Escambia Jay Howell Hooper 
## Number of Sites: 5 
## =============================
## Using mixed-effects GAM.
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.2183     0.1098   20.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df    F p-value  
## s(Cogongrass_Cover) 1.000      1 0.00  0.9829  
## s(Site)             2.656      4 2.34  0.0122 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.111   Deviance explained = 15.5%
## -ML = 46.728  Scale est. = 0.19069   n = 75
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit
## $GAM
##       RMSE        MAE         R2 
##  0.4951541  0.3910516 -0.1587838
south_mod   <- run_region_models(south_ms, model_data)
## 
## =============================
## Region Sites: CNF DSNF BCNWR 
## Number of Sites: 3 
## =============================
## Using mixed-effects GAM.
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.4831     0.0823   30.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value  
## s(Cogongrass_Cover) 2.520  3.069 3.696  0.0158 *
## s(Site)             1.156  2.000 1.947  0.0387 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.168   Deviance explained = 20.8%
## -ML = 55.602  Scale est. = 0.21648   n = 79
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit
## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit
## $GAM
##       RMSE        MAE         R2 
##  0.5682273  0.4246825 -0.2560978

Plot

plot_region_model <- function(mod_obj, region_data, region_label, color = "steelblue") {
  
  fit_gam <- mod_obj$model
  
  # Generate prediction grid over observed range
  cov_range <- range(region_data$Cogongrass_Cover, na.rm = TRUE)
  pred_df <- data.frame(
    Cogongrass_Cover = seq(cov_range[1], cov_range[2], length.out = 200),
    Site = region_data$Site[1]  # placeholder site for re term; excluded below
  )
  
  # Predict excluding random effect so we get the population-level smooth
  preds <- predict(fit_gam, newdata = pred_df, 
                   exclude = "s(Site)", se.fit = TRUE)
  
  pred_df$fit  <- preds$fit
  pred_df$lwr  <- preds$fit - 1.96 * preds$se.fit
  pred_df$upr  <- preds$fit + 1.96 * preds$se.fit
  
  # Pull CV metrics for subtitle
  m <- mod_obj$cv_metrics$GAM
  subtitle_text <- sprintf("LOOCV — RMSE: %.3f | MAE: %.3f | R²: %.3f",
                           m["RMSE"], m["MAE"], m["R2"])
  
  p <- ggplot() +
    # Raw observations
    geom_point(data = region_data,
               aes(x = Cogongrass_Cover, y = Shannon_Diversity, color = Site),
               alpha = 0.6, size = 2) +
    # 95% CI ribbon
    geom_ribbon(data = pred_df,
                aes(x = Cogongrass_Cover, ymin = lwr, ymax = upr),
                fill = color, alpha = 0.2) +
    # Population-level smooth
    geom_line(data = pred_df,
              aes(x = Cogongrass_Cover, y = fit),
              color = color, linewidth = 1.2) +
    labs(
      title    = paste("Shannon Diversity ~ Cogongrass Cover |", region_label),
      subtitle = subtitle_text,
      x        = "Cogongrass Cover (%)",
      y        = "Shannon Diversity",
      color    = "Site"
    ) +
    theme_bw(base_size = 13) +
    theme(
      plot.title    = element_text(face = "bold"),
      plot.subtitle = element_text(size = 10, color = "grey40"),
      legend.position = "right"
    )
  
  return(p)
}

# Subset data per region (mirrors what run_region_models does internally)
central_data <- subset(model_data, Site %in% central_fl)
north_data   <- subset(model_data, Site %in% north_fl)
south_data   <- subset(model_data, Site %in% south_ms)

# Generate plots
p_central <- plot_region_model(central_mod, central_data, "Central Florida",  color = "steelblue")
p_north   <- plot_region_model(north_mod,   north_data,   "North Florida",    color = "darkorange")
p_south   <- plot_region_model(south_mod,   south_data,   "South Mississippi",color = "forestgreen")

# View individually
p_central

p_north

p_south

# arrange together
p_central / p_north / p_south