pacman::p_load(tidyverse, readxl, raster, vegan, ggplot2,
               tigris, sf, sp, plotly, ggrepel, rstanarm, mgcv,
               brms, lme4, caret, MuMIn, VSURF, loo, patchwork)

EnvironmentalOutputs <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Environmental_Outputs.xlsx")

# Tree PCQ Data
tree_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Tree_PCQ")

# Veg Data
Veg_Cover <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Veg_Cover")

# Shrub Cover Data
shrub_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                         sheet = "Shrub_Cover")

# Site Data
CogonSites <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/CogonSites_FL_AL_MS.xlsx")

Number of quadrats sampled per plot

# Count the total number of quadrats per plot
quadrat_count <- Veg_Cover %>%
  group_by(Plot) %>%
  summarize(total_quadrats = n_distinct(Quadrat), .groups = "drop")

Filter All data to only include specified species (Per PLANTS database)

#Filter tree data to only include trees with "tree" in the growth column
tree_data <- dplyr::filter(tree_data, Growth == "Tree")

#Filter Veg Cover to exclude Shrubs and Trees
Veg_Cover <- dplyr::filter(Veg_Cover, Growth != "Shrub" & Growth != "Tree")

#Filter Shrub Cover to only include Shrubs and Trees
shrub_data <- dplyr::filter(shrub_data, Growth == "Shrub" | Growth == "Tree")

Shrub Cover Conversion

# Total length of Shrub cover at a site
shrub_cover <- shrub_data %>%
  mutate(Cover = Line_End - Line_Start) %>%
  group_by(Species_Name, Plot, InvStatus) %>%
  summarise(Total_Cover = sum(Cover, na.rm = TRUE), .groups = "drop") %>%
  mutate(Percent_Cover = Total_Cover / 3000 * 100)

Herbacous Cover Conversion

# Combine Plot and Quadrat columns
Veg_Cover <- Veg_Cover %>%
  mutate(Plot_Quadrat = paste(Plot, Quadrat, sep = '_'))

# Join with CogonSites to get site information
Veg_Cover <- Veg_Cover %>%
  left_join(CogonSites, by = "Plot")

# Sum species cover across quadrats for each species at each plot
veg_cover_summed <- Veg_Cover %>%
  group_by(Plot, Species_Name, InvStatus) %>%
  summarize(total_cover = sum(Cover_Per, na.rm = TRUE), .groups = "drop")

# Calculate average herbaceous species cover
avg_species_cover <- veg_cover_summed %>%
  left_join(quadrat_count, by = "Plot") %>%
  mutate(avg_cover = total_cover / total_quadrats)

Merging Herb cover with Shrub

This species matrix includes herbaceous and shrub species

# Merge shrub cover with herbaceous average cover
combined_cover <- avg_species_cover %>%
  full_join(
    shrub_cover %>%
      select(Plot, Species_Name, Percent_Cover, InvStatus),
    by = c("Plot", "Species_Name")
  ) %>%
  mutate(
    overlap_flag = ifelse(!is.na(avg_cover) & !is.na(Percent_Cover), TRUE, FALSE), # Flag overlaps
    final_cover = case_when(
      !is.na(avg_cover) & is.na(Percent_Cover) ~ avg_cover,  # Use herbaceous cover if no shrub data
      is.na(avg_cover) & !is.na(Percent_Cover) ~ Percent_Cover, # Use shrub cover if no herbaceous data
      TRUE ~ NA_real_ # Leave as NA where overlaps exist
    )
  )

Extract Cogongrass Cover

# Extract cogongrass cover
cogongrass_cover <- combined_cover %>%
  filter(Species_Name == "Imperata_cylindrica") %>%
  select(Plot, final_cover) %>%
  rename(Cogongrass_Cover = final_cover)

Species Matrix

combined_cover <- combined_cover %>%
  filter(Species_Name != "Imperata_cylindrica" , Species_Name != "Imperata_cylindrica_Live") # Remove cogongrass from species matrix

## Remove any non_native species
combined_cover <- combined_cover %>%
  filter(InvStatus.x != "Non_Native") # Remove non-native species from species matrix

species_matrix <- combined_cover %>%
  select(Plot, Species_Name, final_cover) %>%
  pivot_wider(
    names_from = Species_Name,
    values_from = final_cover,
    values_fill = 0
  )

Shannon Diversity

# Calculate Shannon diversity index for each site
shannon_diversity <- species_matrix %>%
  select(-Plot) %>%
  vegan::diversity(index = "shannon") %>%
  as.data.frame() %>%
  setNames("Shannon_Diversity") %>%
  mutate(Plot = species_matrix$Plot)

Merge Shannon Diversity with Cogongrass Cover and Environmental Data

# Merge Shannon diversity with cogongrass cover, tree canopy cover, and environmental outputs
model_data <- shannon_diversity %>%
  left_join(cogongrass_cover, by = "Plot") %>%
  left_join(EnvironmentalOutputs, by = "Plot") %>%
  mutate(Cogongrass_Cover = ifelse(is.na(Cogongrass_Cover), 0, Cogongrass_Cover))

# Inspect to see if there are any columns with only one level
summary(model_data)

##  Shannon_Diversity     Plot           Cogongrass_Cover     Site          
##  Min.   :0.000     Length:206         Min.   : 0.00    Length:206        
##  1st Qu.:2.065     Class :character   1st Qu.: 0.00    Class :character  
##  Median :2.407     Mode  :character   Median : 0.00    Mode  :character  
##  Mean   :2.333                        Mean   :17.08                      
##  3rd Qu.:2.720                        3rd Qu.:29.82                      
##  Max.   :3.552                        Max.   :92.14                      
##                                                                          
##     BurnYear       Camera             Latitude       Longitude     
##  Min.   :2000   Length:206         Min.   :28.48   Min.   :-89.80  
##  1st Qu.:2014   Class :character   1st Qu.:29.56   1st Qu.:-88.91  
##  Median :2022   Mode  :character   Median :30.77   Median :-87.14  
##  Mean   :2018                      Mean   :30.38   Mean   :-86.72  
##  3rd Qu.:2022                      3rd Qu.:30.98   3rd Qu.:-83.58  
##  Max.   :2024                      Max.   :31.59   Max.   :-81.94  
##  NA's   :42                                                        
##     Muname          NLCD_LandCover        Region            SoilType        
##  Length:206         Length:206         Length:206         Length:206        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Status          TreeCanopy_NLCD     aspect        elevation     
##  Length:206         Min.   : 0.00   Min.   :  0.0   Min.   : 17.00  
##  Class :character   1st Qu.:51.50   1st Qu.: 90.0   1st Qu.: 38.00  
##  Mode  :character   Median :75.50   Median :180.0   Median : 59.00  
##                     Mean   :66.22   Mean   :171.7   Mean   : 57.68  
##                     3rd Qu.:86.75   3rd Qu.:270.0   3rd Qu.: 75.00  
##                     Max.   :98.00   Max.   :350.6   Max.   :111.00  
##                                                                     
##       ndvi              pr            slope             sph         
##  Min.   :0.2728   Min.   :3.850   Min.   : 0.000   Min.   :0.01051  
##  1st Qu.:0.4303   1st Qu.:4.345   1st Qu.: 2.149   1st Qu.:0.01074  
##  Median :0.4624   Median :4.592   Median : 2.984   Median :0.01091  
##  Mean   :0.4503   Mean   :4.566   Mean   : 3.601   Mean   :0.01145  
##  3rd Qu.:0.4846   3rd Qu.:4.796   3rd Qu.: 4.688   3rd Qu.:0.01183  
##  Max.   :0.5357   Max.   :5.294   Max.   :18.429   Max.   :0.01325  
##                                                                     
##       srad            tmmn            agb        
##  Min.   :204.7   Min.   :285.7   Min.   :  5.00  
##  1st Qu.:206.8   1st Qu.:286.6   1st Qu.: 74.00  
##  Median :207.0   Median :287.2   Median : 96.00  
##  Mean   :210.6   Mean   :287.4   Mean   : 99.92  
##  3rd Qu.:216.6   3rd Qu.:288.6   3rd Qu.:133.00  
##  Max.   :224.2   Max.   :290.3   Max.   :211.00  
##

Model Fitting

# Ensure Site is a factor
model_data$Site <- as.factor(model_data$Site)

# 1) Refit under ML for fair AIC comparison
shannon_lmer_ml <- lmer(Shannon_Diversity ~ Cogongrass_Cover +
                          (1 | Site),
                        data = model_data, REML = FALSE)

shannon_lmer_quad_ml <- lmer(Shannon_Diversity ~ Cogongrass_Cover + I(Cogongrass_Cover^2) +
                               (1 | Site),
                             data = model_data, REML = FALSE)

## Warning: Some predictor variables are on very different scales: consider
## rescaling

shannon_gam_ml <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
                        s(Site, bs = "re"),
                      data = model_data, method = "ML")

Model Comparison

# AIC table and ΔAIC
aic_tab <- AIC(shannon_lmer_ml, shannon_lmer_quad_ml, shannon_gam_ml)
aic_tab$delta <- aic_tab$AIC - min(aic_tab$AIC)
print(aic_tab)

##                            df      AIC    delta
## shannon_lmer_ml       4.00000 313.5426 30.98326
## shannon_lmer_quad_ml  5.00000 302.0660 19.50660
## shannon_gam_ml       14.33328 282.5594  0.00000

# Check smooth complexity (EDF ~ 1 suggests near-linear)
gam_summ <- summary(shannon_gam_ml)
edf_cover <- gam_summ$s.table[rownames(gam_summ$s.table) == "s(Cogongrass_Cover)", "edf"]
cat("EDF for s(Cogongrass_Cover):", round(edf_cover, 2), "\n")

## EDF for s(Cogongrass_Cover): 3.57

summary(shannon_gam_ml)

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.26314    0.09581   23.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value    
## s(Cogongrass_Cover) 3.573  4.366 6.476 4.5e-05 ***
## s(Site)             7.366  9.000 5.104 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.262   Deviance explained = 30.2%
## -ML = 147.37  Scale est. = 0.21317   n = 206

Leave One Out Cross Validation

# 1. Initialize prediction vectors for all three models
sites <- levels(model_data$Site)

obs_all <- c()
pred_lmer_all <- c()
pred_gam_all  <- c()
pred_quad_all <- c() # New vector for quadratic model predictions

for (s in sites) {
  train <- subset(model_data, Site != s)
  test  <- subset(model_data, Site == s)

  # Fit on training data (ML)
  fit_lmer <- lmer(Shannon_Diversity ~ Cogongrass_Cover + (1 | Site),
                   data = train, REML = FALSE)
  fit_gam  <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
                  data = train, method = "ML")
  # New Model Fit: LMER with Quadratic Term
  fit_quad <- lmer(Shannon_Diversity ~ Cogongrass_Cover + I(Cogongrass_Cover^2) + (1 | Site),
                   data = train, REML = FALSE)

  # Predict on held-out site using population-level effects (exclude site RE)
  # re.form = NA and exclude = "s(Site)" ensure no random effects are used,
  # making the prediction purely a population-level (fixed effects) estimate.
  p_lmer <- predict(fit_lmer, newdata = test, re.form = NA)
  p_gam  <- predict(fit_gam,  newdata = test, exclude = "s(Site)")
  # New Prediction Step: Quadratic Model
  p_quad <- predict(fit_quad, newdata = test, re.form = NA)

  # 2. Collect observations and predictions
  obs_all        <- c(obs_all, test$Shannon_Diversity)
  pred_lmer_all  <- c(pred_lmer_all, p_lmer)
  pred_gam_all   <- c(pred_gam_all, p_gam)
  pred_quad_all  <- c(pred_quad_all, p_quad) # Collect quadratic predictions
}

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels ONF not in original fit

## Warning: Some predictor variables are on very different scales: consider
## rescaling

## Warning in predict.gam(fit_gam, newdata = test, exclude = "s(Site)"): factor
## levels WSF not in original fit

# Define metric functions (as they were in the original code)
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

# 3. Calculate metrics for all three models
metrics <- list(
  LMM_Linear = c( # Renamed for clarity
    RMSE = rmse(obs_all, pred_lmer_all),
    MAE  = mae(obs_all, pred_lmer_all),
    R2   = r2_oos(obs_all, pred_lmer_all)
  ),
  LMM_Quadratic = c( # NEW: Quadratic model metrics
    RMSE = rmse(obs_all, pred_quad_all),
    MAE  = mae(obs_all, pred_quad_all),
    R2   = r2_oos(obs_all, pred_quad_all)
  ),
  GAM_Smooth = c( # Renamed for clarity
    RMSE = rmse(obs_all, pred_gam_all),
    MAE  = mae(obs_all, pred_gam_all),
    R2   = r2_oos(obs_all, pred_gam_all)
  )
)

print(metrics)

## $LMM_Linear
##       RMSE        MAE         R2 
##  0.5650397  0.4485244 -0.1101912 
## 
## $LMM_Quadratic
##        RMSE         MAE          R2 
##  0.55213009  0.44305887 -0.06004089 
## 
## $GAM_Smooth
##        RMSE         MAE          R2 
##  0.54653551  0.44203332 -0.03866752

Plot Results

plot_data <- data.frame(
  Observed = obs_all,
  LMM_Predicted = pred_lmer_all,
  LMM_Quadratic_Predicted = pred_quad_all,
  GAM_Predicted = pred_gam_all
)

p <- ggplot(plot_data, aes(x = Observed)) +
  geom_point(aes(y = LMM_Predicted, color = "LMM"), alpha = 0.6) +
  geom_point(aes(y = LMM_Quadratic_Predicted, color = "LMM Quadratic"), alpha = 0.6) +
  geom_point(aes(y = GAM_Predicted, color = "GAM"), alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(x = "Observed Shannon Diversity", y = "Predicted Shannon Diversity") +
  scale_color_manual(name = "Model", values = c("LMM" = "blue", "LMM Quadratic" = "orange", "GAM" = "red")) +
  theme_minimal()

print(p)

## GAM Model

# Leave-One-Site-Out GAM with LOOCV
library(mgcv)

# Initialize vectors to store observations and predictions
sites <- levels(model_data$Site)
obs_all <- c()
pred_gam_all <- c()

# Determine if we have enough sites for a random effect
use_mixed <- length(sites) >= 3
n_unique <- length(unique(model_data$Cogongrass_Cover))
k_val <- min(5, n_unique - 1)

# Fit GAM on all data
if (use_mixed) {
  cat("Fitting GAM with site random effect.\n")
  fit_gam_full <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
                      data = model_data, method = "ML")
} else {
  cat("Fitting GAM without site random effect.\n")
  fit_gam_full <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                      data = model_data, method = "ML")
}

## Fitting GAM with site random effect.

print(summary(fit_gam_full))

## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.26314    0.09581   23.62   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value    
## s(Cogongrass_Cover) 3.573  4.366 6.476 4.5e-05 ***
## s(Site)             7.366  9.000 5.104 < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.262   Deviance explained = 30.2%
## -ML = 147.37  Scale est. = 0.21317   n = 206

# ------------------------------------------
# Leave-One-Site-Out Cross-Validation
# ------------------------------------------
for (s in sites) {
  
  train <- subset(model_data, Site != s)
  test  <- subset(model_data, Site == s)
  
  if (use_mixed) {
    fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
                  data = train, method = "ML")
    p_gam <- predict(fit_cv, newdata = test, exclude = "s(Site)")
  } else {
    fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                  data = train, method = "ML")
    p_gam <- predict(fit_cv, newdata = test)
  }
  
  obs_all <- c(obs_all, test$Shannon_Diversity)
  pred_gam_all <- c(pred_gam_all, p_gam)
}

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels ONF not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels WSF not in original fit

# ------------------------------------------
# Calculate LOOCV Metrics
# ------------------------------------------
rmse <- function(y, yhat) sqrt(mean((y - yhat)^2))
mae  <- function(y, yhat) mean(abs(y - yhat))
r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)

metrics <- c(RMSE = rmse(obs_all, pred_gam_all),
             MAE  = mae(obs_all, pred_gam_all),
             R2   = r2_oos(obs_all, pred_gam_all))

cat("\nLOOCV Metrics:\n")

## 
## LOOCV Metrics:

print(metrics)

##        RMSE         MAE          R2 
##  0.54653551  0.44203332 -0.03866752

# Return the full model and CV metrics
list(model = fit_gam_full, cv_metrics = metrics)

## $model
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Estimated degrees of freedom:
## 3.57 7.37  total = 11.94 
## 
## ML score: 147.3724     
## 
## $cv_metrics
##        RMSE         MAE          R2 
##  0.54653551  0.44203332 -0.03866752

Plot GAM Smooth

plot(shannon_gam_ml, pages = 1, rug = TRUE, se = TRUE)

# Assuming shannon_gam_ml is already fitted from your previous steps
# shannon_gam_ml <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re"),
#                       data = model_data, method = "ML")


# --- 1. Extract data for the smooth term s(Cogongrass_Cover) ---
# We need to create a new data frame to predict from, covering the range of Cogongrass_Cover
# We only need to vary Cogongrass_Cover and keep other variables at a constant or mean value
# (or just not include them if predicting only specific smooth terms).
# The 'exclude' argument in predict.gam helps isolate the smooth term.
# Get one valid site level
site_level <- levels(model_data$Site)[1]

# Create a sequence of Cogongrass_Cover values
cogongrass_range <- range(model_data$Cogongrass_Cover)
pred_data_gam <- data.frame(Cogongrass_Cover = seq(cogongrass_range[1], cogongrass_range[2], length.out = 200))

pred_data_gam$Site <- factor(site_level, levels = levels(model_data$Site))

# Predict the smooth term.
# type="terms" gives the contribution of each term.
# se.fit=TRUE gives standard errors for confidence intervals.
# exclude="s(Site)" tells predict to fix the random effect at 0 (its population average),
# so we only see the smooth effect of Cogongrass_Cover.
gam_pred <- predict(shannon_gam_ml, 
                    newdata = pred_data_gam, 
                    type = "terms", 
                    se.fit = TRUE, 
                    exclude = "s(Site)")

# Combine predictions with the original predictor values
smooth_df <- data.frame(
  Cogongrass_Cover = pred_data_gam$Cogongrass_Cover,
  fit = gam_pred$fit[, "s(Cogongrass_Cover)"], # Extract the specific smooth term
  se = gam_pred$se.fit[, "s(Cogongrass_Cover)"]  # Extract its SE
)

# Calculate confidence intervals
smooth_df$upper <- smooth_df$fit + (2 * smooth_df$se) # Approx 95% CI
smooth_df$lower <- smooth_df$fit - (2 * smooth_df$se) # Approx 95% CI

# --- 2. Get data for the rug plot (actual data points) ---
rug_data <- data.frame(Cogongrass_Cover = model_data$Cogongrass_Cover)


# --- 3. Create the ggplot ---
smooth_plot <- ggplot(smooth_df, aes(x = Cogongrass_Cover, y = fit)) +
  # Add the filled confidence interval band
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey80", alpha = 0.5) +
  # Add the smooth line
  geom_line(color = "steelblue", linewidth = 1) + # Use linewidth instead of size for geom_line
  # Add the rug plot (data point density)
  geom_rug(data = rug_data, aes(x = Cogongrass_Cover), inherit.aes = FALSE,
           sides = "b", length = unit(0.03, "npc"), color = "black", alpha = 0.7) +
  # Labels and Title
  labs(
    x = "Cogongrass Cover (%)",
    y = "Effect on Shannon Diversity"
  ) +
  # Customize theme for publication quality
  theme_minimal(base_size = 14) + # Start with a clean theme, set base font size
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", margin = margin(b = 10)),
    axis.title.x = element_text(margin = margin(t = 10)),
    axis.title.y = element_text(margin = margin(r = 10)),
    panel.grid.minor = element_blank(), # Remove minor grid lines
    panel.grid.major = element_line(color = "grey70", linetype = "dotted"), # Lighter, dotted major grid lines
    axis.line = element_line(color = "black") # Add axis lines for definition
  ) +
  # Set y-axis limits to match the default plot if desired, or let ggplot handle it
  # coord_cartesian(ylim = c(-1.2, 0.4)) # Adjust based on your actual data and desired visual
  NULL # Placeholder for potential further customizations

# Save plot
ggsave("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/03_Figure/GAM_Smooth_Cogon_Diversity.png", plot = smooth_plot,
       width = 8, height = 6, dpi = 300)

Region Specific Analysis

Regional Groups

central_fl <- c("ONF", "WSF")
north_fl   <- c("BRSF", "Escambia", "Jay", "Howell", "Hooper")
south_ms   <- c("CNF", "DSNF", "BCNWR")

Regional Modeling

run_region_models <- function(region_sites, data, min_sites = 3) {
  
  region_data <- subset(data, Site %in% region_sites)
  region_data$Site <- droplevels(region_data$Site)
  
  n_sites <- length(unique(region_data$Site))
  
  cat("\n=============================\n")
  cat("Region Sites:", region_sites, "\n")
  cat("Number of Sites:", n_sites, "\n")
  cat("=============================\n")
  
  use_mixed <- n_sites >= min_sites
  n_unique <- length(unique(region_data$Cogongrass_Cover))
  k_val <- min(5, n_unique - 1)
  
  if (use_mixed) {
    cat("Using mixed-effects GAM.\n")
    fit_gam <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
                     s(Site, bs = "re"),
                   data = region_data, method = "ML")
  } else {
    cat("Fewer than", min_sites, "sites detected. Removing random effect.\n")
    fit_gam <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                   data = region_data, method = "ML")
  }
  
  print(summary(fit_gam))
  
  # --------------------------------------------------
  # Leave-One-Site-Out CV
  # --------------------------------------------------
  
  sites <- levels(region_data$Site)
  obs_all <- pred_gam <- c()
  
  for (s in sites) {
    
    train <- subset(region_data, Site != s)
    test  <- subset(region_data, Site == s)
    
    if (use_mixed) {
      fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover) +
                      s(Site, bs = "re"),
                    data = train, method = "ML")
      p_gam <- predict(fit_cv, newdata = test, exclude = "s(Site)")
    } else {
      fit_cv <- gam(Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val),
                    data = train, method = "ML")
      p_gam <- predict(fit_cv, newdata = test)
    }
    
    obs_all <- c(obs_all, test$Shannon_Diversity)
    pred_gam <- c(pred_gam, p_gam)
  }
  
  # --------------------------------------------------
  # Metrics
  # --------------------------------------------------
  
  rmse   <- function(y, yhat) sqrt(mean((y - yhat)^2))
  mae    <- function(y, yhat) mean(abs(y - yhat))
  r2_oos <- function(y, yhat) 1 - sum((y - yhat)^2) / sum((y - mean(y))^2)
  
  metrics <- list(
    GAM = c(RMSE = rmse(obs_all, pred_gam),
            MAE  = mae(obs_all, pred_gam),
            R2   = r2_oos(obs_all, pred_gam))
  )
  
  print(metrics)
  
  return(list(model = fit_gam, cv_metrics = metrics))
}

Run Each Model

# Run the function and save output
central_mod <- run_region_models(central_fl, model_data)

## 
## =============================
## Region Sites: ONF WSF 
## Number of Sites: 2 
## =============================
## Fewer than 3 sites detected. Removing random effect.
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover, k = k_val)
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.25021    0.07696   29.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                     edf Ref.df    F  p-value    
## s(Cogongrass_Cover)   1      1 15.2 0.000289 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.218   Deviance explained = 23.3%
## -ML = 42.148  Scale est. = 0.30802   n = 52
## $GAM
##       RMSE        MAE         R2 
##  1.1106755  0.9834191 -2.1940763

north_mod   <- run_region_models(north_fl, model_data)

## 
## =============================
## Region Sites: BRSF Escambia Jay Howell Hooper 
## Number of Sites: 5 
## =============================
## Using mixed-effects GAM.
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.2183     0.1098   20.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df    F p-value  
## s(Cogongrass_Cover) 1.000      1 0.00  0.9829  
## s(Site)             2.656      4 2.34  0.0122 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.111   Deviance explained = 15.5%
## -ML = 46.728  Scale est. = 0.19069   n = 75

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BRSF not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Escambia not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Hooper not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Howell not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels Jay not in original fit

## $GAM
##       RMSE        MAE         R2 
##  0.4951541  0.3910516 -0.1587838

south_mod   <- run_region_models(south_ms, model_data)

## 
## =============================
## Region Sites: CNF DSNF BCNWR 
## Number of Sites: 3 
## =============================
## Using mixed-effects GAM.
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## Shannon_Diversity ~ s(Cogongrass_Cover) + s(Site, bs = "re")
## 
## Parametric coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.4831     0.0823   30.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                       edf Ref.df     F p-value  
## s(Cogongrass_Cover) 2.520  3.069 3.696  0.0158 *
## s(Site)             1.156  2.000 1.947  0.0387 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.168   Deviance explained = 20.8%
## -ML = 55.602  Scale est. = 0.21648   n = 79

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels BCNWR not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels CNF not in original fit

## Warning in predict.gam(fit_cv, newdata = test, exclude = "s(Site)"): factor
## levels DSNF not in original fit

## $GAM
##       RMSE        MAE         R2 
##  0.5682273  0.4246825 -0.2560978

Plot

plot_region_model <- function(mod_obj, region_data, region_label, color = "steelblue") {
  
  fit_gam <- mod_obj$model
  
  # Generate prediction grid over observed range
  cov_range <- range(region_data$Cogongrass_Cover, na.rm = TRUE)
  pred_df <- data.frame(
    Cogongrass_Cover = seq(cov_range[1], cov_range[2], length.out = 200),
    Site = region_data$Site[1]  # placeholder site for re term; excluded below
  )
  
  # Predict excluding random effect so we get the population-level smooth
  preds <- predict(fit_gam, newdata = pred_df, 
                   exclude = "s(Site)", se.fit = TRUE)
  
  pred_df$fit  <- preds$fit
  pred_df$lwr  <- preds$fit - 1.96 * preds$se.fit
  pred_df$upr  <- preds$fit + 1.96 * preds$se.fit
  
  # Pull CV metrics for subtitle
  m <- mod_obj$cv_metrics$GAM
  subtitle_text <- sprintf("LOOCV — RMSE: %.3f | MAE: %.3f | R²: %.3f",
                           m["RMSE"], m["MAE"], m["R2"])
  
  p <- ggplot() +
    # Raw observations
    geom_point(data = region_data,
               aes(x = Cogongrass_Cover, y = Shannon_Diversity, color = Site),
               alpha = 0.6, size = 2) +
    # 95% CI ribbon
    geom_ribbon(data = pred_df,
                aes(x = Cogongrass_Cover, ymin = lwr, ymax = upr),
                fill = color, alpha = 0.2) +
    # Population-level smooth
    geom_line(data = pred_df,
              aes(x = Cogongrass_Cover, y = fit),
              color = color, linewidth = 1.2) +
    labs(
      title    = paste("Shannon Diversity ~ Cogongrass Cover |", region_label),
      subtitle = subtitle_text,
      x        = "Cogongrass Cover (%)",
      y        = "Shannon Diversity",
      color    = "Site"
    ) +
    theme_bw(base_size = 13) +
    theme(
      plot.title    = element_text(face = "bold"),
      plot.subtitle = element_text(size = 10, color = "grey40"),
      legend.position = "right"
    )
  
  return(p)
}

# Subset data per region (mirrors what run_region_models does internally)
central_data <- subset(model_data, Site %in% central_fl)
north_data   <- subset(model_data, Site %in% north_fl)
south_data   <- subset(model_data, Site %in% south_ms)

# Generate plots
p_central <- plot_region_model(central_mod, central_data, "Central Florida",  color = "steelblue")
p_north   <- plot_region_model(north_mod,   north_data,   "North Florida",    color = "darkorange")
p_south   <- plot_region_model(south_mod,   south_data,   "South Mississippi",color = "forestgreen")

# View individually
p_central

p_north

p_south

# arrange together
p_central / p_north / p_south

O1_Veg_Div

Alan Ivory

2026-02-27

Number of quadrats sampled per plot

Filter All data to only include specified species (Per PLANTS database)

Shrub Cover Conversion

Herbacous Cover Conversion

Merging Herb cover with Shrub

Extract Cogongrass Cover

Species Matrix

Shannon Diversity

Merge Shannon Diversity with Cogongrass Cover and Environmental Data

Model Comparison

Leave One Out Cross Validation

Plot Results

Plot GAM Smooth

Region Specific Analysis

Regional Groups

Regional Modeling

Run Each Model

Plot