pacman::p_load(randomForest, Boruta, caret, tidyverse, vip, pdp,
               sf, corrplot, gridExtra, spdep, adespatial, tibble,
               readxl, terra, patchwork)

EnvironmentalOutputs <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Environmental_Outputs.xlsx")

social_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Social_Outputs.xlsx")

# Merge Environmental and Social Data
EnvironmentalOutputs <- EnvironmentalOutputs %>%
  left_join(social_data, by = "Plot")

# Tree PCQ Data
tree_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Tree_PCQ")

# Veg Data
Veg_Cover <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Veg_Cover")

# Shrub Cover Data
shrub_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                         sheet = "Shrub_Cover")

# Site Data
CogonSites <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/CogonSites_FL_AL_MS.xlsx")

Extract data from tif

agb_raster <- rast("I:/Cogongrass_Project/data/TreeMap2022_CONUS_DRYBIO_L.tif")

points <- vect(EnvironmentalOutputs,
               geom = c("Longitude", "Latitude"),
               crs = "EPSG:4326")

agb_vals <- extract(agb_raster, points)
## Warning: [extract] transforming vector data to the CRS of the raster
EnvironmentalOutputs$agb <- agb_vals[,2]

Extract Soil Data

# Load the raster
r_out <- rast("I:/Cogongrass_Project/data/Soils/taxorder_CONUS_30m.tif")

# Reproject points to match raster CRS and extract soil values
points_proj <- project(points, crs(r_out))
soil_vals <- extract(r_out, points_proj)

# Add soil type to EnvironmentalOutputs
EnvironmentalOutputs$SoilType <- soil_vals$taxorder
table(EnvironmentalOutputs$SoilType)
## 
##   1   4   9  10 
##  31  31   1 143
# Convert numeric IDs to labeled factor
soil_lookup <- data.frame(
  ID = 1:11,
  SoilOrder = c("Alfisols","Andisols","Aridisols","Entisols","Gelisols",
                "Histosols","Inceptisols","Mollisols","Spodosols","Ultisols","Vertisols")
)

soil_vals$SoilType <- soil_lookup$SoilOrder[soil_vals$taxorder]

EnvironmentalOutputs$SoilType <- as.factor(soil_vals$SoilType)

Number of quadrats sampled per plot

# Count the total number of quadrats per plot
quadrat_count <- Veg_Cover %>%
  group_by(Plot) %>%
  summarize(total_quadrats = n_distinct(Quadrat), .groups = "drop")

Filter All data to only include specified species (Per PLANTS database)

#Filter tree data to only include trees with "tree" in the growth column
tree_data <- dplyr::filter(tree_data, Growth == "Tree")

#Filter Veg Cover to exclude Shrubs and Trees
Veg_Cover <- dplyr::filter(Veg_Cover, Growth != "Shrub" & Growth != "Tree")

#Filter Shrub Cover to only include Shrubs and Trees
shrub_data <- dplyr::filter(shrub_data, Growth == "Shrub" | Growth == "Tree")

Shrub Cover Conversion

# Total length of Shrub cover at a site
shrub_cover <- shrub_data %>%
  mutate(Cover = Line_End - Line_Start) %>%
  group_by(Species_Name, Plot, InvStatus) %>%
  summarise(Total_Cover = sum(Cover, na.rm = TRUE), .groups = "drop") %>%
  mutate(Percent_Cover = Total_Cover / 3000 * 100)

Herbacous Cover Conversion

# Combine Plot and Quadrat columns
Veg_Cover <- Veg_Cover %>%
  mutate(Plot_Quadrat = paste(Plot, Quadrat, sep = '_'))

# Join with CogonSites to get site information
Veg_Cover <- Veg_Cover %>%
  left_join(CogonSites, by = "Plot")

# Sum species cover across quadrats for each species at each plot
veg_cover_summed <- Veg_Cover %>%
  group_by(Plot, Species_Name, InvStatus) %>%
  summarize(total_cover = sum(Cover_Per, na.rm = TRUE), .groups = "drop")

# Calculate average herbaceous species cover
avg_species_cover <- veg_cover_summed %>%
  left_join(quadrat_count, by = "Plot") %>%
  mutate(avg_cover = total_cover / total_quadrats)

Merging Herb cover with Shrub

This species matrix includes herbaceous and shrub species

# Merge shrub cover with herbaceous average cover
combined_cover <- avg_species_cover %>%
  full_join(
    shrub_cover %>%
      dplyr::select(Plot, Species_Name, Percent_Cover, InvStatus),
    by = c("Plot", "Species_Name")
  ) %>%
  mutate(
    overlap_flag = ifelse(!is.na(avg_cover) & !is.na(Percent_Cover), TRUE, FALSE), # Flag overlaps
    final_cover = case_when(
      !is.na(avg_cover) & is.na(Percent_Cover) ~ avg_cover,  # Use herbaceous cover if no shrub data
      is.na(avg_cover) & !is.na(Percent_Cover) ~ Percent_Cover, # Use shrub cover if no herbaceous data
      TRUE ~ NA_real_ # Leave as NA where overlaps exist
    )
  )

Extract Cogongrass Cover

# Extract cogongrass cover
cogongrass_cover <- combined_cover %>%
  filter(Species_Name == "Imperata_cylindrica") %>%
  dplyr::select(Plot, final_cover) %>%
  rename(Cogongrass_Cover = final_cover)

Species Matrix

combined_cover <- combined_cover %>%
  filter(Species_Name != "Imperata_cylindrica" , Species_Name != "Imperata_cylindrica_Live") # Remove cogongrass from species matrix

## Remove any non_native species
combined_cover <- combined_cover %>%
  filter(InvStatus.x != "Non_Native") # Remove non-native species from species matrix

species_matrix <- combined_cover %>%
  dplyr::select(Plot, Species_Name, final_cover) %>%
  pivot_wider(
    names_from = Species_Name,
    values_from = final_cover,
    values_fill = 0
  )

Shannon Diversity

# Calculate Shannon diversity index for each site
shannon_diversity <- species_matrix %>%
  dplyr::select(-Plot) %>%
  vegan::diversity(index = "shannon") %>%
  as.data.frame() %>%
  setNames("Shannon_Diversity") %>%
  mutate(Plot = species_matrix$Plot)

Merge Shannon Diversity with Cogongrass Cover and Environmental Data

# Merge Shannon diversity with cogongrass cover, tree canopy cover, and environmental outputs
model_data <- shannon_diversity %>%
  left_join(cogongrass_cover, by = "Plot") %>%
  left_join(EnvironmentalOutputs, by = "Plot") %>%
  mutate(Cogongrass_Cover = ifelse(is.na(Cogongrass_Cover), 0, Cogongrass_Cover))

# Inspect to see if there are any columns with only one level
summary(model_data)
##  Shannon_Diversity     Plot           Cogongrass_Cover     Site          
##  Min.   :0.000     Length:206         Min.   : 0.00    Length:206        
##  1st Qu.:2.065     Class :character   1st Qu.: 0.00    Class :character  
##  Median :2.407     Mode  :character   Median : 0.00    Mode  :character  
##  Mean   :2.333                        Mean   :17.08                      
##  3rd Qu.:2.720                        3rd Qu.:29.82                      
##  Max.   :3.552                        Max.   :92.14                      
##                                                                          
##     BurnYear       Camera             Latitude       Longitude     
##  Min.   :2000   Length:206         Min.   :28.48   Min.   :-89.80  
##  1st Qu.:2014   Class :character   1st Qu.:29.56   1st Qu.:-88.91  
##  Median :2022   Mode  :character   Median :30.77   Median :-87.14  
##  Mean   :2018                      Mean   :30.38   Mean   :-86.72  
##  3rd Qu.:2022                      3rd Qu.:30.98   3rd Qu.:-83.58  
##  Max.   :2024                      Max.   :31.59   Max.   :-81.94  
##  NA's   :42                                                        
##     Muname          NLCD_LandCover        Region             Status         
##  Length:206         Length:206         Length:206         Length:206        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##       SoilType     Soil_Group  TreeCanopy_NLCD     aspect        elevation     
##  Alfisols : 31   Min.   : 18   Min.   : 0.00   Min.   :  0.0   Min.   : 17.00  
##  Entisols : 31   1st Qu.:377   1st Qu.:51.50   1st Qu.: 90.0   1st Qu.: 38.00  
##  Spodosols:  1   Median :381   Median :75.50   Median :180.0   Median : 59.00  
##  Ultisols :143   Mean   :330   Mean   :66.22   Mean   :171.7   Mean   : 57.68  
##                  3rd Qu.:389   3rd Qu.:86.75   3rd Qu.:270.0   3rd Qu.: 75.00  
##                  Max.   :389   Max.   :98.00   Max.   :350.6   Max.   :111.00  
##                                                                                
##       ndvi              pr            slope             sph         
##  Min.   :0.2728   Min.   :3.850   Min.   : 0.000   Min.   :0.01051  
##  1st Qu.:0.4303   1st Qu.:4.345   1st Qu.: 2.149   1st Qu.:0.01074  
##  Median :0.4624   Median :4.592   Median : 2.984   Median :0.01091  
##  Mean   :0.4503   Mean   :4.566   Mean   : 3.601   Mean   :0.01145  
##  3rd Qu.:0.4846   3rd Qu.:4.796   3rd Qu.: 4.688   3rd Qu.:0.01183  
##  Max.   :0.5357   Max.   :5.294   Max.   :18.429   Max.   :0.01325  
##                                                                     
##       srad            tmmn            agb         avg_pop_den_1km   
##  Min.   :204.7   Min.   :285.7   Min.   :  6.90   Min.   :  0.0000  
##  1st Qu.:206.8   1st Qu.:286.6   1st Qu.: 21.21   1st Qu.:  0.5956  
##  Median :207.0   Median :287.2   Median : 35.82   Median :  2.1841  
##  Mean   :210.6   Mean   :287.4   Mean   : 38.51   Mean   :  8.3667  
##  3rd Qu.:216.6   3rd Qu.:288.6   3rd Qu.: 50.89   3rd Qu.:  5.2578  
##  Max.   :224.2   Max.   :290.3   Max.   :112.06   Max.   :122.9795  
##                                  NA's   :30                         
##  road_density_1km
##  Min.   :0.000   
##  1st Qu.:1.771   
##  Median :3.103   
##  Mean   :3.037   
##  3rd Qu.:3.810   
##  Max.   :6.932   
## 

Check Distribution of Response Variable

hist(model_data$Shannon_Diversity, breaks=30, main="Histogram of Shannon Diversity", xlab="Shannon Diversity")

summary(model_data$Shannon_Diversity)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.065   2.407   2.333   2.720   3.552

Moran’s Eigenvector Maps

  • Low numbered MEMs capture broad-scale spatial patterns, while higher numbered MEMs capture finer-scale patterns.
# Extract coordinates
coords <- model_data[, c("Longitude", "Latitude")]
coords_matrix <- as.matrix(coords)

# Build a spatial neighbours list using a distance-based threshold.
# We use the minimum distance that ensures every plot has at least one neighbour.
nb <- dnearneigh(coords_matrix, d1 = 0, d2 = max(nbdists(
  dnearneigh(coords_matrix, d1 = 0, d2 = 5), coords_matrix)[[1]]) * 1.5)

# If the above produces isolates, fall back to k-nearest neighbours (k=4)
if (any(card(nb) == 0)) {
  cat("  Some plots had no neighbours — switching to k=4 nearest neighbours.\n")
  nb <- knn2nb(knearneigh(coords_matrix, k = 4))
}

# Convert neighbours to spatial weights (binary, row-standardised)
listw <- nb2listw(nb, style = "W", zero.policy = TRUE)

# Compute Moran's I on raw response to confirm spatial autocorrelation exists
moran_test <- moran.test(model_data$Shannon_Diversity, listw, zero.policy = TRUE)
cat(sprintf("  Moran's I = %.4f, p = %.4f\n",
            moran_test$estimate["Moran I statistic"],
            moran_test$p.value))
##   Moran's I = -0.0008, p = 0.0233
if (moran_test$p.value < 0.05) {
  cat("  >> Significant spatial autocorrelation detected — MEMs are warranted.\n\n")
} else {
  cat("  >> No significant spatial autocorrelation — MEMs may have limited benefit.\n\n")
}
##   >> Significant spatial autocorrelation detected — MEMs are warranted.
# Build MEMs from the spatial weights matrix
mem_out <- mem(listw)                   # all MEMs with positive eigenvalues
mem_scores <- as.data.frame(mem_out)

# Select only MEMs significantly correlated with Shannon Diversity (p < 0.05)
# This avoids adding irrelevant spatial axes and overfitting
mem_pvals <- apply(mem_scores, 2, function(mem_col) {
  cor.test(mem_col, model_data$Shannon_Diversity)$p.value
})
sig_mems <- names(mem_pvals[mem_pvals < 0.05])

if (length(sig_mems) == 0) {
  cat("  No MEMs significantly correlated with Shannon Diversity (p < 0.05).\n")
  cat("  Using top 3 MEMs by correlation strength instead.\n")
  sig_mems <- names(sort(mem_pvals))[1:min(3, length(mem_pvals))]
}

cat(sprintf("  Total MEMs with positive eigenvalues : %d\n", ncol(mem_scores)))
##   Total MEMs with positive eigenvalues : 205
cat(sprintf("  MEMs retained (p < 0.05)             : %d\n", length(sig_mems)))
##   MEMs retained (p < 0.05)             : 12
cat(sprintf("  Retained MEMs: %s\n\n", paste(sig_mems, collapse = ", ")))
##   Retained MEMs: MEM2, MEM5, MEM23, MEM65, MEM72, MEM102, MEM118, MEM147, MEM149, MEM160, MEM194, MEM203
mem_selected <- mem_scores[, sig_mems, drop = FALSE]

Model Data Preparation

set.seed(97)
# Ensure the data is correctly formatted:
# convert categorical variables to numeric
model_data$NLCD_LandCover <- as.numeric(as.factor(model_data$NLCD_LandCover))
model_data$SoilType <- as.numeric(as.factor(model_data$SoilType))
model_data$Site <- as.numeric(as.factor(model_data$Site))

str(model_data)
## 'data.frame':    206 obs. of  26 variables:
##  $ Shannon_Diversity: num  2.69 2.65 2.54 1.61 2.97 ...
##  $ Plot             : chr  "BI200" "BI201" "BI202" "BI97" ...
##  $ Cogongrass_Cover : num  22.1 10.7 39.6 21.1 2.5 ...
##  $ Site             : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ BurnYear         : num  2022 2022 2022 2022 2022 ...
##  $ Camera           : chr  "NA" "S-4K" "NA" "NA" ...
##  $ Latitude         : num  30.8 30.8 30.8 30.9 30.8 ...
##  $ Longitude        : num  -86.9 -86.9 -86.8 -86.9 -86.9 ...
##  $ Muname           : chr  "TroupLoamySand_0_to_5_percent_slopes" "TroupLoamySand_0_to_5_percent_slopes" "TroupLoamySand_0_to_5_percent_slopes" "FuquayLoamySand_0_to_5_percent_slopes" ...
##  $ NLCD_LandCover   : num  4 4 4 4 4 4 4 4 4 4 ...
##  $ Region           : chr  "FA" "FA" "FA" "FA" ...
##  $ Status           : chr  "Invaded" "Invaded" "Invaded" "Invaded" ...
##  $ SoilType         : num  4 4 4 4 4 4 4 4 4 4 ...
##  $ Soil_Group       : num  381 381 381 389 381 381 381 381 381 389 ...
##  $ TreeCanopy_NLCD  : num  66 94 70 59 83 63 66 71 82 54 ...
##  $ aspect           : num  0 246.8 180 246.8 13.1 ...
##  $ elevation        : num  72 70 71 48 58 71 65 55 61 43 ...
##  $ ndvi             : num  0.46 0.475 0.461 0.468 0.466 ...
##  $ pr               : num  4.76 4.76 4.81 4.8 4.82 ...
##  $ slope            : num  0.927 2.349 1.854 2.351 4.751 ...
##  $ sph              : num  0.0109 0.0109 0.0108 0.0107 0.0107 ...
##  $ srad             : num  206 206 207 207 207 ...
##  $ tmmn             : num  287 287 287 287 287 ...
##  $ agb              : num  45 45 27.5 19.2 18.5 ...
##  $ avg_pop_den_1km  : num  0.00479 0.00305 0.57717 2.00298 1.05855 ...
##  $ road_density_1km : num  6.07 6.44 6.29 2.34 5.35 ...
env_predictors <- model_data[, c("Cogongrass_Cover", "elevation", "aspect",
                                  "slope", "pr", "sph", "srad", "tmmn",
                                  "TreeCanopy_NLCD", "NLCD_LandCover",
                                  "SoilType", "ndvi", "avg_pop_den_1km",
                                  "road_density_1km", "agb")]


# Combine environmental predictors + significant MEMs
all_predictors <- cbind(env_predictors, mem_selected)

model_df <- cbind(Shannon_Diversity = model_data$Shannon_Diversity,
                  all_predictors) %>%
  na.omit()

cat(sprintf("  Environmental predictors : %d\n", ncol(env_predictors)))
##   Environmental predictors : 15
cat(sprintf("  Spatial MEM predictors   : %d\n", ncol(mem_selected)))
##   Spatial MEM predictors   : 12
cat(sprintf("  Total predictors         : %d\n", ncol(all_predictors)))
##   Total predictors         : 27
cat(sprintf("  Observations (post-NA)   : %d\n\n", nrow(model_df)))
##   Observations (post-NA)   : 176

Correlation Check

cor_matrix <- cor(model_df, use = "complete.obs")

png("correlation_matrix.png", width = 900, height = 800, res = 120)
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.75,
         addCoef.col = "black", number.cex = 0.55,
         title = "Predictor Correlation Matrix", mar = c(0,0,1,0))
dev.off()
## png 
##   2
cat("Correlation matrix saved.\n")
## Correlation matrix saved.
# Compute correlation matrix for predictors only (exclude response)
pred_cor <- cor(model_df[, -1], use = "complete.obs")  # remove Shannon_Diversity

# Find pairs with |r| > 0.9
high_cor_pairs <- findCorrelation(pred_cor, cutoff = 0.9, names = TRUE, verbose = TRUE)
## Compare row 5  and column  8 with corr  0.91 
##   Means:  0.22 vs 0.09 so flagging column 5 
## Compare row 8  and column  6 with corr  0.913 
##   Means:  0.191 vs 0.08 so flagging column 8 
## All correlations <= 0.9
# Ensure 'pr' is always kept
if ("pr" %in% high_cor_pairs) {
  # Get all variables highly correlated with 'pr'
  pr_cor_vars <- names(which(abs(pred_cor["pr", ]) > 0.9))
  pr_cor_vars <- setdiff(pr_cor_vars, "pr")  # exclude pr itself

  # Remove these variables instead of pr
  high_cor_pairs <- setdiff(high_cor_pairs, "pr")
  high_cor_pairs <- unique(c(high_cor_pairs, pr_cor_vars))
}

# Remove highly correlated predictors
if (length(high_cor_pairs) > 0) {
  cat(sprintf("  Removing %d highly correlated predictors (|r| > 0.9) while keeping 'pr': %s\n",
              length(high_cor_pairs),
              paste(high_cor_pairs, collapse = ", ")))
  model_df <- model_df[, !(colnames(model_df) %in% high_cor_pairs)]
} else {
  cat("  No highly correlated predictors (|r| > 0.9) found — no removals necessary.\n")
}
##   Removing 1 highly correlated predictors (|r| > 0.9) while keeping 'pr': tmmn

Variable Selection- Boruta

cat("========== STEP 3: Boruta Variable Selection ==========\n")
## ========== STEP 3: Boruta Variable Selection ==========
cat("  Running Boruta (this may take a minute)...\n")
##   Running Boruta (this may take a minute)...
boruta_out <- Boruta(Shannon_Diversity ~ ., data = model_df,
                     doTrace = 1, maxRuns = 150, num.trees = 500)
## After 12 iterations, +1.3 secs:
##  confirmed 5 attributes: avg_pop_den_1km, Cogongrass_Cover, MEM203, MEM23, pr;
##  rejected 5 attributes: agb, aspect, MEM72, NLCD_LandCover, road_density_1km;
##  still have 16 attributes left.
## After 16 iterations, +1.7 secs:
##  confirmed 2 attributes: MEM118, srad;
##  rejected 2 attributes: MEM102, slope;
##  still have 12 attributes left.
## After 19 iterations, +2 secs:
##  confirmed 1 attribute: elevation;
##  rejected 1 attribute: ndvi;
##  still have 10 attributes left.
## After 29 iterations, +2.9 secs:
##  rejected 3 attributes: MEM160, MEM194, MEM65;
##  still have 7 attributes left.
## After 32 iterations, +3.1 secs:
##  confirmed 1 attribute: TreeCanopy_NLCD;
##  still have 6 attributes left.
## After 37 iterations, +3.6 secs:
##  confirmed 1 attribute: MEM5;
##  still have 5 attributes left.
## After 64 iterations, +5.5 secs:
##  confirmed 1 attribute: MEM2;
##  still have 4 attributes left.
## After 115 iterations, +9.2 secs:
##  rejected 1 attribute: SoilType;
##  still have 3 attributes left.
## After 136 iterations, +11 secs:
##  rejected 1 attribute: MEM147;
##  still have 2 attributes left.
boruta_final <- TentativeRoughFix(boruta_out)
selected_vars <- getSelectedAttributes(boruta_final, withTentative = FALSE)

# Separate selected vars into environmental and spatial
selected_env  <- selected_vars[!grepl("^MEM", selected_vars)]
selected_mems <- selected_vars[grepl("^MEM", selected_vars)]

cat(sprintf("\n  Variables confirmed       : %d\n", length(selected_vars)))
## 
##   Variables confirmed       : 11
cat(sprintf("  Environmental confirmed   : %d  (%s)\n",
            length(selected_env), paste(selected_env, collapse = ", ")))
##   Environmental confirmed   : 6  (Cogongrass_Cover, elevation, pr, srad, TreeCanopy_NLCD, avg_pop_den_1km)
cat(sprintf("  MEM predictors confirmed  : %d  (%s)\n\n",
            length(selected_mems),
            if (length(selected_mems) > 0) paste(selected_mems, collapse = ", ") else "none"))
##   MEM predictors confirmed  : 5  (MEM2, MEM5, MEM23, MEM118, MEM203)
# Boruta importance plot
png("boruta_spatial_importance.png", width = 1000, height = 600, res = 120)
par(mar = c(11, 4, 3, 1))
plot(boruta_final, las = 2, xlab = "",
     main = "")
dev.off()
## png 
##   2

Train/Test Split

model_df_sel <- model_df[, c("Shannon_Diversity", selected_vars)]
train_idx  <- createDataPartition(model_df_sel$Shannon_Diversity, p = 0.8, list = FALSE)
train_data <- model_df_sel[ train_idx, ]
test_data  <- model_df_sel[-train_idx, ]

Tune Random Forest via Cross-Validation

cat("========== STEP 4: Tuning Random Forest (10-fold CV) ==========\n")
## ========== STEP 4: Tuning Random Forest (10-fold CV) ==========
ctrl      <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
tune_grid <- expand.grid(mtry = seq(2, max(2, floor(sqrt(length(selected_vars))) + 3), by = 1))

rf_tuned <- train(Shannon_Diversity ~ .,
                  data       = train_data,
                  method     = "rf",
                  trControl  = ctrl,
                  tuneGrid   = tune_grid,
                  ntree      = 500,
                  importance = TRUE)

best_mtry <- rf_tuned$bestTune$mtry
cat(sprintf("  Best mtry: %d\n\n", best_mtry))
##   Best mtry: 6
rf_final <- randomForest(Shannon_Diversity ~ .,
                         data       = train_data,
                         ntree      = 1000,
                         mtry       = best_mtry,
                         importance = TRUE)

print(rf_final)
## 
## Call:
##  randomForest(formula = Shannon_Diversity ~ ., data = train_data,      ntree = 1000, mtry = best_mtry, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 0.2167753
##                     % Var explained: 27.48

Evaluate on Test Set

cat("========== STEP 5: Model Evaluation ==========\n")
## ========== STEP 5: Model Evaluation ==========
pred_test <- predict(rf_final, newdata = test_data)
bias_val <- mean(pred_test - test_data$Shannon_Diversity)
rel_bias <- bias_val / mean(test_data$Shannon_Diversity) * 100
rmse_val  <- sqrt(mean((pred_test - test_data$Shannon_Diversity)^2))
mae_val   <- mean(abs(pred_test - test_data$Shannon_Diversity))
ss_res    <- sum((pred_test - test_data$Shannon_Diversity)^2)
ss_tot    <- sum((test_data$Shannon_Diversity - mean(test_data$Shannon_Diversity))^2)
r2_val    <- 1 - ss_res / ss_tot

cat(sprintf("  RMSE : %.4f\n", rmse_val))
##   RMSE : 0.4735
cat(sprintf("  MAE  : %.4f\n", mae_val))
##   MAE  : 0.3702
cat(sprintf("  Bias : %.4f\n", bias_val))
##   Bias : -0.0506
cat(sprintf("  Relative Bias : %.2f%%\n", rel_bias))
##   Relative Bias : -2.12%
cat(sprintf("  R²   : %.4f\n\n", r2_val))
##   R²   : 0.1841
# Predicted vs observed plot
pred_obs_df <- data.frame(Observed = test_data$Shannon_Diversity,
                          Predicted = pred_test)

p_pred <- ggplot(pred_obs_df, aes(x = Observed, y = Predicted)) +
  geom_point(alpha = 0.7, colour = "#2E8B57", size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey30") +
  annotate("text", x = min(pred_obs_df$Observed) + 0.05,
           y = max(pred_obs_df$Predicted) - 0.1,
           label = sprintf("R² = %.3f\nRMSE = %.3f", r2_val, rmse_val),
           hjust = 0, size = 4.5) +
  labs(title = "",
       x = "Observed", y = "Predicted") +
  theme_bw(base_size = 13)

ggsave("predicted_vs_observed_spatial.png", p_pred, width = 6, height = 5, dpi = 150)

Residual Spatial Autocorrelation

  • If MEMs successfully absorbed spatial structure, residuals should show no significant Moran’s I. If autocorrelation remains, consider adding more MEMs or switching to a spatial RF approach.
# Get residuals for ALL observations (training + test) using out-of-bag for train
oob_pred   <- rf_final$predicted                          # OOB predictions (train set)
test_resid <- test_data$Shannon_Diversity - pred_test

# Full-dataset residuals for Moran test — predict all from final model
all_data_sel <- model_df_sel
all_preds    <- predict(rf_final, newdata = all_data_sel)
all_resid    <- all_data_sel$Shannon_Diversity - all_preds

# Match residuals back to spatial coordinates (using row indices surviving na.omit)
resid_df <- data.frame(
  resid = all_resid,
  Longitude = model_data$Longitude[as.integer(names(all_resid))],
  Latitude  = model_data$Latitude[as.integer(names(all_resid))]
)
resid_df <- resid_df[!is.na(resid_df$Longitude), ]

resid_coords <- as.matrix(resid_df[, c("Longitude", "Latitude")])
nb_resid     <- knn2nb(knearneigh(resid_coords, k = 4))
## Warning in knn2nb(knearneigh(resid_coords, k = 4)): neighbour object has 12
## sub-graphs
listw_resid  <- nb2listw(nb_resid, style = "W")

moran_resid  <- moran.test(resid_df$resid, listw_resid)
cat(sprintf("  Residual Moran's I = %.4f, p = %.4f\n",
            moran_resid$estimate["Moran I statistic"],
            moran_resid$p.value))
##   Residual Moran's I = -0.0879, p = 0.9554
if (moran_resid$p.value < 0.05) {
  cat("  >> Residual spatial autocorrelation remains. Consider:\n")
  cat("     - Lowering the MEM p-value threshold to retain more MEMs\n")
  cat("     - Adding Site or Region as a predictor\n")
  cat("     - Trying a Spatial Random Forest (SpatialML package)\n\n")
} else {
  cat("  >> No significant residual autocorrelation — MEMs successfully\n")
  cat("     absorbed spatial structure.\n\n")
}
##   >> No significant residual autocorrelation — MEMs successfully
##      absorbed spatial structure.
# Map of residuals
resid_sf <- st_as_sf(resid_df, coords = c("Longitude", "Latitude"), crs = 4326)

p_resid_map <- ggplot(resid_df, aes(x = Longitude, y = Latitude, colour = resid)) +
  geom_point(size = 3, alpha = 0.85) +
  scale_colour_gradient2(low = "#d73027", mid = "white", high = "#1a9850",
                         midpoint = 0, name = "Residual") +
  labs(title = "",
       subtitle = sprintf("Moran's I = %.3f, p = %.3f",
                          moran_resid$estimate["Moran I statistic"],
                          moran_resid$p.value)) +
  theme_bw(base_size = 12)

ggsave("residual_map.png", p_resid_map, width = 7, height = 5, dpi = 150)
cat("  Residual map saved.\n")
##   Residual map saved.

Variable Importance Plot

imp_df <- as.data.frame(importance(rf_final)) %>%
  rownames_to_column("Variable") %>%
  arrange(desc(`%IncMSE`)) %>%
  mutate(Type = ifelse(grepl("^MEM", Variable), "Spatial (MEM)", "Environmental"))

p_imp <- ggplot(imp_df, aes(x = reorder(Variable, `%IncMSE`),
                             y = `%IncMSE`, fill = Type)) +
  geom_col(alpha = 0.85) +
  scale_fill_manual(values = c("Environmental" = "#2E8B57", "Spatial (MEM)" = "#4472C4")) +
  coord_flip() +
  labs(title = "",
       subtitle = "",
       x = NULL, y = "% Increase in MSE", fill = "Predictor Type") +
  theme_bw(base_size = 13)

ggsave("variable_importance_spatial.png", p_imp, width = 8, height = 5, dpi = 150)
cat("Variable importance plot saved.\n")
## Variable importance plot saved.

Partial Dependence Plots

top4_env <- imp_df %>%
  filter(Type == "Environmental") %>%
  slice_head(n = 4) %>%
  pull(Variable)

pdp_plots <- lapply(top4_env, function(var) {
  pd <- partial(rf_final, pred.var = var, train = train_data)
  autoplot(pd) +
    labs(title = var, x = var, y = "Shannon Diversity") +
    theme_bw(base_size = 11)
})

png("partial_dependence_spatial_top4.png", width = 1100, height = 900, res = 130)
do.call(grid.arrange, c(pdp_plots, ncol = 2,
                        top = "Partial Dependence — Top 4 Environmental Predictors"))
dev.off()
## png 
##   2
cat("Partial dependence plots saved.\n\n")
## Partial dependence plots saved.

Partial Dependence Plots

gator_blue   <- "#0021A5"
gator_orange <- "#0021A5"

selected_env <- c(
  "pr", "srad", "TreeCanopy_NLCD", "Cogongrass_Cover", "elevation", "avg_pop_den_1km"
)

var_labels <- c(
  Cogongrass_Cover = "Cogongrass Cover (%)",
  pr               = "Precipitation (mm)",
  TreeCanopy_NLCD  = "Tree Canopy Cover (%)",
  srad             = expression("Surface Radiation (W/m"^2*")"),
  elevation        = "Elevation (m)",
  avg_pop_den_1km  = expression("Population Density (pop/1 km"^2*")")
)
n_boot <- 100

panel_labels <- LETTERS[1:length(selected_env)]

pdp_boot_list <- lapply(seq_along(selected_env), function(i) {
  
  var <- selected_env[i]
  
  boot_res <- lapply(1:n_boot, function(j) {
    idx <- sample(1:nrow(train_data), replace = TRUE)
    train_boot <- train_data[idx, ]
    
    rf_boot <- randomForest(
      Shannon_Diversity ~ ., 
      data = train_boot,
      ntree = 500
    )
    
    pd <- partial(rf_boot, pred.var = var, train = train_boot)
    pd$iter <- j
    pd
  })
  
  pd_all <- bind_rows(boot_res)
  
  pd_summary <- pd_all %>%
    group_by(!!sym(var)) %>%
    summarise(
      y = mean(yhat),
      ymin = quantile(yhat, 0.025),
      ymax = quantile(yhat, 0.975),
      .groups = "drop"
    ) %>%
    arrange(!!sym(var)) %>%
    mutate(
      y_smooth    = loess(y ~ get(var), span = 0.6)$fitted,
      ymin_smooth = loess(ymin ~ get(var), span = 0.6)$fitted,
      ymax_smooth = loess(ymax ~ get(var), span = 0.6)$fitted
    )

  ggplot(pd_summary, aes_string(x = var)) +
    ggplot(pd_summary, aes_string(x = var)) +
  geom_ribbon(
    aes(ymin = ymin_smooth, ymax = ymax_smooth),
    fill = gator_orange,
    alpha = 0.2
  ) +
  geom_line(
    aes(y = y_smooth),
    color = gator_blue,
    size = 1.3
  ) +
  labs(
    x = var_labels[[var]],
    y = NULL   # remove individual y-axis labels for shared label
  ) +
  theme_bw(base_size = 11) +
  theme(
    panel.grid.major = element_line(color = "gray80", size = 0.3),  # faint major grids
    panel.grid.minor = element_line(color = "gray90", size = 0.2),  # even fainter minor grids
    plot.title = element_text(face = "bold", size = 12)
  ) +
  ggtitle(panel_labels[i])
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
gridExtra::grid.arrange(
  grobs = pdp_boot_list,
  ncol = 3   # better for 7 panels
)

png("partial_dependence_spatial_bootstrap.png",
    width = 1200, height = 900, res = 130)

gridExtra::grid.arrange(
  grid::textGrob("Shannon Diversity", rot = 90, gp = grid::gpar(fontsize = 14)),
  gridExtra::arrangeGrob(
    grobs = pdp_boot_list,
    ncol = 3
  ),
  ncol = 2,
  widths = c(0.05, 0.95)  # controls spacing of y-axis label
)

dev.off()
## png 
##   2

Summary Output

cat("============================================================\n")
## ============================================================
cat("  SPATIAL MODEL SUMMARY\n")
##   SPATIAL MODEL SUMMARY
cat("============================================================\n")
## ============================================================
cat(sprintf("  Moran's I (response)     : %.4f (p = %.4f)\n",
            moran_test$estimate["Moran I statistic"], moran_test$p.value))
##   Moran's I (response)     : -0.0008 (p = 0.0233)
cat(sprintf("  MEMs retained            : %d\n", length(sig_mems)))
##   MEMs retained            : 12
cat(sprintf("  MEM axes confirmed (Boruta): %d\n", length(selected_mems)))
##   MEM axes confirmed (Boruta): 5
cat(sprintf("  Environmental vars confirmed: %d\n", length(selected_env)))
##   Environmental vars confirmed: 6
cat(sprintf("  Total selected vars      : %d\n", length(selected_vars)))
##   Total selected vars      : 11
cat("------------------------------------------------------------\n")
## ------------------------------------------------------------
cat(sprintf("  Test RMSE : %.4f\n", rmse_val))
##   Test RMSE : 0.4735
cat(sprintf("  Test MAE  : %.4f\n", mae_val))
##   Test MAE  : 0.3702
cat(sprintf("  Test Bias : %.4f\n", bias_val))
##   Test Bias : -0.0506
cat(sprintf("  Relative Bias : %.2f%%\n", rel_bias))
##   Relative Bias : -2.12%
cat(sprintf("  Test R²   : %.4f\n", r2_val))
##   Test R²   : 0.1841
cat("------------------------------------------------------------\n")
## ------------------------------------------------------------
cat(sprintf("  Residual Moran's I       : %.4f (p = %.4f)\n",
            moran_resid$estimate["Moran I statistic"], moran_resid$p.value))
##   Residual Moran's I       : -0.0879 (p = 0.9554)
cat("============================================================\n")
## ============================================================
saveRDS(rf_final, "shannon_diversity_spatial_rf_model.rds")
cat("\nModel saved as 'shannon_diversity_spatial_rf_model.rds'\n")
## 
## Model saved as 'shannon_diversity_spatial_rf_model.rds'