pacman::p_load(randomForest, Boruta, caret, tidyverse, vip, pdp,
               sf, corrplot, gridExtra, spdep, adespatial, tibble,
               readxl, terra, patchwork)

EnvironmentalOutputs <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Environmental_Outputs.xlsx")

social_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Social_Outputs.xlsx")

# Merge Environmental and Social Data
EnvironmentalOutputs <- EnvironmentalOutputs %>%
  left_join(social_data, by = "Plot")

# Tree PCQ Data
tree_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Tree_PCQ")

# Veg Data
Veg_Cover <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                        sheet = "Veg_Cover")

# Shrub Cover Data
shrub_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
                         sheet = "Shrub_Cover")

# Site Data
CogonSites <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/CogonSites_FL_AL_MS.xlsx")

Extract data from tif

agb_raster <- rast("I:/Cogongrass_Project/data/TreeMap2022_CONUS_DRYBIO_L.tif")

points <- vect(EnvironmentalOutputs,
               geom = c("Longitude", "Latitude"),
               crs = "EPSG:4326")

agb_vals <- extract(agb_raster, points)

## Warning: [extract] transforming vector data to the CRS of the raster

EnvironmentalOutputs$agb <- agb_vals[,2]

Extract Soil Data

# Load the raster
r_out <- rast("I:/Cogongrass_Project/data/Soils/taxorder_CONUS_30m.tif")

# Reproject points to match raster CRS and extract soil values
points_proj <- project(points, crs(r_out))
soil_vals <- extract(r_out, points_proj)

# Add soil type to EnvironmentalOutputs
EnvironmentalOutputs$SoilType <- soil_vals$taxorder
table(EnvironmentalOutputs$SoilType)

## 
##   1   4   9  10 
##  31  31   1 143

# Convert numeric IDs to labeled factor
soil_lookup <- data.frame(
  ID = 1:11,
  SoilOrder = c("Alfisols","Andisols","Aridisols","Entisols","Gelisols",
                "Histosols","Inceptisols","Mollisols","Spodosols","Ultisols","Vertisols")
)

soil_vals$SoilType <- soil_lookup$SoilOrder[soil_vals$taxorder]

EnvironmentalOutputs$SoilType <- as.factor(soil_vals$SoilType)

Number of quadrats sampled per plot

# Count the total number of quadrats per plot
quadrat_count <- Veg_Cover %>%
  group_by(Plot) %>%
  summarize(total_quadrats = n_distinct(Quadrat), .groups = "drop")

Filter All data to only include specified species (Per PLANTS database)

#Filter tree data to only include trees with "tree" in the growth column
tree_data <- dplyr::filter(tree_data, Growth == "Tree")

#Filter Veg Cover to exclude Shrubs and Trees
Veg_Cover <- dplyr::filter(Veg_Cover, Growth != "Shrub" & Growth != "Tree")

#Filter Shrub Cover to only include Shrubs and Trees
shrub_data <- dplyr::filter(shrub_data, Growth == "Shrub" | Growth == "Tree")

Shrub Cover Conversion

# Total length of Shrub cover at a site
shrub_cover <- shrub_data %>%
  mutate(Cover = Line_End - Line_Start) %>%
  group_by(Species_Name, Plot, InvStatus) %>%
  summarise(Total_Cover = sum(Cover, na.rm = TRUE), .groups = "drop") %>%
  mutate(Percent_Cover = Total_Cover / 3000 * 100)

Herbacous Cover Conversion

# Combine Plot and Quadrat columns
Veg_Cover <- Veg_Cover %>%
  mutate(Plot_Quadrat = paste(Plot, Quadrat, sep = '_'))

# Join with CogonSites to get site information
Veg_Cover <- Veg_Cover %>%
  left_join(CogonSites, by = "Plot")

# Sum species cover across quadrats for each species at each plot
veg_cover_summed <- Veg_Cover %>%
  group_by(Plot, Species_Name, InvStatus) %>%
  summarize(total_cover = sum(Cover_Per, na.rm = TRUE), .groups = "drop")

# Calculate average herbaceous species cover
avg_species_cover <- veg_cover_summed %>%
  left_join(quadrat_count, by = "Plot") %>%
  mutate(avg_cover = total_cover / total_quadrats)

Merging Herb cover with Shrub

This species matrix includes herbaceous and shrub species

# Merge shrub cover with herbaceous average cover
combined_cover <- avg_species_cover %>%
  full_join(
    shrub_cover %>%
      dplyr::select(Plot, Species_Name, Percent_Cover, InvStatus),
    by = c("Plot", "Species_Name")
  ) %>%
  mutate(
    overlap_flag = ifelse(!is.na(avg_cover) & !is.na(Percent_Cover), TRUE, FALSE), # Flag overlaps
    final_cover = case_when(
      !is.na(avg_cover) & is.na(Percent_Cover) ~ avg_cover,  # Use herbaceous cover if no shrub data
      is.na(avg_cover) & !is.na(Percent_Cover) ~ Percent_Cover, # Use shrub cover if no herbaceous data
      TRUE ~ NA_real_ # Leave as NA where overlaps exist
    )
  )

Extract Cogongrass Cover

# Extract cogongrass cover
cogongrass_cover <- combined_cover %>%
  filter(Species_Name == "Imperata_cylindrica") %>%
  dplyr::select(Plot, final_cover) %>%
  rename(Cogongrass_Cover = final_cover)

Species Matrix

combined_cover <- combined_cover %>%
  filter(Species_Name != "Imperata_cylindrica" , Species_Name != "Imperata_cylindrica_Live") # Remove cogongrass from species matrix

## Remove any non_native species
combined_cover <- combined_cover %>%
  filter(InvStatus.x != "Non_Native") # Remove non-native species from species matrix

species_matrix <- combined_cover %>%
  dplyr::select(Plot, Species_Name, final_cover) %>%
  pivot_wider(
    names_from = Species_Name,
    values_from = final_cover,
    values_fill = 0
  )

Shannon Diversity

# Calculate Shannon diversity index for each site
shannon_diversity <- species_matrix %>%
  dplyr::select(-Plot) %>%
  vegan::diversity(index = "shannon") %>%
  as.data.frame() %>%
  setNames("Shannon_Diversity") %>%
  mutate(Plot = species_matrix$Plot)

Merge Shannon Diversity with Cogongrass Cover and Environmental Data

# Merge Shannon diversity with cogongrass cover, tree canopy cover, and environmental outputs
model_data <- shannon_diversity %>%
  left_join(cogongrass_cover, by = "Plot") %>%
  left_join(EnvironmentalOutputs, by = "Plot") %>%
  mutate(Cogongrass_Cover = ifelse(is.na(Cogongrass_Cover), 0, Cogongrass_Cover))

# Inspect to see if there are any columns with only one level
summary(model_data)

##  Shannon_Diversity     Plot           Cogongrass_Cover     Site          
##  Min.   :0.000     Length:206         Min.   : 0.00    Length:206        
##  1st Qu.:2.065     Class :character   1st Qu.: 0.00    Class :character  
##  Median :2.407     Mode  :character   Median : 0.00    Mode  :character  
##  Mean   :2.333                        Mean   :17.08                      
##  3rd Qu.:2.720                        3rd Qu.:29.82                      
##  Max.   :3.552                        Max.   :92.14                      
##                                                                          
##     BurnYear       Camera             Latitude       Longitude     
##  Min.   :2000   Length:206         Min.   :28.48   Min.   :-89.80  
##  1st Qu.:2014   Class :character   1st Qu.:29.56   1st Qu.:-88.91  
##  Median :2022   Mode  :character   Median :30.77   Median :-87.14  
##  Mean   :2018                      Mean   :30.38   Mean   :-86.72  
##  3rd Qu.:2022                      3rd Qu.:30.98   3rd Qu.:-83.58  
##  Max.   :2024                      Max.   :31.59   Max.   :-81.94  
##  NA's   :42                                                        
##     Muname          NLCD_LandCover        Region             Status         
##  Length:206         Length:206         Length:206         Length:206        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##       SoilType     Soil_Group  TreeCanopy_NLCD     aspect        elevation     
##  Alfisols : 31   Min.   : 18   Min.   : 0.00   Min.   :  0.0   Min.   : 17.00  
##  Entisols : 31   1st Qu.:377   1st Qu.:51.50   1st Qu.: 90.0   1st Qu.: 38.00  
##  Spodosols:  1   Median :381   Median :75.50   Median :180.0   Median : 59.00  
##  Ultisols :143   Mean   :330   Mean   :66.22   Mean   :171.7   Mean   : 57.68  
##                  3rd Qu.:389   3rd Qu.:86.75   3rd Qu.:270.0   3rd Qu.: 75.00  
##                  Max.   :389   Max.   :98.00   Max.   :350.6   Max.   :111.00  
##                                                                                
##       ndvi              pr            slope             sph         
##  Min.   :0.2728   Min.   :3.850   Min.   : 0.000   Min.   :0.01051  
##  1st Qu.:0.4303   1st Qu.:4.345   1st Qu.: 2.149   1st Qu.:0.01074  
##  Median :0.4624   Median :4.592   Median : 2.984   Median :0.01091  
##  Mean   :0.4503   Mean   :4.566   Mean   : 3.601   Mean   :0.01145  
##  3rd Qu.:0.4846   3rd Qu.:4.796   3rd Qu.: 4.688   3rd Qu.:0.01183  
##  Max.   :0.5357   Max.   :5.294   Max.   :18.429   Max.   :0.01325  
##                                                                     
##       srad            tmmn            agb         avg_pop_den_1km   
##  Min.   :204.7   Min.   :285.7   Min.   :  6.90   Min.   :  0.0000  
##  1st Qu.:206.8   1st Qu.:286.6   1st Qu.: 21.21   1st Qu.:  0.5956  
##  Median :207.0   Median :287.2   Median : 35.82   Median :  2.1841  
##  Mean   :210.6   Mean   :287.4   Mean   : 38.51   Mean   :  8.3667  
##  3rd Qu.:216.6   3rd Qu.:288.6   3rd Qu.: 50.89   3rd Qu.:  5.2578  
##  Max.   :224.2   Max.   :290.3   Max.   :112.06   Max.   :122.9795  
##                                  NA's   :30                         
##  road_density_1km
##  Min.   :0.000   
##  1st Qu.:1.771   
##  Median :3.103   
##  Mean   :3.037   
##  3rd Qu.:3.810   
##  Max.   :6.932   
##

Check Distribution of Response Variable

hist(model_data$Shannon_Diversity, breaks=30, main="Histogram of Shannon Diversity", xlab="Shannon Diversity")

summary(model_data$Shannon_Diversity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.065   2.407   2.333   2.720   3.552

Moran’s Eigenvector Maps

Low numbered MEMs capture broad-scale spatial patterns, while higher numbered MEMs capture finer-scale patterns.

# Extract coordinates
coords <- model_data[, c("Longitude", "Latitude")]
coords_matrix <- as.matrix(coords)

# Build a spatial neighbours list using a distance-based threshold.
# We use the minimum distance that ensures every plot has at least one neighbour.
nb <- dnearneigh(coords_matrix, d1 = 0, d2 = max(nbdists(
  dnearneigh(coords_matrix, d1 = 0, d2 = 5), coords_matrix)[[1]]) * 1.5)

# If the above produces isolates, fall back to k-nearest neighbours (k=4)
if (any(card(nb) == 0)) {
  cat("  Some plots had no neighbours — switching to k=4 nearest neighbours.\n")
  nb <- knn2nb(knearneigh(coords_matrix, k = 4))
}

# Convert neighbours to spatial weights (binary, row-standardised)
listw <- nb2listw(nb, style = "W", zero.policy = TRUE)

# Compute Moran's I on raw response to confirm spatial autocorrelation exists
moran_test <- moran.test(model_data$Shannon_Diversity, listw, zero.policy = TRUE)
cat(sprintf("  Moran's I = %.4f, p = %.4f\n",
            moran_test$estimate["Moran I statistic"],
            moran_test$p.value))

##   Moran's I = -0.0008, p = 0.0233

if (moran_test$p.value < 0.05) {
  cat("  >> Significant spatial autocorrelation detected — MEMs are warranted.\n\n")
} else {
  cat("  >> No significant spatial autocorrelation — MEMs may have limited benefit.\n\n")
}

##   >> Significant spatial autocorrelation detected — MEMs are warranted.

# Build MEMs from the spatial weights matrix
mem_out <- mem(listw)                   # all MEMs with positive eigenvalues
mem_scores <- as.data.frame(mem_out)

# Select only MEMs significantly correlated with Shannon Diversity (p < 0.05)
# This avoids adding irrelevant spatial axes and overfitting
mem_pvals <- apply(mem_scores, 2, function(mem_col) {
  cor.test(mem_col, model_data$Shannon_Diversity)$p.value
})
sig_mems <- names(mem_pvals[mem_pvals < 0.05])

if (length(sig_mems) == 0) {
  cat("  No MEMs significantly correlated with Shannon Diversity (p < 0.05).\n")
  cat("  Using top 3 MEMs by correlation strength instead.\n")
  sig_mems <- names(sort(mem_pvals))[1:min(3, length(mem_pvals))]
}

cat(sprintf("  Total MEMs with positive eigenvalues : %d\n", ncol(mem_scores)))

##   Total MEMs with positive eigenvalues : 205

cat(sprintf("  MEMs retained (p < 0.05)             : %d\n", length(sig_mems)))

##   MEMs retained (p < 0.05)             : 12

cat(sprintf("  Retained MEMs: %s\n\n", paste(sig_mems, collapse = ", ")))

##   Retained MEMs: MEM2, MEM5, MEM23, MEM65, MEM72, MEM102, MEM118, MEM147, MEM149, MEM160, MEM194, MEM203

mem_selected <- mem_scores[, sig_mems, drop = FALSE]

Model Data Preparation

set.seed(97)
# Ensure the data is correctly formatted:
# convert categorical variables to numeric
model_data$NLCD_LandCover <- as.numeric(as.factor(model_data$NLCD_LandCover))
model_data$SoilType <- as.numeric(as.factor(model_data$SoilType))
model_data$Site <- as.numeric(as.factor(model_data$Site))

str(model_data)

## 'data.frame':    206 obs. of  26 variables:
##  $ Shannon_Diversity: num  2.69 2.65 2.54 1.61 2.97 ...
##  $ Plot             : chr  "BI200" "BI201" "BI202" "BI97" ...
##  $ Cogongrass_Cover : num  22.1 10.7 39.6 21.1 2.5 ...
##  $ Site             : num  2 2 2 2 2 2 2 2 2 2 ...
##  $ BurnYear         : num  2022 2022 2022 2022 2022 ...
##  $ Camera           : chr  "NA" "S-4K" "NA" "NA" ...
##  $ Latitude         : num  30.8 30.8 30.8 30.9 30.8 ...
##  $ Longitude        : num  -86.9 -86.9 -86.8 -86.9 -86.9 ...
##  $ Muname           : chr  "TroupLoamySand_0_to_5_percent_slopes" "TroupLoamySand_0_to_5_percent_slopes" "TroupLoamySand_0_to_5_percent_slopes" "FuquayLoamySand_0_to_5_percent_slopes" ...
##  $ NLCD_LandCover   : num  4 4 4 4 4 4 4 4 4 4 ...
##  $ Region           : chr  "FA" "FA" "FA" "FA" ...
##  $ Status           : chr  "Invaded" "Invaded" "Invaded" "Invaded" ...
##  $ SoilType         : num  4 4 4 4 4 4 4 4 4 4 ...
##  $ Soil_Group       : num  381 381 381 389 381 381 381 381 381 389 ...
##  $ TreeCanopy_NLCD  : num  66 94 70 59 83 63 66 71 82 54 ...
##  $ aspect           : num  0 246.8 180 246.8 13.1 ...
##  $ elevation        : num  72 70 71 48 58 71 65 55 61 43 ...
##  $ ndvi             : num  0.46 0.475 0.461 0.468 0.466 ...
##  $ pr               : num  4.76 4.76 4.81 4.8 4.82 ...
##  $ slope            : num  0.927 2.349 1.854 2.351 4.751 ...
##  $ sph              : num  0.0109 0.0109 0.0108 0.0107 0.0107 ...
##  $ srad             : num  206 206 207 207 207 ...
##  $ tmmn             : num  287 287 287 287 287 ...
##  $ agb              : num  45 45 27.5 19.2 18.5 ...
##  $ avg_pop_den_1km  : num  0.00479 0.00305 0.57717 2.00298 1.05855 ...
##  $ road_density_1km : num  6.07 6.44 6.29 2.34 5.35 ...

env_predictors <- model_data[, c("Cogongrass_Cover", "elevation", "aspect",
                                  "slope", "pr", "sph", "srad", "tmmn",
                                  "TreeCanopy_NLCD", "NLCD_LandCover",
                                  "SoilType", "ndvi", "avg_pop_den_1km",
                                  "road_density_1km", "agb")]


# Combine environmental predictors + significant MEMs
all_predictors <- cbind(env_predictors, mem_selected)

model_df <- cbind(Shannon_Diversity = model_data$Shannon_Diversity,
                  all_predictors) %>%
  na.omit()

cat(sprintf("  Environmental predictors : %d\n", ncol(env_predictors)))

##   Environmental predictors : 15

cat(sprintf("  Spatial MEM predictors   : %d\n", ncol(mem_selected)))

##   Spatial MEM predictors   : 12

cat(sprintf("  Total predictors         : %d\n", ncol(all_predictors)))

##   Total predictors         : 27

cat(sprintf("  Observations (post-NA)   : %d\n\n", nrow(model_df)))

##   Observations (post-NA)   : 176

Correlation Check

cor_matrix <- cor(model_df, use = "complete.obs")

png("correlation_matrix.png", width = 900, height = 800, res = 120)
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.75,
         addCoef.col = "black", number.cex = 0.55,
         title = "Predictor Correlation Matrix", mar = c(0,0,1,0))
dev.off()

## png 
##   2

cat("Correlation matrix saved.\n")

## Correlation matrix saved.

# Compute correlation matrix for predictors only (exclude response)
pred_cor <- cor(model_df[, -1], use = "complete.obs")  # remove Shannon_Diversity

# Find pairs with |r| > 0.9
high_cor_pairs <- findCorrelation(pred_cor, cutoff = 0.9, names = TRUE, verbose = TRUE)

## Compare row 5  and column  8 with corr  0.91 
##   Means:  0.22 vs 0.09 so flagging column 5 
## Compare row 8  and column  6 with corr  0.913 
##   Means:  0.191 vs 0.08 so flagging column 8 
## All correlations <= 0.9

# Ensure 'pr' is always kept
if ("pr" %in% high_cor_pairs) {
  # Get all variables highly correlated with 'pr'
  pr_cor_vars <- names(which(abs(pred_cor["pr", ]) > 0.9))
  pr_cor_vars <- setdiff(pr_cor_vars, "pr")  # exclude pr itself

  # Remove these variables instead of pr
  high_cor_pairs <- setdiff(high_cor_pairs, "pr")
  high_cor_pairs <- unique(c(high_cor_pairs, pr_cor_vars))
}

# Remove highly correlated predictors
if (length(high_cor_pairs) > 0) {
  cat(sprintf("  Removing %d highly correlated predictors (|r| > 0.9) while keeping 'pr': %s\n",
              length(high_cor_pairs),
              paste(high_cor_pairs, collapse = ", ")))
  model_df <- model_df[, !(colnames(model_df) %in% high_cor_pairs)]
} else {
  cat("  No highly correlated predictors (|r| > 0.9) found — no removals necessary.\n")
}

##   Removing 1 highly correlated predictors (|r| > 0.9) while keeping 'pr': tmmn

Variable Selection- Boruta

cat("========== STEP 3: Boruta Variable Selection ==========\n")

## ========== STEP 3: Boruta Variable Selection ==========

cat("  Running Boruta (this may take a minute)...\n")

##   Running Boruta (this may take a minute)...

boruta_out <- Boruta(Shannon_Diversity ~ ., data = model_df,
                     doTrace = 1, maxRuns = 150, num.trees = 500)

## After 12 iterations, +1.3 secs:

##  confirmed 5 attributes: avg_pop_den_1km, Cogongrass_Cover, MEM203, MEM23, pr;

##  rejected 5 attributes: agb, aspect, MEM72, NLCD_LandCover, road_density_1km;

##  still have 16 attributes left.

## After 16 iterations, +1.7 secs:

##  confirmed 2 attributes: MEM118, srad;

##  rejected 2 attributes: MEM102, slope;

##  still have 12 attributes left.

## After 19 iterations, +2 secs:

##  confirmed 1 attribute: elevation;

##  rejected 1 attribute: ndvi;

##  still have 10 attributes left.

## After 29 iterations, +2.9 secs:

##  rejected 3 attributes: MEM160, MEM194, MEM65;

##  still have 7 attributes left.

## After 32 iterations, +3.1 secs:

##  confirmed 1 attribute: TreeCanopy_NLCD;

##  still have 6 attributes left.

## After 37 iterations, +3.6 secs:

##  confirmed 1 attribute: MEM5;

##  still have 5 attributes left.

## After 64 iterations, +5.5 secs:

##  confirmed 1 attribute: MEM2;

##  still have 4 attributes left.

## After 115 iterations, +9.2 secs:

##  rejected 1 attribute: SoilType;

##  still have 3 attributes left.

## After 136 iterations, +11 secs:

##  rejected 1 attribute: MEM147;

##  still have 2 attributes left.

boruta_final <- TentativeRoughFix(boruta_out)
selected_vars <- getSelectedAttributes(boruta_final, withTentative = FALSE)

# Separate selected vars into environmental and spatial
selected_env  <- selected_vars[!grepl("^MEM", selected_vars)]
selected_mems <- selected_vars[grepl("^MEM", selected_vars)]

cat(sprintf("\n  Variables confirmed       : %d\n", length(selected_vars)))

## 
##   Variables confirmed       : 11

cat(sprintf("  Environmental confirmed   : %d  (%s)\n",
            length(selected_env), paste(selected_env, collapse = ", ")))

##   Environmental confirmed   : 6  (Cogongrass_Cover, elevation, pr, srad, TreeCanopy_NLCD, avg_pop_den_1km)

cat(sprintf("  MEM predictors confirmed  : %d  (%s)\n\n",
            length(selected_mems),
            if (length(selected_mems) > 0) paste(selected_mems, collapse = ", ") else "none"))

##   MEM predictors confirmed  : 5  (MEM2, MEM5, MEM23, MEM118, MEM203)

# Boruta importance plot
png("boruta_spatial_importance.png", width = 1000, height = 600, res = 120)
par(mar = c(11, 4, 3, 1))
plot(boruta_final, las = 2, xlab = "",
     main = "")
dev.off()

## png 
##   2

Train/Test Split

model_df_sel <- model_df[, c("Shannon_Diversity", selected_vars)]
train_idx  <- createDataPartition(model_df_sel$Shannon_Diversity, p = 0.8, list = FALSE)
train_data <- model_df_sel[ train_idx, ]
test_data  <- model_df_sel[-train_idx, ]

Tune Random Forest via Cross-Validation

cat("========== STEP 4: Tuning Random Forest (10-fold CV) ==========\n")

## ========== STEP 4: Tuning Random Forest (10-fold CV) ==========

ctrl      <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
tune_grid <- expand.grid(mtry = seq(2, max(2, floor(sqrt(length(selected_vars))) + 3), by = 1))

rf_tuned <- train(Shannon_Diversity ~ .,
                  data       = train_data,
                  method     = "rf",
                  trControl  = ctrl,
                  tuneGrid   = tune_grid,
                  ntree      = 500,
                  importance = TRUE)

best_mtry <- rf_tuned$bestTune$mtry
cat(sprintf("  Best mtry: %d\n\n", best_mtry))

##   Best mtry: 6

rf_final <- randomForest(Shannon_Diversity ~ .,
                         data       = train_data,
                         ntree      = 1000,
                         mtry       = best_mtry,
                         importance = TRUE)

print(rf_final)

## 
## Call:
##  randomForest(formula = Shannon_Diversity ~ ., data = train_data,      ntree = 1000, mtry = best_mtry, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 6
## 
##           Mean of squared residuals: 0.2167753
##                     % Var explained: 27.48

Evaluate on Test Set

cat("========== STEP 5: Model Evaluation ==========\n")

## ========== STEP 5: Model Evaluation ==========

pred_test <- predict(rf_final, newdata = test_data)
bias_val <- mean(pred_test - test_data$Shannon_Diversity)
rel_bias <- bias_val / mean(test_data$Shannon_Diversity) * 100
rmse_val  <- sqrt(mean((pred_test - test_data$Shannon_Diversity)^2))
mae_val   <- mean(abs(pred_test - test_data$Shannon_Diversity))
ss_res    <- sum((pred_test - test_data$Shannon_Diversity)^2)
ss_tot    <- sum((test_data$Shannon_Diversity - mean(test_data$Shannon_Diversity))^2)
r2_val    <- 1 - ss_res / ss_tot

cat(sprintf("  RMSE : %.4f\n", rmse_val))

##   RMSE : 0.4735

cat(sprintf("  MAE  : %.4f\n", mae_val))

##   MAE  : 0.3702

cat(sprintf("  Bias : %.4f\n", bias_val))

##   Bias : -0.0506

cat(sprintf("  Relative Bias : %.2f%%\n", rel_bias))

##   Relative Bias : -2.12%

cat(sprintf("  R²   : %.4f\n\n", r2_val))

##   R²   : 0.1841

# Predicted vs observed plot
pred_obs_df <- data.frame(Observed = test_data$Shannon_Diversity,
                          Predicted = pred_test)

p_pred <- ggplot(pred_obs_df, aes(x = Observed, y = Predicted)) +
  geom_point(alpha = 0.7, colour = "#2E8B57", size = 2.5) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey30") +
  annotate("text", x = min(pred_obs_df$Observed) + 0.05,
           y = max(pred_obs_df$Predicted) - 0.1,
           label = sprintf("R² = %.3f\nRMSE = %.3f", r2_val, rmse_val),
           hjust = 0, size = 4.5) +
  labs(title = "",
       x = "Observed", y = "Predicted") +
  theme_bw(base_size = 13)

ggsave("predicted_vs_observed_spatial.png", p_pred, width = 6, height = 5, dpi = 150)

Residual Spatial Autocorrelation

If MEMs successfully absorbed spatial structure, residuals should show no significant Moran’s I. If autocorrelation remains, consider adding more MEMs or switching to a spatial RF approach.

# Get residuals for ALL observations (training + test) using out-of-bag for train
oob_pred   <- rf_final$predicted                          # OOB predictions (train set)
test_resid <- test_data$Shannon_Diversity - pred_test

# Full-dataset residuals for Moran test — predict all from final model
all_data_sel <- model_df_sel
all_preds    <- predict(rf_final, newdata = all_data_sel)
all_resid    <- all_data_sel$Shannon_Diversity - all_preds

# Match residuals back to spatial coordinates (using row indices surviving na.omit)
resid_df <- data.frame(
  resid = all_resid,
  Longitude = model_data$Longitude[as.integer(names(all_resid))],
  Latitude  = model_data$Latitude[as.integer(names(all_resid))]
)
resid_df <- resid_df[!is.na(resid_df$Longitude), ]

resid_coords <- as.matrix(resid_df[, c("Longitude", "Latitude")])
nb_resid     <- knn2nb(knearneigh(resid_coords, k = 4))

## Warning in knn2nb(knearneigh(resid_coords, k = 4)): neighbour object has 12
## sub-graphs

listw_resid  <- nb2listw(nb_resid, style = "W")

moran_resid  <- moran.test(resid_df$resid, listw_resid)
cat(sprintf("  Residual Moran's I = %.4f, p = %.4f\n",
            moran_resid$estimate["Moran I statistic"],
            moran_resid$p.value))

##   Residual Moran's I = -0.0879, p = 0.9554

if (moran_resid$p.value < 0.05) {
  cat("  >> Residual spatial autocorrelation remains. Consider:\n")
  cat("     - Lowering the MEM p-value threshold to retain more MEMs\n")
  cat("     - Adding Site or Region as a predictor\n")
  cat("     - Trying a Spatial Random Forest (SpatialML package)\n\n")
} else {
  cat("  >> No significant residual autocorrelation — MEMs successfully\n")
  cat("     absorbed spatial structure.\n\n")
}

##   >> No significant residual autocorrelation — MEMs successfully
##      absorbed spatial structure.

# Map of residuals
resid_sf <- st_as_sf(resid_df, coords = c("Longitude", "Latitude"), crs = 4326)

p_resid_map <- ggplot(resid_df, aes(x = Longitude, y = Latitude, colour = resid)) +
  geom_point(size = 3, alpha = 0.85) +
  scale_colour_gradient2(low = "#d73027", mid = "white", high = "#1a9850",
                         midpoint = 0, name = "Residual") +
  labs(title = "",
       subtitle = sprintf("Moran's I = %.3f, p = %.3f",
                          moran_resid$estimate["Moran I statistic"],
                          moran_resid$p.value)) +
  theme_bw(base_size = 12)

ggsave("residual_map.png", p_resid_map, width = 7, height = 5, dpi = 150)
cat("  Residual map saved.\n")

##   Residual map saved.

Variable Importance Plot

imp_df <- as.data.frame(importance(rf_final)) %>%
  rownames_to_column("Variable") %>%
  arrange(desc(`%IncMSE`)) %>%
  mutate(Type = ifelse(grepl("^MEM", Variable), "Spatial (MEM)", "Environmental"))

p_imp <- ggplot(imp_df, aes(x = reorder(Variable, `%IncMSE`),
                             y = `%IncMSE`, fill = Type)) +
  geom_col(alpha = 0.85) +
  scale_fill_manual(values = c("Environmental" = "#2E8B57", "Spatial (MEM)" = "#4472C4")) +
  coord_flip() +
  labs(title = "",
       subtitle = "",
       x = NULL, y = "% Increase in MSE", fill = "Predictor Type") +
  theme_bw(base_size = 13)

ggsave("variable_importance_spatial.png", p_imp, width = 8, height = 5, dpi = 150)
cat("Variable importance plot saved.\n")

## Variable importance plot saved.

Partial Dependence Plots

top4_env <- imp_df %>%
  filter(Type == "Environmental") %>%
  slice_head(n = 4) %>%
  pull(Variable)

pdp_plots <- lapply(top4_env, function(var) {
  pd <- partial(rf_final, pred.var = var, train = train_data)
  autoplot(pd) +
    labs(title = var, x = var, y = "Shannon Diversity") +
    theme_bw(base_size = 11)
})

png("partial_dependence_spatial_top4.png", width = 1100, height = 900, res = 130)
do.call(grid.arrange, c(pdp_plots, ncol = 2,
                        top = "Partial Dependence — Top 4 Environmental Predictors"))
dev.off()

## png 
##   2

cat("Partial dependence plots saved.\n\n")

## Partial dependence plots saved.

Partial Dependence Plots

gator_blue   <- "#0021A5"
gator_orange <- "#0021A5"

selected_env <- c(
  "pr", "srad", "TreeCanopy_NLCD", "Cogongrass_Cover", "elevation", "avg_pop_den_1km"
)

var_labels <- c(
  Cogongrass_Cover = "Cogongrass Cover (%)",
  pr               = "Precipitation (mm)",
  TreeCanopy_NLCD  = "Tree Canopy Cover (%)",
  srad             = expression("Surface Radiation (W/m"^2*")"),
  elevation        = "Elevation (m)",
  avg_pop_den_1km  = expression("Population Density (pop/1 km"^2*")")
)
n_boot <- 100

panel_labels <- LETTERS[1:length(selected_env)]

pdp_boot_list <- lapply(seq_along(selected_env), function(i) {
  
  var <- selected_env[i]
  
  boot_res <- lapply(1:n_boot, function(j) {
    idx <- sample(1:nrow(train_data), replace = TRUE)
    train_boot <- train_data[idx, ]
    
    rf_boot <- randomForest(
      Shannon_Diversity ~ ., 
      data = train_boot,
      ntree = 500
    )
    
    pd <- partial(rf_boot, pred.var = var, train = train_boot)
    pd$iter <- j
    pd
  })
  
  pd_all <- bind_rows(boot_res)
  
  pd_summary <- pd_all %>%
    group_by(!!sym(var)) %>%
    summarise(
      y = mean(yhat),
      ymin = quantile(yhat, 0.025),
      ymax = quantile(yhat, 0.975),
      .groups = "drop"
    ) %>%
    arrange(!!sym(var)) %>%
    mutate(
      y_smooth    = loess(y ~ get(var), span = 0.6)$fitted,
      ymin_smooth = loess(ymin ~ get(var), span = 0.6)$fitted,
      ymax_smooth = loess(ymax ~ get(var), span = 0.6)$fitted
    )

  ggplot(pd_summary, aes_string(x = var)) +
    ggplot(pd_summary, aes_string(x = var)) +
  geom_ribbon(
    aes(ymin = ymin_smooth, ymax = ymax_smooth),
    fill = gator_orange,
    alpha = 0.2
  ) +
  geom_line(
    aes(y = y_smooth),
    color = gator_blue,
    size = 1.3
  ) +
  labs(
    x = var_labels[[var]],
    y = NULL   # remove individual y-axis labels for shared label
  ) +
  theme_bw(base_size = 11) +
  theme(
    panel.grid.major = element_line(color = "gray80", size = 0.3),  # faint major grids
    panel.grid.minor = element_line(color = "gray90", size = 0.2),  # even fainter minor grids
    plot.title = element_text(face = "bold", size = 12)
  ) +
  ggtitle(panel_labels[i])
})

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

gridExtra::grid.arrange(
  grobs = pdp_boot_list,
  ncol = 3   # better for 7 panels
)

png("partial_dependence_spatial_bootstrap.png",
    width = 1200, height = 900, res = 130)

gridExtra::grid.arrange(
  grid::textGrob("Shannon Diversity", rot = 90, gp = grid::gpar(fontsize = 14)),
  gridExtra::arrangeGrob(
    grobs = pdp_boot_list,
    ncol = 3
  ),
  ncol = 2,
  widths = c(0.05, 0.95)  # controls spacing of y-axis label
)

dev.off()

## png 
##   2

Summary Output

cat("============================================================\n")

## ============================================================

cat("  SPATIAL MODEL SUMMARY\n")

##   SPATIAL MODEL SUMMARY

cat("============================================================\n")

## ============================================================

cat(sprintf("  Moran's I (response)     : %.4f (p = %.4f)\n",
            moran_test$estimate["Moran I statistic"], moran_test$p.value))

##   Moran's I (response)     : -0.0008 (p = 0.0233)

cat(sprintf("  MEMs retained            : %d\n", length(sig_mems)))

##   MEMs retained            : 12

cat(sprintf("  MEM axes confirmed (Boruta): %d\n", length(selected_mems)))

##   MEM axes confirmed (Boruta): 5

cat(sprintf("  Environmental vars confirmed: %d\n", length(selected_env)))

##   Environmental vars confirmed: 6

cat(sprintf("  Total selected vars      : %d\n", length(selected_vars)))

##   Total selected vars      : 11

cat("------------------------------------------------------------\n")

## ------------------------------------------------------------

cat(sprintf("  Test RMSE : %.4f\n", rmse_val))

##   Test RMSE : 0.4735

cat(sprintf("  Test MAE  : %.4f\n", mae_val))

##   Test MAE  : 0.3702

cat(sprintf("  Test Bias : %.4f\n", bias_val))

##   Test Bias : -0.0506

cat(sprintf("  Relative Bias : %.2f%%\n", rel_bias))

##   Relative Bias : -2.12%

cat(sprintf("  Test R²   : %.4f\n", r2_val))

##   Test R²   : 0.1841

cat("------------------------------------------------------------\n")

## ------------------------------------------------------------

cat(sprintf("  Residual Moran's I       : %.4f (p = %.4f)\n",
            moran_resid$estimate["Moran I statistic"], moran_resid$p.value))

##   Residual Moran's I       : -0.0879 (p = 0.9554)

cat("============================================================\n")

## ============================================================

saveRDS(rf_final, "shannon_diversity_spatial_rf_model.rds")
cat("\nModel saved as 'shannon_diversity_spatial_rf_model.rds'\n")

## 
## Model saved as 'shannon_diversity_spatial_rf_model.rds'

Reworked Biodiversity

Alan Ivory

2026-04-01

Extract data from tif

Extract Soil Data

Number of quadrats sampled per plot

Filter All data to only include specified species (Per PLANTS database)

Shrub Cover Conversion

Herbacous Cover Conversion

Merging Herb cover with Shrub

Extract Cogongrass Cover

Species Matrix

Shannon Diversity

Merge Shannon Diversity with Cogongrass Cover and Environmental Data

Check Distribution of Response Variable

Moran’s Eigenvector Maps

Model Data Preparation

Correlation Check

Variable Selection- Boruta

Train/Test Split

Tune Random Forest via Cross-Validation

Evaluate on Test Set

Residual Spatial Autocorrelation

Variable Importance Plot

Partial Dependence Plots

Partial Dependence Plots

Summary Output