pacman::p_load(randomForest, Boruta, caret, tidyverse, vip, pdp,
sf, corrplot, gridExtra, spdep, adespatial, tibble,
readxl, terra, patchwork)
EnvironmentalOutputs <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Environmental_Outputs.xlsx")
social_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/03_Biodiversity/06_Processing/Social_Outputs.xlsx")
# Merge Environmental and Social Data
EnvironmentalOutputs <- EnvironmentalOutputs %>%
left_join(social_data, by = "Plot")
# Tree PCQ Data
tree_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
sheet = "Tree_PCQ")
# Veg Data
Veg_Cover <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
sheet = "Veg_Cover")
# Shrub Cover Data
shrub_data <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/Field_Data_FL_AL_MS.xlsx",
sheet = "Shrub_Cover")
# Site Data
CogonSites <- read_excel("C:/Users/DrewIvory/OneDrive - University of Florida/Desktop/School/PHD/01_Projects/05_SharedData/CogonSites_FL_AL_MS.xlsx")
agb_raster <- rast("I:/Cogongrass_Project/data/TreeMap2022_CONUS_DRYBIO_L.tif")
points <- vect(EnvironmentalOutputs,
geom = c("Longitude", "Latitude"),
crs = "EPSG:4326")
agb_vals <- extract(agb_raster, points)
## Warning: [extract] transforming vector data to the CRS of the raster
EnvironmentalOutputs$agb <- agb_vals[,2]
# Load the raster
r_out <- rast("I:/Cogongrass_Project/data/Soils/taxorder_CONUS_30m.tif")
# Reproject points to match raster CRS and extract soil values
points_proj <- project(points, crs(r_out))
soil_vals <- extract(r_out, points_proj)
# Add soil type to EnvironmentalOutputs
EnvironmentalOutputs$SoilType <- soil_vals$taxorder
table(EnvironmentalOutputs$SoilType)
##
## 1 4 9 10
## 31 31 1 143
# Convert numeric IDs to labeled factor
soil_lookup <- data.frame(
ID = 1:11,
SoilOrder = c("Alfisols","Andisols","Aridisols","Entisols","Gelisols",
"Histosols","Inceptisols","Mollisols","Spodosols","Ultisols","Vertisols")
)
soil_vals$SoilType <- soil_lookup$SoilOrder[soil_vals$taxorder]
EnvironmentalOutputs$SoilType <- as.factor(soil_vals$SoilType)
# Count the total number of quadrats per plot
quadrat_count <- Veg_Cover %>%
group_by(Plot) %>%
summarize(total_quadrats = n_distinct(Quadrat), .groups = "drop")
#Filter tree data to only include trees with "tree" in the growth column
tree_data <- dplyr::filter(tree_data, Growth == "Tree")
#Filter Veg Cover to exclude Shrubs and Trees
Veg_Cover <- dplyr::filter(Veg_Cover, Growth != "Shrub" & Growth != "Tree")
#Filter Shrub Cover to only include Shrubs and Trees
shrub_data <- dplyr::filter(shrub_data, Growth == "Shrub" | Growth == "Tree")
# Total length of Shrub cover at a site
shrub_cover <- shrub_data %>%
mutate(Cover = Line_End - Line_Start) %>%
group_by(Species_Name, Plot, InvStatus) %>%
summarise(Total_Cover = sum(Cover, na.rm = TRUE), .groups = "drop") %>%
mutate(Percent_Cover = Total_Cover / 3000 * 100)
# Combine Plot and Quadrat columns
Veg_Cover <- Veg_Cover %>%
mutate(Plot_Quadrat = paste(Plot, Quadrat, sep = '_'))
# Join with CogonSites to get site information
Veg_Cover <- Veg_Cover %>%
left_join(CogonSites, by = "Plot")
# Sum species cover across quadrats for each species at each plot
veg_cover_summed <- Veg_Cover %>%
group_by(Plot, Species_Name, InvStatus) %>%
summarize(total_cover = sum(Cover_Per, na.rm = TRUE), .groups = "drop")
# Calculate average herbaceous species cover
avg_species_cover <- veg_cover_summed %>%
left_join(quadrat_count, by = "Plot") %>%
mutate(avg_cover = total_cover / total_quadrats)
This species matrix includes herbaceous and shrub species
# Merge shrub cover with herbaceous average cover
combined_cover <- avg_species_cover %>%
full_join(
shrub_cover %>%
dplyr::select(Plot, Species_Name, Percent_Cover, InvStatus),
by = c("Plot", "Species_Name")
) %>%
mutate(
overlap_flag = ifelse(!is.na(avg_cover) & !is.na(Percent_Cover), TRUE, FALSE), # Flag overlaps
final_cover = case_when(
!is.na(avg_cover) & is.na(Percent_Cover) ~ avg_cover, # Use herbaceous cover if no shrub data
is.na(avg_cover) & !is.na(Percent_Cover) ~ Percent_Cover, # Use shrub cover if no herbaceous data
TRUE ~ NA_real_ # Leave as NA where overlaps exist
)
)
# Extract cogongrass cover
cogongrass_cover <- combined_cover %>%
filter(Species_Name == "Imperata_cylindrica") %>%
dplyr::select(Plot, final_cover) %>%
rename(Cogongrass_Cover = final_cover)
combined_cover <- combined_cover %>%
filter(Species_Name != "Imperata_cylindrica" , Species_Name != "Imperata_cylindrica_Live") # Remove cogongrass from species matrix
## Remove any non_native species
combined_cover <- combined_cover %>%
filter(InvStatus.x != "Non_Native") # Remove non-native species from species matrix
species_matrix <- combined_cover %>%
dplyr::select(Plot, Species_Name, final_cover) %>%
pivot_wider(
names_from = Species_Name,
values_from = final_cover,
values_fill = 0
)
# Calculate Shannon diversity index for each site
shannon_diversity <- species_matrix %>%
dplyr::select(-Plot) %>%
vegan::diversity(index = "shannon") %>%
as.data.frame() %>%
setNames("Shannon_Diversity") %>%
mutate(Plot = species_matrix$Plot)
# Merge Shannon diversity with cogongrass cover, tree canopy cover, and environmental outputs
model_data <- shannon_diversity %>%
left_join(cogongrass_cover, by = "Plot") %>%
left_join(EnvironmentalOutputs, by = "Plot") %>%
mutate(Cogongrass_Cover = ifelse(is.na(Cogongrass_Cover), 0, Cogongrass_Cover))
# Inspect to see if there are any columns with only one level
summary(model_data)
## Shannon_Diversity Plot Cogongrass_Cover Site
## Min. :0.000 Length:206 Min. : 0.00 Length:206
## 1st Qu.:2.065 Class :character 1st Qu.: 0.00 Class :character
## Median :2.407 Mode :character Median : 0.00 Mode :character
## Mean :2.333 Mean :17.08
## 3rd Qu.:2.720 3rd Qu.:29.82
## Max. :3.552 Max. :92.14
##
## BurnYear Camera Latitude Longitude
## Min. :2000 Length:206 Min. :28.48 Min. :-89.80
## 1st Qu.:2014 Class :character 1st Qu.:29.56 1st Qu.:-88.91
## Median :2022 Mode :character Median :30.77 Median :-87.14
## Mean :2018 Mean :30.38 Mean :-86.72
## 3rd Qu.:2022 3rd Qu.:30.98 3rd Qu.:-83.58
## Max. :2024 Max. :31.59 Max. :-81.94
## NA's :42
## Muname NLCD_LandCover Region Status
## Length:206 Length:206 Length:206 Length:206
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## SoilType Soil_Group TreeCanopy_NLCD aspect elevation
## Alfisols : 31 Min. : 18 Min. : 0.00 Min. : 0.0 Min. : 17.00
## Entisols : 31 1st Qu.:377 1st Qu.:51.50 1st Qu.: 90.0 1st Qu.: 38.00
## Spodosols: 1 Median :381 Median :75.50 Median :180.0 Median : 59.00
## Ultisols :143 Mean :330 Mean :66.22 Mean :171.7 Mean : 57.68
## 3rd Qu.:389 3rd Qu.:86.75 3rd Qu.:270.0 3rd Qu.: 75.00
## Max. :389 Max. :98.00 Max. :350.6 Max. :111.00
##
## ndvi pr slope sph
## Min. :0.2728 Min. :3.850 Min. : 0.000 Min. :0.01051
## 1st Qu.:0.4303 1st Qu.:4.345 1st Qu.: 2.149 1st Qu.:0.01074
## Median :0.4624 Median :4.592 Median : 2.984 Median :0.01091
## Mean :0.4503 Mean :4.566 Mean : 3.601 Mean :0.01145
## 3rd Qu.:0.4846 3rd Qu.:4.796 3rd Qu.: 4.688 3rd Qu.:0.01183
## Max. :0.5357 Max. :5.294 Max. :18.429 Max. :0.01325
##
## srad tmmn agb avg_pop_den_1km
## Min. :204.7 Min. :285.7 Min. : 6.90 Min. : 0.0000
## 1st Qu.:206.8 1st Qu.:286.6 1st Qu.: 21.21 1st Qu.: 0.5956
## Median :207.0 Median :287.2 Median : 35.82 Median : 2.1841
## Mean :210.6 Mean :287.4 Mean : 38.51 Mean : 8.3667
## 3rd Qu.:216.6 3rd Qu.:288.6 3rd Qu.: 50.89 3rd Qu.: 5.2578
## Max. :224.2 Max. :290.3 Max. :112.06 Max. :122.9795
## NA's :30
## road_density_1km
## Min. :0.000
## 1st Qu.:1.771
## Median :3.103
## Mean :3.037
## 3rd Qu.:3.810
## Max. :6.932
##
hist(model_data$Shannon_Diversity, breaks=30, main="Histogram of Shannon Diversity", xlab="Shannon Diversity")
summary(model_data$Shannon_Diversity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.065 2.407 2.333 2.720 3.552
# Extract coordinates
coords <- model_data[, c("Longitude", "Latitude")]
coords_matrix <- as.matrix(coords)
# Build a spatial neighbours list using a distance-based threshold.
# We use the minimum distance that ensures every plot has at least one neighbour.
nb <- dnearneigh(coords_matrix, d1 = 0, d2 = max(nbdists(
dnearneigh(coords_matrix, d1 = 0, d2 = 5), coords_matrix)[[1]]) * 1.5)
# If the above produces isolates, fall back to k-nearest neighbours (k=4)
if (any(card(nb) == 0)) {
cat(" Some plots had no neighbours — switching to k=4 nearest neighbours.\n")
nb <- knn2nb(knearneigh(coords_matrix, k = 4))
}
# Convert neighbours to spatial weights (binary, row-standardised)
listw <- nb2listw(nb, style = "W", zero.policy = TRUE)
# Compute Moran's I on raw response to confirm spatial autocorrelation exists
moran_test <- moran.test(model_data$Shannon_Diversity, listw, zero.policy = TRUE)
cat(sprintf(" Moran's I = %.4f, p = %.4f\n",
moran_test$estimate["Moran I statistic"],
moran_test$p.value))
## Moran's I = -0.0008, p = 0.0233
if (moran_test$p.value < 0.05) {
cat(" >> Significant spatial autocorrelation detected — MEMs are warranted.\n\n")
} else {
cat(" >> No significant spatial autocorrelation — MEMs may have limited benefit.\n\n")
}
## >> Significant spatial autocorrelation detected — MEMs are warranted.
# Build MEMs from the spatial weights matrix
mem_out <- mem(listw) # all MEMs with positive eigenvalues
mem_scores <- as.data.frame(mem_out)
# Select only MEMs significantly correlated with Shannon Diversity (p < 0.05)
# This avoids adding irrelevant spatial axes and overfitting
mem_pvals <- apply(mem_scores, 2, function(mem_col) {
cor.test(mem_col, model_data$Shannon_Diversity)$p.value
})
sig_mems <- names(mem_pvals[mem_pvals < 0.05])
if (length(sig_mems) == 0) {
cat(" No MEMs significantly correlated with Shannon Diversity (p < 0.05).\n")
cat(" Using top 3 MEMs by correlation strength instead.\n")
sig_mems <- names(sort(mem_pvals))[1:min(3, length(mem_pvals))]
}
cat(sprintf(" Total MEMs with positive eigenvalues : %d\n", ncol(mem_scores)))
## Total MEMs with positive eigenvalues : 205
cat(sprintf(" MEMs retained (p < 0.05) : %d\n", length(sig_mems)))
## MEMs retained (p < 0.05) : 12
cat(sprintf(" Retained MEMs: %s\n\n", paste(sig_mems, collapse = ", ")))
## Retained MEMs: MEM2, MEM5, MEM23, MEM65, MEM72, MEM102, MEM118, MEM147, MEM149, MEM160, MEM194, MEM203
mem_selected <- mem_scores[, sig_mems, drop = FALSE]
set.seed(97)
# Ensure the data is correctly formatted:
# convert categorical variables to numeric
model_data$NLCD_LandCover <- as.numeric(as.factor(model_data$NLCD_LandCover))
model_data$SoilType <- as.numeric(as.factor(model_data$SoilType))
model_data$Site <- as.numeric(as.factor(model_data$Site))
str(model_data)
## 'data.frame': 206 obs. of 26 variables:
## $ Shannon_Diversity: num 2.69 2.65 2.54 1.61 2.97 ...
## $ Plot : chr "BI200" "BI201" "BI202" "BI97" ...
## $ Cogongrass_Cover : num 22.1 10.7 39.6 21.1 2.5 ...
## $ Site : num 2 2 2 2 2 2 2 2 2 2 ...
## $ BurnYear : num 2022 2022 2022 2022 2022 ...
## $ Camera : chr "NA" "S-4K" "NA" "NA" ...
## $ Latitude : num 30.8 30.8 30.8 30.9 30.8 ...
## $ Longitude : num -86.9 -86.9 -86.8 -86.9 -86.9 ...
## $ Muname : chr "TroupLoamySand_0_to_5_percent_slopes" "TroupLoamySand_0_to_5_percent_slopes" "TroupLoamySand_0_to_5_percent_slopes" "FuquayLoamySand_0_to_5_percent_slopes" ...
## $ NLCD_LandCover : num 4 4 4 4 4 4 4 4 4 4 ...
## $ Region : chr "FA" "FA" "FA" "FA" ...
## $ Status : chr "Invaded" "Invaded" "Invaded" "Invaded" ...
## $ SoilType : num 4 4 4 4 4 4 4 4 4 4 ...
## $ Soil_Group : num 381 381 381 389 381 381 381 381 381 389 ...
## $ TreeCanopy_NLCD : num 66 94 70 59 83 63 66 71 82 54 ...
## $ aspect : num 0 246.8 180 246.8 13.1 ...
## $ elevation : num 72 70 71 48 58 71 65 55 61 43 ...
## $ ndvi : num 0.46 0.475 0.461 0.468 0.466 ...
## $ pr : num 4.76 4.76 4.81 4.8 4.82 ...
## $ slope : num 0.927 2.349 1.854 2.351 4.751 ...
## $ sph : num 0.0109 0.0109 0.0108 0.0107 0.0107 ...
## $ srad : num 206 206 207 207 207 ...
## $ tmmn : num 287 287 287 287 287 ...
## $ agb : num 45 45 27.5 19.2 18.5 ...
## $ avg_pop_den_1km : num 0.00479 0.00305 0.57717 2.00298 1.05855 ...
## $ road_density_1km : num 6.07 6.44 6.29 2.34 5.35 ...
env_predictors <- model_data[, c("Cogongrass_Cover", "elevation", "aspect",
"slope", "pr", "sph", "srad", "tmmn",
"TreeCanopy_NLCD", "NLCD_LandCover",
"SoilType", "ndvi", "avg_pop_den_1km",
"road_density_1km", "agb")]
# Combine environmental predictors + significant MEMs
all_predictors <- cbind(env_predictors, mem_selected)
model_df <- cbind(Shannon_Diversity = model_data$Shannon_Diversity,
all_predictors) %>%
na.omit()
cat(sprintf(" Environmental predictors : %d\n", ncol(env_predictors)))
## Environmental predictors : 15
cat(sprintf(" Spatial MEM predictors : %d\n", ncol(mem_selected)))
## Spatial MEM predictors : 12
cat(sprintf(" Total predictors : %d\n", ncol(all_predictors)))
## Total predictors : 27
cat(sprintf(" Observations (post-NA) : %d\n\n", nrow(model_df)))
## Observations (post-NA) : 176
cor_matrix <- cor(model_df, use = "complete.obs")
png("correlation_matrix.png", width = 900, height = 800, res = 120)
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.75,
addCoef.col = "black", number.cex = 0.55,
title = "Predictor Correlation Matrix", mar = c(0,0,1,0))
dev.off()
## png
## 2
cat("Correlation matrix saved.\n")
## Correlation matrix saved.
# Compute correlation matrix for predictors only (exclude response)
pred_cor <- cor(model_df[, -1], use = "complete.obs") # remove Shannon_Diversity
# Find pairs with |r| > 0.9
high_cor_pairs <- findCorrelation(pred_cor, cutoff = 0.9, names = TRUE, verbose = TRUE)
## Compare row 5 and column 8 with corr 0.91
## Means: 0.22 vs 0.09 so flagging column 5
## Compare row 8 and column 6 with corr 0.913
## Means: 0.191 vs 0.08 so flagging column 8
## All correlations <= 0.9
# Ensure 'pr' is always kept
if ("pr" %in% high_cor_pairs) {
# Get all variables highly correlated with 'pr'
pr_cor_vars <- names(which(abs(pred_cor["pr", ]) > 0.9))
pr_cor_vars <- setdiff(pr_cor_vars, "pr") # exclude pr itself
# Remove these variables instead of pr
high_cor_pairs <- setdiff(high_cor_pairs, "pr")
high_cor_pairs <- unique(c(high_cor_pairs, pr_cor_vars))
}
# Remove highly correlated predictors
if (length(high_cor_pairs) > 0) {
cat(sprintf(" Removing %d highly correlated predictors (|r| > 0.9) while keeping 'pr': %s\n",
length(high_cor_pairs),
paste(high_cor_pairs, collapse = ", ")))
model_df <- model_df[, !(colnames(model_df) %in% high_cor_pairs)]
} else {
cat(" No highly correlated predictors (|r| > 0.9) found — no removals necessary.\n")
}
## Removing 1 highly correlated predictors (|r| > 0.9) while keeping 'pr': tmmn
cat("========== STEP 3: Boruta Variable Selection ==========\n")
## ========== STEP 3: Boruta Variable Selection ==========
cat(" Running Boruta (this may take a minute)...\n")
## Running Boruta (this may take a minute)...
boruta_out <- Boruta(Shannon_Diversity ~ ., data = model_df,
doTrace = 1, maxRuns = 150, num.trees = 500)
## After 12 iterations, +1.3 secs:
## confirmed 5 attributes: avg_pop_den_1km, Cogongrass_Cover, MEM203, MEM23, pr;
## rejected 5 attributes: agb, aspect, MEM72, NLCD_LandCover, road_density_1km;
## still have 16 attributes left.
## After 16 iterations, +1.7 secs:
## confirmed 2 attributes: MEM118, srad;
## rejected 2 attributes: MEM102, slope;
## still have 12 attributes left.
## After 19 iterations, +2 secs:
## confirmed 1 attribute: elevation;
## rejected 1 attribute: ndvi;
## still have 10 attributes left.
## After 29 iterations, +2.9 secs:
## rejected 3 attributes: MEM160, MEM194, MEM65;
## still have 7 attributes left.
## After 32 iterations, +3.1 secs:
## confirmed 1 attribute: TreeCanopy_NLCD;
## still have 6 attributes left.
## After 37 iterations, +3.6 secs:
## confirmed 1 attribute: MEM5;
## still have 5 attributes left.
## After 64 iterations, +5.5 secs:
## confirmed 1 attribute: MEM2;
## still have 4 attributes left.
## After 115 iterations, +9.2 secs:
## rejected 1 attribute: SoilType;
## still have 3 attributes left.
## After 136 iterations, +11 secs:
## rejected 1 attribute: MEM147;
## still have 2 attributes left.
boruta_final <- TentativeRoughFix(boruta_out)
selected_vars <- getSelectedAttributes(boruta_final, withTentative = FALSE)
# Separate selected vars into environmental and spatial
selected_env <- selected_vars[!grepl("^MEM", selected_vars)]
selected_mems <- selected_vars[grepl("^MEM", selected_vars)]
cat(sprintf("\n Variables confirmed : %d\n", length(selected_vars)))
##
## Variables confirmed : 11
cat(sprintf(" Environmental confirmed : %d (%s)\n",
length(selected_env), paste(selected_env, collapse = ", ")))
## Environmental confirmed : 6 (Cogongrass_Cover, elevation, pr, srad, TreeCanopy_NLCD, avg_pop_den_1km)
cat(sprintf(" MEM predictors confirmed : %d (%s)\n\n",
length(selected_mems),
if (length(selected_mems) > 0) paste(selected_mems, collapse = ", ") else "none"))
## MEM predictors confirmed : 5 (MEM2, MEM5, MEM23, MEM118, MEM203)
# Boruta importance plot
png("boruta_spatial_importance.png", width = 1000, height = 600, res = 120)
par(mar = c(11, 4, 3, 1))
plot(boruta_final, las = 2, xlab = "",
main = "")
dev.off()
## png
## 2
model_df_sel <- model_df[, c("Shannon_Diversity", selected_vars)]
train_idx <- createDataPartition(model_df_sel$Shannon_Diversity, p = 0.8, list = FALSE)
train_data <- model_df_sel[ train_idx, ]
test_data <- model_df_sel[-train_idx, ]
cat("========== STEP 4: Tuning Random Forest (10-fold CV) ==========\n")
## ========== STEP 4: Tuning Random Forest (10-fold CV) ==========
ctrl <- trainControl(method = "cv", number = 10, verboseIter = FALSE)
tune_grid <- expand.grid(mtry = seq(2, max(2, floor(sqrt(length(selected_vars))) + 3), by = 1))
rf_tuned <- train(Shannon_Diversity ~ .,
data = train_data,
method = "rf",
trControl = ctrl,
tuneGrid = tune_grid,
ntree = 500,
importance = TRUE)
best_mtry <- rf_tuned$bestTune$mtry
cat(sprintf(" Best mtry: %d\n\n", best_mtry))
## Best mtry: 6
rf_final <- randomForest(Shannon_Diversity ~ .,
data = train_data,
ntree = 1000,
mtry = best_mtry,
importance = TRUE)
print(rf_final)
##
## Call:
## randomForest(formula = Shannon_Diversity ~ ., data = train_data, ntree = 1000, mtry = best_mtry, importance = TRUE)
## Type of random forest: regression
## Number of trees: 1000
## No. of variables tried at each split: 6
##
## Mean of squared residuals: 0.2167753
## % Var explained: 27.48
cat("========== STEP 5: Model Evaluation ==========\n")
## ========== STEP 5: Model Evaluation ==========
pred_test <- predict(rf_final, newdata = test_data)
bias_val <- mean(pred_test - test_data$Shannon_Diversity)
rel_bias <- bias_val / mean(test_data$Shannon_Diversity) * 100
rmse_val <- sqrt(mean((pred_test - test_data$Shannon_Diversity)^2))
mae_val <- mean(abs(pred_test - test_data$Shannon_Diversity))
ss_res <- sum((pred_test - test_data$Shannon_Diversity)^2)
ss_tot <- sum((test_data$Shannon_Diversity - mean(test_data$Shannon_Diversity))^2)
r2_val <- 1 - ss_res / ss_tot
cat(sprintf(" RMSE : %.4f\n", rmse_val))
## RMSE : 0.4735
cat(sprintf(" MAE : %.4f\n", mae_val))
## MAE : 0.3702
cat(sprintf(" Bias : %.4f\n", bias_val))
## Bias : -0.0506
cat(sprintf(" Relative Bias : %.2f%%\n", rel_bias))
## Relative Bias : -2.12%
cat(sprintf(" R² : %.4f\n\n", r2_val))
## R² : 0.1841
# Predicted vs observed plot
pred_obs_df <- data.frame(Observed = test_data$Shannon_Diversity,
Predicted = pred_test)
p_pred <- ggplot(pred_obs_df, aes(x = Observed, y = Predicted)) +
geom_point(alpha = 0.7, colour = "#2E8B57", size = 2.5) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey30") +
annotate("text", x = min(pred_obs_df$Observed) + 0.05,
y = max(pred_obs_df$Predicted) - 0.1,
label = sprintf("R² = %.3f\nRMSE = %.3f", r2_val, rmse_val),
hjust = 0, size = 4.5) +
labs(title = "",
x = "Observed", y = "Predicted") +
theme_bw(base_size = 13)
ggsave("predicted_vs_observed_spatial.png", p_pred, width = 6, height = 5, dpi = 150)
# Get residuals for ALL observations (training + test) using out-of-bag for train
oob_pred <- rf_final$predicted # OOB predictions (train set)
test_resid <- test_data$Shannon_Diversity - pred_test
# Full-dataset residuals for Moran test — predict all from final model
all_data_sel <- model_df_sel
all_preds <- predict(rf_final, newdata = all_data_sel)
all_resid <- all_data_sel$Shannon_Diversity - all_preds
# Match residuals back to spatial coordinates (using row indices surviving na.omit)
resid_df <- data.frame(
resid = all_resid,
Longitude = model_data$Longitude[as.integer(names(all_resid))],
Latitude = model_data$Latitude[as.integer(names(all_resid))]
)
resid_df <- resid_df[!is.na(resid_df$Longitude), ]
resid_coords <- as.matrix(resid_df[, c("Longitude", "Latitude")])
nb_resid <- knn2nb(knearneigh(resid_coords, k = 4))
## Warning in knn2nb(knearneigh(resid_coords, k = 4)): neighbour object has 12
## sub-graphs
listw_resid <- nb2listw(nb_resid, style = "W")
moran_resid <- moran.test(resid_df$resid, listw_resid)
cat(sprintf(" Residual Moran's I = %.4f, p = %.4f\n",
moran_resid$estimate["Moran I statistic"],
moran_resid$p.value))
## Residual Moran's I = -0.0879, p = 0.9554
if (moran_resid$p.value < 0.05) {
cat(" >> Residual spatial autocorrelation remains. Consider:\n")
cat(" - Lowering the MEM p-value threshold to retain more MEMs\n")
cat(" - Adding Site or Region as a predictor\n")
cat(" - Trying a Spatial Random Forest (SpatialML package)\n\n")
} else {
cat(" >> No significant residual autocorrelation — MEMs successfully\n")
cat(" absorbed spatial structure.\n\n")
}
## >> No significant residual autocorrelation — MEMs successfully
## absorbed spatial structure.
# Map of residuals
resid_sf <- st_as_sf(resid_df, coords = c("Longitude", "Latitude"), crs = 4326)
p_resid_map <- ggplot(resid_df, aes(x = Longitude, y = Latitude, colour = resid)) +
geom_point(size = 3, alpha = 0.85) +
scale_colour_gradient2(low = "#d73027", mid = "white", high = "#1a9850",
midpoint = 0, name = "Residual") +
labs(title = "",
subtitle = sprintf("Moran's I = %.3f, p = %.3f",
moran_resid$estimate["Moran I statistic"],
moran_resid$p.value)) +
theme_bw(base_size = 12)
ggsave("residual_map.png", p_resid_map, width = 7, height = 5, dpi = 150)
cat(" Residual map saved.\n")
## Residual map saved.
imp_df <- as.data.frame(importance(rf_final)) %>%
rownames_to_column("Variable") %>%
arrange(desc(`%IncMSE`)) %>%
mutate(Type = ifelse(grepl("^MEM", Variable), "Spatial (MEM)", "Environmental"))
p_imp <- ggplot(imp_df, aes(x = reorder(Variable, `%IncMSE`),
y = `%IncMSE`, fill = Type)) +
geom_col(alpha = 0.85) +
scale_fill_manual(values = c("Environmental" = "#2E8B57", "Spatial (MEM)" = "#4472C4")) +
coord_flip() +
labs(title = "",
subtitle = "",
x = NULL, y = "% Increase in MSE", fill = "Predictor Type") +
theme_bw(base_size = 13)
ggsave("variable_importance_spatial.png", p_imp, width = 8, height = 5, dpi = 150)
cat("Variable importance plot saved.\n")
## Variable importance plot saved.
top4_env <- imp_df %>%
filter(Type == "Environmental") %>%
slice_head(n = 4) %>%
pull(Variable)
pdp_plots <- lapply(top4_env, function(var) {
pd <- partial(rf_final, pred.var = var, train = train_data)
autoplot(pd) +
labs(title = var, x = var, y = "Shannon Diversity") +
theme_bw(base_size = 11)
})
png("partial_dependence_spatial_top4.png", width = 1100, height = 900, res = 130)
do.call(grid.arrange, c(pdp_plots, ncol = 2,
top = "Partial Dependence — Top 4 Environmental Predictors"))
dev.off()
## png
## 2
cat("Partial dependence plots saved.\n\n")
## Partial dependence plots saved.
gator_blue <- "#0021A5"
gator_orange <- "#0021A5"
selected_env <- c(
"pr", "srad", "TreeCanopy_NLCD", "Cogongrass_Cover", "elevation", "avg_pop_den_1km"
)
var_labels <- c(
Cogongrass_Cover = "Cogongrass Cover (%)",
pr = "Precipitation (mm)",
TreeCanopy_NLCD = "Tree Canopy Cover (%)",
srad = expression("Surface Radiation (W/m"^2*")"),
elevation = "Elevation (m)",
avg_pop_den_1km = expression("Population Density (pop/1 km"^2*")")
)
n_boot <- 100
panel_labels <- LETTERS[1:length(selected_env)]
pdp_boot_list <- lapply(seq_along(selected_env), function(i) {
var <- selected_env[i]
boot_res <- lapply(1:n_boot, function(j) {
idx <- sample(1:nrow(train_data), replace = TRUE)
train_boot <- train_data[idx, ]
rf_boot <- randomForest(
Shannon_Diversity ~ .,
data = train_boot,
ntree = 500
)
pd <- partial(rf_boot, pred.var = var, train = train_boot)
pd$iter <- j
pd
})
pd_all <- bind_rows(boot_res)
pd_summary <- pd_all %>%
group_by(!!sym(var)) %>%
summarise(
y = mean(yhat),
ymin = quantile(yhat, 0.025),
ymax = quantile(yhat, 0.975),
.groups = "drop"
) %>%
arrange(!!sym(var)) %>%
mutate(
y_smooth = loess(y ~ get(var), span = 0.6)$fitted,
ymin_smooth = loess(ymin ~ get(var), span = 0.6)$fitted,
ymax_smooth = loess(ymax ~ get(var), span = 0.6)$fitted
)
ggplot(pd_summary, aes_string(x = var)) +
ggplot(pd_summary, aes_string(x = var)) +
geom_ribbon(
aes(ymin = ymin_smooth, ymax = ymax_smooth),
fill = gator_orange,
alpha = 0.2
) +
geom_line(
aes(y = y_smooth),
color = gator_blue,
size = 1.3
) +
labs(
x = var_labels[[var]],
y = NULL # remove individual y-axis labels for shared label
) +
theme_bw(base_size = 11) +
theme(
panel.grid.major = element_line(color = "gray80", size = 0.3), # faint major grids
panel.grid.minor = element_line(color = "gray90", size = 0.2), # even fainter minor grids
plot.title = element_text(face = "bold", size = 12)
) +
ggtitle(panel_labels[i])
})
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
gridExtra::grid.arrange(
grobs = pdp_boot_list,
ncol = 3 # better for 7 panels
)
png("partial_dependence_spatial_bootstrap.png",
width = 1200, height = 900, res = 130)
gridExtra::grid.arrange(
grid::textGrob("Shannon Diversity", rot = 90, gp = grid::gpar(fontsize = 14)),
gridExtra::arrangeGrob(
grobs = pdp_boot_list,
ncol = 3
),
ncol = 2,
widths = c(0.05, 0.95) # controls spacing of y-axis label
)
dev.off()
## png
## 2
cat("============================================================\n")
## ============================================================
cat(" SPATIAL MODEL SUMMARY\n")
## SPATIAL MODEL SUMMARY
cat("============================================================\n")
## ============================================================
cat(sprintf(" Moran's I (response) : %.4f (p = %.4f)\n",
moran_test$estimate["Moran I statistic"], moran_test$p.value))
## Moran's I (response) : -0.0008 (p = 0.0233)
cat(sprintf(" MEMs retained : %d\n", length(sig_mems)))
## MEMs retained : 12
cat(sprintf(" MEM axes confirmed (Boruta): %d\n", length(selected_mems)))
## MEM axes confirmed (Boruta): 5
cat(sprintf(" Environmental vars confirmed: %d\n", length(selected_env)))
## Environmental vars confirmed: 6
cat(sprintf(" Total selected vars : %d\n", length(selected_vars)))
## Total selected vars : 11
cat("------------------------------------------------------------\n")
## ------------------------------------------------------------
cat(sprintf(" Test RMSE : %.4f\n", rmse_val))
## Test RMSE : 0.4735
cat(sprintf(" Test MAE : %.4f\n", mae_val))
## Test MAE : 0.3702
cat(sprintf(" Test Bias : %.4f\n", bias_val))
## Test Bias : -0.0506
cat(sprintf(" Relative Bias : %.2f%%\n", rel_bias))
## Relative Bias : -2.12%
cat(sprintf(" Test R² : %.4f\n", r2_val))
## Test R² : 0.1841
cat("------------------------------------------------------------\n")
## ------------------------------------------------------------
cat(sprintf(" Residual Moran's I : %.4f (p = %.4f)\n",
moran_resid$estimate["Moran I statistic"], moran_resid$p.value))
## Residual Moran's I : -0.0879 (p = 0.9554)
cat("============================================================\n")
## ============================================================
saveRDS(rf_final, "shannon_diversity_spatial_rf_model.rds")
cat("\nModel saved as 'shannon_diversity_spatial_rf_model.rds'\n")
##
## Model saved as 'shannon_diversity_spatial_rf_model.rds'