soil.vege

SOIL

library(dplyr)
library(tidyverse)
library(FactoMineR)
library(factoextra)
library(ggplot2)
library(purrr) 
library(tidyr)
library(ggtern)
library(cluster)
library(caret)  # For RMSE
library(gridExtra)  # For arranging plots side by side
library(readr)
Soil_Data <- read_csv("soil.data.csv")
colnames(Soil_Data)
 [1] "SSN"                         "Job.No"                     
 [3] "Study"                       "Site"                       
 [5] "cluster"                     "plot"                       
 [7] "depth_std"                   "depth_top"                  
 [9] "depth_bottom"                "pH"                         
[11] "SOC (%)"                     "TN (%)"                     
[13] "EC  (uS/cm)"                 "m3.P (mg/kg)"               
[15] "m3.Al (mg/kg)"               "m3.B (mg/kg)"               
[17] "m3.Ca (mg/kg)"               "m3.Fe (mg/kg)"              
[19] "m3.K (mg/kg)"                "m3.Mg (mg/kg)"              
[21] "m3.Na (mg/kg)"               "CEC (cmolc/kg)"             
[23] "ExAc (cmolc/kg)"             "PSI"                        
[25] "Clay (%)"                    "Silt (%)"                   
[27] "Sand (%)"                    "Soil.Texture.Interpretation"
[29] "Interpretation"              "...30"                      
# Select and rename relevant columns
soil_data <- Soil_Data %>%
  dplyr::select(-c(1:4, 30, 29, 28)) %>%
  rename(
    EC = `EC  (uS/cm)`, 
    P = `m3.P (mg/kg)`, 
    Al = `m3.Al (mg/kg)`, 
    B = `m3.B (mg/kg)`,   
    Ca = `m3.Ca (mg/kg)`,  
    Fe = `m3.Fe (mg/kg)`,  
    K = `m3.K (mg/kg)`,   
    Mg = `m3.Mg (mg/kg)`,  
    Na = `m3.Na (mg/kg)`,  
    CEC = `CEC (cmolc/kg)`, 
    ExAc = `ExAc (cmolc/kg)`,  
    TN = `TN (%)`,  
    SOC = `SOC (%)`, 
    Clay = `Clay (%)`,  
    Sand = `Sand (%)`,   
    Silt = `Silt (%)`
  ) 

# Convert cluster to factor
soil_data$cluster <- as.factor(soil_data$cluster)  

# Replace only specific NA values with 0
soil_data <- soil_data %>%
  mutate(across(where(is.numeric), ~ replace_na(., 0)))  # Replace NAs with 0 for numeric columns

# Compute the mean for each plot within each cluster
soil_data <- soil_data %>%
  group_by(cluster, plot) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE))  

# View the result
colnames(soil_data)
 [1] "cluster"      "plot"         "depth_top"    "depth_bottom" "pH"          
 [6] "SOC"          "TN"           "EC"           "P"            "Al"          
[11] "B"            "Ca"           "Fe"           "K"            "Mg"          
[16] "Na"           "CEC"          "ExAc"         "PSI"          "Clay"        
[21] "Silt"         "Sand"        

PCA TEST 1

soil.data <- soil_data[,-c(1:4)]

# Scale the data
soil_data_scaled <- scale(soil.data)
##This step will return the proportion of variance explained by each principal component and additional information, such as eigenvalues and the percentage of variance each PC explains.
# Perform PCA
pc.results.soil <- PCA(soil_data_scaled, scale.unit = TRUE, graph = FALSE)

# Correlation circle - variable contribution to PCs
fviz_pca_var(pc.results.soil, col.var = "contrib", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE)

# Extract eigenvalues
eigenvalues <- pc.results.soil$eig[,1]  # First column contains eigenvalues
head(eigenvalues)
   comp 1    comp 2    comp 3    comp 4    comp 5    comp 6 
7.5175646 4.4735839 1.9603027 1.2654116 1.0108506 0.4796236 
# Extract correlation of variables with PCs
correlations <- pc.results.soil$var$coord
head(correlations)
          Dim.1      Dim.2       Dim.3       Dim.4       Dim.5
pH   0.89288765 -0.3198878  0.17780786 -0.10981891 -0.11333255
SOC  0.25929878  0.9409967  0.01981635 -0.01406242  0.08483381
TN   0.05406606  0.9729491 -0.02929538 -0.01689972  0.12645926
EC   0.59203055  0.7034661  0.10441728  0.22294499 -0.17805452
P    0.44298990  0.5502939  0.47522877  0.25119190 -0.07212982
Al  -0.65468650  0.2344754 -0.52590976  0.14447660  0.40048924
#✅  Interpetation: 

##The eigenvalues represent the amount of variance captured by each principal component (PC).
#The first five components have eigenvalues greater than 1, suggesting they retain meaningful information.
#Cumulative variance:
#PC1 explains 39.23% of the total variance.
#PC2 adds 25.07%, bringing the cumulative variance to 64.30%
#Variable Contributions to Principal Components (Correlation with PCs):

##PC1:These properties are strongly affected by fertilisers especially NPK
#Strong positive correlations: CEC (0.97), Ca (0.96), Mg (0.93), B (0.89), EC (0.77), pH (0.76).
#Strong negative correlation: Al (-0.49), Fe (-0.54), ExAc (-0.39).
#Suggests PC1 represents cation exchange capacity (CEC) and soil fertility.
#PC2:Indirect Effects of Fertilisers
#Strongest positive correlations: TN (0.88), SOC (0.81), Clay (-0.86), Sand (0.76).
#Indicates PC2 captures organic matter and soil texture gradients.

#K mean clustering 
# Determine the optimal number of clusters 
#Elbow method; Measures the total within-cluster sum of squares (WSS) as the number of clusters increases. It identifies the "elbow point," where adding more clusters no longer significantly reduces WSS.
# Determine the optimal number of clusters using the elbow method
fviz_nbclust(pc.results.soil$ind$coord, kmeans, method = "wss") +
  labs(title = "Optimal Clusters (Elbow Method)")

# Gap statistic for determining clusters
# Gap statistic; Compares the total within-cluster variation for different numbers of clusters with the expected variation under a null reference distribution.
set.seed(123)  # Ensures reproducibility
gap_stat <- clusGap(pc.results.soil$ind$coord, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
fviz_gap_stat(gap_stat) +
  labs(title = "Optimal Clusters (Gap Statistic)")

# Perform K-means clustering on PCA scores
set.seed(123)
k.groups <- kmeans(pc.results.soil$ind$coord, centers = 4, nstart = 25)
#nstart = 25. This means that R will try 25 different random starting assignments and then select the best results corresponding to the one with the lowest within cluster variation.

# Visualise PCA results with k-means clusters}
fviz_pca_ind(pc.results.soil, col.ind = as.factor(k.groups$cluster), 
             label = "var",  # Label the variables in the biplot
             addEllipses = TRUE,  # Add ellipses to group points
             geom = "text") +  # Use text only, no scatter points
  geom_text(aes(label = as.factor(soil_data$cluster), color = as.factor(k.groups$cluster)), 
            size = 3, vjust = -0.5)  # Add cluster numbers with matching colors

SNGI

# Extract variance explained by PC1 and PC2
variance_explained <- pc.results.soil$eig[,2] / sum(pc.results.soil$eig[,2])  # Ensure it's properly normalised

# Compute SNGI using the first two principal components
soil_data$SNGI <- pc.results.soil$ind$coord[,1] * variance_explained[1] + 
                 pc.results.soil$ind$coord[,2] * variance_explained[2]

# Classify clusters into four categories based on SNGI quantiles
soil_data$Health_Status <- cut(soil_data$SNGI, 
                               breaks = quantile(soil_data$SNGI, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE), 
                               labels = c("Very Low Nutrients", "Low Nutrients", "Moderate Nutrients", "Excessive Nutrients"), 
                               include.lowest = TRUE)

# PCA Biplot with Health Classification
pca_plot <- fviz_pca_ind(pc.results.soil, 
                          col.ind = as.factor(soil_data$Health_Status), 
                          addEllipses = TRUE, 
                          repel = TRUE) +
  scale_color_manual(values = c("Very Low Nutrients" = "#D73027", "Low Nutrients" = "#FC8D59", 
                                "Moderate Nutrients" = "#91BFDB", "Excessive Nutrients" = "#1A9850")) +
  labs(title = "PCA Biplot with Soil Nutrient Gradient Index Classification", color = "Soil Nutrient Gradient Status")



# Convert Cluster to a factor if not already
soil_data$Cluster <- as.factor(soil_data$cluster)  

# Create heatmap of Soil Nutrient Gradient Index (SNGI) Across Clusters
heatmap_plot <- ggplot(soil_data, aes(x = Cluster, y = SNGI, fill = Health_Status)) +
  geom_col() +  # Use geom_col() instead of geom_bar() to avoid incorrect stat usage
  scale_fill_manual(values = c("Very Low Nutrients" = "#D73027", "Low Nutrients" = "#FC8D59", 
                               "Moderate Nutrients" = "#91BFDB", "Excessive Nutrients" = "#1A9850")) +
  labs(title = "Soil Nutrient Gradient Index by Cluster", x = "Cluster", y = "Soil Nutrient Gradient Index") +
  theme_minimal()


# Print plots
print(heatmap_plot)

# Compute Mean SNGI per Cluster and Classify Based on Quartiles
mean_sngi_cluster <- soil_data %>%
  group_by(Cluster) %>%
  summarise(Mean_SNGI = mean(SNGI, na.rm = TRUE)) %>%  # Handle missing values
  mutate(Health_Status = case_when(
    Mean_SNGI <= quantile(Mean_SNGI, 0.25, na.rm = TRUE) ~ "Very Low Nutrients",
    Mean_SNGI > quantile(Mean_SNGI, 0.25, na.rm = TRUE) & Mean_SNGI <= quantile(Mean_SNGI, 0.50, na.rm = TRUE) ~ "Low Nutrients",
    Mean_SNGI > quantile(Mean_SNGI, 0.50, na.rm = TRUE) & Mean_SNGI <= quantile(Mean_SNGI, 0.75, na.rm = TRUE) ~ "Moderate Nutrients",
    Mean_SNGI > quantile(Mean_SNGI, 0.75, na.rm = TRUE) ~ "Excessive Nutrients"
  )) %>%
  mutate(Health_Status = factor(Health_Status, levels = c("Very Low Nutrients", "Low Nutrients", "Moderate Nutrients", "Excessive Nutrients"))) %>%
  arrange(Health_Status, Mean_SNGI)  # Arrange by category & within category by SNGI

# Arrange Clusters in Ascending Order of Mean SNGI (Most Degraded to Most Healthy)
mean_sngi_cluster <- mean_sngi_cluster %>%
  arrange(Mean_SNGI)  # Orders from lowest (most degraded) to highest (healthiest)

# Plot Mean SNGI per Cluster (Ordered)
ggplot(mean_sngi_cluster, aes(x = reorder(as.factor(Cluster), Mean_SNGI), y = Mean_SNGI, fill = Health_Status)) +
  geom_col() +
  scale_fill_manual(values = c("Very Low Nutrients" = "#D73027", "Low Nutrients" = "#FC8D59", 
                               "Moderate Nutrients" = "#91BFDB", "Excessive Nutrients" = "#1A9850")) +
  labs(title = "Mean Soil Nutrient Gradient Index by Cluster", x = "Cluster", y = "Mean SNGI") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for clarity

PCA TEST 2

# Remove first 5 columns
#soil.data <- soil_data[,-c(1:4)]
soil.data1 <- soil_data[, c("EC", "SOC", "TN", "Sand", "P",  "Clay")]

# Scale the data
soil_data_scaled1 <- scale(soil.data1)
##This step will return the proportion of variance explained by each principal component and additional information, such as eigenvalues and the percentage of variance each PC explains.
# Perform PCA
pc.results.soil1 <- PCA(soil_data_scaled1, scale.unit = TRUE, graph = FALSE)

# Correlation circle - variable contribution to PCs
fviz_pca_var(pc.results.soil1, col.var = "contrib", 
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), 
             repel = TRUE)

# Extract eigenvalues
eigenvalues1 <- pc.results.soil1$eig # First column contains eigenvalues
print((eigenvalues1))
       eigenvalue percentage of variance cumulative percentage of variance
comp 1 3.66332063             61.0553438                          61.05534
comp 2 1.59201137             26.5335228                          87.58887
comp 3 0.44535303              7.4225506                          95.01142
comp 4 0.22307955              3.7179926                          98.72941
comp 5 0.05762958              0.9604930                          99.68990
comp 6 0.01860584              0.3100973                         100.00000
# Extract correlation of variables with PCs
correlations1 <- pc.results.soil1$var$coord
head(correlations1)
          Dim.1        Dim.2       Dim.3       Dim.4       Dim.5
EC    0.7742268  0.524115486  0.03527139  0.34183792 -0.08784713
SOC   0.9390792  0.178278851  0.25202878 -0.10471379  0.04975402
TN    0.9429117 -0.004558563  0.28166351 -0.14962120  0.01987305
Sand  0.6218893 -0.729352360 -0.13488106  0.21157215  0.13538793
P     0.6601344  0.535310770 -0.50674887 -0.13672205  0.04581667
Clay -0.6858700  0.683372306  0.16207653  0.09705264  0.16313493
#✅  Interpetation: 


#PC1 (Dim.1) explains 64.91% of total variance, indicating it is the dominant factor influencing soil properties.
#PC2 (Dim.2) explains an additional 23.48%, bringing the cumulative variance to 88.39% → 

#PC1 (Dim.1) 

#Strong positive correlations: SOC (0.94), TN (0.94), EC (0.82), P (0.76), Sand (0.63)
#Strong negative correlation: Clay (-0.69)
#Suggests PC1 represents soil fertility, organic matter, and nutrient availability.
#PC2 (Dim.2) 

#Positive correlations: Clay (0.69), EC (0.45), P (0.39)
#Negative correlation: Sand (-0.73)
#Indicates a contrast between sandy soils and clay-rich soils, linked to organic matter retention.

#K mean clustering 
# Determine the optimal number of clusters 
#Elbow method; Measures the total within-cluster sum of squares (WSS) as the number of clusters increases. It identifies the "elbow point," where adding more clusters no longer significantly reduces WSS.
# Determine the optimal number of clusters using the elbow method
fviz_nbclust(pc.results.soil1$ind$coord, kmeans, method = "wss") +
  labs(title = "Optimal Clusters (Elbow Method)")

# Gap statistic for determining clusters
# Gap statistic; Compares the total within-cluster variation for different numbers of clusters with the expected variation under a null reference distribution.
set.seed(123)  # Ensures reproducibility
gap_stat <- clusGap(pc.results.soil1$ind$coord, FUN = kmeans, nstart = 25, K.max = 10, B = 50)
fviz_gap_stat(gap_stat) +
  labs(title = "Optimal Clusters (Gap Statistic)")

# Perform K-means clustering on PCA scores
set.seed(123)
k.groups1 <- kmeans(pc.results.soil1$ind$coord, centers = 3, nstart = 25)
#nstart = 25. This means that R will try 25 different random starting assignments and then select the best results corresponding to the one with the lowest within cluster variation.

# Visualise PCA results with k-means clusters}
fviz_pca_ind(pc.results.soil1, col.ind = as.factor(k.groups1$cluster), 
             label = "var",  # Label the variables in the biplot
             addEllipses = TRUE,  # Add ellipses to group points
             geom = "text") +  # Use text only, no scatter points
  geom_text(aes(label = as.factor(soil_data$cluster), color = as.factor(k.groups1$cluster)), 
            size = 3, vjust = -0.5)  

SHI

# Compute SHI using PC1 and PC2 coordinates
soil_data$SHI <-    (pc.results.soil$ind$coord[, 2] )

# Print first few rows of SHI
head(soil_data$SHI)
        1         2         3         4         5         6 
-3.587973 -3.801725 -2.515206 -3.009951 -3.951300 -3.072993 
# Classify clusters into four categories based on SHI quantiles
soil_data$Health_Status <- cut(soil_data$SHI, 
                               breaks = quantile(soil_data$SHI, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE, type = 7), 
                               labels = c("Degraded", "Near Degraded", "Near Healthy", "Healthy"), 
                               include.lowest = TRUE)

# PCA Biplot with Health Classification (Using PC2)
pca_plot <- fviz_pca_ind(pc.results.soil1, 
                          col.ind = as.factor(soil_data$Health_Status), 
                          addEllipses = TRUE, 
                          repel = TRUE) +
  scale_color_manual(values = c("Degraded" = "#D73027", "Near Degraded" = "#FC8D59", 
                                "Near Healthy" = "#91BFDB", "Healthy" = "#1A9850")) +
  labs(title = "PCA Biplot with Soil Health Classification (PC2)", color = "Soil Health Status")

# Convert Cluster to a factor
soil_data$Cluster <- factor(soil_data$Cluster, levels = unique(soil_data$Cluster))

# Heatmap of Soil Health Index (SHI) Across Clusters
heatmap_plot <- ggplot(soil_data, aes(x = as.factor(Cluster), y = SHI, fill = Health_Status)) +
  geom_col() +  
  scale_fill_manual(values = c("Degraded" = "#D73027", "Near Degraded" = "#FC8D59", 
                               "Near Healthy" = "#91BFDB", "Healthy" = "#1A9850")) +
  labs(title = "Soil Health Index by Cluster (PC2)", x = "Cluster", y = "Soil Health Index") +
  theme_minimal()

# Print plots
print(heatmap_plot)

# Compute Mean SHI per Cluster and Classify Based on Quartiles
mean_shi_cluster <- soil_data %>%
  group_by(Cluster) %>%
  summarise(Mean_SHI = mean(SHI, na.rm = TRUE))

# Compute quartiles once and use them in case_when
q <- quantile(mean_shi_cluster$Mean_SHI, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)

mean_shi_cluster <- mean_shi_cluster %>%
  mutate(Health_Status = case_when(
    Mean_SHI <= q[1] ~ "Degraded",
    Mean_SHI > q[1] & Mean_SHI <= q[2] ~ "Near Degraded",
    Mean_SHI > q[2] & Mean_SHI <= q[3] ~ "Near Healthy",
    Mean_SHI > q[3] ~ "Healthy"
  )) %>%
  mutate(Health_Status = factor(Health_Status, levels = c("Degraded", "Near Degraded", "Near Healthy", "Healthy"))) %>%
  arrange(Health_Status, Mean_SHI)

# Arrange Clusters in Ascending Order of Mean SHI
mean_shi_cluster <- mean_shi_cluster %>%
  arrange(Mean_SHI)

# Plot Mean SHI per Cluster (Ordered)
ggplot(mean_shi_cluster, aes(x = reorder(as.factor(Cluster), Mean_SHI), y = Mean_SHI, fill = Health_Status)) +
  geom_col() +
  scale_fill_manual(values = c("Degraded" = "#D73027", "Near Degraded" = "#FC8D59", 
                               "Near Healthy" = "#91BFDB", "Healthy" = "#1A9850")) +
  labs(title = "", x = "Cluster", y = "Mean SHI") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

SOC and TN

# Step 1: Compute mean and standard deviation (SD) for SOC and TN per cluster
summary_data <- soil_data %>%
  group_by(cluster) %>%
  summarise(
    mean_SOC = mean(SOC, na.rm = TRUE),
    sd_SOC = sd(SOC, na.rm = TRUE),
    mean_TN = mean(TN, na.rm = TRUE),
    sd_TN = sd(TN, na.rm = TRUE)
  )

# Step 2: Create SOC plot
plot_SOC <- ggplot(summary_data, aes(x = reorder(cluster, -mean_SOC), y = mean_SOC)) +
  geom_point(size = 3, color = "blue") +  # Points for means
  geom_errorbar(aes(ymin = mean_SOC - sd_SOC, ymax = mean_SOC + sd_SOC), 
                width = 0.2, color = "blue") +  # Error bars (SD)
  labs(x = "Degradation gradient", y = "Mean SOC ± SD") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate cluster labels

# Step 3: Create TN plot
plot_TN <- ggplot(summary_data, aes(x = reorder(cluster, -mean_TN), y = mean_TN)) +
  geom_point(size = 3, color = "navy") +  # Points for means
  geom_errorbar(aes(ymin = mean_TN - sd_TN, ymax = mean_TN + sd_TN), 
                width = 0.2, color = "navy") +  # Error bars (SD)
  labs(x = "Degradation gradient", y = "Mean TN ± SD") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate cluster labels

# Step 4: Arrange the plots side by side
grid_plot <- grid.arrange(plot_SOC, plot_TN, ncol = 2)

# Step 5: Display the grid
print(grid_plot)
TableGrob (1 x 2) "arrange": 2 grobs
  z     cells    name           grob
1 1 (1-1,1-1) arrange gtable[layout]
2 2 (1-1,2-2) arrange gtable[layout]

Grasses

# Load required libraries
library(tidyverse)
library(vegan)
library(readr)
library(dplyr)
library(ggplot2)
library(ggpubr)
library(tidyr)
library(rstatix)
library(gridExtra)
library(dunn.test)
library(viridis)
library(grid)
library(lme4)       # For linear mixed models
library(lmerTest)   # For p-values
library(broom.mixed) # For extracting tidy summaries
library(dplyr)
library(Metrics)
library(patchwork)
library(mgcv)
library(broom)
library(knitr)
library(kableExtra)
library(formattable)
library(purrr) 
library(entropart)

# Read CSV file
grass.data <- read_csv("Data/query_ldsf_rangeland_Enarau.csv")

# Rename columns
grass.data <- grass.data %>%
  dplyr::rename(
    herbs = rl_nearestherb,
    pg = rl_nearest_grass_perennial_spp,
    dist.pg = rl_dist_grass_perennial,
    ag = rl_nearest_grass_annual_spp,
    dist.ag = rl_dist_grass_annual,
    wp = rl_nearest_woody_spp,
    f = rl_nearest_forb_spp,
    dist.wp = rl_dist_woody_plant,
    dist.f = rl_dist_forb,
    bare.ground = rl_bareground,
    leaf.litter = rl_leaflitter,
    dung.presence = rl_dung,
    rock.cover = rl_rock,
    point.canopy = rl_canopy,
    grass.tuft = rl_grasstuft,
    cluster = Cluster,
    plot = Plot
  )

# Replace NA values in numeric columns with 0
grass.data <- grass.data %>%
  mutate(across(where(is.numeric), ~ replace_na(., 0)))

# View the updated data frame
colnames(grass.data)
 [1] "PARENT_KEY"    "Site"          "cluster"       "plot"         
 [5] "Longitude"     "Latitude"      "VegStructure"  "Year"         
 [9] "Trees"         "Shrubs"        "KEY"           "bare.ground"  
[13] "point.canopy"  "grass.tuft"    "leaf.litter"   "rock.cover"   
[17] "dung.presence" "herbs"         "pg"            "dist.pg"      
[21] "ag"            "dist.ag"       "f"             "dist.f"       
[25] "wp"            "dist.wp"      

Coded names

library(stringr)
# Function to extract first letters from species names
coded <- function(species_name) {
species_name <- str_squish(str_replace_all(species_name, "\u00A0", " "))  # Clean spaces
parts <- unlist(str_split(species_name, " "))
if (length(parts) == 0) return(NA_character_)
paste0(str_sub(parts[1], 1, 1), ifelse(length(parts) > 1, str_sub(parts[2], 1, 1), ""))
}
# Columns that need to be processed
species_cols <- c("ag", "pg", "wp", "f")
# Apply function and clean codes
grass.data <- grass.data %>%
mutate(across(all_of(species_cols),
~ vapply(., coded, FUN.VALUE = character(1)),
.names = "{.col}_code")) %>%
mutate(across(ends_with("_code"), ~ ifelse(. == "N", NA, .)))
# Define invasive species and classify status
invasive_species <- c("Sc", "Si", "Ds", "Tm", "Rc", "Ln", "Lc", "Aa", "Dra", "Gb", "Ic",
"Is", "Ld", "Xo", "Ac", "Xs", "Es", "Bp", "Dre", "Sn", "Ct", "Ar",
"Cs", "Af", "In")
grass.data <- grass.data %>%
mutate(across(c("wp_code", "f_code"), ~ ifelse(. %in% invasive_species, "Invasive", "Native"), .names = "{.col}_status"))
# Add new columns to classify species as Invasive or Native
grass.data <- grass.data %>%
mutate(
wp.inv = ifelse(wp_code %in% invasive_species, wp_code, NA),
wp.ntv = ifelse(!wp_code %in% invasive_species & !is.na(wp_code), wp_code, NA),
f.inv  = ifelse(f_code %in% invasive_species, f_code, NA),
f.ntv  = ifelse(!f_code %in% invasive_species & !is.na(f_code), f_code, NA)
)
head(grass.data)
# A tibble: 6 × 36
  PARENT_KEY     Site  cluster  plot Longitude Latitude VegStructure  Year Trees
  <chr>          <chr>   <dbl> <dbl>     <dbl>    <dbl> <chr>        <dbl> <chr>
1 uuid:b716dfd2… Enar…       1     1      35.4    -1.00 grassland     2024 yes  
2 uuid:b716dfd2… Enar…       1     1      35.4    -1.00 grassland     2024 yes  
3 uuid:b716dfd2… Enar…       1     1      35.4    -1.00 grassland     2024 yes  
4 uuid:b716dfd2… Enar…       1     1      35.4    -1.00 grassland     2024 yes  
5 uuid:b716dfd2… Enar…       1     1      35.4    -1.00 grassland     2024 yes  
6 uuid:b716dfd2… Enar…       1     1      35.4    -1.00 grassland     2024 yes  
# ℹ 27 more variables: Shrubs <chr>, KEY <chr>, bare.ground <chr>,
#   point.canopy <chr>, grass.tuft <chr>, leaf.litter <chr>, rock.cover <chr>,
#   dung.presence <chr>, herbs <chr>, pg <chr>, dist.pg <dbl>, ag <chr>,
#   dist.ag <dbl>, f <chr>, dist.f <dbl>, wp <chr>, dist.wp <dbl>,
#   ag_code <chr>, pg_code <chr>, wp_code <chr>, f_code <chr>,
#   wp_code_status <chr>, f_code_status <chr>, wp.inv <chr>, wp.ntv <chr>,
#   f.inv <chr>, f.ntv <chr>

q

To assess plant species diversity across different clusters and plots, we computed Hill numbers ( q₀, q₁, and q₂) using species occurrence data in R Package entropart.

  • q₀ (Species richness): Counts the number of species present.

  • q₁ (Shannon diversity): Accounts for both species richness and evenness.

  • q₂ (Simpson diversity): Gives more weight to dominant species, reducing the influence of rare species.

    Species data were obtained from multiple functional groups, including:

    • Annual grasses, Perennial grasses, Woody plants (native), Forbs (native), Invasive species (forbs and woody plants)
# Function to compute Hill numbers 
hill.no <- function(species_matrix) {
  q0 <- apply(species_matrix, 1, function(x) Diversity(as.ProbaVector(x / sum(x)), q = 0)) # Species richness
  q1 <- apply(species_matrix, 1, function(x) Diversity(as.ProbaVector(x / sum(x)), q = 1)) # Shannon diversity
  q2 <- apply(species_matrix, 1, function(x) Diversity(as.ProbaVector(x / sum(x)), q = 2)) # Simpson diversity
 
  data.frame(
    cluster = as.numeric(sub("_.*", "", rownames(species_matrix))),  
    plot = as.numeric(sub(".*_", "", rownames(species_matrix))),  
    q0 = q0,
    q1 = q1,
    q2 = q2
  )
}

process_species_group <- function(data, species_column) {
  counts <- data %>%
    filter(!is.na(!!sym(species_column)) & !!sym(species_column) != "N") %>%
    group_by(cluster, plot, !!sym(species_column)) %>%
    summarise(count = n(), .groups = "drop")
  
  species_pivot <- counts %>%
    pivot_wider(names_from = !!sym(species_column), values_from = count, values_fill = list(count = 0)) %>%
    unite("cluster_plot", cluster, plot, sep = "_", remove = FALSE) %>%
    column_to_rownames(var = "cluster_plot")
  
  species_pivot[] <- lapply(species_pivot, as.numeric)
  hill.no(species_pivot)
}

# Compute diversity for different species groups
diversity.all <- bind_rows(
  process_species_group(grass.data, "ag_code") %>% mutate(Species_Type = "Annual Grasses"),
  process_species_group(grass.data, "pg_code") %>% mutate(Species_Type = "Perennial Grasses"),
  process_species_group(grass.data, "wp.ntv") %>% mutate(Species_Type = "Woody Plants"),
  process_species_group(grass.data, "f.ntv") %>% mutate(Species_Type = "Forbs"),
  process_species_group(grass.data, "f.inv") %>%
    bind_rows(process_species_group(grass.data, "wp.inv")) %>%
    mutate(Species_Type = "Invasives")
)

diversity_long <- diversity.all %>%
  pivot_longer(cols = q0:q2, names_to = "Hill_Number", values_to = "Diversity")

q visualisation

The visualisation below shows the mean q for each cluster; which is the alpha diversity.

# ----------------- Function to Order Clusters by Highest Mean for Each Category -----------------
order_clusters <- function(data, hill_number, species_type) {
  data %>%
    filter(Hill_Number == hill_number, Species_Type == species_type) %>%
    group_by(cluster) %>%
    summarize(Mean_Diversity = mean(Diversity, na.rm = TRUE)) %>%
    arrange(desc(Mean_Diversity)) %>%
    pull(cluster)  
}

# Generate independent cluster orders for each category and Hill number
cluster_order_ag_q0 <- order_clusters(diversity_long, "q0", "Annual Grasses")
cluster_order_ag_q1 <- order_clusters(diversity_long, "q1", "Annual Grasses")
cluster_order_ag_q2 <- order_clusters(diversity_long, "q2", "Annual Grasses")

cluster_order_pg_q0 <- order_clusters(diversity_long, "q0", "Perennial Grasses")
cluster_order_pg_q1 <- order_clusters(diversity_long, "q1", "Perennial Grasses")
cluster_order_pg_q2 <- order_clusters(diversity_long, "q2", "Perennial Grasses")

cluster_order_wp_q0 <- order_clusters(diversity_long, "q0", "Woody Plants")
cluster_order_wp_q1 <- order_clusters(diversity_long, "q1", "Woody Plants")
cluster_order_wp_q2 <- order_clusters(diversity_long, "q2", "Woody Plants")

cluster_order_f_q0 <- order_clusters(diversity_long, "q0", "Forbs")
cluster_order_f_q1 <- order_clusters(diversity_long, "q1", "Forbs")
cluster_order_f_q2 <- order_clusters(diversity_long, "q2", "Forbs")

cluster_order_inv_q0 <- order_clusters(diversity_long, "q0", "Invasives")
cluster_order_inv_q1 <- order_clusters(diversity_long, "q1", "Invasives")
cluster_order_inv_q2 <- order_clusters(diversity_long, "q2", "Invasives")

# ----------------- Function to Create Error Bar Plots for Any Category -----------------
plot_error <- function(data, hill_number, species_type, title_suffix, cluster_order) {
  data_filtered <- data %>%
    filter(Hill_Number == hill_number, Species_Type == species_type) %>%
    group_by(cluster) %>%
    summarise(Mean_Diversity = mean(Diversity, na.rm = TRUE),
              SD_Diversity = sd(Diversity, na.rm = TRUE)) %>%
    mutate(cluster = factor(cluster, levels = cluster_order))  # Apply ordering

  ggplot(data_filtered, aes(x = cluster, y = Mean_Diversity)) +
    geom_point(color = "blue", size = 3) +  # Mean diversity points
    geom_errorbar(aes(ymin = Mean_Diversity - SD_Diversity, ymax = Mean_Diversity + SD_Diversity), 
                  width = 0.2, color = "red") +  # Error bars
    labs(title = paste(species_type, title_suffix), x = "cluster", y = "Diversity") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# ----------------- Generate Plots for Each Category -----------------
# Annual Grasses
plot_ag_q0 <- plot_error(diversity_long, "q0", "Annual Grasses", "(q0)", cluster_order_ag_q0)
plot_ag_q1 <- plot_error(diversity_long, "q1", "Annual Grasses", "(q1)", cluster_order_ag_q1)
plot_ag_q2 <- plot_error(diversity_long, "q2", "Annual Grasses", "(q2)", cluster_order_ag_q2)

# Perennial Grasses
plot_pg_q0 <- plot_error(diversity_long, "q0", "Perennial Grasses", "(q0)", cluster_order_pg_q0)
plot_pg_q1 <- plot_error(diversity_long, "q1", "Perennial Grasses", "(q1)", cluster_order_pg_q1)
plot_pg_q2 <- plot_error(diversity_long, "q2", "Perennial Grasses", "(q2)", cluster_order_pg_q2)

# Woody Plants Native
plot_wp_q0 <- plot_error(diversity_long, "q0", "Woody Plants", "(q0)", cluster_order_wp_q0)
plot_wp_q1 <- plot_error(diversity_long, "q1", "Woody Plants", "(q1)", cluster_order_wp_q1)
plot_wp_q2 <- plot_error(diversity_long, "q2", "Woody Plants", "(q2)", cluster_order_wp_q2)

# Forbs Native
plot_f_q0 <- plot_error(diversity_long, "q0", "Forbs", "(q0)", cluster_order_f_q0)
plot_f_q1 <- plot_error(diversity_long, "q1", "Forbs", "(q1)", cluster_order_f_q1)
plot_f_q2 <- plot_error(diversity_long, "q2", "Forbs", "(q2)", cluster_order_f_q2)

# Invasive Species
plot_inv_q0 <- plot_error(diversity_long, "q0", "Invasives", "(q0)", cluster_order_inv_q0)
plot_inv_q1 <- plot_error(diversity_long, "q1", "Invasives", "(q1)", cluster_order_inv_q1)
plot_inv_q2 <- plot_error(diversity_long, "q2", "Invasives", "(q2)", cluster_order_inv_q2)

# ----------------- Arrange Plots in Grids -----------------
# Annual Grasses
(plot_ag_q0 | plot_ag_q1 | plot_ag_q2)

# Perennial Grasses
(plot_pg_q0 | plot_pg_q1 | plot_pg_q2)

# Woody Plants Native
(plot_wp_q0 | plot_wp_q1 | plot_wp_q2)

# Forbs Native
(plot_f_q0 | plot_f_q1 | plot_f_q2)

# Invasive Species
(plot_inv_q0 | plot_inv_q1 | plot_inv_q2)

q vs pc

To assess the relationship between plant diversity calculated using Hill numbers (q0, q1, q2 and soil properties, we extracted principal component analysis (PCA) scores for PC1 and PC2 and merged the scores with plant diversity data for different plant functional groups. We applied Generalized Additive Mixed Models (GAMMs) to evaluate the effects of soil properties (PC1 and PC2) on plant diversity metrics. GAMMs were chosen due to their ability to model nonlinear relationships while accounting for random effects. The diversity indices (q0, q1, q2) for different species types were response variables, PC1 and PC2 were independent, and cluster was included as a random effect to account for spatial autocorrelation. The models were estimated using the restricted maximum likelihood (REML) method.

To assess model fit and assumptions, we examined the linearity of relationships between predictor variables and diversity indices using scatter plots with LOESS smoothers.

# Step 1: Prepare Data
# Extract PCA scores for PC1 and PC2
soil_pca_scores <- as.data.frame(pc.results.soil1$ind$coord)[, 1:2]
colnames(soil_pca_scores) <- c("PC1", "PC2")

# Convert Cluster and Plot to factors
soil_pca_scores <- soil_pca_scores %>%
  mutate(cluster = as.factor(soil_data$cluster), plot = as.factor(soil_data$plot))

soil_data <- soil_data %>%
  mutate(cluster = as.factor(cluster), plot = as.factor(plot))

# Reshape diversity data
diversity_wide <- diversity_long %>%
  pivot_wider(
    names_from = c(Species_Type, Hill_Number),
    values_from = Diversity,
    names_glue = "{Hill_Number}_{Species_Type}",
    values_fill = list(Diversity = NA)
  ) %>%
  mutate(cluster = as.factor(cluster), plot = as.factor(plot))

# Merge diversity metrics with soil PCA scores and soil data
diversity_pca <- diversity_wide %>%
  left_join(soil_pca_scores, by = c("cluster", "plot")) %>%
  left_join(soil_data, by = c("cluster", "plot"))

# Rename diversity columns
diversity_pca <- diversity_pca %>%
  rename(
    q0.ag = `q0_Annual Grasses`, q1.ag = `q1_Annual Grasses`, q2.ag = `q2_Annual Grasses`,
    q0.pg = `q0_Perennial Grasses`, q1.pg = `q1_Perennial Grasses`, q2.pg = `q2_Perennial Grasses`,
    q0.wp = `q0_Woody Plants`, q1.wp = `q1_Woody Plants`, q2.wp = `q2_Woody Plants`,
    q0.f  = `q0_Forbs`, q1.f  = `q1_Forbs`, q2.f  = `q2_Forbs`,
    q0.inv = `q0_Invasives`, q1.inv = `q1_Invasives`, q2.inv = `q2_Invasives`
  ) %>%
  mutate(
    across(where(is.list), ~ map_dbl(.x, ~ ifelse(length(.x) == 0, NA, as.numeric(.x)))),  # Convert lists & handle empty elements
    across(where(is.numeric), ~ replace_na(.x, 0))  # Replace NA with 0 only for numeric columns
  )

# Define response variables
variables <- c("q0.ag", "q1.ag", "q2.ag", "q0.pg", "q1.pg", "q2.pg", 
               "q0.wp", "q1.wp", "q2.wp", "q0.f", "q1.f", "q2.f", 
               "q0.inv", "q1.inv", "q2.inv")

# Test whether the data is linear
ggplot(diversity_pca, aes(x = PC1, y = q2.pg)) +
  geom_point() +
  geom_smooth(method = "loess", col = "red") +
  labs(title = "Checking Linearity")

# Step 1: Run GAMM for each variable
models_gamm <- lapply(variables, function(var) {
  gam(as.formula(paste(var, "~ s(PC1) + s(PC2) + s(cluster, bs='re')")), 
      data = diversity_pca, 
      method = "REML")
})

# Step 2: Summarise GAMM results
summaries_gamm <- lapply(models_gamm, summary)

# Step 3: Extract significant GAMM results
extract_significant_results <- function(summary_gamm, var_name) {
  if (!is.null(summary_gamm$s.table)) {
    summary_gamm$s.table %>%
      as.data.frame() %>%
      rename(EDF = edf, F_Value = F, P_Value = `p-value`) %>%
      filter(P_Value < 0.05) %>%
      mutate(Term = rownames(.), Variable = var_name)
  } else {
    data.frame()  # Return empty dataframe if no s.table
  }
}

significant_results_gamm <- bind_rows(lapply(seq_along(summaries_gamm), function(i) {
  extract_significant_results(summaries_gamm[[i]], variables[i])
}))

# Step 4: Display GAMM results in a simple kable table
if (nrow(significant_results_gamm) > 0) {
  gamm_table <- kable(significant_results_gamm, digits = 3, caption = "Significant Smooth Terms in GAMMs")
  print(gamm_table)
} else {
  message("No significant smooth terms found in GAMMs.")
}


Table: Significant Smooth Terms in GAMMs

|                |    EDF| Ref.df| F_Value| P_Value|Term       |Variable |
|:---------------|------:|------:|-------:|-------:|:----------|:--------|
|s(PC1)...1      |  1.000|  1.001|   7.247|   0.008|s(PC1)     |q0.ag    |
|s(cluster)...2  | 12.693| 15.000|   6.025|   0.000|s(cluster) |q0.ag    |
|s(PC1)...3      |  1.000|  1.000|   7.003|   0.009|s(PC1)     |q1.ag    |
|s(cluster)...4  | 11.915| 15.000|   4.166|   0.000|s(cluster) |q1.ag    |
|s(PC1)...5      |  1.000|  1.000|   6.757|   0.010|s(PC1)     |q2.ag    |
|s(PC2)...6      |  1.000|  1.001|   4.659|   0.033|s(PC2)     |q2.ag    |
|s(cluster)...7  | 11.229| 15.000|   3.183|   0.000|s(cluster) |q2.ag    |
|s(cluster)...8  |  9.766| 15.000|   1.670|   0.002|s(cluster) |q0.pg    |
|s(cluster)...9  |  6.519| 15.000|   0.767|   0.045|s(cluster) |q2.pg    |
|s(PC1)...10     |  1.000|  1.000|  18.060|   0.000|s(PC1)     |q0.wp    |
|s(PC2)...11     |  1.001|  1.001|  12.664|   0.001|s(PC2)     |q0.wp    |
|s(cluster)...12 |  7.104| 15.000|   0.866|   0.036|s(cluster) |q0.wp    |
|s(PC1)...13     |  1.000|  1.000|  20.176|   0.000|s(PC1)     |q1.wp    |
|s(PC2)...14     |  1.000|  1.000|  13.489|   0.000|s(PC2)     |q1.wp    |
|s(cluster)...15 |  6.988| 15.000|   0.838|   0.041|s(cluster) |q1.wp    |
|s(PC1)...16     |  1.000|  1.000|  20.376|   0.000|s(PC1)     |q2.wp    |
|s(PC2)...17     |  1.000|  1.001|  13.548|   0.000|s(PC2)     |q2.wp    |
|s(cluster)...18 |  7.173| 15.000|   0.884|   0.034|s(cluster) |q2.wp    |
|s(cluster)...19 | 11.040| 15.000|   2.919|   0.000|s(cluster) |q0.f     |
|s(cluster)...20 | 10.491| 15.000|   2.405|   0.000|s(cluster) |q1.f     |
|s(cluster)...21 |  9.752| 15.000|   1.866|   0.001|s(cluster) |q2.f     |
|s(PC2)...22     |  1.619|  2.013|   4.004|   0.021|s(PC2)     |q0.inv   |
|s(cluster)...23 |  8.681| 15.000|   1.382|   0.004|s(cluster) |q0.inv   |
|s(PC2)...24     |  1.210|  1.386|   3.643|   0.035|s(PC2)     |q1.inv   |
|s(cluster)...25 |  9.790| 15.000|   1.898|   0.000|s(cluster) |q1.inv   |
|s(PC2)...26     |  1.002|  1.004|   4.424|   0.037|s(PC2)     |q2.inv   |
|s(cluster)...27 | 10.207| 15.000|   2.152|   0.000|s(cluster) |q2.inv   |
####✅  Interpetation: 
#Annual grasses are strongly influenced by PC1 and cluster, suggesting both soil properties and spatial heterogeneity shape their distribution.
#Perennial grasses are primarily affected by cluster, with less pronounced soil effects.
#Forbs show strong spatial variability across clusters.
#Invasive species are significantly influenced by PC2, indicating that soil properties drive their establishment, but they also exhibit spatial variability.
#Woody plants are most influenced by PC1, reflecting the role of soil health in determining their presence and diversity.

# Step 6: Plot GAMM Results
plot_list <- lapply(seq_along(variables), function(i) {
  model <- models_gamm[[i]]

  # Use median PC2 for PC1 predictions and vice versa
  median_PC2 <- median(diversity_pca$PC2, na.rm = TRUE)
  median_PC1 <- median(diversity_pca$PC1, na.rm = TRUE)
  most_common_cluster <- diversity_pca$cluster[which.max(table(diversity_pca$cluster))]

  # Generate newdata with fixed cluster
  df1 <- data.frame(PC1 = seq(min(diversity_pca$PC1, na.rm = TRUE), 
                    max(diversity_pca$PC1, na.rm = TRUE), length.out = 100),
                    PC2 = median_PC2, cluster = most_common_cluster)

  df2 <- data.frame(PC2 = seq(min(diversity_pca$PC2, na.rm = TRUE), 
                    max(diversity_pca$PC2, na.rm = TRUE), length.out = 100),
                    PC1 = median_PC1, cluster = most_common_cluster)

  # Predict effects and standard errors
  pred1 <- predict(model, newdata = df1, type = "response", se.fit = TRUE)
  df1$Effect <- pred1$fit
  df1$SE <- pred1$se.fit

  pred2 <- predict(model, newdata = df2, type = "response", se.fit = TRUE)
  df2$Effect <- pred2$fit
  df2$SE <- pred2$se.fit

  # Plot smooth effect of PC1 (Blue)
  plot1 <- ggplot(df1, aes(x = PC1, y = Effect)) +
    geom_line(color = "#1f77b4") +  # Blue for PC1
    geom_ribbon(aes(ymin = Effect - SE, ymax = Effect + SE), fill = "#1f77b4", alpha = 0.3) +
    ggtitle(paste("", variables[i], "")) +
    theme_minimal()

  # Plot smooth effect of PC2 (Green)
  plot2 <- ggplot(df2, aes(x = PC2, y = Effect)) +
    geom_line(color = "#2ca02c") +  # Green for PC2
    geom_ribbon(aes(ymin = Effect - SE, ymax = Effect + SE), fill = "#2ca02c", alpha = 0.3) +
    ggtitle(paste("", variables[i], "")) +
    theme_minimal()

  return(plot1 + plot2)
})

# Display all plots
wrap_plots(plot_list)

#######✅  Interpetation: 
#PC1 (indicative of healthier soils) favours woody plants and perennial grasses while negatively affecting annual grasses, forbs, and invasive species.
#PC2  strongly influences invasive species, with lower PC2 values promoting their abundance.
#Invasive species and annual grasses thrive in more degraded conditions, while woody plants and perennial grasses benefit from healthier soils.

q vs pc weighted pc

We computed weighted principal component (PC) scores using the explained variance of the first two principal components (PC1 and PC2). These weighted scores provide a composite representation of soil health gradients. Generalized Additive Mixed Models (GAMMs) were applied where (PC) represents a smooth function of soil properties, and (Cluster, bs=‘re’) accounts for random variations among clusters.

# Compute weighted PC score
explained_variance <- pc.results.soil1$eig[1:2]
weights <- explained_variance / sum(explained_variance)
soil_pca_scores <- as.data.frame(pc.results.soil1$ind$coord)[, 1:2]
colnames(soil_pca_scores) <- c("PC1", "PC2")
soil_pca_scores <- soil_pca_scores %>% 
  mutate(PC = PC1 * weights[1] + PC2 * weights[2])

# Convert Cluster and Plot to factors 
soil_pca_scores <- soil_pca_scores %>%
  mutate(cluster = factor(soil_data$cluster), plot = factor(soil_data$plot))
soil_data <- soil_data %>%
  mutate(cluster = factor(cluster), plot = factor(plot))

# Merge datasets
diversity_pca <- diversity_wide %>%
  left_join(soil_pca_scores, by = c("cluster", "plot")) %>%
  left_join(soil_data, by = c("cluster", "plot"))

# Rename columns for clarity
diversity_pca <- diversity_pca %>%
  rename(
    q0.ag = `q0_Annual Grasses`, q1.ag = `q1_Annual Grasses`, q2.ag = `q2_Annual Grasses`,
    q0.pg = `q0_Perennial Grasses`, q1.pg = `q1_Perennial Grasses`, q2.pg = `q2_Perennial Grasses`,
    q0.wp = `q0_Woody Plants`, q1.wp = `q1_Woody Plants`, q2.wp = `q2_Woody Plants`,
    q0.f  = `q0_Forbs`, q1.f  = `q1_Forbs`, q2.f  = `q2_Forbs`,
    q0.inv = `q0_Invasives`, q1.inv = `q1_Invasives`, q2.inv = `q2_Invasives`
  ) %>%
  mutate(
    across(where(is.list), ~ map_dbl(.x, ~ ifelse(length(.x) == 0, NA, as.numeric(.x)))),  # Convert lists & handle empty elements
    across(where(is.numeric), ~ replace_na(.x, 0))  # Replace NA with 0 only for numeric columns
  )

# Define response variables
variables <- c("q0.ag", "q1.ag", "q2.ag", "q0.pg", "q1.pg", "q2.pg", 
               "q0.wp", "q1.wp", "q2.wp", "q0.f", "q1.f", "q2.f", 
               "q0.inv", "q1.inv", "q2.inv")

# Run GAMM for each variable
models_gamm <- lapply(variables, function(var) {
  gam(as.formula(paste(var, "~ s(PC) + s(cluster, bs='re')")), 
      data = diversity_pca, method = "REML")
})

# Summarise GAMM results
summaries_gamm <- lapply(models_gamm, summary)

# Extract significant GAMM results
extract_gamm_results <- function(summary_gamm, var_name) {
  if (!is.null(summary_gamm$s.table)) {
    summary_gamm$s.table %>%
      as.data.frame() %>%
      rename(EDF = edf, F_Value = F, P_Value = `p-value`) %>%
      mutate(
        Term = rownames(.),
        Variable = var_name,
        Significance = case_when(
          P_Value < 0.001 ~ "***",
          P_Value < 0.01 ~ "**",
          P_Value < 0.05 ~ "*",
          TRUE ~ ""
        )
      )
  } else {
    data.frame()  # Return empty dataframe if no s.table
  }
}

# Combine results for all variables
gamm_results <- bind_rows(lapply(seq_along(summaries_gamm), function(i) {
  extract_gamm_results(summaries_gamm[[i]], variables[i])
}))


#  Display GAMM results in a simple kable table
if (nrow(significant_results_gamm) > 0) {
  gamm_table <- kable(significant_results_gamm, digits = 3, caption = "Significant Smooth Terms in GAMMs")
  print(gamm_table)
} else {
  message("No significant smooth terms found in GAMMs.")
}


Table: Significant Smooth Terms in GAMMs

|                |    EDF| Ref.df| F_Value| P_Value|Term       |Variable |
|:---------------|------:|------:|-------:|-------:|:----------|:--------|
|s(PC1)...1      |  1.000|  1.001|   7.247|   0.008|s(PC1)     |q0.ag    |
|s(cluster)...2  | 12.693| 15.000|   6.025|   0.000|s(cluster) |q0.ag    |
|s(PC1)...3      |  1.000|  1.000|   7.003|   0.009|s(PC1)     |q1.ag    |
|s(cluster)...4  | 11.915| 15.000|   4.166|   0.000|s(cluster) |q1.ag    |
|s(PC1)...5      |  1.000|  1.000|   6.757|   0.010|s(PC1)     |q2.ag    |
|s(PC2)...6      |  1.000|  1.001|   4.659|   0.033|s(PC2)     |q2.ag    |
|s(cluster)...7  | 11.229| 15.000|   3.183|   0.000|s(cluster) |q2.ag    |
|s(cluster)...8  |  9.766| 15.000|   1.670|   0.002|s(cluster) |q0.pg    |
|s(cluster)...9  |  6.519| 15.000|   0.767|   0.045|s(cluster) |q2.pg    |
|s(PC1)...10     |  1.000|  1.000|  18.060|   0.000|s(PC1)     |q0.wp    |
|s(PC2)...11     |  1.001|  1.001|  12.664|   0.001|s(PC2)     |q0.wp    |
|s(cluster)...12 |  7.104| 15.000|   0.866|   0.036|s(cluster) |q0.wp    |
|s(PC1)...13     |  1.000|  1.000|  20.176|   0.000|s(PC1)     |q1.wp    |
|s(PC2)...14     |  1.000|  1.000|  13.489|   0.000|s(PC2)     |q1.wp    |
|s(cluster)...15 |  6.988| 15.000|   0.838|   0.041|s(cluster) |q1.wp    |
|s(PC1)...16     |  1.000|  1.000|  20.376|   0.000|s(PC1)     |q2.wp    |
|s(PC2)...17     |  1.000|  1.001|  13.548|   0.000|s(PC2)     |q2.wp    |
|s(cluster)...18 |  7.173| 15.000|   0.884|   0.034|s(cluster) |q2.wp    |
|s(cluster)...19 | 11.040| 15.000|   2.919|   0.000|s(cluster) |q0.f     |
|s(cluster)...20 | 10.491| 15.000|   2.405|   0.000|s(cluster) |q1.f     |
|s(cluster)...21 |  9.752| 15.000|   1.866|   0.001|s(cluster) |q2.f     |
|s(PC2)...22     |  1.619|  2.013|   4.004|   0.021|s(PC2)     |q0.inv   |
|s(cluster)...23 |  8.681| 15.000|   1.382|   0.004|s(cluster) |q0.inv   |
|s(PC2)...24     |  1.210|  1.386|   3.643|   0.035|s(PC2)     |q1.inv   |
|s(cluster)...25 |  9.790| 15.000|   1.898|   0.000|s(cluster) |q1.inv   |
|s(PC2)...26     |  1.002|  1.004|   4.424|   0.037|s(PC2)     |q2.inv   |
|s(cluster)...27 | 10.207| 15.000|   2.152|   0.000|s(cluster) |q2.inv   |
## ##✅  Interpetation: 
#GAMM Results Interpretation (Smooth Terms Table)
#The table presents results from a Generalized Additive Mixed Model (GAMM) assessing how soil properties (PC) and spatial clusters influence plant diversity indices (q0, q1, q2) across functional groups.
#Annual Grasses (q0.ag, q1.ag, q2.ag)
# PC significantly affects q0.ag, q1.ag, and q2.ag (P < 0.01), showing that annual grass diversity declines with increasing soil health (PC values as also seen below).
# Spatial clusters significantly influence all annual grass indices, meaning diversity varies across different locations.
#Perennial Grasses (q0.pg, q1.pg, q2.pg)
# PC has no significant effect on q0.pg, q1.pg, and q2.pg (P > 0.05), indicating that perennial grass diversity does not strongly correlate with soil PC values.
#Clusters significantly affect q0.pg (P = 0.002), suggesting location-dependent variation. #Interpretation: Perennial grasses might be more stable across soil conditions and not solely dependent on soil restoration levels.
#Woody Plants (q0.wp, q1.wp)
# PC significantly affects q0.wp and q1.wp (P < 0.01), meaning woody plant diversity increases with soil health as also seen below.
# Clusters significantly affect q0.wp and q1.wp, implying spatial variation.
#Interpretation: Woody plants are strong indicators of restored ecosystems and increase as soil conditions improve.
#Forbs 
#(PC)) do not significantly affect forb diversity (all p > 0.86).
#Spatial variation (s(cluster)) is significant (p < 0.001), meaning forb diversity is mostly driven by differences between clusters rather than soil health.
#Invasive 
#Soil properties (s(PC)) do not significantly affect invasive species diversity (all p > 0.8).
#Spatial variation (s(cluster)) is significant (p < 0.001), suggesting that invasive species are more influenced by landscape heterogeneity than soil conditions.
# Generate GAMM effect plots
plot_list <- lapply(seq_along(variables), function(i) {
  model <- models_gamm[[i]]
  
  median_PC <- median(diversity_pca$PC, na.rm = TRUE)
  most_common_cluster <- diversity_pca$cluster[which.max(table(diversity_pca$cluster))]
  
  df <- data.frame(
    PC = seq(min(diversity_pca$PC, na.rm = TRUE), 
             max(diversity_pca$PC, na.rm = TRUE), length.out = 100),
    cluster = most_common_cluster
  )
  
  pred <- predict(model, newdata = df, type = "response", se.fit = TRUE)
  df$Effect <- pred$fit
  df$SE <- pred$se.fit
  
  ggplot(df, aes(x = PC, y = Effect)) +
    geom_line(color = "blue") +
    geom_ribbon(aes(ymin = Effect - SE, ymax = Effect + SE), alpha = 0.2) +
    ggtitle(paste("", variables[i])) +
    theme_minimal()
})

# Display all plots in a grid
wrap_plots(plot_list, ncol = 3)

#######✅  Interpetation: 
##The provided plot displays the effect of soil properties (PC scores) on different plant diversity indices (q0, q1, q2) across plant functional groups using Generalized Additive Mixed Models (GAMM).

#The effect of soil properties (PC) on annual grasses is mostly negative, with a decline in diversity indices (q0, q1, q2) as PC increases.

#Perennial Grasses (q0.pg, q1.pg, q2.pg); trend is positive, showing that all three diversity indices increase with higher PC values

#Woody Plants (q0.wp, q1.wp, q2.wp) increase with increase in PC values

#Forbs (q0.f, q1.f, q2.f) showed a negative effect  where higher PC values correspond to lower diversity indices.

#Invasive Species (q0.inv, q1.inv, q2.inv) showed a decline in diversity indices with increasing soil health (PC values).

a and b computed from q1

We calculated Alpha diversity for each species type within clusters using the shannon diversity index (q1) values. We calculated Gamma diversity as the mean of the Shannon diversity index (q1) across all clusters for each species type (functional group). We computed Beta diversity as the ratio of gamma diversity to alpha diversity for each species type within clusters.

# Calculate Alpha Diversity (Mean and SD within Clusters)
alpha_diversity_q1 <- diversity.all %>%
  group_by(cluster, Species_Type) %>%
  summarise(
    Mean_Alpha = mean(q1, na.rm = TRUE),
    SD_Alpha = ifelse(n() > 1, sd(q1, na.rm = TRUE), NA),  # Compute SD only if more than one value
    .groups = "drop"
  )
# Calculate Alpha Diversity (Mean and SD within Clusters)
alpha_diversity_q1 <- diversity.all %>%
  group_by(cluster, Species_Type) %>%
  summarise(
    Mean_Alpha = mean(q1, na.rm = TRUE),
    SD_Alpha = ifelse(n() > 1, sd(q1, na.rm = TRUE), NA),  # Compute SD only if more than one value
    .groups = "drop"
  )

# Get unique species types
species_types <- unique(alpha_diversity_q1$Species_Type)

# Function to plot alpha diversity for a single species type
plot_alpha_diversity <- function(data, species_type) {
  # Filter data for the species type
  species_data <- data %>%
    filter(Species_Type == species_type) %>%
    mutate(cluster = factor(cluster, levels = cluster[order(-Mean_Alpha)]))  # Reorder clusters by Mean_Alpha
  
  # Plot
  ggplot(species_data, aes(x = cluster, y = Mean_Alpha)) +
    geom_point(color = "blue", size = 2) +
    geom_errorbar(aes(ymin = Mean_Alpha - SD_Alpha, ymax = Mean_Alpha + SD_Alpha), 
                  width = 0.2, color = "red") +
    labs(title = paste("α", species_type),
         x = "Cluster", y = "α") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Generate plots for each species type
alpha_plots <- map(species_types, ~ plot_alpha_diversity(alpha_diversity_q1, .x))

# Combine plots into a grid
alpha_grid <- wrap_plots(alpha_plots, ncol = 2)

# Display the grid
print(alpha_grid)

# Calculate Gamma Diversity (Mean and SD for the Entire Study Area)
gamma_diversity_q1 <- diversity.all %>%
  group_by(Species_Type) %>%
  summarise(
    Mean_Gamma = mean(q1, na.rm = TRUE),  # Mean diversity across all clusters
    SD_Gamma = ifelse(n() > 1, sd(q1, na.rm = TRUE), NA),  # Compute SD only if multiple observations
    .groups = "drop"
  )

# Gamma Diversity Bar Plot
gamma_plot <- gamma_diversity_q1 %>%
  filter(!is.na(Mean_Gamma)) %>%  # Ensure no NA values
  ggplot(aes(x = reorder(Species_Type, -Mean_Gamma), y = Mean_Gamma, fill = Species_Type)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = Mean_Gamma - SD_Gamma, ymax = Mean_Gamma + SD_Gamma), 
                width = 0.2, color = "black") +
  labs(title = " γ Across Species Types",
       x = "Species Type", y = " (γ)") +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(gamma_plot)

# Compute Beta Diversity (Gamma/Alpha for q1) with SD
beta_diversity_q1 <- alpha_diversity_q1 %>%
  left_join(gamma_diversity_q1, by = "Species_Type") %>%
  mutate(
    Beta_q1 = Mean_Gamma / Mean_Alpha,  # Turnover among clusters
    SD_Beta = Beta_q1 * sqrt((SD_Gamma / Mean_Gamma)^2 + (SD_Alpha / Mean_Alpha)^2)  # Propagation of SD
  ) %>%
  select(Species_Type, cluster, Beta_q1, SD_Beta)

# Get unique species types
species_types <- unique(beta_diversity_q1$Species_Type)

# Function to plot beta diversity for a single species type
plot_beta_diversity <- function(data, species_type) {
  # Filter data for the species type
  species_data <- data %>%
    filter(Species_Type == species_type) %>%
    mutate(cluster = factor(cluster, levels = cluster[order(-Beta_q1)]))  # Reorder clusters by Beta_q1
  
  # Plot
  ggplot(species_data, aes(x = cluster, y = Beta_q1)) +
    geom_point(color = "blue", size = 3) +
    geom_errorbar(aes(ymin = Beta_q1 - SD_Beta, ymax = Beta_q1 + SD_Beta), 
                  width = 0.2, color = "red") +
    labs(title = paste(" β ", species_type),
         x = "Cluster", y = "β") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

   
# Generate plots for each species type
beta_plots <- map(species_types, ~ plot_beta_diversity(beta_diversity_q1, .x))

# Combine plots into a grid
beta_grid <- wrap_plots(beta_plots, ncol = 2)

# Display the grid
print(beta_grid)

a and b vs pc

We used the Generalized Additive Models (GAM) to analyse the relationship between diversity metrics (α and β diversity) and soil properties (Mean_PC1 and Mean_PC2).

# Compute Mean PC1 & PC2 per cluster
soil_pca_cluster_means <- soil_pca_scores %>%
  group_by(cluster) %>%
  summarise(
    Mean_PC1 = mean(PC1, na.rm = TRUE), 
    Mean_PC2 = mean(PC2, na.rm = TRUE), 
    .groups = "drop"
  ) %>%
  mutate(cluster = as.character(cluster))

# Ensure cluster is a character variable across datasets
alpha_diversity_q1 <- alpha_diversity_q1 %>% mutate(cluster = as.character(cluster))
soil_pca_scores <- soil_pca_scores %>% mutate(cluster = as.character(cluster))

data_prepared <- alpha_diversity_q1 %>%
  pivot_wider(names_from = Species_Type, values_from = c(Mean_Alpha, SD_Alpha), names_glue = "{.value}_{Species_Type}")

# Compute Beta Diversity
beta_diversity_wide <- alpha_diversity_q1 %>%
  left_join(gamma_diversity_q1, by = "Species_Type") %>%
  mutate(
    Beta_q1 = Mean_Gamma / Mean_Alpha, 
    SD_Beta = Beta_q1 * sqrt((SD_Gamma / Mean_Gamma)^2 + (SD_Alpha / Mean_Alpha)^2)
  ) %>%
  select(Species_Type, cluster, Beta_q1, SD_Beta) %>%
  pivot_wider(names_from = Species_Type, values_from = c(Beta_q1, SD_Beta), names_glue = "{.value}_{Species_Type}") %>%
  mutate(cluster = as.character(cluster))

# Merge All Data
diversity_pc_data <- beta_diversity_wide %>%
  left_join(data_prepared, by = "cluster") %>%
  left_join(soil_pca_cluster_means, by = "cluster") %>%
  rename(
    β.ag = `Beta_q1_Annual Grasses`, β.f = `Beta_q1_Forbs`, β.inv = `Beta_q1_Invasives`,
    β.pg = `Beta_q1_Perennial Grasses`, β.wp = `Beta_q1_Woody Plants`,
    α.ag = `Mean_Alpha_Annual Grasses`, α.f = `Mean_Alpha_Forbs`, α.inv = `Mean_Alpha_Invasives`,
    α.pg = `Mean_Alpha_Perennial Grasses`, α.wp = `Mean_Alpha_Woody Plants`
  )

# Define response variables
response_vars <- c("\u03b1.ag", "\u03b1.f", "\u03b1.inv", "\u03b1.pg", "\u03b1.wp", 
                   "\u03b2.ag", "\u03b2.f", "\u03b2.inv", "\u03b2.pg", "\u03b2.wp")

# Fit GAMs and extract significant results
gamm_results <- map_dfr(response_vars, function(var) {
  formula <- as.formula(paste(var, "~ s(Mean_PC1) + s(Mean_PC2)"))
  gam_model <- gam(formula, data = diversity_pc_data, family = gaussian())
  gam_summary <- summary(gam_model)
  
  if (!is.null(gam_summary$s.table)) {
    smooth_terms <- gam_summary$s.table
    significant_terms <- smooth_terms[smooth_terms[, "p-value"] < 0.05, , drop = FALSE]
    
    if (nrow(significant_terms) > 0) {
      return(
        data.frame(
          EDF = significant_terms[, "edf"],
          Ref.df = significant_terms[, "Ref.df"],
          F_Value = significant_terms[, "F"],
          P_Value = significant_terms[, "p-value"],
          Term = rownames(significant_terms),
          Variable = var,
          Significance = case_when(
            significant_terms[, "p-value"] < 0.001 ~ "***",
            significant_terms[, "p-value"] < 0.01 ~ "**",
            significant_terms[, "p-value"] < 0.05 ~ "*",
            TRUE ~ ""
          )
        )
      )
    }
  }
  return(NULL)
})

# Display significant GAM results
if (nrow(gamm_results) > 0) {
  gamm_results %>%
    arrange(P_Value) %>%
    kable(caption = "Significant GAM Results") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
} else {
  message("No significant GAM terms were found.")
}
Significant GAM Results
EDF Ref.df F_Value P_Value Term Variable Significance
...1 1.240166 1.441357 24.291099 0.0001733 s(Mean_PC2) α.wp ***
...2 1.000000 1.000000 26.546525 0.0001900 s(Mean_PC2) β.wp ***
s(Mean_PC2)...3 1.000000 1.000000 15.357047 0.0035144 s(Mean_PC2) β.inv **
s(Mean_PC2)...4 1.000000 1.000000 14.157103 0.0044644 s(Mean_PC2) α.inv **
s(Mean_PC1)...5 4.947978 5.775896 4.785560 0.0173426 s(Mean_PC1) α.inv *
s(Mean_PC1)...6 4.723936 5.550964 3.846343 0.0337508 s(Mean_PC1) β.inv *
...7 5.918966 6.947164 3.681767 0.0451666 s(Mean_PC2) α.f *
...8 5.947397 6.951058 3.562662 0.0482488 s(Mean_PC2) β.f *
#######✅  Interpetation: 
#Significant results only: The table presents the results of a Generalized Additive Model (GAM), showing the effects of Mean_PC1 and Mean_PC2 (principal components representing soil properties) on different response variables (α and β diversity of woody plants (wp), invasive species (inv), and forbs (f)).
#Soil properties (PC2) strongly impact woody plants and invasive species, alpha and beta diversity. The strongest effects are seen in woody plant diversity (both α and β), suggesting that tree species distribution is closely linked to soil conditions.
#PC2 influences the alpha and beta diversities of forbs
# Both PC1 and PC2 (soil properties) contribute to shaping invasive species diversity, with stronger effects on β diversity, indicating that soil properties drive spatial variation in invasive species.

# Function to create scatterplots with GAM fits
plot_gam_effects <- function(response_var, data) {
  ggplot(data, aes_string(x = "Mean_PC1", y = response_var)) +
    geom_point(color = "blue", alpha = 0.6) +
    geom_smooth(method = "gam", formula = y ~ s(x), color = "red", se = TRUE) +
    labs(title = paste(response_var, ""), x = "Mean_PC1", y = response_var) +
    theme_minimal()
}

# Generate plots for all response variables
gam_plots <- lapply(response_vars, function(var) plot_gam_effects(var, diversity_pc_data))

# Combine plots in a grid
wrap_plots(gam_plots, ncol = 5)

# Define response variables
response_vars <- c("\u03b1.ag", "\u03b1.f", "\u03b1.inv", "\u03b1.pg", "\u03b1.wp", 
                   "\u03b2.ag", "\u03b2.f", "\u03b2.inv", "\u03b2.pg", "\u03b2.wp")

# Function to create scatterplots with GAM fits using PC2
plot_gam_effects_PC2 <- function(response_var, data) {
  ggplot(data, aes_string(x = "Mean_PC2", y = response_var)) +
    geom_point(color = "blue", alpha = 0.6) +
    geom_smooth(method = "gam", formula = y ~ s(x), color = "red", se = TRUE) +
    labs(title = paste(response_var, ""), x = "Mean_PC2", y = response_var) +
    theme_minimal()
}

# Generate plots for all response variables
gam_plots_PC2 <- lapply(response_vars, function(var) plot_gam_effects_PC2(var, diversity_pc_data))

# Combine plots in a grid
wrap_plots(gam_plots_PC2, ncol = 5)

#######✅  Interpetation: 
#α.wp (Woody Plants): Strong negative correlation, meaning woody plant diversity decreases as soil properties improve. β.wp (Woody Plants): Decreasing trend, indicating that sites become more similar in woody plant composition as PC2 increases.
#α.f (Forbs): A U-shaped curve, indicating that forb diversity is lowest at intermediate PC2 values and highest at both low and high PC2 values.β.f (Forbs): A hump-shaped pattern, suggesting higher beta diversity at intermediate PC2 values.

#α.ag vs PC1: Weak negative relationship (α diversity of annual grasses decreases slightly as PC1 increases) β.ag vs PC1: Slight positive trend, suggesting more variation in annual grass communities as PC1 increases.
#α.ag (Annual Grasses): Slight increasing trend, suggesting that as soil properties improve (higher PC2), annual grass diversity may increase.β.ag (Annual Grasses): Slight decreasing trend, indicating reduced dissimilarity among sites as PC2 increases.
#α.f vs PC1: Negative trend, suggesting forbs decrease with increasing PC1. β.f vs PC1: Weak positive trend, indicating that forb community composition shifts slightly as PC1 increases.
#α.inv vs PC1: Non-linear trend, showing a slight dip at negative PC1 and stabilising around 0. β.inv vs PC1: Very slight increasing trend, meaning invasive species turnover might increase with PC1.
#α.inv (Invasive Species): Slightly decreasing trend, suggesting invasive species diversity decreases as PC2 increases.β.inv (Invasive Species): Increasing trend, meaning that differences in invasive species composition increase as PC2 rises.
#α.pg vs PC1: Weak U-shaped trend, meaning perennial grass diversity might be lowest at intermediate PC1. β.pg vs PC1: Non-linear U-shaped pattern, suggesting turnover in perennial grasses is highest at extreme PC1 values.
#α.pg (Perennial Grasses): No strong trend, suggesting little relationship with PC2. β.pg (Perennial Grasses): No strong pattern.
#α.wp vs PC1: Slight increasing trend, indicating that woody plant richness may increase with higher PC1. β.wp vs PC1: Strong U-shaped trend, showing woody plant communities are more variable at extreme PC1 values.