Mean Nitrate Prediction_Updated

Explore Data

file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"

model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))

model_data_long <- model_data %>%
  ungroup() %>%
  select(where(is.numeric)) %>%
 select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")


ggplot(model_data_long, aes(y = value)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(title = "Boxplots of All Numeric Variables",
       y = "Value", x = "") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

Add counties information to the dataset

counties <- counties(state = "KS", year = 2022, class = "sf")

wells_sf <- model_data %>%
  st_as_sf(coords = c("Lon.x", "Lat.x"), crs = 4326)
counties_transformed <- st_transform(counties, crs = 4326)

wells_with_county <- st_join(wells_sf, counties_transformed["NAME"])
model_data <- wells_with_county %>%
  st_drop_geometry() %>%
  rename(county = NAME)

wells_bbox <- st_bbox(wells_sf)

library(viridis)  
counties_with_wells <- st_filter(counties_transformed, wells_sf)

ggplot() +
  geom_sf(data = counties_with_wells, fill = "white", color = "black", size = 0.3) +
  geom_sf(data = wells_sf, aes(color = mean_nitrate), size = 2) +
  geom_sf_text(data = counties_with_wells, aes(label = NAME), size = 3, check_overlap = TRUE, color = "gray30") +
  coord_sf(xlim = c(wells_bbox$xmin, wells_bbox$xmax),
           ylim = c(wells_bbox$ymin, wells_bbox$ymax),
           expand = TRUE) +
  scale_color_viridis_c(option = "C", name = "Mean Nitrate") +
  labs(title = "Wells and Counties Containing Them") +
  theme_minimal()

Scale the data: standardization (scale()) to center all predictors around 0 with a standard deviation of 1.((x- mean)/sd)

numeric_predictors <- model_data %>%
  ungroup() %>% 
  select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
         Mean_WellDepth, avg_sand, avg_silt,avg_clay,
         wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond)
scaled_predictors <- scale(numeric_predictors)

model_data_scaled <- model_data %>%
  select(Well_ID, cluster, mean_nitrate, sd_nitrate,county) %>%  
  bind_cols(as_tibble(scaled_predictors))
model_data_scaled$county <- as.factor(model_data_scaled$county)


county_counts <- model_data_scaled %>%
  count(county)
county_counts

# A tibble: 5 × 2
  county        n
  <fct>     <int>
1 Harvey      109
2 McPherson    16
3 Reno        177
4 Rice          2
5 Sedgwick     58

boxplot(mean_nitrate ~ county, data = model_data_scaled,
        las = 2,                     
        col = "lightblue",            
        main = "Mean Nitrate by County",
        xlab = "County",
        ylab = "Mean Nitrate")

text(x = 1:nrow(county_counts),
     y = par("usr")[3] +20,  
     labels = county_counts$n,
     xpd = TRUE,               
     srt = 90,                 
     adj = 1,
     cex = 0.8,               
     col = "darkblue")

Best models so far

Best Models So Far (Sorted by R²)
Model	R²	RMSE
Random forest model	0.580	2.671
Gamma-distributed generalized linear model	0.503	2.880
Log-transformed linear mixed model	0.463	3.257
Linear mixed model using county	0.403	3.156
Generalized additive model (GAM)	0.378	3.278
Multiple linear regression	0.371	3.246

Updated Friday, 9th May

using counties as random effect

1. Linear mixed-effects model excluding cluster, using Counties as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.

library(lme4)
library(ggplot2)
library(Metrics)

model_data_scaled_1 <- model_data_scaled

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE  + Developed_Change +
                   Mean_WellDepth  + avg_sand + avg_silt +avg_clay +
                   wizard_sites_SD_data + Agri_Change+mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
                   (1 | county),  
                 data = train_data)
summary(lmm_mean)

Linear mixed model fit by REML ['lmerMod']
Formula: mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +  
    Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +  
    Agri_Change + mean_Chloride + mean_SpecCond + sd_Chloride +  
    sd_SpecCond + (1 | county)
   Data: train_data

REML criterion at convergence: 1543.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.1852 -0.5801 -0.1803  0.3240  4.3556 

Random effects:
 Groups   Name        Variance Std.Dev.
 county   (Intercept)  6.451   2.540   
 Residual             12.138   3.484   
Number of obs: 289, groups:  county, 5

Fixed effects:
                                Estimate Std. Error t value
(Intercept)                      2.32343    1.22519   1.896
wizard_sites_last_year_data_WTE  1.66929    0.39437   4.233
Developed_Change                -0.03578    0.28207  -0.127
Mean_WellDepth                  -0.29672    0.19747  -1.503
avg_sand                        -1.79085    1.29516  -1.383
avg_silt                        -1.55512    1.32527  -1.173
wizard_sites_SD_data             0.05082    0.30638   0.166
Agri_Change                     -0.14324    0.26996  -0.531
mean_Chloride                    3.63893    1.89308   1.922
mean_SpecCond                   -5.28321    1.88337  -2.805
sd_Chloride                     -0.53474    0.34182  -1.564
sd_SpecCond                      0.94366    0.37600   2.510

Correlation of Fixed Effects:
            (Intr) w_____ Dvlp_C Mn_WlD avg_sn avg_sl w__SD_ Agr_Ch mn_Chl
wzr_____WTE -0.019                                                        
Devlpd_Chng  0.000  0.129                                                 
Men_WllDpth  0.002  0.103 -0.019                                          
avg_sand    -0.048  0.032 -0.016 -0.106                                   
avg_silt    -0.055  0.068 -0.018 -0.114  0.982                            
wzrd_st_SD_ -0.005 -0.139  0.064 -0.173  0.064  0.026                     
Agri_Change  0.018  0.031  0.605 -0.054 -0.037 -0.030  0.044              
mean_Chlord -0.048  0.088 -0.017  0.056  0.123  0.137 -0.104  0.002       
mean_SpcCnd  0.048 -0.136  0.005 -0.077 -0.115 -0.134  0.123 -0.016 -0.986
sd_Chloride -0.010  0.223  0.040 -0.040  0.105  0.111  0.067  0.021  0.042
sd_SpecCond  0.021 -0.013  0.041  0.065 -0.120 -0.126 -0.026  0.035 -0.211
            mn_SpC sd_Chl
wzr_____WTE              
Devlpd_Chng              
Men_WllDpth              
avg_sand                 
avg_silt                 
wzrd_st_SD_              
Agri_Change              
mean_Chlord              
mean_SpcCnd              
sd_Chloride -0.083       
sd_SpecCond  0.134 -0.505
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

# The model predicts mean nitrate levels using well, soil, and land use features, while accounting for differences across counties.
# 
# The random effect for county shows some variation across locations (county variance = 6.45), but most variability is still within counties (residual variance = 12.14). #about one-third of the variation in mean nitrate levels is associated with differences across counties, even after accounting for the predictors in the model.
# 
# wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are the most influential predictors.
# 
# Some predictors like Developed_Change, Agri_Change, and wizard_sites_SD_data don’t seem to have strong effects (low t-values).





test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)

r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
  labs(
    title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  theme_minimal(base_size = 20) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
           hjust = 0, vjust = 1.2, size = 5, color = "black") +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

2. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance. Counties as a random effect

library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No wizard_sites_last_year_data_WTE", 
    "No Mean Well Depth",
    "No wizard_sites_SD_data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Well Depth and No WTE"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+Agri_Change+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | county),
    mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),  # No sand + silt
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | county),  # No chloride + speccond
    mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county)  # No WTE + depth
  )
)
model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lmer(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +  
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none") 

p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

3. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability. county as a random effect

model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
                  Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
                  wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + Agri_Change+mean_SpecCond +
                  (1 | county),
                data = train_data)
summary(lmm_log)

Linear mixed model fit by REML ['lmerMod']
Formula: log_mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +  
    avg_sand + avg_clay + avg_silt + Developed_Change + wizard_sites_SD_data +  
    mean_Chloride + sd_Chloride + sd_SpecCond + Agri_Change +  
    mean_SpecCond + (1 | county)
   Data: train_data

REML criterion at convergence: 670

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.6128 -0.6097 -0.1445  0.7098  2.4641 

Random effects:
 Groups   Name        Variance Std.Dev.
 county   (Intercept) 0.3397   0.5828  
 Residual             0.5146   0.7173  
Number of obs: 289, groups:  county, 5

Fixed effects:
                                 Estimate Std. Error t value
(Intercept)                      0.821242   0.278257   2.951
wizard_sites_last_year_data_WTE  0.354311   0.082096   4.316
Mean_WellDepth                  -0.088511   0.040659  -2.177
avg_sand                         0.008175   0.155050   0.053
avg_clay                         0.085740   0.146047   0.587
Developed_Change                -0.019301   0.058084  -0.332
wizard_sites_SD_data            -0.003729   0.063207  -0.059
mean_Chloride                    1.191097   0.390191   3.053
sd_Chloride                     -0.043007   0.070427  -0.611
sd_SpecCond                      0.152779   0.077422   1.973
Agri_Change                     -0.047574   0.055586  -0.856
mean_SpecCond                   -1.539662   0.388288  -3.965

Correlation of Fixed Effects:
            (Intr) w_____ Mn_WlD avg_sn avg_cl Dvlp_C w__SD_ mn_Chl sd_Chl
wzr_____WTE -0.026                                                        
Men_WllDpth  0.001  0.103                                                 
avg_sand     0.058 -0.126  0.117                                          
avg_clay     0.052 -0.070  0.114  0.947                                   
Devlpd_Chng -0.001  0.129 -0.019  0.021  0.018                            
wzrd_st_SD_ -0.005 -0.138 -0.173  0.043 -0.025  0.063                     
mean_Chlord -0.047  0.092  0.056 -0.152 -0.138 -0.017 -0.103              
sd_Chloride -0.011  0.226 -0.040 -0.112 -0.111  0.040  0.067  0.043       
sd_SpecCond  0.020 -0.014  0.065  0.125  0.126  0.041 -0.026 -0.211 -0.505
Agri_Change  0.016  0.031 -0.054  0.016  0.030  0.605  0.044  0.002  0.021
mean_SpcCnd  0.047 -0.140 -0.077  0.157  0.136  0.005  0.122 -0.986 -0.084
            sd_SpC Agr_Ch
wzr_____WTE              
Men_WllDpth              
avg_sand                 
avg_clay                 
Devlpd_Chng              
wzrd_st_SD_              
mean_Chlord              
sd_Chloride              
sd_SpecCond              
Agri_Change  0.035       
mean_SpcCnd  0.134 -0.016
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

 # Model Goal: Predict log-transformed nitrate using well/site/soil data while adjusting for differences across counties.
 # 
 # 
 # Strong Predictors: wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are statistically important (big t-values).
 # 
 # 
 # Weak/No Effect: Some variables like Developed_Change, Agri_Change, and wizard_sites_SD_data don't seem to help much here.
 # 
 # 
 # Random Effect (county): There is some variation across counties (random effect SD = 0.58), but most variation is still within counties (residual SD = 0.71).



test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log) 

r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
  geom_point(alpha = 0.7, size = 2, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

4. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(county, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.

model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate  <- log1p(test_data$mean_nitrate)
train_data$county <- as.factor(train_data$county)
test_data$county <- factor(test_data$county, levels = levels(train_data$county))

gam_model <- gam(log_mean_nitrate ~ 
                   s(wizard_sites_last_year_data_WTE) +
                   s(Mean_WellDepth) +
                   s(avg_sand) +
                   s(avg_clay) +
                   s(avg_silt) +
                   s(wizard_sites_SD_data) +
                   s(mean_Chloride) +
                   s(mean_SpecCond) +
                   s(sd_Chloride) +
                   s(sd_SpecCond) +
                   s(Developed_Change) +
                   s(Agri_Change)+
                   s(county, bs = "re"),  
                 data = train_data,
                 method = "REML") # Restricted Maximum Likelihood # estimate the variance components (like the random effects and residual error) in mixed models 
summary(gam_model)


Family: gaussian 
Link function: identity 

Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) + 
    s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) + 
    s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) + 
    s(Developed_Change) + s(Agri_Change) + s(county, bs = "re")

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.9562     0.1917   4.989 1.11e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                                         edf    Ref.df     F  p-value    
s(wizard_sites_last_year_data_WTE) 4.3919896 5.4011909 3.521  0.00628 ** 
s(Mean_WellDepth)                  3.3939422 4.0230540 9.324 6.00e-07 ***
s(avg_sand)                        0.0002365 0.0004617 0.021  0.99755    
s(avg_clay)                        1.0001008 1.0001972 0.375  0.54107    
s(avg_silt)                        1.0001908 1.0003714 0.004  0.95382    
s(wizard_sites_SD_data)            2.6584084 3.3435020 1.839  0.13507    
s(mean_Chloride)                   3.3641869 4.1595066 3.051  0.01809 *  
s(mean_SpecCond)                   1.0000177 1.0000260 3.541  0.06099 .  
s(sd_Chloride)                     1.0000654 1.0001238 0.285  0.59380    
s(sd_SpecCond)                     1.0000810 1.0001574 1.864  0.17338    
s(Developed_Change)                1.0000774 1.0001511 3.659  0.05685 .  
s(Agri_Change)                     3.7619394 4.6885505 2.206  0.05817 .  
s(county)                          2.7023225 4.0000000 5.660 5.27e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Rank: 113/114
R-sq.(adj) =  0.493   Deviance explained = 53.9%
-REML = 310.63  Scale est. = 0.39299   n = 289

# The model predicts log-transformed nitrate using smooth curves (splines) for each predictor and accounts for differences across counties as a random effect.
# 
# The model explains about 53.9% of the deviance (similar to R²) and has an adjusted R² of 0.493, indicating a decent fit.
# 
# wizard_sites_last_year_data_WTE, Mean_WellDepth, mean_Chloride, and county were statistically significant predictors (p < 0.05).
# 
# Other predictors like avg_sand, sd_Chloride, and wizard_sites_SD_data didn’t show strong effects (p > 0.1).
# 
# The smooth term for county was significant, confirming that including it as a random effect helps account for regional variability.












test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)

gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
           label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
           hjust = 0, vjust = 1, size = 5) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

5. Multiple linear model

model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

mlr_model <- lm(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
                  Mean_WellDepth + avg_sand + avg_silt + avg_clay +
                  wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
                  sd_Chloride + sd_SpecCond+Agri_Change,
                data = train_data)
summary(mlr_model)


Call:
lm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + 
    Developed_Change + Mean_WellDepth + avg_sand + avg_silt + 
    avg_clay + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + 
    sd_Chloride + sd_SpecCond + Agri_Change, data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.9367 -2.1461 -0.8388  1.1720 15.9447 

Coefficients: (1 not defined because of singularities)
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                      2.94154    0.21476  13.697  < 2e-16 ***
wizard_sites_last_year_data_WTE  1.67481    0.24673   6.788  6.9e-11 ***
Developed_Change                 0.09089    0.29230   0.311 0.756083    
Mean_WellDepth                  -0.28547    0.20604  -1.386 0.167014    
avg_sand                        -1.76700    1.31633  -1.342 0.180575    
avg_silt                        -1.54386    1.32262  -1.167 0.244102    
avg_clay                              NA         NA      NA       NA    
wizard_sites_SD_data            -0.83285    0.24891  -3.346 0.000933 ***
mean_Chloride                    2.26540    1.91474   1.183 0.237768    
mean_SpecCond                   -3.81547    1.89326  -2.015 0.044840 *  
sd_Chloride                     -0.64473    0.34956  -1.844 0.066193 .  
sd_SpecCond                      0.95444    0.39236   2.433 0.015625 *  
Agri_Change                     -0.13986    0.28090  -0.498 0.618950    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.639 on 277 degrees of freedom
Multiple R-squared:  0.2737,    Adjusted R-squared:  0.2448 
F-statistic: 9.488 on 11 and 277 DF,  p-value: 1.696e-14

# 
# The model explains about 27% of the variance in mean nitrate values (R² = 0.274), with a residual error of around 3.64.
# 
# The variable wizard_sites_last_year_data_WTE has the strongest positive effect and is highly significant.
# 
# wizard_sites_SD_data and sd_SpecCond also show significant relationships; mean_SpecCond is weakly significant and negative.
# 
# The variable avg_clay was automatically removed due to perfect multicollinearity with other predictors.
# 
# Several predictors like Developed_Change, Mean_WellDepth, and Agri_Change were not statistically significant.
# 






test_data$mlr_predicted <- predict(mlr_model, newdata = test_data)

mlr_r2 <- round(cor(test_data$mean_nitrate, test_data$mlr_predicted)^2, 3)
mlr_rmse <- round(rmse(test_data$mean_nitrate, test_data$mlr_predicted), 3)



lim_range <- range(c(test_data$mean_nitrate, test_data$mlr_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = mlr_predicted)) +
  geom_point(alpha = 0.7, color = "darkgreen") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "MLR: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", mlr_r2, "\nRMSE = ", mlr_rmse),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

6. Comparison of Multiple linear model

model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No WTE",
    "No Mean Well Depth",
    "No SD Data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Depth and WTE"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond+Agri_Change,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_SpecCond + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + sd_Chloride,
    
    mean_nitrate ~ Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + mean_Chloride + mean_SpecCond +
      sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_clay + wizard_sites_SD_data + mean_Chloride +
      mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
      Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data,
    
    mean_nitrate ~ Developed_Change + avg_sand + avg_silt + avg_clay +
      wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
      sd_Chloride + sd_SpecCond
  )
)

model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lm(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "MLR Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +  
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")


p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "MLR Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

Previous work up until Thursday, 8th May

Note From Erin’s email

The point of the random effect is to account for variation that naturally arises from different groupings/hierarchies of the data. I know you initially tried using the cluster as the random effect, but the challenge there (I think) is that cluster really strongly defines the range of nitrate – so it swamps out the fixed effects and we get those weird concentration bands. I think that using agricultural change doesn’t work because 1) it’s a continuous variable and 2) the values are all basically the same for all the wells. If we were trying to predict individual measurements, then the random effect would be the site. But since we’re using the means, that doesn’t really apply. Part of me is wondering if you should just try a multiple linear regression (so no hierarchy) and see if the model does any better.

1. Observed vs. predicted mean nitrate concentrations using a linear mixed-effects model with Agri_Change as a random effect and cluster as a fixed effect.

model_data_scaled_1 <- model_data_scaled

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
                   Mean_WellDepth + avg_sand + avg_silt +avg_clay +
                   wizard_sites_SD_data + mean_Chloride +sd_Chloride+sd_SpecCond +mean_SpecCond + cluster +
                   (1 | Agri_Change),  
                 data = train_data)

test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Cluster as Fixed Effect): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
           size = 5, color = "black") +
  theme_minimal(base_size = 20) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

2. Linear mixed-effects model excluding cluster, using Agri_Change as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.

library(lme4)
library(ggplot2)
library(Metrics)

model_data_scaled_1 <- model_data_scaled

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE  + Developed_Change +
                   Mean_WellDepth  + avg_sand + avg_silt +avg_clay +
                   wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
                   (1 | Agri_Change),  
                 data = train_data)

test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)

r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
  labs(
    title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  theme_minimal(base_size = 20) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
           hjust = 0, vjust = 1.2, size = 5, color = "black") +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

3. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance.

library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No wizard_sites_last_year_data_WTE", 
    "No Mean Well Depth",
    "No wizard_sites_SD_data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Well Depth and No WTE"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | Agri_Change),
    mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | Agri_Change),  # No sand + silt
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | Agri_Change),  # No chloride + speccond
    mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change)  # No WTE + depth
  )
)
model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lmer(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none") +
  ylim(0, 1)

p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

4. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability.

model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
                  Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
                  wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond +
                  (1 | Agri_Change),
                data = train_data)

test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log) 

r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
  geom_point(alpha = 0.7, size = 2, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

5. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(Agri_Change, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.

model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate  <- log1p(test_data$mean_nitrate)

gam_model <- gam(log_mean_nitrate ~ 
                   s(wizard_sites_last_year_data_WTE) +
                   s(Mean_WellDepth) +
                   s(avg_sand) +
                    s(avg_clay) +
                    s(avg_silt) +
                   s(wizard_sites_SD_data) +
                   s(mean_Chloride) +
                   s(mean_SpecCond) +
                   s(sd_Chloride) +
                   s(sd_SpecCond) +
                   s(Developed_Change) +
                   s(Agri_Change, bs = "re"),  
                 data = train_data,
                 method = "REML")
summary(gam_model)


Family: gaussian 
Link function: identity 

Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) + 
    s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) + 
    s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) + 
    s(Developed_Change) + s(Agri_Change, bs = "re")

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.90443    0.03699   24.45   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                                         edf   Ref.df     F  p-value    
s(wizard_sites_last_year_data_WTE) 5.558e+00 6.698698 8.256  < 2e-16 ***
s(Mean_WellDepth)                  3.282e+00 3.884149 8.621 3.21e-06 ***
s(avg_sand)                        1.192e-04 0.000207 0.001 0.999714    
s(avg_clay)                        3.811e+00 4.740497 1.932 0.106595    
s(avg_silt)                        1.000e+00 1.000183 3.610 0.058563 .  
s(wizard_sites_SD_data)            3.663e+00 4.572301 4.993 0.000418 ***
s(mean_Chloride)                   2.684e+00 3.354533 2.231 0.079040 .  
s(mean_SpecCond)                   1.000e+00 1.000012 2.634 0.105794    
s(sd_Chloride)                     1.000e+00 1.000222 0.450 0.503059    
s(sd_SpecCond)                     1.000e+00 1.000164 0.763 0.383093    
s(Developed_Change)                5.110e+00 5.793833 2.668 0.012745 *  
s(Agri_Change)                     4.157e-05 1.000000 0.000 0.878071    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Rank: 100/101
R-sq.(adj) =   0.49   Deviance explained = 53.9%
-REML = 316.75  Scale est. = 0.39551   n = 289

test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)

gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
           label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
           hjust = 0, vjust = 1, size = 5) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

6. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy

library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)

model_data_scaled_1<- model_data_scaled
set.seed(123)  
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)

train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
                           Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
                           wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
                         data = train_data,
                         ntree = 500, importance = TRUE)
print(rf_model)


Call:
 randomForest(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +      Developed_Change + Mean_WellDepth + avg_sand + avg_clay +      avg_silt + sd_Chloride + sd_SpecCond + wizard_sites_SD_data +      mean_Chloride + mean_SpecCond + Agri_Change, data = train_data,      ntree = 500, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 4

          Mean of squared residuals: 9.29879
                    % Var explained: 46.79

importance(rf_model)

                                   %IncMSE IncNodePurity
wizard_sites_last_year_data_WTE 24.3555896     1049.6664
Developed_Change                 2.8712488      129.7873
Mean_WellDepth                  21.4816680     1009.5143
avg_sand                         3.9973885      159.4341
avg_clay                         5.5287895      174.4687
avg_silt                         4.2982408      197.9525
sd_Chloride                     10.9084549      372.6617
sd_SpecCond                      8.2713165      310.2396
wizard_sites_SD_data            10.7779309      334.6947
mean_Chloride                   13.5936136      514.9471
mean_SpecCond                    9.3119503      437.4617
Agri_Change                      0.2146065      117.4282

# 
# Top 3 most important predictors based on %IncMSE are:
# 
# wizard_sites_last_year_data_WTE (24.4%),
# 
# Mean_WellDepth (21.5%),
# 
# mean_Chloride (13.6%).
# 
# Agri_Change has very low importance (0.2%), suggesting it adds little predictive value.
# 
# IncNodePurity shows how much a variable helps in reducing overall variance — used to assess relative contribution.
# 
# Overall, WTE, depth, and water quality variables (like chloride, spec cond) are driving prediction strength in  RF model.
# 








test_data$rf_predicted <- predict(rf_model, newdata = test_data)

r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
  geom_point(alpha = 0.7, color = "steelblue", size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
           size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

7. Gamma-distributed Generalized Linear Model (GLM) with a log link to model skewed nitrate concentrations.

model_data_scaled_1<- model_data_scaled




set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)

train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

glm_model <- glm(mean_nitrate ~ wizard_sites_last_year_data_WTE +
                   Developed_Change + Mean_WellDepth + avg_sand +avg_clay+avg_silt+
                   wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond +                                                 mean_SpecCond+Agri_Change,
                 data = train_data,
                 family = Gamma(link = "log"))
summary(glm_model)


Call:
glm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + 
    Developed_Change + Mean_WellDepth + avg_sand + avg_clay + 
    avg_silt + wizard_sites_SD_data + mean_Chloride + sd_Chloride + 
    sd_SpecCond + mean_SpecCond + Agri_Change, family = Gamma(link = "log"), 
    data = train_data)

Coefficients: (1 not defined because of singularities)
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                      0.72814    0.08372   8.697 3.08e-16 ***
wizard_sites_last_year_data_WTE  0.54185    0.09619   5.633 4.34e-08 ***
Developed_Change                -0.03460    0.11395  -0.304 0.761621    
Mean_WellDepth                  -0.03000    0.08032  -0.373 0.709097    
avg_sand                         0.06754    0.28255   0.239 0.811248    
avg_clay                         0.30100    0.27567   1.092 0.275822    
avg_silt                              NA         NA      NA       NA    
wizard_sites_SD_data            -0.52311    0.09704  -5.391 1.50e-07 ***
mean_Chloride                    2.58317    0.74647   3.461 0.000624 ***
sd_Chloride                     -0.14638    0.13628  -1.074 0.283700    
sd_SpecCond                      0.35784    0.15296   2.339 0.020023 *  
mean_SpecCond                   -3.23494    0.73809  -4.383 1.66e-05 ***
Agri_Change                     -0.12455    0.10951  -1.137 0.256388    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Gamma family taken to be 2.01265)

    Null deviance: 748.73  on 288  degrees of freedom
Residual deviance: 558.80  on 277  degrees of freedom
AIC: 978.79

Number of Fisher Scoring iterations: 25

# This model predicts mean nitrate using a Gamma distribution with a log link, suitable for skewed, non-negative data.
# 
# The variables wizard_sites_last_year_data_WTE, wizard_sites_SD_data, mean_Chloride, mean_SpecCond, and sd_SpecCond are statistically significant (p < 0.05), meaning they likely influence nitrate concentrations.
# 
# avg_silt was automatically excluded due to collinearity (probably because of perfect correlation with sand or clay).


test_data$predicted_glm <- predict(glm_model, newdata = test_data, type = "response")

r2_glm <- round(cor(test_data$mean_nitrate, test_data$predicted_glm)^2, 3)
rmse_glm <- round(rmse(test_data$mean_nitrate, test_data$predicted_glm), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_glm), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
       x = "Observed Mean Nitrate", y = "Predicted Mean Nitrate") +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
           size = 5) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)