Mean Nitrate Prediction_Updated_GreaterThan2N

Explore Data

file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"

model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))
model_data <- model_data %>%
  filter(mean_nitrate > 2)

model_data_long <- model_data %>%
  ungroup() %>%
  select(where(is.numeric)) %>%
 select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")


ggplot(model_data_long, aes(y = value)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(title = "Boxplots of All Numeric Variables",
       y = "Value", x = "") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

Add counties information to the dataset

counties <- counties(state = "KS", year = 2022, class = "sf")

wells_sf <- model_data %>%
  st_as_sf(coords = c("Lon.x", "Lat.x"), crs = 4326)
counties_transformed <- st_transform(counties, crs = 4326)

wells_with_county <- st_join(wells_sf, counties_transformed["NAME"])
model_data <- wells_with_county %>%
  st_drop_geometry() %>%
  rename(county = NAME)

wells_bbox <- st_bbox(wells_sf)

library(viridis)  
counties_with_wells <- st_filter(counties_transformed, wells_sf)

ggplot() +
  geom_sf(data = counties_with_wells, fill = "white", color = "black", size = 0.3) +
  geom_sf(data = wells_sf, aes(color = mean_nitrate), size = 2) +
  geom_sf_text(data = counties_with_wells, aes(label = NAME), size = 3, check_overlap = TRUE, color = "gray30") +
  coord_sf(xlim = c(wells_bbox$xmin, wells_bbox$xmax),
           ylim = c(wells_bbox$ymin, wells_bbox$ymax),
           expand = TRUE) +
  scale_color_viridis_c(option = "C", name = "Mean Nitrate") +
  labs(title = "Wells and Counties Containing Them") +
  theme_minimal()

Scale the data: standardization (scale()) to center all predictors around 0 with a standard deviation of 1.((x- mean)/sd)

numeric_predictors <- model_data %>%
  ungroup() %>% 
  select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
         Mean_WellDepth, avg_sand, avg_silt,avg_clay,
         wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond)
scaled_predictors <- scale(numeric_predictors)

model_data_scaled <- model_data %>%
  select(Well_ID, cluster, mean_nitrate, sd_nitrate,county) %>%  
  bind_cols(as_tibble(scaled_predictors))
model_data_scaled$county <- as.factor(model_data_scaled$county)


county_counts <- model_data_scaled %>%
  count(county)
county_counts

# A tibble: 5 × 2
  county        n
  <fct>     <int>
1 Harvey       10
2 McPherson     9
3 Reno         90
4 Rice          1
5 Sedgwick     19

boxplot(mean_nitrate ~ county, data = model_data_scaled,
        las = 2,                     
        col = "lightblue",            
        main = "Mean Nitrate by County",
        xlab = "County",
        ylab = "Mean Nitrate")

text(x = 1:nrow(county_counts),
     y = par("usr")[3] +20,  
     labels = county_counts$n,
     xpd = TRUE,               
     srt = 90,                 
     adj = 1,
     cex = 0.8,               
     col = "darkblue")

Updated Wednesday, 14th May

using counties as random effect

1. Linear mixed-effects model excluding cluster, using Counties as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.

library(lme4)
library(ggplot2)
library(Metrics)

model_data_scaled_1 <- model_data_scaled

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE  + Developed_Change +
                   Mean_WellDepth  + avg_sand + avg_silt +avg_clay +
                   wizard_sites_SD_data + Agri_Change+mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
                   (1 | county),  
                 data = train_data)
summary(lmm_mean)

Linear mixed model fit by REML ['lmerMod']
Formula: mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +  
    Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +  
    Agri_Change + mean_Chloride + mean_SpecCond + sd_Chloride +  
    sd_SpecCond + (1 | county)
   Data: train_data

REML criterion at convergence: 549.4

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.7232 -0.6865 -0.0775  0.6445  3.3055 

Random effects:
 Groups   Name        Variance Std.Dev.
 county   (Intercept)  1.269   1.127   
 Residual             15.041   3.878   
Number of obs: 103, groups:  county, 5

Fixed effects:
                                Estimate Std. Error t value
(Intercept)                       6.9344     0.7927   8.748
wizard_sites_last_year_data_WTE   0.3728     0.5956   0.626
Developed_Change                 -0.1369     0.4417  -0.310
Mean_WellDepth                   -1.1517     0.5108  -2.255
avg_sand                         -3.9547     2.7346  -1.446
avg_silt                         -4.3075     2.7657  -1.557
wizard_sites_SD_data             -0.4779     0.4385  -1.090
Agri_Change                      -0.3798     0.4413  -0.860
mean_Chloride                     0.5244     3.9986   0.131
mean_SpecCond                    -1.0214     3.9950  -0.256
sd_Chloride                      -1.7886     1.0887  -1.643
sd_SpecCond                       1.2647     0.8859   1.428

Correlation of Fixed Effects:
            (Intr) w_____ Dvlp_C Mn_WlD avg_sn avg_sl w__SD_ Agr_Ch mn_Chl
wzr_____WTE  0.260                                                        
Devlpd_Chng -0.006  0.123                                                 
Men_WllDpth -0.033  0.005  0.074                                          
avg_sand    -0.071 -0.050 -0.031 -0.119                                   
avg_silt    -0.088  0.024 -0.039 -0.165  0.981                            
wzrd_st_SD_ -0.129 -0.171  0.145 -0.145 -0.012 -0.036                     
Agri_Change  0.010  0.006  0.556  0.063  0.054  0.060  0.109              
mean_Chlord -0.067  0.052 -0.159 -0.081  0.092  0.148 -0.224 -0.191       
mean_SpcCnd  0.050 -0.079  0.158  0.060 -0.060 -0.125  0.246  0.181 -0.980
sd_Chloride  0.073  0.172  0.045 -0.204 -0.001  0.020  0.115  0.092 -0.315
sd_SpecCond  0.010 -0.045  0.021  0.214 -0.111 -0.106 -0.102  0.012  0.200
            mn_SpC sd_Chl
wzr_____WTE              
Devlpd_Chng              
Men_WllDpth              
avg_sand                 
avg_silt                 
wzrd_st_SD_              
Agri_Change              
mean_Chlord              
mean_SpcCnd              
sd_Chloride  0.222       
sd_SpecCond -0.260 -0.571
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

stargazer(lmm_mean,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")


=============================================================
                                     Dependent variable:     
                                -----------------------------
                                        mean_nitrate         
-------------------------------------------------------------
wizard_sites_last_year_data_WTE             0.37             
                                           (0.60)            
                                                             
Developed_Change                            -0.14            
                                           (0.44)            
                                                             
Mean_WellDepth                             -1.15*            
                                           (0.51)            
                                                             
avg_sand                                    -3.95            
                                           (2.73)            
                                                             
avg_silt                                    -4.31            
                                           (2.77)            
                                                             
wizard_sites_SD_data                        -0.48            
                                           (0.44)            
                                                             
Agri_Change                                 -0.38            
                                           (0.44)            
                                                             
mean_Chloride                               0.52             
                                           (4.00)            
                                                             
mean_SpecCond                               -1.02            
                                           (3.99)            
                                                             
sd_Chloride                                 -1.79            
                                           (1.09)            
                                                             
sd_SpecCond                                 1.26             
                                           (0.89)            
                                                             
Constant                                   6.93***           
                                           (0.79)            
                                                             
-------------------------------------------------------------
Observations                                 103             
Log Likelihood                             -274.68           
Akaike Inf. Crit.                          577.36            
Bayesian Inf. Crit.                        614.25            
=============================================================
Note:                           *p<0.05; **p<0.01; ***p<0.001

# The model predicts mean nitrate levels using well, soil, and land use features, while accounting for differences across counties.
# 
# The random effect for county shows some variation across locations (county variance = 6.45), but most variability is still within counties (residual variance = 12.14). #about one-third of the variation in mean nitrate levels is associated with differences across counties, even after accounting for the predictors in the model.
# 
# wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are the most influential predictors.
# 
# Some predictors like Developed_Change, Agri_Change, and wizard_sites_SD_data don’t seem to have strong effects (low t-values).






test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)

r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
  labs(
    title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  theme_minimal(base_size = 20) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
           hjust = 0, vjust = 1.2, size = 5, color = "black") +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# residul plot
plot(lmm_mean)

# The ideal plot would have points randomly scattered around the horizontal line at 0, with no clear pattern.
# 
# this plot shows a clear curve and funnel shape

# qqplot
qqnorm(resid(lmm_mean))
qqline(resid(lmm_mean))

# 
# Since many points deviate from the line, especially at the ends (tails), this suggests non-normality in  residuals — particularly heavier tails (possible outliers) #very high or low values

2. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance. Counties as a random effect

library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No wizard_sites_last_year_data_WTE", 
    "No Mean Well Depth",
    "No wizard_sites_SD_data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Well Depth and No WTE"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+Agri_Change+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | county),
    mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),  # No sand + silt
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | county),  # No chloride + speccond
    mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county)  # No WTE + depth
  )
)
model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lmer(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +  
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none") 

p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

3. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability. county as a random effect

model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
                  Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
                  wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + Agri_Change+mean_SpecCond +
                  (1 | county),
                data = train_data)
summary(lmm_log)

Linear mixed model fit by REML ['lmerMod']
Formula: log_mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +  
    avg_sand + avg_clay + avg_silt + Developed_Change + wizard_sites_SD_data +  
    mean_Chloride + sd_Chloride + sd_SpecCond + Agri_Change +  
    mean_SpecCond + (1 | county)
   Data: train_data

REML criterion at convergence: 153.5

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.1953 -0.5865  0.0377  0.5898  2.1839 

Random effects:
 Groups   Name        Variance Std.Dev.
 county   (Intercept) 0.02498  0.1581  
 Residual             0.19018  0.4361  
Number of obs: 103, groups:  county, 5

Fixed effects:
                                Estimate Std. Error t value
(Intercept)                      1.91398    0.10256  18.663
wizard_sites_last_year_data_WTE  0.01518    0.07034   0.216
Mean_WellDepth                  -0.16123    0.05754  -2.802
avg_sand                         0.31599    0.18198   1.736
avg_clay                         0.29035    0.17140   1.694
Developed_Change                -0.01148    0.04972  -0.231
wizard_sites_SD_data            -0.07443    0.05023  -1.482
mean_Chloride                    0.21228    0.45337   0.468
sd_Chloride                     -0.24837    0.12278  -2.023
sd_SpecCond                      0.14287    0.09962   1.434
Agri_Change                     -0.03670    0.04964  -0.739
mean_SpecCond                   -0.25374    0.45222  -0.561

Correlation of Fixed Effects:
            (Intr) w_____ Mn_WlD avg_sn avg_cl Dvlp_C w__SD_ mn_Chl sd_Chl
wzr_____WTE  0.257                                                        
Men_WllDpth -0.040  0.003                                                 
avg_sand     0.118 -0.140  0.216                                          
avg_clay     0.099 -0.020  0.155  0.944                                   
Devlpd_Chng -0.002  0.114  0.072  0.053  0.043                            
wzrd_st_SD_ -0.130 -0.182 -0.140  0.077  0.036  0.137                     
mean_Chlord -0.075  0.059 -0.073 -0.249 -0.164 -0.161 -0.220              
sd_Chloride  0.078  0.179 -0.206 -0.043 -0.012  0.045  0.114 -0.316       
sd_SpecCond  0.007 -0.046  0.214  0.086  0.104  0.021 -0.101  0.199 -0.570
Agri_Change  0.015  0.009  0.062 -0.058 -0.056  0.556  0.106 -0.192  0.093
mean_SpcCnd  0.057 -0.086  0.054  0.239  0.139  0.161  0.242 -0.980  0.223
            sd_SpC Agr_Ch
wzr_____WTE              
Men_WllDpth              
avg_sand                 
avg_clay                 
Devlpd_Chng              
wzrd_st_SD_              
mean_Chlord              
sd_Chloride              
sd_SpecCond              
Agri_Change  0.012       
mean_SpcCnd -0.258  0.181
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient

stargazer(lmm_log,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")


=============================================================
                                     Dependent variable:     
                                -----------------------------
                                      log_mean_nitrate       
-------------------------------------------------------------
wizard_sites_last_year_data_WTE             0.02             
                                           (0.07)            
                                                             
Mean_WellDepth                             -0.16**           
                                           (0.06)            
                                                             
avg_sand                                    0.32             
                                           (0.18)            
                                                             
avg_clay                                    0.29             
                                           (0.17)            
                                                             
Developed_Change                            -0.01            
                                           (0.05)            
                                                             
wizard_sites_SD_data                        -0.07            
                                           (0.05)            
                                                             
mean_Chloride                               0.21             
                                           (0.45)            
                                                             
sd_Chloride                                -0.25*            
                                           (0.12)            
                                                             
sd_SpecCond                                 0.14             
                                           (0.10)            
                                                             
Agri_Change                                 -0.04            
                                           (0.05)            
                                                             
mean_SpecCond                               -0.25            
                                           (0.45)            
                                                             
Constant                                   1.91***           
                                           (0.10)            
                                                             
-------------------------------------------------------------
Observations                                 103             
Log Likelihood                             -76.74            
Akaike Inf. Crit.                          181.47            
Bayesian Inf. Crit.                        218.36            
=============================================================
Note:                           *p<0.05; **p<0.01; ***p<0.001

 # Model Goal: Predict log-transformed nitrate using well/site/soil data while adjusting for differences across counties.
 # 
 # 
 # Strong Predictors: wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are statistically important (big t-values).
 # 
 # 
 # Weak/No Effect: Some variables like Developed_Change, Agri_Change, and wizard_sites_SD_data don't seem to help much here.
 # 
 # 
 # Random Effect (county): There is some variation across counties (random effect SD = 0.58), but most variation is still within counties (residual SD = 0.71).



test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log) 

r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
  geom_point(alpha = 0.7, size = 2, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# residul plot
plot(lmm_log)

# 

# qqplot
qqnorm(resid(lmm_log))
qqline(resid(lmm_log))

4. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(county, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.

model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate  <- log1p(test_data$mean_nitrate)
train_data$county <- as.factor(train_data$county)
test_data$county <- factor(test_data$county, levels = levels(train_data$county))

gam_model <- gam(log_mean_nitrate ~ 
                   s(wizard_sites_last_year_data_WTE) +
                   s(Mean_WellDepth) +
                   s(avg_sand) +
                   s(avg_clay) +
                   s(avg_silt) +
                   s(wizard_sites_SD_data) +
                   s(mean_Chloride) +
                   s(mean_SpecCond) +
                   s(sd_Chloride) +
                   s(sd_SpecCond) +
                   s(Developed_Change) +
                   s(Agri_Change)+
                   s(county, bs = "re"),  
                 data = train_data,
                 method = "REML") # Restricted Maximum Likelihood # estimate the variance components (like the random effects and residual error) in mixed models 
summary(gam_model)


Family: gaussian 
Link function: identity 

Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) + 
    s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) + 
    s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) + 
    s(Developed_Change) + s(Agri_Change) + s(county, bs = "re")

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.92393    0.08863   21.71   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                                         edf    Ref.df     F p-value   
s(wizard_sites_last_year_data_WTE) 1.000e+00 1.000e+00 0.257 0.61324   
s(Mean_WellDepth)                  1.000e+00 1.000e+00 7.172 0.00882 **
s(avg_sand)                        1.841e-06 3.528e-06 0.013 0.99983   
s(avg_clay)                        1.000e+00 1.000e+00 3.589 0.06140 . 
s(avg_silt)                        1.000e+00 1.000e+00 4.137 0.04492 * 
s(wizard_sites_SD_data)            1.000e+00 1.000e+00 3.600 0.06100 . 
s(mean_Chloride)                   1.987e+00 2.410e+00 0.896 0.61437   
s(mean_SpecCond)                   1.000e+00 1.000e+00 0.004 0.95293   
s(sd_Chloride)                     1.000e+00 1.000e+00 0.972 0.32673   
s(sd_SpecCond)                     1.000e+00 1.000e+00 0.720 0.39839   
s(Developed_Change)                1.000e+00 1.000e+00 0.074 0.78599   
s(Agri_Change)                     1.000e+00 1.000e+00 0.661 0.41844   
s(county)                          1.273e+00 4.000e+00 0.560 0.15446   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Rank: 113/114
R-sq.(adj) =  0.332   Deviance explained = 41.9%
-REML = 76.693  Scale est. = 0.1854    n = 103

stargazer(gam_model,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")


==================================================================
                                          Dependent variable:     
                                     -----------------------------
                                           log_mean_nitrate       
------------------------------------------------------------------
s(wizard_sites_last_year_data_WTE).1                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).2                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).3                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).4                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).5                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).6                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).7                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).8                              
                                                                  
                                                                  
s(wizard_sites_last_year_data_WTE).9                              
                                                                  
                                                                  
s(Mean_WellDepth).1                                               
                                                                  
                                                                  
s(Mean_WellDepth).2                                               
                                                                  
                                                                  
s(Mean_WellDepth).3                                               
                                                                  
                                                                  
s(Mean_WellDepth).4                                               
                                                                  
                                                                  
s(Mean_WellDepth).5                                               
                                                                  
                                                                  
s(Mean_WellDepth).6                                               
                                                                  
                                                                  
s(Mean_WellDepth).7                                               
                                                                  
                                                                  
s(Mean_WellDepth).8                                               
                                                                  
                                                                  
s(Mean_WellDepth).9                                               
                                                                  
                                                                  
s(avg_sand).1                                                     
                                                                  
                                                                  
s(avg_sand).2                                                     
                                                                  
                                                                  
s(avg_sand).3                                                     
                                                                  
                                                                  
s(avg_sand).4                                                     
                                                                  
                                                                  
s(avg_sand).5                                                     
                                                                  
                                                                  
s(avg_sand).6                                                     
                                                                  
                                                                  
s(avg_sand).7                                                     
                                                                  
                                                                  
s(avg_sand).8                                                     
                                                                  
                                                                  
s(avg_sand).9                                                     
                                                                  
                                                                  
s(avg_clay).1                                                     
                                                                  
                                                                  
s(avg_clay).2                                                     
                                                                  
                                                                  
s(avg_clay).3                                                     
                                                                  
                                                                  
s(avg_clay).4                                                     
                                                                  
                                                                  
s(avg_clay).5                                                     
                                                                  
                                                                  
s(avg_clay).6                                                     
                                                                  
                                                                  
s(avg_clay).7                                                     
                                                                  
                                                                  
s(avg_clay).8                                                     
                                                                  
                                                                  
s(avg_clay).9                                                     
                                                                  
                                                                  
s(avg_silt).1                                                     
                                                                  
                                                                  
s(avg_silt).2                                                     
                                                                  
                                                                  
s(avg_silt).3                                                     
                                                                  
                                                                  
s(avg_silt).4                                                     
                                                                  
                                                                  
s(avg_silt).5                                                     
                                                                  
                                                                  
s(avg_silt).6                                                     
                                                                  
                                                                  
s(avg_silt).7                                                     
                                                                  
                                                                  
s(avg_silt).8                                                     
                                                                  
                                                                  
s(avg_silt).9                                                     
                                                                  
                                                                  
s(wizard_sites_SD_data).1                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).2                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).3                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).4                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).5                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).6                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).7                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).8                                         
                                                                  
                                                                  
s(wizard_sites_SD_data).9                                         
                                                                  
                                                                  
s(mean_Chloride).1                                                
                                                                  
                                                                  
s(mean_Chloride).2                                                
                                                                  
                                                                  
s(mean_Chloride).3                                                
                                                                  
                                                                  
s(mean_Chloride).4                                                
                                                                  
                                                                  
s(mean_Chloride).5                                                
                                                                  
                                                                  
s(mean_Chloride).6                                                
                                                                  
                                                                  
s(mean_Chloride).7                                                
                                                                  
                                                                  
s(mean_Chloride).8                                                
                                                                  
                                                                  
s(mean_Chloride).9                                                
                                                                  
                                                                  
s(mean_SpecCond).1                                                
                                                                  
                                                                  
s(mean_SpecCond).2                                                
                                                                  
                                                                  
s(mean_SpecCond).3                                                
                                                                  
                                                                  
s(mean_SpecCond).4                                                
                                                                  
                                                                  
s(mean_SpecCond).5                                                
                                                                  
                                                                  
s(mean_SpecCond).6                                                
                                                                  
                                                                  
s(mean_SpecCond).7                                                
                                                                  
                                                                  
s(mean_SpecCond).8                                                
                                                                  
                                                                  
s(mean_SpecCond).9                                                
                                                                  
                                                                  
s(sd_Chloride).1                                                  
                                                                  
                                                                  
s(sd_Chloride).2                                                  
                                                                  
                                                                  
s(sd_Chloride).3                                                  
                                                                  
                                                                  
s(sd_Chloride).4                                                  
                                                                  
                                                                  
s(sd_Chloride).5                                                  
                                                                  
                                                                  
s(sd_Chloride).6                                                  
                                                                  
                                                                  
s(sd_Chloride).7                                                  
                                                                  
                                                                  
s(sd_Chloride).8                                                  
                                                                  
                                                                  
s(sd_Chloride).9                                                  
                                                                  
                                                                  
s(sd_SpecCond).1                                                  
                                                                  
                                                                  
s(sd_SpecCond).2                                                  
                                                                  
                                                                  
s(sd_SpecCond).3                                                  
                                                                  
                                                                  
s(sd_SpecCond).4                                                  
                                                                  
                                                                  
s(sd_SpecCond).5                                                  
                                                                  
                                                                  
s(sd_SpecCond).6                                                  
                                                                  
                                                                  
s(sd_SpecCond).7                                                  
                                                                  
                                                                  
s(sd_SpecCond).8                                                  
                                                                  
                                                                  
s(sd_SpecCond).9                                                  
                                                                  
                                                                  
s(Developed_Change).1                                             
                                                                  
                                                                  
s(Developed_Change).2                                             
                                                                  
                                                                  
s(Developed_Change).3                                             
                                                                  
                                                                  
s(Developed_Change).4                                             
                                                                  
                                                                  
s(Developed_Change).5                                             
                                                                  
                                                                  
s(Developed_Change).6                                             
                                                                  
                                                                  
s(Developed_Change).7                                             
                                                                  
                                                                  
s(Developed_Change).8                                             
                                                                  
                                                                  
s(Developed_Change).9                                             
                                                                  
                                                                  
s(Agri_Change).1                                                  
                                                                  
                                                                  
s(Agri_Change).2                                                  
                                                                  
                                                                  
s(Agri_Change).3                                                  
                                                                  
                                                                  
s(Agri_Change).4                                                  
                                                                  
                                                                  
s(Agri_Change).5                                                  
                                                                  
                                                                  
s(Agri_Change).6                                                  
                                                                  
                                                                  
s(Agri_Change).7                                                  
                                                                  
                                                                  
s(Agri_Change).8                                                  
                                                                  
                                                                  
s(Agri_Change).9                                                  
                                                                  
                                                                  
s(county).1                                                       
                                                                  
                                                                  
s(county).2                                                       
                                                                  
                                                                  
s(county).3                                                       
                                                                  
                                                                  
s(county).4                                                       
                                                                  
                                                                  
s(county).5                                                       
                                                                  
                                                                  
Constant                                        1.92***           
                                                (0.09)            
                                                                  
------------------------------------------------------------------
Observations                                      103             
Adjusted R2                                      0.33             
Log Likelihood                                  -66.95            
UBRE                                             76.69            
==================================================================
Note:                                *p<0.05; **p<0.01; ***p<0.001

# The model predicts log-transformed nitrate using smooth curves (splines) for each predictor and accounts for differences across counties as a random effect.
# 
# The model explains about 53.9% of the deviance (similar to R²) and has an adjusted R² of 0.493, indicating a decent fit.
# 
# wizard_sites_last_year_data_WTE, Mean_WellDepth, mean_Chloride, and county were statistically significant predictors (p < 0.05).
# 
# Other predictors like avg_sand, sd_Chloride, and wizard_sites_SD_data didn’t show strong effects (p > 0.1).
# 
# The smooth term for county was significant, confirming that including it as a random effect helps account for regional variability.












test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)

gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
           label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
           hjust = 0, vjust = 1, size = 5) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# QQ plot
qqnorm(resid(gam_model))
qqline(resid(gam_model))

5. Multiple linear model

model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

mlr_model <- lm(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
                  Mean_WellDepth + avg_sand + avg_silt + avg_clay +
                  wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
                  sd_Chloride + sd_SpecCond+Agri_Change,
                data = train_data)
summary(mlr_model)


Call:
lm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + 
    Developed_Change + Mean_WellDepth + avg_sand + avg_silt + 
    avg_clay + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + 
    sd_Chloride + sd_SpecCond + Agri_Change, data = train_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.7340 -2.8757 -0.4397  2.4261 12.9928 

Coefficients: (1 not defined because of singularities)
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                       7.4869     0.3913  19.132   <2e-16 ***
wizard_sites_last_year_data_WTE   0.7929     0.4694   1.689   0.0946 .  
Developed_Change                 -0.1241     0.4450  -0.279   0.7810    
Mean_WellDepth                   -1.1795     0.5137  -2.296   0.0240 *  
avg_sand                         -4.1976     2.6967  -1.557   0.1230    
avg_silt                         -4.5008     2.6932  -1.671   0.0981 .  
avg_clay                              NA         NA      NA       NA    
wizard_sites_SD_data             -0.6563     0.4143  -1.584   0.1166    
mean_Chloride                     0.5148     3.9384   0.131   0.8963    
mean_SpecCond                    -1.0721     3.9560  -0.271   0.7870    
sd_Chloride                      -1.6788     1.0889  -1.542   0.1266    
sd_SpecCond                       1.2564     0.8955   1.403   0.1640    
Agri_Change                      -0.3609     0.4458  -0.809   0.4204    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.922 on 91 degrees of freedom
Multiple R-squared:  0.3023,    Adjusted R-squared:  0.2179 
F-statistic: 3.584 on 11 and 91 DF,  p-value: 0.0003222

stargazer(mlr_model,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")


=============================================================
                                     Dependent variable:     
                                -----------------------------
                                        mean_nitrate         
-------------------------------------------------------------
wizard_sites_last_year_data_WTE             0.79             
                                           (0.47)            
                                                             
Developed_Change                            -0.12            
                                           (0.45)            
                                                             
Mean_WellDepth                             -1.18*            
                                           (0.51)            
                                                             
avg_sand                                    -4.20            
                                           (2.70)            
                                                             
avg_silt                                    -4.50            
                                           (2.69)            
                                                             
avg_clay                                                     
                                                             
                                                             
wizard_sites_SD_data                        -0.66            
                                           (0.41)            
                                                             
mean_Chloride                               0.51             
                                           (3.94)            
                                                             
mean_SpecCond                               -1.07            
                                           (3.96)            
                                                             
sd_Chloride                                 -1.68            
                                           (1.09)            
                                                             
sd_SpecCond                                 1.26             
                                           (0.90)            
                                                             
Agri_Change                                 -0.36            
                                           (0.45)            
                                                             
Constant                                   7.49***           
                                           (0.39)            
                                                             
-------------------------------------------------------------
Observations                                 103             
R2                                          0.30             
Adjusted R2                                 0.22             
Residual Std. Error                    3.92 (df = 91)        
F Statistic                         3.58*** (df = 11; 91)    
=============================================================
Note:                           *p<0.05; **p<0.01; ***p<0.001

# 
# The model explains about 27% of the variance in mean nitrate values (R² = 0.274), with a residual error of around 3.64.
# 
# The variable wizard_sites_last_year_data_WTE has the strongest positive effect and is highly significant.
# 
# wizard_sites_SD_data and sd_SpecCond also show significant relationships; mean_SpecCond is weakly significant and negative.
# 
# The variable avg_clay was automatically removed due to perfect multicollinearity with other predictors.
# 
# Several predictors like Developed_Change, Mean_WellDepth, and Agri_Change were not statistically significant.
# 






test_data$mlr_predicted <- predict(mlr_model, newdata = test_data)

mlr_r2 <- round(cor(test_data$mean_nitrate, test_data$mlr_predicted)^2, 3)
mlr_rmse <- round(rmse(test_data$mean_nitrate, test_data$mlr_predicted), 3)



lim_range <- range(c(test_data$mean_nitrate, test_data$mlr_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = mlr_predicted)) +
  geom_point(alpha = 0.7, color = "darkgreen") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "MLR: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", mlr_r2, "\nRMSE = ", mlr_rmse),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# Residual vs. fitted plot
plot(mlr_model)

# QQ plot of residuals
qqnorm(resid(mlr_model))
qqline(resid(mlr_model))

6. Comparison of Multiple linear model

model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No WTE",
    "No Mean Well Depth",
    "No SD Data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Depth and WTE"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond+Agri_Change,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_SpecCond + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + sd_Chloride,
    
    mean_nitrate ~ Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
      avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + avg_clay + mean_Chloride + mean_SpecCond +
      sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_clay + wizard_sites_SD_data + mean_Chloride +
      mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
      Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data,
    
    mean_nitrate ~ Developed_Change + avg_sand + avg_silt + avg_clay +
      wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
      sd_Chloride + sd_SpecCond
  )
)

model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lm(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "MLR Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +  
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")


p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "MLR Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

Previous work up until Thursday, 8th May

1. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy

library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)

model_data_scaled_1<- model_data_scaled
set.seed(123)  
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)

train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
                           Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
                           wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
                         data = train_data,
                         ntree = 500, importance = TRUE)
print(rf_model)


Call:
 randomForest(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +      Developed_Change + Mean_WellDepth + avg_sand + avg_clay +      avg_silt + sd_Chloride + sd_SpecCond + wizard_sites_SD_data +      mean_Chloride + mean_SpecCond + Agri_Change, data = train_data,      ntree = 500, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 4

          Mean of squared residuals: 15.52265
                    % Var explained: 20.3

importance(rf_model)

                                   %IncMSE IncNodePurity
wizard_sites_last_year_data_WTE  9.7002060     281.22874
Developed_Change                -1.6860849      55.71317
Mean_WellDepth                   6.3151719     328.93666
avg_sand                         2.3630821     112.51640
avg_clay                         4.2071863     105.78716
avg_silt                         4.9661743     136.90064
sd_Chloride                      6.6445761     169.99421
sd_SpecCond                      2.3799743     129.14534
wizard_sites_SD_data             4.4674341     140.82756
mean_Chloride                    7.7793412     200.19792
mean_SpecCond                    6.0886308     152.03909
Agri_Change                      0.8170956      59.85917

# 
# Top 3 most important predictors based on %IncMSE are:
# 
# wizard_sites_last_year_data_WTE (24.4%),
# 
# Mean_WellDepth (21.5%),
# 
# mean_Chloride (13.6%).
# 
# Agri_Change has very low importance (0.2%), suggesting it adds little predictive value.
# 
# IncNodePurity shows how much a variable helps in reducing overall variance — used to assess relative contribution.
# 
# Overall, WTE, depth, and water quality variables (like chloride, spec cond) are driving prediction strength in  RF model.
# 








test_data$rf_predicted <- predict(rf_model, newdata = test_data)

r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
  geom_point(alpha = 0.7, color = "steelblue", size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
           size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# Residuals
resid_rf <- test_data$mean_nitrate - predict(rf_model, newdata = test_data)

# QQ plot
qqnorm(resid_rf)
qqline(resid_rf)

# Training data Testing data MSE % variation explained MSE % variation explained #IOWA paper

train_preds <- predict(rf_model, newdata = train_data)
mse_train <- mean((train_data$mean_nitrate - train_preds)^2)
r2_train <- cor(train_data$mean_nitrate, train_preds)^2 * 100

test_preds <- predict(rf_model, newdata = test_data)
mse_test <- mean((test_data$mean_nitrate - test_preds)^2)
r2_test <- cor(test_data$mean_nitrate, test_preds)^2 * 100

rf_performance <- tibble(
  Dataset = c("Training", "Testing"),
  MSE = c(mse_train, mse_test),
  `% Variance Explained` = c(r2_train, r2_test)
)
rf_performance %>%
  gt() %>%
  tab_header(title = "Random Forest Model Performance") %>%
  gt::fmt_number(columns = c(MSE, `% Variance Explained`), decimals = 2)

Random Forest Model Performance
Dataset	MSE	% Variance Explained
Training	2.94	90.90
Testing	11.11	29.11

2. Gamma-distributed Generalized Linear Model (GLM) with a log link to model skewed nitrate concentrations.

model_data_scaled_1<- model_data_scaled




set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)

train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

glm_model <- glm(mean_nitrate ~ wizard_sites_last_year_data_WTE +
                   Developed_Change + Mean_WellDepth + avg_sand +avg_clay+avg_silt+
                   wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond +                                                 mean_SpecCond+Agri_Change,
                 data = train_data,
                 family = Gamma(link = "log"))
summary(glm_model)


Call:
glm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + 
    Developed_Change + Mean_WellDepth + avg_sand + avg_clay + 
    avg_silt + wizard_sites_SD_data + mean_Chloride + sd_Chloride + 
    sd_SpecCond + mean_SpecCond + Agri_Change, family = Gamma(link = "log"), 
    data = train_data)

Coefficients: (1 not defined because of singularities)
                                Estimate Std. Error t value Pr(>|t|)    
(Intercept)                      1.94504    0.04995  38.943  < 2e-16 ***
wizard_sites_last_year_data_WTE  0.10576    0.05992   1.765  0.08089 .  
Developed_Change                -0.02951    0.05680  -0.520  0.60460    
Mean_WellDepth                  -0.20937    0.06557  -3.193  0.00193 ** 
avg_sand                         0.42523    0.19344   2.198  0.03047 *  
avg_clay                         0.39317    0.18730   2.099  0.03857 *  
avg_silt                              NA         NA      NA       NA    
wizard_sites_SD_data            -0.12344    0.05288  -2.334  0.02177 *  
mean_Chloride                    0.25154    0.50266   0.500  0.61800    
sd_Chloride                     -0.27937    0.13898  -2.010  0.04737 *  
sd_SpecCond                      0.13134    0.11429   1.149  0.25349    
mean_SpecCond                   -0.29467    0.50492  -0.584  0.56094    
Agri_Change                     -0.06413    0.05690  -1.127  0.26265    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Gamma family taken to be 0.2505409)

    Null deviance: 37.899  on 102  degrees of freedom
Residual deviance: 23.393  on  91  degrees of freedom
AIC: 542.08

Number of Fisher Scoring iterations: 8

stargazer(glm_model,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")


=============================================================
                                     Dependent variable:     
                                -----------------------------
                                        mean_nitrate         
-------------------------------------------------------------
wizard_sites_last_year_data_WTE             0.11             
                                           (0.06)            
                                                             
Developed_Change                            -0.03            
                                           (0.06)            
                                                             
Mean_WellDepth                             -0.21**           
                                           (0.07)            
                                                             
avg_sand                                    0.43*            
                                           (0.19)            
                                                             
avg_clay                                    0.39*            
                                           (0.19)            
                                                             
avg_silt                                                     
                                                             
                                                             
wizard_sites_SD_data                       -0.12*            
                                           (0.05)            
                                                             
mean_Chloride                               0.25             
                                           (0.50)            
                                                             
sd_Chloride                                -0.28*            
                                           (0.14)            
                                                             
sd_SpecCond                                 0.13             
                                           (0.11)            
                                                             
mean_SpecCond                               -0.29            
                                           (0.50)            
                                                             
Agri_Change                                 -0.06            
                                           (0.06)            
                                                             
Constant                                   1.95***           
                                           (0.05)            
                                                             
-------------------------------------------------------------
Observations                                 103             
Log Likelihood                             -259.04           
Akaike Inf. Crit.                          542.08            
=============================================================
Note:                           *p<0.05; **p<0.01; ***p<0.001

# This model predicts mean nitrate using a Gamma distribution with a log link, suitable for skewed, non-negative data.
# 
# The variables wizard_sites_last_year_data_WTE, wizard_sites_SD_data, mean_Chloride, mean_SpecCond, and sd_SpecCond are statistically significant (p < 0.05), meaning they likely influence nitrate concentrations.
# 
# avg_silt was automatically excluded due to collinearity (probably because of perfect correlation with sand or clay).


test_data$predicted_glm <- predict(glm_model, newdata = test_data, type = "response")

r2_glm <- round(cor(test_data$mean_nitrate, test_data$predicted_glm)^2, 3)
rmse_glm <- round(rmse(test_data$mean_nitrate, test_data$predicted_glm), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_glm), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
       x = "Observed Mean Nitrate", y = "Predicted Mean Nitrate") +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
           size = 5) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# QQ plot
qqnorm(resid(glm_model, type = "deviance"))
qqline(resid(glm_model, type = "deviance"))

Best models so far

Best Models So Far (With >2 mg N: 117 wells)
Model	R²	RMSE
Generalized additive model (GAM)	0.328	3.017
Random forest model	0.291	3.334
Gamma-distributed generalized linear model	0.266	3.540
Log-transformed linear mixed model	0.264	3.189
Linear mixed model using county	0.192	3.835
Multiple linear regression	0.188	3.851