Mean Nitrate Prediction_Updated

Explore Data

file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"

model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))
model_data <- model_data %>%
  filter(!is.na(WWC5AGEYears))

model_data_long <- model_data %>%
  ungroup() %>%
  select(where(is.numeric)) %>%
 select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")


ggplot(model_data_long, aes(y = value)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(title = "Boxplots of All Numeric Variables",
       y = "Value", x = "") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

Add counties information to the dataset

counties <- counties(state = "KS", year = 2022, class = "sf")

wells_sf <- model_data %>%
  st_as_sf(coords = c("Lon.x", "Lat.x"), crs = 4326)
counties_transformed <- st_transform(counties, crs = 4326)

wells_with_county <- st_join(wells_sf, counties_transformed["NAME"])
model_data <- wells_with_county %>%
  st_drop_geometry() %>%
  rename(county = NAME)

wells_bbox <- st_bbox(wells_sf)

library(viridis)  
counties_with_wells <- st_filter(counties_transformed, wells_sf)

ggplot() +
  geom_sf(data = counties_with_wells, fill = "white", color = "black", size = 0.3) +
  geom_sf(data = wells_sf, aes(color = mean_nitrate), size = 2) +
  geom_sf_text(data = counties_with_wells, aes(label = NAME), size = 3, check_overlap = TRUE, color = "gray30") +
  coord_sf(xlim = c(wells_bbox$xmin, wells_bbox$xmax),
           ylim = c(wells_bbox$ymin, wells_bbox$ymax),
           expand = TRUE) +
  scale_color_viridis_c(option = "C", name = "Mean Nitrate") +
  labs(title = "Wells and Counties Containing Them") +
  theme_minimal()

Scale the data: standardization (scale()) to center all predictors around 0 with a standard deviation of 1.((x- mean)/sd)

numeric_predictors <- model_data %>%
  ungroup() %>% 
  select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
         Mean_WellDepth, avg_sand, avg_silt,avg_clay,
         wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond,WWC5AGEYears)
scaled_predictors <- scale(numeric_predictors)

model_data_scaled <- model_data %>%
  select(Well_ID, cluster, mean_nitrate, sd_nitrate,county) %>%  
  bind_cols(as_tibble(scaled_predictors))
model_data_scaled$county <- as.factor(model_data_scaled$county)


county_counts <- model_data_scaled %>%
  count(county)
county_counts
# A tibble: 5 × 2
  county        n
  <fct>     <int>
1 Harvey       91
2 McPherson    16
3 Reno         99
4 Rice          2
5 Sedgwick      9
boxplot(mean_nitrate ~ county, data = model_data_scaled,
        las = 2,                     
        col = "lightblue",            
        main = "Mean Nitrate by County",
        xlab = "County",
        ylab = "Mean Nitrate")

text(x = 1:nrow(county_counts),
     y = par("usr")[3] +20,  
     labels = county_counts$n,
     xpd = TRUE,               
     srt = 90,                 
     adj = 1,
     cex = 0.8,               
     col = "darkblue")

Updated Wednesday, 14th May

using counties as random effect

1. Linear mixed-effects model excluding cluster, using Counties as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.

library(lme4)
library(ggplot2)
library(Metrics)

model_data_scaled_1 <- model_data_scaled

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE  + Developed_Change +
                   Mean_WellDepth  + avg_sand + avg_silt +avg_clay +
                   wizard_sites_SD_data + Agri_Change+mean_Chloride + WWC5AGEYears+mean_SpecCond+sd_Chloride+sd_SpecCond +
                   (1 | county),  
                 data = train_data)
#summary(lmm_mean)

stargazer(lmm_mean,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")

=============================================================
                                     Dependent variable:     
                                -----------------------------
                                        mean_nitrate         
-------------------------------------------------------------
wizard_sites_last_year_data_WTE             1.56*            
                                           (0.61)            
                                                             
Developed_Change                            -0.04            
                                           (0.41)            
                                                             
Mean_WellDepth                              -0.10            
                                           (0.26)            
                                                             
avg_sand                                    -2.06            
                                           (1.76)            
                                                             
avg_silt                                    -1.75            
                                           (1.82)            
                                                             
wizard_sites_SD_data                        -0.39            
                                           (0.41)            
                                                             
Agri_Change                                 -0.01            
                                           (0.37)            
                                                             
mean_Chloride                              9.71***           
                                           (2.92)            
                                                             
WWC5AGEYears                                0.26             
                                           (0.30)            
                                                             
mean_SpecCond                             -11.48***          
                                           (2.94)            
                                                             
sd_Chloride                                 -0.03            
                                           (0.48)            
                                                             
sd_SpecCond                                 0.15             
                                           (0.49)            
                                                             
Constant                                    3.22*            
                                           (1.51)            
                                                             
-------------------------------------------------------------
Observations                                 173             
Log Likelihood                             -461.43           
Akaike Inf. Crit.                          952.86            
Bayesian Inf. Crit.                        1000.16           
=============================================================
Note:                           *p<0.05; **p<0.01; ***p<0.001
# The model predicts mean nitrate levels using well, soil, and land use features, while accounting for differences across counties.
# 
# The random effect for county shows some variation across locations (county variance = 6.45), but most variability is still within counties (residual variance = 12.14). #about one-third of the variation in mean nitrate levels is associated with differences across counties, even after accounting for the predictors in the model.
# 
# wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are the most influential predictors.
# 
# Some predictors like Developed_Change, Agri_Change, and wizard_sites_SD_data don’t seem to have strong effects (low t-values).






test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)

r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
  labs(
    title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  theme_minimal(base_size = 20) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
           hjust = 0, vjust = 1.2, size = 5, color = "black") +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# residul plot
plot(lmm_mean)

# The ideal plot would have points randomly scattered around the horizontal line at 0, with no clear pattern.
# 
# this plot shows a clear curve and funnel shape

# qqplot
qqnorm(resid(lmm_mean))
qqline(resid(lmm_mean))

# 
# Since many points deviate from the line, especially at the ends (tails), this suggests non-normality in  residuals — particularly heavier tails (possible outliers) #very high or low values 

2. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance. Counties as a random effect

library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No wizard_sites_last_year_data_WTE", 
    "No Mean Well Depth",
    "No wizard_sites_SD_data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Well Depth and No WTE",
    "No Age"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+Agri_Change+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+ Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+ Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+ Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | county),
    mean_nitrate ~ Developed_Change + Mean_WellDepth + WWC5AGEYears+ avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),
    mean_nitrate ~ wizard_sites_last_year_data_WTE+ WWC5AGEYears + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),  # No sand + silt
    mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | county),  # No chloride + speccond
    mean_nitrate ~ Developed_Change + avg_sand + WWC5AGEYears+ avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),  # No WTE + depth
    mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county) 
  )
)
model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lmer(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +  
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none") 

p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

3. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability. county as a random effect

model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
                  Mean_WellDepth + avg_sand + WWC5AGEYears+avg_clay+avg_silt+Developed_Change+
                  wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + Agri_Change+mean_SpecCond +
                  (1 | county),
                data = train_data)
summary(lmm_log)
Linear mixed model fit by REML ['lmerMod']
Formula: log_mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +  
    avg_sand + WWC5AGEYears + avg_clay + avg_silt + Developed_Change +  
    wizard_sites_SD_data + mean_Chloride + sd_Chloride + sd_SpecCond +  
    Agri_Change + mean_SpecCond + (1 | county)
   Data: train_data

REML criterion at convergence: 387.2

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.87909 -0.60853 -0.05188  0.56218  2.63632 

Random effects:
 Groups   Name        Variance Std.Dev.
 county   (Intercept) 0.6465   0.8040  
 Residual             0.4353   0.6598  
Number of obs: 173, groups:  county, 5

Fixed effects:
                                Estimate Std. Error t value
(Intercept)                      0.92265    0.38228   2.414
wizard_sites_last_year_data_WTE  0.31322    0.11725   2.671
Mean_WellDepth                  -0.04266    0.04821  -0.885
avg_sand                         0.09708    0.18956   0.512
WWC5AGEYears                     0.03136    0.05564   0.564
avg_clay                         0.15888    0.17043   0.932
Developed_Change                 0.03563    0.07610   0.468
wizard_sites_SD_data            -0.11675    0.07669  -1.522
mean_Chloride                    2.33159    0.54431   4.284
sd_Chloride                      0.07027    0.08887   0.791
sd_SpecCond                      0.01277    0.09145   0.140
Agri_Change                      0.01884    0.06888   0.274
mean_SpecCond                   -2.71228    0.54801  -4.949
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
stargazer(lmm_log,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")

=============================================================
                                     Dependent variable:     
                                -----------------------------
                                      log_mean_nitrate       
-------------------------------------------------------------
wizard_sites_last_year_data_WTE            0.31**            
                                           (0.12)            
                                                             
Mean_WellDepth                              -0.04            
                                           (0.05)            
                                                             
avg_sand                                    0.10             
                                           (0.19)            
                                                             
WWC5AGEYears                                0.03             
                                           (0.06)            
                                                             
avg_clay                                    0.16             
                                           (0.17)            
                                                             
Developed_Change                            0.04             
                                           (0.08)            
                                                             
wizard_sites_SD_data                        -0.12            
                                           (0.08)            
                                                             
mean_Chloride                              2.33***           
                                           (0.54)            
                                                             
sd_Chloride                                 0.07             
                                           (0.09)            
                                                             
sd_SpecCond                                 0.01             
                                           (0.09)            
                                                             
Agri_Change                                 0.02             
                                           (0.07)            
                                                             
mean_SpecCond                             -2.71***           
                                           (0.55)            
                                                             
Constant                                    0.92*            
                                           (0.38)            
                                                             
-------------------------------------------------------------
Observations                                 173             
Log Likelihood                             -193.61           
Akaike Inf. Crit.                          417.21            
Bayesian Inf. Crit.                        464.51            
=============================================================
Note:                           *p<0.05; **p<0.01; ***p<0.001
 # Model Goal: Predict log-transformed nitrate using well/site/soil data while adjusting for differences across counties.
 # 
 # 
 # Strong Predictors: wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are statistically important (big t-values).
 # 
 # 
 # Weak/No Effect: Some variables like Developed_Change, Agri_Change, and wizard_sites_SD_data don't seem to help much here.
 # 
 # 
 # Random Effect (county): There is some variation across counties (random effect SD = 0.58), but most variation is still within counties (residual SD = 0.71).



test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log) 

r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
  geom_point(alpha = 0.7, size = 2, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# residul plot
plot(lmm_log)

# 

# qqplot
qqnorm(resid(lmm_log))
qqline(resid(lmm_log))

4. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(county, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.

model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate  <- log1p(test_data$mean_nitrate)
train_data$county <- as.factor(train_data$county)
test_data$county <- factor(test_data$county, levels = levels(train_data$county))

gam_model <- gam(log_mean_nitrate ~ 
                   s(wizard_sites_last_year_data_WTE) +
                   s(Mean_WellDepth) +
                   s(avg_sand) +
                   s(avg_clay) +
                   s(avg_silt) +
                   s(wizard_sites_SD_data) +
                   s(mean_Chloride) +
                   s(mean_SpecCond) +
                   s(sd_Chloride) +
                   s(sd_SpecCond) +
                   s(Developed_Change) +
                   s(Agri_Change)+
                   s(WWC5AGEYears)+
                   s(county, bs = "re"),  
                 data = train_data,
                 method = "REML") # Restricted Maximum Likelihood # estimate the variance components (like the random effects and residual error) in mixed models 
summary(gam_model)

Family: gaussian 
Link function: identity 

Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) + 
    s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) + 
    s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) + 
    s(Developed_Change) + s(Agri_Change) + s(WWC5AGEYears) + 
    s(county, bs = "re")

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.01092    0.04018   25.16   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
                                         edf    Ref.df      F  p-value    
s(wizard_sites_last_year_data_WTE) 5.226e+00 6.290e+00 12.028  < 2e-16 ***
s(Mean_WellDepth)                  2.151e+00 2.380e+00  7.058 0.000703 ***
s(avg_sand)                        2.649e-05 4.151e-05  0.003 0.999739    
s(avg_clay)                        1.000e+00 1.000e+00  3.777 0.053903 .  
s(avg_silt)                        2.231e+00 2.735e+00  1.981 0.140752    
s(wizard_sites_SD_data)            2.913e+00 3.644e+00  2.057 0.132801    
s(mean_Chloride)                   1.963e+00 2.458e+00  5.887 0.001607 ** 
s(mean_SpecCond)                   1.000e+00 1.000e+00  9.456 0.002512 ** 
s(sd_Chloride)                     1.000e+00 1.000e+00  8.800 0.003524 ** 
s(sd_SpecCond)                     1.000e+00 1.000e+00  3.560 0.061176 .  
s(Developed_Change)                1.824e+00 2.046e+00  1.840 0.175338    
s(Agri_Change)                     1.000e+00 1.000e+00  0.018 0.894285    
s(WWC5AGEYears)                    5.481e+00 6.457e+00  2.716 0.012401 *  
s(county)                          2.682e-05 4.000e+00  0.000 0.318336    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Rank: 122/123
R-sq.(adj) =   0.66   Deviance explained = 71.3%
-REML = 171.07  Scale est. = 0.27926   n = 173
# 1st num regression coefficient , 2nd num standard error of the coefficient estimate, *** means p-value < 0.001, which indicates the effect is highly significant.
# 
# stargazer(gam_model,type="text",
#           digits=2,
#           star.cutoffs = c(.05,.01,.001),
#           digit.separator = "")

# The model predicts log-transformed nitrate using smooth curves (splines) for each predictor and accounts for differences across counties as a random effect.
# 
# The model explains about 53.9% of the deviance (similar to R²) and has an adjusted R² of 0.493, indicating a decent fit.
# 
# wizard_sites_last_year_data_WTE, Mean_WellDepth, mean_Chloride, and county were statistically significant predictors (p < 0.05).
# 
# Other predictors like avg_sand, sd_Chloride, and wizard_sites_SD_data didn’t show strong effects (p > 0.1).
# 
# The smooth term for county was significant, confirming that including it as a random effect helps account for regional variability.












test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)

gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
           label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
           hjust = 0, vjust = 1, size = 5) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# QQ plot
qqnorm(resid(gam_model))
qqline(resid(gam_model))

5. Multiple linear model

model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

mlr_model <- lm(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
                  Mean_WellDepth + WWC5AGEYears+ avg_sand + avg_silt + avg_clay +
                  wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
                  sd_Chloride + sd_SpecCond+Agri_Change,
                data = train_data)
#summary(mlr_model)
# 1st num regression coefficient , 2nd num standard error of the coefficient estimate, *** means p-value < 0.001, which indicates the effect is highly significant.
stargazer(mlr_model,type="text",
          digits=2,
          star.cutoffs = c(.05,.01,.001),
          digit.separator = "")

=============================================================
                                     Dependent variable:     
                                -----------------------------
                                        mean_nitrate         
-------------------------------------------------------------
wizard_sites_last_year_data_WTE            2.19***           
                                           (0.39)            
                                                             
Developed_Change                            0.37             
                                           (0.38)            
                                                             
Mean_WellDepth                              -0.05            
                                           (0.27)            
                                                             
WWC5AGEYears                                0.43             
                                           (0.31)            
                                                             
avg_sand                                    -1.70            
                                           (1.79)            
                                                             
avg_silt                                    -1.30            
                                           (1.80)            
                                                             
avg_clay                                                     
                                                             
                                                             
wizard_sites_SD_data                      -1.26***           
                                           (0.34)            
                                                             
mean_Chloride                              8.74**            
                                           (2.96)            
                                                             
mean_SpecCond                             -10.51***          
                                           (2.97)            
                                                             
sd_Chloride                                 0.09             
                                           (0.50)            
                                                             
sd_SpecCond                                 0.14             
                                           (0.51)            
                                                             
Agri_Change                                 -0.05            
                                           (0.39)            
                                                             
Constant                                   3.44***           
                                           (0.29)            
                                                             
-------------------------------------------------------------
Observations                                 173             
R2                                          0.37             
Adjusted R2                                 0.33             
Residual Std. Error                    3.73 (df = 160)       
F Statistic                        7.96*** (df = 12; 160)    
=============================================================
Note:                           *p<0.05; **p<0.01; ***p<0.001
# 
# The model explains about 27% of the variance in mean nitrate values (R² = 0.274), with a residual error of around 3.64.
# 
# The variable wizard_sites_last_year_data_WTE has the strongest positive effect and is highly significant.
# 
# wizard_sites_SD_data and sd_SpecCond also show significant relationships; mean_SpecCond is weakly significant and negative.
# 
# The variable avg_clay was automatically removed due to perfect multicollinearity with other predictors.
# 
# Several predictors like Developed_Change, Mean_WellDepth, and Agri_Change were not statistically significant.
# 






test_data$mlr_predicted <- predict(mlr_model, newdata = test_data)

mlr_r2 <- round(cor(test_data$mean_nitrate, test_data$mlr_predicted)^2, 3)
mlr_rmse <- round(rmse(test_data$mean_nitrate, test_data$mlr_predicted), 3)



lim_range <- range(c(test_data$mean_nitrate, test_data$mlr_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = mlr_predicted)) +
  geom_point(alpha = 0.7, color = "darkgreen") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "MLR: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", mlr_r2, "\nRMSE = ", mlr_rmse),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# QQ plot of residuals
qqnorm(resid(mlr_model))
qqline(resid(mlr_model))

6. Comparison of Multiple linear model

model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No WTE",
    "No Mean Well Depth",
    "No SD Data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Depth and WTE",
    "No Age"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond+Agri_Change,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_clay + WWC5AGEYears+ wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_silt + avg_clay + WWC5AGEYears+ wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
      avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
      mean_SpecCond + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
      mean_Chloride + sd_Chloride,
    
    mean_nitrate ~ Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
      avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
      mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_sand + avg_silt + WWC5AGEYears+ avg_clay + mean_Chloride + mean_SpecCond +
      sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
      avg_clay + WWC5AGEYears+ wizard_sites_SD_data + mean_Chloride +
      mean_SpecCond + sd_Chloride + sd_SpecCond,
    
    mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change +
      Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data,
    
    mean_nitrate ~ Developed_Change + WWC5AGEYears+ avg_sand + avg_silt + avg_clay +
      wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
      sd_Chloride + sd_SpecCond,
     mean_nitrate ~ Developed_Change + avg_sand + avg_silt + avg_clay +
      wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
      sd_Chloride + sd_SpecCond
  )
)

model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lm(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "MLR Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +  
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")


p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "MLR Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

7. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy

library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)

model_data_scaled_1<- model_data_scaled
set.seed(123)  
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)

train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
                           Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
                           wizard_sites_SD_data + mean_Chloride + mean_SpecCond + WWC5AGEYears+ Agri_Change,
                         data = train_data,
                         ntree = 500, importance = TRUE)
print(rf_model)

Call:
 randomForest(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +      Developed_Change + Mean_WellDepth + avg_sand + avg_clay +      avg_silt + sd_Chloride + sd_SpecCond + wizard_sites_SD_data +      mean_Chloride + mean_SpecCond + WWC5AGEYears + Agri_Change,      data = train_data, ntree = 500, importance = TRUE) 
               Type of random forest: regression
                     Number of trees: 500
No. of variables tried at each split: 4

          Mean of squared residuals: 8.927819
                    % Var explained: 56.63
importance(rf_model)
                                  %IncMSE IncNodePurity
wizard_sites_last_year_data_WTE 22.369779     746.51622
Developed_Change                 5.669976     100.18938
Mean_WellDepth                  16.240461     554.84470
avg_sand                         6.576424     147.54888
avg_clay                         6.297357     136.97875
avg_silt                         7.336557     160.19089
sd_Chloride                      8.166566     156.77181
sd_SpecCond                      7.580271     200.40126
wizard_sites_SD_data            14.723644     332.97902
mean_Chloride                   11.442131     320.88192
mean_SpecCond                    8.708503     246.95999
WWC5AGEYears                     7.946512     234.03414
Agri_Change                      3.427808      93.65367
# 
# Top 3 most important predictors based on %IncMSE are:
# 
# wizard_sites_last_year_data_WTE (24.4%),
# 
# Mean_WellDepth (21.5%),
# 
# mean_Chloride (13.6%).
# 
# Agri_Change has very low importance (0.2%), suggesting it adds little predictive value.
# 
# IncNodePurity shows how much a variable helps in reducing overall variance — used to assess relative contribution.
# 
# Overall, WTE, depth, and water quality variables (like chloride, spec cond) are driving prediction strength in  RF model.
# 








test_data$rf_predicted <- predict(rf_model, newdata = test_data)

r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
  geom_point(alpha = 0.7, color = "steelblue", size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
           size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# Residuals
resid_rf <- test_data$mean_nitrate - predict(rf_model, newdata = test_data)

# # QQ plot
# qqnorm(resid_rf)
# qqline(resid_rf)





# Training data Testing data MSE % variation explained MSE % variation explained #IOWA paper

train_preds <- predict(rf_model, newdata = train_data)
mse_train <- mean((train_data$mean_nitrate - train_preds)^2)
r2_train <- cor(train_data$mean_nitrate, train_preds)^2 * 100

test_preds <- predict(rf_model, newdata = test_data)
mse_test <- mean((test_data$mean_nitrate - test_preds)^2)
r2_test <- cor(test_data$mean_nitrate, test_preds)^2 * 100

rf_performance <- tibble(
  Dataset = c("Training", "Testing"),
  MSE = c(mse_train, mse_test),
  `% Variance Explained` = c(r2_train, r2_test)
)
rf_performance %>%
  gt() %>%
  tab_header(title = "Random Forest Model Performance") %>%
  gt::fmt_number(columns = c(MSE, `% Variance Explained`), decimals = 2)
Random Forest Model Performance
Dataset MSE % Variance Explained
Training 1.69 94.19
Testing 9.36 63.67

Best models so far

Best Models So Far (With Well Age in eq: 217 wells)
Model R2 RMSE
Linear mixed model using county 0.475 3.643
Log-transformed linear mixed model 0.507 3.820
Generalized additive model (GAM) 0.586 3.353
Multiple linear regression 0.453 3.759
Random forest model 0.637 3.059
Gamma-distributed generalized linear model 0.380 4.100
Best Model: Data Filtered by Age But Age Not Used in Model (217 wells)
Model R2 RMSE
Linear mixed model using county 0.465 3.669
Log-transformed linear mixed model 0.501 3.842
Generalized additive model (GAM) 0.544 3.607
Multiple linear regression 0.436 3.804
Random forest model 0.617 3.130
Gamma-distributed generalized linear model 0.372 4.042
Best Model without Age (362 wells)
Model R2 RMSE
Linear mixed model using county 0.403 3.156
Log-transformed linear mixed model 0.463 3.257
Generalized additive model (GAM) 0.378 3.278
Multiple linear regression 0.371 3.246
Random forest model 0.580 2.671
Gamma-distributed generalized linear model 0.503 2.880
Model Performance Comparison (With Age vs. Filtered vs. Without Age)
Model R2 (With Age) R2 (Filtered, Age Not Used) R2 (Without Age) RMSE (With Age) RMSE (Filtered, Age Not Used) RMSE (Without Age)
Linear mixed model using county 0.475 0.465 0.403 3.643 3.669 3.156
Log-transformed linear mixed model 0.507 0.501 0.463 3.820 3.842 3.257
Generalized additive model (GAM) 0.586 0.544 0.378 3.353 3.607 3.278
Multiple linear regression 0.453 0.436 0.371 3.759 3.804 3.246
Random forest model 0.637 0.617 0.580 3.059 3.130 2.671
Gamma-distributed generalized linear model 0.380 0.372 0.503 4.100 4.042 2.880