Mean Nitrate Prediction_Removing Outliers

Explore Data

file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"

model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))

model_data_long <- model_data %>%
  ungroup() %>%
  select(where(is.numeric)) %>%
 select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")


ggplot(model_data_long, aes(y = value)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(title = "Boxplots of All Numeric Variables",
       y = "Value", x = "") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

Removing Outliers

model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))
model_data <- model_data %>%
  filter(mean_nitrate <= 17, Mean_WellDepth <= 1500)


model_data_long <- model_data %>%
  ungroup() %>%
  select(where(is.numeric)) %>%
  select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")


ggplot(model_data_long, aes(y = value)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(title = "Boxplots of All Numeric Variables\nif outliers are removed from nitrate, well depth",
       y = "Value", x = "") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

Scale the data:

numeric_predictors <- model_data %>%
  ungroup() %>% 
  select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
         Mean_WellDepth, avg_sand, avg_silt,avg_clay,
         wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond)
scaled_predictors <- scale(numeric_predictors)

model_data_scaled <- model_data %>%
  select(Well_ID, cluster, mean_nitrate, sd_nitrate) %>%  
  bind_cols(as_tibble(scaled_predictors))

1. Observed vs. predicted mean nitrate concentrations using a linear mixed-effects model with Agri_Change as a random effect and cluster as a fixed effect.

model_data_scaled_1 <- model_data_scaled

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
                   Mean_WellDepth + avg_sand + avg_silt +avg_clay +
                   wizard_sites_SD_data + mean_Chloride +sd_Chloride+sd_SpecCond +mean_SpecCond + cluster +
                   (1 | Agri_Change),  
                 data = train_data)

test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Cluster as Fixed Effect): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
           size = 5, color = "black") +
  theme_minimal(base_size = 20) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

2. Linear mixed-effects model excluding cluster, using Agri_Change as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.

library(lme4)
library(ggplot2)
library(Metrics)

model_data_scaled_1 <- model_data_scaled

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE  + Developed_Change +
                   Mean_WellDepth  + avg_sand + avg_silt +avg_clay +
                   wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
                   (1 | Agri_Change),  
                 data = train_data)

test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)

r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
  labs(
    title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  theme_minimal(base_size = 20) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
           hjust = 0, vjust = 1.2, size = 5, color = "black") +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

3. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance.

library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
  model_name = c(
    "Full Model",
    "No Silt",
    "No Sand",
    "No Developed_Change",
    "No Chloride",
    "No SpecCond",
    "No wizard_sites_last_year_data_WTE", 
    "No Mean Well Depth",
    "No wizard_sites_SD_data",
    "No Silt and Sand",
    "No Chloride and SpecCond",
    "No Mean Well Depth and No WTE"
  ),
  formula = list(
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | Agri_Change),
    mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | Agri_Change),
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | Agri_Change),  # No sand + silt
    mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | Agri_Change),  # No chloride + speccond
    mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change)  # No WTE + depth
  )
)
model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lmer(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none") +
  ylim(0, 1)

p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

p_r2 / p_rmse

4. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability.

model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
                  Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
                  wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond +
                  (1 | Agri_Change),
                data = train_data)

test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log) * 2  # apply scaling factor if intended

r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
  geom_point(alpha = 0.7, size = 2, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
           hjust = 0, vjust = 1, size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

5. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(Agri_Change, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.

model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)

set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate  <- log1p(test_data$mean_nitrate)

gam_model <- gam(log_mean_nitrate ~ 
                   s(wizard_sites_last_year_data_WTE) +
                   s(Mean_WellDepth) +
                   s(avg_sand) +
                    s(avg_clay) +
                    s(avg_silt) +
                   s(wizard_sites_SD_data) +
                   s(mean_Chloride) +
                   s(mean_SpecCond) +
                   s(sd_Chloride) +
                   s(sd_SpecCond) +
                   s(Developed_Change) +
                   s(Agri_Change, bs = "re"),  
                 data = train_data,
                 method = "REML")

test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)

gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
           label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
           hjust = 0, vjust = 1, size = 5) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

6. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy

library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)

set.seed(123)  
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)

train_data <- model_data_scaled_1[train_idx, ]
test_data  <- model_data_scaled_1[-train_idx, ]

rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
                           Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
                           wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
                         data = train_data,
                         ntree = 500, importance = TRUE)

test_data$rf_predicted <- predict(rf_model, newdata = test_data)

r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)

lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
  geom_point(alpha = 0.7, color = "steelblue", size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text", x = min(lim_range), y = max(lim_range),
           hjust = 0, vjust = 1,
           label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
           size = 5, color = "black") +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)