Mean Nitrate Prediction
Scale the data:
numeric_predictors <- model_data %>%
ungroup() %>%
select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
Mean_WellDepth, avg_sand, avg_silt,
wizard_sites_SD_data, mean_Chloride, mean_SpecCond)
scaled_predictors <- scale(numeric_predictors)
model_data_scaled <- model_data %>%
select(Well_ID, cluster, mean_nitrate, sd_nitrate) %>%
bind_cols(as_tibble(scaled_predictors))1. Observed vs. predicted mean nitrate concentrations using a linear mixed-effects model with Agri_Change as a random effect and cluster as a fixed effect.
library(lme4)
library(ggplot2)
library(Metrics)
library(dplyr)
# Step 1: Prepare data and split
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
# Step 2: Fit model with cluster as fixed effect
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond + cluster +
(1 | Agri_Change),
data = train_data)
# Step 3: Predict on test set
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
# Step 4: Calculate performance metrics
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
# Step 5: Plot limits
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
# Step 6: Plot observed vs. predicted
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Cluster as Fixed Effect): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
size = 5, color = "black") +
theme_minimal(base_size = 20) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)2. Linear mixed-effects model excluding cluster, using Agri_Change as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.
library(lme4)
library(ggplot2)
library(Metrics)
# Step 1: Set up data and split
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
# Step 2: Fit LMM on training data
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
(1 | Agri_Change),
data = train_data)
# Step 3: Predict on test data
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
# Step 4: Evaluate performance
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
# Step 5: Plot limits
lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)
# Step 6: Plot
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
labs(
title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
theme_minimal(base_size = 20) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
hjust = 0, vjust = 1.2, size = 5, color = "black") +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)3. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance.
library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)
# Step 1: Split data once
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No wizard_sites_last_year_data_WTE",
"No Mean Well Depth",
"No wizard_sites_SD_data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Well Depth and No WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + (1 | Agri_Change),
mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change), # No sand + silt
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | Agri_Change), # No chloride + speccond
mean_nitrate ~ Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change) # No WTE + depth
)
)
# Step 2: Fit and evaluate each model using train → predict on test
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lmer(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
# Step 3: Prepare data for plotting
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
# Step 4: Plot R²
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none") +
ylim(0, 1)
# Step 5: Plot RMSE
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
# Step 6: Combine plots
p_r2 / p_rmse4. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability.
library(lme4)
library(ggplot2)
library(Metrics)
# Step 1: Create log-transformed response and split data
model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
# Step 2: Fit the LMM on training data
lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
Mean_WellDepth + avg_sand +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
(1 | Agri_Change),
data = train_data)
# Step 3: Predict on test set and back-transform
test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log) * 2 # apply scaling factor if intended
# Step 4: Evaluate model performance
r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
# Step 5: Define plot limits
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
# Step 6: Plot
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
geom_point(alpha = 0.7, size = 2, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)5. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(Agri_Change, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.
model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)
# Step 1: Split data
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
# Step 2: Log-transform target variable
train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate <- log1p(test_data$mean_nitrate)
# Step 3: Fit GAM model on training data
gam_model <- gam(log_mean_nitrate ~
s(wizard_sites_last_year_data_WTE) +
s(Mean_WellDepth) +
s(avg_sand) +
s(wizard_sites_SD_data) +
s(mean_Chloride) +
s(mean_SpecCond) +
s(Agri_Change, bs = "re"),
data = train_data,
method = "REML")
# Step 4: Predict on test set and back-transform
test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)
# Step 5: Evaluate performance
gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)
# Step 6: Plot
lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
hjust = 0, vjust = 1, size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)6. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy
library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)
# Step 1: Split the data
set.seed(123) # for reproducibility
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
# Step 2: Fit RF model on training data
rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +
Mean_WellDepth + avg_sand +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
data = train_data,
ntree = 5000, importance = TRUE)
# Step 3: Predict on test set
test_data$rf_predicted <- predict(rf_model, newdata = test_data)
# Step 4: Compute performance metrics
r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)
# Step 5: Plot observed vs. predicted
lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
geom_point(alpha = 0.7, color = "steelblue", size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)7. Gamma-distributed Generalized Linear Model (GLM) with a log link to model skewed nitrate concentrations. Predictions are directly on the original scale and the model emphasizes interpretability under distributional assumptions.
model_data_scaled_1<- model_data_scaled
library(ggplot2)
library(Metrics)
# Step 1: Split the data (same as before)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
# Step 2: Fit GLM with Gamma distribution on training data
glm_model <- glm(mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond,
data = train_data,
family = Gamma(link = "log"))
# Step 3: Predict on test data (use type = "response" to get values on original scale)
test_data$predicted_glm <- predict(glm_model, newdata = test_data, type = "response")
# Step 4: Evaluate performance
r2_glm <- round(cor(test_data$mean_nitrate, test_data$predicted_glm)^2, 3)
rmse_glm <- round(rmse(test_data$mean_nitrate, test_data$predicted_glm), 3)
# Step 5: Plot
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_glm), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate", y = "Predicted Mean Nitrate") +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)