# Mean Nitrate Prediction: Removing Outliers
# Explore Data ----
# Packages needed by the pipes, reshaping and plotting below; loaded here
# because nothing above this point attaches them.
library(dplyr)
library(tidyr)
library(ggplot2)

# Project folders.
# NOTE(review): absolute, machine-specific paths -- the script only runs
# where these directories exist.
file_Path_Variable_O <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"

# Load the modelling table from the output folder.
model_data <- readRDS(
  file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds")
)

# Long format: one row per (variable, value) for every numeric column except
# the coordinates and the encoded nitrate trend.
model_data_long <- model_data %>%
  ungroup() %>%
  select(where(is.numeric)) %>%
  select(-Lat.x, -Lon.x, -Trend_Nitrate_Encoded) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")

# One boxplot per variable; free scales because the variables are not on a
# common range.
ggplot(model_data_long, aes(y = value)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(
    title = "Boxplots of All Numeric Variables",
    y = "Value", x = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Removing Outliers ----
# Reload the unfiltered table so the outlier filter always starts from the
# data on disk.
model_data <- readRDS(
  file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds")
)

# Keep rows with mean_nitrate <= 17 and Mean_WellDepth <= 1500.
# filter() also drops rows where either comparison evaluates to NA.
model_data <- model_data %>%
  filter(mean_nitrate <= 17, Mean_WellDepth <= 1500)

# Same long-format reshape as for the exploratory boxplots, now on the
# filtered data.
model_data_long <- model_data %>%
  ungroup() %>%
  select(where(is.numeric)) %>%
  select(-Lat.x, -Lon.x, -Trend_Nitrate_Encoded) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")

ggplot(model_data_long, aes(y = value)) +
  geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
  facet_wrap(~variable, scales = "free", ncol = 4) +
  labs(
    title = "Boxplots of All Numeric Variables\nif outliers are removed from nitrate, well depth",
    y = "Value", x = ""
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

# Scale the data ----
# Predictor columns to standardise.
numeric_predictors <- model_data %>%
  ungroup() %>%
  select(
    wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
    Mean_WellDepth, avg_sand, avg_silt, avg_clay,
    wizard_sites_SD_data, mean_Chloride, mean_SpecCond, sd_Chloride, sd_SpecCond
  )

# scale() centres each column on its mean and divides by its standard
# deviation, returning a matrix.
# NOTE(review): this uses all rows, i.e. before the train/test splits made
# further down, so test rows contribute to the centring/scaling constants.
scaled_predictors <- scale(numeric_predictors)

# Identifiers and responses (unscaled) alongside the scaled predictors.
model_data_scaled <- model_data %>%
  select(Well_ID, cluster, mean_nitrate, sd_nitrate) %>%
  bind_cols(as_tibble(scaled_predictors))

# 1. Observed vs. predicted mean nitrate concentrations using a linear
#    mixed-effects model with Agri_Change as a random effect and cluster as a
#    fixed effect. ----
# lmer() and rmse() are first used here, so attach their packages before the
# model is fitted.
library(lme4)
library(Metrics)
library(ggplot2)

model_data_scaled_1 <- model_data_scaled

# Reproducible 80/20 train/test split.
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]

# Fixed effects: all scaled predictors plus cluster; random intercept grouped
# by Agri_Change.
# NOTE(review): Agri_Change is one of the scaled numeric columns, so every
# distinct value becomes its own grouping level -- confirm this is intended.
lmm_mean <- lmer(
  mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
    Mean_WellDepth + avg_sand + avg_silt + avg_clay +
    wizard_sites_SD_data + mean_Chloride + sd_Chloride + sd_SpecCond +
    mean_SpecCond + cluster +
    (1 | Agri_Change),
  data = train_data
)

# allow.new.levels = TRUE lets predict() handle Agri_Change values that did
# not occur in the training rows.
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)

# Test-set metrics: squared Pearson correlation and RMSE, rounded for display.
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

# Shared axis limits so the 1:1 line runs corner to corner.
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Cluster as Fixed Effect): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text",
    x = min(lim_range), y = max(lim_range),
    hjust = 0, vjust = 1,
    label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
    size = 5, color = "black"
  ) +
  theme_minimal(base_size = 20) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# 2. Linear mixed-effects model excluding cluster, using Agri_Change as a
#    random intercept. This model captures broader group-level variability
#    without overfitting to sub-group structures. ----
library(lme4)
library(ggplot2)
library(Metrics)
model_data_scaled_1 <- model_data_scaled

# Reproducible 80/20 train/test split (same seed as the other sections, so
# the same rows are held out).
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]

# Same fixed effects as the cluster model but without `cluster`; random
# intercept grouped by Agri_Change.
lmm_mean <- lmer(
  mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
    Mean_WellDepth + avg_sand + avg_silt + avg_clay +
    wizard_sites_SD_data + mean_Chloride + mean_SpecCond + sd_Chloride +
    sd_SpecCond +
    (1 | Agri_Change),
  data = train_data
)

# allow.new.levels = TRUE lets predict() handle Agri_Change values that did
# not occur in the training rows.
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)

# Test-set metrics, rounded for display.
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

# Shared axis limits for a square observed-vs-predicted plot.
lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)

# Points are still coloured by cluster even though cluster is not in the model.
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
  geom_point(alpha = 0.7, size = 2) +
  geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
  labs(
    title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  theme_minimal(base_size = 20) +
  scale_color_manual(values = c("blue", "yellow", "gray")) +
  annotate("text",
    x = min(lim_range), y = max(lim_range),
    label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
    hjust = 0, vjust = 1.2, size = 5, color = "black"
  ) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# 3. Comparison of R² and RMSE across different mixed models omitting
#    individual or combinations of predictors. This sensitivity analysis
#    highlights which variables most influence model performance. ----
library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)
# Reproducible 80/20 train/test split.
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]

# Fixed-effect predictors of the full model. Every variant is derived from
# this single vector so that a variant differs from the full model ONLY by the
# terms named in its label. (The previous hand-written formulas also dropped
# avg_clay from every variant, and sd_Chloride/sd_SpecCond from the "No WTE"
# variant, which confounded the comparison.)
# NOTE(review): if avg_sand, avg_silt and avg_clay sum to a constant they are
# collinear and lmer() will drop a column -- confirm against the data.
full_predictors <- c(
  "wizard_sites_last_year_data_WTE", "Developed_Change", "Mean_WellDepth",
  "avg_sand", "avg_silt", "avg_clay", "wizard_sites_SD_data",
  "mean_Chloride", "mean_SpecCond", "sd_Chloride", "sd_SpecCond"
)

# Variant label -> predictors removed from the full model. As before, the
# "Chloride"/"SpecCond" variants remove only the mean_* columns; the sd_*
# columns stay in.
dropped_terms <- list(
  "Full Model" = character(0),
  "No Silt" = "avg_silt",
  "No Sand" = "avg_sand",
  "No Developed_Change" = "Developed_Change",
  "No Chloride" = "mean_Chloride",
  "No SpecCond" = "mean_SpecCond",
  "No wizard_sites_last_year_data_WTE" = "wizard_sites_last_year_data_WTE",
  "No Mean Well Depth" = "Mean_WellDepth",
  "No wizard_sites_SD_data" = "wizard_sites_SD_data",
  "No Silt and Sand" = c("avg_sand", "avg_silt"),
  "No Chloride and SpecCond" = c("mean_Chloride", "mean_SpecCond"),
  "No Mean Well Depth and No WTE" = c(
    "Mean_WellDepth", "wizard_sites_last_year_data_WTE"
  )
)

# Guard against typos: every dropped name must be a real predictor.
stopifnot(all(unlist(dropped_terms) %in% full_predictors))

# Build `mean_nitrate ~ <remaining predictors> + (1 | Agri_Change)`.
build_formula <- function(dropped) {
  reformulate(
    c(setdiff(full_predictors, dropped), "(1 | Agri_Change)"),
    response = "mean_nitrate"
  )
}

model_formulas <- tibble::tibble(
  model_name = names(dropped_terms),
  formula = unname(lapply(dropped_terms, build_formula))
)

# Fit each variant on the training rows and score it on the held-out rows.
model_results <- model_formulas %>%
  mutate(
    model_fit = map(formula, ~ lmer(.x, data = train_data)),
    predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
    r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
    rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
  )

# Long format so each metric can be filtered into its own panel.
plot_data <- model_results %>%
  select(model_name, r2, rmse) %>%
  pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")

# R² panel: bars ordered by increasing R².
p_r2 <- plot_data %>%
  filter(metric == "r2") %>%
  ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
  scale_fill_manual(values = c("r2" = "steelblue")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none") +
  ylim(0, 1)

# RMSE panel: bars ordered by decreasing RMSE.
p_rmse <- plot_data %>%
  filter(metric == "rmse") %>%
  ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
  geom_col(width = 0.7) +
  labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
  scale_fill_manual(values = c("rmse" = "tomato")) +
  theme_minimal(base_size = 16) +
  coord_flip() +
  theme(legend.position = "none")

# patchwork: stack the two panels vertically.
p_r2 / p_rmse

# 4. Using a log-transformed linear mixed-effects model. The model was fit
#    using log1p(mean_nitrate) as the response to account for skewness and
#    stabilize variance. Predictions were back-transformed using expm1() to
#    the original scale for interpretability. ----
model_data_scaled_1 <- model_data_scaled

# Response on the log1p scale; log1p() is defined at zero.
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)

# Reproducible 80/20 train/test split.
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]

# Mixed model on the log1p response; random intercept grouped by Agri_Change.
lmm_log <- lmer(
  log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
    Mean_WellDepth + avg_sand + avg_clay + avg_silt + Developed_Change +
    wizard_sites_SD_data + mean_Chloride + sd_Chloride + sd_SpecCond +
    mean_SpecCond +
    (1 | Agri_Change),
  data = train_data
)

test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)

# Back-transform with expm1(), the exact inverse of log1p(). An earlier ad hoc
# `* 2` multiplier was removed: it doubled every prediction, which distorts
# RMSE and the observed-vs-predicted plot and is not part of the inverse
# transform.
test_data$predicted_mean <- expm1(test_data$predicted_log)

# Test-set metrics on the original scale, rounded for display.
r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)

# Shared axis limits for a square observed-vs-predicted plot.
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
  geom_point(alpha = 0.7, size = 2, color = "steelblue") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  labs(
    title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text",
    x = min(lim_range), y = max(lim_range),
    label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
    hjust = 0, vjust = 1, size = 5, color = "black"
  ) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# 5. Generalized Additive Model (GAM) with smooth spline terms for continuous
#    predictors and a random effect (s(Agri_Change, bs = "re")). This flexible
#    model captures nonlinear relationships while accommodating group-level
#    variance. ----
model_data_scaled_1 <- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)

# Reproducible 80/20 train/test split.
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]

# Response on the log1p scale, added to both splits.
train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate <- log1p(test_data$mean_nitrate)

# One smooth per continuous predictor plus a "re" term on Agri_Change,
# estimated by REML.
# NOTE(review): Agri_Change is a scaled numeric column here; with a numeric
# covariate, mgcv's bs = "re" may not behave like lmer's (1 | Agri_Change)
# grouping -- confirm against the mgcv documentation.
gam_model <- gam(
  log_mean_nitrate ~
    s(wizard_sites_last_year_data_WTE) +
    s(Mean_WellDepth) +
    s(avg_sand) +
    s(avg_clay) +
    s(avg_silt) +
    s(wizard_sites_SD_data) +
    s(mean_Chloride) +
    s(mean_SpecCond) +
    s(sd_Chloride) +
    s(sd_SpecCond) +
    s(Developed_Change) +
    s(Agri_Change, bs = "re"),
  data = train_data,
  method = "REML"
)

# Predict on the log1p scale, then invert with expm1().
test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)

# Test-set metrics on the original scale, rounded for display.
gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)

# Shared axis limits for a square observed-vs-predicted plot.
lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  # Anchor the label to the top-left corner of the plotted range, as the other
  # model plots do, instead of a hard-coded x = 2 that can fall outside the
  # axis limits.
  annotate("text",
    x = min(lim_range), y = max(lim_range),
    label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
    hjust = 0, vjust = 1, size = 5
  ) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# 6. Random Forest model using 500 trees to predict mean nitrate
#    concentrations. This ensemble method captures complex nonlinearities and
#    interactions with high predictive accuracy. ----
library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)
# Reproducible 80/20 train/test split; the same seed also fixes the random
# forest's internal sampling.
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]

# 500-tree regression forest on all scaled predictors; Agri_Change enters as
# an ordinary predictor here. importance = TRUE stores variable-importance
# measures on the fitted object.
rf_model <- randomForest(
  mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
    Mean_WellDepth + avg_sand + avg_clay + avg_silt + sd_Chloride +
    sd_SpecCond +
    wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
  data = train_data,
  ntree = 500, importance = TRUE
)

test_data$rf_predicted <- predict(rf_model, newdata = test_data)

# Test-set metrics, rounded for display.
r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)

# Shared axis limits for a square observed-vs-predicted plot.
lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)

ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
  geom_point(alpha = 0.7, color = "steelblue", size = 2) +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate("text",
    x = min(lim_range), y = max(lim_range),
    hjust = 0, vjust = 1,
    label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
    size = 5, color = "black"
  ) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)

# 7. Gamma-distributed Generalized Linear Model (GLM) with a log link to model
#    skewed nitrate concentrations. Predictions are directly on the original
#    scale and the model emphasizes interpretability under distributional
#    assumptions. ----
# Gamma GLM (log link) on the scaled predictors, scored on a held-out 20 %.
model_data_scaled_1 <- model_data_scaled

# Seeded 80/20 split: sampled row indices train the model, the rest test it.
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]

# Agri_Change is an ordinary fixed-effect term in this model.
glm_model <- glm(
  mean_nitrate ~ wizard_sites_last_year_data_WTE +
    Developed_Change + Mean_WellDepth + avg_sand + avg_clay + avg_silt +
    wizard_sites_SD_data + mean_Chloride + sd_Chloride + sd_SpecCond +
    mean_SpecCond + Agri_Change,
  data = train_data,
  family = Gamma(link = "log")
)

# type = "response" returns predictions on the scale of mean_nitrate.
test_data$predicted_glm <- predict(
  glm_model,
  newdata = test_data,
  type = "response"
)

# Held-out fit statistics, rounded to three decimals for the plot label.
r2_glm <- round(
  cor(test_data$mean_nitrate, test_data$predicted_glm)^2,
  3
)
rmse_glm <- round(
  rmse(test_data$mean_nitrate, test_data$predicted_glm),
  3
)

# Common limits for both axes, spanning observed and predicted values.
lim_range <- range(
  c(test_data$mean_nitrate, test_data$predicted_glm),
  na.rm = TRUE
)

# Observed vs. predicted scatter with a 1:1 reference line (red, dotted) and
# a least-squares trend line (black).
ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
  geom_point(alpha = 0.7, color = "steelblue") +
  geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  labs(
    title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
    x = "Observed Mean Nitrate",
    y = "Predicted Mean Nitrate"
  ) +
  annotate(
    "text",
    x = min(lim_range),
    y = max(lim_range),
    hjust = 0,
    vjust = 1,
    label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
    size = 5
  ) +
  theme_minimal(base_size = 16) +
  coord_fixed() +
  xlim(lim_range) +
  ylim(lim_range)