Mean Nitrate Prediction_Updated
Explore Data
file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"
model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))
model_data <- model_data %>%
filter(!is.na(WWC5AGEYears))
model_data_long <- model_data %>%
ungroup() %>%
select(where(is.numeric)) %>%
select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
ggplot(model_data_long, aes(y = value)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
facet_wrap(~variable, scales = "free", ncol = 4) +
labs(title = "Boxplots of All Numeric Variables",
y = "Value", x = "") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())Add counties information to the dataset
counties <- counties(state = "KS", year = 2022, class = "sf")
wells_sf <- model_data %>%
st_as_sf(coords = c("Lon.x", "Lat.x"), crs = 4326)
counties_transformed <- st_transform(counties, crs = 4326)
wells_with_county <- st_join(wells_sf, counties_transformed["NAME"])
model_data <- wells_with_county %>%
st_drop_geometry() %>%
rename(county = NAME)
wells_bbox <- st_bbox(wells_sf)
library(viridis)
counties_with_wells <- st_filter(counties_transformed, wells_sf)
ggplot() +
geom_sf(data = counties_with_wells, fill = "white", color = "black", size = 0.3) +
geom_sf(data = wells_sf, aes(color = mean_nitrate), size = 2) +
geom_sf_text(data = counties_with_wells, aes(label = NAME), size = 3, check_overlap = TRUE, color = "gray30") +
coord_sf(xlim = c(wells_bbox$xmin, wells_bbox$xmax),
ylim = c(wells_bbox$ymin, wells_bbox$ymax),
expand = TRUE) +
scale_color_viridis_c(option = "C", name = "Mean Nitrate") +
labs(title = "Wells and Counties Containing Them") +
theme_minimal()Scale the data: standardization (scale()) to center all predictors around 0 with a standard deviation of 1.((x- mean)/sd)
numeric_predictors <- model_data %>%
ungroup() %>%
select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
Mean_WellDepth, avg_sand, avg_silt,avg_clay,
wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond,WWC5AGEYears)
scaled_predictors <- scale(numeric_predictors)
model_data_scaled <- model_data %>%
select(Well_ID, cluster, mean_nitrate, sd_nitrate,county) %>%
bind_cols(as_tibble(scaled_predictors))
model_data_scaled$county <- as.factor(model_data_scaled$county)
county_counts <- model_data_scaled %>%
count(county)
county_counts# A tibble: 5 × 2
county n
<fct> <int>
1 Harvey 91
2 McPherson 16
3 Reno 99
4 Rice 2
5 Sedgwick 9
boxplot(mean_nitrate ~ county, data = model_data_scaled,
las = 2,
col = "lightblue",
main = "Mean Nitrate by County",
xlab = "County",
ylab = "Mean Nitrate")
text(x = 1:nrow(county_counts),
y = par("usr")[3] +20,
labels = county_counts$n,
xpd = TRUE,
srt = 90,
adj = 1,
cex = 0.8,
col = "darkblue")Updated Wednesday, 14th May
using counties as random effect
1. Linear mixed-effects model excluding cluster, using Counties as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.
library(lme4)
library(ggplot2)
library(Metrics)
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +avg_clay +
wizard_sites_SD_data + Agri_Change+mean_Chloride + WWC5AGEYears+mean_SpecCond+sd_Chloride+sd_SpecCond +
(1 | county),
data = train_data)
#summary(lmm_mean)
stargazer(lmm_mean,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 1.56*
(0.61)
Developed_Change -0.04
(0.41)
Mean_WellDepth -0.10
(0.26)
avg_sand -2.06
(1.76)
avg_silt -1.75
(1.82)
wizard_sites_SD_data -0.39
(0.41)
Agri_Change -0.01
(0.37)
mean_Chloride 9.71***
(2.92)
WWC5AGEYears 0.26
(0.30)
mean_SpecCond -11.48***
(2.94)
sd_Chloride -0.03
(0.48)
sd_SpecCond 0.15
(0.49)
Constant 3.22*
(1.51)
-------------------------------------------------------------
Observations 173
Log Likelihood -461.43
Akaike Inf. Crit. 952.86
Bayesian Inf. Crit. 1000.16
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# The model predicts mean nitrate levels using well, soil, and land use features, while accounting for differences across counties.
#
# The random effect for county shows some variation across locations (county variance = 6.45), but most variability is still within counties (residual variance = 12.14). #about one-third of the variation in mean nitrate levels is associated with differences across counties, even after accounting for the predictors in the model.
#
# wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are the most influential predictors.
#
# Some predictors like Developed_Change, Agri_Change, and wizard_sites_SD_data don’t seem to have strong effects (low t-values).
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
labs(
title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
theme_minimal(base_size = 20) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
hjust = 0, vjust = 1.2, size = 5, color = "black") +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# residul plot
plot(lmm_mean)# The ideal plot would have points randomly scattered around the horizontal line at 0, with no clear pattern.
#
# this plot shows a clear curve and funnel shape
# qqplot
qqnorm(resid(lmm_mean))
qqline(resid(lmm_mean))#
# Since many points deviate from the line, especially at the ends (tails), this suggests non-normality in residuals — particularly heavier tails (possible outliers) #very high or low values 2. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance. Counties as a random effect
library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No wizard_sites_last_year_data_WTE",
"No Mean Well Depth",
"No wizard_sites_SD_data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Well Depth and No WTE",
"No Age"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+Agri_Change+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+ Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+ Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + WWC5AGEYears+ Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | county),
mean_nitrate ~ Developed_Change + Mean_WellDepth + WWC5AGEYears+ avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE+ WWC5AGEYears + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county), # No sand + silt
mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | county), # No chloride + speccond
mean_nitrate ~ Developed_Change + avg_sand + WWC5AGEYears+ avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county), # No WTE + depth
mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county)
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lmer(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmse3. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability. county as a random effect
model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
Mean_WellDepth + avg_sand + WWC5AGEYears+avg_clay+avg_silt+Developed_Change+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + Agri_Change+mean_SpecCond +
(1 | county),
data = train_data)
summary(lmm_log)Linear mixed model fit by REML ['lmerMod']
Formula: log_mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + WWC5AGEYears + avg_clay + avg_silt + Developed_Change +
wizard_sites_SD_data + mean_Chloride + sd_Chloride + sd_SpecCond +
Agri_Change + mean_SpecCond + (1 | county)
Data: train_data
REML criterion at convergence: 387.2
Scaled residuals:
Min 1Q Median 3Q Max
-2.87909 -0.60853 -0.05188 0.56218 2.63632
Random effects:
Groups Name Variance Std.Dev.
county (Intercept) 0.6465 0.8040
Residual 0.4353 0.6598
Number of obs: 173, groups: county, 5
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.92265 0.38228 2.414
wizard_sites_last_year_data_WTE 0.31322 0.11725 2.671
Mean_WellDepth -0.04266 0.04821 -0.885
avg_sand 0.09708 0.18956 0.512
WWC5AGEYears 0.03136 0.05564 0.564
avg_clay 0.15888 0.17043 0.932
Developed_Change 0.03563 0.07610 0.468
wizard_sites_SD_data -0.11675 0.07669 -1.522
mean_Chloride 2.33159 0.54431 4.284
sd_Chloride 0.07027 0.08887 0.791
sd_SpecCond 0.01277 0.09145 0.140
Agri_Change 0.01884 0.06888 0.274
mean_SpecCond -2.71228 0.54801 -4.949
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
stargazer(lmm_log,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
log_mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.31**
(0.12)
Mean_WellDepth -0.04
(0.05)
avg_sand 0.10
(0.19)
WWC5AGEYears 0.03
(0.06)
avg_clay 0.16
(0.17)
Developed_Change 0.04
(0.08)
wizard_sites_SD_data -0.12
(0.08)
mean_Chloride 2.33***
(0.54)
sd_Chloride 0.07
(0.09)
sd_SpecCond 0.01
(0.09)
Agri_Change 0.02
(0.07)
mean_SpecCond -2.71***
(0.55)
Constant 0.92*
(0.38)
-------------------------------------------------------------
Observations 173
Log Likelihood -193.61
Akaike Inf. Crit. 417.21
Bayesian Inf. Crit. 464.51
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# Model Goal: Predict log-transformed nitrate using well/site/soil data while adjusting for differences across counties.
#
#
# Strong Predictors: wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are statistically important (big t-values).
#
#
# Weak/No Effect: Some variables like Developed_Change, Agri_Change, and wizard_sites_SD_data don't seem to help much here.
#
#
# Random Effect (county): There is some variation across counties (random effect SD = 0.58), but most variation is still within counties (residual SD = 0.71).
test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log)
r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
geom_point(alpha = 0.7, size = 2, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# residul plot
plot(lmm_log)#
# qqplot
qqnorm(resid(lmm_log))
qqline(resid(lmm_log))4. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(county, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.
model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate <- log1p(test_data$mean_nitrate)
train_data$county <- as.factor(train_data$county)
test_data$county <- factor(test_data$county, levels = levels(train_data$county))
gam_model <- gam(log_mean_nitrate ~
s(wizard_sites_last_year_data_WTE) +
s(Mean_WellDepth) +
s(avg_sand) +
s(avg_clay) +
s(avg_silt) +
s(wizard_sites_SD_data) +
s(mean_Chloride) +
s(mean_SpecCond) +
s(sd_Chloride) +
s(sd_SpecCond) +
s(Developed_Change) +
s(Agri_Change)+
s(WWC5AGEYears)+
s(county, bs = "re"),
data = train_data,
method = "REML") # Restricted Maximum Likelihood # estimate the variance components (like the random effects and residual error) in mixed models
summary(gam_model)
Family: gaussian
Link function: identity
Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) +
s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) +
s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) +
s(Developed_Change) + s(Agri_Change) + s(WWC5AGEYears) +
s(county, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.01092 0.04018 25.16 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(wizard_sites_last_year_data_WTE) 5.226e+00 6.290e+00 12.028 < 2e-16 ***
s(Mean_WellDepth) 2.151e+00 2.380e+00 7.058 0.000703 ***
s(avg_sand) 2.649e-05 4.151e-05 0.003 0.999739
s(avg_clay) 1.000e+00 1.000e+00 3.777 0.053903 .
s(avg_silt) 2.231e+00 2.735e+00 1.981 0.140752
s(wizard_sites_SD_data) 2.913e+00 3.644e+00 2.057 0.132801
s(mean_Chloride) 1.963e+00 2.458e+00 5.887 0.001607 **
s(mean_SpecCond) 1.000e+00 1.000e+00 9.456 0.002512 **
s(sd_Chloride) 1.000e+00 1.000e+00 8.800 0.003524 **
s(sd_SpecCond) 1.000e+00 1.000e+00 3.560 0.061176 .
s(Developed_Change) 1.824e+00 2.046e+00 1.840 0.175338
s(Agri_Change) 1.000e+00 1.000e+00 0.018 0.894285
s(WWC5AGEYears) 5.481e+00 6.457e+00 2.716 0.012401 *
s(county) 2.682e-05 4.000e+00 0.000 0.318336
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Rank: 122/123
R-sq.(adj) = 0.66 Deviance explained = 71.3%
-REML = 171.07 Scale est. = 0.27926 n = 173
# 1st num regression coefficient , 2nd num standard error of the coefficient estimate, *** means p-value < 0.001, which indicates the effect is highly significant.
#
# stargazer(gam_model,type="text",
# digits=2,
# star.cutoffs = c(.05,.01,.001),
# digit.separator = "")
# The model predicts log-transformed nitrate using smooth curves (splines) for each predictor and accounts for differences across counties as a random effect.
#
# The model explains about 53.9% of the deviance (similar to R²) and has an adjusted R² of 0.493, indicating a decent fit.
#
# wizard_sites_last_year_data_WTE, Mean_WellDepth, mean_Chloride, and county were statistically significant predictors (p < 0.05).
#
# Other predictors like avg_sand, sd_Chloride, and wizard_sites_SD_data didn’t show strong effects (p > 0.1).
#
# The smooth term for county was significant, confirming that including it as a random effect helps account for regional variability.
test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)
gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
hjust = 0, vjust = 1, size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# QQ plot
qqnorm(resid(gam_model))
qqline(resid(gam_model))5. Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
mlr_model <- lm(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + WWC5AGEYears+ avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond+Agri_Change,
data = train_data)
#summary(mlr_model)
# 1st num regression coefficient , 2nd num standard error of the coefficient estimate, *** means p-value < 0.001, which indicates the effect is highly significant.
stargazer(mlr_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 2.19***
(0.39)
Developed_Change 0.37
(0.38)
Mean_WellDepth -0.05
(0.27)
WWC5AGEYears 0.43
(0.31)
avg_sand -1.70
(1.79)
avg_silt -1.30
(1.80)
avg_clay
wizard_sites_SD_data -1.26***
(0.34)
mean_Chloride 8.74**
(2.96)
mean_SpecCond -10.51***
(2.97)
sd_Chloride 0.09
(0.50)
sd_SpecCond 0.14
(0.51)
Agri_Change -0.05
(0.39)
Constant 3.44***
(0.29)
-------------------------------------------------------------
Observations 173
R2 0.37
Adjusted R2 0.33
Residual Std. Error 3.73 (df = 160)
F Statistic 7.96*** (df = 12; 160)
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
#
# The model explains about 27% of the variance in mean nitrate values (R² = 0.274), with a residual error of around 3.64.
#
# The variable wizard_sites_last_year_data_WTE has the strongest positive effect and is highly significant.
#
# wizard_sites_SD_data and sd_SpecCond also show significant relationships; mean_SpecCond is weakly significant and negative.
#
# The variable avg_clay was automatically removed due to perfect multicollinearity with other predictors.
#
# Several predictors like Developed_Change, Mean_WellDepth, and Agri_Change were not statistically significant.
#
test_data$mlr_predicted <- predict(mlr_model, newdata = test_data)
mlr_r2 <- round(cor(test_data$mean_nitrate, test_data$mlr_predicted)^2, 3)
mlr_rmse <- round(rmse(test_data$mean_nitrate, test_data$mlr_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$mlr_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = mlr_predicted)) +
geom_point(alpha = 0.7, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "MLR: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", mlr_r2, "\nRMSE = ", mlr_rmse),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# QQ plot of residuals
qqnorm(resid(mlr_model))
qqline(resid(mlr_model))6. Comparison of Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No WTE",
"No Mean Well Depth",
"No SD Data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Depth and WTE",
"No Age"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond+Agri_Change,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_clay + WWC5AGEYears+ wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_silt + avg_clay + WWC5AGEYears+ wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
mean_SpecCond + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
mean_Chloride + sd_Chloride,
mean_nitrate ~ Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
avg_sand + avg_silt + WWC5AGEYears+ avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + WWC5AGEYears+ avg_clay + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_clay + WWC5AGEYears+ wizard_sites_SD_data + mean_Chloride +
mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + WWC5AGEYears+ Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data,
mean_nitrate ~ Developed_Change + WWC5AGEYears+ avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond,
mean_nitrate ~ Developed_Change + avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lm(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "MLR Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "MLR Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmse7. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy
library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
wizard_sites_SD_data + mean_Chloride + mean_SpecCond + WWC5AGEYears+ Agri_Change,
data = train_data,
ntree = 500, importance = TRUE)
print(rf_model)
Call:
randomForest(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_clay + avg_silt + sd_Chloride + sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + WWC5AGEYears + Agri_Change, data = train_data, ntree = 500, importance = TRUE)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 4
Mean of squared residuals: 8.927819
% Var explained: 56.63
importance(rf_model) %IncMSE IncNodePurity
wizard_sites_last_year_data_WTE 22.369779 746.51622
Developed_Change 5.669976 100.18938
Mean_WellDepth 16.240461 554.84470
avg_sand 6.576424 147.54888
avg_clay 6.297357 136.97875
avg_silt 7.336557 160.19089
sd_Chloride 8.166566 156.77181
sd_SpecCond 7.580271 200.40126
wizard_sites_SD_data 14.723644 332.97902
mean_Chloride 11.442131 320.88192
mean_SpecCond 8.708503 246.95999
WWC5AGEYears 7.946512 234.03414
Agri_Change 3.427808 93.65367
#
# Top 3 most important predictors based on %IncMSE are:
#
# wizard_sites_last_year_data_WTE (24.4%),
#
# Mean_WellDepth (21.5%),
#
# mean_Chloride (13.6%).
#
# Agri_Change has very low importance (0.2%), suggesting it adds little predictive value.
#
# IncNodePurity shows how much a variable helps in reducing overall variance — used to assess relative contribution.
#
# Overall, WTE, depth, and water quality variables (like chloride, spec cond) are driving prediction strength in RF model.
#
test_data$rf_predicted <- predict(rf_model, newdata = test_data)
r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
geom_point(alpha = 0.7, color = "steelblue", size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# Residuals
resid_rf <- test_data$mean_nitrate - predict(rf_model, newdata = test_data)
# # QQ plot
# qqnorm(resid_rf)
# qqline(resid_rf)
# Training data Testing data MSE % variation explained MSE % variation explained #IOWA paper
train_preds <- predict(rf_model, newdata = train_data)
mse_train <- mean((train_data$mean_nitrate - train_preds)^2)
r2_train <- cor(train_data$mean_nitrate, train_preds)^2 * 100
test_preds <- predict(rf_model, newdata = test_data)
mse_test <- mean((test_data$mean_nitrate - test_preds)^2)
r2_test <- cor(test_data$mean_nitrate, test_preds)^2 * 100
rf_performance <- tibble(
Dataset = c("Training", "Testing"),
MSE = c(mse_train, mse_test),
`% Variance Explained` = c(r2_train, r2_test)
)
rf_performance %>%
gt() %>%
tab_header(title = "Random Forest Model Performance") %>%
gt::fmt_number(columns = c(MSE, `% Variance Explained`), decimals = 2)| Random Forest Model Performance | ||
|---|---|---|
| Dataset | MSE | % Variance Explained |
| Training | 1.69 | 94.19 |
| Testing | 9.36 | 63.67 |
8. Gamma-distributed Generalized Linear Model (GLM) with a log link to model skewed nitrate concentrations.
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
glm_model <- glm(mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + WWC5AGEYears+ avg_sand +avg_clay+avg_silt+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond+Agri_Change,
data = train_data,
family = Gamma(link = "log"))
#summary(glm_model)
# 1st num regression coefficient , 2nd num standard error of the coefficient estimate, *** means p-value < 0.001, which indicates the effect is highly significant.
stargazer(glm_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.57***
(0.12)
Developed_Change 0.07
(0.12)
Mean_WellDepth -0.09
(0.09)
WWC5AGEYears 0.22*
(0.10)
avg_sand -0.11
(0.31)
avg_clay 0.06
(0.29)
avg_silt
wizard_sites_SD_data -0.82***
(0.11)
mean_Chloride 4.84***
(0.95)
sd_Chloride 0.08
(0.16)
sd_SpecCond 0.08
(0.16)
mean_SpecCond -5.41***
(0.95)
Agri_Change -0.10
(0.12)
Constant 0.73***
(0.09)
-------------------------------------------------------------
Observations 173
Log Likelihood -291.59
Akaike Inf. Crit. 609.18
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# This model predicts mean nitrate using a Gamma distribution with a log link, suitable for skewed, non-negative data.
#
# The variables wizard_sites_last_year_data_WTE, wizard_sites_SD_data, mean_Chloride, mean_SpecCond, and sd_SpecCond are statistically significant (p < 0.05), meaning they likely influence nitrate concentrations.
#
# avg_silt was automatically excluded due to collinearity (probably because of perfect correlation with sand or clay).
test_data$predicted_glm <- predict(glm_model, newdata = test_data, type = "response")
r2_glm <- round(cor(test_data$mean_nitrate, test_data$predicted_glm)^2, 3)
rmse_glm <- round(rmse(test_data$mean_nitrate, test_data$predicted_glm), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_glm), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate", y = "Predicted Mean Nitrate") +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# QQ plot
qqnorm(resid(glm_model, type = "deviance"))
qqline(resid(glm_model, type = "deviance"))Best models so far
| Best Models So Far (With Well Age in eq: 217 wells) | ||
|---|---|---|
| Model | R2 | RMSE |
| Linear mixed model using county | 0.475 | 3.643 |
| Log-transformed linear mixed model | 0.507 | 3.820 |
| Generalized additive model (GAM) | 0.586 | 3.353 |
| Multiple linear regression | 0.453 | 3.759 |
| Random forest model | 0.637 | 3.059 |
| Gamma-distributed generalized linear model | 0.380 | 4.100 |
| Best Model: Data Filtered by Age But Age Not Used in Model (217 wells) | ||
|---|---|---|
| Model | R2 | RMSE |
| Linear mixed model using county | 0.465 | 3.669 |
| Log-transformed linear mixed model | 0.501 | 3.842 |
| Generalized additive model (GAM) | 0.544 | 3.607 |
| Multiple linear regression | 0.436 | 3.804 |
| Random forest model | 0.617 | 3.130 |
| Gamma-distributed generalized linear model | 0.372 | 4.042 |
| Best Model without Age (362 wells) | ||
|---|---|---|
| Model | R2 | RMSE |
| Linear mixed model using county | 0.403 | 3.156 |
| Log-transformed linear mixed model | 0.463 | 3.257 |
| Generalized additive model (GAM) | 0.378 | 3.278 |
| Multiple linear regression | 0.371 | 3.246 |
| Random forest model | 0.580 | 2.671 |
| Gamma-distributed generalized linear model | 0.503 | 2.880 |
| Model Performance Comparison (With Age vs. Filtered vs. Without Age) | ||||||
|---|---|---|---|---|---|---|
| Model | R2 (With Age) | R2 (Filtered, Age Not Used) | R2 (Without Age) | RMSE (With Age) | RMSE (Filtered, Age Not Used) | RMSE (Without Age) |
| Linear mixed model using county | 0.475 | 0.465 | 0.403 | 3.643 | 3.669 | 3.156 |
| Log-transformed linear mixed model | 0.507 | 0.501 | 0.463 | 3.820 | 3.842 | 3.257 |
| Generalized additive model (GAM) | 0.586 | 0.544 | 0.378 | 3.353 | 3.607 | 3.278 |
| Multiple linear regression | 0.453 | 0.436 | 0.371 | 3.759 | 3.804 | 3.246 |
| Random forest model | 0.637 | 0.617 | 0.580 | 3.059 | 3.130 | 2.671 |
| Gamma-distributed generalized linear model | 0.380 | 0.372 | 0.503 | 4.100 | 4.042 | 2.880 |