Mean Nitrate Prediction_Updated
Explore Data
file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"
model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))
model_data_long <- model_data %>%
ungroup() %>%
select(where(is.numeric)) %>%
select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
ggplot(model_data_long, aes(y = value)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
facet_wrap(~variable, scales = "free", ncol = 4) +
labs(title = "Boxplots of All Numeric Variables",
y = "Value", x = "") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())Add counties information to the dataset
counties <- counties(state = "KS", year = 2022, class = "sf")
wells_sf <- model_data %>%
st_as_sf(coords = c("Lon.x", "Lat.x"), crs = 4326)
counties_transformed <- st_transform(counties, crs = 4326)
wells_with_county <- st_join(wells_sf, counties_transformed["NAME"])
model_data <- wells_with_county %>%
st_drop_geometry() %>%
rename(county = NAME)
wells_bbox <- st_bbox(wells_sf)
library(viridis)
counties_with_wells <- st_filter(counties_transformed, wells_sf)
ggplot() +
geom_sf(data = counties_with_wells, fill = "white", color = "black", size = 0.3) +
geom_sf(data = wells_sf, aes(color = mean_nitrate), size = 2) +
geom_sf_text(data = counties_with_wells, aes(label = NAME), size = 3, check_overlap = TRUE, color = "gray30") +
coord_sf(xlim = c(wells_bbox$xmin, wells_bbox$xmax),
ylim = c(wells_bbox$ymin, wells_bbox$ymax),
expand = TRUE) +
scale_color_viridis_c(option = "C", name = "Mean Nitrate") +
labs(title = "Wells and Counties Containing Them") +
theme_minimal()Scale the data: standardization (scale()) to center all predictors around 0 with a standard deviation of 1.((x- mean)/sd)
numeric_predictors <- model_data %>%
ungroup() %>%
select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
Mean_WellDepth, avg_sand, avg_silt,avg_clay,
wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond)
scaled_predictors <- scale(numeric_predictors)
model_data_scaled <- model_data %>%
select(Well_ID, cluster, mean_nitrate, sd_nitrate,county) %>%
bind_cols(as_tibble(scaled_predictors))
model_data_scaled$county <- as.factor(model_data_scaled$county)
county_counts <- model_data_scaled %>%
count(county)
county_counts# A tibble: 5 × 2
county n
<fct> <int>
1 Harvey 109
2 McPherson 16
3 Reno 177
4 Rice 2
5 Sedgwick 58
boxplot(mean_nitrate ~ county, data = model_data_scaled,
las = 2,
col = "lightblue",
main = "Mean Nitrate by County",
xlab = "County",
ylab = "Mean Nitrate")
text(x = 1:nrow(county_counts),
y = par("usr")[3] +20,
labels = county_counts$n,
xpd = TRUE,
srt = 90,
adj = 1,
cex = 0.8,
col = "darkblue")Best models so far
| Best Models So Far (Sorted by R²) | ||
|---|---|---|
| Model | R2 | RMSE |
| Random forest model | 0.580 | 2.671 |
| Gamma-distributed generalized linear model | 0.503 | 2.880 |
| Log-transformed linear mixed model | 0.463 | 3.257 |
| Linear mixed model using county | 0.403 | 3.156 |
| Generalized additive model (GAM) | 0.378 | 3.278 |
| Multiple linear regression | 0.371 | 3.246 |
Updated Friday, 9th May
using counties as random effect
1. Linear mixed-effects model excluding cluster, using Counties as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.
library(lme4)
library(ggplot2)
library(Metrics)
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +avg_clay +
wizard_sites_SD_data + Agri_Change+mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
(1 | county),
data = train_data)
summary(lmm_mean)Linear mixed model fit by REML ['lmerMod']
Formula: mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
Agri_Change + mean_Chloride + mean_SpecCond + sd_Chloride +
sd_SpecCond + (1 | county)
Data: train_data
REML criterion at convergence: 1543.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.1852 -0.5801 -0.1803 0.3240 4.3556
Random effects:
Groups Name Variance Std.Dev.
county (Intercept) 6.451 2.540
Residual 12.138 3.484
Number of obs: 289, groups: county, 5
Fixed effects:
Estimate Std. Error t value
(Intercept) 2.32343 1.22519 1.896
wizard_sites_last_year_data_WTE 1.66929 0.39437 4.233
Developed_Change -0.03578 0.28207 -0.127
Mean_WellDepth -0.29672 0.19747 -1.503
avg_sand -1.79085 1.29516 -1.383
avg_silt -1.55512 1.32527 -1.173
wizard_sites_SD_data 0.05082 0.30638 0.166
Agri_Change -0.14324 0.26996 -0.531
mean_Chloride 3.63893 1.89308 1.922
mean_SpecCond -5.28321 1.88337 -2.805
sd_Chloride -0.53474 0.34182 -1.564
sd_SpecCond 0.94366 0.37600 2.510
Correlation of Fixed Effects:
(Intr) w_____ Dvlp_C Mn_WlD avg_sn avg_sl w__SD_ Agr_Ch mn_Chl
wzr_____WTE -0.019
Devlpd_Chng 0.000 0.129
Men_WllDpth 0.002 0.103 -0.019
avg_sand -0.048 0.032 -0.016 -0.106
avg_silt -0.055 0.068 -0.018 -0.114 0.982
wzrd_st_SD_ -0.005 -0.139 0.064 -0.173 0.064 0.026
Agri_Change 0.018 0.031 0.605 -0.054 -0.037 -0.030 0.044
mean_Chlord -0.048 0.088 -0.017 0.056 0.123 0.137 -0.104 0.002
mean_SpcCnd 0.048 -0.136 0.005 -0.077 -0.115 -0.134 0.123 -0.016 -0.986
sd_Chloride -0.010 0.223 0.040 -0.040 0.105 0.111 0.067 0.021 0.042
sd_SpecCond 0.021 -0.013 0.041 0.065 -0.120 -0.126 -0.026 0.035 -0.211
mn_SpC sd_Chl
wzr_____WTE
Devlpd_Chng
Men_WllDpth
avg_sand
avg_silt
wzrd_st_SD_
Agri_Change
mean_Chlord
mean_SpcCnd
sd_Chloride -0.083
sd_SpecCond 0.134 -0.505
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
# The model predicts mean nitrate levels using well, soil, and land use features, while accounting for differences across counties.
#
# The random effect for county shows some variation across locations (county variance = 6.45), but most variability is still within counties (residual variance = 12.14). #about one-third of the variation in mean nitrate levels is associated with differences across counties, even after accounting for the predictors in the model.
#
# wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are the most influential predictors.
#
# Some predictors like Developed_Change, Agri_Change, and wizard_sites_SD_data don’t seem to have strong effects (low t-values).
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
labs(
title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
theme_minimal(base_size = 20) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
hjust = 0, vjust = 1.2, size = 5, color = "black") +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)2. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance. Counties as a random effect
library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No wizard_sites_last_year_data_WTE",
"No Mean Well Depth",
"No wizard_sites_SD_data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Well Depth and No WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+Agri_Change+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | county),
mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county), # No sand + silt
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | county), # No chloride + speccond
mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county) # No WTE + depth
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lmer(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmse3. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability. county as a random effect
model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + Agri_Change+mean_SpecCond +
(1 | county),
data = train_data)
summary(lmm_log)Linear mixed model fit by REML ['lmerMod']
Formula: log_mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + avg_clay + avg_silt + Developed_Change + wizard_sites_SD_data +
mean_Chloride + sd_Chloride + sd_SpecCond + Agri_Change +
mean_SpecCond + (1 | county)
Data: train_data
REML criterion at convergence: 670
Scaled residuals:
Min 1Q Median 3Q Max
-2.6128 -0.6097 -0.1445 0.7098 2.4641
Random effects:
Groups Name Variance Std.Dev.
county (Intercept) 0.3397 0.5828
Residual 0.5146 0.7173
Number of obs: 289, groups: county, 5
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.821242 0.278257 2.951
wizard_sites_last_year_data_WTE 0.354311 0.082096 4.316
Mean_WellDepth -0.088511 0.040659 -2.177
avg_sand 0.008175 0.155050 0.053
avg_clay 0.085740 0.146047 0.587
Developed_Change -0.019301 0.058084 -0.332
wizard_sites_SD_data -0.003729 0.063207 -0.059
mean_Chloride 1.191097 0.390191 3.053
sd_Chloride -0.043007 0.070427 -0.611
sd_SpecCond 0.152779 0.077422 1.973
Agri_Change -0.047574 0.055586 -0.856
mean_SpecCond -1.539662 0.388288 -3.965
Correlation of Fixed Effects:
(Intr) w_____ Mn_WlD avg_sn avg_cl Dvlp_C w__SD_ mn_Chl sd_Chl
wzr_____WTE -0.026
Men_WllDpth 0.001 0.103
avg_sand 0.058 -0.126 0.117
avg_clay 0.052 -0.070 0.114 0.947
Devlpd_Chng -0.001 0.129 -0.019 0.021 0.018
wzrd_st_SD_ -0.005 -0.138 -0.173 0.043 -0.025 0.063
mean_Chlord -0.047 0.092 0.056 -0.152 -0.138 -0.017 -0.103
sd_Chloride -0.011 0.226 -0.040 -0.112 -0.111 0.040 0.067 0.043
sd_SpecCond 0.020 -0.014 0.065 0.125 0.126 0.041 -0.026 -0.211 -0.505
Agri_Change 0.016 0.031 -0.054 0.016 0.030 0.605 0.044 0.002 0.021
mean_SpcCnd 0.047 -0.140 -0.077 0.157 0.136 0.005 0.122 -0.986 -0.084
sd_SpC Agr_Ch
wzr_____WTE
Men_WllDpth
avg_sand
avg_clay
Devlpd_Chng
wzrd_st_SD_
mean_Chlord
sd_Chloride
sd_SpecCond
Agri_Change 0.035
mean_SpcCnd 0.134 -0.016
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
# Model Goal: Predict log-transformed nitrate using well/site/soil data while adjusting for differences across counties.
#
#
# Strong Predictors: wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are statistically important (big t-values).
#
#
# Weak/No Effect: Some variables like Developed_Change, Agri_Change, and wizard_sites_SD_data don't seem to help much here.
#
#
# Random Effect (county): There is some variation across counties (random effect SD = 0.58), but most variation is still within counties (residual SD = 0.71).
test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log)
r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
geom_point(alpha = 0.7, size = 2, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)4. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(county, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.
model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate <- log1p(test_data$mean_nitrate)
train_data$county <- as.factor(train_data$county)
test_data$county <- factor(test_data$county, levels = levels(train_data$county))
gam_model <- gam(log_mean_nitrate ~
s(wizard_sites_last_year_data_WTE) +
s(Mean_WellDepth) +
s(avg_sand) +
s(avg_clay) +
s(avg_silt) +
s(wizard_sites_SD_data) +
s(mean_Chloride) +
s(mean_SpecCond) +
s(sd_Chloride) +
s(sd_SpecCond) +
s(Developed_Change) +
s(Agri_Change)+
s(county, bs = "re"),
data = train_data,
method = "REML") # Restricted Maximum Likelihood # estimate the variance components (like the random effects and residual error) in mixed models
summary(gam_model)
Family: gaussian
Link function: identity
Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) +
s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) +
s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) +
s(Developed_Change) + s(Agri_Change) + s(county, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9562 0.1917 4.989 1.11e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(wizard_sites_last_year_data_WTE) 4.3919896 5.4011909 3.521 0.00628 **
s(Mean_WellDepth) 3.3939422 4.0230540 9.324 6.00e-07 ***
s(avg_sand) 0.0002365 0.0004617 0.021 0.99755
s(avg_clay) 1.0001008 1.0001972 0.375 0.54107
s(avg_silt) 1.0001908 1.0003714 0.004 0.95382
s(wizard_sites_SD_data) 2.6584084 3.3435020 1.839 0.13507
s(mean_Chloride) 3.3641869 4.1595066 3.051 0.01809 *
s(mean_SpecCond) 1.0000177 1.0000260 3.541 0.06099 .
s(sd_Chloride) 1.0000654 1.0001238 0.285 0.59380
s(sd_SpecCond) 1.0000810 1.0001574 1.864 0.17338
s(Developed_Change) 1.0000774 1.0001511 3.659 0.05685 .
s(Agri_Change) 3.7619394 4.6885505 2.206 0.05817 .
s(county) 2.7023225 4.0000000 5.660 5.27e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Rank: 113/114
R-sq.(adj) = 0.493 Deviance explained = 53.9%
-REML = 310.63 Scale est. = 0.39299 n = 289
# The model predicts log-transformed nitrate using smooth curves (splines) for each predictor and accounts for differences across counties as a random effect.
#
# The model explains about 53.9% of the deviance (similar to R²) and has an adjusted R² of 0.493, indicating a decent fit.
#
# wizard_sites_last_year_data_WTE, Mean_WellDepth, mean_Chloride, and county were statistically significant predictors (p < 0.05).
#
# Other predictors like avg_sand, sd_Chloride, and wizard_sites_SD_data didn’t show strong effects (p > 0.1).
#
# The smooth term for county was significant, confirming that including it as a random effect helps account for regional variability.
test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)
gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
hjust = 0, vjust = 1, size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)5. Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
mlr_model <- lm(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond+Agri_Change,
data = train_data)
summary(mlr_model)
Call:
lm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand + avg_silt +
avg_clay + wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond + Agri_Change, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-7.9367 -2.1461 -0.8388 1.1720 15.9447
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.94154 0.21476 13.697 < 2e-16 ***
wizard_sites_last_year_data_WTE 1.67481 0.24673 6.788 6.9e-11 ***
Developed_Change 0.09089 0.29230 0.311 0.756083
Mean_WellDepth -0.28547 0.20604 -1.386 0.167014
avg_sand -1.76700 1.31633 -1.342 0.180575
avg_silt -1.54386 1.32262 -1.167 0.244102
avg_clay NA NA NA NA
wizard_sites_SD_data -0.83285 0.24891 -3.346 0.000933 ***
mean_Chloride 2.26540 1.91474 1.183 0.237768
mean_SpecCond -3.81547 1.89326 -2.015 0.044840 *
sd_Chloride -0.64473 0.34956 -1.844 0.066193 .
sd_SpecCond 0.95444 0.39236 2.433 0.015625 *
Agri_Change -0.13986 0.28090 -0.498 0.618950
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.639 on 277 degrees of freedom
Multiple R-squared: 0.2737, Adjusted R-squared: 0.2448
F-statistic: 9.488 on 11 and 277 DF, p-value: 1.696e-14
#
# The model explains about 27% of the variance in mean nitrate values (R² = 0.274), with a residual error of around 3.64.
#
# The variable wizard_sites_last_year_data_WTE has the strongest positive effect and is highly significant.
#
# wizard_sites_SD_data and sd_SpecCond also show significant relationships; mean_SpecCond is weakly significant and negative.
#
# The variable avg_clay was automatically removed due to perfect multicollinearity with other predictors.
#
# Several predictors like Developed_Change, Mean_WellDepth, and Agri_Change were not statistically significant.
#
test_data$mlr_predicted <- predict(mlr_model, newdata = test_data)
mlr_r2 <- round(cor(test_data$mean_nitrate, test_data$mlr_predicted)^2, 3)
mlr_rmse <- round(rmse(test_data$mean_nitrate, test_data$mlr_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$mlr_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = mlr_predicted)) +
geom_point(alpha = 0.7, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "MLR: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", mlr_r2, "\nRMSE = ", mlr_rmse),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)6. Comparison of Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No WTE",
"No Mean Well Depth",
"No SD Data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Depth and WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond+Agri_Change,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_SpecCond + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + sd_Chloride,
mean_nitrate ~ Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_clay + wizard_sites_SD_data + mean_Chloride +
mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data,
mean_nitrate ~ Developed_Change + avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lm(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "MLR Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "MLR Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmsePrevious work up until Thursday, 8th May
Note From Erin’s email
The point of the random effect is to account for variation that naturally arises from different groupings/hierarchies of the data. I know you initially tried using the cluster as the random effect, but the challenge there (I think) is that cluster really strongly defines the range of nitrate – so it swamps out the fixed effects and we get those weird concentration bands. I think that using agricultural change doesn’t work because 1) it’s a continuous variable and 2) the values are all basically the same for all the wells. If we were trying to predict individual measurements, then the random effect would be the site. But since we’re using the means, that doesn’t really apply. Part of me is wondering if you should just try a multiple linear regression (so no hierarchy) and see if the model does any better.
1. Observed vs. predicted mean nitrate concentrations using a linear mixed-effects model with Agri_Change as a random effect and cluster as a fixed effect.
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +avg_clay +
wizard_sites_SD_data + mean_Chloride +sd_Chloride+sd_SpecCond +mean_SpecCond + cluster +
(1 | Agri_Change),
data = train_data)
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Cluster as Fixed Effect): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
size = 5, color = "black") +
theme_minimal(base_size = 20) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)2. Linear mixed-effects model excluding cluster, using Agri_Change as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.
library(lme4)
library(ggplot2)
library(Metrics)
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
(1 | Agri_Change),
data = train_data)
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
labs(
title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
theme_minimal(base_size = 20) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
hjust = 0, vjust = 1.2, size = 5, color = "black") +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)3. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance.
library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No wizard_sites_last_year_data_WTE",
"No Mean Well Depth",
"No wizard_sites_SD_data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Well Depth and No WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | Agri_Change),
mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | Agri_Change),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | Agri_Change), # No sand + silt
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | Agri_Change), # No chloride + speccond
mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | Agri_Change) # No WTE + depth
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lmer(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none") +
ylim(0, 1)
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmse4. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability.
model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond +
(1 | Agri_Change),
data = train_data)
test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log)
r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
geom_point(alpha = 0.7, size = 2, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)5. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(Agri_Change, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.
model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate <- log1p(test_data$mean_nitrate)
gam_model <- gam(log_mean_nitrate ~
s(wizard_sites_last_year_data_WTE) +
s(Mean_WellDepth) +
s(avg_sand) +
s(avg_clay) +
s(avg_silt) +
s(wizard_sites_SD_data) +
s(mean_Chloride) +
s(mean_SpecCond) +
s(sd_Chloride) +
s(sd_SpecCond) +
s(Developed_Change) +
s(Agri_Change, bs = "re"),
data = train_data,
method = "REML")
summary(gam_model)
Family: gaussian
Link function: identity
Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) +
s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) +
s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) +
s(Developed_Change) + s(Agri_Change, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.90443 0.03699 24.45 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(wizard_sites_last_year_data_WTE) 5.558e+00 6.698698 8.256 < 2e-16 ***
s(Mean_WellDepth) 3.282e+00 3.884149 8.621 3.21e-06 ***
s(avg_sand) 1.192e-04 0.000207 0.001 0.999714
s(avg_clay) 3.811e+00 4.740497 1.932 0.106595
s(avg_silt) 1.000e+00 1.000183 3.610 0.058563 .
s(wizard_sites_SD_data) 3.663e+00 4.572301 4.993 0.000418 ***
s(mean_Chloride) 2.684e+00 3.354533 2.231 0.079040 .
s(mean_SpecCond) 1.000e+00 1.000012 2.634 0.105794
s(sd_Chloride) 1.000e+00 1.000222 0.450 0.503059
s(sd_SpecCond) 1.000e+00 1.000164 0.763 0.383093
s(Developed_Change) 5.110e+00 5.793833 2.668 0.012745 *
s(Agri_Change) 4.157e-05 1.000000 0.000 0.878071
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Rank: 100/101
R-sq.(adj) = 0.49 Deviance explained = 53.9%
-REML = 316.75 Scale est. = 0.39551 n = 289
test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)
gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
hjust = 0, vjust = 1, size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)6. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy
library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
data = train_data,
ntree = 500, importance = TRUE)
print(rf_model)
Call:
randomForest(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_clay + avg_silt + sd_Chloride + sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change, data = train_data, ntree = 500, importance = TRUE)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 4
Mean of squared residuals: 9.29879
% Var explained: 46.79
importance(rf_model) %IncMSE IncNodePurity
wizard_sites_last_year_data_WTE 24.3555896 1049.6664
Developed_Change 2.8712488 129.7873
Mean_WellDepth 21.4816680 1009.5143
avg_sand 3.9973885 159.4341
avg_clay 5.5287895 174.4687
avg_silt 4.2982408 197.9525
sd_Chloride 10.9084549 372.6617
sd_SpecCond 8.2713165 310.2396
wizard_sites_SD_data 10.7779309 334.6947
mean_Chloride 13.5936136 514.9471
mean_SpecCond 9.3119503 437.4617
Agri_Change 0.2146065 117.4282
#
# Top 3 most important predictors based on %IncMSE are:
#
# wizard_sites_last_year_data_WTE (24.4%),
#
# Mean_WellDepth (21.5%),
#
# mean_Chloride (13.6%).
#
# Agri_Change has very low importance (0.2%), suggesting it adds little predictive value.
#
# IncNodePurity shows how much a variable helps in reducing overall variance — used to assess relative contribution.
#
# Overall, WTE, depth, and water quality variables (like chloride, spec cond) are driving prediction strength in RF model.
#
test_data$rf_predicted <- predict(rf_model, newdata = test_data)
r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
geom_point(alpha = 0.7, color = "steelblue", size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)7. Gamma-distributed Generalized Linear Model (GLM) with a log link to model skewed nitrate concentrations.
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
glm_model <- glm(mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand +avg_clay+avg_silt+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond+Agri_Change,
data = train_data,
family = Gamma(link = "log"))
summary(glm_model)
Call:
glm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand + avg_clay +
avg_silt + wizard_sites_SD_data + mean_Chloride + sd_Chloride +
sd_SpecCond + mean_SpecCond + Agri_Change, family = Gamma(link = "log"),
data = train_data)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.72814 0.08372 8.697 3.08e-16 ***
wizard_sites_last_year_data_WTE 0.54185 0.09619 5.633 4.34e-08 ***
Developed_Change -0.03460 0.11395 -0.304 0.761621
Mean_WellDepth -0.03000 0.08032 -0.373 0.709097
avg_sand 0.06754 0.28255 0.239 0.811248
avg_clay 0.30100 0.27567 1.092 0.275822
avg_silt NA NA NA NA
wizard_sites_SD_data -0.52311 0.09704 -5.391 1.50e-07 ***
mean_Chloride 2.58317 0.74647 3.461 0.000624 ***
sd_Chloride -0.14638 0.13628 -1.074 0.283700
sd_SpecCond 0.35784 0.15296 2.339 0.020023 *
mean_SpecCond -3.23494 0.73809 -4.383 1.66e-05 ***
Agri_Change -0.12455 0.10951 -1.137 0.256388
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 2.01265)
Null deviance: 748.73 on 288 degrees of freedom
Residual deviance: 558.80 on 277 degrees of freedom
AIC: 978.79
Number of Fisher Scoring iterations: 25
# This model predicts mean nitrate using a Gamma distribution with a log link, suitable for skewed, non-negative data.
#
# The variables wizard_sites_last_year_data_WTE, wizard_sites_SD_data, mean_Chloride, mean_SpecCond, and sd_SpecCond are statistically significant (p < 0.05), meaning they likely influence nitrate concentrations.
#
# avg_silt was automatically excluded due to collinearity (probably because of perfect correlation with sand or clay).
test_data$predicted_glm <- predict(glm_model, newdata = test_data, type = "response")
r2_glm <- round(cor(test_data$mean_nitrate, test_data$predicted_glm)^2, 3)
rmse_glm <- round(rmse(test_data$mean_nitrate, test_data$predicted_glm), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_glm), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate", y = "Predicted Mean Nitrate") +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)