Mean Nitrate Prediction_Updated_GreaterThan2N
Explore Data
file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"
model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))
model_data <- model_data %>%
filter(mean_nitrate > 2)
model_data_long <- model_data %>%
ungroup() %>%
select(where(is.numeric)) %>%
select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
ggplot(model_data_long, aes(y = value)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
facet_wrap(~variable, scales = "free", ncol = 4) +
labs(title = "Boxplots of All Numeric Variables",
y = "Value", x = "") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())Add counties information to the dataset
counties <- counties(state = "KS", year = 2022, class = "sf")
wells_sf <- model_data %>%
st_as_sf(coords = c("Lon.x", "Lat.x"), crs = 4326)
counties_transformed <- st_transform(counties, crs = 4326)
wells_with_county <- st_join(wells_sf, counties_transformed["NAME"])
model_data <- wells_with_county %>%
st_drop_geometry() %>%
rename(county = NAME)
wells_bbox <- st_bbox(wells_sf)
library(viridis)
counties_with_wells <- st_filter(counties_transformed, wells_sf)
ggplot() +
geom_sf(data = counties_with_wells, fill = "white", color = "black", size = 0.3) +
geom_sf(data = wells_sf, aes(color = mean_nitrate), size = 2) +
geom_sf_text(data = counties_with_wells, aes(label = NAME), size = 3, check_overlap = TRUE, color = "gray30") +
coord_sf(xlim = c(wells_bbox$xmin, wells_bbox$xmax),
ylim = c(wells_bbox$ymin, wells_bbox$ymax),
expand = TRUE) +
scale_color_viridis_c(option = "C", name = "Mean Nitrate") +
labs(title = "Wells and Counties Containing Them") +
theme_minimal()Scale the data: standardization (scale()) to center all predictors around 0 with a standard deviation of 1.((x- mean)/sd)
numeric_predictors <- model_data %>%
ungroup() %>%
select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
Mean_WellDepth, avg_sand, avg_silt,avg_clay,
wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond)
scaled_predictors <- scale(numeric_predictors)
model_data_scaled <- model_data %>%
select(Well_ID, cluster, mean_nitrate, sd_nitrate,county) %>%
bind_cols(as_tibble(scaled_predictors))
model_data_scaled$county <- as.factor(model_data_scaled$county)
county_counts <- model_data_scaled %>%
count(county)
county_counts# A tibble: 5 × 2
county n
<fct> <int>
1 Harvey 10
2 McPherson 9
3 Reno 90
4 Rice 1
5 Sedgwick 19
boxplot(mean_nitrate ~ county, data = model_data_scaled,
las = 2,
col = "lightblue",
main = "Mean Nitrate by County",
xlab = "County",
ylab = "Mean Nitrate")
text(x = 1:nrow(county_counts),
y = par("usr")[3] +20,
labels = county_counts$n,
xpd = TRUE,
srt = 90,
adj = 1,
cex = 0.8,
col = "darkblue")Updated Wednesday, 14th May
using counties as random effect
1. Linear mixed-effects model excluding cluster, using Counties as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.
library(lme4)
library(ggplot2)
library(Metrics)
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +avg_clay +
wizard_sites_SD_data + Agri_Change+mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
(1 | county),
data = train_data)
summary(lmm_mean)Linear mixed model fit by REML ['lmerMod']
Formula: mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
Agri_Change + mean_Chloride + mean_SpecCond + sd_Chloride +
sd_SpecCond + (1 | county)
Data: train_data
REML criterion at convergence: 549.4
Scaled residuals:
Min 1Q Median 3Q Max
-1.7232 -0.6865 -0.0775 0.6445 3.3055
Random effects:
Groups Name Variance Std.Dev.
county (Intercept) 1.269 1.127
Residual 15.041 3.878
Number of obs: 103, groups: county, 5
Fixed effects:
Estimate Std. Error t value
(Intercept) 6.9344 0.7927 8.748
wizard_sites_last_year_data_WTE 0.3728 0.5956 0.626
Developed_Change -0.1369 0.4417 -0.310
Mean_WellDepth -1.1517 0.5108 -2.255
avg_sand -3.9547 2.7346 -1.446
avg_silt -4.3075 2.7657 -1.557
wizard_sites_SD_data -0.4779 0.4385 -1.090
Agri_Change -0.3798 0.4413 -0.860
mean_Chloride 0.5244 3.9986 0.131
mean_SpecCond -1.0214 3.9950 -0.256
sd_Chloride -1.7886 1.0887 -1.643
sd_SpecCond 1.2647 0.8859 1.428
Correlation of Fixed Effects:
(Intr) w_____ Dvlp_C Mn_WlD avg_sn avg_sl w__SD_ Agr_Ch mn_Chl
wzr_____WTE 0.260
Devlpd_Chng -0.006 0.123
Men_WllDpth -0.033 0.005 0.074
avg_sand -0.071 -0.050 -0.031 -0.119
avg_silt -0.088 0.024 -0.039 -0.165 0.981
wzrd_st_SD_ -0.129 -0.171 0.145 -0.145 -0.012 -0.036
Agri_Change 0.010 0.006 0.556 0.063 0.054 0.060 0.109
mean_Chlord -0.067 0.052 -0.159 -0.081 0.092 0.148 -0.224 -0.191
mean_SpcCnd 0.050 -0.079 0.158 0.060 -0.060 -0.125 0.246 0.181 -0.980
sd_Chloride 0.073 0.172 0.045 -0.204 -0.001 0.020 0.115 0.092 -0.315
sd_SpecCond 0.010 -0.045 0.021 0.214 -0.111 -0.106 -0.102 0.012 0.200
mn_SpC sd_Chl
wzr_____WTE
Devlpd_Chng
Men_WllDpth
avg_sand
avg_silt
wzrd_st_SD_
Agri_Change
mean_Chlord
mean_SpcCnd
sd_Chloride 0.222
sd_SpecCond -0.260 -0.571
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
stargazer(lmm_mean,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.37
(0.60)
Developed_Change -0.14
(0.44)
Mean_WellDepth -1.15*
(0.51)
avg_sand -3.95
(2.73)
avg_silt -4.31
(2.77)
wizard_sites_SD_data -0.48
(0.44)
Agri_Change -0.38
(0.44)
mean_Chloride 0.52
(4.00)
mean_SpecCond -1.02
(3.99)
sd_Chloride -1.79
(1.09)
sd_SpecCond 1.26
(0.89)
Constant 6.93***
(0.79)
-------------------------------------------------------------
Observations 103
Log Likelihood -274.68
Akaike Inf. Crit. 577.36
Bayesian Inf. Crit. 614.25
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# The model predicts mean nitrate levels using well, soil, and land use features, while accounting for differences across counties.
#
# The random effect for county shows some variation across locations (county variance = 6.45), but most variability is still within counties (residual variance = 12.14). #about one-third of the variation in mean nitrate levels is associated with differences across counties, even after accounting for the predictors in the model.
#
# wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are the most influential predictors.
#
# Some predictors like Developed_Change, Agri_Change, and wizard_sites_SD_data don’t seem to have strong effects (low t-values).
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
labs(
title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
theme_minimal(base_size = 20) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
hjust = 0, vjust = 1.2, size = 5, color = "black") +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# residul plot
plot(lmm_mean)# The ideal plot would have points randomly scattered around the horizontal line at 0, with no clear pattern.
#
# this plot shows a clear curve and funnel shape
# qqplot
qqnorm(resid(lmm_mean))
qqline(resid(lmm_mean))#
# Since many points deviate from the line, especially at the ends (tails), this suggests non-normality in residuals — particularly heavier tails (possible outliers) #very high or low values 2. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance. Counties as a random effect
library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No wizard_sites_last_year_data_WTE",
"No Mean Well Depth",
"No wizard_sites_SD_data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Well Depth and No WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+Agri_Change+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | county),
mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county), # No sand + silt
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | county), # No chloride + speccond
mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county) # No WTE + depth
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lmer(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmse3. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability. county as a random effect
model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + Agri_Change+mean_SpecCond +
(1 | county),
data = train_data)
summary(lmm_log)Linear mixed model fit by REML ['lmerMod']
Formula: log_mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + avg_clay + avg_silt + Developed_Change + wizard_sites_SD_data +
mean_Chloride + sd_Chloride + sd_SpecCond + Agri_Change +
mean_SpecCond + (1 | county)
Data: train_data
REML criterion at convergence: 153.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.1953 -0.5865 0.0377 0.5898 2.1839
Random effects:
Groups Name Variance Std.Dev.
county (Intercept) 0.02498 0.1581
Residual 0.19018 0.4361
Number of obs: 103, groups: county, 5
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.91398 0.10256 18.663
wizard_sites_last_year_data_WTE 0.01518 0.07034 0.216
Mean_WellDepth -0.16123 0.05754 -2.802
avg_sand 0.31599 0.18198 1.736
avg_clay 0.29035 0.17140 1.694
Developed_Change -0.01148 0.04972 -0.231
wizard_sites_SD_data -0.07443 0.05023 -1.482
mean_Chloride 0.21228 0.45337 0.468
sd_Chloride -0.24837 0.12278 -2.023
sd_SpecCond 0.14287 0.09962 1.434
Agri_Change -0.03670 0.04964 -0.739
mean_SpecCond -0.25374 0.45222 -0.561
Correlation of Fixed Effects:
(Intr) w_____ Mn_WlD avg_sn avg_cl Dvlp_C w__SD_ mn_Chl sd_Chl
wzr_____WTE 0.257
Men_WllDpth -0.040 0.003
avg_sand 0.118 -0.140 0.216
avg_clay 0.099 -0.020 0.155 0.944
Devlpd_Chng -0.002 0.114 0.072 0.053 0.043
wzrd_st_SD_ -0.130 -0.182 -0.140 0.077 0.036 0.137
mean_Chlord -0.075 0.059 -0.073 -0.249 -0.164 -0.161 -0.220
sd_Chloride 0.078 0.179 -0.206 -0.043 -0.012 0.045 0.114 -0.316
sd_SpecCond 0.007 -0.046 0.214 0.086 0.104 0.021 -0.101 0.199 -0.570
Agri_Change 0.015 0.009 0.062 -0.058 -0.056 0.556 0.106 -0.192 0.093
mean_SpcCnd 0.057 -0.086 0.054 0.239 0.139 0.161 0.242 -0.980 0.223
sd_SpC Agr_Ch
wzr_____WTE
Men_WllDpth
avg_sand
avg_clay
Devlpd_Chng
wzrd_st_SD_
mean_Chlord
sd_Chloride
sd_SpecCond
Agri_Change 0.012
mean_SpcCnd -0.258 0.181
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
stargazer(lmm_log,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
log_mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.02
(0.07)
Mean_WellDepth -0.16**
(0.06)
avg_sand 0.32
(0.18)
avg_clay 0.29
(0.17)
Developed_Change -0.01
(0.05)
wizard_sites_SD_data -0.07
(0.05)
mean_Chloride 0.21
(0.45)
sd_Chloride -0.25*
(0.12)
sd_SpecCond 0.14
(0.10)
Agri_Change -0.04
(0.05)
mean_SpecCond -0.25
(0.45)
Constant 1.91***
(0.10)
-------------------------------------------------------------
Observations 103
Log Likelihood -76.74
Akaike Inf. Crit. 181.47
Bayesian Inf. Crit. 218.36
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# Model Goal: Predict log-transformed nitrate using well/site/soil data while adjusting for differences across counties.
#
#
# Strong Predictors: wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are statistically important (big t-values).
#
#
# Weak/No Effect: Some variables like Developed_Change, Agri_Change, and wizard_sites_SD_data don't seem to help much here.
#
#
# Random Effect (county): There is some variation across counties (random effect SD = 0.58), but most variation is still within counties (residual SD = 0.71).
test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log)
r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
geom_point(alpha = 0.7, size = 2, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# residul plot
plot(lmm_log)#
# qqplot
qqnorm(resid(lmm_log))
qqline(resid(lmm_log))4. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(county, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.
model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate <- log1p(test_data$mean_nitrate)
train_data$county <- as.factor(train_data$county)
test_data$county <- factor(test_data$county, levels = levels(train_data$county))
gam_model <- gam(log_mean_nitrate ~
s(wizard_sites_last_year_data_WTE) +
s(Mean_WellDepth) +
s(avg_sand) +
s(avg_clay) +
s(avg_silt) +
s(wizard_sites_SD_data) +
s(mean_Chloride) +
s(mean_SpecCond) +
s(sd_Chloride) +
s(sd_SpecCond) +
s(Developed_Change) +
s(Agri_Change)+
s(county, bs = "re"),
data = train_data,
method = "REML") # Restricted Maximum Likelihood # estimate the variance components (like the random effects and residual error) in mixed models
summary(gam_model)
Family: gaussian
Link function: identity
Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) +
s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) +
s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) +
s(Developed_Change) + s(Agri_Change) + s(county, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.92393 0.08863 21.71 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(wizard_sites_last_year_data_WTE) 1.000e+00 1.000e+00 0.257 0.61324
s(Mean_WellDepth) 1.000e+00 1.000e+00 7.172 0.00882 **
s(avg_sand) 1.841e-06 3.528e-06 0.013 0.99983
s(avg_clay) 1.000e+00 1.000e+00 3.589 0.06140 .
s(avg_silt) 1.000e+00 1.000e+00 4.137 0.04492 *
s(wizard_sites_SD_data) 1.000e+00 1.000e+00 3.600 0.06100 .
s(mean_Chloride) 1.987e+00 2.410e+00 0.896 0.61437
s(mean_SpecCond) 1.000e+00 1.000e+00 0.004 0.95293
s(sd_Chloride) 1.000e+00 1.000e+00 0.972 0.32673
s(sd_SpecCond) 1.000e+00 1.000e+00 0.720 0.39839
s(Developed_Change) 1.000e+00 1.000e+00 0.074 0.78599
s(Agri_Change) 1.000e+00 1.000e+00 0.661 0.41844
s(county) 1.273e+00 4.000e+00 0.560 0.15446
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Rank: 113/114
R-sq.(adj) = 0.332 Deviance explained = 41.9%
-REML = 76.693 Scale est. = 0.1854 n = 103
stargazer(gam_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
==================================================================
Dependent variable:
-----------------------------
log_mean_nitrate
------------------------------------------------------------------
s(wizard_sites_last_year_data_WTE).1
s(wizard_sites_last_year_data_WTE).2
s(wizard_sites_last_year_data_WTE).3
s(wizard_sites_last_year_data_WTE).4
s(wizard_sites_last_year_data_WTE).5
s(wizard_sites_last_year_data_WTE).6
s(wizard_sites_last_year_data_WTE).7
s(wizard_sites_last_year_data_WTE).8
s(wizard_sites_last_year_data_WTE).9
s(Mean_WellDepth).1
s(Mean_WellDepth).2
s(Mean_WellDepth).3
s(Mean_WellDepth).4
s(Mean_WellDepth).5
s(Mean_WellDepth).6
s(Mean_WellDepth).7
s(Mean_WellDepth).8
s(Mean_WellDepth).9
s(avg_sand).1
s(avg_sand).2
s(avg_sand).3
s(avg_sand).4
s(avg_sand).5
s(avg_sand).6
s(avg_sand).7
s(avg_sand).8
s(avg_sand).9
s(avg_clay).1
s(avg_clay).2
s(avg_clay).3
s(avg_clay).4
s(avg_clay).5
s(avg_clay).6
s(avg_clay).7
s(avg_clay).8
s(avg_clay).9
s(avg_silt).1
s(avg_silt).2
s(avg_silt).3
s(avg_silt).4
s(avg_silt).5
s(avg_silt).6
s(avg_silt).7
s(avg_silt).8
s(avg_silt).9
s(wizard_sites_SD_data).1
s(wizard_sites_SD_data).2
s(wizard_sites_SD_data).3
s(wizard_sites_SD_data).4
s(wizard_sites_SD_data).5
s(wizard_sites_SD_data).6
s(wizard_sites_SD_data).7
s(wizard_sites_SD_data).8
s(wizard_sites_SD_data).9
s(mean_Chloride).1
s(mean_Chloride).2
s(mean_Chloride).3
s(mean_Chloride).4
s(mean_Chloride).5
s(mean_Chloride).6
s(mean_Chloride).7
s(mean_Chloride).8
s(mean_Chloride).9
s(mean_SpecCond).1
s(mean_SpecCond).2
s(mean_SpecCond).3
s(mean_SpecCond).4
s(mean_SpecCond).5
s(mean_SpecCond).6
s(mean_SpecCond).7
s(mean_SpecCond).8
s(mean_SpecCond).9
s(sd_Chloride).1
s(sd_Chloride).2
s(sd_Chloride).3
s(sd_Chloride).4
s(sd_Chloride).5
s(sd_Chloride).6
s(sd_Chloride).7
s(sd_Chloride).8
s(sd_Chloride).9
s(sd_SpecCond).1
s(sd_SpecCond).2
s(sd_SpecCond).3
s(sd_SpecCond).4
s(sd_SpecCond).5
s(sd_SpecCond).6
s(sd_SpecCond).7
s(sd_SpecCond).8
s(sd_SpecCond).9
s(Developed_Change).1
s(Developed_Change).2
s(Developed_Change).3
s(Developed_Change).4
s(Developed_Change).5
s(Developed_Change).6
s(Developed_Change).7
s(Developed_Change).8
s(Developed_Change).9
s(Agri_Change).1
s(Agri_Change).2
s(Agri_Change).3
s(Agri_Change).4
s(Agri_Change).5
s(Agri_Change).6
s(Agri_Change).7
s(Agri_Change).8
s(Agri_Change).9
s(county).1
s(county).2
s(county).3
s(county).4
s(county).5
Constant 1.92***
(0.09)
------------------------------------------------------------------
Observations 103
Adjusted R2 0.33
Log Likelihood -66.95
UBRE 76.69
==================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# The model predicts log-transformed nitrate using smooth curves (splines) for each predictor and accounts for differences across counties as a random effect.
#
# The model explains about 53.9% of the deviance (similar to R²) and has an adjusted R² of 0.493, indicating a decent fit.
#
# wizard_sites_last_year_data_WTE, Mean_WellDepth, mean_Chloride, and county were statistically significant predictors (p < 0.05).
#
# Other predictors like avg_sand, sd_Chloride, and wizard_sites_SD_data didn’t show strong effects (p > 0.1).
#
# The smooth term for county was significant, confirming that including it as a random effect helps account for regional variability.
test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)
gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
hjust = 0, vjust = 1, size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# QQ plot
qqnorm(resid(gam_model))
qqline(resid(gam_model))5. Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
mlr_model <- lm(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond+Agri_Change,
data = train_data)
summary(mlr_model)
Call:
lm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand + avg_silt +
avg_clay + wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond + Agri_Change, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-6.7340 -2.8757 -0.4397 2.4261 12.9928
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.4869 0.3913 19.132 <2e-16 ***
wizard_sites_last_year_data_WTE 0.7929 0.4694 1.689 0.0946 .
Developed_Change -0.1241 0.4450 -0.279 0.7810
Mean_WellDepth -1.1795 0.5137 -2.296 0.0240 *
avg_sand -4.1976 2.6967 -1.557 0.1230
avg_silt -4.5008 2.6932 -1.671 0.0981 .
avg_clay NA NA NA NA
wizard_sites_SD_data -0.6563 0.4143 -1.584 0.1166
mean_Chloride 0.5148 3.9384 0.131 0.8963
mean_SpecCond -1.0721 3.9560 -0.271 0.7870
sd_Chloride -1.6788 1.0889 -1.542 0.1266
sd_SpecCond 1.2564 0.8955 1.403 0.1640
Agri_Change -0.3609 0.4458 -0.809 0.4204
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.922 on 91 degrees of freedom
Multiple R-squared: 0.3023, Adjusted R-squared: 0.2179
F-statistic: 3.584 on 11 and 91 DF, p-value: 0.0003222
stargazer(mlr_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.79
(0.47)
Developed_Change -0.12
(0.45)
Mean_WellDepth -1.18*
(0.51)
avg_sand -4.20
(2.70)
avg_silt -4.50
(2.69)
avg_clay
wizard_sites_SD_data -0.66
(0.41)
mean_Chloride 0.51
(3.94)
mean_SpecCond -1.07
(3.96)
sd_Chloride -1.68
(1.09)
sd_SpecCond 1.26
(0.90)
Agri_Change -0.36
(0.45)
Constant 7.49***
(0.39)
-------------------------------------------------------------
Observations 103
R2 0.30
Adjusted R2 0.22
Residual Std. Error 3.92 (df = 91)
F Statistic 3.58*** (df = 11; 91)
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
#
# The model explains about 27% of the variance in mean nitrate values (R² = 0.274), with a residual error of around 3.64.
#
# The variable wizard_sites_last_year_data_WTE has the strongest positive effect and is highly significant.
#
# wizard_sites_SD_data and sd_SpecCond also show significant relationships; mean_SpecCond is weakly significant and negative.
#
# The variable avg_clay was automatically removed due to perfect multicollinearity with other predictors.
#
# Several predictors like Developed_Change, Mean_WellDepth, and Agri_Change were not statistically significant.
#
test_data$mlr_predicted <- predict(mlr_model, newdata = test_data)
mlr_r2 <- round(cor(test_data$mean_nitrate, test_data$mlr_predicted)^2, 3)
mlr_rmse <- round(rmse(test_data$mean_nitrate, test_data$mlr_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$mlr_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = mlr_predicted)) +
geom_point(alpha = 0.7, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "MLR: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", mlr_r2, "\nRMSE = ", mlr_rmse),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# Residual vs. fitted plot
plot(mlr_model)# QQ plot of residuals
qqnorm(resid(mlr_model))
qqline(resid(mlr_model))6. Comparison of Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No WTE",
"No Mean Well Depth",
"No SD Data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Depth and WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond+Agri_Change,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_SpecCond + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + sd_Chloride,
mean_nitrate ~ Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_clay + wizard_sites_SD_data + mean_Chloride +
mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data,
mean_nitrate ~ Developed_Change + avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lm(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "MLR Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "MLR Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmsePrevious work up until Thursday, 8th May
1. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy
library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
data = train_data,
ntree = 500, importance = TRUE)
print(rf_model)
Call:
randomForest(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_clay + avg_silt + sd_Chloride + sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change, data = train_data, ntree = 500, importance = TRUE)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 4
Mean of squared residuals: 15.52265
% Var explained: 20.3
importance(rf_model) %IncMSE IncNodePurity
wizard_sites_last_year_data_WTE 9.7002060 281.22874
Developed_Change -1.6860849 55.71317
Mean_WellDepth 6.3151719 328.93666
avg_sand 2.3630821 112.51640
avg_clay 4.2071863 105.78716
avg_silt 4.9661743 136.90064
sd_Chloride 6.6445761 169.99421
sd_SpecCond 2.3799743 129.14534
wizard_sites_SD_data 4.4674341 140.82756
mean_Chloride 7.7793412 200.19792
mean_SpecCond 6.0886308 152.03909
Agri_Change 0.8170956 59.85917
#
# Top 3 most important predictors based on %IncMSE are:
#
# wizard_sites_last_year_data_WTE (24.4%),
#
# Mean_WellDepth (21.5%),
#
# mean_Chloride (13.6%).
#
# Agri_Change has very low importance (0.2%), suggesting it adds little predictive value.
#
# IncNodePurity shows how much a variable helps in reducing overall variance — used to assess relative contribution.
#
# Overall, WTE, depth, and water quality variables (like chloride, spec cond) are driving prediction strength in RF model.
#
test_data$rf_predicted <- predict(rf_model, newdata = test_data)
r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
geom_point(alpha = 0.7, color = "steelblue", size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# Residuals
resid_rf <- test_data$mean_nitrate - predict(rf_model, newdata = test_data)
# QQ plot
qqnorm(resid_rf)
qqline(resid_rf)# Training data Testing data MSE % variation explained MSE % variation explained #IOWA paper
train_preds <- predict(rf_model, newdata = train_data)
mse_train <- mean((train_data$mean_nitrate - train_preds)^2)
r2_train <- cor(train_data$mean_nitrate, train_preds)^2 * 100
test_preds <- predict(rf_model, newdata = test_data)
mse_test <- mean((test_data$mean_nitrate - test_preds)^2)
r2_test <- cor(test_data$mean_nitrate, test_preds)^2 * 100
rf_performance <- tibble(
Dataset = c("Training", "Testing"),
MSE = c(mse_train, mse_test),
`% Variance Explained` = c(r2_train, r2_test)
)
rf_performance %>%
gt() %>%
tab_header(title = "Random Forest Model Performance") %>%
gt::fmt_number(columns = c(MSE, `% Variance Explained`), decimals = 2)| Random Forest Model Performance | ||
|---|---|---|
| Dataset | MSE | % Variance Explained |
| Training | 2.94 | 90.90 |
| Testing | 11.11 | 29.11 |
2. Gamma-distributed Generalized Linear Model (GLM) with a log link to model skewed nitrate concentrations.
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
glm_model <- glm(mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand +avg_clay+avg_silt+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond+Agri_Change,
data = train_data,
family = Gamma(link = "log"))
summary(glm_model)
Call:
glm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand + avg_clay +
avg_silt + wizard_sites_SD_data + mean_Chloride + sd_Chloride +
sd_SpecCond + mean_SpecCond + Agri_Change, family = Gamma(link = "log"),
data = train_data)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.94504 0.04995 38.943 < 2e-16 ***
wizard_sites_last_year_data_WTE 0.10576 0.05992 1.765 0.08089 .
Developed_Change -0.02951 0.05680 -0.520 0.60460
Mean_WellDepth -0.20937 0.06557 -3.193 0.00193 **
avg_sand 0.42523 0.19344 2.198 0.03047 *
avg_clay 0.39317 0.18730 2.099 0.03857 *
avg_silt NA NA NA NA
wizard_sites_SD_data -0.12344 0.05288 -2.334 0.02177 *
mean_Chloride 0.25154 0.50266 0.500 0.61800
sd_Chloride -0.27937 0.13898 -2.010 0.04737 *
sd_SpecCond 0.13134 0.11429 1.149 0.25349
mean_SpecCond -0.29467 0.50492 -0.584 0.56094
Agri_Change -0.06413 0.05690 -1.127 0.26265
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 0.2505409)
Null deviance: 37.899 on 102 degrees of freedom
Residual deviance: 23.393 on 91 degrees of freedom
AIC: 542.08
Number of Fisher Scoring iterations: 8
stargazer(glm_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.11
(0.06)
Developed_Change -0.03
(0.06)
Mean_WellDepth -0.21**
(0.07)
avg_sand 0.43*
(0.19)
avg_clay 0.39*
(0.19)
avg_silt
wizard_sites_SD_data -0.12*
(0.05)
mean_Chloride 0.25
(0.50)
sd_Chloride -0.28*
(0.14)
sd_SpecCond 0.13
(0.11)
mean_SpecCond -0.29
(0.50)
Agri_Change -0.06
(0.06)
Constant 1.95***
(0.05)
-------------------------------------------------------------
Observations 103
Log Likelihood -259.04
Akaike Inf. Crit. 542.08
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# This model predicts mean nitrate using a Gamma distribution with a log link, suitable for skewed, non-negative data.
#
# The variables wizard_sites_last_year_data_WTE, wizard_sites_SD_data, mean_Chloride, mean_SpecCond, and sd_SpecCond are statistically significant (p < 0.05), meaning they likely influence nitrate concentrations.
#
# avg_silt was automatically excluded due to collinearity (probably because of perfect correlation with sand or clay).
test_data$predicted_glm <- predict(glm_model, newdata = test_data, type = "response")
r2_glm <- round(cor(test_data$mean_nitrate, test_data$predicted_glm)^2, 3)
rmse_glm <- round(rmse(test_data$mean_nitrate, test_data$predicted_glm), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_glm), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate", y = "Predicted Mean Nitrate") +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# QQ plot
qqnorm(resid(glm_model, type = "deviance"))
qqline(resid(glm_model, type = "deviance"))Best models so far
| Best Models So Far (With >2 mg N: 117 wells) | ||
|---|---|---|
| Model | R2 | RMSE |
| Generalized additive model (GAM) | 0.328 | 3.017 |
| Random forest model | 0.291 | 3.334 |
| Gamma-distributed generalized linear model | 0.266 | 3.540 |
| Log-transformed linear mixed model | 0.264 | 3.189 |
| Linear mixed model using county | 0.192 | 3.835 |
| Multiple linear regression | 0.188 | 3.851 |