Mean Nitrate Prediction_Updated
Explore Data
file_Path_Variable_O<- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Output"
file_Path_Variable_I <- "C:/Users/a905h226/OneDrive - University of Kansas/Desktop/KGS project GW/Step By Step Code/Input"
model_data<- readRDS(file.path(file_Path_Variable_O, "LinearMixedModelData_PredictNitrate.rds"))
model_data_long <- model_data %>%
ungroup() %>%
select(where(is.numeric)) %>%
select(-Lat.x,-Lon.x,-Trend_Nitrate_Encoded) %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
ggplot(model_data_long, aes(y = value)) +
geom_boxplot(fill = "lightblue", color = "black", outlier.shape = 1) +
facet_wrap(~variable, scales = "free", ncol = 4) +
labs(title = "Boxplots of All Numeric Variables",
y = "Value", x = "") +
theme_minimal() +
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank())Add counties information to the dataset
counties <- counties(state = "KS", year = 2022, class = "sf")
wells_sf <- model_data %>%
st_as_sf(coords = c("Lon.x", "Lat.x"), crs = 4326)
counties_transformed <- st_transform(counties, crs = 4326)
wells_with_county <- st_join(wells_sf, counties_transformed["NAME"])
model_data <- wells_with_county %>%
st_drop_geometry() %>%
rename(county = NAME)
wells_bbox <- st_bbox(wells_sf)
library(viridis)
counties_with_wells <- st_filter(counties_transformed, wells_sf)
ggplot() +
geom_sf(data = counties_with_wells, fill = "white", color = "black", size = 0.3) +
geom_sf(data = wells_sf, aes(color = mean_nitrate), size = 2) +
geom_sf_text(data = counties_with_wells, aes(label = NAME), size = 3, check_overlap = TRUE, color = "gray30") +
coord_sf(xlim = c(wells_bbox$xmin, wells_bbox$xmax),
ylim = c(wells_bbox$ymin, wells_bbox$ymax),
expand = TRUE) +
scale_color_viridis_c(option = "C", name = "Mean Nitrate") +
labs(title = "Wells and Counties Containing Them") +
theme_minimal()Scale the data: standardization (scale()) to center all predictors around 0 with a standard deviation of 1.((x- mean)/sd)
numeric_predictors <- model_data %>%
ungroup() %>%
select(wizard_sites_last_year_data_WTE, Agri_Change, Developed_Change,
Mean_WellDepth, avg_sand, avg_silt,avg_clay,
wizard_sites_SD_data, mean_Chloride, mean_SpecCond,sd_Chloride,sd_SpecCond)
scaled_predictors <- scale(numeric_predictors)
model_data_scaled <- model_data %>%
select(Well_ID, cluster, mean_nitrate, sd_nitrate,county) %>%
bind_cols(as_tibble(scaled_predictors))
model_data_scaled$county <- as.factor(model_data_scaled$county)
county_counts <- model_data_scaled %>%
count(county)
county_counts# A tibble: 5 × 2
county n
<fct> <int>
1 Harvey 109
2 McPherson 16
3 Reno 177
4 Rice 2
5 Sedgwick 58
boxplot(mean_nitrate ~ county, data = model_data_scaled,
las = 2,
col = "lightblue",
main = "Mean Nitrate by County",
xlab = "County",
ylab = "Mean Nitrate")
text(x = 1:nrow(county_counts),
y = par("usr")[3] +20,
labels = county_counts$n,
xpd = TRUE,
srt = 90,
adj = 1,
cex = 0.8,
col = "darkblue")Best models so far
| Best Models So Far (Sorted by R²) | ||
|---|---|---|
| Model | R2 | RMSE |
| Random forest model | 0.580 | 2.671 |
| Gamma-distributed generalized linear model | 0.503 | 2.880 |
| Log-transformed linear mixed model | 0.463 | 3.257 |
| Linear mixed model using county | 0.403 | 3.156 |
| Generalized additive model (GAM) | 0.378 | 3.278 |
| Multiple linear regression | 0.371 | 3.246 |
Updated Friday, 9th May
using counties as random effect
1. Linear mixed-effects model excluding cluster, using Counties as a random intercept. This model captures broader group-level variability without overfitting to sub-group structures.
library(lme4)
library(ggplot2)
library(Metrics)
model_data_scaled_1 <- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_mean <- lmer(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt +avg_clay +
wizard_sites_SD_data + Agri_Change+mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond +
(1 | county),
data = train_data)
summary(lmm_mean)Linear mixed model fit by REML ['lmerMod']
Formula: mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
Agri_Change + mean_Chloride + mean_SpecCond + sd_Chloride +
sd_SpecCond + (1 | county)
Data: train_data
REML criterion at convergence: 1543.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.1852 -0.5801 -0.1803 0.3240 4.3556
Random effects:
Groups Name Variance Std.Dev.
county (Intercept) 6.451 2.540
Residual 12.138 3.484
Number of obs: 289, groups: county, 5
Fixed effects:
Estimate Std. Error t value
(Intercept) 2.32343 1.22519 1.896
wizard_sites_last_year_data_WTE 1.66929 0.39437 4.233
Developed_Change -0.03578 0.28207 -0.127
Mean_WellDepth -0.29672 0.19747 -1.503
avg_sand -1.79085 1.29516 -1.383
avg_silt -1.55512 1.32527 -1.173
wizard_sites_SD_data 0.05082 0.30638 0.166
Agri_Change -0.14324 0.26996 -0.531
mean_Chloride 3.63893 1.89308 1.922
mean_SpecCond -5.28321 1.88337 -2.805
sd_Chloride -0.53474 0.34182 -1.564
sd_SpecCond 0.94366 0.37600 2.510
Correlation of Fixed Effects:
(Intr) w_____ Dvlp_C Mn_WlD avg_sn avg_sl w__SD_ Agr_Ch mn_Chl
wzr_____WTE -0.019
Devlpd_Chng 0.000 0.129
Men_WllDpth 0.002 0.103 -0.019
avg_sand -0.048 0.032 -0.016 -0.106
avg_silt -0.055 0.068 -0.018 -0.114 0.982
wzrd_st_SD_ -0.005 -0.139 0.064 -0.173 0.064 0.026
Agri_Change 0.018 0.031 0.605 -0.054 -0.037 -0.030 0.044
mean_Chlord -0.048 0.088 -0.017 0.056 0.123 0.137 -0.104 0.002
mean_SpcCnd 0.048 -0.136 0.005 -0.077 -0.115 -0.134 0.123 -0.016 -0.986
sd_Chloride -0.010 0.223 0.040 -0.040 0.105 0.111 0.067 0.021 0.042
sd_SpecCond 0.021 -0.013 0.041 0.065 -0.120 -0.126 -0.026 0.035 -0.211
mn_SpC sd_Chl
wzr_____WTE
Devlpd_Chng
Men_WllDpth
avg_sand
avg_silt
wzrd_st_SD_
Agri_Change
mean_Chlord
mean_SpcCnd
sd_Chloride -0.083
sd_SpecCond 0.134 -0.505
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
stargazer(lmm_mean,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 1.67***
(0.39)
Developed_Change -0.04
(0.28)
Mean_WellDepth -0.30
(0.20)
avg_sand -1.79
(1.30)
avg_silt -1.56
(1.33)
wizard_sites_SD_data 0.05
(0.31)
Agri_Change -0.14
(0.27)
mean_Chloride 3.64
(1.89)
mean_SpecCond -5.28**
(1.88)
sd_Chloride -0.53
(0.34)
sd_SpecCond 0.94*
(0.38)
Constant 2.32
(1.23)
-------------------------------------------------------------
Observations 289
Log Likelihood -771.77
Akaike Inf. Crit. 1571.55
Bayesian Inf. Crit. 1622.88
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# The model predicts mean nitrate levels using well, soil, and land use features, while accounting for differences across counties.
#
# The random effect for county shows some variation across locations (county variance = 6.45), but most variability is still within counties (residual variance = 12.14). #about one-third of the variation in mean nitrate levels is associated with differences across counties, even after accounting for the predictors in the model.
#
# wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are the most influential predictors.
#
# Some predictors like Developed_Change, Agri_Change, and wizard_sites_SD_data don’t seem to have strong effects (low t-values).
test_data$predicted_mean <- predict(lmm_mean, newdata = test_data, allow.new.levels = TRUE)
r2 <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_val <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$predicted_mean, test_data$mean_nitrate), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean, color = cluster)) +
geom_point(alpha = 0.7, size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "black", linetype = "solid", size = 1) +
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dotted", size = 1) +
labs(
title = "LMM: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
theme_minimal(base_size = 20) +
scale_color_manual(values = c("blue", "yellow", "gray")) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2, "\nRMSE = ", rmse_val),
hjust = 0, vjust = 1.2, size = 5, color = "black") +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# residul plot
plot(lmm_mean)# The ideal plot would have points randomly scattered around the horizontal line at 0, with no clear pattern.
#
# this plot shows a clear curve and funnel shape
# qqplot
qqnorm(resid(lmm_mean))
qqline(resid(lmm_mean))#
# Since many points deviate from the line, especially at the ends (tails), this suggests non-normality in residuals — particularly heavier tails (possible outliers) #very high or low values 2. Comparison of R² and RMSE across different mixed models omitting individual or combinations of predictors. This sensitivity analysis highlights which variables most influence model performance. Counties as a random effect
library(lme4)
library(Metrics)
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(patchwork)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No wizard_sites_last_year_data_WTE",
"No Mean Well Depth",
"No wizard_sites_SD_data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Well Depth and No WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt +avg_clay+ wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+Agri_Change+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond+sd_Chloride+sd_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data +sd_Chloride+sd_SpecCond+ mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + (1 | county),
mean_nitrate ~ Developed_Change + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + avg_sand + avg_silt + wizard_sites_SD_data+sd_Chloride+sd_SpecCond + mean_Chloride + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_silt + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county),
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond + (1 | county), # No sand + silt
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change+sd_Chloride+sd_SpecCond + Mean_WellDepth + avg_sand + avg_silt + wizard_sites_SD_data + (1 | county), # No chloride + speccond
mean_nitrate ~ Developed_Change + avg_sand + avg_silt+sd_Chloride+sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + (1 | county) # No WTE + depth
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lmer(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data, allow.new.levels = TRUE)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmse3. using a log-transformed linear mixed-effects model. The model was fit using log1p(mean_nitrate) as the response to account for skewness and stabilize variance. Predictions were back-transformed using expm1() to the original scale for interpretability. county as a random effect
model_data_scaled_1 <- model_data_scaled
model_data_scaled_1$log_mean_nitrate <- log1p(model_data_scaled_1$mean_nitrate)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
lmm_log <- lmer(log_mean_nitrate ~ wizard_sites_last_year_data_WTE +
Mean_WellDepth + avg_sand +avg_clay+avg_silt+Developed_Change+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + Agri_Change+mean_SpecCond +
(1 | county),
data = train_data)
summary(lmm_log)Linear mixed model fit by REML ['lmerMod']
Formula: log_mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + avg_clay + avg_silt + Developed_Change + wizard_sites_SD_data +
mean_Chloride + sd_Chloride + sd_SpecCond + Agri_Change +
mean_SpecCond + (1 | county)
Data: train_data
REML criterion at convergence: 670
Scaled residuals:
Min 1Q Median 3Q Max
-2.6128 -0.6097 -0.1445 0.7098 2.4641
Random effects:
Groups Name Variance Std.Dev.
county (Intercept) 0.3397 0.5828
Residual 0.5146 0.7173
Number of obs: 289, groups: county, 5
Fixed effects:
Estimate Std. Error t value
(Intercept) 0.821242 0.278257 2.951
wizard_sites_last_year_data_WTE 0.354311 0.082096 4.316
Mean_WellDepth -0.088511 0.040659 -2.177
avg_sand 0.008175 0.155050 0.053
avg_clay 0.085740 0.146047 0.587
Developed_Change -0.019301 0.058084 -0.332
wizard_sites_SD_data -0.003729 0.063207 -0.059
mean_Chloride 1.191097 0.390191 3.053
sd_Chloride -0.043007 0.070427 -0.611
sd_SpecCond 0.152779 0.077422 1.973
Agri_Change -0.047574 0.055586 -0.856
mean_SpecCond -1.539662 0.388288 -3.965
Correlation of Fixed Effects:
(Intr) w_____ Mn_WlD avg_sn avg_cl Dvlp_C w__SD_ mn_Chl sd_Chl
wzr_____WTE -0.026
Men_WllDpth 0.001 0.103
avg_sand 0.058 -0.126 0.117
avg_clay 0.052 -0.070 0.114 0.947
Devlpd_Chng -0.001 0.129 -0.019 0.021 0.018
wzrd_st_SD_ -0.005 -0.138 -0.173 0.043 -0.025 0.063
mean_Chlord -0.047 0.092 0.056 -0.152 -0.138 -0.017 -0.103
sd_Chloride -0.011 0.226 -0.040 -0.112 -0.111 0.040 0.067 0.043
sd_SpecCond 0.020 -0.014 0.065 0.125 0.126 0.041 -0.026 -0.211 -0.505
Agri_Change 0.016 0.031 -0.054 0.016 0.030 0.605 0.044 0.002 0.021
mean_SpcCnd 0.047 -0.140 -0.077 0.157 0.136 0.005 0.122 -0.986 -0.084
sd_SpC Agr_Ch
wzr_____WTE
Men_WllDpth
avg_sand
avg_clay
Devlpd_Chng
wzrd_st_SD_
mean_Chlord
sd_Chloride
sd_SpecCond
Agri_Change 0.035
mean_SpcCnd 0.134 -0.016
fit warnings:
fixed-effect model matrix is rank deficient so dropping 1 column / coefficient
stargazer(lmm_log,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
log_mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.35***
(0.08)
Mean_WellDepth -0.09*
(0.04)
avg_sand 0.01
(0.16)
avg_clay 0.09
(0.15)
Developed_Change -0.02
(0.06)
wizard_sites_SD_data -0.004
(0.06)
mean_Chloride 1.19**
(0.39)
sd_Chloride -0.04
(0.07)
sd_SpecCond 0.15*
(0.08)
Agri_Change -0.05
(0.06)
mean_SpecCond -1.54***
(0.39)
Constant 0.82**
(0.28)
-------------------------------------------------------------
Observations 289
Log Likelihood -334.99
Akaike Inf. Crit. 697.98
Bayesian Inf. Crit. 749.31
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# Model Goal: Predict log-transformed nitrate using well/site/soil data while adjusting for differences across counties.
#
#
# Strong Predictors: wizard_sites_last_year_data_WTE, mean_Chloride, and mean_SpecCond are statistically important (big t-values).
#
#
# Weak/No Effect: Some variables like Developed_Change, Agri_Change, and wizard_sites_SD_data don't seem to help much here.
#
#
# Random Effect (county): There is some variation across counties (random effect SD = 0.58), but most variation is still within counties (residual SD = 0.71).
test_data$predicted_log <- predict(lmm_log, newdata = test_data, allow.new.levels = TRUE)
test_data$predicted_mean <- expm1(test_data$predicted_log)
r2_lmm <- round(cor(test_data$mean_nitrate, test_data$predicted_mean)^2, 3)
rmse_lmm <- round(rmse(test_data$mean_nitrate, test_data$predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_mean)) +
geom_point(alpha = 0.7, size = 2, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
labs(
title = "LMM (Log-Transformed): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", r2_lmm, "\nRMSE = ", rmse_lmm),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# residul plot
plot(lmm_log)#
# qqplot
qqnorm(resid(lmm_log))
qqline(resid(lmm_log))4. Generalized Additive Model (GAM) with smooth spline terms for continuous predictors and a random effect (s(county, bs = “re”)). This flexible model captures nonlinear relationships while accommodating group-level variance.
model_data_scaled_1<- model_data_scaled
library(mgcv)
library(ggplot2)
library(Metrics)
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
train_data$log_mean_nitrate <- log1p(train_data$mean_nitrate)
test_data$log_mean_nitrate <- log1p(test_data$mean_nitrate)
train_data$county <- as.factor(train_data$county)
test_data$county <- factor(test_data$county, levels = levels(train_data$county))
gam_model <- gam(log_mean_nitrate ~
s(wizard_sites_last_year_data_WTE) +
s(Mean_WellDepth) +
s(avg_sand) +
s(avg_clay) +
s(avg_silt) +
s(wizard_sites_SD_data) +
s(mean_Chloride) +
s(mean_SpecCond) +
s(sd_Chloride) +
s(sd_SpecCond) +
s(Developed_Change) +
s(Agri_Change)+
s(county, bs = "re"),
data = train_data,
method = "REML") # Restricted Maximum Likelihood # estimate the variance components (like the random effects and residual error) in mixed models
summary(gam_model)
Family: gaussian
Link function: identity
Formula:
log_mean_nitrate ~ s(wizard_sites_last_year_data_WTE) + s(Mean_WellDepth) +
s(avg_sand) + s(avg_clay) + s(avg_silt) + s(wizard_sites_SD_data) +
s(mean_Chloride) + s(mean_SpecCond) + s(sd_Chloride) + s(sd_SpecCond) +
s(Developed_Change) + s(Agri_Change) + s(county, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9562 0.1917 4.989 1.11e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(wizard_sites_last_year_data_WTE) 4.3919896 5.4011909 3.521 0.00628 **
s(Mean_WellDepth) 3.3939422 4.0230540 9.324 6.00e-07 ***
s(avg_sand) 0.0002365 0.0004617 0.021 0.99755
s(avg_clay) 1.0001008 1.0001972 0.375 0.54107
s(avg_silt) 1.0001908 1.0003714 0.004 0.95382
s(wizard_sites_SD_data) 2.6584084 3.3435020 1.839 0.13507
s(mean_Chloride) 3.3641869 4.1595066 3.051 0.01809 *
s(mean_SpecCond) 1.0000177 1.0000260 3.541 0.06099 .
s(sd_Chloride) 1.0000654 1.0001238 0.285 0.59380
s(sd_SpecCond) 1.0000810 1.0001574 1.864 0.17338
s(Developed_Change) 1.0000774 1.0001511 3.659 0.05685 .
s(Agri_Change) 3.7619394 4.6885505 2.206 0.05817 .
s(county) 2.7023225 4.0000000 5.660 5.27e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Rank: 113/114
R-sq.(adj) = 0.493 Deviance explained = 53.9%
-REML = 310.63 Scale est. = 0.39299 n = 289
stargazer(gam_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
==================================================================
Dependent variable:
-----------------------------
log_mean_nitrate
------------------------------------------------------------------
s(wizard_sites_last_year_data_WTE).1
s(wizard_sites_last_year_data_WTE).2
s(wizard_sites_last_year_data_WTE).3
s(wizard_sites_last_year_data_WTE).4
s(wizard_sites_last_year_data_WTE).5
s(wizard_sites_last_year_data_WTE).6
s(wizard_sites_last_year_data_WTE).7
s(wizard_sites_last_year_data_WTE).8
s(wizard_sites_last_year_data_WTE).9
s(Mean_WellDepth).1
s(Mean_WellDepth).2
s(Mean_WellDepth).3
s(Mean_WellDepth).4
s(Mean_WellDepth).5
s(Mean_WellDepth).6
s(Mean_WellDepth).7
s(Mean_WellDepth).8
s(Mean_WellDepth).9
s(avg_sand).1
s(avg_sand).2
s(avg_sand).3
s(avg_sand).4
s(avg_sand).5
s(avg_sand).6
s(avg_sand).7
s(avg_sand).8
s(avg_sand).9
s(avg_clay).1
s(avg_clay).2
s(avg_clay).3
s(avg_clay).4
s(avg_clay).5
s(avg_clay).6
s(avg_clay).7
s(avg_clay).8
s(avg_clay).9
s(avg_silt).1
s(avg_silt).2
s(avg_silt).3
s(avg_silt).4
s(avg_silt).5
s(avg_silt).6
s(avg_silt).7
s(avg_silt).8
s(avg_silt).9
s(wizard_sites_SD_data).1
s(wizard_sites_SD_data).2
s(wizard_sites_SD_data).3
s(wizard_sites_SD_data).4
s(wizard_sites_SD_data).5
s(wizard_sites_SD_data).6
s(wizard_sites_SD_data).7
s(wizard_sites_SD_data).8
s(wizard_sites_SD_data).9
s(mean_Chloride).1
s(mean_Chloride).2
s(mean_Chloride).3
s(mean_Chloride).4
s(mean_Chloride).5
s(mean_Chloride).6
s(mean_Chloride).7
s(mean_Chloride).8
s(mean_Chloride).9
s(mean_SpecCond).1
s(mean_SpecCond).2
s(mean_SpecCond).3
s(mean_SpecCond).4
s(mean_SpecCond).5
s(mean_SpecCond).6
s(mean_SpecCond).7
s(mean_SpecCond).8
s(mean_SpecCond).9
s(sd_Chloride).1
s(sd_Chloride).2
s(sd_Chloride).3
s(sd_Chloride).4
s(sd_Chloride).5
s(sd_Chloride).6
s(sd_Chloride).7
s(sd_Chloride).8
s(sd_Chloride).9
s(sd_SpecCond).1
s(sd_SpecCond).2
s(sd_SpecCond).3
s(sd_SpecCond).4
s(sd_SpecCond).5
s(sd_SpecCond).6
s(sd_SpecCond).7
s(sd_SpecCond).8
s(sd_SpecCond).9
s(Developed_Change).1
s(Developed_Change).2
s(Developed_Change).3
s(Developed_Change).4
s(Developed_Change).5
s(Developed_Change).6
s(Developed_Change).7
s(Developed_Change).8
s(Developed_Change).9
s(Agri_Change).1
s(Agri_Change).2
s(Agri_Change).3
s(Agri_Change).4
s(Agri_Change).5
s(Agri_Change).6
s(Agri_Change).7
s(Agri_Change).8
s(Agri_Change).9
s(county).1
s(county).2
s(county).3
s(county).4
s(county).5
Constant 0.96***
(0.19)
------------------------------------------------------------------
Observations 289
Adjusted R2 0.49
Log Likelihood -289.06
UBRE 310.63
==================================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# The model predicts log-transformed nitrate using smooth curves (splines) for each predictor and accounts for differences across counties as a random effect.
#
# The model explains about 53.9% of the deviance (similar to R²) and has an adjusted R² of 0.493, indicating a decent fit.
#
# wizard_sites_last_year_data_WTE, Mean_WellDepth, mean_Chloride, and county were statistically significant predictors (p < 0.05).
#
# Other predictors like avg_sand, sd_Chloride, and wizard_sites_SD_data didn’t show strong effects (p > 0.1).
#
# The smooth term for county was significant, confirming that including it as a random effect helps account for regional variability.
test_data$gam_predicted_log <- predict(gam_model, newdata = test_data)
test_data$gam_predicted_mean <- expm1(test_data$gam_predicted_log)
gam_r2 <- round(cor(test_data$mean_nitrate, test_data$gam_predicted_mean)^2, 3)
gam_rmse <- round(rmse(test_data$mean_nitrate, test_data$gam_predicted_mean), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$gam_predicted_mean), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = gam_predicted_mean)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "GAM (Test Set): Observed vs. Predicted Mean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = 2, y = max(test_data$mean_nitrate, na.rm = TRUE),
label = paste0("R² = ", gam_r2, "\nRMSE = ", gam_rmse),
hjust = 0, vjust = 1, size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# QQ plot
qqnorm(resid(gam_model))
qqline(resid(gam_model))5. Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
mlr_model <- lm(mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond+Agri_Change,
data = train_data)
summary(mlr_model)
Call:
lm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand + avg_silt +
avg_clay + wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond + Agri_Change, data = train_data)
Residuals:
Min 1Q Median 3Q Max
-7.9367 -2.1461 -0.8388 1.1720 15.9447
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.94154 0.21476 13.697 < 2e-16 ***
wizard_sites_last_year_data_WTE 1.67481 0.24673 6.788 6.9e-11 ***
Developed_Change 0.09089 0.29230 0.311 0.756083
Mean_WellDepth -0.28547 0.20604 -1.386 0.167014
avg_sand -1.76700 1.31633 -1.342 0.180575
avg_silt -1.54386 1.32262 -1.167 0.244102
avg_clay NA NA NA NA
wizard_sites_SD_data -0.83285 0.24891 -3.346 0.000933 ***
mean_Chloride 2.26540 1.91474 1.183 0.237768
mean_SpecCond -3.81547 1.89326 -2.015 0.044840 *
sd_Chloride -0.64473 0.34956 -1.844 0.066193 .
sd_SpecCond 0.95444 0.39236 2.433 0.015625 *
Agri_Change -0.13986 0.28090 -0.498 0.618950
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 3.639 on 277 degrees of freedom
Multiple R-squared: 0.2737, Adjusted R-squared: 0.2448
F-statistic: 9.488 on 11 and 277 DF, p-value: 1.696e-14
stargazer(mlr_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 1.67***
(0.25)
Developed_Change 0.09
(0.29)
Mean_WellDepth -0.29
(0.21)
avg_sand -1.77
(1.32)
avg_silt -1.54
(1.32)
avg_clay
wizard_sites_SD_data -0.83***
(0.25)
mean_Chloride 2.27
(1.91)
mean_SpecCond -3.82*
(1.89)
sd_Chloride -0.64
(0.35)
sd_SpecCond 0.95*
(0.39)
Agri_Change -0.14
(0.28)
Constant 2.94***
(0.21)
-------------------------------------------------------------
Observations 289
R2 0.27
Adjusted R2 0.24
Residual Std. Error 3.64 (df = 277)
F Statistic 9.49*** (df = 11; 277)
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
#
# The model explains about 27% of the variance in mean nitrate values (R² = 0.274), with a residual error of around 3.64.
#
# The variable wizard_sites_last_year_data_WTE has the strongest positive effect and is highly significant.
#
# wizard_sites_SD_data and sd_SpecCond also show significant relationships; mean_SpecCond is weakly significant and negative.
#
# The variable avg_clay was automatically removed due to perfect multicollinearity with other predictors.
#
# Several predictors like Developed_Change, Mean_WellDepth, and Agri_Change were not statistically significant.
#
test_data$mlr_predicted <- predict(mlr_model, newdata = test_data)
mlr_r2 <- round(cor(test_data$mean_nitrate, test_data$mlr_predicted)^2, 3)
mlr_rmse <- round(rmse(test_data$mean_nitrate, test_data$mlr_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$mlr_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = mlr_predicted)) +
geom_point(alpha = 0.7, color = "darkgreen") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "MLR: Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
label = paste0("R² = ", mlr_r2, "\nRMSE = ", mlr_rmse),
hjust = 0, vjust = 1, size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# Residual vs. fitted plot
plot(mlr_model)# QQ plot of residuals
qqnorm(resid(mlr_model))
qqline(resid(mlr_model))6. Comparison of Multiple linear model
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
model_formulas <- tibble::tibble(
model_name = c(
"Full Model",
"No Silt",
"No Sand",
"No Developed_Change",
"No Chloride",
"No SpecCond",
"No WTE",
"No Mean Well Depth",
"No SD Data",
"No Silt and Sand",
"No Chloride and SpecCond",
"No Mean Depth and WTE"
),
formula = list(
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond+Agri_Change,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_SpecCond + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + sd_Chloride,
mean_nitrate ~ Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
avg_sand + avg_silt + avg_clay + wizard_sites_SD_data +
mean_Chloride + mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_sand + avg_silt + avg_clay + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth +
avg_clay + wizard_sites_SD_data + mean_Chloride +
mean_SpecCond + sd_Chloride + sd_SpecCond,
mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change +
Mean_WellDepth + avg_sand + avg_silt + avg_clay + wizard_sites_SD_data,
mean_nitrate ~ Developed_Change + avg_sand + avg_silt + avg_clay +
wizard_sites_SD_data + mean_Chloride + mean_SpecCond +
sd_Chloride + sd_SpecCond
)
)
model_results <- model_formulas %>%
mutate(
model_fit = map(formula, ~ lm(.x, data = train_data)),
predicted = map(model_fit, ~ predict(.x, newdata = test_data)),
r2 = map_dbl(predicted, ~ cor(.x, test_data$mean_nitrate)^2),
rmse = map_dbl(predicted, ~ rmse(.x, test_data$mean_nitrate))
)
plot_data <- model_results %>%
select(model_name, r2, rmse) %>%
pivot_longer(cols = c(r2, rmse), names_to = "metric", values_to = "value")
p_r2 <- plot_data %>%
filter(metric == "r2") %>%
ggplot(aes(x = reorder(model_name, value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = expression(R^2), title = "MLR Model R² (Test Set)") +
scale_fill_manual(values = c("r2" = "steelblue")) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.1)) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_rmse <- plot_data %>%
filter(metric == "rmse") %>%
ggplot(aes(x = reorder(model_name, -value), y = value, fill = metric)) +
geom_col(width = 0.7) +
labs(x = "Model Variant", y = "RMSE", title = "MLR Model RMSE (Test Set)") +
scale_fill_manual(values = c("rmse" = "tomato")) +
theme_minimal(base_size = 16) +
coord_flip() +
theme(legend.position = "none")
p_r2 / p_rmsePrevious work up until Thursday, 8th May
1. Random Forest model using 500 trees to predict mean nitrate concentrations. This ensemble method captures complex nonlinearities and interactions with high predictive accuracy
library(randomForest)
library(ggplot2)
library(Metrics)
library(dplyr)
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
rf_model <- randomForest(mean_nitrate ~ wizard_sites_last_year_data_WTE +Developed_Change+
Mean_WellDepth + avg_sand +avg_clay+avg_silt+sd_Chloride+sd_SpecCond+
wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change,
data = train_data,
ntree = 500, importance = TRUE)
print(rf_model)
Call:
randomForest(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE + Developed_Change + Mean_WellDepth + avg_sand + avg_clay + avg_silt + sd_Chloride + sd_SpecCond + wizard_sites_SD_data + mean_Chloride + mean_SpecCond + Agri_Change, data = train_data, ntree = 500, importance = TRUE)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 4
Mean of squared residuals: 9.29879
% Var explained: 46.79
importance(rf_model) %IncMSE IncNodePurity
wizard_sites_last_year_data_WTE 24.3555896 1049.6664
Developed_Change 2.8712488 129.7873
Mean_WellDepth 21.4816680 1009.5143
avg_sand 3.9973885 159.4341
avg_clay 5.5287895 174.4687
avg_silt 4.2982408 197.9525
sd_Chloride 10.9084549 372.6617
sd_SpecCond 8.2713165 310.2396
wizard_sites_SD_data 10.7779309 334.6947
mean_Chloride 13.5936136 514.9471
mean_SpecCond 9.3119503 437.4617
Agri_Change 0.2146065 117.4282
#
# Top 3 most important predictors based on %IncMSE are:
#
# wizard_sites_last_year_data_WTE (24.4%),
#
# Mean_WellDepth (21.5%),
#
# mean_Chloride (13.6%).
#
# Agri_Change has very low importance (0.2%), suggesting it adds little predictive value.
#
# IncNodePurity shows how much a variable helps in reducing overall variance — used to assess relative contribution.
#
# Overall, WTE, depth, and water quality variables (like chloride, spec cond) are driving prediction strength in RF model.
#
test_data$rf_predicted <- predict(rf_model, newdata = test_data)
r2_rf <- round(cor(test_data$mean_nitrate, test_data$rf_predicted)^2, 3)
rmse_rf <- round(rmse(test_data$mean_nitrate, test_data$rf_predicted), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$rf_predicted), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = rf_predicted)) +
geom_point(alpha = 0.7, color = "steelblue", size = 2) +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(
title = "Random Forest (Test Set): Observed vs. Predicted \nMean Nitrate",
x = "Observed Mean Nitrate",
y = "Predicted Mean Nitrate"
) +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_rf, "\nRMSE = ", rmse_rf),
size = 5, color = "black") +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# Residuals
resid_rf <- test_data$mean_nitrate - predict(rf_model, newdata = test_data)
# QQ plot
qqnorm(resid_rf)
qqline(resid_rf)# Training data Testing data MSE % variation explained MSE % variation explained #IOWA paper
train_preds <- predict(rf_model, newdata = train_data)
mse_train <- mean((train_data$mean_nitrate - train_preds)^2)
r2_train <- cor(train_data$mean_nitrate, train_preds)^2 * 100
test_preds <- predict(rf_model, newdata = test_data)
mse_test <- mean((test_data$mean_nitrate - test_preds)^2)
r2_test <- cor(test_data$mean_nitrate, test_preds)^2 * 100
rf_performance <- tibble(
Dataset = c("Training", "Testing"),
MSE = c(mse_train, mse_test),
`% Variance Explained` = c(r2_train, r2_test)
)
rf_performance %>%
gt() %>%
tab_header(title = "Random Forest Model Performance") %>%
fmt_number(columns = c(MSE, `% Variance Explained`), decimals = 2)| Random Forest Model Performance | ||
|---|---|---|
| Dataset | MSE | % Variance Explained |
| Training | 1.81 | 92.42 |
| Testing | 7.13 | 58.01 |
2. Gamma-distributed Generalized Linear Model (GLM) with a log link to model skewed nitrate concentrations.
model_data_scaled_1<- model_data_scaled
set.seed(123)
n <- nrow(model_data_scaled_1)
train_idx <- sample(seq_len(n), size = 0.8 * n)
train_data <- model_data_scaled_1[train_idx, ]
test_data <- model_data_scaled_1[-train_idx, ]
glm_model <- glm(mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand +avg_clay+avg_silt+
wizard_sites_SD_data + mean_Chloride+sd_Chloride+sd_SpecCond + mean_SpecCond+Agri_Change,
data = train_data,
family = Gamma(link = "log"))
summary(glm_model)
Call:
glm(formula = mean_nitrate ~ wizard_sites_last_year_data_WTE +
Developed_Change + Mean_WellDepth + avg_sand + avg_clay +
avg_silt + wizard_sites_SD_data + mean_Chloride + sd_Chloride +
sd_SpecCond + mean_SpecCond + Agri_Change, family = Gamma(link = "log"),
data = train_data)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.72814 0.08372 8.697 3.08e-16 ***
wizard_sites_last_year_data_WTE 0.54185 0.09619 5.633 4.34e-08 ***
Developed_Change -0.03460 0.11395 -0.304 0.761621
Mean_WellDepth -0.03000 0.08032 -0.373 0.709097
avg_sand 0.06754 0.28255 0.239 0.811248
avg_clay 0.30100 0.27567 1.092 0.275822
avg_silt NA NA NA NA
wizard_sites_SD_data -0.52311 0.09704 -5.391 1.50e-07 ***
mean_Chloride 2.58317 0.74647 3.461 0.000624 ***
sd_Chloride -0.14638 0.13628 -1.074 0.283700
sd_SpecCond 0.35784 0.15296 2.339 0.020023 *
mean_SpecCond -3.23494 0.73809 -4.383 1.66e-05 ***
Agri_Change -0.12455 0.10951 -1.137 0.256388
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for Gamma family taken to be 2.01265)
Null deviance: 748.73 on 288 degrees of freedom
Residual deviance: 558.80 on 277 degrees of freedom
AIC: 978.79
Number of Fisher Scoring iterations: 25
stargazer(glm_model,type="text",
digits=2,
star.cutoffs = c(.05,.01,.001),
digit.separator = "")
=============================================================
Dependent variable:
-----------------------------
mean_nitrate
-------------------------------------------------------------
wizard_sites_last_year_data_WTE 0.54***
(0.10)
Developed_Change -0.03
(0.11)
Mean_WellDepth -0.03
(0.08)
avg_sand 0.07
(0.28)
avg_clay 0.30
(0.28)
avg_silt
wizard_sites_SD_data -0.52***
(0.10)
mean_Chloride 2.58***
(0.75)
sd_Chloride -0.15
(0.14)
sd_SpecCond 0.36*
(0.15)
mean_SpecCond -3.23***
(0.74)
Agri_Change -0.12
(0.11)
Constant 0.73***
(0.08)
-------------------------------------------------------------
Observations 289
Log Likelihood -477.39
Akaike Inf. Crit. 978.79
=============================================================
Note: *p<0.05; **p<0.01; ***p<0.001
# This model predicts mean nitrate using a Gamma distribution with a log link, suitable for skewed, non-negative data.
#
# The variables wizard_sites_last_year_data_WTE, wizard_sites_SD_data, mean_Chloride, mean_SpecCond, and sd_SpecCond are statistically significant (p < 0.05), meaning they likely influence nitrate concentrations.
#
# avg_silt was automatically excluded due to collinearity (probably because of perfect correlation with sand or clay).
test_data$predicted_glm <- predict(glm_model, newdata = test_data, type = "response")
r2_glm <- round(cor(test_data$mean_nitrate, test_data$predicted_glm)^2, 3)
rmse_glm <- round(rmse(test_data$mean_nitrate, test_data$predicted_glm), 3)
lim_range <- range(c(test_data$mean_nitrate, test_data$predicted_glm), na.rm = TRUE)
ggplot(test_data, aes(x = mean_nitrate, y = predicted_glm)) +
geom_point(alpha = 0.7, color = "steelblue") +
geom_abline(slope = 1, intercept = 0, linetype = "dotted", color = "red") +
geom_smooth(method = "lm", se = FALSE, color = "black") +
labs(title = "GLM (Gamma): Observed vs. Predicted Mean Nitrate (Test Set)",
x = "Observed Mean Nitrate", y = "Predicted Mean Nitrate") +
annotate("text", x = min(lim_range), y = max(lim_range),
hjust = 0, vjust = 1,
label = paste0("R² = ", r2_glm, "\nRMSE = ", rmse_glm),
size = 5) +
theme_minimal(base_size = 16) +
coord_fixed() +
xlim(lim_range) +
ylim(lim_range)# QQ plot
qqnorm(resid(glm_model, type = "deviance"))
qqline(resid(glm_model, type = "deviance"))