library(tidyverse)
library(gstat)
library(terra)
library(tidyterra)
library(ggnewscale)
library(tmap)
library(sf)
library(PNWColors)
library(fields)
library(spatstat)
library(automap)
library(knitr)
library(ncf)
library(nlme)   # for gls() and corSpatial(), used in the boreal analysis below
# function to turn sf to raster
sf_2_rast <-function(sfObject,variableIndex = 1){
# coerce sf to a data.frame
dfObject <- data.frame(st_coordinates(sfObject),
z=as.data.frame(sfObject)[,variableIndex])
# coerce data.frame to SpatRaster
rastObject <- rast(dfObject,crs=crs(sfObject))
names(rastObject) <- names(sfObject)[variableIndex]
return(rastObject)
}
ozoneGrids_rast <- readRDS("data/ozoneGrids_rast.rds")
ozoneGrids_sf <- readRDS("data/ozoneGrids_sf.rds")
ozonePoints_sf <- readRDS("data/ozonePoints_sf.rds")
caBnd_sf <- readRDS("data/caBnd_sf.rds")
caBnd_sf
## Simple feature collection with 1 feature and 1 field
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -371692.4 ymin: -604021.9 xmax: 539854.3 ymax: 450479.1
## Projected CRS: NAD83 / California Albers
## ID geom
## 1 california POLYGON ((-194533.8 7189.73...
ozoneCoords <- as.data.frame(st_coordinates(ozonePoints_sf))
ozonePoints_df <- data.frame(ozone = ozonePoints_sf$ozone,
ozoneCoords)
gridCoords <- as.data.frame(st_coordinates(ozoneGrids_sf))
pal1 <- pnw_palette("Lake", n = nrow(ozonePoints_sf))
pal2 <- pnw_palette("Lake", n = nrow(ozoneGrids_sf))
ggplot() +
geom_sf(data = caBnd_sf) +
geom_spatraster(data = ozoneGrids_rast, aes(fill = hill)) +
scale_fill_gradientn(colors = gray.colors(100,
start = 0.1,
end = 0.9),
guide = "none",
na.value = "transparent") +
new_scale_fill() +
geom_spatraster(data = ozoneGrids_rast, aes(fill = elev)) +
scale_fill_hypso_c(name = "Elevation (m)",
palette = "colombia_hypso",
alpha = 0.75) +
geom_sf(data = ozonePoints_sf,
aes(col = ozone, size = ozone, alpha = 0.7)) +
scale_color_gradientn(colors = (pal1),
name = "Ozone (ppb)",
na.value = "transparent") +
labs(title = "Ozone Concentrations in California",
x = "Longitude", y = "Latitude") +
guides(size = "none", alpha = "none") +
theme_minimal()
Figure 1.1a: Average daily ozone measurements across 345 stations in California, USA. Color in base map denotes elevation. Points represent measurement stations; size and color indicate ozone concentration in parts per billion.
tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(ozonePoints_sf) +
tm_dots(col = "ozone", palette = pal1,
id = "name")
Figure 1.1b: Interactive map of average daily ozone measurements across 345 stations in California, USA. Color of point indicates ozone concentration in parts per billion.
Inverse distance weighting (IDW) is a deterministic interpolation method that estimates a variable \(\hat Z\) at location \(s_0\) by the weighting formula: \[\hat Z(s_0) = \frac{\sum_{i=1}^n w(s_i)Z(s_i)} {\sum_{i=1}^n w(s_i)} \] where \(w(s_i)\) is the weight given to the measurement at location \(s_i\), an inverse power of its distance from \(s_0\): \[ w(s_i) = ||s_i - s_0||^{-p}\]
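To make the weighting concrete, the following is a minimal sketch with made-up numbers (three hypothetical ozone values and distances, not taken from the data) showing that the IDW estimate is simply a distance-weighted average in which nearby stations dominate:
# toy illustration of the IDW estimate at a single location s0
idw_estimate <- function(z, d, p){
  w <- d^(-p)           # inverse-distance weights, w(s_i) = ||s_i - s0||^-p
  sum(w * z) / sum(w)   # weighted average of the measurements
}
z <- c(30, 45, 60)        # hypothetical ozone values (ppb)
d <- c(10, 20, 40)        # hypothetical distances from s0 (km)
idw_estimate(z, d, p = 2) # roughly 34 ppb; the nearest station dominates
Larger values of p concentrate the weight on the nearest stations, while small values approach a global average.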
We can optimize the parameter \(p\) by finding the value with the highest \(R^2\) and lowest RMSE for a training set of the data.
# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_sf)
# values of p to test
powers2try <- seq(0.5,6,by=0.5)
nPowers <- length(powers2try)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=nPowers,nrow=m)
rmse <- matrix(0,ncol=nPowers,nrow=m)
set.seed(7)
for(i in 1:m){
# select rows for testData subset
rows4test <- sample(x = 1:n,size = n*0.1)
# create testData subset
testData <- ozonePoints_sf[rows4test,]
# create trainData subset
trainData <- ozonePoints_sf[-rows4test,]
for(j in 1:nPowers){
# IDW model with trainData subset
idw_model <- gstat(formula=ozone~1,
locations = trainData,
set=list(idp = powers2try[j]))
# test IDW model compared to observed values in testData
obs <- testData$ozone
# pull predictions from IDW model
preds <- predict(idw_model,newdata = testData, debug.level = 0) |>
pull(var1.pred)
# pull R^2 and RMSE
rsq[i,j] <- cor(obs, preds)^2
rmse[i,j] <- sqrt(mean((preds - obs)^2))
}
}
# get mean rsq and rmse for each power
skillDF <- data.frame(power = powers2try,
rsq = colMeans(rsq),
rmse = colMeans(rmse)) |>
pivot_longer(cols = -power)
ggplot(data = skillDF, aes(x = power, y = value)) +
geom_point() +
geom_line() +
facet_wrap(~name, scales = "free_y") +
labs(x="IDW Power",
title="10% Data Withheld, 50 Iterations")
Figure 1.2a: Mean RMSE and \(R^2\) for cross-validation IDW tests for
values of p from 0.5 to 6 at intervals of 0.5. For each value of p, 50
iterations were run with a random 10% of the data withheld for
validation.
Root mean square error (RMSE) decreases dramatically from p = 0.5 to p = 1.5, then begins to increase beyond p ≈ 2.5. \(R^2\) does not peak until p approaches 3, then decreases again. Balancing the two criteria, I selected p = 2.5 for the final IDW interpolation.
# final IDW model using the selected power, p = 2.5
idw_ozone <- gstat(formula=ozone~1,
locations = ozonePoints_sf,
set=list(idp = 2.5))
idw_ozone_sf <- predict(idw_ozone, ozoneGrids_sf, debug.level = 0)
# convert to raster
idw_ozone_rast <- sf_2_rast(idw_ozone_sf)
idw_ozone_rast
## class : SpatRaster
## dimensions : 253, 217, 1 (nrow, ncol, nlyr)
## resolution : 4131.043, 4131.043 (x, y)
## extent : -363987.7, 532448.7, -597814.2, 447339.8 (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310)
## source(s) : memory
## name : var1.pred
## min value : 4.366395
## max value : 84.565024
ggplot() +
geom_raster(data = idw_ozone_rast,
mapping = aes(x = x, y = y,
fill = var1.pred), alpha = 0.8) +
scale_fill_gradientn(colors = (pal1),
name = "Ozone (ppb)",
na.value = "transparent") +
labs(title = "IDW-Modeled Ozone Concentrations",
subtitle = "IDW: p = 2.5",
x = "Easting (m)", y = "Northing (m)") +
coord_equal() +
theme_minimal()
ggplot() +
geom_raster(data = idw_ozone_rast,
mapping = aes(x = x, y = y,
fill = var1.pred), alpha = 0.8) +
geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
shape = 21, alpha = 0.8) +
scale_fill_gradientn(colors = (pal1),
name = "Ozone (ppb)",
na.value = "transparent") +
labs(title = "IDW-Modeled Ozone Concentrations",
subtitle = "IDW: p = 2.5",
x = "Longitude", y = "Latitude") +
scale_size(guide = "none") +
coord_sf() +
theme_minimal()
Figure 1.2b: Modeled ozone concentrations across California, USA. Inverse distance weighting (IDW) interpolation using measurements from 345 stations (see points on second figure). A value of p = 2.5 for the weights matrix was selected based on optimized \(R^2\) and RMSE values (\(R^2\) = 0.44, RMSE = 8.12).
There is some smoothness in the IDW surface, particularly where ozone concentrations are low and not particularly variable. Where measurements spike, the surrounding surface is slightly pock-marked, suggesting the IDW model could be improved upon (though the nature of ozone production might mean those craters are, in fact, significant). However, IDW is a deterministic method of interpolation, so there is no probability distribution associated with these values and no margin of confidence.
Thin-plate spline (TPS) interpolation is an alternative to IDW that is still deterministic, but instead of taking locally weighted averages it fits a single smooth surface through the observations to approximate values at unknown locations.
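Roughly speaking (and as implemented in fields::Tps() below), the fitted surface \(f\) minimizes a penalized sum of squares \[\sum_{i=1}^n \left(Z(s_i) - f(s_i)\right)^2 + \lambda J(f),\] where \(J(f)\) is a roughness (bending-energy) penalty and the smoothing parameter \(\lambda\) is chosen by generalized cross-validation; the GCV and lambda estimates appear in the model summary below.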
ozone_TPSmodel <- Tps(x = ozonePoints_df[,2:3], Y = ozonePoints_df[,1])
ozone_TPSmodel
## Call:
## Tps(x = ozonePoints_df[, 2:3], Y = ozonePoints_df[, 1])
##
## Number of Observations: 345
## Number of parameters in the null space 3
## Parameters for fixed spatial drift 3
## Model degrees of freedom: 90.5
## Residual degrees of freedom: 254.5
## GCV estimate for tau: 6.218
## MLE for tau: 5.922
## MLE for sigma: 261100
## lambda 0.00013
## User supplied sigma NA
## User supplied tau^2 NA
## Summary of estimates:
## lambda trA GCV tauHat -lnLike Prof converge
## GCV 0.0001342780 90.52659 52.41506 6.217842 1198.639 3
## GCV.model NA NA NA NA NA NA
## GCV.one 0.0001342780 90.52659 52.41506 6.217842 NA 3
## RMSE NA NA NA NA NA NA
## pure error NA NA NA NA NA NA
## REML 0.0008693386 45.70955 54.26053 6.860865 1188.519 7
# Predict TPS model over ozoneGrid
ozone_TPSpreds <- c(predict(object=ozone_TPSmodel, x = gridCoords))
# store in a data.frame with the x,y coordinates
ozone_TPS <- data.frame(x = gridCoords[,1],
y = gridCoords[,2],
ozone = ozone_TPSpreds)
# convert to a SpatRaster with terra
ozone_TPSrast <- rast(ozone_TPS,
crs = crs(ozonePoints_sf))
ozone_TPSrast
## class : SpatRaster
## dimensions : 253, 217, 1 (nrow, ncol, nlyr)
## resolution : 4131.043, 4131.043 (x, y)
## extent : -363987.7, 532448.7, -597814.2, 447339.8 (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310)
## source(s) : memory
## name : ozone
## min value : 13.00884
## max value : 63.56690
ggplot() +
geom_spatraster(data = ozone_TPSrast, aes(fill = ozone), alpha = 0.9) +
scale_fill_gradientn(colors = pal1,
name = "Ozone (ppb)",
na.value = "transparent") +
labs(title = "TPS-Modeled Ozone Concentrations",
x = "Longitude", y = "Latitude") +
theme_minimal()
ggplot() +
geom_spatraster(data = ozone_TPSrast, aes(fill = ozone), alpha = 0.9) +
scale_fill_gradientn(colors = pal1,
name = "Ozone (ppb)",
na.value = "transparent") +
labs(title = "TPS-Modeled Ozone Concentrations",
x = "Longitude", y = "Latitude") +
theme_minimal() +
geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
shape = 21, alpha = 0.8)
Figure 1.3a: Ozone concentrations modeled with Thin-Plate Splines (TPS). Cross-validation mean \(R^2\) = 0.52, RMSE = 7.18. Points (in second figure) show empirical measurements from 345 stations.
The TPS model does not predict the observed values at known locations as accurately as the IDW model did (\(R^2\) = 0.751, RMSE = 5.347), but it does produce a lower average RMSE (7.18) and higher average \(R^2\) (0.52) than IDW in cross-validation tests. The TPS surface is smoother than the IDW surface, spreading out the craters around the more extreme points. This may or may not be a more accurate prediction, depending on the nature of ozone production in those locations. However, the second figure highlights how the TPS model underpredicts high ozone concentrations (see scale differences).
Kriging is a probabilistic method of interpolation: it uses a variogram model based on experimental semivariance values to predict values of \(\hat Z\) at unknown locations.
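For ordinary kriging, the prediction is a weighted sum of the observations, \[\hat Z(s_0) = \sum_{i=1}^{n} \lambda_i Z(s_i), \qquad \sum_{i=1}^{n} \lambda_i = 1,\] with the weights \(\lambda_i\) chosen to minimize the expected squared prediction error given the fitted variogram. Unlike IDW or TPS, the same machinery also returns a prediction variance at every location (the var1.var column in the output below).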
# using autofitVariogram() to produce experimental and fitted variogram
## tests Gaussian, Spherical, and Exponential fits and produces the best
ozone_KrigeVar <- autofitVariogram(ozonePoints_sf$ozone~1,
input_data = ozonePoints_sf,
model = c("Gau", "Sph", "Exp"))
plot(ozone_KrigeVar)
Figure 1.4a: Variogram of ozone concentrations.
The best-fit variogram uses a spherical model with nugget = 24, sill = 102, and range = 100611.
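For reference, the fitted spherical model has the form \[\gamma(h) = c_0 + c\left(\frac{3h}{2a} - \frac{h^3}{2a^3}\right) \quad \text{for } 0 < h \le a, \qquad \gamma(h) = c_0 + c \quad \text{for } h > a,\] where \(c_0\) is the nugget, \(c\) the partial sill, and \(a\) the range.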
# build the variogram model used for kriging (and later cross-validation)
# observational variogram
ozone_var <- variogram(ozone~1, ozonePoints_sf)
# Variogram model
ozone_Krigemod <- vgm(psill = 103, model = "Sph", range = 100611, nugget = 24)
# fit variogram model
ozone_fit <- fit.variogram(object = ozone_var, model = ozone_Krigemod)
ozone_Gstat <- gstat(formula = ozone~1, locations = ozonePoints_sf,
model = ozone_fit)
ozoneKrige_sf <- predict(ozone_Gstat, newdata = ozoneGrids_sf)
## [using ordinary kriging]
ozoneKrige_sf
## Simple feature collection with 22589 features and 2 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -361922.2 ymin: -595748.7 xmax: 530383.2 ymax: 445274.2
## Projected CRS: NAD83 / California Albers
## First 10 features:
## var1.pred var1.var geometry
## 1 23.27353 67.72911 POINT (-341267 445274.2)
## 2 22.87074 64.78668 POINT (-337135.9 445274.2)
## 3 22.61495 63.01437 POINT (-333004.9 445274.2)
## 4 22.55167 62.87221 POINT (-328873.9 445274.2)
## 5 22.69510 64.39531 POINT (-324742.8 445274.2)
## 6 23.01619 67.16777 POINT (-320611.8 445274.2)
## 7 23.46274 70.65231 POINT (-316480.7 445274.2)
## 8 23.96500 74.42233 POINT (-312349.7 445274.2)
## 9 24.48401 78.19729 POINT (-308218.6 445274.2)
## 10 24.99937 81.80175 POINT (-304087.6 445274.2)
ozoneKrige_rast <- sf_2_rast(ozoneKrige_sf)
ozoneKrige_rast
## class : SpatRaster
## dimensions : 253, 217, 1 (nrow, ncol, nlyr)
## resolution : 4131.043, 4131.043 (x, y)
## extent : -363987.7, 532448.7, -597814.2, 447339.8 (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310)
## source(s) : memory
## name : var1.pred
## min value : 13.81176
## max value : 67.96435
ggplot() +
geom_spatraster(data = ozoneKrige_rast, aes(fill = var1.pred),
alpha = 0.9) +
labs(title = "Kriging-Modeled Ozone Concentrations",
x = "Longitude", y = "Latitude") +
scale_fill_gradientn(colors = (pal1),
name = "Ozone (ppb)",
na.value = "transparent") +
theme_minimal()
ggplot() +
geom_spatraster(data = ozoneKrige_rast, aes(fill = var1.pred),
alpha = 0.9) +
labs(title = "Kriging-Modeled Ozone Concentrations",
x = "Longitude", y = "Latitude") +
scale_fill_gradientn(colors = (pal1),
name = "Ozone (ppb)",
na.value = "transparent") +
theme_minimal() +
geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
shape = 21, alpha = 0.8)
Figure 1.4b: Ozone concentrations modeled with ordinary kriging. K-fold cross-validation mean \(R^2\) = 0.57, mean RMSE = 6.71, with k = 50. Second figure shows empirical measurements from 345 stations.
The kriging surface blends the better features of the TPS and IDW models. Short-distance changes that appeared as craters in the IDW surface are smoothed but still present after kriging, without the over-smoothing of the TPS surface. Looking back at the original map (which includes elevation), there appears to be some coincidence of high ozone levels and higher elevation.
The cross-validated fits of the kriged surface and the TPS model are strikingly similar, though kriging predicts values at known locations slightly more accurately (\(R^2\) = 0.83 vs 0.75 for the TPS model).
Regression kriging combines a linear model of predictor variables with kriging of the model residuals, incorporating spatial structure and improving upon the base regression.
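In general terms, the approach decomposes the response into a deterministic trend plus a spatially correlated residual, \[Z(s) = \sum_{k} \beta_k\, x_k(s) + \varepsilon(s),\] where the covariates \(x_k(s)\) enter an ordinary regression and the residual \(\varepsilon(s)\) is modeled with a variogram; when the covariates are included in the gstat formula, the prediction step is carried out as universal kriging (as the output below reports).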
Variable <- names(ozonePoints_sf)
Units <- c("ppb", "m", "radians", rep("unitless", 5))
Description <- c("Average Daily Ozone 1980-1999",
"Elevation",
"Slope",
"Solar radiation",
"Night lights density",
"Traffic density",
"Tree density",
"Hillshade")
Source <- c("California Air Resources Board",
"USGS National Elevation Dataset",
rep("Derived from Elevation", 2),
"NASA",
"California Office of Env Health",
"NASA",
"Derived from Elevation")
ozone_vars <- data.frame(Variable = Variable[1:8],
Units = Units, Description = Description,
Source = Source)
kable(ozone_vars, "simple")
| Variable | Units | Description | Source |
|---|---|---|---|
| ozone | ppb | Average Daily Ozone 1980-1999 | California Air Resources Board |
| elev | m | Elevation | USGS National Elevation Dataset |
| slope | radians | Slope | Derived from Elevation |
| rad | unitless | Solar radiation | Derived from Elevation |
| lights | unitless | Night lights density | NASA |
| traffic | unitless | Traffic density | California Office of Env Health |
| tree | unitless | Tree density | NASA |
| hill | unitless | Hillshade | Derived from Elevation |
Of the predictor variables available, three stand out as most likely to influence ozone concentration: traffic, elevation, and solar radiation. Traffic and urban areas produce more \(NO_x\) pollutants that react with sunlight to produce ozone (\(O_3\)). Elevation increases exposure to solar radiation, and is in fact how the radiation metric was derived. I set a base model with elevation, radiation, and traffic density as predictors, then assessed their influence with ANOVA before proceeding to Regression Kriging.
# linear model (OLS) with no spatial consideration (yet)
ozone_lm <- lm(ozone ~ elev + rad + traffic, data = ozonePoints_sf)
summary(ozone_lm)
##
## Call:
## lm(formula = ozone ~ elev + rad + traffic, data = ozonePoints_sf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.167 -5.025 -0.479 4.090 44.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.3695930 1.3268034 22.136 < 2e-16 ***
## elev 0.0086736 0.0009388 9.239 < 2e-16 ***
## rad 0.2815676 1.4886421 0.189 0.85009
## traffic -6.3211861 2.1489999 -2.941 0.00349 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.103 on 341 degrees of freedom
## Multiple R-squared: 0.2715, Adjusted R-squared: 0.265
## F-statistic: 42.35 on 3 and 341 DF, p-value: < 2.2e-16
anova(ozone_lm)
## Analysis of Variance Table
##
## Response: ozone
## Df Sum Sq Mean Sq F value Pr(>F)
## elev 1 9810.8 9810.8 118.3928 < 2e-16 ***
## rad 1 0.7 0.7 0.0090 0.92456
## traffic 1 717.0 717.0 8.6522 0.00349 **
## Residuals 341 28257.5 82.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kable(anova(ozone_lm))
| Df | Sum Sq | Mean Sq | F value | Pr(>F) | |
|---|---|---|---|---|---|
| elev | 1 | 9.810804e+03 | 9810.8040379 | 118.3927593 | 0.0000000 |
| rad | 1 | 7.441989e-01 | 0.7441989 | 0.0089807 | 0.9245560 |
| traffic | 1 | 7.169747e+02 | 716.9746843 | 8.6521564 | 0.0034901 |
| Residuals | 341 | 2.825751e+04 | 82.8665883 | NA | NA |
Elevation is a good predictor of ozone concentration (\(F_{1,341}\) = 118.4, p < 0.001), but solar radiation is, in fact, a poor predictor (\(F_{1,341}\) = 0.009, p = 0.92). Traffic density, however, remains a significant predictor, even with an alpha corrected for multiple comparisons (\(F_{1,341}\) = 8.65, p = 0.003).
I reduced the model to include only the significant predictors: elevation and traffic density.
# linear model (OLS) with no spatial consideration (yet)
ozone_lm <- lm(ozone ~ elev + traffic, data = ozonePoints_sf)
summary(ozone_lm)
##
## Call:
## lm(formula = ozone ~ elev + traffic, data = ozonePoints_sf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.151 -5.094 -0.481 4.149 43.887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.5401303 0.9720104 30.391 < 2e-16 ***
## elev 0.0086557 0.0009327 9.280 < 2e-16 ***
## traffic -6.2820144 2.1359800 -2.941 0.00349 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.09 on 342 degrees of freedom
## Multiple R-squared: 0.2714, Adjusted R-squared: 0.2671
## F-statistic: 63.69 on 2 and 342 DF, p-value: < 2.2e-16
anova(ozone_lm)
## Analysis of Variance Table
##
## Response: ozone
## Df Sum Sq Mean Sq F value Pr(>F)
## elev 1 9810.8 9810.8 118.7275 < 2.2e-16 ***
## traffic 1 714.8 714.8 8.6497 0.003494 **
## Residuals 342 28260.5 82.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Per the variogram in section 1.4 (reproduced below), there is spatial autocorrelation in ozone concentrations. Furthermore, there is spatial autocorrelation in the residuals of the regression model, so this model could be improved by including spatial structure with kriging.
ozone_KrigeVar <- autofitVariogram(ozonePoints_sf$ozone~1,
input_data = ozonePoints_sf,
model = c("Gau", "Sph", "Exp"))
plot(ozone_KrigeVar)
# observational variogram
ozone_var <- variogram(ozone~1, ozonePoints_sf)
# Set up gstat object with regression model
ozone_KRGstat <- gstat(id = "ozone_KRmodel",
formula = ozone~elev + traffic,
data = ozonePoints_sf)
# variogram of model residuals
ozone_KRvar <- variogram(ozone_KRGstat)
plot(ozone_KRvar, pch=20, cex=1.5, col="black",
ylab=expression("Semivariance ("*gamma*")"),
xlab="Distance (m)", main = "Model Residuals")
# fit variogram model
ozone_KRfit_Sph <- fit.variogram(ozone_KRvar,
vgm(psill = 70,
model = "Sph",
range = 100000,
nugget = 40))
ozone_KRfit_Sph
## model psill range
## 1 Nug 31.34335 0.00
## 2 Sph 38.58030 84162.72
plot(ozone_KRvar, ozone_KRfit_Sph,
pch=20, cex=1.5, col="black",
ylab=expression("Semivariance ("*gamma*")"),
xlab="Distance (m)", main = "Model Residuals")
# fit variogram model
ozone_KRfit_Gau <- fit.variogram(ozone_KRvar,
vgm(psill = 70,
model = "Gau",
range = 100000,
nugget = 40))
## Warning in fit.variogram(ozone_KRvar, vgm(psill = 70, model = "Gau", range =
## 1e+05, : No convergence after 200 iterations: try different initial values?
ozone_KRfit_Gau
## model psill range
## 1 Nug 36.45778 0.00
## 2 Gau 33.78781 36529.32
plot(ozone_KRvar, ozone_KRfit_Gau,
pch=20, cex=1.5, col="black",
ylab=expression("Semivariance ("*gamma*")"),
xlab="Distance (m)", main = "Model Residuals")
# fit variogram model
ozone_KRfit_Exp <- fit.variogram(ozone_KRvar,
vgm(psill = 70,
model = "Exp",
range = 100000,
nugget = 40))
ozone_KRfit_Exp
## model psill range
## 1 Nug 22.10499 0.00
## 2 Exp 49.08628 30608.03
plot(ozone_KRvar, ozone_KRfit_Exp,
pch=20, cex=1.5, col="black",
ylab=expression("Semivariance ("*gamma*")"),
xlab="Distance (m)", main = "Model Residuals")
The best variogram model for the residuals is the spherical model, with nugget = 31.34, partial sill = 38.58, and range = 84,163 m, though all three models fit reasonably well at short distances, with a good deal of noise at greater distances.
# Update the gstat object with the variogram
ozone_KRGstat_up <- gstat(ozone_KRGstat, id = "ozone_KRmodel",
model = ozone_KRfit_Sph)
# predict across grid
ozone_KR_sf <- predict(ozone_KRGstat_up, newdata = ozoneGrids_sf)
## [using universal kriging]
ozone_KR_sf
## Simple feature collection with 22589 features and 2 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -361922.2 ymin: -595748.7 xmax: 530383.2 ymax: 445274.2
## Projected CRS: NAD83 / California Albers
## First 10 features:
## ozone_KRmodel.pred ozone_KRmodel.var geometry
## 1 25.07461 56.80418 POINT (-341267 445274.2)
## 2 25.32704 55.24772 POINT (-337135.9 445274.2)
## 3 25.67620 54.32472 POINT (-333004.9 445274.2)
## 4 25.44801 54.24509 POINT (-328873.9 445274.2)
## 5 25.68887 55.03646 POINT (-324742.8 445274.2)
## 6 27.66337 56.59941 POINT (-320611.8 445274.2)
## 7 28.06587 58.34317 POINT (-316480.7 445274.2)
## 8 28.01785 60.13423 POINT (-312349.7 445274.2)
## 9 28.88968 61.97526 POINT (-308218.6 445274.2)
## 10 30.82628 63.86217 POINT (-304087.6 445274.2)
ozone_KR_rast <- sf_2_rast(ozone_KR_sf)
ozone_KR_rast
## class : SpatRaster
## dimensions : 253, 217, 1 (nrow, ncol, nlyr)
## resolution : 4131.043, 4131.043 (x, y)
## extent : -363987.7, 532448.7, -597814.2, 447339.8 (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310)
## source(s) : memory
## name : ozone_KRmodel.pred
## min value : 10.80522
## max value : 64.24551
ggplot() +
geom_spatraster(data = ozone_KR_rast, aes(fill = ozone_KRmodel.pred),
alpha = 0.9) +
labs(title = "Regression Kriging-Modeled Ozone Concentrations",
x = "Longitude", y = "Latitude") +
scale_fill_gradientn(colors = (pal1),
name = "Ozone (ppb)",
na.value = "transparent") +
theme_minimal()
ggplot() +
geom_spatraster(data = ozone_KR_rast, aes(fill = ozone_KRmodel.pred),
alpha = 0.9) +
labs(title = "Regression Kriging-Modeled Ozone Concentrations",
x = "Longitude", y = "Latitude") +
scale_fill_gradientn(colors = (pal1),
name = "Ozone (ppb)",
na.value = "transparent") +
theme_minimal() +
geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
shape = 21, alpha = 0.8)
Figure 1.5a: Ozone concentrations modeled with regression kriging with \(Ozone \sim Elevation + Traffic\).
The prediction surface after regression kriging with elevation and traffic density as predictor variables has far greater detail in its contours than the previous interpolation methods. High values are not diminished and are interspersed with veins of moderate values, without the crater-like spots of IDW or the oversmoothing of TPS.
obs <- ozonePoints_df[,1]
preds <- extract(idw_ozone_rast, ozonePoints_df[,2:3]) |>
pull(var1.pred)
IDW_rsq_known <- cor(obs,preds)^2
IDW_rsq_known
## [1] 0.9497181
IDW_rmse_known <- sqrt(mean((preds - obs)^2))
IDW_rmse_known
## [1] 2.379907
ggplot() +
geom_abline(slope=1,intercept = 0) +
geom_point(aes(x = obs, y = preds)) +
coord_fixed(ratio=1, xlim = range(preds, obs),
ylim = range(preds, obs)) +
labs(x="Observed Values",
y="Predicted Values",
title="Ozone (ppm) IDW Fit")
Figure 1.6a: Observed vs predicted values using IDW interpolation with p = 2.5. The final model fits the observed measurements well, with a high \(R^2\) of 0.95 and low RMSE of 2.38. Multiple cross-validation iterations of the IDW model were performed when optimizing p at 2.5.
obs <- ozonePoints_df[,1]
preds <- extract(ozone_TPSrast, ozonePoints_df[,2:3]) |>
pull(ozone)
TPS_rsq_known <- cor(obs,preds)^2
TPS_rsq_known
## [1] 0.751105
TPS_rmse_known <- sqrt(mean((preds - obs)^2))
TPS_rmse_known
## [1] 5.347147
ggplot() +
geom_abline(slope=1,intercept = 0) +
geom_point(aes(x = obs, y = preds)) +
coord_fixed(ratio=1, xlim = range(preds, obs),
ylim = range(preds, obs)) +
labs(x="Observed Values",
y="Predicted Values",
title="Ozone (ppm) TPS Fit")
Figure 1.6b: Observed vs predicted values from the TPS interpolation model. \(R^2\) = 0.751, RMSE = 5.347.
obs <- ozonePoints_df[,1]
preds <- extract(ozoneKrige_rast, ozonePoints_df[,2:3]) |>
pull(var1.pred)
Kriging_rsq_known <- cor(obs,preds)^2
Kriging_rsq_known
## [1] 0.8295548
Kriging_rmse_known <- sqrt(mean((preds - obs)^2))
Kriging_rmse_known
## [1] 4.59071
ggplot() +
geom_abline(slope=1,intercept = 0) +
geom_point(aes(x = obs, y = preds)) +
coord_fixed(ratio=1, xlim = range(preds, obs),
ylim = range(preds, obs)) +
labs(x="Observed Values",
y="Predicted Values",
title="Ozone (ppm) Kriging Fit")
Figure 1.6c: Observed vs predicted values using the ordinary kriging model. Predicted values are a closer fit to the observed measurements than those of the TPS model (\(R^2\) = 0.83, RMSE = 4.59).
obs <- ozonePoints_df[,1]
preds <- extract(ozone_KR_rast, ozonePoints_df[,2:3]) |>
pull(ozone_KRmodel.pred)
RK_rsq_known <- cor(obs,preds)^2
RK_rsq_known
## [1] 0.7876989
RK_rmse_known <- sqrt(mean((preds - obs)^2))
RK_rmse_known
## [1] 5.045092
ggplot() +
geom_abline(slope=1,intercept = 0) +
geom_point(aes(x = obs, y = preds)) +
coord_fixed(ratio=1, xlim = range(preds, obs),
ylim = range(preds, obs)) +
labs(x="Observed Values",
y="Predicted Values",
title="Ozone (ppm) Kriging Fit")
Figure 1.6d: Observed vs. predicted values at known points using the regression kriging model. \(R^2\) = 0.79, RMSE = 5.05.
# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_sf)
# setup R^2 and RMSE vectors for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)
set.seed(7)
for(i in 1:m){
# select rows for testData subset
rows4test <- sample(x = 1:n,size = n*0.1)
# create testData subset
testData <- ozonePoints_sf[rows4test,]
# create trainData subset
trainData <- ozonePoints_sf[-rows4test,]
# IDW model with trainData subset
idw_model <- gstat(formula=ozone~1,
locations = trainData,
set=list(idp = 2.5))
# test IDW model compared to observed values in testData
obs <- testData$ozone
# pull predictions from IDW model
preds <- predict(idw_model, newdata = testData, debug.level = 0) |>
pull(var1.pred)
# pull R^2 and RMSE
rsq[i] <- cor(obs, preds)^2
rmse[i] <- sqrt(mean((preds - obs)^2))
}
IDW_rsq <- mean(rsq)
IDW_rmse <- mean(rmse)
IDW_rsq
## [1] 0.4419662
IDW_rmse
## [1] 8.121792
# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_df)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)
set.seed(42)
for(i in 1:m){
# select rows for testData subset
rows4test <- sample(x = 1:n,size = n*0.1)
# create testData subset
testData <- ozonePoints_df[rows4test,]
# create trainData subset
trainData <- ozonePoints_df[-rows4test,]
# TPS model with trainData subset
tps_model <- Tps(x = trainData[,2:3], Y = trainData[,1])
# Predict TPS train model over ozoneGrid
tps_preds <- c(predict(object=tps_model, x = gridCoords))
# store in a data.frame with the x,y coordinates
tps_df <- data.frame(x = gridCoords[,1],
y = gridCoords[,2],
ozone = tps_preds)
# convert to a SpatRaster with terra
tps_rast <- rast(tps_df,
crs = crs(ozonePoints_sf))
# pull observed from testData
obs <- testData$ozone
# pull predictions from TPS model
preds <- extract(tps_rast, testData[,2:3]) |>
pull(ozone)
# pull R^2 and RMSE
rsq[i] <- cor(obs, preds)^2
rmse[i] <- sqrt(mean((preds - obs)^2))
}
TPS_rsq <- mean(rsq)
TPS_rmse <- mean(rmse)
TPS_rsq
## [1] 0.5193196
TPS_rmse
## [1] 7.181533
# k-fold cross validation
set.seed(42)
n <- 50
ozone_kfold_sf <- krige.cv(formula = ozone ~ 1,
locations = ozonePoints_sf,
model = ozone_fit,
nfold = n,
verbose = FALSE)
ozone_kfold <- data.frame(observed = ozone_kfold_sf$observed,
var1.pred = ozone_kfold_sf$var1.pred,
fold = ozone_kfold_sf$fold)
rsq <- 1:n
rmse <- 1:n
fold <- 1:n
mod_fit <- data.frame(rsq, rmse, fold)
for (i in 1:n) {
ozone_kfold_sub <- subset(ozone_kfold, fold == i)
mod_fit$rsq[i] <- cor(ozone_kfold_sub$observed, ozone_kfold_sub$var1.pred)^2
mod_fit$rmse[i] <- sqrt(mean((ozone_kfold_sub$observed - ozone_kfold_sub$var1.pred)^2))
}
Kriging_rsq <- mean(mod_fit$rsq)
Kriging_rmse <- mean(mod_fit$rmse)
Kriging_rsq
## [1] 0.5697979
Kriging_rmse
## [1] 6.70588
# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_df)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)
set.seed(42)
for (i in 1:m) {
# select rows for testData subset
rows4test <- sample(x = 1:n, size = n*0.1)
# create testData subset
testData <- ozonePoints_sf[rows4test,]
# create trainData subset
trainData <- ozonePoints_sf[-rows4test,]
# Set up gstat object with regression model
ozone_RKGstat <- gstat(id = "ozone",
formula = ozone~elev + traffic,
data = trainData)
# variogram of model residuals
ozone_RKvar <- variogram(ozone_RKGstat)
# fit variogram model
ozone_RKfit_Sph <- fit.variogram(ozone_RKvar,
vgm(psill = 70,
model = "Sph",
range = 100000,
nugget = 40))
# Update the gstat object with the variogram
ozone_RKGstat_up <- gstat(ozone_RKGstat, id = "ozone",
model = ozone_RKfit_Sph)
# predict across grid
ozone_RK_sf <- predict(ozone_RKGstat_up, newdata = testData, debug.level = 0)
# pull observed from testData
obs <- testData$ozone
# pull predictions from RK model
preds <- ozone_RK_sf$ozone.pred
# pull R^2 and RMSE
rsq[i] <- cor(obs,preds)^2
rmse[i] <- sqrt(mean((preds - obs)^2))
}
RK_rsq <- mean(rsq)
RK_rmse <- mean(rmse)
RK_rsq
## [1] 0.5362585
RK_rmse
## [1] 6.895541
Method <- c("IDW", "TPS", "Kriging", "RK")
Rsq_cv <- c(IDW_rsq, TPS_rsq, Kriging_rsq, RK_rsq)
rmse_cv <- c(IDW_rmse, TPS_rmse, Kriging_rmse, RK_rmse)
Rsq_known <- c(IDW_rsq_known, TPS_rsq_known, Kriging_rsq_known, RK_rsq_known)
rmse_known <- c(IDW_rmse_known, TPS_rmse_known, Kriging_rmse_known, RK_rmse_known)
skill_df <- data.frame(Method = Method,
Rsq_cv = Rsq_cv,
rmse_cv = rmse_cv,
Rsq_known = Rsq_known,
rmse_known = rmse_known)
kable(skill_df)
| Method | Rsq_cv | rmse_cv | Rsq_known | rmse_known |
|---|---|---|---|---|
| IDW | 0.4419662 | 8.121792 | 0.9497181 | 2.379907 |
| TPS | 0.5193196 | 7.181533 | 0.7511050 | 5.347147 |
| Kriging | 0.5697979 | 6.705880 | 0.8295548 | 4.590711 |
| RK | 0.5362585 | 6.895541 | 0.7876989 | 5.045092 |
Table 1.6: \(R^2\) and RMSE for the four models: IDW, TPS, Kriging, and Regression Kriging. The “cv” values show the cross-validated \(R^2\) and RMSE goodness of fit tests. The “known” values are calculated for the final model across all 345 known locations to assess how well the model predicts locations with empirical measurements.
\(R^2\) - or the proportion of variance explained by the model - is lowest with IDW and highest (0.57) with ordinary kriging, which also has the lowest cross-validated RMSE (6.71); regression kriging is a close second (\(R^2\) = 0.54, RMSE = 6.90). The best model is ultimately subjective, and more knowledge of how surface ozone concentrations are influenced by elevation would, I think, be critical in deciding whether the ordinary kriging or regression kriging model is more appropriate. Their skill is similar, but the RK surface is visibly influenced by elevation.
These data contain species composition measurements for 533 plots of 100 m^2 across the Raif section of the Volga-Kama Nature Reserve in Tatarstan, Russia. Here, I explored how distinguishable these plots are based on temperature and moisture.
boreal <- read.csv("data/boreal.csv")
# head(boreal)
pal2 <- pnw_palette("Lake", n = nrow(boreal))
pal3 <- pnw_palette("Sunset2", n = nrow(boreal))
pal4 <- pnw_palette("Shuksan2", n = nrow(boreal))
pal4 <- rev(pal4)
\[BI = \sqrt{\frac{1000 (S+1)}{n}} \] where \(S\) is the number of boreal species and \(n\) is the total number of species found in each plot.
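For example, a plot containing \(S = 5\) boreal species out of \(n = 20\) total species would have \(BI = \sqrt{1000 \times 6 / 20} \approx 17.3\).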
boreal$BI <- sqrt((1000*(boreal$nBor + 1)) / boreal$nTot)
# head(boreal)
boreal_sf <- st_as_sf(boreal, coords = c("easting", "northing"), crs = 32639)
ggplot() +
geom_sf(data = boreal_sf, aes(col = BI), alpha = 0.8) +
guides(size = "none") +
scale_color_gradientn(colors = pal2) +
theme_minimal()
At a cursory glance, the boreal index does appear to be spatially autocorrelated, with high values occurring closer to other high values, and vice versa.
ggplot() +
geom_sf(data = boreal_sf, aes(col = tmp), alpha = 0.8) +
guides(size = "none") +
scale_color_gradientn(colors = pal3) +
theme_minimal()
ggplot() +
geom_sf(data = boreal_sf, aes(col = wet), alpha = 0.8) +
guides(size = "none") +
scale_color_gradientn(colors = pal4) +
theme_minimal()
\[ y = \beta_0 + \beta_1x_1 + \beta_2x_2 \] where \(y = BI\), \(x_1 = Temp\), and \(x_2 = Wet\) such that
\[ BI \sim Temp + Wet \]
# fit model assuming no autocorrelation in residuals with gls()
glsNaive <- gls(BI ~ tmp + wet, boreal)
summary(glsNaive)
## Generalized least squares fit by REML
## Model: BI ~ tmp + wet
## Data: boreal
## AIC BIC logLik
## 2830.709 2847.801 -1411.355
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -212.81280 55.10855 -3.861702 1e-04
## tmp 0.77901 0.18560 4.197282 0e+00
## wet 152.22303 10.92708 13.930810 0e+00
##
## Correlation:
## (Intr) tmp
## tmp -1.000
## wet 0.302 -0.296
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.91263968 -0.68370547 -0.08842685 0.55090391 5.23496551
##
## Residual standard error: 3.437209
## Degrees of freedom: 533 total; 530 residual
# test iterations m
m <- 10
# number of points n
n <- nrow(boreal)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)
set.seed(42)
for (i in 1:m) {
# select rows for testData subset
rows4test <- sample(x = 1:n, size = n*0.1)
# create testData subset
testData <- boreal[rows4test,]
# create trainData subset
trainData <- boreal[-rows4test,]
glsNaive_t <- gls(BI ~ tmp + wet, trainData)
preds <- predict(glsNaive_t, newdata = testData)
# pull observed from testData
obs <- testData$BI
# pull R^2 and RMSE
rsq[i] <- cor(obs,preds)^2
rmse[i] <- sqrt(mean((preds - obs)^2))
}
naive_rsq <- mean(rsq)
naive_rmse <- mean(rmse)
naive_rsq
## [1] 0.3321315
naive_rmse
## [1] 3.73864
The initial naive model (equivalent to ordinary least squares) produces slopes for both temperature and wetness that are significantly different from 0 (temperature: t = 4.20, p < 0.001; wetness: t = 13.9, p < 0.001). Cross-validation shows, however, that the model is not a great fit (average \(R^2\) = 0.33, average RMSE = 3.74).
Naive Model: \[y = -212.813 + 0.779x_1 + 152.223x_2 \], where \(x_1\) = Temp and \(x_2\) = Wet.
boreal$glsNaiveResids <- residuals(glsNaive, type = "normalized")
boreal_sf <- st_as_sf(boreal, coords = c("easting", "northing"), crs = 32639)
# boreal_sf
ggplot() +
geom_sf(data = boreal_sf,
aes(col = glsNaiveResids, size = glsNaiveResids),
alpha = 0.8) +
guides(size = "none") +
scale_color_gradientn(colors = pal2) +
theme_minimal()
Residuals of the naive model also appear to have some spatial structure.
# test for spatial autocorrelation using a correlogram
residsI <- spline.correlog(x = boreal$easting, y = boreal$northing,
z = boreal$glsNaiveResids,
resamp = 50, quiet = TRUE)
# save coordinate points as matrix for dist()
points <- cbind(boreal$easting, boreal$northing)
# plot Moran's I for residuals
plot(residsI, xlim = c(0, max(dist(points))/3))
Moran’s I of the residuals shows that there is spatial autocorrelation in the residuals of the naive model, particularly at distances below ~500m.
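For reference, Moran's I for a variable \(z\) measured at \(n\) locations with spatial weights \(w_{ij}\) is \[I = \frac{n}{\sum_i \sum_j w_{ij}} \cdot \frac{\sum_i \sum_j w_{ij}(z_i - \bar z)(z_j - \bar z)}{\sum_i (z_i - \bar z)^2};\] the spline correlogram plotted above estimates this kind of spatial correlation as a continuous function of distance rather than for a single weights matrix.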
# using autofitVariogram() to produce experimental and fitted variogram
## tests Gaussian, Spherical, and Exponential fits and produces the best
boreal_fit <- autofitVariogram(glsNaiveResids~1, input_data = boreal_sf,
model = c("Gau", "Sph", "Exp"))
plot(boreal_fit)
The best variogram for the residuals is an exponential model with nugget = 0.41, sill = 1, and range = 105. This will be used as a correlation structure to find the variance-covariance matrix for the updated model.
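In effect, once the fitted exponential correlation is folded into an error covariance matrix \(\Sigma\), the coefficients are estimated by generalized least squares, \[\hat\beta = (X^\top \Sigma^{-1} X)^{-1} X^\top \Sigma^{-1} y,\] rather than the ordinary least squares form \((X^\top X)^{-1} X^\top y\), which down-weights clusters of nearby, highly correlated plots when estimating the slopes and their standard errors.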
csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")
glsUpdated <- update(glsNaive, correlation = csSpatial)
summary(glsUpdated)
## Generalized least squares fit by REML
## Model: BI ~ tmp + wet
## Data: boreal
## AIC BIC logLik
## 2733.72 2759.357 -1360.86
##
## Correlation Structure: Exponential spatial correlation
## Formula: ~easting + northing
## Parameter estimate(s):
## range nugget
## 460.3758216 0.5045189
##
## Coefficients:
## Value Std.Error t-value p-value
## (Intercept) -77.23520 77.15219 -1.001076 0.3172
## tmp 0.31123 0.26012 1.196514 0.2320
## wet 75.75531 13.51721 5.604361 0.0000
##
## Correlation:
## (Intr) tmp
## tmp -1.000
## wet 0.066 -0.060
##
## Standardized residuals:
## Min Q1 Med Q3 Max
## -2.3203431 -0.5317626 0.1216847 0.7397759 5.1417469
##
## Residual standard error: 3.654787
## Degrees of freedom: 533 total; 530 residual
# test iterations m
m <- 10
# number of points n
n <- nrow(boreal)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)
csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")
set.seed(42)
for (i in 1:m) {
# select rows for testData subset
rows4test <- sample(x = 1:n, size = n*0.1)
# create testData subset
testData <- boreal[rows4test,]
# create trainData subset
trainData <- boreal[-rows4test,]
glsNaive_t <- gls(BI ~ tmp + wet, trainData)
glsUpdated_t <- update(glsNaive_t, correlation = csSpatial)
preds <- predict(glsUpdated_t, newdata = testData)
# pull observed from testData
obs <- testData$BI
# pull R^2 and RMSE
rsq[i] <- cor(obs,preds)^2
rmse[i] <- sqrt(mean((preds - obs)^2))
}
updated_rsq <- mean(rsq)
updated_rmse <- mean(rmse)
updated_rsq
## [1] 0.3331307
updated_rmse
## [1] 3.999107
When using Generalized Least Squares (GLS) to account for spatial structure in Boreal Index, the predictor variables have different significance than in the naive model. Primarily, temperature no longer has a significant slope and is not a good predictor of Boreal Index when spatial autocorrelation is considered (t = 1.20, p = 0.232). Wetness remains a good predictor (t = 5.60, p < 0.001). After cross-validation, the model explains roughly the same amount of variation in BI: mean \(R^2\) = 0.333, average RMSE = 4.0.
Updated model: \[y = -77.235 + 0.311 x_1 + 75.755 x_2 \] where \(x_1\) = Temp and \(x_2\) = Wet.
Improving the model with spatial structure through Generalized Least Squares regression draws questions about the importance of temperature as an effective predictor variable for Boreal Index across these plots. In the naive model, temperature is a significant predictor, but it does not remain significant when the model is updated with spatial structure.
Given that our dependent variable, Boreal Index, is a measure of how many boreal-exclusive trees are present vs their more southerly counterparts, it makes sense ecologically that BI in this forest would be more dependent on moisture levels than temperature. Temperature is a more variable environmental factor than wetness typically, whereas moisture levels can be retained for longer periods of time. Moisture would therefore be more indicative of long-term conditions that might limit growth of certain types of trees. Since the Raif region consists of a rare cross-over of boreal and southern species, presence/absence of certain species within each plot is most likely due to those factors that affect habitat suitability the most. I would anticipate that moisture (especially in a colder climate like Russia) will impact habitat suitability more than temperature, which can fluctuate a great deal daily and seasonally.
Overall, however, the updated model explains only marginally more of the variation in BI than the naive model and has a higher RMSE after cross-validation. Plotting the observed BI values against the predicted values also suggests that the updated model may actually be doing a worse job of predicting the known values.
preds <- predict(glsNaive, newdata = boreal)
obs <- boreal$BI
# pull R^2 and RMSE
naive_rsq <- cor(obs,preds)^2
naive_rmse <- sqrt(mean((preds - obs)^2))
ggplot() +
geom_abline(slope=1,intercept = 0) +
geom_point(aes(x = obs, y = preds)) +
coord_fixed(ratio=1, xlim = range(preds, obs),
ylim = range(preds, obs)) +
labs(x="Observed Values",
y="Predicted Values",
title="BI Naive Model Fit")
preds <- predict(glsUpdated, newdata = boreal)
obs <- boreal$BI
# pull R^2 and RMSE
upd_rsq <- cor(obs,preds)^2
upd_rmse <- sqrt(mean((preds - obs)^2))
ggplot() +
geom_abline(slope=1,intercept = 0) +
geom_point(aes(x = obs, y = preds)) +
coord_fixed(ratio=1, xlim = range(preds, obs),
ylim = range(preds, obs)) +
labs(x="Observed Values",
y="Predicted Values",
title="BI Updated Model Fit")
This all led me to try one last, “best” model excluding temperature entirely.
glswet_Naive <- gls(BI ~ wet, boreal)
boreal$glswet_NaiveResids <- residuals(glswet_Naive, type = "normalized")
boreal_sf <- st_as_sf(boreal, coords = c("easting", "northing"), crs = 32639)
boreal_wetfit <- autofitVariogram(glswet_NaiveResids~1,
input_data = boreal_sf,
model = c("Gau", "Sph", "Exp"))
plot(boreal_wetfit)
csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")
glswet_Updated <- update(glswet_Naive, correlation = csSpatial)
preds <- predict(glswet_Updated, newdata = boreal)
obs <- boreal$BI
ggplot() +
geom_abline(slope=1,intercept = 0) +
geom_point(aes(x = obs, y = preds)) +
coord_fixed(ratio=1, xlim = range(preds, obs),
ylim = range(preds, obs)) +
labs(x="Observed Values",
y="Predicted Values",
title="BI Updated Model (Wet Only) Fit")
# test iterations m
m <- 10
# number of points n
n <- nrow(boreal)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)
csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")
set.seed(42)
for (i in 1:m) {
# select rows for testData subset
rows4test <- sample(x = 1:n, size = n*0.1)
# create testData subset
testData <- boreal[rows4test,]
# create trainData subset
trainData <- boreal[-rows4test,]
glswet_Naive_t <- gls(BI ~ wet, trainData)
glswet_Updated_t <- update(glswet_Naive_t, correlation = csSpatial)
preds <- predict(glswet_Updated_t, newdata = testData)
# pull observed from testData
obs <- testData$BI
# pull R^2 and RMSE
rsq[i] <- cor(obs,preds)^2
rmse[i] <- sqrt(mean((preds - obs)^2))
}
wet_rsq <- mean(rsq)
wet_rmse <- mean(rmse)
wet_rsq
## [1] 0.3230485
wet_rmse
## [1] 4.05143
As it turns out, temperature is still contributing to the model by explaining a small amount of variance in BI across plots. The GLS updated model with only wetness as a predictor variable has a lower cross-validated \(R^2\) (0.32) and higher RMSE (4.05) than the previous model.
In this case, including spatial structure does not contribute much to the model's predictive skill, even though it changes which predictors appear significant. While spatial structure is an important consideration when autocorrelation is present, sometimes the simpler model (i.e., the naive linear regression model \(BI \sim Temp + Wet\)) provides sufficient predictions to make inference.