library(tidyverse)
library(gstat)
library(terra)
library(tidyterra)
library(ggnewscale)
library(tmap)
library(sf)
library(PNWColors)
library(fields)
library(spatstat)
library(automap)
library(knitr)
library(ncf)
# function to turn sf to raster
sf_2_rast <-function(sfObject,variableIndex = 1){
  # coerce sf to a data.frame
  dfObject <- data.frame(st_coordinates(sfObject),
                         z=as.data.frame(sfObject)[,variableIndex])
  # coerce data.frame to SpatRaster
  rastObject <- rast(dfObject,crs=crs(sfObject))
  
  names(rastObject) <- names(sfObject)[variableIndex]
  
  return(rastObject)
}

1. Geostatistics

ozoneGrids_rast <- readRDS("data/ozoneGrids_rast.rds")
ozoneGrids_sf <- readRDS("data/ozoneGrids_sf.rds")
ozonePoints_sf <- readRDS("data/ozonePoints_sf.rds")
caBnd_sf <- readRDS("data/caBnd_sf.rds")
caBnd_sf
## Simple feature collection with 1 feature and 1 field
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -371692.4 ymin: -604021.9 xmax: 539854.3 ymax: 450479.1
## Projected CRS: NAD83 / California Albers
##           ID                           geom
## 1 california POLYGON ((-194533.8 7189.73...
ozoneCoords <- as.data.frame(st_coordinates(ozonePoints_sf))
ozonePoints_df <- data.frame(ozone = ozonePoints_sf$ozone,
                             ozoneCoords)

gridCoords <- as.data.frame(st_coordinates(ozoneGrids_sf))
pal1 <- pnw_palette("Lake", n = nrow(ozonePoints_sf))
pal2 <- pnw_palette("Lake", n = nrow(ozoneGrids_sf))

1.1 Map

ggplot() +
  geom_sf(data = caBnd_sf) +
  geom_spatraster(data = ozoneGrids_rast, aes(fill = hill)) +
  scale_fill_gradientn(colors = gray.colors(100,
                                            start = 0.1,
                                            end = 0.9), 
                       guide = "none",
                       na.value = "transparent") +
  
  new_scale_fill() +
  geom_spatraster(data = ozoneGrids_rast, aes(fill = elev)) +
  scale_fill_hypso_c(name = "Elevation (m)", 
                     palette = "colombia_hypso",
                     alpha = 0.75) +
  
  geom_sf(data = ozonePoints_sf, 
          aes(col = ozone, size = ozone, alpha = 0.7)) +
  scale_color_gradientn(colors = (pal1),
                        name = "Ozone (ppb)",
                        na.value = "transparent") +
  labs(title = "Ozone Concentrations in California",
       x = "Longitude", y = "Latitude") +
  guides(size = "none", alpha = "none") +
  theme_minimal()

Figure 1.1a: Average daily ozone measurements across 345 stations in California, USA. Color in base map denotes elevation. Points represent measurement stations; size and color indicate ozone concentration in parts per billion.

tmap_mode("view")
## tmap mode set to interactive viewing
tm_shape(ozonePoints_sf) +
  tm_dots(col = "ozone", palette = pal1,
          id = "name") 

Figure 1.1b: Interactive map of average daily ozone measurements across 345 stations in California, USA. Color of point indicates ozone concentration in parts per billion.

1.2 Deterministic: IDW

Inverse distance weighting (IDW) is a deterministic interpolation method that estimates a variable \(\hat Z\) at location \(s_0\) by the weighting formula: \[\hat Z(s_0) = \frac{\sum_{i=1}^n w(s_i)Z(s_i)} {\sum_{i=1}^n w(s_i)} \] where \(w(s_i)\) is the weights of measurements at \(i\) locations: \[ w(s_i) = ||s_i - s_0||^{-p}\]

We can optimize the parameter \(p\) by finding the value with the highest \(R^2\) and lowest RMSE for a training set of the data.

# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_sf)
# values of p to test
powers2try <- seq(0.5,6,by=0.5)
nPowers <- length(powers2try)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=nPowers,nrow=m)
rmse <- matrix(0,ncol=nPowers,nrow=m)

set.seed(7)
for(i in 1:m){
  # select rows for testData subset
  rows4test <- sample(x = 1:n,size = n*0.1)
  # create testData subset
  testData <- ozonePoints_sf[rows4test,]
  # create trainData subset
  trainData <- ozonePoints_sf[-rows4test,]
  
  for(j in 1:nPowers){
    # IDW model with trainData subset
    idw_model <- gstat(formula=ozone~1, 
                       locations = trainData,
                       set=list(idp = powers2try[j]))
    # test IDW model compared to observed values in testData
    obs <- testData$ozone
    # pull predictions from IDW model
    preds <- predict(idw_model,newdata = testData, debug.level = 0) |>
      pull(var1.pred)
    # pull R^2 and RMSE
    rsq[i,j] <- cor(obs, preds)^2
    rmse[i,j] <- sqrt(mean((preds - obs)^2))
  }
}


# get mean rsq and rmse for each power
skillDF <- data.frame(power = powers2try,
                      rsq = colMeans(rsq),
                      rmse = colMeans(rmse)) |>
  pivot_longer(cols = -power)
ggplot(data = skillDF, aes(x = power, y = value)) + 
  geom_point() +
  geom_line() +
  facet_wrap(~name, scales = "free_y") +
  labs(x="IDW Power",
       title="10% Data Withheld, 50 Iterations")

Figure 1.2a: Mean RMSE and \(R^2\) for cross-validation IDW tests for values of p from 0.5 to 6 at intervals of 0.5. For each value of p, 50 iterations were run with a random 10% of the data withheld for validation.

Root mean square error (RMSE) decreases dramatically from p = 0.5 to p = 1.5, then begins to increase after p > 2.5. \(R^2\) does not reach a peak until p approaches 3, then decreases again. Hence, I selected an optimal value of p = 2.5 for the final IDW interpolation.

idw_ozone <- gstat(formula=ozone~1, 
                      locations = ozonePoints_sf,
                      set=list(idp = 3))
idw_ozone_sf <- predict(idw_ozone, ozoneGrids_sf, debug.level = 0)

# convert to raster
idw_ozone_rast <- sf_2_rast(idw_ozone_sf)
idw_ozone_rast
## class       : SpatRaster 
## dimensions  : 253, 217, 1  (nrow, ncol, nlyr)
## resolution  : 4131.043, 4131.043  (x, y)
## extent      : -363987.7, 532448.7, -597814.2, 447339.8  (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310) 
## source(s)   : memory
## name        : var1.pred 
## min value   :  4.366395 
## max value   : 84.565024

IDW: p = 2.5

ggplot() +
  geom_raster(data = idw_ozone_rast,
              mapping = aes(x = x, y = y, 
                            fill = var1.pred), alpha = 0.8) +
  scale_fill_gradientn(colors = (pal1),
                        name = "Ozone (ppb)",
                        na.value = "transparent") +
  labs(title = "IDW-Modeled Ozone Concentrations",
       subtitle = "IDW: p = 2.5",
       x = "Easting (m)", y = "Northing (m)") +
  coord_equal() +
  theme_minimal()

IDW: p = 2.5 + Real Points

ggplot() +
  geom_raster(data = idw_ozone_rast,
              mapping = aes(x = x, y = y, 
                            fill = var1.pred), alpha = 0.8) +
  geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
          shape = 21, alpha = 0.8) +
  scale_fill_gradientn(colors = (pal1),
                        name = "Ozone (ppb)",
                        na.value = "transparent") +
  labs(title = "IDW-Modeled Ozone Concentrations",
       subtitle = "IDW: p = 2.5",
       x = "Longitude", y = "Latitude") +
  scale_size(guide = "none") +
  coord_sf() +
  theme_minimal()

Figure 1.2b: Modeled ozone concentrations across California, USA. Inverse distance weighting (IDW) interpolation using measurements from 345 stations (see points on second figure). A value of p = 2.5 for the weights matrix was selected based on optimized \(R^2\) and RMSE values (\(R^2\) = 0.44, RMSE = 8.12).

There is some smoothness in the IDW surface, particularly where ozone concentrations are low and not particularly variable. Where measurements spike, the surrounding surface is slightly pock-marked, suggesting the IDW model could be improved upon (though the nature of ozone production might mean those craters are, in fact, significant). However, IDW is a deterministic method of interpolation, so there is no probability distribution associated with these values and no margin of confidence.

1.3 Deterministic: TPS

Thin-plate splines is an alternative to IDW that is still deterministic, but uses repeated splines to approximate values at unknown locations.

ozone_TPSmodel <- Tps(x = ozonePoints_df[,2:3], Y = ozonePoints_df[,1])
ozone_TPSmodel
## Call:
## Tps(x = ozonePoints_df[, 2:3], Y = ozonePoints_df[, 1])
##                                                
##  Number of Observations:                345    
##  Number of parameters in the null space 3      
##  Parameters for fixed spatial drift     3      
##  Model degrees of freedom:              90.5   
##  Residual degrees of freedom:           254.5  
##  GCV estimate for tau:                  6.218  
##  MLE for tau:                           5.922  
##  MLE for sigma:                         261100 
##  lambda                                 0.00013
##  User supplied sigma                    NA     
##  User supplied tau^2                    NA     
## Summary of estimates: 
##                  lambda      trA      GCV   tauHat -lnLike Prof converge
## GCV        0.0001342780 90.52659 52.41506 6.217842     1198.639        3
## GCV.model            NA       NA       NA       NA           NA       NA
## GCV.one    0.0001342780 90.52659 52.41506 6.217842           NA        3
## RMSE                 NA       NA       NA       NA           NA       NA
## pure error           NA       NA       NA       NA           NA       NA
## REML       0.0008693386 45.70955 54.26053 6.860865     1188.519        7
# Predict TPS model over ozoneGrid
ozone_TPSpreds <- c(predict(object=ozone_TPSmodel, x = gridCoords))
# store in a data.frame with the x,y coordinates
ozone_TPS <- data.frame(x = gridCoords[,1],
                         y = gridCoords[,2],
                         ozone = ozone_TPSpreds)
# convert to Raster with SpatStat
ozone_TPSrast <- rast(ozone_TPS,
                      crs = crs(ozonePoints_sf))
ozone_TPSrast
## class       : SpatRaster 
## dimensions  : 253, 217, 1  (nrow, ncol, nlyr)
## resolution  : 4131.043, 4131.043  (x, y)
## extent      : -363987.7, 532448.7, -597814.2, 447339.8  (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310) 
## source(s)   : memory
## name        :    ozone 
## min value   : 13.00884 
## max value   : 63.56690

TPS

ggplot() +
  geom_spatraster(data = ozone_TPSrast, aes(fill = ozone), alpha = 0.9) +
  scale_fill_gradientn(colors = pal1,
                       name = "Ozone (ppb)",
                       na.value = "transparent") +
  labs(title = "TPS-Modeled Ozone Concentrations",
       x = "Longitude", y = "Latitude") +
  theme_minimal()

TPS + Real Points

ggplot() +
  geom_spatraster(data = ozone_TPSrast, aes(fill = ozone), alpha = 0.9) +
  scale_fill_gradientn(colors = pal1,
                       name = "Ozone (ppb)",
                       na.value = "transparent") +
  labs(title = "TPS-Modeled Ozone Concentrations",
       x = "Longitude", y = "Latitude") +
  theme_minimal() +
  geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
          shape = 21, alpha = 0.8) 

Figure 1.3a: Ozone concentrations modeled with Thin-Plate Splines (TPS). Cross-validation mean \(R^2\) = 0.52, RMSE = 7.18. Points (in second figure) show empirical measurements from 345 stations.

The TPS model does not predict the observed values at known locations as accurately as the IDW model did (\(R^2\) = 0.751, RMSE = 5.347) but does produce a lower average RMSE (7.18) and higher average \(R^2\) (0.52) in cross-validation tests than IDW. The TPS model is a smoother surface than the IDW, spreading out the craters around more extreme points. This may or may not be a more accurate prediction, depending on the nature of ozone production in those locations. However, the second figure highlights how the TPS model underpredicts high ozone concentrations (see scale differences).

1.4 Probabilistic: Kriging

Kriging is a probabilistic method of interpolation: it uses a variogram model based on experimental semivariance values to predict values of \(\hat Z\) at unknown locations.

# using autofitVariogram() to produce experimental and fitted variogram
## tests Gaussian, Spherical, and Exponential fits and produces the best
ozone_KrigeVar <- autofitVariogram(ozonePoints_sf$ozone~1,
                                input_data = ozonePoints_sf, 
                                model = c("Gau", "Sph", "Exp"))
plot(ozone_KrigeVar)

Figure 1.4a: Variogram of ozone concentrations.

The best-fit variogram uses a spherical model with nugget = 24, sill = 102, and range = 100611.

# Cross Validation of Krige variogram
# observational variogram
ozone_var <- variogram(ozone~1, ozonePoints_sf)
# Variogram model
ozone_Krigemod <- vgm(psill = 103, model = "Sph", range = 100611, nugget = 24)
# fit variogram model
ozone_fit <- fit.variogram(object = ozone_var, model = ozone_Krigemod)
# observational variogram
ozone_var <- variogram(ozone~1, ozonePoints_sf)
# Variogram model
ozone_Krigemod <- vgm(psill = 103, model = "Sph", range = 100611, nugget = 24)
# fit variogram model
ozone_fit <- fit.variogram(object = ozone_var, model = ozone_Krigemod)

ozone_Gstat <- gstat(formula = ozone~1, locations = ozonePoints_sf, 
                   model = ozone_fit)
ozoneKrige_sf <- predict(ozone_Gstat, newdata = ozoneGrids_sf)
## [using ordinary kriging]
ozoneKrige_sf
## Simple feature collection with 22589 features and 2 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -361922.2 ymin: -595748.7 xmax: 530383.2 ymax: 445274.2
## Projected CRS: NAD83 / California Albers
## First 10 features:
##    var1.pred var1.var                   geometry
## 1   23.27353 67.72911   POINT (-341267 445274.2)
## 2   22.87074 64.78668 POINT (-337135.9 445274.2)
## 3   22.61495 63.01437 POINT (-333004.9 445274.2)
## 4   22.55167 62.87221 POINT (-328873.9 445274.2)
## 5   22.69510 64.39531 POINT (-324742.8 445274.2)
## 6   23.01619 67.16777 POINT (-320611.8 445274.2)
## 7   23.46274 70.65231 POINT (-316480.7 445274.2)
## 8   23.96500 74.42233 POINT (-312349.7 445274.2)
## 9   24.48401 78.19729 POINT (-308218.6 445274.2)
## 10  24.99937 81.80175 POINT (-304087.6 445274.2)
ozoneKrige_rast <- sf_2_rast(ozoneKrige_sf)
ozoneKrige_rast
## class       : SpatRaster 
## dimensions  : 253, 217, 1  (nrow, ncol, nlyr)
## resolution  : 4131.043, 4131.043  (x, y)
## extent      : -363987.7, 532448.7, -597814.2, 447339.8  (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310) 
## source(s)   : memory
## name        : var1.pred 
## min value   :  13.81176 
## max value   :  67.96435

Kriging

ggplot() +
  geom_spatraster(data = ozoneKrige_rast, aes(fill = var1.pred), 
                    alpha = 0.9) +
  labs(title = "Kriging-Modeled Ozone Concentrations",
       x = "Longitude", y = "Latitude") +
  scale_fill_gradientn(colors = (pal1),
                       name = "Ozone (ppb)",
                       na.value = "transparent") +
  theme_minimal()

Kriging + Real Points

ggplot() +
  geom_spatraster(data = ozoneKrige_rast, aes(fill = var1.pred), 
                    alpha = 0.9) +
  labs(title = "Kriging-Modeled Ozone Concentrations",
       x = "Longitude", y = "Latitude") +
  scale_fill_gradientn(colors = (pal1),
                       name = "Ozone (ppb)",
                       na.value = "transparent") +
  theme_minimal() +
  geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
          shape = 21, alpha = 0.8) 

Figure 1.4b: Ozone Concentrations modeled with ordinary kriging. K-fold cross-validation mean \(R^2\) = 0.52, average RMSE = 7.20, with k = 10. Second figure shows empirical measurements from 345 stations.

The kriging surface blends the better features of both the TPS model and the IDW model. Changes across smaller distances that were craters in the IDW model are smoother but still present after kriging, while not being overly smooth like the TPS model. Looking back at the original map (which includes elevation), there appears to be some coincidence of high ozone levels and higher elevation.

The fit of the kriged surface and of the TPS model are strikingly similar after cross validation, though kriging slightly more accurately predicts values at known locations (\(R^2\) = 0.82 vs 0.75 in TPS model).

1.5 Probabilistic: Regression Kriging

Regression kriging uses a variogram model to inform a linear model of a predictor variable and our variable of interest, incorporation spatial structure and improving upon the base model.

Variable <- names(ozonePoints_sf)
Units <- c("ppb", "m", "radians", rep("unitless", 5))
Description <- c("Average Daily Ozone 1980-1999",
                 "Elevation",
                 "Slope",
                 "Solar radiation",
                 "Night lights density",
                 "Traffic density",
                 "Tree density",
                 "Hillshade")
Source <- c("California Air Resources Board",
            "USGS National Elevation Dataset",
            rep("Derived from Elevation", 2),
            "NASA",
            "California Office of Env Health",
            "NASA",
            "Derived from Elevation")


ozone_vars <- data.frame(Variable = Variable[1:8],
                         Units = Units, Description = Description,
                         Source = Source)
kable(ozone_vars, "simple")
Variable Units Description Source
ozone ppb Average Daily Ozone 1980-1999 California Air Resources Board
elev m Elevation USGS National Elevation Dataset
slope radians Slope Derived from Elevation
rad unitless Solar radiation Derived from Elevation
lights unitless Night lights density NASA
traffic unitless Traffic density California Office of Env Health
tree unitless Tree density NASA
hill unitless Hillshade Derived from Elevation

Of the predictor variables available, three stand out as most likely to influence ozone concentration: traffic, elevation, and solar radiation. Traffic and urban areas produce more \(NO_x\) pollutants that react with sunlight to produce ozone (\(O_3\)). Elevation increases exposure to solar radiation, and is in fact how the radiation metric was derived. I set a base model with elevation, radiation, and traffic density as predictors, then assessed their influence with ANOVA before proceeding to Regression Kriging.

# linear model (OLS) with no spatial consideration (yet)
ozone_lm <- lm(ozone ~ elev + rad + traffic, data = ozonePoints_sf)
summary(ozone_lm)
## 
## Call:
## lm(formula = ozone ~ elev + rad + traffic, data = ozonePoints_sf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.167  -5.025  -0.479   4.090  44.017 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.3695930  1.3268034  22.136  < 2e-16 ***
## elev         0.0086736  0.0009388   9.239  < 2e-16 ***
## rad          0.2815676  1.4886421   0.189  0.85009    
## traffic     -6.3211861  2.1489999  -2.941  0.00349 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.103 on 341 degrees of freedom
## Multiple R-squared:  0.2715, Adjusted R-squared:  0.265 
## F-statistic: 42.35 on 3 and 341 DF,  p-value: < 2.2e-16
anova(ozone_lm)
## Analysis of Variance Table
## 
## Response: ozone
##            Df  Sum Sq Mean Sq  F value  Pr(>F)    
## elev        1  9810.8  9810.8 118.3928 < 2e-16 ***
## rad         1     0.7     0.7   0.0090 0.92456    
## traffic     1   717.0   717.0   8.6522 0.00349 ** 
## Residuals 341 28257.5    82.9                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kable(anova(ozone_lm))
Df Sum Sq Mean Sq F value Pr(>F)
elev 1 9.810804e+03 9810.8040379 118.3927593 0.0000000
rad 1 7.441989e-01 0.7441989 0.0089807 0.9245560
traffic 1 7.169747e+02 716.9746843 8.6521564 0.0034901
Residuals 341 2.825751e+04 82.8665883 NA NA

Elevation is a good predictor of ozone concentration (\(F_{1,341}\) = 118.4, p < 0.001), but solar radiation is, in fact, a poor predictor (\(F_{1,341}\) = 0.009, p = 0.92). Traffic density, however, is still a signficant predictor, even with an alpha corrected for multiple comparisons (\(F_{1,341}\) = 8.65, p < 0.0035).

I reduced the model to include only signficant predictors: elevation and traffic density.

# linear model (OLS) with no spatial consideration (yet)
ozone_lm <- lm(ozone ~ elev + traffic, data = ozonePoints_sf)
summary(ozone_lm)
## 
## Call:
## lm(formula = ozone ~ elev + traffic, data = ozonePoints_sf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.151  -5.094  -0.481   4.149  43.887 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.5401303  0.9720104  30.391  < 2e-16 ***
## elev         0.0086557  0.0009327   9.280  < 2e-16 ***
## traffic     -6.2820144  2.1359800  -2.941  0.00349 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.09 on 342 degrees of freedom
## Multiple R-squared:  0.2714, Adjusted R-squared:  0.2671 
## F-statistic: 63.69 on 2 and 342 DF,  p-value: < 2.2e-16
anova(ozone_lm)
## Analysis of Variance Table
## 
## Response: ozone
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## elev        1  9810.8  9810.8 118.7275 < 2.2e-16 ***
## traffic     1   714.8   714.8   8.6497  0.003494 ** 
## Residuals 342 28260.5    82.6                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Per the variogram in section 1.4 (reproduced below), there is spatial autocorrelation in ozone concentrations. Furthermore, there is spatial autocorrelation in the residuals of the regression model, so this model could be improved by including spatial structure with kriging.

ozone_KrigeVar <- autofitVariogram(ozonePoints_sf$ozone~1,
                                input_data = ozonePoints_sf, 
                                model = c("Gau", "Sph", "Exp"))
plot(ozone_KrigeVar)

# observational variogram
ozone_var <- variogram(ozone~1, ozonePoints_sf)

# Set up gstat object with regression model
ozone_KRGstat <- gstat(id = "ozone_KRmodel",
                       formula = ozone~elev + traffic,
                       data = ozonePoints_sf)

# variogram of model residuals
ozone_KRvar <- variogram(ozone_KRGstat)
plot(ozone_KRvar, pch=20, cex=1.5, col="black",
     ylab=expression("Semivariance ("*gamma*")"),
     xlab="Distance (m)", main = "Model Residuals")

Variogram Models

Spherical

# fit variogram model
ozone_KRfit_Sph <- fit.variogram(ozone_KRvar, 
                                 vgm(psill = 70, 
                                     model = "Sph", 
                                     range = 100000, 
                                     nugget = 40))
ozone_KRfit_Sph
##   model    psill    range
## 1   Nug 31.34335     0.00
## 2   Sph 38.58030 84162.72
plot(ozone_KRvar, ozone_KRfit_Sph,
     pch=20, cex=1.5, col="black",
     ylab=expression("Semivariance ("*gamma*")"),
     xlab="Distance (m)", main = "Model Residuals")

Gaussian

# fit variogram model
ozone_KRfit_Gau <- fit.variogram(ozone_KRvar, 
                                 vgm(psill = 70, 
                                     model = "Gau", 
                                     range = 100000, 
                                     nugget = 40))
## Warning in fit.variogram(ozone_KRvar, vgm(psill = 70, model = "Gau", range =
## 1e+05, : No convergence after 200 iterations: try different initial values?
ozone_KRfit_Gau
##   model    psill    range
## 1   Nug 36.45778     0.00
## 2   Gau 33.78781 36529.32
plot(ozone_KRvar, ozone_KRfit_Gau,
     pch=20, cex=1.5, col="black",
     ylab=expression("Semivariance ("*gamma*")"),
     xlab="Distance (m)", main = "Model Residuals")

Exponential

# fit variogram model
ozone_KRfit_Exp <- fit.variogram(ozone_KRvar, 
                                 vgm(psill = 70, 
                                     model = "Exp", 
                                     range = 100000, 
                                     nugget = 40))
ozone_KRfit_Exp
##   model    psill    range
## 1   Nug 22.10499     0.00
## 2   Exp 49.08628 30608.03
plot(ozone_KRvar, ozone_KRfit_Exp,
     pch=20, cex=1.5, col="black",
     ylab=expression("Semivariance ("*gamma*")"),
     xlab="Distance (m)", main = "Model Residuals")

The best variogram model is the spherical model with nugget = 31.34, sill = 38.58, and range 84163, though all three are decent fits early on with a good deal of noise at greater distances.

# Update the gstat object with the variogram
ozone_KRGstat_up <- gstat(ozone_KRGstat, id = "ozone_KRmodel",
                          model = ozone_KRfit_Sph)
# predict across grid
ozone_KR_sf <- predict(ozone_KRGstat_up, newdata = ozoneGrids_sf)
## [using universal kriging]
ozone_KR_sf
## Simple feature collection with 22589 features and 2 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -361922.2 ymin: -595748.7 xmax: 530383.2 ymax: 445274.2
## Projected CRS: NAD83 / California Albers
## First 10 features:
##    ozone_KRmodel.pred ozone_KRmodel.var                   geometry
## 1            25.07461          56.80418   POINT (-341267 445274.2)
## 2            25.32704          55.24772 POINT (-337135.9 445274.2)
## 3            25.67620          54.32472 POINT (-333004.9 445274.2)
## 4            25.44801          54.24509 POINT (-328873.9 445274.2)
## 5            25.68887          55.03646 POINT (-324742.8 445274.2)
## 6            27.66337          56.59941 POINT (-320611.8 445274.2)
## 7            28.06587          58.34317 POINT (-316480.7 445274.2)
## 8            28.01785          60.13423 POINT (-312349.7 445274.2)
## 9            28.88968          61.97526 POINT (-308218.6 445274.2)
## 10           30.82628          63.86217 POINT (-304087.6 445274.2)
ozone_KR_rast <- sf_2_rast(ozone_KR_sf)
ozone_KR_rast
## class       : SpatRaster 
## dimensions  : 253, 217, 1  (nrow, ncol, nlyr)
## resolution  : 4131.043, 4131.043  (x, y)
## extent      : -363987.7, 532448.7, -597814.2, 447339.8  (xmin, xmax, ymin, ymax)
## coord. ref. : NAD83 / California Albers (EPSG:3310) 
## source(s)   : memory
## name        : ozone_KRmodel.pred 
## min value   :           10.80522 
## max value   :           64.24551

RK Plot

Regression Kriging

ggplot() +
  geom_spatraster(data = ozone_KR_rast, aes(fill = ozone_KRmodel.pred), 
                    alpha = 0.9) +
  labs(title = "Regression Kriging-Modeled Ozone Concentrations",
       x = "Longitude", y = "Latitude") +
  scale_fill_gradientn(colors = (pal1),
                       name = "Ozone (ppb)",
                       na.value = "transparent") +
  theme_minimal()

Regression Kriging + Real Points

ggplot() +
  geom_spatraster(data = ozone_KR_rast, aes(fill = ozone_KRmodel.pred), 
                    alpha = 0.9) +
  labs(title = "Regression Kriging-Modeled Ozone Concentrations",
       x = "Longitude", y = "Latitude") +
  scale_fill_gradientn(colors = (pal1),
                       name = "Ozone (ppb)",
                       na.value = "transparent") +
  theme_minimal() +
  geom_sf(data = ozonePoints_sf, aes(fill = ozone), color = "white",
          shape = 21, alpha = 0.8) 

Figure 1.5a: Ozone concentrations modeled with regression kriging with \(Ozone \sim Elevation + Traffic\).

The prediction surface after regression kriging with elevation and traffic density as predictor variables has far greater detail in its contours than the previous interpolation methods. High values are not diminished but are interspersed with veins of moderate values, but without the crater-like spots in IDW or the oversmoothing of TPS.

1.6 Assess Skill

Model Skill on Known Points

IDW

obs <- ozonePoints_df[,1]
preds <- extract(idw_ozone_rast, ozonePoints_df[,2:3]) |>
      pull(var1.pred)

IDW_rsq_known <- cor(obs,preds)^2
IDW_rsq_known
## [1] 0.9497181
IDW_rmse_known <- sqrt(mean((preds - obs)^2))
IDW_rmse_known
## [1] 2.379907
ggplot() +
  geom_abline(slope=1,intercept = 0) +
  geom_point(aes(x = obs, y = preds)) + 
  coord_fixed(ratio=1, xlim = range(preds, obs),
              ylim = range(preds, obs)) +
  labs(x="Observed Values",
       y="Predicted Values",
       title="Ozone (ppm) IDW Fit")

Figure 1.6a: Observed vs predicted values using IDW interpolation with p = 2.5. The final model fits the observed measurements well, with a high \(R^2\) of 0.92 and low RMSE of 3.44. Multiple cross-validation iterations of the IDW model were performed when optimizing p at 2.5.

TPS

obs <- ozonePoints_df[,1]
preds <- extract(ozone_TPSrast, ozonePoints_df[,2:3]) |>
      pull(ozone)
TPS_rsq_known <- cor(obs,preds)^2
TPS_rsq_known
## [1] 0.751105
TPS_rmse_known <- sqrt(mean((preds - obs)^2))
TPS_rmse_known
## [1] 5.347147
ggplot() +
  geom_abline(slope=1,intercept = 0) +
  geom_point(aes(x = obs, y = preds)) + 
  coord_fixed(ratio=1, xlim = range(preds, obs),
              ylim = range(preds, obs)) +
  labs(x="Observed Values",
       y="Predicted Values",
       title="Ozone (ppm) TPS Fit")

Figure 1.6b: Observed vs predicted values from TPS iterpolation model. \(R^2\) = 0.751, RMSE = 5.347.

Kriging

obs <- ozonePoints_df[,1]
preds <- extract(ozoneKrige_rast, ozonePoints_df[,2:3]) |>
      pull(var1.pred)

Kriging_rsq_known <- cor(obs,preds)^2
Kriging_rsq_known
## [1] 0.8295548
Kriging_rmse_known <- sqrt(mean((preds - obs)^2))
Kriging_rmse_known
## [1] 4.59071
ggplot() +
  geom_abline(slope=1,intercept = 0) +
  geom_point(aes(x = obs, y = preds)) + 
  coord_fixed(ratio=1, xlim = range(preds, obs),
              ylim = range(preds, obs)) +
  labs(x="Observed Values",
       y="Predicted Values",
       title="Ozone (ppm) Kriging Fit")

Figure 1.6c: Observed vs Predicted values using ordinary Kriging model. Predicted values are closer fit to observed measurements than the TPS model (\(R^2\) = 0.83, RMSE = 4.59).

Regression Kriging

obs <- ozonePoints_df[,1]
preds <- extract(ozone_KR_rast, ozonePoints_df[,2:3]) |>
      pull(ozone_KRmodel.pred)

RK_rsq_known <- cor(obs,preds)^2
RK_rsq_known
## [1] 0.7876989
RK_rmse_known <- sqrt(mean((preds - obs)^2))
RK_rmse_known
## [1] 5.045092
ggplot() +
  geom_abline(slope=1,intercept = 0) +
  geom_point(aes(x = obs, y = preds)) + 
  coord_fixed(ratio=1, xlim = range(preds, obs),
              ylim = range(preds, obs)) +
  labs(x="Observed Values",
       y="Predicted Values",
       title="Ozone (ppm) Kriging Fit")

Figure 1.6d: Observed vs. Predicted values at known points. \(R^2\) = 0.76, RMSE = 5.40.

Cross Validation

IDW

# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_sf)
# setup R^2 and RMSE vectors for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)

set.seed(7)
for(i in 1:m){
  # select rows for testData subset
  rows4test <- sample(x = 1:n,size = n*0.1)
  # create testData subset
  testData <- ozonePoints_sf[rows4test,]
  # create trainData subset
  trainData <- ozonePoints_sf[-rows4test,]
  
    # IDW model with trainData subset
    idw_model <- gstat(formula=ozone~1, 
                       locations = trainData,
                       set=list(idp = 2.5))
    # test IDW model compared to observed values in testData
    obs <- testData$ozone
    # pull predictions from IDW model
    preds <- predict(idw_model, newdata = testData, debug.level = 0) |>
      pull(var1.pred)
    # pull R^2 and RMSE
    rsq[i] <- cor(obs, preds)^2
    rmse[i] <- sqrt(mean((preds - obs)^2))
  
}

IDW_rsq <- mean(rsq)
IDW_rmse <- mean(rmse)

IDW_rsq
## [1] 0.4419662
IDW_rmse
## [1] 8.121792

TPS

# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_df)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)

set.seed(42)
for(i in 1:m){
  # select rows for testData subset
  rows4test <- sample(x = 1:n,size = n*0.1)
  # create testData subset
  testData <- ozonePoints_df[rows4test,]
  # create trainData subset
  trainData <- ozonePoints_df[-rows4test,]
  
  # TPS model with trainData subset
  tps_model <- Tps(x = trainData[,2:3], Y = trainData[,1])
  
  # Predict TPS train model over ozoneGrid
  tps_preds <- c(predict(object=tps_model, x = gridCoords))
  
  # store in a data.frame with the x,y coordinates
  tps_df <- data.frame(x = gridCoords[,1],
                         y = gridCoords[,2],
                         ozone = tps_preds)
  # convert to Raster with SpatStat
  tps_rast <- rast(tps_df,
                      crs = crs(ozonePoints_sf))
  
  # pull observed from testData
  obs <- testData$ozone
  # pull predictions from TPS model
  preds <- extract(tps_rast, testData[,2:3]) |>
      pull(ozone)

    # pull R^2 and RMSE
    rsq[i] <- cor(obs, preds)^2
    rmse[i] <- sqrt(mean((preds - obs)^2))
  
}

TPS_rsq <- mean(rsq)
TPS_rmse <- mean(rmse)

TPS_rsq
## [1] 0.5193196
TPS_rmse
## [1] 7.181533

Kriging

# k-fold cross validation
set.seed(42)
n <- 50
ozone_kfold_sf <- krige.cv(formula = ozone ~ 1,
                               locations = ozonePoints_sf,
                               model = ozone_fit,
                               nfold = n,
                               verbose = FALSE)

ozone_kfold <- data.frame(observed = ozone_kfold_sf$observed,
                          var1.pred = ozone_kfold_sf$var1.pred,
                          fold = ozone_kfold_sf$fold)

rsq <- 1:n
rmse <- 1:n
fold <- 1:n

mod_fit <- data.frame(rsq, rmse, fold)

for (i in 1:n) {
  ozone_kfold_sub <- subset(ozone_kfold, fold == i)
  
  mod_fit$rsq[i] <- cor(ozone_kfold_sub$observed, ozone_kfold_sub$var1.pred)^2
  
  mod_fit$rmse[i] <- sqrt(mean((ozone_kfold_sub$observed - ozone_kfold_sub$var1.pred)^2))
}

Kriging_rsq <- mean(mod_fit$rsq)
Kriging_rmse <- mean(mod_fit$rmse)

Kriging_rsq
## [1] 0.5697979
Kriging_rmse
## [1] 6.70588

Regression Kriging

# test iterations m
m <- 50
# number of points n
n <- nrow(ozonePoints_df)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)

set.seed(42)

for (i in 1:m) {
   # select rows for testData subset
  rows4test <- sample(x = 1:n, size = n*0.1)
  # create testData subset
  testData <- ozonePoints_sf[rows4test,]
  # create trainData subset
  trainData <- ozonePoints_sf[-rows4test,]

  # Set up gstat object with regression model
  ozone_RKGstat <- gstat(id = "ozone",
                       formula = ozone~elev + traffic,
                       data = trainData)

  # variogram of model residuals
  ozone_RKvar <- variogram(ozone_RKGstat)
  
  # fit variogram model
  ozone_RKfit_Sph <- fit.variogram(ozone_RKvar, 
                                 vgm(psill = 70, 
                                     model = "Sph", 
                                     range = 100000, 
                                     nugget = 40))
  
  # Update the gstat object with the variogram
  ozone_RKGstat_up <- gstat(ozone_RKGstat, id = "ozone",
                          model = ozone_RKfit_Sph)
  # predict across grid
  ozone_RK_sf <- predict(ozone_RKGstat_up, newdata = testData)
  ozone_RK_sf

  # pull observed from testData
  obs <- testData$ozone
  # pull predictions from RK model
  preds <- ozone_RK_sf$ozone.pred
preds
    # pull R^2 and RMSE
    rsq[i] <- cor(obs,preds)^2
    rmse[i] <- sqrt(mean((preds - obs)^2))
}
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
## [using universal kriging]
RK_rsq <- mean(rsq)
RK_rmse <- mean(rmse)

RK_rsq
## [1] 0.5362585
RK_rmse
## [1] 6.895541

Skill Assessment

Method <- c("IDW", "TPS", "Kriging", "RK")
Rsq_cv <- c(IDW_rsq, TPS_rsq, Kriging_rsq, RK_rsq)
rmse_cv <- c(IDW_rmse, TPS_rmse, Kriging_rmse, RK_rmse)
Rsq_known <- c(IDW_rsq_known, TPS_rsq_known, Kriging_rsq_known, RK_rsq_known)
rmse_known <- c(IDW_rmse_known, TPS_rmse_known, Kriging_rmse_known, RK_rmse_known)
skill_df <- data.frame(Method = Method,
                       Rsq_cv = Rsq_cv,
                       rmse_cv = rmse_cv,
                       Rsq_known = Rsq_known,
                       rmse_known = rmse_known)

kable(skill_df)
Method Rsq_cv rmse_cv Rsq_known rmse_known
IDW 0.4419662 8.121792 0.9497181 2.379907
TPS 0.5193196 7.181533 0.7511050 5.347147
Kriging 0.5697979 6.705880 0.8295548 4.590711
RK 0.5362585 6.895541 0.7876989 5.045092

Table 1.6: \(R^2\) and RMSE for the four models: IDW, TPS, Kriging, and Regression Kriging. The “cv” values show the cross-validated \(R^2\) and RMSE goodness of fit tests. The “known” values are calculated for the final model across all 345 known locations to assess how well the model predicts locations with empirical measurements.

\(R^2\) - or proportion of variance explained by the model - is least with IDW and greatest (0.57) with Kriging. However, RK has the lowest RMSE (6.90), but a lower \(R^2\) (0.54). Best model is ultimately subjective, and some more knowledge on how surface ozone concentrations are influenced by elevation would, I think, be critical in determining whether the ordinary Kriging or Regression Kriging model is more appropriate. Their skill is similar, but the RK surface is visibly influenced by elevation.

2. Regression

These data contain species composition measurements for 533 100m^2 plots across the Raif section of the Volga-Kama Nature Reserve in Tatarstan, Russia. Here, I explored how distinguishable these plots are based on temperature and moisture.

boreal <- read.csv("data/boreal.csv")
# head(boreal)
pal2 <- pnw_palette("Lake", n = nrow(boreal))
pal3 <- pnw_palette("Sunset2", n = nrow(boreal))
pal4 <- pnw_palette("Shuksan2", n = nrow(boreal))
pal4 <- rev(pal4)

Boreal Index

\[BI = \sqrt{\frac{1000 (S+1)}{n}} \] where \(S\) is the number of boreal species and \(n\) is the total number of species found in each plot.

boreal$BI <- sqrt((1000*(boreal$nBor + 1)) / boreal$nTot)
# head(boreal)
boreal_sf <- st_as_sf(boreal, coords = c("easting", "northing"), crs = 32639)

Variable Plots

Boreal Index

ggplot() +
  geom_sf(data = boreal_sf, aes(col = BI), alpha = 0.8) +
  guides(size = "none") +
  scale_color_gradientn(colors = pal2) +
  theme_minimal()

On a cursory glance, boreal index does appear to be spatially autocorrelated, with high values occurring closer to other high values, and vice versa.

Temperature

ggplot() +
  geom_sf(data = boreal_sf, aes(col = tmp), alpha = 0.8) +
  guides(size = "none") +
  scale_color_gradientn(colors = pal3) +
  theme_minimal()

Wetness

ggplot() +
  geom_sf(data = boreal_sf, aes(col = wet), alpha = 0.8) +
  guides(size = "none") +
  scale_color_gradientn(colors = pal4) +
  theme_minimal()

2.1 Naive Model

\[ y = \beta_0 + \beta_1x_1 + \beta_2x_2 \] where \(y = BI\), \(x_1 = Temp\), and \(x_2 = Wet\) such that

\[ BI \sim Temp + Wet \]

# fit model assuming no autocorrelation in residuals with gls()
glsNaive <- gls(BI ~ tmp + wet, boreal)
summary(glsNaive)
## Generalized least squares fit by REML
##   Model: BI ~ tmp + wet 
##   Data: boreal 
##        AIC      BIC    logLik
##   2830.709 2847.801 -1411.355
## 
## Coefficients:
##                  Value Std.Error   t-value p-value
## (Intercept) -212.81280  55.10855 -3.861702   1e-04
## tmp            0.77901   0.18560  4.197282   0e+00
## wet          152.22303  10.92708 13.930810   0e+00
## 
##  Correlation: 
##     (Intr) tmp   
## tmp -1.000       
## wet  0.302 -0.296
## 
## Standardized residuals:
##         Min          Q1         Med          Q3         Max 
## -2.91263968 -0.68370547 -0.08842685  0.55090391  5.23496551 
## 
## Residual standard error: 3.437209 
## Degrees of freedom: 533 total; 530 residual
# test iterations m
m <- 10
# number of points n
n <- nrow(boreal)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)

set.seed(42)

for (i in 1:m) {
   # select rows for testData subset
  rows4test <- sample(x = 1:n, size = n*0.1)
  # create testData subset
  testData <- boreal[rows4test,]
  # create trainData subset
  trainData <- boreal[-rows4test,]
  
  glsNaive_t <- gls(BI ~ tmp + wet, trainData)
  
  preds <- predict(glsNaive_t, newdata = testData)
  
  # pull observed from testData
  obs <- testData$BI

    # pull R^2 and RMSE
    rsq[i] <- cor(obs,preds)^2
    rmse[i] <- sqrt(mean((preds - obs)^2))
}

naive_rsq <- mean(rsq)
naive_rmse <- mean(rmse)

naive_rsq
## [1] 0.3321315
naive_rmse
## [1] 3.73864

The initial naive model (using Ordinary Least Squares) produces a slope for both temperature and wetness that are signficantly different from 0 (Temperature: t = 4.20, p < 0.001; Wetness: t = 13.9, p < 0.001). Cross validation shows however that the model is not a great fit (average \(R^2\) = 0.344, average RMSE = 3.43).

Naive Model: \[y = -212.813 + 0.779x_1 + 152.223x_2 \], where \(x_1\) = Temp and \(x_2\) = Wet.

boreal$glsNaiveResids <- residuals(glsNaive, type = "normalized")
boreal_sf <- st_as_sf(boreal, coords = c("easting", "northing"), crs = 32639)
# boreal_sf
ggplot() +
  geom_sf(data = boreal_sf, 
          aes(col = glsNaiveResids, size = glsNaiveResids),
          alpha = 0.8) +
  guides(size = "none") +
  scale_color_gradientn(colors = pal2) +
  theme_minimal()

Residuals of the naive model also appear to have some spatial structure.

# test for spatial autocorrelation using a correlogram
residsI <- spline.correlog(x = boreal$easting, y = boreal$northing,
                           z = boreal$glsNaiveResids, 
                           resamp = 50, quiet = TRUE)

# save coordinate points as matrix for dist()
points <- cbind(boreal$easting, boreal$northing)

# plot Moran's I for residuals
plot(residsI, xlim = c(0, max(dist(points))/3))

Moran’s I of the residuals shows that there is spatial autocorrelation in the residuals of the naive model, particularly at distances below ~500m.

2.2 Better Model

# using autofitVariogram() to produce experimental and fitted variogram
## tests Gaussian, Spherical, and Exponential fits and produces the best
boreal_fit <- autofitVariogram(glsNaiveResids~1, input_data = boreal_sf, 
                      model = c("Gau", "Sph", "Exp"))
plot(boreal_fit)

The best variogram for the residuals is an exponential model with nugget = 0.41, sill = 1, and range = 105. This will be used as a correlation structure to find the variance-covariance matrix for the updated model.

csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")
glsUpdated <- update(glsNaive, correlation = csSpatial)
summary(glsUpdated)
## Generalized least squares fit by REML
##   Model: BI ~ tmp + wet 
##   Data: boreal 
##       AIC      BIC   logLik
##   2733.72 2759.357 -1360.86
## 
## Correlation Structure: Exponential spatial correlation
##  Formula: ~easting + northing 
##  Parameter estimate(s):
##       range      nugget 
## 460.3758216   0.5045189 
## 
## Coefficients:
##                 Value Std.Error   t-value p-value
## (Intercept) -77.23520  77.15219 -1.001076  0.3172
## tmp           0.31123   0.26012  1.196514  0.2320
## wet          75.75531  13.51721  5.604361  0.0000
## 
##  Correlation: 
##     (Intr) tmp   
## tmp -1.000       
## wet  0.066 -0.060
## 
## Standardized residuals:
##        Min         Q1        Med         Q3        Max 
## -2.3203431 -0.5317626  0.1216847  0.7397759  5.1417469 
## 
## Residual standard error: 3.654787 
## Degrees of freedom: 533 total; 530 residual
# test iterations m
m <- 10
# number of points n
n <- nrow(boreal)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)

csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")

set.seed(42)

for (i in 1:m) {
   # select rows for testData subset
  rows4test <- sample(x = 1:n, size = n*0.1)
  # create testData subset
  testData <- boreal[rows4test,]
  # create trainData subset
  trainData <- boreal[-rows4test,]
  
  glsNaive_t <- gls(BI ~ tmp + wet, trainData)
  
  glsUpdated_t <- update(glsNaive_t, correlation = csSpatial)
  
  preds <- predict(glsUpdated_t, newdata = testData)
  
  # pull observed from testData
  obs <- testData$BI

    # pull R^2 and RMSE
    rsq[i] <- cor(obs,preds)^2
    rmse[i] <- sqrt(mean((preds - obs)^2))
}

updated_rsq <- mean(rsq)
updated_rmse <- mean(rmse)

updated_rsq
## [1] 0.3331307
updated_rmse
## [1] 3.999107

When using Generalized Least Squares (GLS) to account for spatial structure in Boreal Index, the predictor variables have different significance than in the naive model. Primarily, temperature no longer has a significant slope and is not a good predictor of Boreal Index when spatial autocorrelation is considered (t = 1.20, p = 0.232). Wetness remains a good predictor (t = 5.60, p < 0.001). After cross-validation, the model explains roughly the same amount of variation in BI: mean \(R^2\) = 0.333, average RMSE = 4.0.

Better Model: \[y = -77.24 + 0.311 x_1 + 1.197 x_2 \] where \(x_1\) = Temp and \(x_2\) = Wet.

2.3 Synthesis.

Improving the model with spatial structure through Generalized Least Squares regression draws questions about the importance of temperature as an effective predictor variable for Boreal Index across these plots. In the naive model, temperature is a significant predictor, but it does not remain significant when the model is updated with spatial structure.

Given that our dependent variable, Boreal Index, is a measure of how many boreal-exclusive trees are present vs their more southerly counterparts, it makes sense ecologically that BI in this forest would be more dependent on moisture levels than temperature. Temperature is a more variable environmental factor than wetness typically, whereas moisture levels can be retained for longer periods of time. Moisture would therefore be more indicative of long-term conditions that might limit growth of certain types of trees. Since the Raif region consists of a rare cross-over of boreal and southern species, presence/absence of certain species within each plot is most likely due to those factors that affect habitat suitability the most. I would anticipate that moisture (especially in a colder climate like Russia) will impact habitat suitability more than temperature, which can fluctuate a great deal daily and seasonally.

Overall, however, the updated model explains only a tiny bit more of variation in BI than the naive model and has a higher RMSE after cross-validation. We can also see from plotting the observed BI values to the predicted that the updated model may actually be doing a worse job of predicting our known values.

preds <- predict(glsNaive, newdata = boreal)
  obs <- boreal$BI

    # pull R^2 and RMSE
    naive_rsq <- cor(obs,preds)^2
    naive_rmse <- sqrt(mean((preds - obs)^2))

    
ggplot() +
  geom_abline(slope=1,intercept = 0) +
  geom_point(aes(x = obs, y = preds)) + 
  coord_fixed(ratio=1, xlim = range(preds, obs),
              ylim = range(preds, obs)) +
  labs(x="Observed Values",
       y="Predicted Values",
       title="BI Naive Model Fit")

preds <- predict(glsUpdated, newdata = boreal)
  obs <- boreal$BI

    # pull R^2 and RMSE
    upd_rsq <- cor(obs,preds)^2
    upd_rmse <- sqrt(mean((preds - obs)^2))

    
    
ggplot() +
  geom_abline(slope=1,intercept = 0) +
  geom_point(aes(x = obs, y = preds)) + 
  coord_fixed(ratio=1, xlim = range(preds, obs),
              ylim = range(preds, obs)) +
  labs(x="Observed Values",
       y="Predicted Values",
       title="BI Updated Model Fit")

This all led me to try one last, “best” model excluding temperature entirely.

glswet_Naive <- gls(BI ~ wet, boreal)

boreal$glswet_NaiveResids <- residuals(glswet_Naive, type = "normalized")
boreal_sf <- st_as_sf(boreal, coords = c("easting", "northing"), crs = 32639)
boreal_wetfit <- autofitVariogram(glswet_NaiveResids~1, 
                                  input_data = boreal_sf, 
                                  model = c("Gau", "Sph", "Exp"))
plot(boreal_wetfit)

csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")

glswet_Updated <- update(glswet_Naive, correlation = csSpatial)
  
  preds <- predict(glswet_Updated, newdata = boreal)
  
  obs <- boreal$BI
  
  ggplot() +
  geom_abline(slope=1,intercept = 0) +
  geom_point(aes(x = obs, y = preds)) + 
  coord_fixed(ratio=1, xlim = range(preds, obs),
              ylim = range(preds, obs)) +
  labs(x="Observed Values",
       y="Predicted Values",
       title="BI Updated Model (Wet Only) Fit")

# test iterations m
m <- 10
# number of points n
n <- nrow(boreal)
# setup R^2 and RMSE matrices for outputs
rsq <- matrix(0,ncol=1,nrow=m)
rmse <- matrix(0,ncol=1,nrow=m)

csSpatial <- corSpatial(form=~easting+northing, nugget = TRUE, type = "exponential")

set.seed(42)

for (i in 1:m) {
   # select rows for testData subset
  rows4test <- sample(x = 1:n, size = n*0.1)
  # create testData subset
  testData <- boreal[rows4test,]
  # create trainData subset
  trainData <- boreal[-rows4test,]
  
  glswet_Naive_t <- gls(BI ~ wet, boreal)
  
  glswet_Updated_t <- update(glswet_Naive_t, correlation = csSpatial)
  
  preds <- predict(glswet_Updated_t, newdata = testData)
  
  # pull observed from testData
  obs <- testData$BI

    # pull R^2 and RMSE
    rsq[i] <- cor(obs,preds)^2
    rmse[i] <- sqrt(mean((preds - obs)^2))
}

wet_rsq <- mean(rsq)
wet_rmse <- mean(rmse)

wet_rsq
## [1] 0.3230485
wet_rmse
## [1] 4.05143

As it turns out, temperature is still contributing to the model by explaining a small amount of variance in BI across plots. The GLS updated model with only wetness as a predictor variable has a lower cross-validated \(R^2\) (0.32) and higher RMSE (4.05) than the previous model.

In this case, including spatial structure is not actually contributing immensely to the model or altering interpretation. While it is an important consideration when autocorrelation in a variable is present, sometimes the simpler model (i.e., the naive linear regression model \(BI \sim Temp + Wet\)) provides sufficient predictions to make inference.