# for path management
library(here)
# for raster data
library(raster)
# for data manipulation
library(tidyverse)
# for spatial operations
library(sf)
# for basemaps
library(rnaturalearth)
# for assessing SAC
library(spdep)
# for bayesian linear model
library(brms)
# for spatial glmm
library(glmmfields)
# for model selection (projection predictive inference)
library(projpred)
# for spatial CV
library(spatialsample)
# for easy exploratory plots
library(bayesplot)
# for easy working with posteriors
library(tidybayes)
# for modeling workflow
library(tidymodels)
# for stitching together model output plots
library(patchwork)

source(here("R", "modeling_fns.R"))
Read in and process raw data
Read raw data
Code
# reading in the raw pairwise distances, just in case I need that info later
# I already created per-cell summaries, which I'm reading in below
raw_pi <- read_csv(here("output", "spreadsheets", "cell_medium_3_10_pi.csv"))

# only keeping relevant columns
raw_df <- read_csv(here("output", "spreadsheets", "cell_medium_3_10_sumstats.csv")) %>%
  select(cell, gdm = sqrt_pi, gde = hill_1, num_ind, num_otu, num_order)
Filter out bad cells
These cells contain wonky environmental data that distorts modeling. This is likely due to their coastal location, which results in faulty projection of the environmental models.
HOLDING OFF ON THIS FOR NOW
Code
filter_df <- raw_df
Add in spatial and environmental metadata
This includes converting the data frames to sf objects to make spatial operations and plotting easier, mapping cells to their continents, and extracting relevant environmental data.
Code
# setting the crs for maps and environmental data
# behrmann projection
crs_behr <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"

# for assigning cells to continents. Islands are missed at coarser resolutions
world_large <- rnaturalearth::ne_countries(scale = "large", returnclass = "sf") %>%
  select(continent, name_long) %>%
  st_transform(crs = crs_behr)

# for mapping. this will be smaller so plotting is faster
world_small <- rnaturalearth::ne_countries(scale = "small", returnclass = "sf") %>%
  select(continent, name_long) %>%
  st_transform(crs = crs_behr)

# all predictors
predictors_all <- c("current_medium_bio_13", "current_medium_bio_14", "current_medium_bio_15",
                    "current_medium_bio_2", "current_medium_bio_5", "ghh_medium_std_dev",
                    "human_medium_gHM", "GlobalExtreme_tsTrendExt", "GlobalExtreme_tsVarExt",
                    "GlobalExtreme_prTrendExt", "GlobalExtreme_prVarExt")

# Read in predictor rasters
rast_list <- list.files(here("data", "climate_agg"),
                        pattern = "medium",
                        full.names = TRUE)

rast_list_pred <- rast_list[str_detect(rast_list, paste(predictors_all, collapse = "|"))]

rasters_full <- raster::stack(rast_list_pred)
crs(rasters_full) <- crs_behr

# convert to data frame
predictors_sf <- rasters_full %>%
  rasterToPolygons() %>%
  st_as_sf()

predictors_sf <- predictors_sf %>%
  mutate(cell = cellFromXY(rasters_full, st_coordinates(st_centroid(predictors_sf)))) %>%
  rename_with(~str_remove_all(.x, "current_medium_"), contains("current_medium")) %>%
  rename_with(~str_remove_all(.x, "medium_"), contains("_medium_"))

# read in stability
stability_sf <- read_sf(here("data", "climate_poly", "new_stability.geojson"), crs = crs_behr) %>%
  rename(precip_trend = GlobalExtreme_prTrendExt,
         precip_var = GlobalExtreme_prVarExt,
         temp_trend = GlobalExtreme_tsTrendExt,
         temp_var = GlobalExtreme_tsVarExt)
Code
# add predictors and continents
full_sf <- left_join(predictors_sf, filter_df, by = "cell") %>%
  st_join(stability_sf, left = TRUE, largest = TRUE) %>%
  st_join(world_large, largest = TRUE)

ggplot() +
  geom_sf(data = world_small) +
  geom_sf(data = full_sf, aes(geometry = geometry, fill = gde, color = gde)) +
  scale_fill_viridis_c(na.value = "transparent") +
  scale_color_viridis_c(na.value = "transparent") +
  theme_bw()
Warning: st_crs<- : replacing crs does not reproject data; use st_transform for that
Create data sets with minimum OTU thresholds
10, 25, 50, 100, 150, 200
Code
min_otu_vals <- c(10, 25, 50, 100, 150, 200)

splits_sf <- map(min_otu_vals, ~filter(full_sf,
                                       num_otu >= .x,
                                       # there's a single missing value!
                                       !is.na(ghh_std_dev)))

# there is a single cell in South America that doesn't map to the continent across all data sets
splits_sf <- map(splits_sf, ~mutate(.x, continent = replace_na(continent, "South America")))

# name the data frames
names(splits_sf) <- paste0("min_otu_", min_otu_vals)
Split into testing and training data
Stratify the split according to continent to ensure reasonably even spatial representation between the training and test data. I’m doing a 75% train to 25% test split. The test data will not be touched until the end; this is not a train-validate split.
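As a minimal sketch (the split code itself isn’t shown here), a continent-stratified 75/25 split could be made with rsample, which is loaded via tidymodels; the train_sf and test_sf names are assumptions chosen to match how the training data are referenced below.

# sketch only: continent-stratified initial split per minimum-OTU data set
set.seed(8211)  # hypothetical seed
initial_splits <- map(splits_sf, ~initial_split(.x, prop = 0.75, strata = continent))
train_sf <- map(initial_splits, training)
test_sf  <- map(initial_splits, testing)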
I’m considering two main approaches to control for spatial autocorrelation in spatial modeling. One is to use spatial cross-validation to train the model and select parameters that predict well to spatially independent withheld data. The other is to model the spatial autocorrelation explicitly. When space is incorporated into the model through a spatial covariance structure (e.g. a spatial weights matrix for SAR, or Gaussian random fields), the spatial relationships among the response and predictors are assumed to hold across the area of prediction, which is highly unlikely outside of an interpolation scenario. When spatial weights are used, predicting beyond the data the model was fit to is not possible at all. So, for prediction into unsampled areas, models that don’t incorporate space explicitly but are selected using spatially independent validation data are preferred.
I’m therefore using a machine learning approach for mapping, where I’m performing regularized linear regression and random forest regression coupled with spatial cross-validation for feature selection.
Because spatial autocorrelation may remain in the residuals, and because a goal of the study is to understand the predictors, I’m also using a regularized Bayesian regression approach that includes a spatial covariance structure (SAR, spatial autoregressive) and regularizes the model coefficients with a regularized horseshoe prior.
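As a hedged illustration of what this kind of model could look like (not the specification actually fit later), a brms formula can combine a SAR term with a regularized horseshoe prior on the coefficients; train_df and W below are hypothetical placeholders for a training data frame and its spatial weights matrix.

# illustrative sketch only; object names (train_df, W) are hypothetical
fit_sar_hs <- brm(
  gde ~ bio_2 + bio_5 + bio_13 + bio_14 + bio_15 +
    temp_trend + temp_var + precip_trend + precip_var +
    ghh_std_dev + human_gHM +
    sar(W, type = "error"),                        # SAR error structure from a spatial weights matrix
  data   = train_df,
  data2  = list(W = W),                            # brms takes the weights matrix via data2
  family = gaussian(),
  prior  = set_prior("horseshoe(1)", class = "b"), # regularized horseshoe on the coefficients
  chains = 4, cores = 4
)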
Spatial CV
5-fold CV using random spatial blocks and k-means clustering. I’m exploring both methods to see if the train-validate splits overlap in predictor space, i.e. to make sure that segregating in space doesn’t lead to extrapolation of the environmental variables when tuning the model.
Code
set.seed(4562)
block_folds <- map(train_sf, ~spatial_block_cv(.x, v = 5))

set.seed(7658)
cluster_folds <- map(train_sf, ~spatial_clustering_cv(.x, v = 5))

# I have to modify cluster_folds later to convert them to tibbles, so I'm keeping this to make spatial plots of error easier
cluster_folds_sf <- cluster_folds
Code
autoplot(block_folds[[1]])
Code
autoplot(cluster_folds[[1]])
Environmental space
To ensure that the environmental space of each fold falls within the environmental space of the training data, I’m performing a PCA of the data, then coloring observations according to each train-validate split.
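The plot further below uses a pca_150_cluster data frame that isn’t constructed in the chunks shown here; this is a rough sketch of how it might be assembled from the k-means clustering folds (all object and column names are assumptions).

# sketch only: PCA of the predictors, tagged by analysis/assessment membership in each fold
pred_vars <- c("bio_2", "bio_5", "bio_13", "bio_14", "bio_15",
               "temp_trend", "temp_var", "precip_trend", "precip_var",
               "ghh_std_dev", "human_gHM")

pca_150_cluster <- map_dfr(
  seq_along(cluster_folds_sf$min_otu_150$splits),
  function(i) {
    split_i <- cluster_folds_sf$min_otu_150$splits[[i]]
    bind_rows(
      analysis(split_i)   %>% st_drop_geometry() %>% mutate(cv_split = "analysis"),
      assessment(split_i) %>% st_drop_geometry() %>% mutate(cv_split = "assessment")
    ) %>%
      mutate(split_number = paste0("Fold ", i))
  }
)

pca_fit <- prcomp(pca_150_cluster[, pred_vars], center = TRUE, scale. = TRUE)
pca_150_cluster <- bind_cols(pca_150_cluster, as_tibble(pca_fit$x[, c("PC1", "PC2")]))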
Since both methods demonstrate overlap in environmental space between train-validate splits, I’m going to move forward with the k-means clustering method of spatial cross-validation, because its folds are more geographically discrete than blocks and can be discussed in terms of geopolitical boundaries. Both methods have slightly uneven distributions in the number of observations assigned to assessment folds (e.g. split 2 of the min_otu_150 data has 9 assessment localities, while split 5 has 41), but most are similar enough.
Code
ggplot(data = pca_150_cluster, aes(x = PC1, y = PC2, color = cv_split)) +
  stat_ellipse() +
  geom_point() +
  theme_bw() +
  facet_wrap(~split_number) +
  labs(title = "Cluster CV")
Define the models
I’m defining the model specifications in parsnip. The two models are a regularized linear model and a random forest regression. The tuning parameters are below:
Linear model:
penalty: amount of regularization
mixture: proportion of lasso penalty, where 1 is a pure lasso model and 0 is a ridge regression
Random forest (ranger r package):
mtry: # of randomly selected predictors
trees: # of trees. I’m not going to tune this- a sufficiently high number like 1000 is fine
In addition, I’m considering both GDE and GDM as responses.
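The linear_reg_spec and rf_spec objects referenced in the next chunk aren’t defined in the code shown here; this is a minimal sketch of how they might be specified in parsnip, given the tuning parameters listed above.

# sketch only: model specifications assumed by the workflow sets below
linear_reg_spec <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

rf_spec <- rand_forest(mtry = tune(), trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression")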
Code
# specifying the responses (GDE and GDM) and the predictor variables
raw_vars <- workflow_variables(outcomes = gde,
                               predictors = c(bio_2, bio_5, bio_13, bio_14, bio_15,
                                              temp_trend, temp_var, precip_trend, precip_var,
                                              ghh_std_dev, human_gHM))

# I need to normalize the variables for linear regression, but leave them unchanged for random forest
normalized_vars <- raw_vars %>%
  step_normalize(all_predictors())

# specify the workflows for each model-preprocessing list
wf_linreg <- workflow_set(preproc = list(normalized = normalized_vars),
                          models = list(linear_regression = linear_reg_spec))

wf_rf <- workflow_set(preproc = list(raw = raw_vars),
                      models = list(rf = rf_spec))

full_wf <- bind_rows(wf_linreg, wf_rf)
Now I’m specifying the range of values to tune over for the four models (two for each response). I’m using a grid search and letting tidymodels choose reasonable values.
Code
# get the coefficients for linear models
get_lm_coefs <- function(x) {
  x %>%
    # get the lm model object
    extract_fit_engine() %>%
    # transform its format
    tidy()
}

grid_ctrl <- control_grid(save_pred = TRUE,
                          parallel_over = "resamples",
                          save_workflow = TRUE,
                          extract = get_lm_coefs)
Now time to tune the models!
Code
# for some reason, a tidymodels package doesn't like sf objects instead of tibbles for the fold data
for (i in 1:length(cluster_folds)) {
  cluster_folds[[i]]$splits <- ssf_to_sf(cluster_folds[[i]]$splits)
}

# I'm also timing the tuning
results_time_10 <- system.time(
  grid_results_10 <- full_wf %>%
    workflow_map(seed = 1841,
                 resamples = cluster_folds$min_otu_10,
                 grid = 25,
                 control = grid_ctrl,
                 verbose = TRUE)
)

results_time_10

nrow(collect_metrics(grid_results_10, summarize = FALSE))
Let’s check out the results! Looks like both of the methods performed similarly. I think linear regression is easier to interpret, so we can move forward there.
Code
autoplot(
  grid_results_10,
  rank_metric = "rmse",  # <- how to order models
  metric = "rmse",       # <- which metric to visualize
  select_best = TRUE     # <- one point per workflow
)

autoplot(grid_results_10, id = "normalized_linear_regression", metric = "rmse")
Now let’s select the best linear regression model!
Code
best_results <- grid_results_10 %>%
  extract_workflow_set_result("normalized_linear_regression") %>%
  select_best(metric = "rmse")

best_results

linear_best_wf <- grid_results_10 %>%
  extract_workflow("normalized_linear_regression") %>%
  finalize_workflow(best_results)

linear_fit_resamples <- fit_resamples(object = linear_best_wf,
                                      resamples = cluster_folds$min_otu_10,
                                      control = control_resamples(save_pred = TRUE))

linear_fit_pred <- collect_predictions(linear_fit_resamples)

linear_fit_metrics <- collect_metrics(linear_fit_resamples, summarize = FALSE) %>%
  filter(.metric == "rmse")

train_sf_10 <- cluster_folds_sf$min_otu_10$splits[[1]]$data %>%
  # to make combining with the predicted data easier
  mutate(.row = row_number())

full_fit <- linear_fit_pred %>%
  left_join(linear_fit_metrics, by = "id") %>%
  right_join(train_sf_10, by = ".row") %>%
  select(-ends_with(".y")) %>%
  rename_with(~str_remove(.x, ".x"), ends_with(".x")) %>%
  st_sf() %>%
  mutate(.resid = gde - .pred)

ggplot() +
  geom_sf(data = full_fit, aes(color = .estimate, fill = .estimate)) +
  scale_fill_viridis_c() +
  scale_color_viridis_c()

ggplot(data = full_fit, aes(x = .pred, y = .resid)) +
  geom_point() +
  geom_smooth()

qqnorm(full_fit$.resid)
qqline(full_fit$.resid)
Code
# create a neighborhood matrix (queen = TRUE means all neighbors, including diagonals, will be included)
sp_nb <- poly2nb(full_fit, queen = TRUE)

# create a weights matrix. style = "W" row-standardizes the weights, so each area's weights sum to 1.
# This way we can compare across areas with different numbers of neighbors, which is true for this data set.
# also, ignoring cases with no neighbors. Errors would get thrown otherwise, and since we have islands with no neighbors, this would be a problem
sp_w <- nb2listw(sp_nb, style = "W", zero.policy = TRUE)

moran.mc(full_fit$.resid, sp_w, nsim = 10000, zero.policy = TRUE)
SAR with projection predictive inference and horseshoe priors
Code
train_100 <- train_sf$min_otu_100

# create a neighborhood matrix (queen = TRUE means all neighbors, including diagonals, will be included)
sp_nb <- poly2nb(train_100, queen = TRUE)

# create a weights matrix. style = "W" row-standardizes the weights, so each area's weights sum to 1.
# This way we can compare across areas with different numbers of neighbors, which is true for this data set.
# also, ignoring cases with no neighbors. Errors would get thrown otherwise, and since we have islands with no neighbors, this would be a problem
sp_w <- nb2listw(sp_nb, style = "W", zero.policy = TRUE)

# just checking: the response has a fair amount of SAC, which suggests there may be residual SAC
moran.mc(train_100$gde, sp_w, nsim = 10000, zero.policy = TRUE)
Fit the full Bayesian SAR model with a horseshoe prior
I need to scale the data before modeling (mean 0, sd 1). When predicting to the test data, I will apply the centering and scaling factors from the training data to prevent data leakage.
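A hedged sketch of one way to do this, keeping the training means and standard deviations so the same factors can later be reapplied to the test data; the predictor vector, helper function, and test_100 object are assumptions, not the original code.

# sketch only: center and scale predictors with training-set parameters
pred_vars <- c("bio_2", "bio_5", "bio_13", "bio_14", "bio_15",
               "temp_trend", "temp_var", "precip_trend", "precip_var",
               "ghh_std_dev", "human_gHM")

train_means <- map_dbl(st_drop_geometry(train_100)[pred_vars], mean)
train_sds   <- map_dbl(st_drop_geometry(train_100)[pred_vars], sd)

scale_with <- function(df, means, sds) {
  mutate(df, across(all_of(names(means)),
                    ~ (.x - means[[cur_column()]]) / sds[[cur_column()]]))
}

train_100_scaled <- scale_with(train_100, train_means, train_sds)
# at evaluation time, the *training* factors are applied to the test data, e.g.:
# test_100_scaled <- scale_with(test_100, train_means, train_sds)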
Do projection predictive selection. After visually inspecting the performance statistics plots (ELPD and RMSE; example plot below), I determined that the model size suggested by projpred::suggest_size() was the most reasonable model size for all OTU filtering schemes.
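The selection step itself isn’t shown in this excerpt; this is a rough sketch of the usual projpred workflow, assuming a reference model fit named ref_fit (a hypothetical Bayesian fit with the horseshoe prior).

# sketch only: projection predictive variable selection with K-fold CV
vs <- cv_varsel(ref_fit, cv_method = "kfold", K = 5)

# performance statistic plots (ELPD and RMSE) inspected visually
plot(vs, stats = c("elpd", "rmse"), deltas = TRUE)

# model size suggested by projpred, then project onto that submodel
n_sel <- suggest_size(vs)
prj   <- project(vs, nterms = n_sel)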
Now I’m comparing the top models selected by the two model selection approaches: the variables that are selected, the levels of residual spatial autocorrelation (SAC), and the spatial distribution of error in the training set.
Residual SAC
It looks like the GLMM properly accounts for SAC in all data sets!
Code
# predictions
glmm_pred_gde <- map(glmm_gde, predict)

# map to sf data frames for spatial weights matrices and doing moran.mc easily
glmm_pred_sf_gde <- map2(train_sf, glmm_pred_gde, ~mutate(.x, .pred = pull(.y, 1), .resid = gde - .pred))

# create a neighborhood matrix (queen = TRUE means all neighbors, including diagonals, will be included) and spatial weights matrix
glmm_w_gde <- map(train_sf, ~poly2nb(.x, queen = TRUE)) %>%
  map(~nb2listw(.x, style = "W", zero.policy = TRUE))

glmm_moran_gde <- map2(glmm_pred_sf_gde, glmm_w_gde, ~moran.mc(.x$.resid, .y, nsim = 10000, zero.policy = TRUE))

# check moran p-values
map(glmm_moran_gde, ~.x[["p.value"]]) %>%
  bind_rows() %>%
  knitr::kable(caption = "Moran's I p-values")
Moran’s I p-values

| min_otu_10 | min_otu_25 | min_otu_50 | min_otu_100 | min_otu_150 | min_otu_200 |
|------------|------------|------------|-------------|-------------|-------------|
| 0.2170783  | 0.1277872  | 0.1851815  | 0.710329    | 0.4210579   | 0.5553445   |
Posterior plots
Generating a bunch of diagnostic plots to compare how the models perform on training data.
Get posteriors
Code
posts_gde <- map(glmm_gde, ~glmmfields::posterior_predict(.x, iter = 1000))
resp_gde  <- map(train_sf, ~pull(.x, gde))
Posterior-predictive check
The posteriors for all are similar, but the distribution of GDE gets narrower as the minimum OTU filter increases.
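The check itself can be run with bayesplot’s density overlays; a brief sketch using the posts_gde and resp_gde objects created above (the patchwork combination and layout are assumptions).

# sketch only: overlay posterior predictive draws on the observed GDE distribution per data set
ppc_gde <- map2(resp_gde, posts_gde, ~ppc_dens_overlay(y = .x, yrep = .y))
patchwork::wrap_plots(ppc_gde, ncol = 2)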
BIO5 (maximum temperature of the warmest month) shows up in all models, followed by BIO13 (precipitation of the wettest month; 4/6), temp_var (4/6), temp_trend (4/6), and BIO2 (mean diurnal temperature range; 3/6).