# for path management
library(here)
# for raster data
library(raster)
# for data manipulation
library(tidyverse)
# for spatial operations
library(sf)
# for basemaps
library(rnaturalearth)
# for assessing SAC
library(spdep)
# for bayesian linear model
library(brms)
# for spatial glmm
library(glmmfields)
# for model selection (projection predictive inference)
library(projpred)
# for spatial CV
library(spatialsample)
# for easy exploratory plots
library(bayesplot)
# for easy working with posteriors
library(tidybayes)
# for modeling workflow
library(tidymodels)
# for stitching together model output plots
library(patchwork)

source(here("R", "modeling_fns.R"))
Read in and process raw data
Read raw data
Code
# reading in the raw pairwise distances, just in case I need that info later
# I already created per-cell summaries, which I'm reading in below
raw_pi <- read_csv(here("output", "spreadsheets", "cell_medium_3_10_pi.csv"))

# only keeping relevant columns
raw_df <- read_csv(here("output", "spreadsheets", "cell_medium_3_10_sumstats.csv")) %>%
  select(cell, gdm = sqrt_pi, gde = hill_1, num_ind, num_otu, num_order)
Filter out bad cells
These cells contain wonky environmental data that distorts modeling. This is likely due to their coastal location, which results in faulty projection of the environmental models.
HOLDING OFF ON THIS FOR NOW
Code
filter_df <- raw_df
Add in spatial and environmental metadata
This includes converting the data frames to sf objects to make spatial operations and plotting easier, mapping cells to their continents, and extracting relevant environmental data.
Code
# setting the crs for maps and environmental data
# behrmann projection
crs_behr <- "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"

# for assigning cells to continents. Islands are missed at coarser resolutions
world_large <- rnaturalearth::ne_countries(scale = "large", returnclass = "sf") %>%
  select(continent, name_long) %>%
  st_transform(crs = crs_behr)

# for mapping. this will be smaller so plotting is faster
world_small <- rnaturalearth::ne_countries(scale = "small", returnclass = "sf") %>%
  select(continent, name_long) %>%
  st_transform(crs = crs_behr)

# all predictors
predictors_all <- c("current_medium_bio_13", "current_medium_bio_14", "current_medium_bio_15",
                    "current_medium_bio_2", "current_medium_bio_5", "ghh_medium_std_dev",
                    "human_medium_gHM", "GlobalExtreme_tsTrendExt", "GlobalExtreme_tsVarExt",
                    "GlobalExtreme_prTrendExt", "GlobalExtreme_prVarExt")

# Read in predictor rasters
rast_list <- list.files(here("data", "climate_agg"),
                        pattern = "medium",
                        full.names = TRUE)

rast_list_pred <- rast_list[str_detect(rast_list, paste(predictors_all, collapse = "|"))]

rasters_full <- raster::stack(rast_list_pred)
crs(rasters_full) <- crs_behr

# convert to data frame
predictors_sf <- rasters_full %>%
  rasterToPolygons() %>%
  st_as_sf()

predictors_sf <- predictors_sf %>%
  mutate(cell = cellFromXY(rasters_full, st_coordinates(st_centroid(predictors_sf)))) %>%
  rename_with(~str_remove_all(.x, "current_medium_"), contains("current_medium")) %>%
  rename_with(~str_remove_all(.x, "medium_"), contains("_medium_"))

# read in stability
stability_sf <- read_sf(here("data", "climate_poly", "new_stability.geojson"), crs = crs_behr) %>%
  rename(precip_trend = GlobalExtreme_prTrendExt,
         precip_var = GlobalExtreme_prVarExt,
         temp_trend = GlobalExtreme_tsTrendExt,
         temp_var = GlobalExtreme_tsVarExt)
Code
# add predictors and continents
full_sf <- left_join(predictors_sf, filter_df, by = "cell") %>%
  st_join(stability_sf, left = TRUE, largest = TRUE) %>%
  st_join(world_large, largest = TRUE)

ggplot() +
  geom_sf(data = world_small) +
  geom_sf(data = full_sf, aes(geometry = geometry, fill = gde, color = gde)) +
  scale_fill_viridis_c(na.value = "transparent") +
  scale_color_viridis_c(na.value = "transparent") +
  theme_bw()
Warning: st_crs<- : replacing crs does not reproject data; use st_transform for that
Create data sets with minimum OTU thresholds
10, 25, 50, 100, 150, 200
Code
min_otu_vals <- c(10, 25, 50, 100, 150, 200)

splits_sf <- map(min_otu_vals, ~filter(full_sf,
                                       num_otu >= .x,
                                       # there's a single missing value!
                                       !is.na(ghh_std_dev)))

# there is a single cell in South America that doesn't map to the continent across all data sets
splits_sf <- map(splits_sf, ~mutate(.x, continent = replace_na(continent, "South America")))

# name the data frames
names(splits_sf) <- paste0("min_otu_", min_otu_vals)
Split into testing and training data
Stratify the split according to continent to ensure reasonably even spatial representation between the training and test data. I’m doing a 75% train to 25% test split. The test data will not be touched until the end; this is not a train-validate split.
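As a minimal sketch (the split code itself isn’t shown here), a continent-stratified 75/25 split could be made with rsample, which is loaded via tidymodels; the train_sf and test_sf names are assumptions chosen to match how the training data are referenced below.

# sketch only: continent-stratified initial split per minimum-OTU data set
set.seed(8211)  # hypothetical seed
initial_splits <- map(splits_sf, ~initial_split(.x, prop = 0.75, strata = continent))
train_sf <- map(initial_splits, training)
test_sf  <- map(initial_splits, testing)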
I’m considering two main approaches to control for spatial autocorrelation in spatial modeling. One is to use spatial cross-validation to train the model and select parameters that predict well to spatially independent withheld data. The other is to model the spatial autocorrelation explicitly. When space is incorporated into the model through a spatial covariance structure (e.g. a spatial weights matrix for SAR, or Gaussian random fields), the spatial relationships among the response and predictors are assumed to hold across the area of prediction, which is highly unlikely outside of an interpolation scenario. When spatial weights are used, predicting beyond the data the model was fit to is not possible at all. So, for prediction into unsampled areas, models that don’t incorporate space explicitly but are selected using spatially independent validation data are preferred.
I’m therefore using a machine learning approach for mapping, where I’m performing regularized linear regression and random forest regression coupled with spatial cross-validation for feature selection.
Because spatial autocorrelation may remain in the residuals, and because a goal of the study is to understand the predictors, I’m also using a regularized Bayesian regression approach that includes a spatial covariance structure (SAR, spatial autoregressive) and regularizes the model coefficients with a regularized horseshoe prior.
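As a hedged illustration of what this kind of model could look like (not the specification actually fit later), a brms formula can combine a SAR term with a regularized horseshoe prior on the coefficients; train_df and W below are hypothetical placeholders for a training data frame and its spatial weights matrix.

# illustrative sketch only; object names (train_df, W) are hypothetical
fit_sar_hs <- brm(
  gde ~ bio_2 + bio_5 + bio_13 + bio_14 + bio_15 +
    temp_trend + temp_var + precip_trend + precip_var +
    ghh_std_dev + human_gHM +
    sar(W, type = "error"),                        # SAR error structure from a spatial weights matrix
  data   = train_df,
  data2  = list(W = W),                            # brms takes the weights matrix via data2
  family = gaussian(),
  prior  = set_prior("horseshoe(1)", class = "b"), # regularized horseshoe on the coefficients
  chains = 4, cores = 4
)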
Spatial CV
5-fold CV using random spatial blocks and k-means clustering. I’m exploring both methods to see if the train-validate splits overlap in predictor space, i.e. to make sure that segregating in space doesn’t lead to extrapolation of the environmental variables when tuning the model.
Code
set.seed(4562)
block_folds <- map(train_sf, ~spatial_block_cv(.x, v = 5))

set.seed(7658)
cluster_folds <- map(train_sf, ~spatial_clustering_cv(.x, v = 5))

# I have to modify cluster_folds later to convert them to tibbles, so I'm keeping this to make spatial plots of error easier
cluster_folds_sf <- cluster_folds
Code
autoplot(block_folds[[1]])
Code
autoplot(cluster_folds[[1]])
Environmental space
To ensure that the environmental space of each fold falls within the environmental space of the training data, I’m performing a PCA of the data, then coloring observations according to each train-validate split.
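The plot further below uses a pca_150_cluster data frame that isn’t constructed in the chunks shown here; this is a rough sketch of how it might be assembled from the k-means clustering folds (all object and column names are assumptions).

# sketch only: PCA of the predictors, tagged by analysis/assessment membership in each fold
pred_vars <- c("bio_2", "bio_5", "bio_13", "bio_14", "bio_15",
               "temp_trend", "temp_var", "precip_trend", "precip_var",
               "ghh_std_dev", "human_gHM")

pca_150_cluster <- map_dfr(
  seq_along(cluster_folds_sf$min_otu_150$splits),
  function(i) {
    split_i <- cluster_folds_sf$min_otu_150$splits[[i]]
    bind_rows(
      analysis(split_i)   %>% st_drop_geometry() %>% mutate(cv_split = "analysis"),
      assessment(split_i) %>% st_drop_geometry() %>% mutate(cv_split = "assessment")
    ) %>%
      mutate(split_number = paste0("Fold ", i))
  }
)

pca_fit <- prcomp(pca_150_cluster[, pred_vars], center = TRUE, scale. = TRUE)
pca_150_cluster <- bind_cols(pca_150_cluster, as_tibble(pca_fit$x[, c("PC1", "PC2")]))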
Since both methods demonstrate overlap in environmental space between train-validate splits, I’m going to move forward with the k-means clustering method of spatial cross-validation, because its folds are more geographically discrete than blocks and can be discussed in terms of geopolitical boundaries. Both methods have slightly uneven distributions in the number of observations assigned to assessment folds (e.g. split 2 of the min_otu_150 data has 9 assessment localities, while split 5 has 41), but most are similar enough.
Code
ggplot(data = pca_150_cluster, aes(x = PC1, y = PC2, color = cv_split)) +
  stat_ellipse() +
  geom_point() +
  theme_bw() +
  facet_wrap(~split_number) +
  labs(title = "Cluster CV")
Define the models
I’m defining the model specifications in parsnip. The two models are a regularized linear model and a random forest regression. The tuning parameters are below:
Linear model:
penalty: amount of regularization
mixture: proportion of lasso penalty, where 1 is a pure lasso model and 0 is a ridge regression
Random forest (ranger r package):
mtry: # of randomly selected predictors
trees: # of trees. I’m not going to tune this- a sufficiently high number like 1000 is fine
In addition, I’m considering both GDE and GDM as responses.
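The linear_reg_spec and rf_spec objects referenced in the next chunk aren’t defined in the code shown here; this is a minimal sketch of how they might be specified in parsnip, given the tuning parameters listed above.

# sketch only: model specifications assumed by the workflow sets below
linear_reg_spec <- linear_reg(penalty = tune(), mixture = tune()) %>%
  set_engine("glmnet")

rf_spec <- rand_forest(mtry = tune(), trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression")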
Code
# specifying the responses (GDE and GDM) and the predictor variables
raw_vars <- workflow_variables(outcomes = gde,
                               predictors = c(bio_2, bio_5, bio_13, bio_14, bio_15,
                                              temp_trend, temp_var, precip_trend, precip_var,
                                              ghh_std_dev, human_gHM))

# I need to normalize the variables for linear regression, but leave them unchanged for random forest
normalized_vars <- raw_vars %>%
  step_normalize(all_predictors())

# specify the workflows for each model-preprocessing list
wf_linreg <- workflow_set(preproc = list(normalized = normalized_vars),
                          models = list(linear_regression = linear_reg_spec))

wf_rf <- workflow_set(preproc = list(raw = raw_vars),
                      models = list(rf = rf_spec))

full_wf <- bind_rows(wf_linreg, wf_rf)
Now I’m specifying the range of values to tune over for the four models (two for each response). I’m using a grid search and letting tidymodels choose reasonable values.
Code
# get the coefficients for linear models
get_lm_coefs <- function(x) {
  x %>%
    # get the lm model object
    extract_fit_engine() %>%
    # transform its format
    tidy()
}

grid_ctrl <- control_grid(save_pred = TRUE,
                          parallel_over = "resamples",
                          save_workflow = TRUE,
                          extract = get_lm_coefs)
Now time to tune the models!
Code
# for some reason, a tidymodels package doesn't like sf objects instead of tibbles for the fold data
for (i in 1:length(cluster_folds)) {
  cluster_folds[[i]]$splits <- ssf_to_sf(cluster_folds[[i]]$splits)
}

# I'm also timing the tuning
results_time_10 <- system.time(
  grid_results_10 <- full_wf %>%
    workflow_map(seed = 1841,
                 resamples = cluster_folds$min_otu_10,
                 grid = 25,
                 control = grid_ctrl,
                 verbose = TRUE)
)

results_time_10

nrow(collect_metrics(grid_results_10, summarize = FALSE))
Let’s check out the results! Looks like both of the methods performed similarly. I think linear regression is easier to interpret, so we can move forward there.
Code
autoplot(
  grid_results_10,
  rank_metric = "rmse",  # <- how to order models
  metric = "rmse",       # <- which metric to visualize
  select_best = TRUE     # <- one point per workflow
)

autoplot(grid_results_10, id = "normalized_linear_regression", metric = "rmse")
Now let’s select the best linear regression model!
Code
best_results <- grid_results_10 %>%
  extract_workflow_set_result("normalized_linear_regression") %>%
  select_best(metric = "rmse")

best_results

linear_best_wf <- grid_results_10 %>%
  extract_workflow("normalized_linear_regression") %>%
  finalize_workflow(best_results)

linear_fit_resamples <- fit_resamples(object = linear_best_wf,
                                      resamples = cluster_folds$min_otu_10,
                                      control = control_resamples(save_pred = TRUE))

linear_fit_pred <- collect_predictions(linear_fit_resamples)

linear_fit_metrics <- collect_metrics(linear_fit_resamples, summarize = FALSE) %>%
  filter(.metric == "rmse")

train_sf_10 <- cluster_folds_sf$min_otu_10$splits[[1]]$data %>%
  # to make combining with the predicted data easier
  mutate(.row = row_number())

full_fit <- linear_fit_pred %>%
  left_join(linear_fit_metrics, by = "id") %>%
  right_join(train_sf_10, by = ".row") %>%
  select(-ends_with(".y")) %>%
  rename_with(~str_remove(.x, ".x"), ends_with(".x")) %>%
  st_sf() %>%
  mutate(.resid = gde - .pred)

ggplot() +
  geom_sf(data = full_fit, aes(color = .estimate, fill = .estimate)) +
  scale_fill_viridis_c() +
  scale_color_viridis_c()

ggplot(data = full_fit, aes(x = .pred, y = .resid)) +
  geom_point() +
  geom_smooth()

qqnorm(full_fit$.resid)
qqline(full_fit$.resid)
Code
# create a neighborhood matrix (queen = TRUE means all neighbors, including diagonals, will be included)
sp_nb <- poly2nb(full_fit, queen = TRUE)

# create a weights matrix. style = "W" row-standardizes the weights, so each area's weights sum to 1.
# This way we can compare across areas with different numbers of neighbors, which is true for this data set.
# also, ignoring cases with no neighbors. Errors would get thrown otherwise, and since we have islands with no neighbors, this would be a problem
sp_w <- nb2listw(sp_nb, style = "W", zero.policy = TRUE)

moran.mc(full_fit$.resid, sp_w, nsim = 10000, zero.policy = TRUE)
SAR with projection predictive inference and horseshoe priors
Code
train_100 <- train_sf$min_otu_100

# create a neighborhood matrix (queen = TRUE means all neighbors, including diagonals, will be included)
sp_nb <- poly2nb(train_100, queen = TRUE)

# create a weights matrix. style = "W" row-standardizes the weights, so each area's weights sum to 1.
# This way we can compare across areas with different numbers of neighbors, which is true for this data set.
# also, ignoring cases with no neighbors. Errors would get thrown otherwise, and since we have islands with no neighbors, this would be a problem
sp_w <- nb2listw(sp_nb, style = "W", zero.policy = TRUE)

# just checking: the response has a fair amount of SAC, which suggests there may be residual SAC
moran.mc(train_100$gde, sp_w, nsim = 10000, zero.policy = TRUE)
Fit the full Bayesian SAR model with a horseshoe prior
I need to scale the data before modeling (mean 0, sd 1). When predicting to the test data, I will apply the centering and scaling factors from the training data to prevent data leakage.
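A hedged sketch of one way to do this, keeping the training means and standard deviations so the same factors can later be reapplied to the test data; the predictor vector, helper function, and test_100 object are assumptions, not the original code.

# sketch only: center and scale predictors with training-set parameters
pred_vars <- c("bio_2", "bio_5", "bio_13", "bio_14", "bio_15",
               "temp_trend", "temp_var", "precip_trend", "precip_var",
               "ghh_std_dev", "human_gHM")

train_means <- map_dbl(st_drop_geometry(train_100)[pred_vars], mean)
train_sds   <- map_dbl(st_drop_geometry(train_100)[pred_vars], sd)

scale_with <- function(df, means, sds) {
  mutate(df, across(all_of(names(means)),
                    ~ (.x - means[[cur_column()]]) / sds[[cur_column()]]))
}

train_100_scaled <- scale_with(train_100, train_means, train_sds)
# at evaluation time, the *training* factors are applied to the test data, e.g.:
# test_100_scaled <- scale_with(test_100, train_means, train_sds)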
Do projection predictive selection. After visually inspecting the performance statistics plots (ELPD and RMSE; example plot below), I determined that the model size suggested by projpred::suggest_size() was the most reasonable model size for all OTU filtering schemes.
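The selection step itself isn’t shown in this excerpt; this is a rough sketch of the usual projpred workflow, assuming a reference model fit named ref_fit (a hypothetical Bayesian fit with the horseshoe prior).

# sketch only: projection predictive variable selection with K-fold CV
vs <- cv_varsel(ref_fit, cv_method = "kfold", K = 5)

# performance statistic plots (ELPD and RMSE) inspected visually
plot(vs, stats = c("elpd", "rmse"), deltas = TRUE)

# model size suggested by projpred, then project onto that submodel
n_sel <- suggest_size(vs)
prj   <- project(vs, nterms = n_sel)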
Now I’m comparing the top models selected by the two model selection approaches: the variables that are selected, the levels of residual spatial autocorrelation (SAC), and the spatial distribution of error in the training set.
Residual SAC
It looks like the GLMM properly accounts for SAC in all data sets!
Code
# predictions
glmm_pred_gde <- map(glmm_gde, predict)

# map to sf data frames for spatial weights matrices and doing moran.mc easily
glmm_pred_sf_gde <- map2(train_sf, glmm_pred_gde, ~mutate(.x, .pred = pull(.y, 1), .resid = gde - .pred))

# create a neighborhood matrix (queen = TRUE means all neighbors, including diagonals, will be included) and spatial weights matrix
glmm_w_gde <- map(train_sf, ~poly2nb(.x, queen = TRUE)) %>%
  map(~nb2listw(.x, style = "W", zero.policy = TRUE))

glmm_moran_gde <- map2(glmm_pred_sf_gde, glmm_w_gde, ~moran.mc(.x$.resid, .y, nsim = 10000, zero.policy = TRUE))

# check moran p-values
map(glmm_moran_gde, ~.x[["p.value"]]) %>%
  bind_rows() %>%
  knitr::kable(caption = "Moran's I p-values")
Moran’s I p-values

| min_otu_10 | min_otu_25 | min_otu_50 | min_otu_100 | min_otu_150 | min_otu_200 |
|------------|------------|------------|-------------|-------------|-------------|
| 0.2170783  | 0.1277872  | 0.1851815  | 0.710329    | 0.4210579   | 0.5553445   |
Posterior plots
Generating a bunch of diagnostic plots to compare how the models perform on training data.
Get posteriors
Code
posts_gde <- map(glmm_gde, ~glmmfields::posterior_predict(.x, iter = 1000))
resp_gde  <- map(train_sf, ~pull(.x, gde))
Posterior-predictive check
The posteriors for all are similar, but the distribution of GDE gets narrower as the minimum OTU filter increases.
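The check itself can be run with bayesplot’s density overlays; a brief sketch using the posts_gde and resp_gde objects created above (the patchwork combination and layout are assumptions).

# sketch only: overlay posterior predictive draws on the observed GDE distribution per data set
ppc_gde <- map2(resp_gde, posts_gde, ~ppc_dens_overlay(y = .x, yrep = .y))
patchwork::wrap_plots(ppc_gde, ncol = 2)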
BIO5 (maximum temperature of the warmest month) shows up in all models, followed by BIO13 (precipitation of the wettest month; 4/6), temp_var (4/6), temp_trend (4/6), and BIO2 (mean diurnal temperature range; 3/6).