# for path managementlibrary(here)# for raster datalibrary(raster)# for data manipulationlibrary(tidyverse)# for spatial operationslibrary(sf)# for basemapslibrary(rnaturalearth)# for assessing SAClibrary(spdep)# for bayesian linear modellibrary(rstanarm)# for spatial glmmlibrary(glmmfields)# for model selection (projective predictive inference)library(projpred)# for spatial CVlibrary(spatialsample)# for easy exploratory plotslibrary(bayesplot)# for easy working with posteriorslibrary(tidybayes)# for modeling workflowlibrary(tidymodels)# for stitching together model output plotslibrary(patchwork)# for spatial t-tests of differences in GDE/GDM across the global freezelinelibrary(SpatialPack) source(here("R", "modeling_fns.R"))
Read in and process raw data
Read raw data
Code
# reading in the raw pairwise distances, just in case I need that info later# I already created per-cell summaries, which I'm reading in belowraw_pi <-read_csv(here("output", "spreadsheets", "cell_medium_3_10_pi.csv"))# only keeping relevant columnsraw_df <-read_csv(here("output", "spreadsheets", "cell_medium_3_10_sumstats.csv")) %>%select(cell, gdm = sqrt_pi, gde = hill_1, num_ind, num_otu, num_order)
Filter out bad cells
These cells contain wonky environmental data that impacts modeling. The wonky environmental data is likely due to their coastal location, resulting in faulty projection of the environmental models.
HOLDING OFF ON THIS FOR NOW
Code
filter_df <- raw_df
Add in spatial and environmental metadata
This includes converting the data frames to sf objects to make spatial operations and plotting easier, mapping cells to their continents, and extracting relevant environmental data.
Code
# setting the crs for maps and environmental data# behrmann projectioncrs_behr <-"+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs"# for assigning cells to continents. Islands are missed at coarser resolutionsworld_large <- rnaturalearth::ne_countries(scale ="large", returnclass ="sf") %>%select(continent, name_long) %>%st_transform(crs = crs_behr) # for mapping. this will be smaller so plotting is fasterworld_small <- rnaturalearth::ne_countries(scale ="small", returnclass ="sf") %>%select(continent, name_long) %>%st_transform(crs = crs_behr) # all predictorspredictors_all <-c("current_medium_bio_13", "current_medium_bio_14", "current_medium_bio_15", "current_medium_bio_2", "current_medium_bio_5", "ghh_medium_std_dev", "human_medium_gHM", "GlobalExtreme_tsTrendExt", "GlobalExtreme_tsVarExt", "GlobalExtreme_prTrendExt", "GlobalExtreme_prVarExt")# Read in predictor rastersrast_list <-list.files(here("data", "climate_agg"),pattern ="medium",full.names =TRUE) rast_list_pred <- rast_list[str_detect(rast_list, paste(predictors_all, collapse ="|"))]rasters_full <- raster::stack(rast_list_pred)crs(rasters_full) <- crs_behr# convert to data frame predictors_sf <- rasters_full %>%rasterToPolygons() %>%st_as_sf() predictors_sf <- predictors_sf %>%mutate(cell =cellFromXY(rasters_full, st_coordinates(st_centroid(predictors_sf)))) %>%rename_with(~str_remove_all(.x, "current_medium_"), contains("current_medium")) %>%rename_with(~str_remove_all(.x, "medium_"), contains("_medium_"))# read in stability stability_sf <-read_sf(here("data", "climate_poly", "new_stability.geojson"), crs = crs_behr) %>%rename(precip_trend = GlobalExtreme_prTrendExt,precip_var = GlobalExtreme_prVarExt,temp_trend = GlobalExtreme_tsTrendExt,temp_var = GlobalExtreme_tsVarExt)# for freezeline delineationdiv_sf <-st_read(here("data", "climate_poly", "min_temp_binary.geojson")) %>%st_transform(crs = crs_behr)
Reading layer `min_temp_binary' from data source
`/Users/connorfrench/Dropbox/Old_Mac/School_Stuff/CUNY/BigAss-bird-phylogeography/BigAss-phylogeography/data/climate_poly/min_temp_binary.geojson'
using driver `GeoJSON'
Simple feature collection with 2 features and 1 field
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -180.0001 ymin: -88.29181 xmax: 179.6665 ymax: 83.66653
Geodetic CRS: WGS 84
Code
# hack to validate the multipolygondiv_sf_buf <-st_buffer(div_sf, dist =0)
Code
# add predictors and continents full_sf <-left_join(predictors_sf, filter_df, by ="cell") %>%st_join(stability_sf, left =TRUE, largest =TRUE) %>%st_join(world_large, largest =TRUE) %>%st_join(div_sf_buf, largest =TRUE) %>%mutate(freezeline =if_else(min_temp =="temperate", 1L, 0L))ggplot() +geom_sf(data = world_small) +geom_sf(data = full_sf, aes(geometry = geometry, fill = gde, color = gde)) +scale_fill_viridis_c(na.value ="transparent") +scale_color_viridis_c(na.value ="transparent") +theme_bw()
Warning: st_crs<- : replacing crs does not reproject data; use st_transform for
that
Create data sets with minimum OTU thresholds
10, 25, 50, 100, 150, 200
Code
min_otu_vals <-c(10, 25, 50, 100, 150, 200)splits_sf <-map(min_otu_vals, ~filter(full_sf, num_otu >= .x, # there's a single missing value!is.na(ghh_std_dev)))# there is a single cell in South America that doesn't map to the continent across all data setssplits_sf <-map(splits_sf, ~mutate(.x, continent =replace_na(continent, "South America")))# name the data framesnames(splits_sf) <-paste0("min_otu_", min_otu_vals)
Split into testing and training data
Stratify the split according to continent to ensure somewhat even spatial representation of the test vs training data. I’m doing a 75% train to 25% test split. The test data will not be touched until the end- this is not a train-validate split.
Need to scale the data before modeling (mean 0, sd 1). When predicting to test data, I will apply the centering and scaling factors of the training data to the test data to prevent data leakage.
Do projection predictive selection. After visually inspecting the performance statistics plots (ELPD and RMSE; example plot below), I determined that the model size suggested by projpred::suggest_size() was the most reasonable model size for all OTU filtering schemes.
Comparing the top models selected from the two model selection approaches. I’m going to compare the variables that are selected, the levels of residual spatial autocorrelation (SAC), and the spatial distribution of error in the training set.
Residual SAC
The GLMM properly accounts for SAC in both data sets!
Code
# predictionsglmm_pred_gde <-map(glmm_gde, predict)glmm_pred_gdm <-map(glmm_gdm, predict)# map to sf data frames for spatial weights matrices and doing moran.mc easilyglmm_pred_sf_gde <-map2(train_sf, glmm_pred_gde, ~mutate(.x, .pred =pull(.y, 1), .resid = gde - .pred))glmm_pred_sf_gdm <-map2(train_sf, glmm_pred_gdm, ~mutate(.x, .pred =pull(.y, 1), .resid = gdm - .pred))# create a neighborhood matrix (queen = TRUE means all neighbors, including diagonals, will be included) and spatial weights matrixglmm_w <-map(train_sf, ~poly2nb(.x, queen =TRUE)) %>%map(~nb2listw(.x, style ="W", zero.policy =TRUE))glmm_moran_gde <-map2(glmm_pred_sf_gde, glmm_w, ~moran.mc(.x$.resid, .y, nsim =10000, zero.policy =TRUE))glmm_moran_gdm <-map2(glmm_pred_sf_gdm, glmm_w, ~moran.mc(.x$.resid, .y, nsim =10000, zero.policy =TRUE))# check moran p-valuesmap(glmm_moran_gde, ~.x[["p.value"]]) %>%bind_rows() %>% knitr::kable(caption ="Moran's I p-values (GDE)")
Moran’s I p-values (GDE)
min_otu_10
min_otu_25
min_otu_50
min_otu_100
min_otu_150
min_otu_200
0.2127787
0.1192881
0.1878812
0.7075292
0.4193581
0.5543446
Code
map(glmm_moran_gdm, ~.x[["p.value"]]) %>%bind_rows() %>% knitr::kable(caption ="Moran's I p-values (GDM)")
Moran’s I p-values (GDM)
min_otu_10
min_otu_25
min_otu_50
min_otu_100
min_otu_150
min_otu_200
0.3030697
0.1594841
0.1461854
0.7384262
0.4887511
0.2788721
Posterior plots
Generating a bunch of diagnostic plots to compare how the models perform on training data.
Get posteriors
Code
posts_gde <-map(glmm_gde, ~glmmfields::posterior_predict(.x, iter =1000))posts_gdm <-map(glmm_gdm, ~glmmfields::posterior_predict(.x, iter =1000))resp_gde <-map(train_sf, ~pull(.x, gde))resp_gdm <-map(train_sf, ~pull(.x, gdm))
Posterior-predictive check
The posteriors for all are similar, but the distribution of GDE gets narrower as the minimum OTU filter increases.
For GDE:
BIO5 (max temp of the warmest month) shows up in all models, followed by BIO13 (prec of wettest month, 4/6), temp_var (4/6), temp_trend (4/6), BIO2 (diurnal temp range, 3/6)
For GDM:
Code
# only include post-warmup drawsbeta_posts_gde <-map(glmm_gde, get_beta_posts) beta_posts_gdm <-map(glmm_gdm, get_beta_posts) beta_plots_gde <-map(beta_posts_gde, ~mcmc_intervals(.x, prob =0.70,prob_outer =0.95) +geom_vline(xintercept =0) +ggtitle("70% and 95% CI intervals")) beta_plots_gdm <-map(beta_posts_gdm, ~mcmc_intervals(.x, prob =0.70,prob_outer =0.95) +geom_vline(xintercept =0) +ggtitle("70% and 95% CI intervals"))
lat_df <-map(splits_sf, function(x) { c <-st_coordinates(st_centroid(x)) df <-mutate(x, lat = c[,2], lon = c[,1])return(df)})
Warning in st_centroid.sf(x): st_centroid assumes attributes are constant over
geometries of x
Warning in st_centroid.sf(x): st_centroid assumes attributes are constant over
geometries of x
Warning in st_centroid.sf(x): st_centroid assumes attributes are constant over
geometries of x
Warning in st_centroid.sf(x): st_centroid assumes attributes are constant over
geometries of x
Warning in st_centroid.sf(x): st_centroid assumes attributes are constant over
geometries of x
Warning in st_centroid.sf(x): st_centroid assumes attributes are constant over
geometries of x