library(tidyverse); library(mgcv); library(mgcViz); library(janitor); library(tidymodels)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.
## Loading required package: qgam
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Registered S3 method overwritten by 'mgcViz':
##   method from  
##   +.gg   GGally
## 
## Attaching package: 'mgcViz'
## The following objects are masked from 'package:stats':
## 
##     qqline, qqnorm, qqplot
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom        0.7.10     ✓ rsample      0.1.1 
## ✓ dials        0.0.10     ✓ tune         0.1.6 
## ✓ infer        1.0.0      ✓ workflows    0.2.4 
## ✓ modeldata    0.1.1      ✓ workflowsets 0.1.0 
## ✓ parsnip      0.1.7      ✓ yardstick    0.0.9 
## ✓ recipes      0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x recipes::check()  masks qgam::check()
## x nlme::collapse()  masks dplyr::collapse()
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(skimr); library(ggmap); library(vip)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
d <- readRDS("~/Dropbox/Mac (2)/Desktop/aru-projects/sd_data.rds")
tidymodels_prefer()

glimpse(d)
## Rows: 4,306
## Columns: 11
## $ rID                    <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ lc1990                 <dbl> 21, 3, 4, 21, 4, 4, 4, 3, 4, 3, 4, 3, 20, 3, 4,…
## $ lc2000                 <dbl> 7, 3, 3, 7, 3, 4, 3, 3, 4, 3, 4, 1, 7, 3, 4, 1,…
## $ lc2015                 <dbl> 10, 1, 4, 10, 4, 4, 3, 3, 4, 3, 4, 4, 10, 3, 4,…
## $ bt                     <dbl> 603.0000, 1194.0000, 1251.2693, 575.7488, 269.5…
## $ gsw                    <dbl> 1072.00000, 1147.00000, 1265.49651, 764.46792, …
## $ meantemp               <dbl> 93.43246, 95.09166, 94.10497, 91.58929, 97.1954…
## $ diurnal_range          <dbl> 72.99537, 72.97844, 74.00000, 75.80238, 62.0000…
## $ min_temp_coldest_month <dbl> 5.4324588, 8.1833136, -3.1581891, 0.5902241, 15…
## $ presence               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ coords                 <dbl[,3]> <matrix[26 x 3]>

Code book

  • lc1990 21-category land cover - dominant class
  • lc2000 10-category land cover - dominant class per km2
  • lc2015 10-category land cover - dominant class per km2
  • bt blue tit hectad counts (observations per km2)
  • gsw great spotted woodpecker hectad counts (observations per km2)
  • meantemp mean temperature across the year, C
  • diurnal_range mean daily temperature variation, C
  • min_temp_coldest_month minimum temperature of the coldest month, C
  • presence presence/pseudo-absence of willow tit (pseudo-absences generated by random background sampling during sdmData preparation)
  • rID row ID
  • coords x y coordinates in metres
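
skimr is loaded above but not used yet; as an optional extra (my addition, not part of the original analysis), skim() gives a quick summary of the code book variables:

# my addition: summary statistics for the code book variables
# drop the coords matrix column so only the scalar columns are summarised
d %>%
  select(-coords) %>%
  skimr::skim()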

Convert outcome to factor and extract x, y variables

d_1 <- d %>%
  mutate(presence = factor(presence),   # outcome as a factor for classification
         coords1 = data.frame(coords),  # unpack the coordinate matrix
         x = coords1[, 2],              # x coordinate (m)
         y = coords1[, 3])              # y coordinate (m)

d_1 <- d_1 %>%
  select(-c(rID, x, y)) 

d_1 %>%
  head()

Split data and create validation dataset

d_split <- initial_split(d_1, strata = presence)
d_train <- training(d_split)
d_test <- testing(d_split)

val_set <- validation_split(d_train, 
                            strata = presence, 
                            prop = 0.80)
val_set
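
As a quick sanity check (my addition, not in the original workflow), the class balance of presence can be compared across the training and test sets to confirm the stratified split behaved as expected:

# check that strata = presence kept the class balance similar in both sets
bind_rows(train = count(d_train, presence),
          test  = count(d_test, presence),
          .id = "set") %>%
  group_by(set) %>%
  mutate(prop = n / sum(n))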

Create recipe (pre-processing)

wt_rec <- recipe(presence ~ bt + gsw + meantemp + diurnal_range + min_temp_coldest_month + lc1990 + lc2000 + lc2015, data = d_train) %>%
  step_log(bt, base = 10) %>%
  step_log(gsw, base = 10) 

Create model specifications

glm <- 
  logistic_reg() %>%
  set_engine("glm") %>% 
  set_mode("classification")
  
lr_mod <- 
  logistic_reg(penalty = 1) %>% 
  set_engine("glmnet")

rf_mod <- 
  rand_forest(mtry = 1, min_n = 8, trees = 1000) %>% 
  set_engine("ranger", num.threads = 8, importance = "impurity") %>% 
  set_mode("classification")
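
To see exactly what each specification will pass to its engine (for example, how penalty = 1 and the ranger arguments are carried through), parsnip's translate() can be used; this is a small check I have added, not part of the original post:

# inspect the calls parsnip will build for the glmnet and ranger engines
lr_mod %>% translate()
rf_mod %>% translate()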

Resamples

wt_cv_folds <- bake(prep(wt_rec),
                    new_data = d_train) %>%
  rsample::vfold_cv(v = 5)

Fit models

glmw <- workflow() %>%
  add_model(glm) %>%
  add_recipe(wt_rec) 

glm_fit <- fit(glmw,  d_train)

glmnetw <- workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(wt_rec) 

glmnet_fit <- fit(glmnetw,  d_train)

rfw <- workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(wt_rec) 

rf_fit <- fit(rfw,  d_train)
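
The three workflows above are fitted but not yet compared; as a rough sketch (my addition, using a hypothetical helper test_auc()), their ROC AUC on the held-out test set could be checked like this, with .pred_0 as the event probability to match the ROC curve later in the post:

# hypothetical helper: ROC AUC of a fitted workflow on new data
test_auc <- function(fitted_wf, new_data) {
  predict(fitted_wf, new_data, type = "prob") %>%
    bind_cols(select(new_data, presence)) %>%
    roc_auc(truth = presence, .pred_0)
}

bind_rows(glm    = test_auc(glm_fit, d_test),
          glmnet = test_auc(glmnet_fit, d_test),
          rf     = test_auc(rf_fit, d_test),
          .id = "model")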



## after Julia Silge

xgb_spec <- boost_tree(
  trees = 1000, 
  tree_depth = tune(), min_n = tune(), 
  loss_reduction = tune(),                     ## first three: model complexity
  sample_size = tune(), mtry = tune(),         ## randomness
  learn_rate = tune()                          ## step size
) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

xgb_spec
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), d_train),
  learn_rate(), 
  size = 30
)

xgb_grid
xgboost_wf <- workflow() %>%
  add_model(xgb_spec) %>%
  add_recipe(wt_rec)

set.seed(42)

vb_folds <- vfold_cv(d_train, strata = presence)

doParallel::registerDoParallel()
set.seed(12345)


xgb_res <- tune_grid(
  xgboost_wf,
  resamples = vb_folds, 
  grid = xgb_grid, 
  control  = control_grid(save_pred = TRUE)
  
)


collect_metrics(xgb_res)
xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, mtry:sample_size) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")

show_best(xgb_res, "roc_auc")
best_auc <- select_best(xgb_res, "roc_auc")
best_auc
final_xgb <- finalize_workflow(
  xgboost_wf,
  best_auc
)

final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_log()
## • step_log()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = 3
##   trees = 1000
##   min_n = 10
##   tree_depth = 7
##   learn_rate = 0.00100950459024817
##   loss_reduction = 6.35134527379776e-09
##   sample_size = 0.601558735554572
## 
## Computational engine: xgboost
final_xgb %>%
  fit(data = d_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
## [15:45:32] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

final_res <- last_fit(final_xgb, d_split)

collect_metrics(final_res)
final_res %>%
  collect_predictions() %>%
  roc_curve(truth = presence, .pred_0) %>%
  autoplot()
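
Alongside the ROC curve, the held-out predictions from last_fit() can also be summarised as a confusion matrix (a sketch I have added, not in the original):

# confusion matrix on the test-set predictions from the final xgboost fit
final_res %>%
  collect_predictions() %>%
  conf_mat(truth = presence, estimate = .pred_class)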

Review results

glm

glm_fit %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(p.value < 0.05)

  • Blue tit density (bt) and 2000 land cover (lc2000) are significant predictors (p < 0.05); odds ratios are sketched below
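
Because the glm coefficients are on the log-odds scale, exponentiating them makes the effect sizes easier to read; a sketch (my addition), assuming parsnip's tidy() forwards exponentiate to broom's tidy.glm():

# odds ratios for the significant glm terms
# (exponentiate is assumed to pass through parsnip's tidy() to broom)
glm_fit %>%
  extract_fit_parsnip() %>%
  tidy(exponentiate = TRUE) %>%
  filter(p.value < 0.05)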

glmnet

glmnet_fit %>%
  extract_fit_parsnip() 
## parsnip model object
## 
## Fit time:  31ms 
## 
## Call:  glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial") 
## 
##    Df  %Dev   Lambda
## 1   0  0.00 0.059110
## 2   1  1.72 0.053860
## 3   1  3.24 0.049070
## 4   1  4.58 0.044710
## 5   1  5.76 0.040740
## 6   1  6.80 0.037120
## 7   1  7.72 0.033820
## 8   1  8.52 0.030820
## 9   1  9.23 0.028080
## 10  1  9.85 0.025590
## 11  1 10.39 0.023310
## 12  1 10.86 0.021240
## 13  2 11.29 0.019360
## 14  2 11.80 0.017640
## 15  2 12.25 0.016070
## 16  2 12.64 0.014640
## 17  2 12.97 0.013340
## 18  2 13.27 0.012160
## 19  2 13.52 0.011080
## 20  2 13.74 0.010090
## 21  2 13.92 0.009195
## 22  2 14.08 0.008378
## 23  2 14.22 0.007634
## 24  2 14.34 0.006956
## 25  2 14.44 0.006338
## 26  2 14.52 0.005775
## 27  2 14.60 0.005262
## 28  3 14.66 0.004794
## 29  3 14.71 0.004369
## 30  3 14.75 0.003980
## 31  3 14.79 0.003627
## 32  3 14.82 0.003305
## 33  3 14.85 0.003011
## 34  3 14.87 0.002744
## 35  3 14.89 0.002500
## 36  3 14.91 0.002278
## 37  3 14.92 0.002075
## 38  3 14.93 0.001891
## 39  3 14.94 0.001723
## 40  3 14.95 0.001570
## 41  3 14.95 0.001431
## 42  5 14.96 0.001303
## 43  5 14.98 0.001188
## 44  5 14.99 0.001082
## 45  6 15.00 0.000986
## 46  6 15.02 0.000898
## 47  6 15.04 0.000819
## 48  6 15.06 0.000746
## 49  6 15.07 0.000680
## 50  6 15.08 0.000619
## 51  6 15.09 0.000564
## 52  6 15.10 0.000514
## 53  7 15.11 0.000468
## 54  7 15.12 0.000427
## 55  7 15.12 0.000389
## 56  7 15.13 0.000354
## 57  7 15.13 0.000323
## 58  7 15.14 0.000294
## 59  7 15.14 0.000268
## 60  7 15.14 0.000244
## 61  7 15.14 0.000222
## 62  7 15.15 0.000203
## 63  7 15.15 0.000185
## 64  7 15.15 0.000168
## 65  7 15.15 0.000153

random forest (ranger)

rf_fit %>%
  extract_fit_parsnip() %>%
  vip()

xgboost

final_xgb %>%
  fit(data = d_train) %>%
  extract_fit_parsnip() %>%
  vip()
## [15:45:36] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.