library(tidyverse); library(mgcv); library(mgcViz); library(janitor); library(tidymodels)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-38. For overview type 'help("mgcv-package")'.
## Loading required package: qgam
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Registered S3 method overwritten by 'mgcViz':
## method from
## +.gg GGally
##
## Attaching package: 'mgcViz'
## The following objects are masked from 'package:stats':
##
## qqline, qqnorm, qqplot
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
## Registered S3 method overwritten by 'tune':
## method from
## required_pkgs.model_spec parsnip
## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.4 ──
## ✓ broom 0.7.10 ✓ rsample 0.1.1
## ✓ dials 0.0.10 ✓ tune 0.1.6
## ✓ infer 1.0.0 ✓ workflows 0.2.4
## ✓ modeldata 0.1.1 ✓ workflowsets 0.1.0
## ✓ parsnip 0.1.7 ✓ yardstick 0.0.9
## ✓ recipes 0.1.17
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x recipes::check() masks qgam::check()
## x nlme::collapse() masks dplyr::collapse()
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
library(skimr); library(ggmap); library(vip)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
d <- readRDS("~/Dropbox/Mac (2)/Desktop/aru-projects/sd_data.rds")
tidymodels_prefer()
glimpse(d)
## Rows: 4,306
## Columns: 11
## $ rID <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ lc1990 <dbl> 21, 3, 4, 21, 4, 4, 4, 3, 4, 3, 4, 3, 20, 3, 4,…
## $ lc2000 <dbl> 7, 3, 3, 7, 3, 4, 3, 3, 4, 3, 4, 1, 7, 3, 4, 1,…
## $ lc2015 <dbl> 10, 1, 4, 10, 4, 4, 3, 3, 4, 3, 4, 4, 10, 3, 4,…
## $ bt <dbl> 603.0000, 1194.0000, 1251.2693, 575.7488, 269.5…
## $ gsw <dbl> 1072.00000, 1147.00000, 1265.49651, 764.46792, …
## $ meantemp <dbl> 93.43246, 95.09166, 94.10497, 91.58929, 97.1954…
## $ diurnal_range <dbl> 72.99537, 72.97844, 74.00000, 75.80238, 62.0000…
## $ min_temp_coldest_month <dbl> 5.4324588, 8.1833136, -3.1581891, 0.5902241, 15…
## $ presence <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ coords <dbl[,3]> <matrix[26 x 3]>
# sdm data preparation
d_1 <- d %>%
  mutate(presence = factor(presence),
         coords1 = data.frame(coords),
         x = coords1[, 2],
         y = coords1[, 3])
d_1 <- d_1 %>%
  select(-c(rID, x, y))
d_1 %>%
  head()
d_split <- initial_split(d_1, strata = presence)
d_train <- training(d_split)
d_test <- testing(d_split)
val_set <- validation_split(d_train,
                            strata = presence,
                            prop = 0.80)
val_set
wt_rec <- recipe(presence ~ bt + gsw + meantemp + diurnal_range + min_temp_coldest_month + lc1990 + lc2000 + lc2015,
                 data = d_train) %>%
  step_log(bt, base = 10) %>%
  step_log(gsw, base = 10)
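It is easy to sanity-check the two log steps before fitting anything; a minimal sketch (prep() and bake() from recipes, with select()/summary() purely for inspection) that should show bt and gsw on the log10 scale:
# sketch: confirm the prepped recipe returns log10-scaled bt and gsw
wt_rec %>%
  prep() %>%
  bake(new_data = d_train) %>%
  select(bt, gsw) %>%
  summary()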
glm_mod <-
logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
lr_mod <-
logistic_reg(penalty = 1) %>%
set_engine("glmnet")
rf_mod <-
rand_forest(mtry = 1, min_n = 8, trees = 1000) %>%
set_engine("ranger", num.threads = 8, importance = "impurity") %>%
set_mode("classification")
wt_cv_folds <- bake(prep(wt_rec),
                    new_data = d_train) %>%
  rsample::vfold_cv(v = 5)
glmw <- workflow() %>%
add_model(glm_mod) %>%
add_recipe(wt_rec)
glm_fit <- fit(glmw, d_train)
glmnetw <- workflow() %>%
add_model(lr_mod) %>%
add_recipe(wt_rec)
glmnet_fit <- fit(glmnetw, d_train)
rfw <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(wt_rec)
rf_fit <- fit(rfw, d_train)
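Before tuning the boosted trees, the validation split created earlier gives a quick comparison of these baseline workflows; a hedged sketch (fit_resamples() and metric_set() are assumed here, and the roc_auc/accuracy choice and *_val object names are illustrative):
# sketch: score the baseline workflows on the single validation resample
cls_metrics <- metric_set(roc_auc, accuracy)
glm_val <- fit_resamples(glmw, resamples = val_set, metrics = cls_metrics)
rf_val <- fit_resamples(rfw, resamples = val_set, metrics = cls_metrics)
collect_metrics(glm_val)
collect_metrics(rf_val)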
## after Julia Silge
xgb_spec <- boost_tree(
trees = 1000,
tree_depth = tune(), min_n = tune(),
loss_reduction = tune(), ## first three: model complexity
sample_size = tune(), mtry = tune(), ## randomness
learn_rate = tune(), ## step size
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_spec
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## loss_reduction = tune()
## sample_size = tune()
##
## Computational engine: xgboost
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), d_train),
  learn_rate(),
  size = 30
)
xgb_grid
xgboost_wf <- workflow() %>%
add_model(xgb_spec) %>%
add_recipe(wt_rec)
set.seed(42)
vb_folds <- vfold_cv(d_train, strata = presence)
doParallel::registerDoParallel()
set.seed(12345)
xgb_res <- tune_grid(
  xgboost_wf,
  resamples = vb_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)
collect_metrics(xgb_res)
xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, mtry:sample_size) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")
show_best(xgb_res, "roc_auc")
best_auc <- select_best(xgb_res, "roc_auc")
best_auc
final_xgb <- finalize_workflow(
  xgboost_wf,
  best_auc
)
final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_log()
## • step_log()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = 3
## trees = 1000
## min_n = 10
## tree_depth = 7
## learn_rate = 0.00100950459024817
## loss_reduction = 6.35134527379776e-09
## sample_size = 0.601558735554572
##
## Computational engine: xgboost
final_xgb %>%
  fit(data = d_train) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point")
## [15:45:32] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
final_res <- last_fit(final_xgb, d_split)
collect_metrics(final_res)
final_res %>%
collect_predictions() %>%
roc_curve(truth = presence, .pred_0) %>%
autoplot()
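A confusion matrix on the same held-out predictions complements the ROC curve; a minimal sketch (conf_mat() from yardstick and the .pred_class column saved by last_fit() are the assumptions here):
# sketch: confusion matrix for the test-set predictions
final_res %>%
  collect_predictions() %>%
  conf_mat(truth = presence, estimate = .pred_class)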
glm_fit %>%
extract_fit_parsnip() %>%
tidy() %>%
filter(p.value < 0.05)
• Blue tit density and land cover in 2000 come out as significant predictors (p < 0.05).
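The same coefficients are often easier to read as odds ratios; a sketch that assumes broom's exponentiate and conf.int arguments pass through tidy():
# sketch: significant GLM terms on the odds-ratio scale
glm_fit %>%
  extract_fit_parsnip() %>%
  tidy(exponentiate = TRUE, conf.int = TRUE) %>%
  filter(p.value < 0.05)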
glmnet_fit %>%
extract_fit_parsnip()
## parsnip model object
##
## Fit time: 31ms
##
## Call: glmnet::glmnet(x = maybe_matrix(x), y = y, family = "binomial")
##
## Df %Dev Lambda
## 1 0 0.00 0.059110
## 2 1 1.72 0.053860
## 3 1 3.24 0.049070
## 4 1 4.58 0.044710
## 5 1 5.76 0.040740
## 6 1 6.80 0.037120
## 7 1 7.72 0.033820
## 8 1 8.52 0.030820
## 9 1 9.23 0.028080
## 10 1 9.85 0.025590
## 11 1 10.39 0.023310
## 12 1 10.86 0.021240
## 13 2 11.29 0.019360
## 14 2 11.80 0.017640
## 15 2 12.25 0.016070
## 16 2 12.64 0.014640
## 17 2 12.97 0.013340
## 18 2 13.27 0.012160
## 19 2 13.52 0.011080
## 20 2 13.74 0.010090
## 21 2 13.92 0.009195
## 22 2 14.08 0.008378
## 23 2 14.22 0.007634
## 24 2 14.34 0.006956
## 25 2 14.44 0.006338
## 26 2 14.52 0.005775
## 27 2 14.60 0.005262
## 28 3 14.66 0.004794
## 29 3 14.71 0.004369
## 30 3 14.75 0.003980
## 31 3 14.79 0.003627
## 32 3 14.82 0.003305
## 33 3 14.85 0.003011
## 34 3 14.87 0.002744
## 35 3 14.89 0.002500
## 36 3 14.91 0.002278
## 37 3 14.92 0.002075
## 38 3 14.93 0.001891
## 39 3 14.94 0.001723
## 40 3 14.95 0.001570
## 41 3 14.95 0.001431
## 42 5 14.96 0.001303
## 43 5 14.98 0.001188
## 44 5 14.99 0.001082
## 45 6 15.00 0.000986
## 46 6 15.02 0.000898
## 47 6 15.04 0.000819
## 48 6 15.06 0.000746
## 49 6 15.07 0.000680
## 50 6 15.08 0.000619
## 51 6 15.09 0.000564
## 52 6 15.10 0.000514
## 53 7 15.11 0.000468
## 54 7 15.12 0.000427
## 55 7 15.12 0.000389
## 56 7 15.13 0.000354
## 57 7 15.13 0.000323
## 58 7 15.14 0.000294
## 59 7 15.14 0.000268
## 60 7 15.14 0.000244
## 61 7 15.14 0.000222
## 62 7 15.15 0.000203
## 63 7 15.15 0.000185
## 64 7 15.15 0.000168
## 65 7 15.15 0.000153
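For completeness, the regularised model can be scored on the test split in the same way as the other models; a hedged sketch (predict() on the fitted workflow plus yardstick's roc_auc(), keeping the .pred_0 / first-level convention used for the ROC curve above):
# sketch: test-set AUC for the glmnet workflow
glmnet_fit %>%
  predict(d_test, type = "prob") %>%
  bind_cols(d_test %>% select(presence)) %>%
  roc_auc(truth = presence, .pred_0)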
rf_fit %>%
extract_fit_parsnip() %>%
vip()
final_xgb %>%
  fit(data = d_train) %>%
  extract_fit_parsnip() %>%
  vip()
## [15:45:36] WARNING: amalgamation/../src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.