data(cells, package = "modeldata")
cells <- cells %>% as_tibble()
cells
## # A tibble: 2,019 x 58
## case class angle_c~1 area_~2 avg_i~3 avg_i~4 avg_i~5 avg_i~6 conve~7 conve~8
## <fct> <fct> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Test PS 143. 185 15.7 4.95 9.55 2.21 1.12 0.920
## 2 Train PS 134. 819 31.9 207. 69.9 164. 1.26 0.797
## 3 Train WS 107. 431 28.0 116. 63.9 107. 1.05 0.935
## 4 Train PS 69.2 298 19.5 102. 28.2 31.0 1.20 0.866
## 5 Test PS 2.89 285 24.3 112. 20.5 40.6 1.11 0.957
## 6 Test WS 40.7 172 326. 654. 129. 347. 1.01 0.993
## 7 Test WS 174. 177 260. 596. 124. 273. 1.01 0.984
## 8 Test PS 180. 251 18.3 5.73 17.2 1.55 1.20 0.831
## 9 Test WS 18.9 495 16.1 89.5 13.7 51.4 1.19 0.822
## 10 Test WS 153. 384 17.7 89.9 20.4 63.1 1.16 0.865
## # ... with 2,009 more rows, 48 more variables: diff_inten_density_ch_1 <dbl>,
## # diff_inten_density_ch_3 <dbl>, diff_inten_density_ch_4 <dbl>,
## # entropy_inten_ch_1 <dbl>, entropy_inten_ch_3 <dbl>,
## # entropy_inten_ch_4 <dbl>, eq_circ_diam_ch_1 <dbl>,
## # eq_ellipse_lwr_ch_1 <dbl>, eq_ellipse_oblate_vol_ch_1 <dbl>,
## # eq_ellipse_prolate_vol_ch_1 <dbl>, eq_sphere_area_ch_1 <dbl>,
## # eq_sphere_vol_ch_1 <dbl>, fiber_align_2_ch_3 <dbl>, ...
We now want to split the data into training and test sets. The data already carries a predefined split in the case column, but we will create our own split instead: initial_split() holds out 25% of the rows by default, and strata = class keeps the class proportions similar in both sets.
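For reference, the predefined split could be recovered simply by filtering on case; a minimal sketch (not used in what follows), assuming dplyr is loaded, which tidymodels attaches:
# not used below: recover the predefined split from the case column
cell_train_orig <- cells %>% filter(case == "Train") %>% select(-case)
cell_test_orig <- cells %>% filter(case == "Test") %>% select(-case)
Here is our own stratified split with rsample: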
set.seed(1234)
cell_split <- initial_split(
cells %>% select(-case),
strata = class
)
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
# training set proportions by class
cell_train %>%
count(class) %>%
mutate(
prop = n / sum(n)
)
## # A tibble: 2 x 3
## class n prop
## <fct> <int> <dbl>
## 1 PS 975 0.644
## 2 WS 539 0.356
# test set proportions by class
cell_test %>%
count(class) %>%
mutate(
prop = n / sum(n)
)
## # A tibble: 2 x 3
## class n prop
## <fct> <int> <dbl>
## 1 PS 325 0.644
## 2 WS 180 0.356
We first fit a random forest model for this classification task, using the ranger engine with 1,000 trees.
rf_mod <- rand_forest(trees = 1000) %>%
set_engine("ranger") %>%
set_mode("classification")
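Note that rf_mod is only a model specification; nothing has been fit yet. If you want to see the ranger call it maps to, parsnip's translate() prints the templated call (an optional check):
rf_mod %>% translate()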
set.seed(234)
rf_fit <- rf_mod %>%
fit(
class ~ .,
data = cell_train
)
rf_fit %>% print()
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~1000, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 1000
## Sample size: 1514
## Number of independent variables: 56
## Mtry: 7
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1257638
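If you want to inspect the underlying ranger object directly, it can be pulled out of the parsnip fit; a minimal sketch, assuming a recent parsnip version that provides extract_fit_engine():
# the raw ranger object inside the parsnip wrapper
rf_fit %>% extract_fit_engine()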
rf_training_pred <- predict(rf_fit, cell_train) %>%
bind_cols(predict(rf_fit, cell_train, type = "prob")) %>%
# Add the true outcome data back in
bind_cols(cell_train %>% select(class))
rf_training_pred %>% glimpse()
## Rows: 1,514
## Columns: 4
## $ .pred_class <fct> PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS~
## $ .pred_PS <dbl> 0.8843575, 0.7394246, 0.9582540, 0.9649690, 0.8761579, 0.9~
## $ .pred_WS <dbl> 0.115642460, 0.260575397, 0.041746032, 0.035030952, 0.1238~
## $ class <fct> PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS~
rf_training_pred %>% roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 1.00
rf_training_pred %>% accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.990
Those near-perfect numbers are resubstitution estimates: the model is scored on the same rows it was trained on, so they are over-optimistic. The held-out test set gives a more honest picture, so we calculate out-of-sample predictions on the test data.
rf_testing_pred <- predict(rf_fit, cell_test) %>%
bind_cols(predict(rf_fit, cell_test, type = "prob")) %>%
bind_cols(cell_test %>% select(class))
rf_testing_pred %>% glimpse()
## Rows: 505
## Columns: 4
## $ .pred_class <fct> WS, PS, WS, PS, PS, WS, PS, WS, WS, PS, WS, PS, PS, PS, PS~
## $ .pred_PS <dbl> 0.42694524, 0.86494246, 0.15610159, 0.85963770, 0.74186548~
## $ .pred_WS <dbl> 0.57305476, 0.13505754, 0.84389841, 0.14036230, 0.25813452~
## $ class <fct> WS, PS, WS, PS, PS, PS, PS, WS, WS, WS, WS, PS, PS, PS, PS~
rf_testing_pred %>% roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.923
rf_testing_pred %>% accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.850
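Other yardstick helpers work on the same prediction tibble; for example, a metric set computes several metrics in one pass, and roc_curve() with autoplot() draws the ROC curve. A minimal sketch, with cell_metrics as an illustrative name and assuming ggplot2 is installed for the plot:
# several metrics in one call
cell_metrics <- metric_set(accuracy, roc_auc)
rf_testing_pred %>%
  cell_metrics(truth = class, .pred_PS, estimate = .pred_class)
# ROC curve for the test set
rf_testing_pred %>%
  roc_curve(truth = class, .pred_PS) %>%
  autoplot()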
set.seed(345)
folds <- vfold_cv(cell_train, v = 10)
folds %>% glimpse()
## Rows: 10
## Columns: 2
## $ splits <list> [<vfold_split[1362 x 152 x 1514 x 57]>], [<vfold_split[1362 x ~
## $ id <chr> "Fold01", "Fold02", "Fold03", "Fold04", "Fold05", "Fold06", "Fo~
rf_wf <- workflow() %>%
add_model(rf_mod) %>%
add_formula(class ~ .)
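As an aside, a workflow can hold a recipe instead of a formula; a minimal sketch of an equivalent setup, assuming the recipes package is loaded (the names cell_rec and rf_wf_rec are illustrative):
# equivalent workflow built from a (trivial) recipe
cell_rec <- recipe(class ~ ., data = cell_train)
rf_wf_rec <- workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(cell_rec)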
set.seed(456)
rf_fit_rs <- rf_wf %>%
fit_resamples(folds)
rf_fit_rs %>% print()
## # Resampling results
## # 10-fold cross-validation
## # A tibble: 10 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [1362/152]> Fold01 <tibble [2 x 4]> <tibble [0 x 3]>
## 2 <split [1362/152]> Fold02 <tibble [2 x 4]> <tibble [0 x 3]>
## 3 <split [1362/152]> Fold03 <tibble [2 x 4]> <tibble [0 x 3]>
## 4 <split [1362/152]> Fold04 <tibble [2 x 4]> <tibble [0 x 3]>
## 5 <split [1363/151]> Fold05 <tibble [2 x 4]> <tibble [0 x 3]>
## 6 <split [1363/151]> Fold06 <tibble [2 x 4]> <tibble [0 x 3]>
## 7 <split [1363/151]> Fold07 <tibble [2 x 4]> <tibble [0 x 3]>
## 8 <split [1363/151]> Fold08 <tibble [2 x 4]> <tibble [0 x 3]>
## 9 <split [1363/151]> Fold09 <tibble [2 x 4]> <tibble [0 x 3]>
## 10 <split [1363/151]> Fold10 <tibble [2 x 4]> <tibble [0 x 3]>
collect_metrics(rf_fit_rs)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.820 10 0.0102 Preprocessor1_Model1
## 2 roc_auc binary 0.896 10 0.00679 Preprocessor1_Model1
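collect_metrics() returns summarized performance across the folds. To keep the per-fold predictions as well (for example, to plot an ROC curve per resample), pass a control object when resampling; a minimal sketch, with rf_fit_rs_pred as an illustrative name:
set.seed(456)
rf_fit_rs_pred <- rf_wf %>%
  fit_resamples(
    folds,
    control = control_resamples(save_pred = TRUE)
  )
# one row per held-out observation, per fold
collect_predictions(rf_fit_rs_pred)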
For comparison, here are the single test set results again. They are in the same ballpark as the resampling estimates (ROC AUC 0.896, accuracy 0.820), which is the point of resampling: it gives a realistic estimate of performance without touching the test set.
rf_testing_pred %>% roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.923
rf_testing_pred %>% accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.850