data(cells, package = "modeldata")
cells <- cells %>% as_tibble()
cells
## # A tibble: 2,019 x 58
## case class angle_c~1 area_~2 avg_i~3 avg_i~4 avg_i~5 avg_i~6 conve~7 conve~8
## <fct> <fct> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Test PS 143. 185 15.7 4.95 9.55 2.21 1.12 0.920
## 2 Train PS 134. 819 31.9 207. 69.9 164. 1.26 0.797
## 3 Train WS 107. 431 28.0 116. 63.9 107. 1.05 0.935
## 4 Train PS 69.2 298 19.5 102. 28.2 31.0 1.20 0.866
## 5 Test PS 2.89 285 24.3 112. 20.5 40.6 1.11 0.957
## 6 Test WS 40.7 172 326. 654. 129. 347. 1.01 0.993
## 7 Test WS 174. 177 260. 596. 124. 273. 1.01 0.984
## 8 Test PS 180. 251 18.3 5.73 17.2 1.55 1.20 0.831
## 9 Test WS 18.9 495 16.1 89.5 13.7 51.4 1.19 0.822
## 10 Test WS 153. 384 17.7 89.9 20.4 63.1 1.16 0.865
## # ... with 2,009 more rows, 48 more variables: diff_inten_density_ch_1 <dbl>,
## # diff_inten_density_ch_3 <dbl>, diff_inten_density_ch_4 <dbl>,
## # entropy_inten_ch_1 <dbl>, entropy_inten_ch_3 <dbl>,
## # entropy_inten_ch_4 <dbl>, eq_circ_diam_ch_1 <dbl>,
## # eq_ellipse_lwr_ch_1 <dbl>, eq_ellipse_oblate_vol_ch_1 <dbl>,
## # eq_ellipse_prolate_vol_ch_1 <dbl>, eq_sphere_area_ch_1 <dbl>,
## # eq_sphere_vol_ch_1 <dbl>, fiber_align_2_ch_3 <dbl>, ...
We now want to split the data into training and test sets. The data already carries a predefined split in the case column, but we will create our own split instead: initial_split() holds out 25% of the rows by default, and strata = class keeps the class proportions similar in both sets.
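For reference, the predefined split could be recovered simply by filtering on case; a minimal sketch (not used in what follows), assuming dplyr is loaded, which tidymodels attaches:
# not used below: recover the predefined split from the case column
cell_train_orig <- cells %>% filter(case == "Train") %>% select(-case)
cell_test_orig <- cells %>% filter(case == "Test") %>% select(-case)
Here is our own stratified split with rsample: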
set.seed(1234)
cell_split <- initial_split(
cells %>% select(-case),
strata = class
)
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
# training set proportions by class
cell_train %>%
count(class) %>%
mutate(
prop = n / sum(n)
)
## # A tibble: 2 x 3
## class n prop
## <fct> <int> <dbl>
## 1 PS 975 0.644
## 2 WS 539 0.356
# test set proportions by class
cell_test %>%
count(class) %>%
mutate(
prop = n / sum(n)
)
## # A tibble: 2 x 3
## class n prop
## <fct> <int> <dbl>
## 1 PS 325 0.644
## 2 WS 180 0.356
We first fit a random forest model for this classification task, using the ranger engine with 1,000 trees.
rf_mod <- rand_forest(trees = 1000) %>%
set_engine("ranger") %>%
set_mode("classification")
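Note that rf_mod is only a model specification; nothing has been fit yet. If you want to see the ranger call it maps to, parsnip's translate() prints the templated call (an optional check):
rf_mod %>% translate()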
set.seed(234)
rf_fit <- rf_mod %>%
fit(
class ~ .,
data = cell_train
)
rf_fit %>% print()
## parsnip model object
##
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, num.trees = ~1000, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 1000
## Sample size: 1514
## Number of independent variables: 56
## Mtry: 7
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1257638
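If you want to inspect the underlying ranger object directly, it can be pulled out of the parsnip fit; a minimal sketch, assuming a recent parsnip version that provides extract_fit_engine():
# the raw ranger object inside the parsnip wrapper
rf_fit %>% extract_fit_engine()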
rf_training_pred <- predict(rf_fit, cell_train) %>%
bind_cols(predict(rf_fit, cell_train, type = "prob")) %>%
# Add the true outcome data back in
bind_cols(cell_train %>% select(class))
rf_training_pred %>% glimpse()
## Rows: 1,514
## Columns: 4
## $ .pred_class <fct> PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS~
## $ .pred_PS <dbl> 0.8843575, 0.7394246, 0.9582540, 0.9649690, 0.8761579, 0.9~
## $ .pred_WS <dbl> 0.115642460, 0.260575397, 0.041746032, 0.035030952, 0.1238~
## $ class <fct> PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS, PS~
rf_training_pred %>% roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 1.00
rf_training_pred %>% accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.990
Those near-perfect numbers are resubstitution estimates: the model is scored on the same rows it was trained on, so they are over-optimistic. The held-out test set gives a more honest picture, so we calculate out-of-sample predictions on the test data.
rf_testing_pred <- predict(rf_fit, cell_test) %>%
bind_cols(predict(rf_fit, cell_test, type = "prob")) %>%
bind_cols(cell_test %>% select(class))
rf_testing_pred %>% glimpse()
## Rows: 505
## Columns: 4
## $ .pred_class <fct> WS, PS, WS, PS, PS, WS, PS, WS, WS, PS, WS, PS, PS, PS, PS~
## $ .pred_PS <dbl> 0.42694524, 0.86494246, 0.15610159, 0.85963770, 0.74186548~
## $ .pred_WS <dbl> 0.57305476, 0.13505754, 0.84389841, 0.14036230, 0.25813452~
## $ class <fct> WS, PS, WS, PS, PS, PS, PS, WS, WS, WS, WS, PS, PS, PS, PS~
rf_testing_pred %>% roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.923
rf_testing_pred %>% accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.850
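Other yardstick helpers work on the same prediction tibble; for example, a metric set computes several metrics in one pass, and roc_curve() with autoplot() draws the ROC curve. A minimal sketch, with cell_metrics as an illustrative name and assuming ggplot2 is installed for the plot:
# several metrics in one call
cell_metrics <- metric_set(accuracy, roc_auc)
rf_testing_pred %>%
  cell_metrics(truth = class, .pred_PS, estimate = .pred_class)
# ROC curve for the test set
rf_testing_pred %>%
  roc_curve(truth = class, .pred_PS) %>%
  autoplot()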
set.seed(345)
folds <- vfold_cv(cell_train, v = 10)
folds %>% glimpse()
## Rows: 10
## Columns: 2
## $ splits <list> [<vfold_split[1362 x 152 x 1514 x 57]>], [<vfold_split[1362 x ~
## $ id <chr> "Fold01", "Fold02", "Fold03", "Fold04", "Fold05", "Fold06", "Fo~
rf_wf <- workflow() %>%
add_model(rf_mod) %>%
add_formula(class ~ .)
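As an aside, a workflow can hold a recipe instead of a formula; a minimal sketch of an equivalent setup, assuming the recipes package is loaded (the names cell_rec and rf_wf_rec are illustrative):
# equivalent workflow built from a (trivial) recipe
cell_rec <- recipe(class ~ ., data = cell_train)
rf_wf_rec <- workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(cell_rec)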
set.seed(456)
rf_fit_rs <- rf_wf %>%
fit_resamples(folds)
rf_fit_rs %>% print()
## # Resampling results
## # 10-fold cross-validation
## # A tibble: 10 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [1362/152]> Fold01 <tibble [2 x 4]> <tibble [0 x 3]>
## 2 <split [1362/152]> Fold02 <tibble [2 x 4]> <tibble [0 x 3]>
## 3 <split [1362/152]> Fold03 <tibble [2 x 4]> <tibble [0 x 3]>
## 4 <split [1362/152]> Fold04 <tibble [2 x 4]> <tibble [0 x 3]>
## 5 <split [1363/151]> Fold05 <tibble [2 x 4]> <tibble [0 x 3]>
## 6 <split [1363/151]> Fold06 <tibble [2 x 4]> <tibble [0 x 3]>
## 7 <split [1363/151]> Fold07 <tibble [2 x 4]> <tibble [0 x 3]>
## 8 <split [1363/151]> Fold08 <tibble [2 x 4]> <tibble [0 x 3]>
## 9 <split [1363/151]> Fold09 <tibble [2 x 4]> <tibble [0 x 3]>
## 10 <split [1363/151]> Fold10 <tibble [2 x 4]> <tibble [0 x 3]>
collect_metrics(rf_fit_rs)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.820 10 0.0102 Preprocessor1_Model1
## 2 roc_auc binary 0.896 10 0.00679 Preprocessor1_Model1
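collect_metrics() returns summarized performance across the folds. To keep the per-fold predictions as well (for example, to plot an ROC curve per resample), pass a control object when resampling; a minimal sketch, with rf_fit_rs_pred as an illustrative name:
set.seed(456)
rf_fit_rs_pred <- rf_wf %>%
  fit_resamples(
    folds,
    control = control_resamples(save_pred = TRUE)
  )
# one row per held-out observation, per fold
collect_predictions(rf_fit_rs_pred)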
For comparison, here are the single test set results again. They are in the same ballpark as the resampling estimates (ROC AUC 0.896, accuracy 0.820), which is the point of resampling: it gives a realistic estimate of performance without touching the test set.
rf_testing_pred %>% roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.923
rf_testing_pred %>% accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.850