rm(list = ls())
### Packages
require(tidyverse)
require(tidymodels)
require(palmerpenguins)
doParallel::registerDoParallel()
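### Optional: a fixed seed (the value is arbitrary) makes the train/test split,
### CV folds and random forest fits below reproducible
set.seed(123)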
### Check data
penguins
## # A tibble: 344 x 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # ... with 334 more rows, and 2 more variables: sex <fct>, year <int>
penguins %>% select(species) %>% unique()
## # A tibble: 3 x 1
## species
## <fct>
## 1 Adelie
## 2 Gentoo
## 3 Chinstrap
levels(penguins$species)
## [1] "Adelie" "Chinstrap" "Gentoo"
### Data for modelling
penguins_df <- penguins %>% drop_na() %>% select(-year)
GGally::ggpairs(penguins_df, aes(color = species))

### Split into train and test
penguin_split <- initial_split(penguins_df, strata = species)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)
penguin_split
## <Analysis/Assess/Total>
## <251/82/333>
### Define Random Forest Model
rf_spec <- rand_forest(trees = 1000,
                       mtry = tune(),
                       min_n = tune()) %>%
  set_mode("classification") %>%
  set_engine("ranger")
rf_spec
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
##
## Computational engine: ranger
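### Before tuning, the parameter set can be inspected; mtry has a data-dependent
### upper bound that finalize() fills in from the predictors (a sketch using the
### extract_parameter_set_dials() helper from newer tidymodels releases)
rf_spec %>%
  extract_parameter_set_dials() %>%
  finalize(penguin_train %>% select(-species))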
### Cross Validation Sets to tune RF (note: the folds here are built on the full
### cleaned data; building them on penguin_train instead would keep the test set
### strictly held out during tuning)
penguin_cv <- vfold_cv(penguins_df, strata = species)
penguin_cv
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 2
## splits id
## <list> <chr>
## 1 <split [299/34]> Fold01
## 2 <split [299/34]> Fold02
## 3 <split [299/34]> Fold03
## 4 <split [299/34]> Fold04
## 5 <split [299/34]> Fold05
## 6 <split [299/34]> Fold06
## 7 <split [300/33]> Fold07
## 8 <split [300/33]> Fold08
## 9 <split [301/32]> Fold09
## 10 <split [302/31]> Fold10
### Add formula and model together with workflow
tune_wf <- workflow() %>%
  add_formula(species ~ .) %>%
  add_model(rf_spec)
tune_wf
## == Workflow ====================================================================
## Preprocessor: Formula
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------
## species ~ .
##
## -- Model -----------------------------------------------------------------------
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
##
## Computational engine: ranger
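### A recipe could stand in for the formula preprocessor if extra preprocessing
### were wanted; ranger handles factor predictors directly, so the bare formula
### is enough here (alternative sketch only)
penguin_rec <- recipe(species ~ ., data = penguin_train)
tune_wf_rec <- workflow() %>%
  add_recipe(penguin_rec) %>%
  add_model(rf_spec)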
### Tune HyperParameters
rf_tune <- tune_grid(
  tune_wf,
  resamples = penguin_cv,
  control = control_resamples(save_pred = TRUE),
  grid = 20
)
rf_tune
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [299/34]> Fold01 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
## 2 <split [299/34]> Fold02 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
## 3 <split [299/34]> Fold03 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
## 4 <split [299/34]> Fold04 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
## 5 <split [299/34]> Fold05 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
## 6 <split [299/34]> Fold06 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
## 7 <split [300/33]> Fold07 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [660 x 9]>
## 8 <split [300/33]> Fold08 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [660 x 9]>
## 9 <split [301/32]> Fold09 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [640 x 9]>
## 10 <split [302/31]> Fold10 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [620 x 9]>
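### grid = 20 above lets tune_grid() build its own semi-random grid of 20
### candidates; an explicit regular grid could be supplied instead via
### grid = rf_grid (the ranges below are illustrative, not tuned values)
rf_grid <- grid_regular(mtry(range = c(1, 6)),
                        min_n(range = c(2, 40)),
                        levels = 5)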
### Check Grid
rf_tune %>% autoplot()

### Collect Metrics
rf_tune %>% collect_metrics()
## # A tibble: 40 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 4 5 accuracy multiclass 0.982 10 0.0100 Preprocessor1_Model01
## 2 4 5 roc_auc hand_till 0.999 10 0.00102 Preprocessor1_Model01
## 3 3 3 accuracy multiclass 0.991 10 0.00628 Preprocessor1_Model02
## 4 3 3 roc_auc hand_till 0.999 10 0.000750 Preprocessor1_Model02
## 5 4 15 accuracy multiclass 0.979 10 0.00985 Preprocessor1_Model03
## 6 4 15 roc_auc hand_till 0.998 10 0.00138 Preprocessor1_Model03
## 7 4 10 accuracy multiclass 0.979 10 0.00985 Preprocessor1_Model04
## 8 4 10 roc_auc hand_till 0.998 10 0.00116 Preprocessor1_Model04
## 9 3 29 accuracy multiclass 0.982 10 0.00899 Preprocessor1_Model05
## 10 3 29 roc_auc hand_till 0.998 10 0.00145 Preprocessor1_Model05
## # ... with 30 more rows
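### show_best() is a shortcut for the top-ranked candidates on a single metric
rf_tune %>% show_best(metric = "accuracy")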
rf_tune %>% collect_predictions() %>% conf_mat(species, .pred_class)
## Truth
## Prediction Adelie Chinstrap Gentoo
## Adelie 2857 58 0
## Chinstrap 54 1302 10
## Gentoo 9 0 2370
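### summary() on a conf_mat adds per-class statistics (sensitivity, specificity,
### balanced accuracy, ...) to the overall accuracy
rf_tune %>%
  collect_predictions() %>%
  conf_mat(species, .pred_class) %>%
  summary()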
### ROC curves for 3 classes
rf_tune %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(species, .pred_Adelie:.pred_Gentoo) %>%
  autoplot()

### Finalize Model according to accuracy
rf_final <- finalize_model(
  rf_spec,
  rf_tune %>% select_best(metric = "accuracy")
)
rf_final
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = 1
## trees = 1000
## min_n = 17
##
## Computational engine: ranger
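### select_best() takes the single highest-accuracy candidate; a more
### conservative alternative is select_by_one_std_err(), which prefers a simpler
### model (here: larger min_n, the sorting choice is illustrative) within one
### standard error of the best
rf_tune %>% select_by_one_std_err(desc(min_n), metric = "accuracy")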
### Testing Data
final_res <- workflow() %>%
  add_formula(species ~ .) %>%
  add_model(rf_final) %>%
  last_fit(penguin_split)
### Metrics on Test data
final_res %>% collect_metrics()
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy multiclass 1 Preprocessor1_Model1
## 2 roc_auc hand_till 1 Preprocessor1_Model1
final_res %>% collect_predictions() %>% conf_mat(species, .pred_class)
## Truth
## Prediction Adelie Chinstrap Gentoo
## Adelie 36 0 0
## Chinstrap 0 17 0
## Gentoo 0 0 29
### Fit model on whole data and save for Production
final_prod <- workflow() %>%
  add_formula(species ~ .) %>%
  add_model(rf_final) %>%
  fit(penguins_df)
### Final Model for Production
final_prod
## == Workflow [trained] ==========================================================
## Preprocessor: Formula
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------
## species ~ .
##
## -- Model -----------------------------------------------------------------------
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~1L, x), num.trees = ~1000, min.node.size = min_rows(~17L, x), num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 1000
## Sample size: 333
## Number of independent variables: 6
## Mtry: 1
## Target node size: 17
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.03256562
### Variable importance (refit with permutation importance so vip can plot it)
library(vip)
rf_final %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(species ~ .,
      data = penguins_df) %>%
  vip(geom = "col")

saveRDS(final_prod, "Multiclass_RF_Model.rds")
saveRDS(rf_final, "Multiclass_RF_varimp.rds")
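### Reloading the saved workflow later: predict() returns the predicted class,
### and type = "prob" the class probabilities (the held-out test rows stand in
### for new data here)
prod_model <- readRDS("Multiclass_RF_Model.rds")
predict(prod_model, new_data = penguin_test)
predict(prod_model, new_data = penguin_test, type = "prob")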