rm(list = ls())

### Packages
library(tidyverse)
library(tidymodels)
library(palmerpenguins)
doParallel::registerDoParallel()
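
### The split, CV folds, and tuning below are stochastic; setting a seed
### (value arbitrary) would make a re-run reproducible.
# set.seed(123)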


### Check data
penguins
## # A tibble: 344 x 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ... with 334 more rows, and 2 more variables: sex <fct>, year <int>
penguins %>% distinct(species)
## # A tibble: 3 x 1
##   species  
##   <fct>    
## 1 Adelie   
## 2 Gentoo   
## 3 Chinstrap
levels(penguins$species)
## [1] "Adelie"    "Chinstrap" "Gentoo"
### Data for modelling
penguins_df <- penguins %>% drop_na() %>% select(-year)

GGally::ggpairs(penguins_df, aes(color = species))
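
### Quick check of class balance; the three species are not evenly sized,
### which is why the split and CV folds below are stratified by species
### (a small check, output not shown here).
penguins_df %>% count(species)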

### Split into train and test
penguin_split <- initial_split(penguins_df, strata = species)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)
penguin_split
## <Analysis/Assess/Total>
## <251/82/333>
### Define Random Forest Model
rf_spec <- rand_forest(trees = 1000,
                       mtry = tune(),
                       min_n = tune()) %>% 
  set_mode("classification") %>% 
  set_engine("ranger")
rf_spec
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
## 
## Computational engine: ranger
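
### mtry is left as tune() partly because its upper bound depends on how many
### predictors are in the data; dials can pin that range down explicitly, e.g.:
finalize(mtry(), select(penguin_train, -species))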
### Cross-validation folds to tune the random forest
penguin_cv <- vfold_cv(penguins_df, strata = species)
penguin_cv
## #  10-fold cross-validation using stratification 
## # A tibble: 10 x 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [299/34]> Fold01
##  2 <split [299/34]> Fold02
##  3 <split [299/34]> Fold03
##  4 <split [299/34]> Fold04
##  5 <split [299/34]> Fold05
##  6 <split [299/34]> Fold06
##  7 <split [300/33]> Fold07
##  8 <split [300/33]> Fold08
##  9 <split [301/32]> Fold09
## 10 <split [302/31]> Fold10
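
### Each fold keeps an analysis (fit) and an assessment (validate) portion;
### rsample's helpers pull them out, e.g. for the first fold:
dim(analysis(penguin_cv$splits[[1]]))
dim(assessment(penguin_cv$splits[[1]]))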
### Add formula and model together with workflow
tune_wf <- workflow() %>% 
  add_formula(species ~ .) %>% 
  add_model(rf_spec)
tune_wf
## == Workflow ====================================================================
## Preprocessor: Formula
## Model: rand_forest()
## 
## -- Preprocessor ----------------------------------------------------------------
## species ~ .
## 
## -- Model -----------------------------------------------------------------------
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
## 
## Computational engine: ranger
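
### A formula preprocessor is enough here because ranger needs no dummy coding
### or scaling; an equivalent workflow built on a recipe would look like this
### (a sketch with hypothetical object names, not used for the results below):
penguin_rec <- recipe(species ~ ., data = penguin_train)
tune_wf_rec <- workflow() %>%
  add_recipe(penguin_rec) %>%
  add_model(rf_spec)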
### Tune hyperparameters
rf_tune <- tune_grid(
  tune_wf,
  resamples = penguin_cv,
  control = control_grid(save_pred = TRUE),
  grid = 20
)
rf_tune
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 x 5
##    splits           id     .metrics          .notes           .predictions      
##    <list>           <chr>  <list>            <list>           <list>            
##  1 <split [299/34]> Fold01 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
##  2 <split [299/34]> Fold02 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
##  3 <split [299/34]> Fold03 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
##  4 <split [299/34]> Fold04 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
##  5 <split [299/34]> Fold05 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
##  6 <split [299/34]> Fold06 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [680 x 9]>
##  7 <split [300/33]> Fold07 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [660 x 9]>
##  8 <split [300/33]> Fold08 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [660 x 9]>
##  9 <split [301/32]> Fold09 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [640 x 9]>
## 10 <split [302/31]> Fold10 <tibble [40 x 6]> <tibble [0 x 1]> <tibble [620 x 9]>
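
### grid = 20 lets tune_grid() choose 20 candidate parameter combinations
### automatically; an explicit regular grid could be supplied instead, e.g.
### (a sketch with assumed ranges, not used for the results shown):
rf_grid <- grid_regular(
  mtry(range = c(1, 6)),
  min_n(range = c(2, 40)),
  levels = 5
)
# tune_grid(tune_wf, resamples = penguin_cv, grid = rf_grid)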
### Check Grid
rf_tune %>% autoplot()
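
### A manual version of the plot above: mean accuracy against the sampled
### mtry and min_n values (a sketch built from the collected metrics).
rf_tune %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, mtry, min_n) %>%
  pivot_longer(c(mtry, min_n), names_to = "parameter", values_to = "value") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "accuracy")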

### Collect Metrics
rf_tune %>% collect_metrics()
## # A tibble: 40 x 8
##     mtry min_n .metric  .estimator  mean     n  std_err .config              
##    <int> <int> <chr>    <chr>      <dbl> <int>    <dbl> <chr>                
##  1     4     5 accuracy multiclass 0.982    10 0.0100   Preprocessor1_Model01
##  2     4     5 roc_auc  hand_till  0.999    10 0.00102  Preprocessor1_Model01
##  3     3     3 accuracy multiclass 0.991    10 0.00628  Preprocessor1_Model02
##  4     3     3 roc_auc  hand_till  0.999    10 0.000750 Preprocessor1_Model02
##  5     4    15 accuracy multiclass 0.979    10 0.00985  Preprocessor1_Model03
##  6     4    15 roc_auc  hand_till  0.998    10 0.00138  Preprocessor1_Model03
##  7     4    10 accuracy multiclass 0.979    10 0.00985  Preprocessor1_Model04
##  8     4    10 roc_auc  hand_till  0.998    10 0.00116  Preprocessor1_Model04
##  9     3    29 accuracy multiclass 0.982    10 0.00899  Preprocessor1_Model05
## 10     3    29 roc_auc  hand_till  0.998    10 0.00145  Preprocessor1_Model05
## # ... with 30 more rows
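
### show_best() summarises the top candidates without scanning the full table:
rf_tune %>% show_best(metric = "accuracy")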
rf_tune %>% collect_predictions() %>% conf_mat(species, .pred_class)
##            Truth
## Prediction  Adelie Chinstrap Gentoo
##   Adelie      2857        58      0
##   Chinstrap     54      1302     10
##   Gentoo         9         0   2370
### ROC curves for 3 classes
rf_tune %>% 
  collect_predictions() %>% 
  group_by(id) %>% 
  roc_curve(species, .pred_Adelie:.pred_Gentoo) %>% 
  autoplot()

### Finalize Model according to accuracy
rf_final <- finalize_model(
  rf_spec,
  rf_tune %>% select_best(metric = "accuracy")
)
rf_final
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = 1
##   trees = 1000
##   min_n = 17
## 
## Computational engine: ranger
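
### select_best() takes the single highest-accuracy candidate; a more
### conservative alternative is the simplest model within one standard error
### of the best (a sketch, not the model finalized above):
rf_tune %>% select_by_one_std_err(metric = "accuracy", desc(min_n))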
### Fit on the training data, then evaluate once on the held-out test data
final_res <- workflow() %>% 
  add_formula(species ~ .) %>%
  add_model(rf_final) %>%
  last_fit(penguin_split)


### Metrics on Test data
final_res %>% collect_metrics()
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy multiclass         1 Preprocessor1_Model1
## 2 roc_auc  hand_till          1 Preprocessor1_Model1
final_res %>% collect_predictions() %>% conf_mat(species, .pred_class)
##            Truth
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        36         0      0
##   Chinstrap      0        17      0
##   Gentoo         0         0     29
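
### last_fit() also stores the workflow it trained on the training set; in
### current tune versions it can be pulled out directly if that model (rather
### than a refit on all of the data) is what should be kept:
final_res %>% extract_workflow()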
### Fit model on whole data and save for production
final_prod <- workflow() %>% 
  add_formula(species ~ .) %>%
  add_model(rf_final) %>%
  fit(data = penguins_df)


### Final Model for Production
final_prod
## == Workflow [trained] ==========================================================
## Preprocessor: Formula
## Model: rand_forest()
## 
## -- Preprocessor ----------------------------------------------------------------
## species ~ .
## 
## -- Model -----------------------------------------------------------------------
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~1L,      x), num.trees = ~1000, min.node.size = min_rows(~17L, x),      num.threads = 1, verbose = FALSE, seed = sample.int(10^5,          1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  1000 
## Sample size:                      333 
## Number of independent variables:  6 
## Mtry:                             1 
## Target node size:                 17 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.03256562
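
### The trained workflow predicts directly on new data, e.g. class labels and
### class probabilities for the test rows (any tibble with the same predictor
### columns would work):
predict(final_prod, new_data = penguin_test)
predict(final_prod, new_data = penguin_test, type = "prob")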
### Variable importance: refit with permutation importance and plot
library(vip)
rf_varimp_fit <- rf_final %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(species ~ .,
      data = penguins_df)

rf_varimp_fit %>% vip(geom = "col")

### Save the production workflow and the permutation-importance fit
saveRDS(final_prod, "Multiclass_RF_Model.rds")

saveRDS(rf_varimp_fit, "Multiclass_RF_varimp.rds")
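
### In a fresh session the saved workflow can be read back and used for scoring
### (file name matches the saveRDS() call above; shown as a sketch):
# prod_model <- readRDS("Multiclass_RF_Model.rds")
# predict(prod_model, new_data = penguin_test)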