library(mlbench)
data("PimaIndiansDiabetes2")
# PimaIndiansDiabetes2 is the corrected version of the dataset; the original
# PimaIndiansDiabetes had some technical errors (physically impossible zero values)
# inspect the structure of the data
library(dplyr)
glimpse(PimaIndiansDiabetes2)
## Rows: 768
## Columns: 9
## $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1~
## $ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,~
## $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, 72, N~
## $ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, 19, N~
## $ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 846, 17~
## $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA, 37.~
## $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158~
## $ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3~
## $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n~
library(DataExplorer)
plot_missing(PimaIndiansDiabetes2)
# several predictors have missing values, while the target variable, diabetes,
# is complete. this raises the question of whether to impute the missing values
# beforehand or as part of the modelling pipeline
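# a quick numeric check of the missingness per column (base R, in addition to the plot):
colSums(is.na(PimaIndiansDiabetes2))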
# is the dataset imbalanced? our target variable is diabetes
table(PimaIndiansDiabetes2$diabetes)
##
## neg pos
## 500 268
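# roughly 65% of the observations are negative and 35% positive, so the classes
# are moderately imbalanced; a quick way to see the proportions:
prop.table(table(PimaIndiansDiabetes2$diabetes))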
# the first step is to split the data using the rsample package
# set a seed so the split is reproducible
set.seed(1234)
library(rsample)
diabetes_split <- initial_split(PimaIndiansDiabetes2, strata = diabetes, prop = 0.75)
diabetes_train <- training(diabetes_split)
diabetes_test <- testing(diabetes_split)
diabetes_split
## <Analysis/Assess/Total>
## <576/192/768>
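# as an optional sanity check, the stratified split should preserve the class
# balance in both partitions (dplyr is already loaded):
diabetes_train %>% count(diabetes) %>% mutate(prop = n / sum(n))
diabetes_test %>% count(diabetes) %>% mutate(prop = n / sum(n))
# create 10-fold cross-validation folds on the training set, again stratified on the outcome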
set.seed(2345)
diabetes_cv <- vfold_cv(diabetes_train, v = 10, strata = diabetes)
library(tidymodels)
library(tidyverse)
diabetes_recipe <- diabetes_train %>%
  recipe(diabetes ~ .) %>%
  # impute missing predictor values with k-nearest neighbours
  step_impute_knn(all_predictors()) %>%
  # drop any zero-variance predictors
  step_zv(all_predictors()) %>%
  # centre and scale all predictors
  step_normalize(all_predictors()) %>%
  # drop predictors that are highly correlated with the others
  step_corr(all_predictors(), threshold = 0.75)
diabetes_recipe
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 8
##
## Operations:
##
## K-nearest neighbor imputation for all_predictors()
## Zero variance filter on all_predictors()
## Centering and scaling for all_predictors()
## Correlation filter on all_predictors()
# preview the preprocessed training set produced by the recipe
diabetes_recipe %>% prep(diabetes_train) %>% juice()
## # A tibble: 576 x 9
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 -0.851 -1.08 -0.537 -0.622 -0.637 -0.619 -0.930 -1.04 neg
## 2 0.343 -0.189 0.143 -0.394 -0.566 -0.985 -0.826 -0.276 neg
## 3 1.84 -0.222 0.00676 0.333 0.700 0.436 -1.03 -0.361 neg
## 4 1.84 0.571 0.652 0.167 1.05 -0.765 2.96 2.03 neg
## 5 -0.851 -0.816 -0.537 -1.45 -0.162 -1.34 0.0468 -0.959 neg
## 6 0.343 -0.156 1.67 0.250 -0.523 0.260 -0.411 0.408 neg
## 7 0.343 -0.420 0.228 -0.311 0.917 0.539 0.227 2.29 neg
## 8 -0.254 -1.11 -1.22 -1.87 -1.05 -1.10 -0.624 -0.959 neg
## 9 0.642 -0.981 1.67 -1.41 -1.09 -1.82 -0.865 -0.447 neg
## 10 1.84 0.00923 0.483 0.208 0.312 -0.692 0.123 1.01 neg
## # ... with 566 more rows
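# the trained recipe can also be applied to new data with bake(); the workflow
# handles this automatically later on, so the line below is only an illustrative sketch:
diabetes_recipe %>% prep(diabetes_train) %>% bake(new_data = diabetes_test)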
# setting up the model
rf_model <-
  # specify that the model is a random forest
  rand_forest() %>%
  # specify that the `mtry` and `trees` parameters will be tuned
  set_args(mtry = tune(), trees = tune()) %>%
  # select the engine/package that underlies the model
  set_engine("ranger", importance = "impurity") %>%
  # choose the binary classification mode (rather than regression)
  set_mode("classification")
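# as a quick optional sketch, translate() shows the ranger::ranger() call that
# parsnip will build from this specification:
rf_model %>% translate()
# bundle the recipe and the model specification into a single workflow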
rf_workflow <- workflow() %>%
  add_recipe(diabetes_recipe) %>%
  add_model(rf_model)
rf_grid <- expand.grid(mtry = 1:8,
                       trees = c(750, 1000, 1500, 2000))
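# the grid covers 8 x 4 = 32 parameter combinations; with two metrics this gives
# the 64 metric rows per fold shown below. quick check:
nrow(rf_grid)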
rf_tune_results <- rf_workflow %>%
  tune_grid(resamples = diabetes_cv,                # CV object
            grid = rf_grid,                         # grid of values to try
            metrics = metric_set(accuracy, roc_auc) # metrics we care about
  )
rf_tune_results
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [517/59]> Fold01 <tibble [64 x 6]> <tibble [0 x 1]>
## 2 <split [518/58]> Fold02 <tibble [64 x 6]> <tibble [0 x 1]>
## 3 <split [518/58]> Fold03 <tibble [64 x 6]> <tibble [0 x 1]>
## 4 <split [518/58]> Fold04 <tibble [64 x 6]> <tibble [0 x 1]>
## 5 <split [518/58]> Fold05 <tibble [64 x 6]> <tibble [0 x 1]>
## 6 <split [519/57]> Fold06 <tibble [64 x 6]> <tibble [0 x 1]>
## 7 <split [519/57]> Fold07 <tibble [64 x 6]> <tibble [0 x 1]>
## 8 <split [519/57]> Fold08 <tibble [64 x 6]> <tibble [0 x 1]>
## 9 <split [519/57]> Fold09 <tibble [64 x 6]> <tibble [0 x 1]>
## 10 <split [519/57]> Fold10 <tibble [64 x 6]> <tibble [0 x 1]>
# collect the cross-validation metrics for every parameter combination with collect_metrics()
rf_tune_results %>% collect_metrics()
## # A tibble: 64 x 8
## mtry trees .metric .estimator mean n std_err .config
## <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 750 accuracy binary 0.754 10 0.0163 Preprocessor1_Model01
## 2 1 750 roc_auc binary 0.820 10 0.0221 Preprocessor1_Model01
## 3 2 750 accuracy binary 0.754 10 0.0197 Preprocessor1_Model02
## 4 2 750 roc_auc binary 0.823 10 0.0222 Preprocessor1_Model02
## 5 3 750 accuracy binary 0.759 10 0.0201 Preprocessor1_Model03
## 6 3 750 roc_auc binary 0.825 10 0.0223 Preprocessor1_Model03
## 7 4 750 accuracy binary 0.762 10 0.0190 Preprocessor1_Model04
## 8 4 750 roc_auc binary 0.823 10 0.0230 Preprocessor1_Model04
## 9 5 750 accuracy binary 0.771 10 0.0192 Preprocessor1_Model05
## 10 5 750 roc_auc binary 0.827 10 0.0220 Preprocessor1_Model05
## # ... with 54 more rows
# select the best parameter combination according to ROC AUC with select_best()
rf_tune_results %>% select_best(metric = "roc_auc")
## # A tibble: 1 x 3
## mtry trees .config
## <int> <dbl> <chr>
## 1 7 1000 Preprocessor1_Model15
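# show_best() lists the top candidates instead of only the single best one (optional sketch):
rf_tune_results %>% show_best(metric = "roc_auc", n = 5)
# plot AUC against each tuned parameter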
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, trees, mtry) %>%
  pivot_longer(trees:mtry, values_to = "value", names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(trees = factor(trees)) %>%
  ggplot(aes(mtry, mean, color = trees)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  labs(y = "AUC")
# let's also find the best parameters according to accuracy and plot the results
rf_tune_results %>% select_best(metric = "accuracy")
## # A tibble: 1 x 3
## mtry trees .config
## <int> <dbl> <chr>
## 1 5 750 Preprocessor1_Model05
# let's plot the results
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, mtry, trees) %>%
  pivot_longer(mtry:trees, values_to = "value", names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
# let's also look at accuracy across mtry for each value of trees
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  mutate(trees = factor(trees)) %>%
  ggplot(aes(mtry, mean, color = trees)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  labs(y = "Accuracy")
# let's finalize the workflow with the parameters selected by accuracy
param_final <- rf_tune_results %>%
  select_best(metric = "accuracy")
param_final
## # A tibble: 1 x 3
## mtry trees .config
## <int> <dbl> <chr>
## 1 5 750 Preprocessor1_Model05
rf_workflow <- rf_workflow %>%
  finalize_workflow(param_final)
rf_fit <- rf_workflow %>%
  # fit on the training set and evaluate on the test set
  last_fit(diabetes_split)
rf_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 x 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [576/192]> train/test split <tibble [~ <tibble~ <tibble [192~ <workflo~
test_performance <- rf_fit %>% collect_metrics()
test_performance
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.755 Preprocessor1_Model1
## 2 roc_auc binary 0.840 Preprocessor1_Model1
# generate predictions from the test set
test_predictions <- rf_fit %>% collect_predictions()
test_predictions
## # A tibble: 192 x 7
## id .pred_neg .pred_pos .row .pred_class diabetes .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.921 0.0793 2 neg neg Preprocessor~
## 2 train/test split 0.714 0.286 10 neg pos Preprocessor~
## 3 train/test split 0.769 0.231 11 neg neg Preprocessor~
## 4 train/test split 0.531 0.469 14 neg pos Preprocessor~
## 5 train/test split 0.282 0.718 15 pos pos Preprocessor~
## 6 train/test split 0.831 0.169 16 neg pos Preprocessor~
## 7 train/test split 0.726 0.274 19 neg neg Preprocessor~
## 8 train/test split 0.703 0.297 21 neg neg Preprocessor~
## 9 train/test split 0.571 0.429 22 neg neg Preprocessor~
## 10 train/test split 0.0235 0.977 23 pos pos Preprocessor~
## # ... with 182 more rows
# generate a confusion matrix
test_predictions %>%
  conf_mat(truth = diabetes, estimate = .pred_class)
## Truth
## Prediction neg pos
## neg 109 31
## pos 16 36
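# yardstick's summary() method on a confusion matrix returns further metrics
# such as sensitivity and specificity (optional sketch):
test_predictions %>%
  conf_mat(truth = diabetes, estimate = .pred_class) %>%
  summary()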
# density of the predicted probability of "pos", split by the true class
test_predictions %>%
  ggplot() +
  geom_density(aes(x = .pred_pos, fill = diabetes), alpha = 0.5)
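# an ROC curve on the test predictions is another useful view; roc_curve() comes
# from yardstick, and event_level = "second" treats "pos" as the event of interest.
# this is an optional sketch:
test_predictions %>%
  roc_curve(truth = diabetes, .pred_pos, event_level = "second") %>%
  autoplot()
# finally, fit the finalized workflow on the full dataset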
final_model <- fit(rf_workflow, PimaIndiansDiabetes2)
final_model
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
##
## * step_impute_knn()
## * step_zv()
## * step_normalize()
## * step_corr()
##
## -- Model -----------------------------------------------------------------------
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L, x), num.trees = ~750, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 750
## Sample size: 768
## Number of independent variables: 8
## Mtry: 5
## Target node size: 10
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.158931
# let's look at the variable importances
ranger_obj <- pull_workflow_fit(final_model)$fit
ranger_obj
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L, x), num.trees = ~750, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 750
## Sample size: 768
## Number of independent variables: 8
## Mtry: 5
## Target node size: 10
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.158931
ranger_obj$variable.importance
## pregnant glucose pressure triceps insulin mass pedigree age
## 15.74561 84.75321 17.28561 20.47093 51.56952 43.73359 30.50090 31.97322
library(vip)
library(hrbrthemes)
tibble(var = ranger_obj$variable.importance,
       nam = ranger_obj$variable.importance %>% names()) %>%
  ggplot(aes(var, reorder(nam, var))) +
  geom_bar(stat = "identity",
           alpha = 0.7,
           width = 0.8) +
  theme_ipsum() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_blank())
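# since the vip package is already loaded, a similar importance plot can be
# produced directly from the ranger fit (optional sketch):
vip(ranger_obj)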
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Germany.1252 LC_CTYPE=English_Germany.1252
## [3] LC_MONETARY=English_Germany.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Germany.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] hrbrthemes_0.8.6 vip_0.3.2 ranger_0.13.1 vctrs_0.3.8
## [5] rlang_0.4.11 forcats_0.5.1 stringr_1.4.0 readr_2.0.0
## [9] tidyverse_1.3.1 yardstick_0.0.8 workflowsets_0.1.0 workflows_0.2.3
## [13] tune_0.1.6 tidyr_1.1.3 tibble_3.1.3 recipes_0.1.16
## [17] purrr_0.3.4 parsnip_0.1.7 modeldata_0.1.1 infer_1.0.0
## [21] ggplot2_3.3.5 dials_0.0.9 scales_1.1.1 broom_0.7.9
## [25] tidymodels_0.1.3 rsample_0.1.0 DataExplorer_0.8.2 dplyr_1.0.7
## [29] mlbench_2.1-3
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 ellipsis_0.3.2 class_7.3-19 fs_1.5.0
## [5] rstudioapi_0.13 listenv_0.8.0 furrr_0.2.3 farver_2.1.0
## [9] prodlim_2019.11.13 fansi_0.5.0 lubridate_1.7.10 xml2_1.3.2
## [13] codetools_0.2-18 splines_4.1.1 extrafont_0.17 knitr_1.33
## [17] jsonlite_1.7.2 pROC_1.17.0.1 Rttf2pt1_1.3.9 dbplyr_2.1.1
## [21] compiler_4.1.1 httr_1.4.2 backports_1.2.1 assertthat_0.2.1
## [25] Matrix_1.3-4 cli_3.0.1 htmltools_0.5.1.1 tools_4.1.1
## [29] igraph_1.2.6 gtable_0.3.0 glue_1.4.2 Rcpp_1.0.7
## [33] cellranger_1.1.0 jquerylib_0.1.4 DiceDesign_1.9 extrafontdb_1.0
## [37] iterators_1.0.13 timeDate_3043.102 gower_0.2.2 xfun_0.25
## [41] globals_0.14.0 networkD3_0.4 rvest_1.0.1 lifecycle_1.0.0
## [45] future_1.21.0 MASS_7.3-54 ipred_0.9-11 hms_1.1.0
## [49] parallel_4.1.1 yaml_2.2.1 gridExtra_2.3 gdtools_0.2.3
## [53] sass_0.4.0 rpart_4.1-15 stringi_1.7.3 highr_0.9
## [57] foreach_1.5.1 lhs_1.1.1 hardhat_0.1.6 lava_1.6.9
## [61] systemfonts_1.0.2 pkgconfig_2.0.3 evaluate_0.14 lattice_0.20-44
## [65] htmlwidgets_1.5.3 labeling_0.4.2 tidyselect_1.1.1 parallelly_1.27.0
## [69] plyr_1.8.6 magrittr_2.0.1 R6_2.5.1 generics_0.1.0
## [73] DBI_1.1.1 pillar_1.6.2 haven_2.4.3 withr_2.4.2
## [77] survival_3.2-11 nnet_7.3-16 modelr_0.1.8 crayon_1.4.1
## [81] utf8_1.2.2 tzdb_0.1.2 rmarkdown_2.10 grid_4.1.1
## [85] readxl_1.3.1 data.table_1.14.0 reprex_2.0.1 digest_0.6.27
## [89] GPfit_1.0-8 munsell_0.5.0 bslib_0.2.5.1