library(mlbench)
data("PimaIndiansDiabetes2")
# PimaIndiansDiabetes2 is the corrected version of the dataset; the original
# PimaIndiansDiabetes had some technical errors (physically impossible zero values)
# inspect the structure of the data
library(dplyr)
glimpse(PimaIndiansDiabetes2)
## Rows: 768
## Columns: 9
## $ pregnant <dbl> 6, 1, 8, 1, 0, 5, 3, 10, 2, 8, 4, 10, 10, 1, 5, 7, 0, 7, 1, 1~
## $ glucose <dbl> 148, 85, 183, 89, 137, 116, 78, 115, 197, 125, 110, 168, 139,~
## $ pressure <dbl> 72, 66, 64, 66, 40, 74, 50, NA, 70, 96, 92, 74, 80, 60, 72, N~
## $ triceps <dbl> 35, 29, NA, 23, 35, NA, 32, NA, 45, NA, NA, NA, NA, 23, 19, N~
## $ insulin <dbl> NA, NA, NA, 94, 168, NA, 88, NA, 543, NA, NA, NA, NA, 846, 17~
## $ mass <dbl> 33.6, 26.6, 23.3, 28.1, 43.1, 25.6, 31.0, 35.3, 30.5, NA, 37.~
## $ pedigree <dbl> 0.627, 0.351, 0.672, 0.167, 2.288, 0.201, 0.248, 0.134, 0.158~
## $ age <dbl> 50, 31, 32, 21, 33, 30, 26, 29, 53, 54, 30, 34, 57, 59, 51, 3~
## $ diabetes <fct> pos, neg, pos, neg, pos, neg, pos, neg, pos, pos, neg, pos, n~
library(DataExplorer)
plot_missing(PimaIndiansDiabetes2)
# several predictors have missing values, while the target variable, diabetes,
# is complete. this raises the question of whether to impute the missing values
# beforehand or as part of the modelling pipeline
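# a quick numeric check of the missingness per column (base R, in addition to the plot):
colSums(is.na(PimaIndiansDiabetes2))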
# is the dataset imbalanced? our target variable is diabetes
table(PimaIndiansDiabetes2$diabetes)
##
## neg pos
## 500 268
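# roughly 65% of the observations are negative and 35% positive, so the classes
# are moderately imbalanced; a quick way to see the proportions:
prop.table(table(PimaIndiansDiabetes2$diabetes))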
# the first step is to split the data using the rsample package
# set a seed so the split is reproducible
set.seed(1234)
library(rsample)
diabetes_split <- initial_split(PimaIndiansDiabetes2, strata = diabetes, prop = 0.75)
diabetes_train <- training(diabetes_split)
diabetes_test <- testing(diabetes_split)
diabetes_split
## <Analysis/Assess/Total>
## <576/192/768>
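# as an optional sanity check, the stratified split should preserve the class
# balance in both partitions (dplyr is already loaded):
diabetes_train %>% count(diabetes) %>% mutate(prop = n / sum(n))
diabetes_test %>% count(diabetes) %>% mutate(prop = n / sum(n))
# create 10-fold cross-validation folds on the training set, again stratified on the outcome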
set.seed(2345)
diabetes_cv <- vfold_cv(diabetes_train, v = 10, strata = diabetes)
library(tidymodels)
library(tidyverse)
diabetes_recipe <- diabetes_train %>%
  recipe(diabetes ~ .) %>%
  # impute missing predictor values with k-nearest neighbours
  step_impute_knn(all_predictors()) %>%
  # drop any zero-variance predictors
  step_zv(all_predictors()) %>%
  # centre and scale all predictors
  step_normalize(all_predictors()) %>%
  # drop predictors that are highly correlated with the others
  step_corr(all_predictors(), threshold = 0.75)
diabetes_recipe
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 8
##
## Operations:
##
## K-nearest neighbor imputation for all_predictors()
## Zero variance filter on all_predictors()
## Centering and scaling for all_predictors()
## Correlation filter on all_predictors()
# preview the preprocessed training set produced by the recipe
diabetes_recipe %>% prep(diabetes_train) %>% juice()
## # A tibble: 576 x 9
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 -0.851 -1.08 -0.537 -0.622 -0.637 -0.619 -0.930 -1.04 neg
## 2 0.343 -0.189 0.143 -0.394 -0.566 -0.985 -0.826 -0.276 neg
## 3 1.84 -0.222 0.00676 0.333 0.700 0.436 -1.03 -0.361 neg
## 4 1.84 0.571 0.652 0.167 1.05 -0.765 2.96 2.03 neg
## 5 -0.851 -0.816 -0.537 -1.45 -0.162 -1.34 0.0468 -0.959 neg
## 6 0.343 -0.156 1.67 0.250 -0.523 0.260 -0.411 0.408 neg
## 7 0.343 -0.420 0.228 -0.311 0.917 0.539 0.227 2.29 neg
## 8 -0.254 -1.11 -1.22 -1.87 -1.05 -1.10 -0.624 -0.959 neg
## 9 0.642 -0.981 1.67 -1.41 -1.09 -1.82 -0.865 -0.447 neg
## 10 1.84 0.00923 0.483 0.208 0.312 -0.692 0.123 1.01 neg
## # ... with 566 more rows
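# the trained recipe can also be applied to new data with bake(); the workflow
# handles this automatically later on, so the line below is only an illustrative sketch:
diabetes_recipe %>% prep(diabetes_train) %>% bake(new_data = diabetes_test)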
# setting up the model
rf_model <-
  # specify that the model is a random forest
  rand_forest() %>%
  # specify that the `mtry` and `trees` parameters will be tuned
  set_args(mtry = tune(), trees = tune()) %>%
  # select the engine/package that underlies the model
  set_engine("ranger", importance = "impurity") %>%
  # choose the binary classification mode (rather than regression)
  set_mode("classification")
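# as a quick optional sketch, translate() shows the ranger::ranger() call that
# parsnip will build from this specification:
rf_model %>% translate()
# bundle the recipe and the model specification into a single workflow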
rf_workflow <- workflow() %>%
  add_recipe(diabetes_recipe) %>%
  add_model(rf_model)
rf_grid <- expand.grid(mtry = 1:8,
                       trees = c(750, 1000, 1500, 2000))
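# the grid covers 8 x 4 = 32 parameter combinations; with two metrics this gives
# the 64 metric rows per fold shown below. quick check:
nrow(rf_grid)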
rf_tune_results <- rf_workflow %>%
  tune_grid(resamples = diabetes_cv,                # CV object
            grid = rf_grid,                         # grid of values to try
            metrics = metric_set(accuracy, roc_auc) # metrics we care about
  )
rf_tune_results
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [517/59]> Fold01 <tibble [64 x 6]> <tibble [0 x 1]>
## 2 <split [518/58]> Fold02 <tibble [64 x 6]> <tibble [0 x 1]>
## 3 <split [518/58]> Fold03 <tibble [64 x 6]> <tibble [0 x 1]>
## 4 <split [518/58]> Fold04 <tibble [64 x 6]> <tibble [0 x 1]>
## 5 <split [518/58]> Fold05 <tibble [64 x 6]> <tibble [0 x 1]>
## 6 <split [519/57]> Fold06 <tibble [64 x 6]> <tibble [0 x 1]>
## 7 <split [519/57]> Fold07 <tibble [64 x 6]> <tibble [0 x 1]>
## 8 <split [519/57]> Fold08 <tibble [64 x 6]> <tibble [0 x 1]>
## 9 <split [519/57]> Fold09 <tibble [64 x 6]> <tibble [0 x 1]>
## 10 <split [519/57]> Fold10 <tibble [64 x 6]> <tibble [0 x 1]>
# collect the cross-validation metrics for every parameter combination with collect_metrics()
rf_tune_results %>% collect_metrics()
## # A tibble: 64 x 8
## mtry trees .metric .estimator mean n std_err .config
## <int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 750 accuracy binary 0.754 10 0.0163 Preprocessor1_Model01
## 2 1 750 roc_auc binary 0.820 10 0.0221 Preprocessor1_Model01
## 3 2 750 accuracy binary 0.754 10 0.0197 Preprocessor1_Model02
## 4 2 750 roc_auc binary 0.823 10 0.0222 Preprocessor1_Model02
## 5 3 750 accuracy binary 0.759 10 0.0201 Preprocessor1_Model03
## 6 3 750 roc_auc binary 0.825 10 0.0223 Preprocessor1_Model03
## 7 4 750 accuracy binary 0.762 10 0.0190 Preprocessor1_Model04
## 8 4 750 roc_auc binary 0.823 10 0.0230 Preprocessor1_Model04
## 9 5 750 accuracy binary 0.771 10 0.0192 Preprocessor1_Model05
## 10 5 750 roc_auc binary 0.827 10 0.0220 Preprocessor1_Model05
## # ... with 54 more rows
# select the best parameter combination according to ROC AUC with select_best()
rf_tune_results %>% select_best(metric = "roc_auc")
## # A tibble: 1 x 3
## mtry trees .config
## <int> <dbl> <chr>
## 1 7 1000 Preprocessor1_Model15
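# show_best() lists the top candidates instead of only the single best one (optional sketch):
rf_tune_results %>% show_best(metric = "roc_auc", n = 5)
# plot AUC against each tuned parameter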
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, trees, mtry) %>%
  pivot_longer(trees:mtry, values_to = "value", names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(trees = factor(trees)) %>%
  ggplot(aes(mtry, mean, color = trees)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  labs(y = "AUC")
# let's also find the best parameters according to accuracy and plot the results
rf_tune_results %>% select_best(metric = "accuracy")
## # A tibble: 1 x 3
## mtry trees .config
## <int> <dbl> <chr>
## 1 5 750 Preprocessor1_Model05
# let's plot the results
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  select(mean, mtry, trees) %>%
  pivot_longer(mtry:trees, values_to = "value", names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "Accuracy")
# let's also look at accuracy across mtry for each value of trees
rf_tune_results %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  mutate(trees = factor(trees)) %>%
  ggplot(aes(mtry, mean, color = trees)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  labs(y = "Accuracy")
# let's finalize the workflow with the parameters selected by accuracy
param_final <- rf_tune_results %>%
  select_best(metric = "accuracy")
param_final
## # A tibble: 1 x 3
## mtry trees .config
## <int> <dbl> <chr>
## 1 5 750 Preprocessor1_Model05
rf_workflow <- rf_workflow %>%
  finalize_workflow(param_final)
rf_fit <- rf_workflow %>%
  # fit on the training set and evaluate on the test set
  last_fit(diabetes_split)
rf_fit
## # Resampling results
## # Manual resampling
## # A tibble: 1 x 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [576/192]> train/test split <tibble [~ <tibble~ <tibble [192~ <workflo~
test_performance <- rf_fit %>% collect_metrics()
test_performance
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.755 Preprocessor1_Model1
## 2 roc_auc binary 0.840 Preprocessor1_Model1
# generate predictions from the test set
test_predictions <- rf_fit %>% collect_predictions()
test_predictions
## # A tibble: 192 x 7
## id .pred_neg .pred_pos .row .pred_class diabetes .config
## <chr> <dbl> <dbl> <int> <fct> <fct> <chr>
## 1 train/test split 0.921 0.0793 2 neg neg Preprocessor~
## 2 train/test split 0.714 0.286 10 neg pos Preprocessor~
## 3 train/test split 0.769 0.231 11 neg neg Preprocessor~
## 4 train/test split 0.531 0.469 14 neg pos Preprocessor~
## 5 train/test split 0.282 0.718 15 pos pos Preprocessor~
## 6 train/test split 0.831 0.169 16 neg pos Preprocessor~
## 7 train/test split 0.726 0.274 19 neg neg Preprocessor~
## 8 train/test split 0.703 0.297 21 neg neg Preprocessor~
## 9 train/test split 0.571 0.429 22 neg neg Preprocessor~
## 10 train/test split 0.0235 0.977 23 pos pos Preprocessor~
## # ... with 182 more rows
# generate a confusion matrix
test_predictions %>%
  conf_mat(truth = diabetes, estimate = .pred_class)
## Truth
## Prediction neg pos
## neg 109 31
## pos 16 36
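# yardstick's summary() method on a confusion matrix returns further metrics
# such as sensitivity and specificity (optional sketch):
test_predictions %>%
  conf_mat(truth = diabetes, estimate = .pred_class) %>%
  summary()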
# density of the predicted probability of "pos", split by the true class
test_predictions %>%
  ggplot() +
  geom_density(aes(x = .pred_pos, fill = diabetes), alpha = 0.5)
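# an ROC curve on the test predictions is another useful view; roc_curve() comes
# from yardstick, and event_level = "second" treats "pos" as the event of interest.
# this is an optional sketch:
test_predictions %>%
  roc_curve(truth = diabetes, .pred_pos, event_level = "second") %>%
  autoplot()
# finally, fit the finalized workflow on the full dataset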
final_model <- fit(rf_workflow, PimaIndiansDiabetes2)
final_model
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
##
## * step_impute_knn()
## * step_zv()
## * step_normalize()
## * step_corr()
##
## -- Model -----------------------------------------------------------------------
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L, x), num.trees = ~750, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 750
## Sample size: 768
## Number of independent variables: 8
## Mtry: 5
## Target node size: 10
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.158931
# let's look at the variable importances
ranger_obj <- pull_workflow_fit(final_model)$fit
ranger_obj
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~5L, x), num.trees = ~750, importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 750
## Sample size: 768
## Number of independent variables: 8
## Mtry: 5
## Target node size: 10
## Variable importance mode: impurity
## Splitrule: gini
## OOB prediction error (Brier s.): 0.158931
ranger_obj$variable.importance
## pregnant glucose pressure triceps insulin mass pedigree age
## 15.74561 84.75321 17.28561 20.47093 51.56952 43.73359 30.50090 31.97322
library(vip)
library(hrbrthemes)
tibble(var = ranger_obj$variable.importance,
       nam = ranger_obj$variable.importance %>% names()) %>%
  ggplot(aes(var, reorder(nam, var))) +
  geom_bar(stat = "identity",
           alpha = 0.7,
           width = 0.8) +
  theme_ipsum() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_blank())
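# since the vip package is already loaded, a similar importance plot can be
# produced directly from the ranger fit (optional sketch):
vip(ranger_obj)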
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_Germany.1252 LC_CTYPE=English_Germany.1252
## [3] LC_MONETARY=English_Germany.1252 LC_NUMERIC=C
## [5] LC_TIME=English_Germany.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] hrbrthemes_0.8.6 vip_0.3.2 ranger_0.13.1 vctrs_0.3.8
## [5] rlang_0.4.11 forcats_0.5.1 stringr_1.4.0 readr_2.0.0
## [9] tidyverse_1.3.1 yardstick_0.0.8 workflowsets_0.1.0 workflows_0.2.3
## [13] tune_0.1.6 tidyr_1.1.3 tibble_3.1.3 recipes_0.1.16
## [17] purrr_0.3.4 parsnip_0.1.7 modeldata_0.1.1 infer_1.0.0
## [21] ggplot2_3.3.5 dials_0.0.9 scales_1.1.1 broom_0.7.9
## [25] tidymodels_0.1.3 rsample_0.1.0 DataExplorer_0.8.2 dplyr_1.0.7
## [29] mlbench_2.1-3
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-2 ellipsis_0.3.2 class_7.3-19 fs_1.5.0
## [5] rstudioapi_0.13 listenv_0.8.0 furrr_0.2.3 farver_2.1.0
## [9] prodlim_2019.11.13 fansi_0.5.0 lubridate_1.7.10 xml2_1.3.2
## [13] codetools_0.2-18 splines_4.1.1 extrafont_0.17 knitr_1.33
## [17] jsonlite_1.7.2 pROC_1.17.0.1 Rttf2pt1_1.3.9 dbplyr_2.1.1
## [21] compiler_4.1.1 httr_1.4.2 backports_1.2.1 assertthat_0.2.1
## [25] Matrix_1.3-4 cli_3.0.1 htmltools_0.5.1.1 tools_4.1.1
## [29] igraph_1.2.6 gtable_0.3.0 glue_1.4.2 Rcpp_1.0.7
## [33] cellranger_1.1.0 jquerylib_0.1.4 DiceDesign_1.9 extrafontdb_1.0
## [37] iterators_1.0.13 timeDate_3043.102 gower_0.2.2 xfun_0.25
## [41] globals_0.14.0 networkD3_0.4 rvest_1.0.1 lifecycle_1.0.0
## [45] future_1.21.0 MASS_7.3-54 ipred_0.9-11 hms_1.1.0
## [49] parallel_4.1.1 yaml_2.2.1 gridExtra_2.3 gdtools_0.2.3
## [53] sass_0.4.0 rpart_4.1-15 stringi_1.7.3 highr_0.9
## [57] foreach_1.5.1 lhs_1.1.1 hardhat_0.1.6 lava_1.6.9
## [61] systemfonts_1.0.2 pkgconfig_2.0.3 evaluate_0.14 lattice_0.20-44
## [65] htmlwidgets_1.5.3 labeling_0.4.2 tidyselect_1.1.1 parallelly_1.27.0
## [69] plyr_1.8.6 magrittr_2.0.1 R6_2.5.1 generics_0.1.0
## [73] DBI_1.1.1 pillar_1.6.2 haven_2.4.3 withr_2.4.2
## [77] survival_3.2-11 nnet_7.3-16 modelr_0.1.8 crayon_1.4.1
## [81] utf8_1.2.2 tzdb_0.1.2 rmarkdown_2.10 grid_4.1.1
## [85] readxl_1.3.1 data.table_1.14.0 reprex_2.0.1 digest_0.6.27
## [89] GPfit_1.0-8 munsell_0.5.0 bslib_0.2.5.1