setwd("D:/Class Materials & Work/Summer 2020 practice/3_Tidymodel_Model evaluation")
getwd()
## [1] "D:/Class Materials & Work/Summer 2020 practice/3_Tidymodel_Model evaluation"
library(tidymodels) # for the rsample package, along with the rest of tidymodels
## -- Attaching packages -------------------------------------------------------------------------------- tidymodels 0.1.0 --
## v broom 0.5.6 v recipes 0.1.12
## v dials 0.0.6 v rsample 0.0.6
## v dplyr 0.8.5 v tibble 3.0.1
## v ggplot2 3.3.1 v tune 0.1.0
## v infer 0.5.1 v workflows 0.1.1
## v parsnip 0.1.1 v yardstick 0.0.6
## v purrr 0.3.4
## -- Conflicts ----------------------------------------------------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x ggplot2::margin() masks dials::margin()
## x recipes::step() masks stats::step()
# Helper packages
library(modeldata) # for the cells data
## Warning: package 'modeldata' was built under R version 4.0.2
library(ranger)
## Warning: package 'ranger' was built under R version 4.0.2
#Cell Image Data----
#We are predicting cell image segmentation quality with resampling.
#Load the data
data(cells, package = "modeldata")
cells
## # A tibble: 2,019 x 58
## case class angle_ch_1 area_ch_1 avg_inten_ch_1 avg_inten_ch_2 avg_inten_ch_3
## <fct> <fct> <dbl> <int> <dbl> <dbl> <dbl>
## 1 Test PS 143. 185 15.7 4.95 9.55
## 2 Train PS 134. 819 31.9 207. 69.9
## 3 Train WS 107. 431 28.0 116. 63.9
## 4 Train PS 69.2 298 19.5 102. 28.2
## 5 Test PS 2.89 285 24.3 112. 20.5
## 6 Test WS 40.7 172 326. 654. 129.
## 7 Test WS 174. 177 260. 596. 124.
## 8 Test PS 180. 251 18.3 5.73 17.2
## 9 Test WS 18.9 495 16.1 89.5 13.7
## 10 Test WS 153. 384 17.7 89.9 20.4
## # ... with 2,009 more rows, and 51 more variables: avg_inten_ch_4 <dbl>,
## # convex_hull_area_ratio_ch_1 <dbl>, convex_hull_perim_ratio_ch_1 <dbl>,
## # diff_inten_density_ch_1 <dbl>, diff_inten_density_ch_3 <dbl>,
## # diff_inten_density_ch_4 <dbl>, entropy_inten_ch_1 <dbl>,
## # entropy_inten_ch_3 <dbl>, entropy_inten_ch_4 <dbl>,
## # eq_circ_diam_ch_1 <dbl>, eq_ellipse_lwr_ch_1 <dbl>,
## # eq_ellipse_oblate_vol_ch_1 <dbl>, eq_ellipse_prolate_vol_ch_1 <dbl>,
## # eq_sphere_area_ch_1 <dbl>, eq_sphere_vol_ch_1 <dbl>,
## # fiber_align_2_ch_3 <dbl>, fiber_align_2_ch_4 <dbl>,
## # fiber_length_ch_1 <dbl>, fiber_width_ch_1 <dbl>, inten_cooc_asm_ch_3 <dbl>,
## # inten_cooc_asm_ch_4 <dbl>, inten_cooc_contrast_ch_3 <dbl>,
## # inten_cooc_contrast_ch_4 <dbl>, inten_cooc_entropy_ch_3 <dbl>,
## # inten_cooc_entropy_ch_4 <dbl>, inten_cooc_max_ch_3 <dbl>,
## # inten_cooc_max_ch_4 <dbl>, kurt_inten_ch_1 <dbl>, kurt_inten_ch_3 <dbl>,
## # kurt_inten_ch_4 <dbl>, length_ch_1 <dbl>, neighbor_avg_dist_ch_1 <dbl>,
## # neighbor_min_dist_ch_1 <dbl>, neighbor_var_dist_ch_1 <dbl>,
## # perim_ch_1 <dbl>, shape_bfr_ch_1 <dbl>, shape_lwr_ch_1 <dbl>,
## # shape_p_2_a_ch_1 <dbl>, skew_inten_ch_1 <dbl>, skew_inten_ch_3 <dbl>,
## # skew_inten_ch_4 <dbl>, spot_fiber_count_ch_3 <int>,
## # spot_fiber_count_ch_4 <dbl>, total_inten_ch_1 <int>,
## # total_inten_ch_2 <dbl>, total_inten_ch_3 <int>, total_inten_ch_4 <int>,
## # var_inten_ch_1 <dbl>, var_inten_ch_3 <dbl>, var_inten_ch_4 <dbl>,
## # width_ch_1 <dbl>
#The data has 2,019 cells and 58 variables. We aim to predict the image segmentation quality of each cell,
#which is recorded in the class variable and stored as a factor:
#WS = well segmented, PS = poorly segmented.
#Each cell also has 56 predictors, such as nucleus intensity and cell size.
#Calculate the proportion of cells in each class.
cells %>%
count(class) %>%
mutate(prop = n/sum(n))
## # A tibble: 2 x 3
## class n prop
## <fct> <int> <dbl>
## 1 PS 1300 0.644
## 2 WS 719 0.356
#The class rates are somewhat imbalanced: there are more poorly segmented cells than well segmented cells.
#Splitting the Data----
#It is common when beginning a modeling project to separate the data set into two partitions:
#Training set: To estimate parameters, compare models and feature engineering techniques, tune models, etc.
#Testing set: Held back until the end and used as an unbiased source for measuring final model performance; by that stage there should be only one or two finalized candidate models left to evaluate.
#The most common approach to partitioning the data into two sets is simple random sampling.
#By default, initial_split() from the rsample package randomly selects 25% of the data for the test set and uses the remainder for the training set.
#Since random sampling uses random numbers, it is important to set the random number seed so that the split can be reproduced later (if needed).
#In the original analysis, the authors already made their own training/test set and that information is contained in the column "case".
#To demonstrate how to make a split, we'll remove this column before we make our own split.
set.seed(123)
cell_split <- initial_split(cells %>% select(-case),
strata = class)
#select(-case) removes the original case column before partitioning.
#The strata argument performs stratified sampling so that each partition keeps roughly the same class proportions (WS/PS) as the original cells data.
#After the initial_split, the training() and testing() functions return the actual data sets.
cell_train <- training(cell_split)
cell_test <- testing(cell_split)
nrow(cell_train)
## [1] 1515
nrow(cell_train)/nrow(cells)
## [1] 0.7503715
# training set proportions by class
cell_train %>%
count(class) %>%
mutate(prop = n/sum(n))
## # A tibble: 2 x 3
## class n prop
## <fct> <int> <dbl>
## 1 PS 975 0.644
## 2 WS 540 0.356
# test set proportions by class
cell_test %>%
count(class) %>%
mutate(prop = n/sum(n))
## # A tibble: 2 x 3
## class n prop
## <fct> <int> <dbl>
## 1 PS 325 0.645
## 2 WS 179 0.355
#The majority of the modeling work is then conducted on the training set data.
#Modeling----
#We will use a random forest model, which is an ensemble of decision trees.
#A large number of decision trees are created for the ensemble, each based on a slightly different version of the training set and a random subset of the available predictors.
#This keeps the individual trees as diverse as possible, and together they form the random forest.
#When a new sample is predicted, the votes from all the trees are combined to produce the final predicted value.
#In our case, the majority vote across all the trees in the random forest determines the predicted class for a new cell (see the small vote-counting sketch below).
#Random forests require little data pre-processing, so we do not need to build much of a recipe for this data set.
#However, the large number of trees makes the model computationally taxing.
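#A tiny, self-contained sketch of the majority-vote idea described above (purely illustrative; these are made-up votes, not output from ranger):
#1000 hypothetical tree votes for a single new cell, with the final prediction taken as the most frequent class.
set.seed(1)
tree_votes <- sample(c("PS", "WS"), size = 1000, replace = TRUE, prob = c(0.7, 0.3))
table(tree_votes)                     # vote counts across the ensemble
names(which.max(table(tree_votes)))   # predicted class for this cell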
#First, we define the model we want to create:
rf_mod <-
rand_forest(trees = 1000) %>%
set_engine("ranger") %>%
set_mode("classification")
#parsnip::fit() can be used to fit the model with a formula as usual.
#Since random forest models use random numbers, we again set the seed prior to computing:
set.seed(234)
rf_fit <-
rf_mod %>%
fit(class ~ ., data = cell_train)
rf_fit
## parsnip model object
##
## Fit time: 6.3s
## Ranger result
##
## Call:
## ranger::ranger(formula = formula, data = data, num.trees = ~1000, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 1000
## Sample size: 1515
## Number of independent variables: 56
## Mtry: 7
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.1220191
#The rf_fit object above is our fitted model, trained on the training set.
#Estimating performance----
#To evaluate our model, we could use the area under the ROC curve (AUC) or overall classification accuracy.
#The former uses the class probability estimates to give us a sense of performance across the entire set of potential probability cutoffs.
#The latter uses the hard class predictions to measure performance, which tell us whether our model predicted PS or WS for each cell.
#The yardstick package has functions for computing both of these measures called roc_auc() and accuracy().
#At first glance, it might seem like a good idea to use the training set data to compute these statistics through predict().
#We can then get both AUC and accuracy statistics from the prediction.
rf_training_pred <-
predict(rf_fit, cell_train) %>%
# Add the class probability predictions
bind_cols(predict(rf_fit, cell_train, type = "prob")) %>%
# Add the true outcome data back in
bind_cols(cell_train %>%
select(class))
#Using the yardstick functions, this model has spectacular results, so spectacular that you might be starting to get suspicious:
rf_training_pred %>% # training set predictions
roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 1.00
rf_training_pred %>% # training set predictions
accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.994
#With both metrics indicating exceptional performance (AUC = 1.00 and accuracy = 99.4%), we can try applying our model to the test set.
rf_testing_pred <-
predict(rf_fit, cell_test) %>%
bind_cols(predict(rf_fit, cell_test, type = "prob")) %>%
bind_cols(cell_test %>% select(class))
rf_testing_pred %>% # test set predictions
roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.909
rf_testing_pred %>% # test set predictions
accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.841
#Although the test results are not entirely bad, they are certainly worse than what we initially thought based on predicting the training set.
#What happened?----
#Statistics computed on the training set are unrealistically optimistic because:
#Models like random forests, neural networks, and other black-box methods, unlike a simple linear model, can essentially memorize the training set; re-predicting that same set will almost always produce nearly perfect results.
#The training set does not have the capacity to be a good arbiter of performance. It is not an independent piece of information; predicting the training set can only reflect what the model already knows.
#Basically, this is the same practice effect in machine learning as in human learning.
#Suppose you give a class a test, then give them the answers, then provide the same test.
#The student scores on the second test do not accurately reflect what they know about the subject; these scores would probably be higher than their results on the first test.
#Resampling to the Rescue----
#Resampling methods, such as cross-validation and the bootstrap, are empirical simulation systems.
#They repeatedly create resampled versions of the training set, some used for fitting a model and some for measuring it, so that the practice effect described above does not occur.
#In our case, we will use 10-fold cross-validation, which splits the training set 10 times (folds), each time into a 90% analysis set (analogous to a training set) and a 10% assessment set (analogous to a test set).
#As with regular model fitting, the analysis set is used to fit the model, and the held-out assessment set is used to compute the performance statistics (accuracy and AUC).
#Ultimately, we end up with 10 sets of performance statistics, one from each round of resampling, fitting, and assessing a different portion of the training set.
#The 10 models fit during this process are used only to compute those performance statistics, so they do not need to be retained.
#The final performance estimates are the averages of accuracy and AUC across all 10 folds.
#!!These resampling statistics are an effective way to measure model performance without re-predicting the training set as a whole!!
#Fit a model with resampling----
#Cross-validation folds can be created with rsample::vfold_cv().
set.seed(345)
folds <- vfold_cv(cell_train, v = 10)
folds
## # 10-fold cross-validation
## # A tibble: 10 x 2
## splits id
## <named list> <chr>
## 1 <split [1.4K/152]> Fold01
## 2 <split [1.4K/152]> Fold02
## 3 <split [1.4K/152]> Fold03
## 4 <split [1.4K/152]> Fold04
## 5 <split [1.4K/152]> Fold05
## 6 <split [1.4K/151]> Fold06
## 7 <split [1.4K/151]> Fold07
## 8 <split [1.4K/151]> Fold08
## 9 <split [1.4K/151]> Fold09
## 10 <split [1.4K/151]> Fold10
#The list column for splits contains the information on which rows belong in the analysis and assessment sets.
#The analysis() and assessment() functions can be used to extract the individual resampled data sets.
#See examples in "help(analysis)".
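#A quick illustration of those accessors (a minimal sketch based on the folds object created above):
#pull out the analysis and assessment sets of the first resample.
first_split <- folds$splits[[1]]
nrow(analysis(first_split))    # ~90% of cell_train, used to fit the model
nrow(assessment(first_split))  # ~10% of cell_train, used to measure performance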
#The tune package also contains high-level functions that can do the required computations to resample a model for the purpose of measuring performance.
#To build an object for resampling, we can:
#Resample a model specification preprocessed with a formula or "recipe", or
#Resample a workflow() that bundles together a model specification and formula/recipe.
#In this project, we will use a workflow that bundles together the random forest model and a formula, since we are not using a recipe.
#We then use fit_resamples() instead of fit() so that the workflow is fit and evaluated on each cross-validation fold.
#Create a workflow that includes the random forest model and the intended formula.
rf_wf <-
workflow() %>%
add_model(rf_mod) %>%
add_formula(class ~ .)
rf_wf
## == Workflow ==============================================================================================================
## Preprocessor: Formula
## Model: rand_forest()
##
## -- Preprocessor ----------------------------------------------------------------------------------------------------------
## class ~ .
##
## -- Model -----------------------------------------------------------------------------------------------------------------
## Random Forest Model Specification (classification)
##
## Main Arguments:
## trees = 1000
##
## Computational engine: ranger
#Fit the workflow with the 10-fold CV resampling. Don't forget to set the seed for replicability.
set.seed(456)
rf_fit_rs <-
#Identify the workflow
rf_wf %>%
#Fit the CV resample
fit_resamples(folds)
rf_fit_rs
## # 10-fold cross-validation
## # A tibble: 10 x 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [1.4K/152]> Fold01 <tibble [2 x 3]> <tibble [0 x 1]>
## 2 <split [1.4K/152]> Fold02 <tibble [2 x 3]> <tibble [0 x 1]>
## 3 <split [1.4K/152]> Fold03 <tibble [2 x 3]> <tibble [0 x 1]>
## 4 <split [1.4K/152]> Fold04 <tibble [2 x 3]> <tibble [0 x 1]>
## 5 <split [1.4K/152]> Fold05 <tibble [2 x 3]> <tibble [0 x 1]>
## 6 <split [1.4K/151]> Fold06 <tibble [2 x 3]> <tibble [0 x 1]>
## 7 <split [1.4K/151]> Fold07 <tibble [2 x 3]> <tibble [0 x 1]>
## 8 <split [1.4K/151]> Fold08 <tibble [2 x 3]> <tibble [0 x 1]>
## 9 <split [1.4K/151]> Fold09 <tibble [2 x 3]> <tibble [0 x 1]>
## 10 <split [1.4K/151]> Fold10 <tibble [2 x 3]> <tibble [0 x 1]>
#The result looks much like the folds object, but the .metrics column now contains the performance statistics created from the 10 assessment sets.
#These could be unnested manually (see the sketch after the output below), but the tune package contains simple functions that extract these data for us:
collect_metrics(rf_fit_rs)
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.832 10 0.0114
## 2 roc_auc binary 0.904 10 0.00809
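#A minimal sketch of the manual alternative mentioned above (tidyr is not attached by tidymodels 0.1.0, so it is called explicitly; this assumes tidyr >= 1.0 is installed):
#unnesting the per-fold .metrics column shows the 10 individual accuracy and AUC values that collect_metrics() averages.
rf_fit_rs %>%
  select(id, .metrics) %>%
  tidyr::unnest(.metrics)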
#Thanks to the resampling adjustment, our final performance estimates are lower than the direct first attempt, but far more realistic.
#If we wanted to try different model types for this data set, we could more confidently compare performance metrics computed using resampling to choose between models.
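#A hedged sketch of that comparison idea (this second model is added purely for illustration and is not part of the original analysis):
#resample a plain logistic regression on the same folds and compare its averaged metrics with the random forest results above.
lr_mod <-
  logistic_reg() %>%
  set_engine("glm") %>%
  set_mode("classification")
lr_wf <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_formula(class ~ .)
set.seed(456)
lr_fit_rs <- fit_resamples(lr_wf, folds)
collect_metrics(lr_fit_rs)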
#For comparison, here are the individual performance statistics from the test set once more:
rf_testing_pred %>% # test set predictions
roc_auc(truth = class, .pred_PS)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.909
rf_testing_pred %>% # test set predictions
accuracy(truth = class, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.841
#The performance metrics from the test set are much closer to the performance metrics computed using resampling than our first ("bad idea") attempt.
#Resampling allows us to simulate how well our model will perform on new data, and the test set acts as the final, unbiased check for our model's performance.
#We could also inspect the test set predictions manually by printing the rf_testing_pred data set, but evaluating the model with performance statistics is much more efficient.
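#As an optional visual check (a minimal sketch using yardstick::roc_curve() and ggplot2; not part of the original analysis),
#we can plot the test set ROC curve that underlies the AUC of 0.909 reported above.
rf_testing_pred %>%
  roc_curve(truth = class, .pred_PS) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_path() +
  geom_abline(lty = 2) +
  coord_equal()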