Load packages

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## Warning: package 'tidyr' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidymodels)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.0.3 --
## v broom     0.5.4     v recipes   0.1.9
## v dials     0.0.4     v rsample   0.0.5
## v infer     0.5.1     v yardstick 0.0.5
## v parsnip   0.0.5
## Warning: package 'broom' was built under R version 3.6.2
## Warning: package 'parsnip' was built under R version 3.6.2
## Warning: package 'recipes' was built under R version 3.6.2
## Warning: package 'yardstick' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x dials::margin()   masks ggplot2::margin()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()

Load data

df <- as_tibble(iris)

Split the data

set.seed(999)

split <- initial_split(df, strata = Species)

df_train <- training(split)

df_test <- testing(split)

Prep the data

df_rec <- recipe(Species ~., data =df_train) %>%
  #make sure the training set has equal numbers of target variale (not needed for iris dataset)
  step_downsample(Species) %>% 
  #normalise the data
  step_center(-Species) %>% 
  step_scale(-Species) %>% 
  step_BoxCox(-Species) %>% 
  #function to apply the recipe to the data
  prep()
  
#apply the recipe to the traning set (df_rec contains the data and the pre processing steps)
df_juiced <- juice(df_rec)

#visualise what this processed data looks like
df_juiced %>% pivot_longer(-Species) %>% 
    ggplot() +
    geom_histogram(aes(value, fill = Species)) +
    facet_wrap(~name)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bake the test data

#apply the same recipe to the test data
baked_test <- bake(df_rec, new_data = df_test)

Specify the fits

#make a knn spec
knn_spec <- nearest_neighbor() %>% 
  set_engine("kknn") %>% 
  set_mode("classification")

#make a rf spec
rf_spec <- rand_forest() %>% 
  set_engine("ranger") %>% 
  set_mode("classification")

Fit the model

#use the knn spec to fit the pre-processed training data
knn_fit <- knn_spec %>% 
  fit(Species ~., data = df_juiced)

#use the rf spec to fit the pre-processed training data
rf_fit <- rf_spec %>% 
  fit(Species ~., data = df_juiced)
  

knn_fit
## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:
## kknn::train.kknn(formula = formula, data = data, ks = 5)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.07017544
## Best kernel: optimal
## Best k: 5
rf_fit
## parsnip model object
## 
## Fit time:  51ms 
## Ranger result
## 
## Call:
##  ranger::ranger(formula = formula, data = data, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      114 
## Number of independent variables:  4 
## Mtry:                             2 
## Target node size:                 10 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.04429843

Evaluate the models

#knn

knn_fit %>% 
    predict(baked_test) %>% 
    bind_cols(baked_test) %>% 
    metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.972
## 2 kap      multiclass     0.958
knn_fit %>% 
    predict(df_juiced) %>% 
    bind_cols(df_juiced) %>% 
    metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.965
## 2 kap      multiclass     0.947
#Training: 97.2% accuracy
#Testing: 96.5% accuracy



#rf

rf_fit %>% 
    predict(baked_test) %>% 
    bind_cols(baked_test) %>% 
    metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass         1
## 2 kap      multiclass         1
rf_fit %>% 
    predict(df_juiced) %>% 
    bind_cols(df_juiced) %>% 
    metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.965
## 2 kap      multiclass     0.947
#Training: 100% accuracy
#Testing: 96.5 accuracy

Graph this evaluation manually

results_train <- knn_fit %>% 
  predict(df_juiced) %>% 
  mutate(model = "knn", 
         truth = df_juiced$Species) %>% 
          bind_rows(rf_fit %>% 
          predict(df_juiced) %>% 
          mutate(model = "rf",
          truth = df_juiced$Species)
    
  ) %>% 
  mutate(accuracy = if_else(.pred_class == truth, "yes", "no"))
  

results_test <- knn_fit %>% 
  predict(baked_test) %>% 
  mutate(model = "knn", 
         truth = baked_test$Species) %>% 
          bind_rows(rf_fit %>% 
          predict(baked_test) %>% 
          mutate(model = "rf",
          truth = baked_test$Species)
    
  ) %>% 
  mutate(accuracy = if_else(.pred_class == truth, "yes", "no"))


results_train %>% 
  ggplot() +
  geom_bar(aes(accuracy, fill = accuracy)) +
  facet_wrap(~ model)

results_test %>% 
  ggplot() +
  geom_bar(aes(accuracy, fill = accuracy)) +
  facet_wrap(~ model)

Per classifer metrics (with gain curve)

knn_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  gain_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()

rf_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  gain_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()

Per classifer metrics (with roc curve)

knn_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  roc_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()

rf_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  roc_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()