Load packages

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.2

## -- Attaching packages ---------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'tidyr' was built under R version 3.6.2

## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(tidymodels)

## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.0.3 --

## v broom     0.5.4     v recipes   0.1.9
## v dials     0.0.4     v rsample   0.0.5
## v infer     0.5.1     v yardstick 0.0.5
## v parsnip   0.0.5

## Warning: package 'broom' was built under R version 3.6.2

## Warning: package 'parsnip' was built under R version 3.6.2

## Warning: package 'recipes' was built under R version 3.6.2

## Warning: package 'yardstick' was built under R version 3.6.2

## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter()   masks stats::filter()
## x recipes::fixed()  masks stringr::fixed()
## x dplyr::lag()      masks stats::lag()
## x dials::margin()   masks ggplot2::margin()
## x yardstick::spec() masks readr::spec()
## x recipes::step()   masks stats::step()

Load data

df <- as_tibble(iris)

Split the data

set.seed(999)

split <- initial_split(df, strata = Species)

df_train <- training(split)

df_test <- testing(split)

Prep the data

df_rec <- recipe(Species ~., data =df_train) %>%
  #make sure the training set has equal numbers of target variale (not needed for iris dataset)
  step_downsample(Species) %>% 
  #normalise the data
  step_center(-Species) %>% 
  step_scale(-Species) %>% 
  step_BoxCox(-Species) %>% 
  #function to apply the recipe to the data
  prep()
  
#apply the recipe to the traning set (df_rec contains the data and the pre processing steps)
df_juiced <- juice(df_rec)

#visualise what this processed data looks like
df_juiced %>% pivot_longer(-Species) %>% 
    ggplot() +
    geom_histogram(aes(value, fill = Species)) +
    facet_wrap(~name)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bake the test data

#apply the same recipe to the test data
baked_test <- bake(df_rec, new_data = df_test)

Specify the fits

#make a knn spec
knn_spec <- nearest_neighbor() %>% 
  set_engine("kknn") %>% 
  set_mode("classification")

#make a rf spec
rf_spec <- rand_forest() %>% 
  set_engine("ranger") %>% 
  set_mode("classification")

Fit the model

#use the knn spec to fit the pre-processed training data
knn_fit <- knn_spec %>% 
  fit(Species ~., data = df_juiced)

#use the rf spec to fit the pre-processed training data
rf_fit <- rf_spec %>% 
  fit(Species ~., data = df_juiced)
  

knn_fit

## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:
## kknn::train.kknn(formula = formula, data = data, ks = 5)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.07017544
## Best kernel: optimal
## Best k: 5

rf_fit

## parsnip model object
## 
## Fit time:  51ms 
## Ranger result
## 
## Call:
##  ranger::ranger(formula = formula, data = data, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      114 
## Number of independent variables:  4 
## Mtry:                             2 
## Target node size:                 10 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.04429843

Evaluate the models

#knn

knn_fit %>% 
    predict(baked_test) %>% 
    bind_cols(baked_test) %>% 
    metrics(truth = Species, estimate = .pred_class)

## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.972
## 2 kap      multiclass     0.958

knn_fit %>% 
    predict(df_juiced) %>% 
    bind_cols(df_juiced) %>% 
    metrics(truth = Species, estimate = .pred_class)

## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.965
## 2 kap      multiclass     0.947

#Training: 97.2% accuracy
#Testing: 96.5% accuracy



#rf

rf_fit %>% 
    predict(baked_test) %>% 
    bind_cols(baked_test) %>% 
    metrics(truth = Species, estimate = .pred_class)

## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass         1
## 2 kap      multiclass         1

rf_fit %>% 
    predict(df_juiced) %>% 
    bind_cols(df_juiced) %>% 
    metrics(truth = Species, estimate = .pred_class)

## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.965
## 2 kap      multiclass     0.947

#Training: 100% accuracy
#Testing: 96.5 accuracy

Graph this evaluation manually

results_train <- knn_fit %>% 
  predict(df_juiced) %>% 
  mutate(model = "knn", 
         truth = df_juiced$Species) %>% 
          bind_rows(rf_fit %>% 
          predict(df_juiced) %>% 
          mutate(model = "rf",
          truth = df_juiced$Species)
    
  ) %>% 
  mutate(accuracy = if_else(.pred_class == truth, "yes", "no"))
  

results_test <- knn_fit %>% 
  predict(baked_test) %>% 
  mutate(model = "knn", 
         truth = baked_test$Species) %>% 
          bind_rows(rf_fit %>% 
          predict(baked_test) %>% 
          mutate(model = "rf",
          truth = baked_test$Species)
    
  ) %>% 
  mutate(accuracy = if_else(.pred_class == truth, "yes", "no"))


results_train %>% 
  ggplot() +
  geom_bar(aes(accuracy, fill = accuracy)) +
  facet_wrap(~ model)

results_test %>% 
  ggplot() +
  geom_bar(aes(accuracy, fill = accuracy)) +
  facet_wrap(~ model)

Per classifer metrics (with gain curve)

knn_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  gain_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()

rf_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  gain_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()

Per classifer metrics (with roc curve)

knn_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  roc_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()

rf_fit %>%
  #probability for each possible predicted value
  predict(df_juiced, type = "prob") %>% 
  bind_cols(df_juiced) %>%
  #graph
  roc_curve(Species, .pred_setosa:.pred_virginica) %>%
  autoplot()

Iris with Tidymodels

Load packages

Load data

Split the data

Prep the data

Bake the test data

Specify the fits

Fit the model

Evaluate the models

Graph this evaluation manually

Per classifer metrics (with gain curve)

Per classifer metrics (with roc curve)