Load packages
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.2
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tidyr' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidymodels)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------------- tidymodels 0.0.3 --
## v broom 0.5.4 v recipes 0.1.9
## v dials 0.0.4 v rsample 0.0.5
## v infer 0.5.1 v yardstick 0.0.5
## v parsnip 0.0.5
## Warning: package 'broom' was built under R version 3.6.2
## Warning: package 'parsnip' was built under R version 3.6.2
## Warning: package 'recipes' was built under R version 3.6.2
## Warning: package 'yardstick' was built under R version 3.6.2
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------------ tidymodels_conflicts() --
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x dials::margin() masks ggplot2::margin()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
Load data
df <- as_tibble(iris)
Split the data
set.seed(999)
split <- initial_split(df, strata = Species)
df_train <- training(split)
df_test <- testing(split)
Prep the data
df_rec <- recipe(Species ~., data =df_train) %>%
#make sure the training set has equal numbers of target variale (not needed for iris dataset)
step_downsample(Species) %>%
#normalise the data
step_center(-Species) %>%
step_scale(-Species) %>%
step_BoxCox(-Species) %>%
#function to apply the recipe to the data
prep()
#apply the recipe to the traning set (df_rec contains the data and the pre processing steps)
df_juiced <- juice(df_rec)
#visualise what this processed data looks like
df_juiced %>% pivot_longer(-Species) %>%
ggplot() +
geom_histogram(aes(value, fill = Species)) +
facet_wrap(~name)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bake the test data
#apply the same recipe to the test data
baked_test <- bake(df_rec, new_data = df_test)
Specify the fits
#make a knn spec
knn_spec <- nearest_neighbor() %>%
set_engine("kknn") %>%
set_mode("classification")
#make a rf spec
rf_spec <- rand_forest() %>%
set_engine("ranger") %>%
set_mode("classification")
Fit the model
#use the knn spec to fit the pre-processed training data
knn_fit <- knn_spec %>%
fit(Species ~., data = df_juiced)
#use the rf spec to fit the pre-processed training data
rf_fit <- rf_spec %>%
fit(Species ~., data = df_juiced)
knn_fit
## parsnip model object
##
## Fit time: 0ms
##
## Call:
## kknn::train.kknn(formula = formula, data = data, ks = 5)
##
## Type of response variable: nominal
## Minimal misclassification: 0.07017544
## Best kernel: optimal
## Best k: 5
rf_fit
## parsnip model object
##
## Fit time: 51ms
## Ranger result
##
## Call:
## ranger::ranger(formula = formula, data = data, num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
##
## Type: Probability estimation
## Number of trees: 500
## Sample size: 114
## Number of independent variables: 4
## Mtry: 2
## Target node size: 10
## Variable importance mode: none
## Splitrule: gini
## OOB prediction error (Brier s.): 0.04429843
Evaluate the models
#knn
knn_fit %>%
predict(baked_test) %>%
bind_cols(baked_test) %>%
metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.972
## 2 kap multiclass 0.958
knn_fit %>%
predict(df_juiced) %>%
bind_cols(df_juiced) %>%
metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.965
## 2 kap multiclass 0.947
#Training: 97.2% accuracy
#Testing: 96.5% accuracy
#rf
rf_fit %>%
predict(baked_test) %>%
bind_cols(baked_test) %>%
metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 1
## 2 kap multiclass 1
rf_fit %>%
predict(df_juiced) %>%
bind_cols(df_juiced) %>%
metrics(truth = Species, estimate = .pred_class)
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy multiclass 0.965
## 2 kap multiclass 0.947
#Training: 100% accuracy
#Testing: 96.5 accuracy
Graph this evaluation manually
results_train <- knn_fit %>%
predict(df_juiced) %>%
mutate(model = "knn",
truth = df_juiced$Species) %>%
bind_rows(rf_fit %>%
predict(df_juiced) %>%
mutate(model = "rf",
truth = df_juiced$Species)
) %>%
mutate(accuracy = if_else(.pred_class == truth, "yes", "no"))
results_test <- knn_fit %>%
predict(baked_test) %>%
mutate(model = "knn",
truth = baked_test$Species) %>%
bind_rows(rf_fit %>%
predict(baked_test) %>%
mutate(model = "rf",
truth = baked_test$Species)
) %>%
mutate(accuracy = if_else(.pred_class == truth, "yes", "no"))
results_train %>%
ggplot() +
geom_bar(aes(accuracy, fill = accuracy)) +
facet_wrap(~ model)

results_test %>%
ggplot() +
geom_bar(aes(accuracy, fill = accuracy)) +
facet_wrap(~ model)

Per classifer metrics (with gain curve)
knn_fit %>%
#probability for each possible predicted value
predict(df_juiced, type = "prob") %>%
bind_cols(df_juiced) %>%
#graph
gain_curve(Species, .pred_setosa:.pred_virginica) %>%
autoplot()

rf_fit %>%
#probability for each possible predicted value
predict(df_juiced, type = "prob") %>%
bind_cols(df_juiced) %>%
#graph
gain_curve(Species, .pred_setosa:.pred_virginica) %>%
autoplot()

Per classifer metrics (with roc curve)
knn_fit %>%
#probability for each possible predicted value
predict(df_juiced, type = "prob") %>%
bind_cols(df_juiced) %>%
#graph
roc_curve(Species, .pred_setosa:.pred_virginica) %>%
autoplot()

rf_fit %>%
#probability for each possible predicted value
predict(df_juiced, type = "prob") %>%
bind_cols(df_juiced) %>%
#graph
roc_curve(Species, .pred_setosa:.pred_virginica) %>%
autoplot()
