Finally, no more iris flowers: presenting penguins!

Tural Sadigov

Hamilton College

Follow along

https://rpubs.com/tsadigov/ada_svm_penguins_2022

Data : palmer penguins

Source

Artwork by @allison_horst

Dream Island

Loading data

library(palmerpenguins)
data(package = 'palmerpenguins')

Data are available by CC-0 license in accordance with the Palmer Station LTER Data Policy and the LTER Data Access Policy for Type I data.

Penguins data

library(tidyverse)
penguins %>% 
  select(-sex) %>% 
  head(5)

Penguins data

library(tidyverse)
penguins %>% 
  select(-sex) %>% 
  head(5)
# A tibble: 5 × 6
  species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
  <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
1 Adelie  Torgersen           39.1          18.7               181        3750
2 Adelie  Torgersen           39.5          17.4               186        3800
3 Adelie  Torgersen           40.3          18                 195        3250
4 Adelie  Torgersen           NA            NA                  NA          NA
5 Adelie  Torgersen           36.7          19.3               193        3450

Penguin measurements: bill dimensions

Penguin measurements: flipper length

Summary of the categorical data

penguins %>% 
  select(where(is.factor)) %>% 
  summary()

Summary of the categorical data

penguins %>% 
  select(where(is.factor)) %>% 
  summary()
      species          island        sex     
 Adelie   :152   Biscoe   :168   female:165  
 Chinstrap: 68   Dream    :124   male  :168  
 Gentoo   :124   Torgersen: 52   NA's  : 11  

Summary of the categorical data: detailed

penguins %>% 
  count(species, sex)

Summary of the categorical data: detailed

penguins %>% 
  count(species, sex)
# A tibble: 8 × 3
  species   sex        n
  <fct>     <fct>  <int>
1 Adelie    female    73
2 Adelie    male      73
3 Adelie    <NA>       6
4 Chinstrap female    34
5 Chinstrap male      34
6 Gentoo    female    58
7 Gentoo    male      61
8 Gentoo    <NA>       5

Summary of the categorical data: detailed

penguins %>% 
  drop_na() %>% 
  count(species, sex)

Summary of the categorical data: detailed

penguins %>% 
  drop_na() %>% 
  count(species, sex)
# A tibble: 6 × 3
  species   sex        n
  <fct>     <fct>  <int>
1 Adelie    female    73
2 Adelie    male      73
3 Chinstrap female    34
4 Chinstrap male      34
5 Gentoo    female    58
6 Gentoo    male      61

Numerical summaries grouped by species

penguins %>% 
  drop_na() %>% 
  group_by(species) %>% 
  summarize(across(where(is.numeric), 
                   mean))

Numerical summaries grouped by species

penguins %>% 
  drop_na() %>% 
  group_by(species) %>% 
  summarize(across(where(is.numeric), 
                   mean))
# A tibble: 3 × 6
  species   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g  year
  <fct>              <dbl>         <dbl>             <dbl>       <dbl> <dbl>
1 Adelie              38.8          18.3              190.       3706. 2008.
2 Chinstrap           48.8          18.4              196.       3733. 2008.
3 Gentoo              47.6          15.0              217.       5092. 2008.

Numerical summaries grouped by gender

penguins %>% 
  drop_na() %>% 
  group_by(sex) %>% 
  summarize(across(where(is.numeric), 
                   mean))

Numerical summaries grouped by gender

penguins %>% 
  drop_na() %>% 
  group_by(sex) %>% 
  summarize(across(where(is.numeric), 
                   mean))
# A tibble: 2 × 6
  sex    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g  year
  <fct>           <dbl>         <dbl>             <dbl>       <dbl> <dbl>
1 female           42.1          16.4              197.       3862. 2008.
2 male             45.9          17.9              205.       4546. 2008.

Scatterplot colored by species

penguins %>% 
  drop_na() %>%  
  ggplot(aes(x = bill_length_mm, 
             y = bill_depth_mm, 
             color = species, 
             size = body_mass_g)) +
  geom_point(alpha = 0.8) +
  xlab("Bill length (mm)") +
  ylab("Bill depth (mm)")

Scatterplot colored by gender

penguins %>% 
  drop_na() %>%  
  ggplot(aes(x = bill_length_mm, 
             y = bill_depth_mm, 
             color =sex, 
             size = body_mass_g)) +
  geom_point(alpha = 0.8) +
  xlab("Bill length (mm)") +
  ylab("Bill depth (mm)")

Scatterplot faceted with species

penguins %>% 
  drop_na() %>%  
  ggplot(aes(x = bill_length_mm, 
             y = bill_depth_mm, 
             color =sex, 
             size = body_mass_g)) +
  geom_point(alpha = 0.8) +
  facet_wrap(~species)+
  xlab("Bill length (mm)") +
  ylab("Bill depth (mm)")

Histogram faceted with gender

3D - Visual summary: colored by species

library(plotly)
plot_ly(x=penguins$bill_length_mm, 
        y=penguins$bill_depth_mm, 
        z=penguins$flipper_length_mm, 
        type="scatter3d", 
        mode="markers", 
        color=penguins$species)

3D - Visual summary: colored by gender

Let the prediction begin!

Lets see Support Vector Machines in action to predict the gender of a penguin.

Drop island and load tidymodels

penguins_df <- 
  penguins %>% 
  drop_na() %>% 
  select(-island)
library(tidymodels)

set.seed(123)
penguins_df %>% 
  sample_n(10)

Drop island and load tidymodels

penguins_df <- 
  penguins %>% 
  drop_na() %>% 
  select(-island)
library(tidymodels)

set.seed(123)
penguins_df %>% 
  sample_n(10)
# A tibble: 10 × 6
   species   bill_length_mm bill_depth_mm flipper_length_mm body_mass_g sex   
   <fct>              <dbl>         <dbl>             <int>       <int> <fct> 
 1 Gentoo              59.6          17                 230        6050 male  
 2 Adelie              34.4          18.4               184        3325 female
 3 Gentoo              45.2          15.8               215        5300 male  
 4 Chinstrap           49            19.5               210        3950 male  
 5 Adelie              41.4          18.5               202        3875 male  
 6 Chinstrap           51            18.8               203        4100 male  
 7 Gentoo              44.9          13.8               212        4750 female
 8 Gentoo              51.1          16.5               225        5250 male  
 9 Chinstrap           50.8          19                 210        4100 male  
10 Gentoo              45.4          14.6               211        4800 female

Split the data

set.seed(2022)
penguin_split <- initial_split(penguins_df, 
                               strata = species, 
                               prop = .70)
penguin_train <- training(penguin_split)
penguin_split <- testing(penguin_split)

set.seed(1234)
penguin_cv <- vfold_cv(penguin_train, v = 5)
penguin_cv

Split the data

set.seed(2022)
penguin_split <- initial_split(penguins_df, 
                               strata = sex, 
                               prop = .70)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)

set.seed(1234)
penguin_cv <- vfold_cv(penguin_train, v = 5)
penguin_cv
#  5-fold cross-validation 
# A tibble: 5 × 2
  splits           id   
  <list>           <chr>
1 <split [185/47]> Fold1
2 <split [185/47]> Fold2
3 <split [186/46]> Fold3
4 <split [186/46]> Fold4
5 <split [186/46]> Fold5

Model specifications

# support vector classifier
svc_spec <- 
  svm_linear() %>% 
  set_engine("kernlab") %>% 
  set_mode("classification")

Model specifications

# support vector classifier
svc_spec <- 
  svm_linear() %>% 
  set_engine("kernlab") %>% 
  set_mode("classification")

# support vector machine with radial kernel
svm_rad_spec <- 
  svm_rbf() %>% 
  set_engine("kernlab") %>% 
  set_mode("classification")

Model specifications

# support vector classifier
svc_spec <- 
  svm_linear() %>% 
  set_engine("kernlab") %>% 
  set_mode("classification")

# support vector machine with radial kernel
svm_rad_spec <- 
  svm_rbf() %>% 
  set_engine("kernlab") %>% 
  set_mode("classification")

Workflows (i.e., ML pipeline) and model fitting

penguin_wf <- 
  workflow() %>% 
  add_formula(sex ~ .)

Workflows (i.e., ML pipeline) and model fitting

penguin_wf <- 
  workflow() %>% 
  add_formula(sex ~ .)

svc_results <-
  penguin_wf %>% 
  add_model(spec = svc_spec) %>% 
  fit_resamples(penguin_cv)

Workflows (i.e., ML pipeline) and model fitting

penguin_wf <- 
  workflow() %>% 
  add_formula(sex ~ .)

svc_results <-
  penguin_wf %>% 
  add_model(spec = svc_spec) %>% 
  fit_resamples(penguin_cv)

svm_results <-
  penguin_wf %>% 
  add_model(spec = svm_rad_spec) %>% 
  fit_resamples(penguin_cv)

Workflows (i.e., ML pipeline) and model fitting

penguin_wf <- 
  workflow() %>% 
  add_formula(sex ~ .)

svc_results <-
  penguin_wf %>% 
  add_model(spec = svc_spec) %>% 
  fit_resamples(penguin_cv)

svm_results <-
  penguin_wf %>% 
  add_model(spec = svm_rad_spec) %>% 
  fit_resamples(penguin_cv)

Model evaluation

collect_metrics(svc_results)

Model evaluation

collect_metrics(svc_results)
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.914     5 0.0155  Preprocessor1_Model1
2 roc_auc  binary     0.972     5 0.00644 Preprocessor1_Model1
collect_metrics(svm_results)

Model evaluation

collect_metrics(svc_results)
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.914     5 0.0155  Preprocessor1_Model1
2 roc_auc  binary     0.972     5 0.00644 Preprocessor1_Model1
collect_metrics(svm_results)
# A tibble: 2 × 6
  .metric  .estimator  mean     n std_err .config             
  <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
1 accuracy binary     0.940     5 0.0141  Preprocessor1_Model1
2 roc_auc  binary     0.977     5 0.00543 Preprocessor1_Model1

Let’s choose SVM with radial kernel!

Fit SVM for the whole training data

penguin_final <- 
  penguin_wf %>% 
  add_model(svm_rad_spec) %>%
  last_fit(penguin_split)

Prediction accuracy on testing data

collect_metrics(penguin_final)

Prediction accuracy on testing data

collect_metrics(penguin_final)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.911 Preprocessor1_Model1
2 roc_auc  binary         0.958 Preprocessor1_Model1

Predictions on testing data

collect_predictions(penguin_final)

Predictions on testing data

collect_predictions(penguin_final) %>% 
  select(-.config)
# A tibble: 101 × 6
   id               .pred_female .pred_male  .row .pred_class sex   
   <chr>                   <dbl>      <dbl> <int> <fct>       <fct> 
 1 train/test split      0.990      0.00990    14 female      female
 2 train/test split      0.784      0.216      16 female      female
 3 train/test split      0.700      0.300      18 female      female
 4 train/test split      0.210      0.790      19 male        male  
 5 train/test split      0.00461    0.995      25 male        male  
 6 train/test split      0.0285     0.972      31 male        male  
 7 train/test split      0.0696     0.930      33 male        female
 8 train/test split      0.996      0.00401    40 female      female
 9 train/test split      0.0293     0.971      44 male        male  
10 train/test split      0.00938    0.991      48 male        male  
# … with 91 more rows
# ℹ Use `print(n = ...)` to see more rows

Confusion matrix on testing data

collect_predictions(penguin_final) %>% 
  conf_mat(sex, .pred_class)

Confusion matrix on testing data

collect_predictions(penguin_final) %>% 
  conf_mat(sex, .pred_class)
          Truth
Prediction female male
    female     45    4
    male        5   47

Sanity check!

Manually calculate accuracy

M <- collect_predictions(penguin_final) %>% 
  conf_mat(sex, .pred_class)

(M$table[1,1] + M$table[2,2])/nrow(penguin_test)

Sanity check!

Manually calculate accuracy

M <- collect_predictions(penguin_final) %>% 
  conf_mat(sex, .pred_class)

(M$table[1,1] + M$table[2,2])/nrow(penguin_test)
[1] 0.9108911

Sanity check!

Manually calculate accuracy

M <- collect_predictions(penguin_final) %>% 
  conf_mat(sex, .pred_class)

(M$table[1,1] + M$table[2,2])/nrow(penguin_test)
[1] 0.9108911

Collect accuracy

collect_metrics(penguin_final)

Sanity check!

Manually calculate accuracy

M <- collect_predictions(penguin_final) %>% 
  conf_mat(sex, .pred_class)

(M$table[1,1] + M$table[2,2])/nrow(penguin_test)
[1] 0.9108911

Collect accuracy

collect_metrics(penguin_final)
# A tibble: 2 × 4
  .metric  .estimator .estimate .config             
  <chr>    <chr>          <dbl> <chr>               
1 accuracy binary         0.911 Preprocessor1_Model1
2 roc_auc  binary         0.958 Preprocessor1_Model1

Back to the presentation

Summary slide