Load Package

library(tidyverse)
library(tidymodels)
library(scales)
theme_set(theme_light())
library(ggstatsplot)
penguin <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-28/penguins.csv')

Change character variables into factors

penguin <- penguin %>%
  mutate_if(is.character, as.factor)

Statistical analysis

There are three penguin species across three different islands, we can see the spread of species over different island.

table(penguin$species, penguin$island)
##            
##             Biscoe Dream Torgersen
##   Adelie        44    56        52
##   Chinstrap      0    68         0
##   Gentoo       124     0         0

We Adelie penguin evenly spread across the different islands, while Chinstrap and Gentoo are uniquely located on one island only.

There are size and weight measurements in the data set, we can have a look at their correlartions.

ggcorrmat(
  penguin,
  sig.level = 0.05,
  cor.vars = c(bill_length_mm:body_mass_g),
  cor.vars.names = c(
    "Bill Length (mm)",
    "Bill Depth (mm)",
    "Flipper Length (mm)",
    "Body Mass (g)"
  ),
  title = "Size and Mass Correlation Matrix"
)

From the correlation plot, we can see the size is highly correlated with flipper length. Size is also correlated with bill length, but negative correlated with width. This is probably due to evlution affect on diet, as there may be a restriction on the size of bill, increase in it’s length will reduce it’s width.

penguin %>%
  filter(!is.na(sex)) %>%
  ggplot(aes(
    x = fct_reorder(species, body_mass_g),
    y = body_mass_g,
    fill = sex
  )) +
  geom_boxplot() +
  labs(x = "Species",
       y = "Body Mass (g)",
       title = "Body Mass Distribution of Different Penguin Species") 

Male penguin is bigger (has higher body mass) then female penguin as expected. It is clear that Gentoo penguin has substantically big than the other two species.

As noted in the cross table, Adelie lives all three islands, we can also check if the location affect the body size.

penguin %>%
  ggplot(aes(
    x = fct_reorder(island, body_mass_g),
    y = body_mass_g,
    fill = species
  )) +
  geom_boxplot() +
  labs(x = "Island",
       y = "Body Mass (g)",
       title = "Body Mass Distribution of Different Island") 

We see there is no difference for Adelie penguins at different island, so the island are similiar in it’s enviroment, not affecting the Adelie penguin to change it behaviour or physiology.

Let’s have a look at the filper length and bill length.

penguin %>% 
  filter(!is.na(sex)) %>% 
  ggplot(aes(flipper_length_mm, bill_length_mm, colour = sex)) +
  geom_point(size = 3) +
  facet_wrap(~ species) +
  labs(x = "Flipper Length (mm)",
       y = "Bill Length (mm)",
       title = "Flipper Length vs Bill Length by Species")

Interestingly, Chinstrap penguin has similiar flipper and bill length to Gentoo, even though we have seen from the previous plot, they have similar size to the Adelie penguin.

Lastly, we check year to see if there is any change

penguin %>%
  filter(!is.na(sex)) %>%
  mutate(year = as_factor(year)) %>%
  ggplot(aes(species, body_mass_g, colour = year)) +
  geom_jitter(size = 3) +
  labs(x = "Species",
       y = "Body Mass (g)",
       title = "Body Mass change over the Year")

All points are overlapping each other, so there are no difference beween each year.

Modelling (Classification)

## Split 
set.seed(123)
penguin_split <- penguin %>%
  filter(!is.na(sex)) %>% 
  select(-year) %>%
  initial_split(strata = sex)

penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)

set.seed(234)
penguin_boot <- bootstraps(penguin_train)

Set models

glm_spec <- logistic_reg() %>% 
  set_engine("glm")

stan_spec <- logistic_reg() %>% 
  set_engine("stan")

rf_spec <- rand_forest() %>% 
  set_mode("classification") %>% 
  set_engine("ranger")

Set work flow

penguin_wf <- workflow() %>%
  add_formula(sex ~ .)  
model_glm <- penguin_wf %>% 
  add_model(glm_spec) %>% 
  fit_resamples(
    resamples = penguin_boot,
    control = control_resamples(save_pred = TRUE, verbose = TRUE)
  )

model_stan <- penguin_wf %>% 
  add_model(stan_spec) %>% 
  fit_resamples(
    resamples = penguin_boot,
    control = control_resamples(save_pred = TRUE, verbose = TRUE)
  )

model_rf <- penguin_wf %>% 
  add_model(rf_spec) %>% 
  fit_resamples(
    resamples = penguin_boot,
    control = control_resamples(save_pred = TRUE, verbose = TRUE)
  )

Evalute models

collect_metrics(model_glm)
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.885    25 0.00525 Preprocessor1_Model1
## 2 roc_auc  binary     0.957    25 0.00297 Preprocessor1_Model1
collect_metrics(model_stan) ## bayesian model
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.890    25 0.00539 Preprocessor1_Model1
## 2 roc_auc  binary     0.959    25 0.00271 Preprocessor1_Model1
collect_metrics(model_rf)
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n std_err .config             
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy binary     0.895    25 0.00699 Preprocessor1_Model1
## 2 roc_auc  binary     0.959    25 0.00305 Preprocessor1_Model1

Pick

model_glm %>% 
  conf_mat_resampled()
## # A tibble: 4 x 3
##   Prediction Truth   Freq
##   <fct>      <fct>  <dbl>
## 1 female     female 40.3 
## 2 female     male    5.76
## 3 male       female  4.84
## 4 male       male   40.9
model_glm %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(sex, .pred_female) %>%
  autoplot()

Final fit on testing set

model_final <- penguin_wf %>% 
  add_model(glm_spec) %>% 
  last_fit(penguin_split)
model_final$.workflow[[1]] %>%  
  tidy(exponentiate = TRUE) ## covert log odds to odds ratio
## # A tibble: 9 x 5
##   term              estimate std.error statistic       p.value
##   <chr>                <dbl>     <dbl>     <dbl>         <dbl>
## 1 (Intercept)       2.30e-35  13.5        -5.91  0.00000000342
## 2 speciesChinstrap  1.31e- 3   1.75       -3.78  0.000154     
## 3 speciesGentoo     8.41e- 5   2.95       -3.18  0.00149      
## 4 islandDream       7.36e- 1   0.917      -0.334 0.738        
## 5 islandTorgersen   6.73e- 1   0.913      -0.433 0.665        
## 6 bill_length_mm    1.79e+ 0   0.139       4.19  0.0000283    
## 7 bill_depth_mm     3.92e+ 0   0.374       3.66  0.000257     
## 8 flipper_length_mm 1.08e+ 0   0.0536      1.35  0.176        
## 9 body_mass_g       1.01e+ 0   0.00108     4.64  0.00000340

(Reference level is female Adelie penguin on Biscoe) From the significant p values, we see increase in body mass by 1g increases the odds of being male by 1, which is what we observed that male is larger than female. Flipper length is not a significant factor to determine the sex of the penguin. Increase in bill size does, every 1mm increase in bill depth has twice the odds of of being male than 1mm increase in bill length. Island doesn’t affect the predication as noted before. Species matter, though the effect is negligible.