library(tidyverse)
library(tidymodels)
library(scales)
theme_set(theme_light())
library(ggstatsplot)
penguin <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-28/penguins.csv')
Change character variables into factors
penguin <- penguin %>%
mutate_if(is.character, as.factor)
There are three penguin species across three different islands, we can see the spread of species over different island.
table(penguin$species, penguin$island)
##
## Biscoe Dream Torgersen
## Adelie 44 56 52
## Chinstrap 0 68 0
## Gentoo 124 0 0
We Adelie penguin evenly spread across the different islands, while Chinstrap and Gentoo are uniquely located on one island only.
There are size and weight measurements in the data set, we can have a look at their correlartions.
ggcorrmat(
penguin,
sig.level = 0.05,
cor.vars = c(bill_length_mm:body_mass_g),
cor.vars.names = c(
"Bill Length (mm)",
"Bill Depth (mm)",
"Flipper Length (mm)",
"Body Mass (g)"
),
title = "Size and Mass Correlation Matrix"
)
From the correlation plot, we can see the size is highly correlated with flipper length. Size is also correlated with bill length, but negative correlated with width. This is probably due to evlution affect on diet, as there may be a restriction on the size of bill, increase in it’s length will reduce it’s width.
penguin %>%
filter(!is.na(sex)) %>%
ggplot(aes(
x = fct_reorder(species, body_mass_g),
y = body_mass_g,
fill = sex
)) +
geom_boxplot() +
labs(x = "Species",
y = "Body Mass (g)",
title = "Body Mass Distribution of Different Penguin Species")
Male penguin is bigger (has higher body mass) then female penguin as expected. It is clear that Gentoo penguin has substantically big than the other two species.
As noted in the cross table, Adelie lives all three islands, we can also check if the location affect the body size.
penguin %>%
ggplot(aes(
x = fct_reorder(island, body_mass_g),
y = body_mass_g,
fill = species
)) +
geom_boxplot() +
labs(x = "Island",
y = "Body Mass (g)",
title = "Body Mass Distribution of Different Island")
We see there is no difference for Adelie penguins at different island, so the island are similiar in it’s enviroment, not affecting the Adelie penguin to change it behaviour or physiology.
Let’s have a look at the filper length and bill length.
penguin %>%
filter(!is.na(sex)) %>%
ggplot(aes(flipper_length_mm, bill_length_mm, colour = sex)) +
geom_point(size = 3) +
facet_wrap(~ species) +
labs(x = "Flipper Length (mm)",
y = "Bill Length (mm)",
title = "Flipper Length vs Bill Length by Species")
Interestingly, Chinstrap penguin has similiar flipper and bill length to Gentoo, even though we have seen from the previous plot, they have similar size to the Adelie penguin.
Lastly, we check year to see if there is any change
penguin %>%
filter(!is.na(sex)) %>%
mutate(year = as_factor(year)) %>%
ggplot(aes(species, body_mass_g, colour = year)) +
geom_jitter(size = 3) +
labs(x = "Species",
y = "Body Mass (g)",
title = "Body Mass change over the Year")
All points are overlapping each other, so there are no difference beween each year.
## Split
set.seed(123)
penguin_split <- penguin %>%
filter(!is.na(sex)) %>%
select(-year) %>%
initial_split(strata = sex)
penguin_train <- training(penguin_split)
penguin_test <- testing(penguin_split)
set.seed(234)
penguin_boot <- bootstraps(penguin_train)
Set models
glm_spec <- logistic_reg() %>%
set_engine("glm")
stan_spec <- logistic_reg() %>%
set_engine("stan")
rf_spec <- rand_forest() %>%
set_mode("classification") %>%
set_engine("ranger")
Set work flow
penguin_wf <- workflow() %>%
add_formula(sex ~ .)
model_glm <- penguin_wf %>%
add_model(glm_spec) %>%
fit_resamples(
resamples = penguin_boot,
control = control_resamples(save_pred = TRUE, verbose = TRUE)
)
model_stan <- penguin_wf %>%
add_model(stan_spec) %>%
fit_resamples(
resamples = penguin_boot,
control = control_resamples(save_pred = TRUE, verbose = TRUE)
)
model_rf <- penguin_wf %>%
add_model(rf_spec) %>%
fit_resamples(
resamples = penguin_boot,
control = control_resamples(save_pred = TRUE, verbose = TRUE)
)
Evalute models
collect_metrics(model_glm)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.885 25 0.00525 Preprocessor1_Model1
## 2 roc_auc binary 0.957 25 0.00297 Preprocessor1_Model1
collect_metrics(model_stan) ## bayesian model
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.890 25 0.00539 Preprocessor1_Model1
## 2 roc_auc binary 0.959 25 0.00271 Preprocessor1_Model1
collect_metrics(model_rf)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.895 25 0.00699 Preprocessor1_Model1
## 2 roc_auc binary 0.959 25 0.00305 Preprocessor1_Model1
Pick
model_glm %>%
conf_mat_resampled()
## # A tibble: 4 x 3
## Prediction Truth Freq
## <fct> <fct> <dbl>
## 1 female female 40.3
## 2 female male 5.76
## 3 male female 4.84
## 4 male male 40.9
model_glm %>%
collect_predictions() %>%
group_by(id) %>%
roc_curve(sex, .pred_female) %>%
autoplot()
Final fit on testing set
model_final <- penguin_wf %>%
add_model(glm_spec) %>%
last_fit(penguin_split)
model_final$.workflow[[1]] %>%
tidy(exponentiate = TRUE) ## covert log odds to odds ratio
## # A tibble: 9 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.30e-35 13.5 -5.91 0.00000000342
## 2 speciesChinstrap 1.31e- 3 1.75 -3.78 0.000154
## 3 speciesGentoo 8.41e- 5 2.95 -3.18 0.00149
## 4 islandDream 7.36e- 1 0.917 -0.334 0.738
## 5 islandTorgersen 6.73e- 1 0.913 -0.433 0.665
## 6 bill_length_mm 1.79e+ 0 0.139 4.19 0.0000283
## 7 bill_depth_mm 3.92e+ 0 0.374 3.66 0.000257
## 8 flipper_length_mm 1.08e+ 0 0.0536 1.35 0.176
## 9 body_mass_g 1.01e+ 0 0.00108 4.64 0.00000340
(Reference level is female Adelie penguin on Biscoe) From the significant p values, we see increase in body mass by 1g increases the odds of being male by 1, which is what we observed that male is larger than female. Flipper length is not a significant factor to determine the sex of the penguin. Increase in bill size does, every 1mm increase in bill depth has twice the odds of of being male than 1mm increase in bill length. Island doesn’t affect the predication as noted before. Species matter, though the effect is negligible.