Penguins data

The penguins data has 344 rows and 8 columns. Read more about the data at the package website.

penguins <- palmerpenguins::penguins
# Code for info is in appendix
info(penguins)
column type min max zero_count na_count unique_values levels
species factor NA NA NA 0 3 Adelie, Gentoo, Chinstrap
island factor NA NA NA 0 3 Biscoe, Dream, Torgersen
bill_length_mm numeric 32.1 59.6 0 2 164 NA
bill_depth_mm numeric 13.1 21.5 0 2 80 NA
flipper_length_mm integer 172.0 231.0 0 2 55 NA
body_mass_g integer 2700.0 6300.0 0 2 94 NA
sex factor NA NA NA 11 2 male, female
year integer 2007.0 2009.0 0 0 3 NA

Distribution of Sex

The distribution of penguins by sex is 50/50 for all species. The species representation in the sample, however, is not equal.

table(penguins[, c('species', 'sex')], useNA = "always")
##            sex
## species     female male <NA>
##   Adelie        73   73    6
##   Chinstrap     34   34    0
##   Gentoo        58   61    5
##   <NA>           0    0    0
table(penguins$species, useNA = "always") / nrow(penguins)
## 
##    Adelie Chinstrap    Gentoo      <NA> 
## 0.4418605 0.1976744 0.3604651 0.0000000

Islands

Gentoo are all on Biscoe island, Chinstrap are all on Dream island, and Adelie can be found on all three. I am interested in whether the characteristics of the three populations of Adelie are different.

table(penguins[, c('species', 'island')], useNA = 'always')
##            island
## species     Biscoe Dream Torgersen <NA>
##   Adelie        44    56        52    0
##   Chinstrap      0    68         0    0
##   Gentoo       124     0         0    0
##   <NA>           0     0         0    0
adelie <- penguins %>% filter(species == 'Adelie')
table(adelie[, c('island', 'sex')], useNA = 'always')
##            sex
## island      female male <NA>
##   Biscoe        22   22    0
##   Dream         27   28    1
##   Torgersen     24   23    5
##   <NA>           0    0    0

adelie %>% 
  mutate(ID = 1:nrow(.)) %>% 
  drop_na() %>% 
  select(-species, -sex, -year) %>% 
  pivot_longer(
    cols = -c('ID', 'island'), 
    names_to = 'field', 
    values_to = 'value') %>% 
  ggplot() +
  aes(x = value, fill = island) +
  geom_density(alpha = 0.75) +
  facet_wrap(~field, scales = 'free')

There is something interesting with bill_depth_mm on Torgerson island, but nothing appears dramatic enough for me to want to include island.

Based on this sample of birds, if we know the island a bird comes from, there are at most two species it can belong to. In the case of Torgersen island, we know it has to be Adelie. However, I want to build models that do not rely on island.

Pairs Plot

library(GGally)
penguins %>% 
  drop_na %>% 
  select(
    species, 
    bill_length_mm, 
    bill_depth_mm, 
    flipper_length_mm, 
    body_mass_g) %>% 
  GGally::ggpairs(
    progress = FALSE,
    mapping = aes(
      fill = species, color = species),
    columns = setdiff(names(.), 'species'))

If you look at the distributions of bill_depth_mm and body_mass_g and their scatter plot, something really interesting pops out. There is a perfect linear separation of Gentoo from the other two species. It’s as if you could use the ratio of bill depth to body mass to predict whether an individual is Gentoo. Then, use a conditional logistic model to split Adelie from Chinstrap.

Question #1: Logistic Model

Part #1a: response variable

There are three species. To do a true binomial logistic model, we need a response variable with only two classes! I’m going to combine the Adelie and Chinstrap species based on my observations above.

penguins$gentoo <- penguins$species == 'Gentoo'
sum(penguins$gentoo)
## [1] 124

Part #1b: variable and model selection

I’m restricting my model building to the four continuous features. There are only 16 unique combinations of these four (ignoring derived features and interaction terms). It’s easy enough to run all 16.

# The four variables I care about
cols <- c(
  "bill_length_mm", 
  "bill_depth_mm", 
  "flipper_length_mm", 
  "body_mass_g")

# Build a "masking" matrix
a <- c(TRUE, FALSE)
b <- matrix(c(
  rep(a, each = 8, length.out = 16),
  rep(a, each = 4, length.out = 16),
  rep(a, each = 2, length.out = 16),
  rep(a, each = 1, length.out = 16)),
  nrow = 16)

# Temp vectors to store output
converge <- logical()
deviance <- numeric()
aic <- numeric()
mod_cols <- character()

# Iterate through all 16 combinations
for (i in 1:16) {
  r <- b[i, ]
  cols_i <- cols[r]
  suppressWarnings(mod_i <- glm(
    gentoo ~ .,
    data = penguins[, c(cols_i, 'gentoo')],
    family = 'binomial'))
  converge[i] <- mod_i$converged
  deviance[i] <- mod_i$deviance
  aic[i] <- mod_i$aic
}

# Prepare an exhibit
res <- ifelse(b, "X", "")
colnames(res) <- cols
res <- as.data.frame(res)
res$converge <- converge
res$deviance <- ifelse(converge, deviance, NA)
res$aic <- ifelse(converge, aic, NA)
res[c(which(converge), which(!converge)), ] %>% 
  knitr::kable(digits = c(0, 0, 0, 0, 0, 1, 1))
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g converge deviance aic
4 X X TRUE 30.0 36.0
5 X X X TRUE 33.3 41.3
6 X X TRUE 44.8 50.8
7 X X TRUE 115.3 121.3
8 X TRUE 354.8 358.8
12 X TRUE 90.2 94.2
13 X X TRUE 38.8 44.8
14 X TRUE 49.4 53.4
15 X TRUE 117.9 121.9
16 TRUE 449.7 451.7
1 X X X X FALSE NA NA
2 X X X FALSE NA NA
3 X X X FALSE NA NA
9 X X X FALSE NA NA
10 X X FALSE NA NA
11 X X FALSE NA NA

Six of the sixteen do not converge. This is because of the perfect linear separation between Gentoo and the other species in the pairs plot above. All six have bill_depth_mm and at least one of flipper_length_mm and body_mass_g.

Of the ten models that do converge, the two that have the lowest residual deviance are

m1 <- glm(
  gentoo ~ bill_length_mm + bill_depth_mm, 
  data = penguins, family = binomial)

m2 <- glm(
  gentoo ~ bill_length_mm + flipper_length_mm + body_mass_g, 
  data = penguins, family = binomial)

The potential of bill measurements being all that is needed is really attractive in this model. However, the coefficients in that model are severe for a logistic model and make them hard to interpret. It also makes the model very sensitive to anomalous observations.

The coefficients in m2 are much less severe and lead to a more attractive model, with a low impact on AIC and residual deviance.

knitr::kable(tidy(m1), digits = c(0, 3, 1, 1, 4))
term estimate std.error statistic p.value
(Intercept) 49.010 14.8 3.3 9e-04
bill_length_mm 0.558 0.1 4.1 0e+00
bill_depth_mm -4.532 1.1 -4.3 0e+00
knitr::kable(tidy(m2), digits = c(0, 3, 1, 1, 4))
term estimate std.error statistic p.value
(Intercept) -137.761 33.5 -4.1 0.0000
bill_length_mm -0.302 0.2 -2.0 0.0507
flipper_length_mm 0.636 0.2 3.8 0.0001
body_mass_g 0.005 0.0 2.6 0.0098

I select m2, using bill_length_mm + flipper_length_mm + body_mass_g.

Part #1c: variable interpretations

I’d like to discuss what the coefficients of a binomial model mean. This will help in understanding the output of m2 and why m1 is not a great choice.

Use \(L\) for the linear predictor or \(L = \beta_0 + \beta_1 x_1 + \cdots\). The model is then

\[ \begin{aligned} L &= \log \frac{p}{1-p} \\ e^L &= \frac{p}{1-p} \end{aligned} \] It’s helpful to have an aid in understanding how changes in the linear predictor affect \(p\).

p e^L L
0.0 0.00 -Inf
0.01 1/99 -4.6
0.25 1/3 -1.1
0.5 1.0 0.00
0.75 3.0 1.1
0.99 99.0 4.6
1.0 Inf Inf

A change of 4.6 x 2 is either direction changes the model output from a certainty of being in one class to a certainty of being in the other. m1’s coefficient for bill_depth_mm is -4.5. A change of just 2mm holding everything else equal is enough to change \(p\) from 0.01 to 0.99 or vice-versa. It is just too sensitive. With m2 to see a 10% change in \(p\), bill length would have to change by about 3mm, flipper length by 1.5mm, or body mass by 200g. I believe this is a helpful way to understand the coefficients of the model and their impact on the overall probability of being Gentoo.

Question #2: model diagnostics

See appendix for code of model_objects.

mo2 <- model_objects(m2)

# AUC
mo2$auc
## [1] 0.9983666

# Accuracy
mo2$accuracy
## [1] 0.9766082

# True positive rate aka Sensitivity
mo2$sensitivity
## [1] 0.9756098

# True negative rate aka Specificity
mo2$specificity
## [1] 0.9771689

# False positive rate = 1 - TNR
1 - mo2$specificity
## [1] 0.02283105

# False negative rate = 1 - TPR
1 - mo2$sensitivity
## [1] 0.02439024

Question #3: multinomial logistic regression

Since there are only 16 models, I’ll do the same as above and check convergence of all 16.

# The four variables I care about
cols <- c(
  "bill_length_mm", 
  "bill_depth_mm", 
  "flipper_length_mm", 
  "body_mass_g")

# Build a "masking" matrix
a <- c(TRUE, FALSE)
b <- matrix(c(
  rep(a, each = 8, length.out = 16),
  rep(a, each = 4, length.out = 16),
  rep(a, each = 2, length.out = 16),
  rep(a, each = 1, length.out = 16)),
  nrow = 16)

# Temp vectors to store output
converge <- logical()
deviance <- numeric()
aic <- numeric()
mod_cols <- character()

# Iterate through all 16 combinations
for (i in 1:16) {
  r <- b[i, ]
  cols_i <- cols[r]
  capture.output(suppressWarnings(mod_i <- nnet::multinom(
    species ~ .,
    data = penguins[, c(cols_i, 'species')])))
  converge[i] <- mod_i$convergence == 0
  deviance[i] <- mod_i$deviance
  aic[i] <- mod_i$AIC
}

# Prepare an exhibit
res <- ifelse(b, "X", "")
colnames(res) <- cols
res <- as.data.frame(res)
res$converge <- converge
res$deviance <- ifelse(converge, deviance, NA)
res$aic <- ifelse(converge, aic, NA)
res[c(which(converge), which(!converge)), ] %>% 
  knitr::kable(digits = c(0, 0, 0, 0, 0, 1, 1))
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g converge deviance aic
2 X X X TRUE 15.3 31.3
4 X X TRUE 47.9 59.9
5 X X X TRUE 27.0 43.0
6 X X TRUE 78.1 90.1
7 X X TRUE 77.6 89.6
8 X TRUE 332.7 340.7
9 X X X TRUE 229.5 245.5
10 X X TRUE 234.3 246.3
11 X X TRUE 271.1 283.1
12 X TRUE 361.4 369.4
13 X X TRUE 268.7 280.7
14 X TRUE 287.7 295.7
15 X TRUE 388.9 396.9
16 TRUE 721.8 725.8
1 X X X X FALSE NA NA
3 X X X FALSE NA NA

This time, only two do not converge. Those are the two that include bill_length_mm, bill_depth_mm, and body_mass_g.

The two best models, by residual deviance and AIC, are

multi1 <- nnet::multinom(
    species ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
    data = penguins)

multi2 <- nnet::multinom(
    species ~ bill_length_mm + flipper_length_mm + body_mass_g,
    data = penguins)

I’m selecting multi2 because it has only slight worse AIC and deviance, uses the same variables as the 2-class problem above, and the coefficients are again less severe.

tidy(multi2) %>% 
  knitr::kable(digits = c(0, 0, 2, 3, 1, 3))
y.level term estimate std.error statistic p.value
Chinstrap (Intercept) -95.24 0.002 -43113.0 0.000
Chinstrap bill_length_mm 5.73 0.007 811.0 0.000
Chinstrap flipper_length_mm -0.31 0.043 -7.3 0.000
Chinstrap body_mass_g -0.03 0.002 -11.6 0.000
Gentoo (Intercept) -158.00 0.000 -583534.5 0.000
Gentoo bill_length_mm 0.67 0.286 2.4 0.018
Gentoo flipper_length_mm 0.55 0.071 7.7 0.000
Gentoo body_mass_g 0.00 0.002 1.6 0.115

Question #4: diagnotics for multinomial models

I’d like to do something similar to the binomial case

Here are some examples using the 2-class case.

Plot ROC

plot_roc(mo2)

Faraway halfnorm plots

halfnorm_plot(mo2)

Hosmer-Lemeshow Plot

hosmer_lemeshow(mo2)
## `summarise()` ungrouping output (override with `.groups` argument)
## $plot

## 
## $hlstat
## [1] 1.33466e+11
## 
## $pval
## [1] 0

Appendix

Packages

search()
##  [1] ".GlobalEnv"             "package:GGally"         "package:palmerpenguins"
##  [4] "package:faraway"        "package:broom"          "package:pROC"          
##  [7] "package:caret"          "package:lattice"        "package:tree"          
## [10] "package:forcats"        "package:stringr"        "package:dplyr"         
## [13] "package:purrr"          "package:readr"          "package:tidyr"         
## [16] "package:tibble"         "package:ggplot2"        "package:tidyverse"     
## [19] "package:knitr"          "package:alrtools"       "package:stats"         
## [22] "package:graphics"       "package:grDevices"      "package:utils"         
## [25] "package:datasets"       "package:methods"        "Autoloads"             
## [28] "package:base"

Code

model_objects <- function(m) {
  # m response has to be TRUE/FALSE
  TF <- c(TRUE, FALSE)
  out <- list()
  out$m <- m
  out$actual <- factor(
    unname(model.response(model.frame(m))),
    levels = TF)
  N <- nrow(m$data)
  out$prob <- predict(m, type = "response")
  out$prob_lin <- predict(m)
  out$class <- ifelse(out$prob > 0.5, TRUE, FALSE) %>% 
    factor(labels = TF, levels = TF)
  out$cm <- caret::confusionMatrix(
    out$class, out$actual, positive = 'TRUE')
  
  out$broom <- broom::tidy(m)
  out$broom$exp_estimate <- exp(out$broom$estimate)
  out$roc <- suppressMessages(pROC::roc(out$actual, out$prob))
  out$auc <- out$roc$auc %>% as.numeric
  out$chisq <- 1 - pchisq(
    m$null.deviance - m$deviance, m$df.null - m$df.residual)
  out$confint <- suppressMessages(suppressWarnings(confint(m)))
  out$R2 <- (1-exp((m$deviance-m$null.deviance)/N))/(1-exp(-m$null.deviance/N))
  
  A <- out$cm$table[1, 1]
  B <- out$cm$table[1, 2]
  C <- out$cm$table[2, 1]
  D <- out$cm$table[2, 2]
  
  out$accuracy <- (A + D) / (A + B + C + D)
  out$classification_error_rate <- (B + C) / (A + B + C + D)
  out$precision <- A / (A + B)
  out$sensitivity <- recall <- A / (A + C)
  out$specificity <- D / (D + B)
  out$F1_score <- 
    (2 * out$precision * out$sensitivity) / (out$precision + out$sensitivity)
  out$PPV <- A / (A + B)
  out$NPV <- D / (C + D)
  
  return(out)
}
halfnorm_plot <- function(mo) {
  faraway::halfnorm(hatvalues(mo$m))
}
plot_roc <- function(mo) {
  orig_pty <- par()$pty
  par(pty="s")
  on.exit(par(pty = orig_pty))
  plot(
    mo$roc, 
    print.auc = TRUE, 
    auc.polygon = TRUE, 
    grid = c(0.1, 0.2),
    grid.col = c("green", "red"), 
    max.auc.polygon = TRUE,
    auc.polygon.col = "lightblue", 
    print.thres = FALSE)
}
hosmer_lemeshow <- function(mo) {
  out <- list()
  M <- floor(nrow(mo$m$data) / 30)
  qs <- (0:M)/M
  breaks <- unique(quantile(mo$prob_lin, qs))
  buckets <- cut(mo$prob_lin, breaks = breaks)
  
  d <- data.frame(
    buckets = buckets, 
    class = as.integer(mo$actual) - 1, 
    prob = mo$prob)
  
  gd <- d %>% 
    group_by(buckets) %>% 
    summarise(
      y = sum(class), 
      ppred = mean(prob), 
      count = n()) %>% 
    mutate(
      se.fit = sqrt(ppred*(1-ppred)/count))
  
  out$plot <- ggplot(
    gd,
    aes(x=ppred,y=y/count,ymin=y/count-2*se.fit,ymax=y/count+2*se.fit))+
    geom_point()+
    geom_linerange(color=grey(0.75))+
    geom_abline(intercept=0,slope=1)+
    xlab("Predicted Probability")+
    ylab("Observed Proportion")
  
  out$hlstat <- with(
    gd, 
    sum( (y-count*ppred)^2/(count*ppred*(1-ppred))))
  
  out$pval <- 1 - pchisq(out$hlstat, nrow(gd) - 1)
  return(out)
}
info <- function(x, ...) {
  UseMethod("info")
}
info.data.frame <- function(dataframe) {
  
  out <- NULL
  for (i in 1:ncol(dataframe)) {
    
    v <- dataframe[, i, drop = TRUE]
    num <- is.numeric(v) | is.logical(v)
    lv <- length(v)
    vn <- v[!is.na(v)]
    uniques <- length(unique(vn))
    typ <- class(v)
    
    if (num) {
      qv <- stats::quantile(
        vn, 
        probs = c(0, 0.25, 0.5, 0.75, 1))
      sdv <- sd(vn)
      muv <- mean(vn)
      zero_count = sum(vn == 0)
      lvls <- NA
    } else {
      qv <- rep(NA, 5)
      sdv <- NA
      muv <- NA
      zero_count = NA
      lvls <- table(v) %>% sort(decreasing = TRUE) %>% 
        names %>% `[`(., 1:min(10, length(.))) %>% paste0(collapse = ', ')
    }
    
    qvr <- data.frame(
      row.names = i,
      column = names(dataframe)[i],
      type = typ,
      min = qv[1],
      q25 = qv[2],
      median = qv[3],
      mean = muv,
      q75 = qv[4],
      max = qv[5],
      sd = sdv,
      zero_count,
      # sd2ratio = sum(abs(vn - muv) < 2 * sdv) / length(vn),
      na_count = lv - length(vn),
      unique_values = uniques,
      levels = lvls)
    
    if (is.null(out)) { 
      out <- qvr } else { out <- rbind(out, qvr) }  
    
  }
  return(out)
}