CUNY Spring 2021 Data 622 HW #1

Penguins data

The penguins data has 344 rows and 8 columns. Read more about the data at the package website.

penguins <- palmerpenguins::penguins
# Code for info is in appendix
info(penguins)

column	type	min	max	zero_count	na_count	unique_values	levels
species	factor	NA	NA	NA	0	3	Adelie, Gentoo, Chinstrap
island	factor	NA	NA	NA	0	3	Biscoe, Dream, Torgersen
bill_length_mm	numeric	32.1	59.6	0	2	164	NA
bill_depth_mm	numeric	13.1	21.5	0	2	80	NA
flipper_length_mm	integer	172.0	231.0	0	2	55	NA
body_mass_g	integer	2700.0	6300.0	0	2	94	NA
sex	factor	NA	NA	NA	11	2	male, female
year	integer	2007.0	2009.0	0	0	3	NA

Distribution of Sex

The distribution of penguins by sex is 50/50 for all species. The species representation in the sample, however, is not equal.

table(penguins[, c('species', 'sex')], useNA = "always")
##            sex
## species     female male <NA>
##   Adelie        73   73    6
##   Chinstrap     34   34    0
##   Gentoo        58   61    5
##   <NA>           0    0    0
table(penguins$species, useNA = "always") / nrow(penguins)
## 
##    Adelie Chinstrap    Gentoo      <NA> 
## 0.4418605 0.1976744 0.3604651 0.0000000

Islands

Gentoo are all on Biscoe island, Chinstrap are all on Dream island, and Adelie can be found on all three. I am interested in whether the characteristics of the three populations of Adelie are different.

table(penguins[, c('species', 'island')], useNA = 'always')
##            island
## species     Biscoe Dream Torgersen <NA>
##   Adelie        44    56        52    0
##   Chinstrap      0    68         0    0
##   Gentoo       124     0         0    0
##   <NA>           0     0         0    0
adelie <- penguins %>% filter(species == 'Adelie')
table(adelie[, c('island', 'sex')], useNA = 'always')
##            sex
## island      female male <NA>
##   Biscoe        22   22    0
##   Dream         27   28    1
##   Torgersen     24   23    5
##   <NA>           0    0    0

adelie %>% 
  mutate(ID = 1:nrow(.)) %>% 
  drop_na() %>% 
  select(-species, -sex, -year) %>% 
  pivot_longer(
    cols = -c('ID', 'island'), 
    names_to = 'field', 
    values_to = 'value') %>% 
  ggplot() +
  aes(x = value, fill = island) +
  geom_density(alpha = 0.75) +
  facet_wrap(~field, scales = 'free')

There is something interesting with bill_depth_mm on Torgerson island, but nothing appears dramatic enough for me to want to include island.

Based on this sample of birds, if we know the island a bird comes from, there are at most two species it can belong to. In the case of Torgersen island, we know it has to be Adelie. However, I want to build models that do not rely on island.

Pairs Plot

library(GGally)
penguins %>% 
  drop_na %>% 
  select(
    species, 
    bill_length_mm, 
    bill_depth_mm, 
    flipper_length_mm, 
    body_mass_g) %>% 
  GGally::ggpairs(
    progress = FALSE,
    mapping = aes(
      fill = species, color = species),
    columns = setdiff(names(.), 'species'))

If you look at the distributions of bill_depth_mm and body_mass_g and their scatter plot, something really interesting pops out. There is a perfect linear separation of Gentoo from the other two species. It’s as if you could use the ratio of bill depth to body mass to predict whether an individual is Gentoo. Then, use a conditional logistic model to split Adelie from Chinstrap.

Question #1: Logistic Model

Part #1a: response variable

There are three species. To do a true binomial logistic model, we need a response variable with only two classes! I’m going to combine the Adelie and Chinstrap species based on my observations above.

penguins$gentoo <- penguins$species == 'Gentoo'
sum(penguins$gentoo)
## [1] 124

Part #1b: variable and model selection

I’m restricting my model building to the four continuous features. There are only 16 unique combinations of these four (ignoring derived features and interaction terms). It’s easy enough to run all 16.

# The four variables I care about
cols <- c(
  "bill_length_mm", 
  "bill_depth_mm", 
  "flipper_length_mm", 
  "body_mass_g")

# Build a "masking" matrix
a <- c(TRUE, FALSE)
b <- matrix(c(
  rep(a, each = 8, length.out = 16),
  rep(a, each = 4, length.out = 16),
  rep(a, each = 2, length.out = 16),
  rep(a, each = 1, length.out = 16)),
  nrow = 16)

# Temp vectors to store output
converge <- logical()
deviance <- numeric()
aic <- numeric()
mod_cols <- character()

# Iterate through all 16 combinations
for (i in 1:16) {
  r <- b[i, ]
  cols_i <- cols[r]
  suppressWarnings(mod_i <- glm(
    gentoo ~ .,
    data = penguins[, c(cols_i, 'gentoo')],
    family = 'binomial'))
  converge[i] <- mod_i$converged
  deviance[i] <- mod_i$deviance
  aic[i] <- mod_i$aic
}

# Prepare an exhibit
res <- ifelse(b, "X", "")
colnames(res) <- cols
res <- as.data.frame(res)
res$converge <- converge
res$deviance <- ifelse(converge, deviance, NA)
res$aic <- ifelse(converge, aic, NA)

res[c(which(converge), which(!converge)), ] %>% 
  knitr::kable(digits = c(0, 0, 0, 0, 0, 1, 1))

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	converge	deviance	aic
4	X	X			TRUE	30.0	36.0
5	X		X	X	TRUE	33.3	41.3
6	X		X		TRUE	44.8	50.8
7	X			X	TRUE	115.3	121.3
8	X				TRUE	354.8	358.8
12		X			TRUE	90.2	94.2
13			X	X	TRUE	38.8	44.8
14			X		TRUE	49.4	53.4
15				X	TRUE	117.9	121.9
16					TRUE	449.7	451.7
1	X	X	X	X	FALSE	NA	NA
2	X	X	X		FALSE	NA	NA
3	X	X		X	FALSE	NA	NA
9		X	X	X	FALSE	NA	NA
10		X	X		FALSE	NA	NA
11		X		X	FALSE	NA	NA

Six of the sixteen do not converge. This is because of the perfect linear separation between Gentoo and the other species in the pairs plot above. All six have bill_depth_mm and at least one of flipper_length_mm and body_mass_g.

Of the ten models that do converge, the two that have the lowest residual deviance are

m1 <- glm(
  gentoo ~ bill_length_mm + bill_depth_mm, 
  data = penguins, family = binomial)

m2 <- glm(
  gentoo ~ bill_length_mm + flipper_length_mm + body_mass_g, 
  data = penguins, family = binomial)

The potential of bill measurements being all that is needed is really attractive in this model. However, the coefficients in that model are severe for a logistic model and make them hard to interpret. It also makes the model very sensitive to anomalous observations.

The coefficients in m2 are much less severe and lead to a more attractive model, with a low impact on AIC and residual deviance.

knitr::kable(tidy(m1), digits = c(0, 3, 1, 1, 4))

term	estimate	std.error	statistic	p.value
(Intercept)	49.010	14.8	3.3	9e-04
bill_length_mm	0.558	0.1	4.1	0e+00
bill_depth_mm	-4.532	1.1	-4.3	0e+00

knitr::kable(tidy(m2), digits = c(0, 3, 1, 1, 4))

term	estimate	std.error	statistic	p.value
(Intercept)	-137.761	33.5	-4.1	0.0000
bill_length_mm	-0.302	0.2	-2.0	0.0507
flipper_length_mm	0.636	0.2	3.8	0.0001
body_mass_g	0.005	0.0	2.6	0.0098

I select m2, using bill_length_mm + flipper_length_mm + body_mass_g.

Part #1c: variable interpretations

I’d like to discuss what the coefficients of a binomial model mean. This will help in understanding the output of m2 and why m1 is not a great choice.

Use \(L\) for the linear predictor or \(L = \beta_0 + \beta_1 x_1 + \cdots\). The model is then

\[ \begin{aligned} L &= \log \frac{p}{1-p} \\ e^L &= \frac{p}{1-p} \end{aligned} \] It’s helpful to have an aid in understanding how changes in the linear predictor affect \(p\).

p	e^L	L
0.0	0.00	-`Inf`
0.01	1/99	-4.6
0.25	1/3	-1.1
0.5	1.0	0.00
0.75	3.0	1.1
0.99	99.0	4.6
1.0	`Inf`	`Inf`

A change of 4.6 x 2 is either direction changes the model output from a certainty of being in one class to a certainty of being in the other. m1’s coefficient for bill_depth_mm is -4.5. A change of just 2mm holding everything else equal is enough to change \(p\) from 0.01 to 0.99 or vice-versa. It is just too sensitive. With m2 to see a 10% change in \(p\), bill length would have to change by about 3mm, flipper length by 1.5mm, or body mass by 200g. I believe this is a helpful way to understand the coefficients of the model and their impact on the overall probability of being Gentoo.

Question #2: model diagnostics

See appendix for code of model_objects.

mo2 <- model_objects(m2)

# AUC
mo2$auc
## [1] 0.9983666

# Accuracy
mo2$accuracy
## [1] 0.9766082

# True positive rate aka Sensitivity
mo2$sensitivity
## [1] 0.9756098

# True negative rate aka Specificity
mo2$specificity
## [1] 0.9771689

# False positive rate = 1 - TNR
1 - mo2$specificity
## [1] 0.02283105

# False negative rate = 1 - TPR
1 - mo2$sensitivity
## [1] 0.02439024

Question #3: multinomial logistic regression

Since there are only 16 models, I’ll do the same as above and check convergence of all 16.

# The four variables I care about
cols <- c(
  "bill_length_mm", 
  "bill_depth_mm", 
  "flipper_length_mm", 
  "body_mass_g")

# Build a "masking" matrix
a <- c(TRUE, FALSE)
b <- matrix(c(
  rep(a, each = 8, length.out = 16),
  rep(a, each = 4, length.out = 16),
  rep(a, each = 2, length.out = 16),
  rep(a, each = 1, length.out = 16)),
  nrow = 16)

# Temp vectors to store output
converge <- logical()
deviance <- numeric()
aic <- numeric()
mod_cols <- character()

# Iterate through all 16 combinations
for (i in 1:16) {
  r <- b[i, ]
  cols_i <- cols[r]
  capture.output(suppressWarnings(mod_i <- nnet::multinom(
    species ~ .,
    data = penguins[, c(cols_i, 'species')])))
  converge[i] <- mod_i$convergence == 0
  deviance[i] <- mod_i$deviance
  aic[i] <- mod_i$AIC
}

# Prepare an exhibit
res <- ifelse(b, "X", "")
colnames(res) <- cols
res <- as.data.frame(res)
res$converge <- converge
res$deviance <- ifelse(converge, deviance, NA)
res$aic <- ifelse(converge, aic, NA)

res[c(which(converge), which(!converge)), ] %>% 
  knitr::kable(digits = c(0, 0, 0, 0, 0, 1, 1))

	bill_length_mm	bill_depth_mm	flipper_length_mm	body_mass_g	converge	deviance	aic
2	X	X	X		TRUE	15.3	31.3
4	X	X			TRUE	47.9	59.9
5	X		X	X	TRUE	27.0	43.0
6	X		X		TRUE	78.1	90.1
7	X			X	TRUE	77.6	89.6
8	X				TRUE	332.7	340.7
9		X	X	X	TRUE	229.5	245.5
10		X	X		TRUE	234.3	246.3
11		X		X	TRUE	271.1	283.1
12		X			TRUE	361.4	369.4
13			X	X	TRUE	268.7	280.7
14			X		TRUE	287.7	295.7
15				X	TRUE	388.9	396.9
16					TRUE	721.8	725.8
1	X	X	X	X	FALSE	NA	NA
3	X	X		X	FALSE	NA	NA

This time, only two do not converge. Those are the two that include bill_length_mm, bill_depth_mm, and body_mass_g.

The two best models, by residual deviance and AIC, are

multi1 <- nnet::multinom(
    species ~ bill_length_mm + bill_depth_mm + flipper_length_mm,
    data = penguins)

multi2 <- nnet::multinom(
    species ~ bill_length_mm + flipper_length_mm + body_mass_g,
    data = penguins)

I’m selecting multi2 because it has only slight worse AIC and deviance, uses the same variables as the 2-class problem above, and the coefficients are again less severe.

tidy(multi2) %>% 
  knitr::kable(digits = c(0, 0, 2, 3, 1, 3))

y.level	term	estimate	std.error	statistic	p.value
Chinstrap	(Intercept)	-95.24	0.002	-43113.0	0.000
Chinstrap	bill_length_mm	5.73	0.007	811.0	0.000
Chinstrap	flipper_length_mm	-0.31	0.043	-7.3	0.000
Chinstrap	body_mass_g	-0.03	0.002	-11.6	0.000
Gentoo	(Intercept)	-158.00	0.000	-583534.5	0.000
Gentoo	bill_length_mm	0.67	0.286	2.4	0.018
Gentoo	flipper_length_mm	0.55	0.071	7.7	0.000
Gentoo	body_mass_g	0.00	0.002	1.6	0.115

Question #4: diagnotics for multinomial models

I’d like to do something similar to the binomial case

3x3 confusion matrix
half-normal plots, like Faraway does in “Extending the Linear Model with R”
plotting a ROC curve
Hosmer-Lemeshow plots

Here are some examples using the 2-class case.

Plot ROC

plot_roc(mo2)

Faraway halfnorm plots

halfnorm_plot(mo2)

Hosmer-Lemeshow Plot

hosmer_lemeshow(mo2)
## `summarise()` ungrouping output (override with `.groups` argument)
## $plot

## 
## $hlstat
## [1] 1.33466e+11
## 
## $pval
## [1] 0

Appendix

Packages

search()
##  [1] ".GlobalEnv"             "package:GGally"         "package:palmerpenguins"
##  [4] "package:faraway"        "package:broom"          "package:pROC"          
##  [7] "package:caret"          "package:lattice"        "package:tree"          
## [10] "package:forcats"        "package:stringr"        "package:dplyr"         
## [13] "package:purrr"          "package:readr"          "package:tidyr"         
## [16] "package:tibble"         "package:ggplot2"        "package:tidyverse"     
## [19] "package:knitr"          "package:alrtools"       "package:stats"         
## [22] "package:graphics"       "package:grDevices"      "package:utils"         
## [25] "package:datasets"       "package:methods"        "Autoloads"             
## [28] "package:base"

Code

model_objects <- function(m) {
  # m response has to be TRUE/FALSE
  TF <- c(TRUE, FALSE)
  out <- list()
  out$m <- m
  out$actual <- factor(
    unname(model.response(model.frame(m))),
    levels = TF)
  N <- nrow(m$data)
  out$prob <- predict(m, type = "response")
  out$prob_lin <- predict(m)
  out$class <- ifelse(out$prob > 0.5, TRUE, FALSE) %>% 
    factor(labels = TF, levels = TF)
  out$cm <- caret::confusionMatrix(
    out$class, out$actual, positive = 'TRUE')
  
  out$broom <- broom::tidy(m)
  out$broom$exp_estimate <- exp(out$broom$estimate)
  out$roc <- suppressMessages(pROC::roc(out$actual, out$prob))
  out$auc <- out$roc$auc %>% as.numeric
  out$chisq <- 1 - pchisq(
    m$null.deviance - m$deviance, m$df.null - m$df.residual)
  out$confint <- suppressMessages(suppressWarnings(confint(m)))
  out$R2 <- (1-exp((m$deviance-m$null.deviance)/N))/(1-exp(-m$null.deviance/N))
  
  A <- out$cm$table[1, 1]
  B <- out$cm$table[1, 2]
  C <- out$cm$table[2, 1]
  D <- out$cm$table[2, 2]
  
  out$accuracy <- (A + D) / (A + B + C + D)
  out$classification_error_rate <- (B + C) / (A + B + C + D)
  out$precision <- A / (A + B)
  out$sensitivity <- recall <- A / (A + C)
  out$specificity <- D / (D + B)
  out$F1_score <- 
    (2 * out$precision * out$sensitivity) / (out$precision + out$sensitivity)
  out$PPV <- A / (A + B)
  out$NPV <- D / (C + D)
  
  return(out)
}
halfnorm_plot <- function(mo) {
  faraway::halfnorm(hatvalues(mo$m))
}
plot_roc <- function(mo) {
  orig_pty <- par()$pty
  par(pty="s")
  on.exit(par(pty = orig_pty))
  plot(
    mo$roc, 
    print.auc = TRUE, 
    auc.polygon = TRUE, 
    grid = c(0.1, 0.2),
    grid.col = c("green", "red"), 
    max.auc.polygon = TRUE,
    auc.polygon.col = "lightblue", 
    print.thres = FALSE)
}
hosmer_lemeshow <- function(mo) {
  out <- list()
  M <- floor(nrow(mo$m$data) / 30)
  qs <- (0:M)/M
  breaks <- unique(quantile(mo$prob_lin, qs))
  buckets <- cut(mo$prob_lin, breaks = breaks)
  
  d <- data.frame(
    buckets = buckets, 
    class = as.integer(mo$actual) - 1, 
    prob = mo$prob)
  
  gd <- d %>% 
    group_by(buckets) %>% 
    summarise(
      y = sum(class), 
      ppred = mean(prob), 
      count = n()) %>% 
    mutate(
      se.fit = sqrt(ppred*(1-ppred)/count))
  
  out$plot <- ggplot(
    gd,
    aes(x=ppred,y=y/count,ymin=y/count-2*se.fit,ymax=y/count+2*se.fit))+
    geom_point()+
    geom_linerange(color=grey(0.75))+
    geom_abline(intercept=0,slope=1)+
    xlab("Predicted Probability")+
    ylab("Observed Proportion")
  
  out$hlstat <- with(
    gd, 
    sum( (y-count*ppred)^2/(count*ppred*(1-ppred))))
  
  out$pval <- 1 - pchisq(out$hlstat, nrow(gd) - 1)
  return(out)
}
info <- function(x, ...) {
  UseMethod("info")
}
info.data.frame <- function(dataframe) {
  
  out <- NULL
  for (i in 1:ncol(dataframe)) {
    
    v <- dataframe[, i, drop = TRUE]
    num <- is.numeric(v) | is.logical(v)
    lv <- length(v)
    vn <- v[!is.na(v)]
    uniques <- length(unique(vn))
    typ <- class(v)
    
    if (num) {
      qv <- stats::quantile(
        vn, 
        probs = c(0, 0.25, 0.5, 0.75, 1))
      sdv <- sd(vn)
      muv <- mean(vn)
      zero_count = sum(vn == 0)
      lvls <- NA
    } else {
      qv <- rep(NA, 5)
      sdv <- NA
      muv <- NA
      zero_count = NA
      lvls <- table(v) %>% sort(decreasing = TRUE) %>% 
        names %>% `[`(., 1:min(10, length(.))) %>% paste0(collapse = ', ')
    }
    
    qvr <- data.frame(
      row.names = i,
      column = names(dataframe)[i],
      type = typ,
      min = qv[1],
      q25 = qv[2],
      median = qv[3],
      mean = muv,
      q75 = qv[4],
      max = qv[5],
      sd = sdv,
      zero_count,
      # sd2ratio = sum(abs(vn - muv) < 2 * sdv) / length(vn),
      na_count = lv - length(vn),
      unique_values = uniques,
      levels = lvls)
    
    if (is.null(out)) { 
      out <- qvr } else { out <- rbind(out, qvr) }  
    
  }
  return(out)
}