Reading in the data:

penguins <- 
  palmerpenguins::penguins |>
  rename(
    bill_len  = bill_length_mm,
    bill_dep  = bill_depth_mm,
    flip_len  = flipper_length_mm,
    body_mass = body_mass_g
  ) |>
  filter(
    !is.na(bill_dep)
  )


penguins

## # A tibble: 342 × 8
##    species island    bill_len bill_dep flip_len body_mass sex     year
##    <fct>   <fct>        <dbl>    <dbl>    <int>     <int> <fct>  <int>
##  1 Adelie  Torgersen     39.1     18.7      181      3750 male    2007
##  2 Adelie  Torgersen     39.5     17.4      186      3800 female  2007
##  3 Adelie  Torgersen     40.3     18        195      3250 female  2007
##  4 Adelie  Torgersen     36.7     19.3      193      3450 female  2007
##  5 Adelie  Torgersen     39.3     20.6      190      3650 male    2007
##  6 Adelie  Torgersen     38.9     17.8      181      3625 female  2007
##  7 Adelie  Torgersen     39.2     19.6      195      4675 male    2007
##  8 Adelie  Torgersen     34.1     18.1      193      3475 <NA>    2007
##  9 Adelie  Torgersen     42       20.2      190      4250 <NA>    2007
## 10 Adelie  Torgersen     37.8     17.1      186      3300 <NA>    2007
## # ℹ 332 more rows

Initial check

Using skim() to get an initial view of the data

#skimr::skim(penguins)

Sample size (n), variables (p), and number of groups (k)

N <- nrow(penguins); p <- 4; k <- n_distinct(penguins$species)

c("N" = N, "p" = p, "k" = k)

##   N   p   k 
## 342   4   3

Comparing how the variables differ across groups:

penguins |> 
  pivot_longer(
    cols = bill_len:body_mass, 
    names_to = "Var", 
    values_to = "values"
  ) |>
  
  ggplot(
    mapping = aes(
      x = values
    )
  ) + 
  geom_density(
    mapping = aes(fill = Var),
    show.legend = F
  ) + 
  facet_wrap(
    facets = ~ Var, 
    scales = "free"
  ) + 
  
  scale_y_continuous(expand = c(0, 0, 0.05, 0))

Correlation plots and Scatterplot Matrices

Let’s create a correlation plot and scatterplot matrix for the variables.

When performing LDA and QDA, we need to worry about the correlations between variables, since the methods rely on the inverse of the covariance matrix:

\[L^2_{ij} = (\textbf{y}_i - \bar{\textbf{y}}_j)'\textbf{S}^{-1}_{pl}(\textbf{y}_i - \bar{\textbf{y}}_j)\]

\[Q^2_{ij} = (\textbf{y}_i - \bar{\textbf{y}}_j)'\textbf{S}^{-1}_i(\textbf{y}_i - \bar{\textbf{y}}_j)\]

For QDA, if any of the covariance matrices has a linear dependency, then we can’t calculate the distance our random vector is from that group.

Correlation plot

# Calculating the correlations per star type in the data
pen_cors <- 
  penguins |> 
  
  summarize(
    .by = species,
    cor_billLen_billDep  = cor(bill_len, bill_dep),
    cor_billLen_flipLen  = cor(bill_len, flip_len),
    cor_billLen_bodyMass = cor(bill_len, body_mass),
    cor_billDep_flipLen  = cor(bill_dep, flip_len),
    cor_billDep_bodyMass = cor(bill_dep, body_mass),
    cor_flipLen_bodyMass = cor(flip_len, body_mass)
  ) |> 
  
  pivot_longer(
    cols = contains("cor"),
    values_to = "correlation",
    names_to = "pairs"
  ) |> 
  
  separate(
    col = pairs,
    into = c('cor', 'col1', 'col2'),
    sep = '_'
  ) |>
  
  
  mutate(
    col1 = factor(col1, levels = c('billLen', 'billDep', 'flipLen', 'bodyMass')),
    col2 = factor(col2, levels = c('billLen', 'billDep', 'flipLen', 'bodyMass')),
    
    var1 = if_else(as.numeric(col1) <= as.numeric(col2), col1, col2),
    var2 = if_else(as.numeric(col1) > as.numeric(col2), col1, col2)
  ) 

# Calculating the correlation matrix plot per star type
xtabs(formula = correlation ~ var1 + var2 + species,
      data = pen_cors) |> 
  
  data.frame() |> 
  
  rename(correlation = Freq) |> 
  
  ggplot(
    mapping = aes(
      x = fct_rev(var1),
      y = fct_rev(var2)
    )
  ) + 
  
  geom_tile(
    mapping = aes(fill = correlation),
    color = "white",
    show.legend = F
  ) +
  
  geom_text(
    data = pen_cors,
    mapping = aes(
      x = var1,
      y = var2,
      label = round(correlation, digits = 2)
    ),
    color = "black"
  ) +
  
#  geom_text(
#    data = data.frame(
#      var1 = c("billDepth", "billLen", "flipLen", "bodyMass") |> fct_inorder(),
#      var2 = c("billDepth", "billLen", "flipLen", "bodyMass") |> fct_inorder(),
#      variable = c("billDepth", "billLen", "flipLen", "bodyMass")
#   ),
#    mapping = aes(label = variable)
#  ) + 
  
  #scale_x_discrete(breaks = NULL) + 
  
#  scale_y_discrete(breaks = NULL) + 
  
  facet_wrap(facets = vars(species)) + 
  
  scale_fill_gradient2(limits = c(-1, 1)) + 
  
  labs(
    x = NULL,
    y = NULL
  )

Scatterplot matrix using ggpairs()

penguins |> 
  ggpairs(
    columns = 3:6, 
    mapping = aes(color = species, 
                  alpha = 0.75)
  ) + 
  theme_bw()

From the correlation plot and scatterplots, it doesn’t look like the covariance matrices are equal, but let’s checking using Box’s M test (box_m() in rstatix)

rstatix::box_m(
  penguins |> dplyr::select(bill_len:body_mass),
  group = penguins$species
)

## # A tibble: 1 × 4
##   statistic      p.value parameter method                                       
##       <dbl>        <dbl>     <dbl> <chr>                                        
## 1      76.8 0.0000000136        20 Box's M-test for Homogeneity of Covariance M…

If we want to use a discriminant method to classify the penguin species, QDA would be more appropriate since we don’t have equal covariance matrices.

Quadratic Discriminant Analysis

The qda() function is in the MASS library, like the lda() function. It has the same arguments as the lda() function.

qda_penguins <- 
  MASS::qda(
    formula = species ~ bill_len + bill_dep + flip_len + body_mass, 
    data = penguins,
    priors = rep(1, k)/k
  )

# Looking at the info we can extract from the qda object
names(qda_penguins)

##  [1] "prior"   "counts"  "means"   "scaling" "ldet"    "lev"     "N"      
##  [8] "call"    "terms"   "xlevels"

# What is scaling for QDA?
qda_penguins$scaling

## , , Adelie
## 
##                1      2       3        4
## bill_len  -0.375 -0.160  0.0984 -0.17217
## bill_dep   0.000  0.893  0.1890 -0.43812
## flip_len   0.000  0.000 -0.1653 -0.05417
## body_mass  0.000  0.000  0.0000  0.00311
## 
## , , Chinstrap
## 
##               1      2      3        4
## bill_len  0.299  0.259 -0.060  0.05924
## bill_dep  0.000 -1.164 -0.519  0.33803
## flip_len  0.000  0.000  0.174  0.08340
## body_mass 0.000  0.000  0.000 -0.00369
## 
## , , Gentoo
## 
##               1      2      3        4
## bill_len  0.324  0.273 -0.175  0.13348
## bill_dep  0.000 -1.331 -0.747  0.59179
## flip_len  0.000  0.000  0.236  0.07162
## body_mass 0.000  0.000  0.000 -0.00324

Nothing important for what we are doing as they aren’t the quadratic equivalent of our discriminant functions from LDA

QDA: Predict

We’ll still use the predict() function to make predictions using the qda() model. If you want to pull up the help menu, you’ll want to call the function by the full name: predict.qda

?MASS::predict.qda

## starting httpd help server ... done

# Adding $class to the end just keeps the predicted species
predict(object = qda_penguins, 
        newdata = penguins) |> 
  pluck('class') |>
  data.frame() |> 
  tibble()

## # A tibble: 342 × 1
##    pluck.predict.object...qda_penguins..newdata...penguins....class..
##    <fct>                                                             
##  1 Adelie                                                            
##  2 Adelie                                                            
##  3 Adelie                                                            
##  4 Adelie                                                            
##  5 Adelie                                                            
##  6 Adelie                                                            
##  7 Adelie                                                            
##  8 Adelie                                                            
##  9 Adelie                                                            
## 10 Adelie                                                            
## # ℹ 332 more rows

# Adding $post to the end keeps the posterior probabilities: P(G_i|Y)
predict(object = qda_penguins, 
        newdata = penguins) |> 
  
  pluck("posterior") |> 

  round(digits = 3) |> 
  data.frame() |> 
  tibble()

## # A tibble: 342 × 3
##    Adelie Chinstrap Gentoo
##     <dbl>     <dbl>  <dbl>
##  1  1         0          0
##  2  0.998     0.002      0
##  3  0.979     0.021      0
##  4  1         0          0
##  5  1         0          0
##  6  1         0          0
##  7  1         0          0
##  8  1         0          0
##  9  1         0          0
## 10  0.998     0.002      0
## # ℹ 332 more rows

# Store both in a dataframe!
pen_qda_df <- 
  data.frame(
    penguins,
    predict(object = qda_penguins, newdata = penguins)
  )
head(pen_qda_df)

##   species    island bill_len bill_dep flip_len body_mass    sex year  class
## 1  Adelie Torgersen     39.1     18.7      181      3750   male 2007 Adelie
## 2  Adelie Torgersen     39.5     17.4      186      3800 female 2007 Adelie
## 3  Adelie Torgersen     40.3     18.0      195      3250 female 2007 Adelie
## 4  Adelie Torgersen     36.7     19.3      193      3450 female 2007 Adelie
## 5  Adelie Torgersen     39.3     20.6      190      3650   male 2007 Adelie
## 6  Adelie Torgersen     38.9     17.8      181      3625 female 2007 Adelie
##   posterior.Adelie posterior.Chinstrap posterior.Gentoo
## 1            1.000            1.21e-05         1.23e-35
## 2            0.998            1.76e-03         4.29e-22
## 3            0.979            2.11e-02         1.01e-22
## 4            1.000            6.55e-07         2.40e-34
## 5            1.000            1.68e-07         7.36e-45
## 6            1.000            2.79e-04         2.21e-29

We can view the results in a confusion matrix using confusionMatrix() in the caret package:

pen_qda_df |>
  rename(
    predicted = class, 
    actual = species
    ) |> 
  xtabs(formula = ~ predicted + actual) |> 
  confusionMatrix()

## Confusion Matrix and Statistics
## 
##            actual
## predicted   Adelie Chinstrap Gentoo
##   Adelie       149         2      0
##   Chinstrap      2        66      0
##   Gentoo         0         0    123
## 
## Overall Statistics
##                                        
##                Accuracy : 0.988        
##                  95% CI : (0.97, 0.997)
##     No Information Rate : 0.442        
##     P-Value [Acc > NIR] : <2e-16       
##                                        
##                   Kappa : 0.982        
##                                        
##  Mcnemar's Test P-Value : NA           
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                  0.987            0.971          1.00
## Specificity                  0.990            0.993          1.00
## Pos Pred Value               0.987            0.971          1.00
## Neg Pred Value               0.990            0.993          1.00
## Prevalence                   0.442            0.199          0.36
## Detection Rate               0.436            0.193          0.36
## Detection Prevalence         0.442            0.199          0.36
## Balanced Accuracy            0.988            0.982          1.00

QDA using Cross-validation

We can tell qda() to use leave-one-out cross-validation by including the CV = T argument:

qda_pen_cv <- 
  MASS::qda(
    species ~ bill_len + bill_dep + flip_len + body_mass, 
    data = penguins, 
    CV = T
  )

Now it just returns what the predict function did: class = Predicted class using CV posterior = Posterior probability using CV

table(
  predicted = qda_pen_cv$class, 
  actual = penguins$species
) |> 
  confusionMatrix()

## Confusion Matrix and Statistics
## 
##            actual
## predicted   Adelie Chinstrap Gentoo
##   Adelie       149         2      0
##   Chinstrap      2        66      0
##   Gentoo         0         0    123
## 
## Overall Statistics
##                                        
##                Accuracy : 0.988        
##                  95% CI : (0.97, 0.997)
##     No Information Rate : 0.442        
##     P-Value [Acc > NIR] : <2e-16       
##                                        
##                   Kappa : 0.982        
##                                        
##  Mcnemar's Test P-Value : NA           
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                  0.987            0.971          1.00
## Specificity                  0.990            0.993          1.00
## Pos Pred Value               0.987            0.971          1.00
## Neg Pred Value               0.990            0.993          1.00
## Prevalence                   0.442            0.199          0.36
## Detection Rate               0.436            0.193          0.36
## Detection Prevalence         0.442            0.199          0.36
## Balanced Accuracy            0.988            0.982          1.00

Graphing the penguin predictions for LDA vs QDA for two predictors

The function read in below will allow us to compare the ‘prediction regions’ for a scatter plot.

Note: the function only works with p = 2.

# Reading in the graph
source('graph_DA_area.R')

# Using bill length and flipper length
gg_lda_vs_qda <- 
  class_DA_area_plot(
    formula = species ~ bill_len + flip_len,
    data = penguins
  )

gg_lda_vs_qda

QDA: Penguins

Chapter 9

STAT 5230