penguins <-
palmerpenguins::penguins |>
rename(
bill_len = bill_length_mm,
bill_dep = bill_depth_mm,
flip_len = flipper_length_mm,
body_mass = body_mass_g
) |>
filter(
!is.na(bill_dep)
)
penguins
## # A tibble: 342 × 8
## species island bill_len bill_dep flip_len body_mass sex year
## <fct> <fct> <dbl> <dbl> <int> <int> <fct> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007
## 2 Adelie Torgersen 39.5 17.4 186 3800 female 2007
## 3 Adelie Torgersen 40.3 18 195 3250 female 2007
## 4 Adelie Torgersen 36.7 19.3 193 3450 female 2007
## 5 Adelie Torgersen 39.3 20.6 190 3650 male 2007
## 6 Adelie Torgersen 38.9 17.8 181 3625 female 2007
## 7 Adelie Torgersen 39.2 19.6 195 4675 male 2007
## 8 Adelie Torgersen 34.1 18.1 193 3475 <NA> 2007
## 9 Adelie Torgersen 42 20.2 190 4250 <NA> 2007
## 10 Adelie Torgersen 37.8 17.1 186 3300 <NA> 2007
## # ℹ 332 more rows
Using skim() to get an initial view of the data
#skimr::skim(penguins)
Sample size (n), variables (p), and number of groups (k)
N <- nrow(penguins); p <- 4; k <- n_distinct(penguins$species)
c("N" = N, "p" = p, "k" = k)
## N p k
## 342 4 3
Comparing how the variables differ across groups:
penguins |>
pivot_longer(
cols = bill_len:body_mass,
names_to = "Var",
values_to = "values"
) |>
ggplot(
mapping = aes(
x = values
)
) +
geom_density(
mapping = aes(fill = Var),
show.legend = F
) +
facet_wrap(
facets = ~ Var,
scales = "free"
) +
scale_y_continuous(expand = c(0, 0, 0.05, 0))
Let’s create a correlation plot and scatterplot matrix for the variables.
When performing LDA and QDA, we need to worry about the correlations between variables, since the methods rely on the inverse of the covariance matrix:
\[L^2_{ij} = (\textbf{y}_i - \bar{\textbf{y}}_j)'\textbf{S}^{-1}_{pl}(\textbf{y}_i - \bar{\textbf{y}}_j)\]
\[Q^2_{ij} = (\textbf{y}_i - \bar{\textbf{y}}_j)'\textbf{S}^{-1}_i(\textbf{y}_i - \bar{\textbf{y}}_j)\]
For QDA, if any of the covariance matrices has a linear dependency, then we can’t calculate the distance our random vector is from that group.
# Calculating the correlations per star type in the data
pen_cors <-
penguins |>
summarize(
.by = species,
cor_billLen_billDep = cor(bill_len, bill_dep),
cor_billLen_flipLen = cor(bill_len, flip_len),
cor_billLen_bodyMass = cor(bill_len, body_mass),
cor_billDep_flipLen = cor(bill_dep, flip_len),
cor_billDep_bodyMass = cor(bill_dep, body_mass),
cor_flipLen_bodyMass = cor(flip_len, body_mass)
) |>
pivot_longer(
cols = contains("cor"),
values_to = "correlation",
names_to = "pairs"
) |>
separate(
col = pairs,
into = c('cor', 'col1', 'col2'),
sep = '_'
) |>
mutate(
col1 = factor(col1, levels = c('billLen', 'billDep', 'flipLen', 'bodyMass')),
col2 = factor(col2, levels = c('billLen', 'billDep', 'flipLen', 'bodyMass')),
var1 = if_else(as.numeric(col1) <= as.numeric(col2), col1, col2),
var2 = if_else(as.numeric(col1) > as.numeric(col2), col1, col2)
)
# Calculating the correlation matrix plot per star type
xtabs(formula = correlation ~ var1 + var2 + species,
data = pen_cors) |>
data.frame() |>
rename(correlation = Freq) |>
ggplot(
mapping = aes(
x = fct_rev(var1),
y = fct_rev(var2)
)
) +
geom_tile(
mapping = aes(fill = correlation),
color = "white",
show.legend = F
) +
geom_text(
data = pen_cors,
mapping = aes(
x = var1,
y = var2,
label = round(correlation, digits = 2)
),
color = "black"
) +
# geom_text(
# data = data.frame(
# var1 = c("billDepth", "billLen", "flipLen", "bodyMass") |> fct_inorder(),
# var2 = c("billDepth", "billLen", "flipLen", "bodyMass") |> fct_inorder(),
# variable = c("billDepth", "billLen", "flipLen", "bodyMass")
# ),
# mapping = aes(label = variable)
# ) +
#scale_x_discrete(breaks = NULL) +
# scale_y_discrete(breaks = NULL) +
facet_wrap(facets = vars(species)) +
scale_fill_gradient2(limits = c(-1, 1)) +
labs(
x = NULL,
y = NULL
)
penguins |>
ggpairs(
columns = 3:6,
mapping = aes(color = species,
alpha = 0.75)
) +
theme_bw()
From the correlation plot and scatterplots, it doesn’t look like the
covariance matrices are equal, but let’s checking using Box’s M test
(box_m() in rstatix)
rstatix::box_m(
penguins |> dplyr::select(bill_len:body_mass),
group = penguins$species
)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <dbl> <chr>
## 1 76.8 0.0000000136 20 Box's M-test for Homogeneity of Covariance M…
If we want to use a discriminant method to classify the penguin species, QDA would be more appropriate since we don’t have equal covariance matrices.
The qda() function is in the MASS library,
like the lda() function. It has the same arguments as the
lda() function.
qda_penguins <-
MASS::qda(
formula = species ~ bill_len + bill_dep + flip_len + body_mass,
data = penguins,
priors = rep(1, k)/k
)
# Looking at the info we can extract from the qda object
names(qda_penguins)
## [1] "prior" "counts" "means" "scaling" "ldet" "lev" "N"
## [8] "call" "terms" "xlevels"
# What is scaling for QDA?
qda_penguins$scaling
## , , Adelie
##
## 1 2 3 4
## bill_len -0.375 -0.160 0.0984 -0.17217
## bill_dep 0.000 0.893 0.1890 -0.43812
## flip_len 0.000 0.000 -0.1653 -0.05417
## body_mass 0.000 0.000 0.0000 0.00311
##
## , , Chinstrap
##
## 1 2 3 4
## bill_len 0.299 0.259 -0.060 0.05924
## bill_dep 0.000 -1.164 -0.519 0.33803
## flip_len 0.000 0.000 0.174 0.08340
## body_mass 0.000 0.000 0.000 -0.00369
##
## , , Gentoo
##
## 1 2 3 4
## bill_len 0.324 0.273 -0.175 0.13348
## bill_dep 0.000 -1.331 -0.747 0.59179
## flip_len 0.000 0.000 0.236 0.07162
## body_mass 0.000 0.000 0.000 -0.00324
Nothing important for what we are doing as they aren’t the quadratic equivalent of our discriminant functions from LDA
We’ll still use the predict() function to make
predictions using the qda() model. If you want to pull up
the help menu, you’ll want to call the function by the full name:
predict.qda
?MASS::predict.qda
## starting httpd help server ... done
# Adding $class to the end just keeps the predicted species
predict(object = qda_penguins,
newdata = penguins) |>
pluck('class') |>
data.frame() |>
tibble()
## # A tibble: 342 × 1
## pluck.predict.object...qda_penguins..newdata...penguins....class..
## <fct>
## 1 Adelie
## 2 Adelie
## 3 Adelie
## 4 Adelie
## 5 Adelie
## 6 Adelie
## 7 Adelie
## 8 Adelie
## 9 Adelie
## 10 Adelie
## # ℹ 332 more rows
# Adding $post to the end keeps the posterior probabilities: P(G_i|Y)
predict(object = qda_penguins,
newdata = penguins) |>
pluck("posterior") |>
round(digits = 3) |>
data.frame() |>
tibble()
## # A tibble: 342 × 3
## Adelie Chinstrap Gentoo
## <dbl> <dbl> <dbl>
## 1 1 0 0
## 2 0.998 0.002 0
## 3 0.979 0.021 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 0
## 7 1 0 0
## 8 1 0 0
## 9 1 0 0
## 10 0.998 0.002 0
## # ℹ 332 more rows
# Store both in a dataframe!
pen_qda_df <-
data.frame(
penguins,
predict(object = qda_penguins, newdata = penguins)
)
head(pen_qda_df)
## species island bill_len bill_dep flip_len body_mass sex year class
## 1 Adelie Torgersen 39.1 18.7 181 3750 male 2007 Adelie
## 2 Adelie Torgersen 39.5 17.4 186 3800 female 2007 Adelie
## 3 Adelie Torgersen 40.3 18.0 195 3250 female 2007 Adelie
## 4 Adelie Torgersen 36.7 19.3 193 3450 female 2007 Adelie
## 5 Adelie Torgersen 39.3 20.6 190 3650 male 2007 Adelie
## 6 Adelie Torgersen 38.9 17.8 181 3625 female 2007 Adelie
## posterior.Adelie posterior.Chinstrap posterior.Gentoo
## 1 1.000 1.21e-05 1.23e-35
## 2 0.998 1.76e-03 4.29e-22
## 3 0.979 2.11e-02 1.01e-22
## 4 1.000 6.55e-07 2.40e-34
## 5 1.000 1.68e-07 7.36e-45
## 6 1.000 2.79e-04 2.21e-29
We can view the results in a confusion matrix using
confusionMatrix() in the caret package:
pen_qda_df |>
rename(
predicted = class,
actual = species
) |>
xtabs(formula = ~ predicted + actual) |>
confusionMatrix()
## Confusion Matrix and Statistics
##
## actual
## predicted Adelie Chinstrap Gentoo
## Adelie 149 2 0
## Chinstrap 2 66 0
## Gentoo 0 0 123
##
## Overall Statistics
##
## Accuracy : 0.988
## 95% CI : (0.97, 0.997)
## No Information Rate : 0.442
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.982
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.987 0.971 1.00
## Specificity 0.990 0.993 1.00
## Pos Pred Value 0.987 0.971 1.00
## Neg Pred Value 0.990 0.993 1.00
## Prevalence 0.442 0.199 0.36
## Detection Rate 0.436 0.193 0.36
## Detection Prevalence 0.442 0.199 0.36
## Balanced Accuracy 0.988 0.982 1.00
We can tell qda() to use leave-one-out cross-validation
by including the CV = T argument:
qda_pen_cv <-
MASS::qda(
species ~ bill_len + bill_dep + flip_len + body_mass,
data = penguins,
CV = T
)
Now it just returns what the predict function did:
class = Predicted class using CV posterior =
Posterior probability using CV
table(
predicted = qda_pen_cv$class,
actual = penguins$species
) |>
confusionMatrix()
## Confusion Matrix and Statistics
##
## actual
## predicted Adelie Chinstrap Gentoo
## Adelie 149 2 0
## Chinstrap 2 66 0
## Gentoo 0 0 123
##
## Overall Statistics
##
## Accuracy : 0.988
## 95% CI : (0.97, 0.997)
## No Information Rate : 0.442
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.982
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.987 0.971 1.00
## Specificity 0.990 0.993 1.00
## Pos Pred Value 0.987 0.971 1.00
## Neg Pred Value 0.990 0.993 1.00
## Prevalence 0.442 0.199 0.36
## Detection Rate 0.436 0.193 0.36
## Detection Prevalence 0.442 0.199 0.36
## Balanced Accuracy 0.988 0.982 1.00
The function read in below will allow us to compare the ‘prediction regions’ for a scatter plot.
Note: the function only works with p = 2.
# Reading in the graph
source('graph_DA_area.R')
# Using bill length and flipper length
gg_lda_vs_qda <-
class_DA_area_plot(
formula = species ~ bill_len + flip_len,
data = penguins
)
gg_lda_vs_qda