stars <-
read_excel("stars.xlsx") |>
dplyr::mutate(
type = factor(type,
levels=c("Brown Dwarf", "Red Dwarf",
"White Dwarf","Main Sequence",
"Supergiant", "Hypergiant"))) |>
dplyr::select(-color,-class)
Using skim()
to get an initial view of the data
skimr::skim(stars)
Name | stars |
Number of rows | 240 |
Number of columns | 5 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 4 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
type | 0 | 1 | FALSE | 6 | Bro: 40, Red: 40, Whi: 40, Mai: 40 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
temp | 0 | 1 | 1.05e+04 | 9552.4 | 1939.00 | 3344.25 | 5776.00 | 15055.5 | 40000.0 | ▇▂▂▁▁ |
lumin | 0 | 1 | 1.07e+05 | 179432.2 | 0.00 | 0.00 | 0.07 | 198050.0 | 849420.0 | ▇▂▁▁▁ |
radius | 0 | 1 | 2.37e+02 | 517.2 | 0.01 | 0.10 | 0.76 | 42.8 | 1948.5 | ▇▁▁▁▁ |
mag | 0 | 1 | 4.38e+00 | 10.5 | -11.92 | -6.23 | 8.31 | 13.7 | 20.1 | ▇▃▂▆▆ |
Sample size (n), variables (p), and number of groups (k)
N <- nrow(stars); p <- 4; k <- n_distinct(stars$type)
c("N" = N, "p" = p, "k" = k)
## N p k
## 240 4 6
Comparing how the variables differ across groups:
stars |>
pivot_longer(
cols = temp:mag,
names_to = "Var",
values_to = "values"
) |>
ggplot(
mapping = aes(
x = values
)
) +
geom_density(
mapping = aes(fill = Var),
show.legend = F
) +
facet_wrap(
facets = ~ Var,
scales = "free"
) +
scale_y_continuous(expand = c(0, 0, 0.05, 0))
Looks like Lumin, Radius, and Temp could all be transformed! Let’s conduct a log10 transformation for those variables
stars2 <-
stars |>
mutate(
across(
.cols = c(lumin, radius, temp),
.fns = log10
)
)
# Density plots for the transformed data
stars2 |>
pivot_longer(
cols=temp:mag,
names_to = "Var",
values_to = "values"
) |>
ggplot(
mapping = aes(
x = values
)
) +
geom_density(
#mapping = aes(fill=type),
mapping = aes(fill = Var),
show.legend = F,
alpha = 0.5
) +
facet_wrap(
facets = ~ Var,
scales = "free"
) +
labs(x = NULL, y = NULL) +
theme_bw() +
scale_y_continuous(expand = c(0, 0, 0.05, 0))
Let’s create a correlation plot and scatterplot matrix for the variables.
When performing LDA and QDA, we need to worry about the correlations between variables, since the methods rely on the inverse of the covariance matrix:
\[L^2_{ij} = (\textbf{y}_i - \bar{\textbf{y}}_j)'\textbf{S}^{-1}_{pl}(\textbf{y}_i - \bar{\textbf{y}}_j)\]
\[Q^2_{ij} = (\textbf{y}_i - \bar{\textbf{y}}_j)'\textbf{S}^{-1}_i(\textbf{y}_i - \bar{\textbf{y}}_j)\]
For QDA, if any of the covariance matrices has a linear dependency, then we can’t calculate the distance our random vector is from that group.
# Calculating the correlations per star type in the data
star_cors <-
stars2 |>
summarize(
.by = type,
cor_temp_lumins = cor(temp, lumin),
cor_temp_rads = cor(temp, radius),
cor_temp_mags = cor(temp, mag),
cor_lumin_rads = cor(radius, lumin),
cor_lumin_mags = cor(mag, lumin),
cor_radius_mags = cor(radius, mag)
) |>
pivot_longer(
cols = contains("cor"),
values_to = "correlation",
names_to = "pairs"
) |>
mutate(
col1 = case_when(
str_detect(pairs, "_temp_") ~ "temp",
str_detect(pairs, "_lumin_") ~ "lumin",
str_detect(pairs, "_mag_") ~ "mag",
str_detect(pairs, "_radius_") ~ "radius"
)|>
factor(levels = c("temp", "lumin", "mag", "radius")),
col2 = case_when(
str_detect(pairs, "_temps") ~ "temp",
str_detect(pairs, "_lumins") ~ "lumin",
str_detect(pairs, "_mags") ~ "mag",
str_detect(pairs, "_rads") ~ "radius"
) |>
factor(levels = c("temp", "lumin", "mag", "radius")),
var1 = if_else(as.numeric(col1) <= as.numeric(col2), col1, col2),
var2 = if_else(as.numeric(col1) > as.numeric(col2), col1, col2)
)
# Calculating the correlation matrix plot per star type
xtabs(formula = correlation ~ var1 + var2 + type,
data = star_cors) |>
data.frame() |>
rename(correlation = Freq) |>
ggplot(mapping = aes(x = fct_rev(var1),
y = fct_rev(var2))) +
geom_tile(mapping = aes(fill = correlation),
color = "white",
show.legend = F) +
geom_text(data = star_cors,
mapping = aes(x = var1,
y = var2,
label = round(correlation, digits = 2)),
color = "black") +
geom_text(data = data.frame(var1 = c("temp", "lumin", "mag", "radius"),
var2 = c("temp", "lumin", "mag", "radius"),
variable = c("temp", "lumin", "mag", "radius")),
mapping = aes(label = variable)) +
scale_x_discrete(breaks = NULL) +
scale_y_discrete(breaks = NULL) +
facet_wrap(facets = ~ type) +
scale_fill_gradient2(limits = c(-1, 1)) +
labs(x = NULL,
y = NULL)
We probably should remove either temp
or
lumin
since the correlation in the main sequence stars is
0.99. Since lumin
has a higher correlation with the other
two variables overall, we’ll remove it
stars2 <-
stars2 |>
dplyr::select(-lumin)
stars2 |>
ggpairs(
columns = 1:3,
mapping = aes(color = type,
alpha = 0.75)
) +
theme_bw()
From the correlation plot and scatterplots, it doesn’t look like the
covariance matrices are equal, but let’s checking using Box’s M test
(box_m()
in rstatix
)
rstatix::box_m(
stars2 |> dplyr::select(where(is.numeric)),
group = stars2$type
)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <dbl> <chr>
## 1 663. 2.46e-120 30 Box's M-test for Homogeneity of Covariance Matr…
If we want to use a discriminant method to classify the star types, QDA would be more appropriate since we don’t have equal covariance matrices.
The qda()
function is in the MASS
library,
like the lda()
function. It has the same arguments as the
lda()
function.
qda_star <-
MASS::qda(
formula = type ~ .,
data = stars2,
priors = rep(1, k)/k
)
# Looking at the info we can extract from the qda object
names(qda_star)
## [1] "prior" "counts" "means" "scaling" "ldet" "lev" "N"
## [8] "call" "terms" "xlevels"
# What is scaling for QDA?
qda_star$scaling
## , , Brown Dwarf
##
## 1 2 3
## temp -19.8 0.691 -4.536
## radius 0.0 -10.230 -0.582
## mag 0.0 0.000 -0.847
##
## , , Red Dwarf
##
## 1 2 3
## temp -27.2 20.25 -6.864
## radius 0.0 -6.19 0.531
## mag 0.0 0.00 -0.718
##
## , , White Dwarf
##
## 1 2 3
## temp -6.3 0.848 9.83
## radius 0.0 -14.936 -1.65
## mag 0.0 0.000 1.44
##
## , , Main Sequence
##
## 1 2 3
## temp -3.35 -6.74 -8.76
## radius 0.00 6.04 -4.29
## mag 0.00 0.00 -1.17
##
## , , Supergiant
##
## 1 2 3
## temp 2.88 -0.116 0.6656
## radius 0.00 3.689 0.0947
## mag 0.00 0.000 -1.8336
##
## , , Hypergiant
##
## 1 2 3
## temp 2.5 -0.0768 0.750
## radius 0.0 11.4980 -1.847
## mag 0.0 0.0000 -0.726
Nothing important for what we are doing as they aren’t the quadratic equivalent of our discriminant functions from LDA
We’ll still use the predict()
function to make
predictions using the qda()
model. If you want to pull up
the help menu, you’ll want to call the function by the full name:
predict.qda
?MASS::predict.qda
## starting httpd help server ... done
# Adding $class to the end just keeps the predicted type of star
predict(object = qda_star,
newdata = stars2)$class |>
data.frame() |>
tibble()
## # A tibble: 240 × 1
## predict.object...qda_star..newdata...stars2..class
## <fct>
## 1 Brown Dwarf
## 2 Brown Dwarf
## 3 Brown Dwarf
## 4 Brown Dwarf
## 5 Brown Dwarf
## 6 Brown Dwarf
## 7 Brown Dwarf
## 8 Brown Dwarf
## 9 Brown Dwarf
## 10 Brown Dwarf
## # ℹ 230 more rows
# Adding $post to the end keeps the posterior probabilities: P(G_i|Y)
predict(object = qda_star,
newdata = stars2) |>
pluck("posterior") |>
round(digits = 3) |>
data.frame() |>
tibble()
## # A tibble: 240 × 6
## Brown.Dwarf Red.Dwarf White.Dwarf Main.Sequence Supergiant Hypergiant
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.853 0.147 0 0 0 0
## 2 0.984 0.016 0 0 0 0
## 3 1 0 0 0 0 0
## 4 0.973 0.027 0 0 0 0
## 5 1 0 0 0 0 0
## 6 0.999 0.001 0 0 0 0
## 7 0.999 0.001 0 0 0 0
## 8 1 0 0 0 0 0
## 9 1 0 0 0 0 0
## 10 0.976 0.024 0 0 0 0
## # ℹ 230 more rows
# Store both in a dataframe!
star_qda_df <-
data.frame(stars2,
predict(object = qda_star,
newdata = stars2))
head(star_qda_df)
## temp radius mag type class posterior.Brown.Dwarf
## 1 3.49 -0.770 16.1 Brown Dwarf Brown Dwarf 0.853
## 2 3.48 -0.812 16.6 Brown Dwarf Brown Dwarf 0.984
## 3 3.41 -0.991 18.7 Brown Dwarf Brown Dwarf 1.000
## 4 3.45 -0.796 16.6 Brown Dwarf Brown Dwarf 0.973
## 5 3.29 -0.987 20.1 Brown Dwarf Brown Dwarf 1.000
## 6 3.45 -0.959 17.0 Brown Dwarf Brown Dwarf 0.999
## posterior.Red.Dwarf posterior.White.Dwarf posterior.Main.Sequence
## 1 1.47e-01 8.03e-80 3.25e-19
## 2 1.63e-02 1.91e-74 2.68e-21
## 3 9.20e-06 2.13e-55 3.12e-27
## 4 2.74e-02 4.46e-77 6.01e-20
## 5 4.88e-10 1.64e-55 2.19e-25
## 6 6.97e-04 4.75e-59 1.52e-21
## posterior.Supergiant posterior.Hypergiant
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
We can view the results in a confusion matrix using
confusionMatrix()
in the caret
package:
star_qda_df |>
rename(predicted = class,
actual = type) |>
xtabs(formula = ~ predicted + actual) |>
confusionMatrix()
## Confusion Matrix and Statistics
##
## actual
## predicted Brown Dwarf Red Dwarf White Dwarf Main Sequence Supergiant
## Brown Dwarf 40 0 0 0 0
## Red Dwarf 0 40 0 0 0
## White Dwarf 0 0 40 0 0
## Main Sequence 0 0 0 40 2
## Supergiant 0 0 0 0 38
## Hypergiant 0 0 0 0 0
## actual
## predicted Hypergiant
## Brown Dwarf 0
## Red Dwarf 0
## White Dwarf 0
## Main Sequence 0
## Supergiant 0
## Hypergiant 40
##
## Overall Statistics
##
## Accuracy : 0.992
## 95% CI : (0.97, 0.999)
## No Information Rate : 0.167
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.99
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Brown Dwarf Class: Red Dwarf Class: White Dwarf
## Sensitivity 1.000 1.000 1.000
## Specificity 1.000 1.000 1.000
## Pos Pred Value 1.000 1.000 1.000
## Neg Pred Value 1.000 1.000 1.000
## Prevalence 0.167 0.167 0.167
## Detection Rate 0.167 0.167 0.167
## Detection Prevalence 0.167 0.167 0.167
## Balanced Accuracy 1.000 1.000 1.000
## Class: Main Sequence Class: Supergiant Class: Hypergiant
## Sensitivity 1.000 0.950 1.000
## Specificity 0.990 1.000 1.000
## Pos Pred Value 0.952 1.000 1.000
## Neg Pred Value 1.000 0.990 1.000
## Prevalence 0.167 0.167 0.167
## Detection Rate 0.167 0.158 0.167
## Detection Prevalence 0.175 0.158 0.167
## Balanced Accuracy 0.995 0.975 1.000
We can tell qda()
to use leave-one-out cross-validation
by including the CV = T
argument:
qda_star_cv <-
MASS::qda(type ~ .,
data = stars2,
CV = T)
Now it just returns what the predict function did:
Class =
Predicted class using CV Posterior =
Posterior probability using CV
table(predicted = qda_star_cv$class,
actual = stars$type) |>
confusionMatrix()
## Confusion Matrix and Statistics
##
## actual
## predicted Brown Dwarf Red Dwarf White Dwarf Main Sequence Supergiant
## Brown Dwarf 40 0 0 0 0
## Red Dwarf 0 40 0 0 0
## White Dwarf 0 0 40 0 0
## Main Sequence 0 0 0 40 2
## Supergiant 0 0 0 0 38
## Hypergiant 0 0 0 0 0
## actual
## predicted Hypergiant
## Brown Dwarf 0
## Red Dwarf 0
## White Dwarf 0
## Main Sequence 0
## Supergiant 0
## Hypergiant 40
##
## Overall Statistics
##
## Accuracy : 0.992
## 95% CI : (0.97, 0.999)
## No Information Rate : 0.167
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.99
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Brown Dwarf Class: Red Dwarf Class: White Dwarf
## Sensitivity 1.000 1.000 1.000
## Specificity 1.000 1.000 1.000
## Pos Pred Value 1.000 1.000 1.000
## Neg Pred Value 1.000 1.000 1.000
## Prevalence 0.167 0.167 0.167
## Detection Rate 0.167 0.167 0.167
## Detection Prevalence 0.167 0.167 0.167
## Balanced Accuracy 1.000 1.000 1.000
## Class: Main Sequence Class: Supergiant Class: Hypergiant
## Sensitivity 1.000 0.950 1.000
## Specificity 0.990 1.000 1.000
## Pos Pred Value 0.952 1.000 1.000
## Neg Pred Value 1.000 0.990 1.000
## Prevalence 0.167 0.167 0.167
## Detection Rate 0.167 0.158 0.167
## Detection Prevalence 0.175 0.158 0.167
## Balanced Accuracy 0.995 0.975 1.000