Chapter 8: Discriminant Analysis

Set Up Your Project and Load Libraries

knitr::opts_chunk$set(echo = TRUE) 
options(digits = 3)

## Load the libraries we will be using
pacman::p_load(tidyverse, magrittr, GGally, MVN)

## Changing the default theme to black white
theme_set(theme_bw()) 
theme_update(
  axis.title = element_text(size = 12),
  axis.text = element_text(size = 10),
  plot.title = element_text(size = 14, hjust = 0.5),
  plot.subtitle = element_text(size = 10, hjust = 0.5),
  plot.caption = element_text(size = 8)
)

# Reading the R file that has the partial F test function:
source("Partial F Test function.R")

# Reading in the data:
nba <- 
  readxl::read_excel("NBA Players 2018.xlsx") |>
  mutate(
    True_shot = True_shot*100,
    Pos = factor(Pos, levels = c("C", "PF", "SF", "SG", "PG"))
  ) |> 
  dplyr::select(-OffRebound, -DefRebound) |>
  column_to_rownames(var = "Player")

Skimming the data and recording the number of numeric variables and groups

skimr::skim_without_charts(nba)

Data summary
Name	nba
Number of rows	228
Number of columns	8
_______________________
Column type frequency:
factor	1
numeric	7
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
Pos	0	1	FALSE	5	SG: 57, PG: 50, C: 43, PF: 40

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
Efficiency	1	15.54	4.68	6.4	12.00	14.2	18.42	30.9
True_shot	1	56.38	4.33	46.4	53.68	56.2	58.90	69.2
TotRebound	1	10.07	4.56	3.8	6.70	8.8	12.53	25.9
Assist	1	15.80	9.43	4.0	8.07	12.4	21.32	46.5
Steal	1	1.53	0.53	0.3	1.10	1.5	1.80	3.1
Block	1	1.83	1.58	0.0	0.90	1.3	2.22	10.0
Turnover	1	11.98	3.10	3.7	9.90	11.5	13.80	26.8

# Recording the sample size
N <- nrow(nba); 

# The number of variables
p <- 
  nba |> 
  dplyr::select(where(is.numeric)) |> 
  ncol()

# and the number of groups 
k <- n_distinct(nba$Pos)

Preliminary Analysis: Graphs

Correlation plot

nba |> 
  dplyr::select(where(is.numeric)) |> 
  ggcorr(
    low = "darkred",
    mid = "grey90",
    high = "mediumblue",
    label = T,
    label_round = 2
  )

Scatterplot Matrix

# not separating by position
nba |> 
  ggpairs(
    columns = 2:8, 
    mapping = aes(
      color = Pos,
      alpha = 0.5
    )
  )

Hard to read, let’s look at a couple of sets instead

# Examining by position: Efficiency, true shot percentage, rebounds, & assists
nba |> 
  ggpairs(
    columns = 2:5,
    mapping = aes(
      color = Pos, 
      alpha = 0.5
    )
  )

# Efficiency, Steal, Block, Turnover
nba |> 
  ggpairs(
    columns = c(2, 6:8), 
    mapping = aes(
      color = Pos, 
      alpha=0.5
    )
  )

Side-by-side boxplots:

nba |> 
  pivot_longer(
    cols = where(is.numeric),
    names_to = "attribute",
    values_to = "value"
  ) |> 
  
  ggplot(
    mapping = aes(
      x = value,
      fill = Pos,
      y = Pos
    )
  ) + 
  
  geom_boxplot() + 
  
  facet_wrap(
    facets = ~ attribute,
    scales = "free_x",
    ncol = 3
  ) +
  labs(y = NULL, x = NULL) +
  theme(legend.position = "none")

Preliminary Check: MANOVA

MANOVA Test

nba_man <- 
  manova(
    cbind(Efficiency, True_shot, TotRebound, Assist, Steal, Block, Turnover) ~ Pos,
    data = nba
  )

summary(nba_man, 
        test = "Wilks")

##            Df Wilks approx F num Df den Df Pr(>F)    
## Pos         4 0.121     22.2     28    784 <2e-16 ***
## Residuals 223                                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(nba_man, 
        test = "Pillai")

##            Df Pillai approx F num Df den Df Pr(>F)    
## Pos         4   1.21     13.6     28    880 <2e-16 ***
## Residuals 223                                         
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Multivariate Normality Check for Residuals

mvn(
  data = nba_man$residuals, 
  desc = F, 
  multivariatePlot = "qq",
  univariateTest = "SW",
  mvnTest = "mardia"
)

## $multivariateNormality
##              Test        Statistic              p value Result
## 1 Mardia Skewness 333.757093403413 1.74144293702382e-31     NO
## 2 Mardia Kurtosis 11.0207598323385                    0     NO
## 3             MVN             <NA>                 <NA>     NO
## 
## $univariateNormality
##           Test   Variable Statistic   p value Normality
## 1 Shapiro-Wilk Efficiency     0.930  <0.001      NO    
## 2 Shapiro-Wilk True_shot      0.996  0.8321      YES   
## 3 Shapiro-Wilk TotRebound     0.964  <0.001      NO    
## 4 Shapiro-Wilk   Assist       0.926  <0.001      NO    
## 5 Shapiro-Wilk   Steal        0.974   3e-04      NO    
## 6 Shapiro-Wilk   Block        0.892  <0.001      NO    
## 7 Shapiro-Wilk  Turnover      0.965  <0.001      NO

Equal Covariance Matrix

rstatix::box_m(data = nba |> 
             dplyr::select(where(is.numeric)),
      group = nba$Pos)

## # A tibble: 1 × 4
##   statistic  p.value parameter method                                           
##       <dbl>    <dbl>     <dbl> <chr>                                            
## 1      288. 1.71e-17       112 Box's M-test for Homogeneity of Covariance Matri…

Data aren’t MVN (which is ok for Discriminant Analysis), but the test for equal covariance matrices needs it

Box’s M test rejects Equal covariance matrices, but the groups do appear different for at least 1 of the variables according to our box plots from earlier :)

Discriminant Analysis

lda() uses the opposite formula of MANOVA:

LHS = Groups
RHS = Variables

And we can use the . notation in R to not have to list each variable other than the grouping variable!

nba_lda <- 
  MASS::lda(
    Pos ~ ., 
    data = nba
  )

nba_lda

## Call:
## lda(Pos ~ ., data = nba)
## 
## Prior probabilities of groups:
##     C    PF    SF    SG    PG 
## 0.189 0.175 0.167 0.250 0.219 
## 
## Group means:
##    Efficiency True_shot TotRebound Assist Steal Block Turnover
## C        20.3      60.6      17.08   11.5  1.46 4.142     12.3
## PF       14.7      57.1      11.52   11.2  1.34 1.985     11.7
## SF       13.7      55.7       9.14   12.1  1.55 1.484     11.1
## SG       13.6      54.9       7.02   14.8  1.51 1.040     11.1
## PG       15.8      54.4       7.05   27.1  1.74 0.862     13.6
## 
## Coefficients of linear discriminants:
##                 LD1    LD2     LD3      LD4
## Efficiency -0.00918 -0.202 -0.4053 -0.20608
## True_shot   0.07751  0.100  0.1556  0.08010
## TotRebound  0.30102  0.141  0.3356  0.14695
## Assist     -0.05090  0.216  0.1242  0.04711
## Steal      -0.40292 -0.147 -0.3204  2.00166
## Block       0.41952  0.421 -0.3145  0.00273
## Turnover    0.03665 -0.126 -0.0958 -0.18150
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4 
## 0.8744 0.1127 0.0106 0.0023

Finding the objects we can get from lda

names(nba_lda)

##  [1] "prior"   "counts"  "means"   "scaling" "lev"     "svd"     "N"      
##  [8] "call"    "terms"   "xlevels"

The discriminant functions will be under “scaling”

nba_lda |> 
  pluck("scaling") |> 
  data.frame() |> 
  round(digits = 3)

##               LD1    LD2    LD3    LD4
## Efficiency -0.009 -0.202 -0.405 -0.206
## True_shot   0.078  0.100  0.156  0.080
## TotRebound  0.301  0.141  0.336  0.147
## Assist     -0.051  0.216  0.124  0.047
## Steal      -0.403 -0.147 -0.320  2.002
## Block       0.420  0.421 -0.315  0.003
## Turnover    0.037 -0.126 -0.096 -0.181

The “standard deviation for each LD”

nba_lda$svd

## [1] 15.095  5.420  1.662  0.768

We can square them to find the proportion of groups differences explained by each LD

LD_per <- 
  nba_lda$svd^2/sum(nba_lda$svd^2) |> 
  round(3)

LD_per

## [1] 0.87442 0.11272 0.01060 0.00226

LDA is scale invariant, meaning we don’t need to rescale the data beforehand.

But if we want to be able to use the discriminant functions to rank the variables or interpret what the discriminants are in context of the original variables, we need to rescale the data.

So let’s see what happens if we rescale them first. Make sure we use the Pooled Covariance Matrix to rescale the data!

# Calculating the pooled standard deviation from the common Error Matrix from MANOVA
nba_man |> 
  summary() |> 
  pluck("SS") |> 
  pluck("Residuals") |> 
  diag() |> 
  sqrt() |> 
  divide_by(sqrt(N-k)) ->
  sd_nba

sd_nba

## Efficiency  True_shot TotRebound     Assist      Steal      Block   Turnover 
##      4.031      3.745      2.607      7.218      0.522      1.063      2.974

Now we need to divide each variable by the pooled sd:

nba_sc <- 
  nba |> 
  dplyr::select(where(is.numeric)) |> 
  scale(center = T, scale = sd_nba) |> 
  data.frame() |> 
  mutate(Pos = nba$Pos)


nba_lda_sc <- 
  MASS::lda(
    Pos ~ ., 
    data = nba_sc
  )

nba_lda_sc

## Call:
## lda(Pos ~ ., data = nba_sc)
## 
## Prior probabilities of groups:
##     C    PF    SF    SG    PG 
## 0.189 0.175 0.167 0.250 0.219 
## 
## Group means:
##    Efficiency True_shot TotRebound Assist   Steal  Block Turnover
## C      1.1710     1.135      2.689 -0.591 -0.1384  2.179   0.1027
## PF    -0.2119     0.180      0.558 -0.634 -0.3507  0.150  -0.0842
## SF    -0.4542    -0.191     -0.353 -0.511  0.0420 -0.321  -0.2810
## SG    -0.4906    -0.400     -1.168 -0.139 -0.0403 -0.739  -0.3087
## PG     0.0669    -0.519     -1.158  1.563  0.4136 -0.907   0.5445
## 
## Coefficients of linear discriminants:
##               LD1     LD2    LD3     LD4
## Efficiency -0.037 -0.8146 -1.634 -0.8308
## True_shot   0.290  0.3758  0.583  0.3000
## TotRebound  0.785  0.3676  0.875  0.3831
## Assist     -0.367  1.5563  0.897  0.3400
## Steal      -0.210 -0.0766 -0.167  1.0450
## Block       0.446  0.4472 -0.334  0.0029
## Turnover    0.109 -0.3749 -0.285 -0.5397
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4 
## 0.8744 0.1127 0.0106 0.0023

The numbers in the Discriminant functions change, but the results will be exactly the same! See the “Proportion of trace” (eigenvalues)

# Scaled discriminant functions
nba_lda_sc |> 
  pluck("scaling") |> 
  data.frame() |> 
  dplyr::select(1) |> 
# Dividing by the unscaled discriminant functions
  divide_by(
    nba_lda |> 
      pluck("scaling") |> 
      data.frame() |> 
      dplyr::select(1)
  )

##              LD1
## Efficiency 4.031
## True_shot  3.745
## TotRebound 2.607
## Assist     7.218
## Steal      0.522
## Block      1.063
## Turnover   2.974

# The difference between scaled and unscaled should just be
# the sd of the variable:
data.frame(sd_nba)

##            sd_nba
## Efficiency  4.031
## True_shot   3.745
## TotRebound  2.607
## Assist      7.218
## Steal       0.522
## Block       1.063
## Turnover    2.974

And the values of the discriminants will be the same as well!

data.frame(
  unscaled_lda = predict(nba_lda) |> 
    pluck("x") |>
    data.frame() |> 
    dplyr::select(1),
  
  scaled_lda = predict(nba_lda_sc) |> 
    pluck("x") |>
    data.frame() |> 
    dplyr::select(1)
  
  ) |> 
  head(n = 10)

##                       LD1    LD1.1
## Aaron Gordon      0.42815  0.42815
## Al Horford        1.61895  1.61895
## Alex Len          2.88816  2.88816
## Al-Farouq Aminu   1.50236  1.50236
## Alfonzo McKinnie  1.52425  1.52425
## Allen Crabbe     -0.95323 -0.95323
## Allonzo Trier    -0.75072 -0.75072
## Andre Drummond    5.61651  5.61651
## Andre Iguodala   -0.00399 -0.00399
## Andrew Wiggins   -1.17453 -1.17453

Standardized discriminant function

Instead of rescaling the data then calculating the linear discriminant functions, we can just rescale the LDs directly!

# Manually rescaling the discriminant functions
stan_nba_lda <- nba_lda$scaling*sd_nba

# Comparing the results:
cbind(nba_lda_sc$scaling[,1],stan_nba_lda[,1])

##              [,1]   [,2]
## Efficiency -0.037 -0.037
## True_shot   0.290  0.290
## TotRebound  0.785  0.785
## Assist     -0.367 -0.367
## Steal      -0.210 -0.210
## Block       0.446  0.446
## Turnover    0.109  0.109

Same result!

Partial F Tests

We can use a partial F test to determine which variable does the least to describe the difference between the groups when the other variables are already included

Need 2 inputs to use the Partial_F function:

Y = Response variables
x = Grouping variable

Y <- with(nba,cbind(Efficiency, True_shot, TotRebound,
                    Assist, Steal, Block, Turnover))

x <- nba$Pos


# Using the function:
Partial_F(Y=Y,x=x)

## # A tibble: 7 × 4
##   variables  F_lam F_stat  p_val
##   <chr>      <dbl>  <dbl>  <dbl>
## 1 Turnover   0.976   1.33 0.259 
## 2 Steal      0.960   2.27 0.0631
## 3 Efficiency 0.943   3.30 0.012 
## 4 True_shot  0.935   3.74 0.0057
## 5 Assist     0.807  13.0  0     
## 6 Block      0.805  13.2  0     
## 7 TotRebound 0.700  23.2  0

Seems we don’t need turnover when all others are included

Let’s exclude it and rerun it again

Partial_F(
  Y = nba |> 
    dplyr::select(Efficiency, True_shot, TotRebound, Assist, Steal, Block), 
  x = nba$Pos
)

## # A tibble: 6 × 4
##   variables  F_lam F_stat  p_val
##   <chr>      <dbl>  <dbl>  <dbl>
## 1 Steal      0.960   2.25 0.0649
## 2 Efficiency 0.951   2.83 0.0256
## 3 True_shot  0.932   3.97 0.004 
## 4 Block      0.797  13.9  0     
## 5 Assist     0.745  18.6  0     
## 6 TotRebound 0.584  38.8  0

Now it looks like steal is unnecessary

Partial_F(
  Y = nba |> 
    dplyr::select(Efficiency, True_shot, TotRebound, Assist, Block), 
  x = nba$Pos
)

## # A tibble: 5 × 4
##   variables  F_lam F_stat  p_val
##   <chr>      <dbl>  <dbl>  <dbl>
## 1 Efficiency 0.948   3.00 0.0193
## 2 True_shot  0.912   5.26 0.0005
## 3 Block      0.813  12.6  0     
## 4 Assist     0.737  19.5  0     
## 5 TotRebound 0.592  37.7  0

The remaining 5 variables are all significant at a 5% significance level

Let’s compare the rank via LDA vs Partial F test

# Scaled LDA in order of LD1 importance
MASS::lda(
  Pos ~ . - Turnover - Steal, 
  data = nba_sc
) |> 
  pluck("scaling") |> 
  data.frame() |> 
  arrange(desc(abs(LD1)))

##               LD1    LD2    LD3     LD4
## TotRebound -0.821  0.198 -0.786 -0.1507
## Block      -0.432  0.385  0.392 -0.3104
## True_shot  -0.371  0.276 -0.562  0.9366
## Assist      0.312  1.168 -0.642  0.0165
## Efficiency  0.166 -0.434  1.428  0.0233

# Partial F, naturally ordered by importance
Partial_F(
  Y = nba |> 
    dplyr::select(Efficiency, True_shot, TotRebound, Assist, Block), 
  x = nba$Pos
)

## # A tibble: 5 × 4
##   variables  F_lam F_stat  p_val
##   <chr>      <dbl>  <dbl>  <dbl>
## 1 Efficiency 0.948   3.00 0.0193
## 2 True_shot  0.912   5.26 0.0005
## 3 Block      0.813  12.6  0     
## 4 Assist     0.737  19.5  0     
## 5 TotRebound 0.592  37.7  0

Plotting the maximal group separation

If we plot the lda object, it will create a scatterplot matrix of the discriminant functions

nba_lda2 <- 
  MASS::lda(
    Pos ~ . - Turnover - Steal, 
    data = nba
  )

plot(nba_lda2)

They don’t look great

So how do we improve it?

Calculating the new discriminants

We can use the predict function to get the values of the discriminants!

?predict.lda

It will give us:

Class = The group (position) each player would be classified into (next chapter)
Posterior = The probability that the player is a certain position
x = The value of the discriminants for each player

Let’s create a dataframe with the actual position and the discriminant functions

nba2 <- 
  data.frame(
    nba, 
    predict(nba_lda2)$x
  ) |>
  dplyr::select(-Turnover,-Steal)

head(nba2)

##                  Pos Efficiency True_shot TotRebound Assist Block    LD1    LD2
## Aaron Gordon      PF       15.1      53.8       11.7   16.6   1.8 -0.231  0.102
## Al Horford         C       20.2      60.5       12.4   21.2   3.9 -1.561  1.605
## Alex Len           C       17.2      57.5       14.4    8.6   3.9 -2.561 -0.181
## Al-Farouq Aminu   PF       13.2      56.8       14.2    6.0   1.2 -1.609 -1.216
## Alfonzo McKinnie  SF       11.2      56.9       13.1    4.0   1.3 -1.482 -1.364
## Allen Crabbe      SG        7.7      51.9        6.9    5.8   0.9  1.063 -1.680
##                     LD3     LD4
## Aaron Gordon     -0.343 -0.7335
## Al Horford        0.612  0.3284
## Alex Len          0.517 -0.5837
## Al-Farouq Aminu  -1.499  0.0122
## Alfonzo McKinnie -1.676  0.0554
## Allen Crabbe     -0.603 -0.7361

Now we can plot the results!

# Creating a vector with the percentage of group seperation explained
ld_sep_pct <- round(nba_lda2$svd^2/sum(nba_lda2$svd^2)*100,
                    digits = 1)

gg_lda_scatter <- 
  nba2 |> 
  ggplot(mapping = aes(x = LD1, 
                       y = LD2, 
                       color = Pos)) + 
  
  theme(legend.position = "bottom") + 
  
  labs(x = paste0("LD1 (Percent Explained: ", ld_sep_pct[1], "%)"),
       y = paste0("LD2 (Percent Explained: ", ld_sep_pct[2], "%)"))

gg_lda_scatter + 
  geom_point(size = 1.5, 
             alpha = 0.75)

We can even add an ellipse around each group!

gg_lda_scatter + 
  
  geom_point(
    size = 1.5, 
             alpha = 0.75
    ) +
  
  stat_ellipse(
    linetype = "dashed",
               linewidth = 1, 
               show.legend = F
    )

nba2 |> 
  dplyr::select(Pos,LD1,LD2) |>
  ggpairs(
    columns=2:3, 
    mapping = aes(
      color = Pos, 
      alpha=0.50
    )
  )

We can use a custom function (no warantee) to create a few different graphs:

# source('lda plot creator function.R')
# gg_lda(df_ld = nba |> dplyr::select(-Steal, -Turnover),
#        ld_formula = Pos ~ ., 
#        q = 4, 
#        ellipse = T)