knitr::opts_chunk$set(echo = TRUE)
options(digits = 3)
## Load the libraries we will be using
pacman::p_load(tidyverse, magrittr, GGally, MVN)
## Changing the default theme to black white
theme_set(theme_bw())
theme_update(
axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
plot.title = element_text(size = 14, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
plot.caption = element_text(size = 8)
)
# Reading the R file that has the partial F test function:
source("Partial F Test function.R")
# Reading in the data:
nba <-
readxl::read_excel("NBA Players 2018.xlsx") |>
mutate(
True_shot = True_shot*100,
Pos = factor(Pos, levels = c("C", "PF", "SF", "SG", "PG"))
) |>
dplyr::select(-OffRebound, -DefRebound) |>
column_to_rownames(var = "Player")
Skimming the data and recording the number of numeric variables and groups
skimr::skim_without_charts(nba)
Name | nba |
Number of rows | 228 |
Number of columns | 8 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 7 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Pos | 0 | 1 | FALSE | 5 | SG: 57, PG: 50, C: 43, PF: 40 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
Efficiency | 0 | 1 | 15.54 | 4.68 | 6.4 | 12.00 | 14.2 | 18.42 | 30.9 |
True_shot | 0 | 1 | 56.38 | 4.33 | 46.4 | 53.68 | 56.2 | 58.90 | 69.2 |
TotRebound | 0 | 1 | 10.07 | 4.56 | 3.8 | 6.70 | 8.8 | 12.53 | 25.9 |
Assist | 0 | 1 | 15.80 | 9.43 | 4.0 | 8.07 | 12.4 | 21.32 | 46.5 |
Steal | 0 | 1 | 1.53 | 0.53 | 0.3 | 1.10 | 1.5 | 1.80 | 3.1 |
Block | 0 | 1 | 1.83 | 1.58 | 0.0 | 0.90 | 1.3 | 2.22 | 10.0 |
Turnover | 0 | 1 | 11.98 | 3.10 | 3.7 | 9.90 | 11.5 | 13.80 | 26.8 |
# Recording the sample size
N <- nrow(nba);
# The number of variables
p <-
nba |>
dplyr::select(where(is.numeric)) |>
ncol()
# and the number of groups
k <- n_distinct(nba$Pos)
nba |>
dplyr::select(where(is.numeric)) |>
ggcorr(
low = "darkred",
mid = "grey90",
high = "mediumblue",
label = T,
label_round = 2
)
# not separating by position
nba |>
ggpairs(
columns = 2:8,
mapping = aes(
color = Pos,
alpha = 0.5
)
)
Hard to read, let’s look at a couple of sets instead
# Examining by position: Efficiency, true shot percentage, rebounds, & assists
nba |>
ggpairs(
columns = 2:5,
mapping = aes(
color = Pos,
alpha = 0.5
)
)
# Efficiency, Steal, Block, Turnover
nba |>
ggpairs(
columns = c(2, 6:8),
mapping = aes(
color = Pos,
alpha=0.5
)
)
nba |>
pivot_longer(
cols = where(is.numeric),
names_to = "attribute",
values_to = "value"
) |>
ggplot(
mapping = aes(
x = value,
fill = Pos,
y = Pos
)
) +
geom_boxplot() +
facet_wrap(
facets = ~ attribute,
scales = "free_x",
ncol = 3
) +
labs(y = NULL, x = NULL) +
theme(legend.position = "none")
nba_man <-
manova(
cbind(Efficiency, True_shot, TotRebound, Assist, Steal, Block, Turnover) ~ Pos,
data = nba
)
summary(nba_man,
test = "Wilks")
## Df Wilks approx F num Df den Df Pr(>F)
## Pos 4 0.121 22.2 28 784 <2e-16 ***
## Residuals 223
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(nba_man,
test = "Pillai")
## Df Pillai approx F num Df den Df Pr(>F)
## Pos 4 1.21 13.6 28 880 <2e-16 ***
## Residuals 223
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mvn(
data = nba_man$residuals,
desc = F,
multivariatePlot = "qq",
univariateTest = "SW",
mvnTest = "mardia"
)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 333.757093403413 1.74144293702382e-31 NO
## 2 Mardia Kurtosis 11.0207598323385 0 NO
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk Efficiency 0.930 <0.001 NO
## 2 Shapiro-Wilk True_shot 0.996 0.8321 YES
## 3 Shapiro-Wilk TotRebound 0.964 <0.001 NO
## 4 Shapiro-Wilk Assist 0.926 <0.001 NO
## 5 Shapiro-Wilk Steal 0.974 3e-04 NO
## 6 Shapiro-Wilk Block 0.892 <0.001 NO
## 7 Shapiro-Wilk Turnover 0.965 <0.001 NO
rstatix::box_m(data = nba |>
dplyr::select(where(is.numeric)),
group = nba$Pos)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <dbl> <chr>
## 1 288. 1.71e-17 112 Box's M-test for Homogeneity of Covariance Matri…
Data aren’t MVN (which is ok for Discriminant Analysis), but the test for equal covariance matrices needs it
Box’s M test rejects Equal covariance matrices, but the groups do appear different for at least 1 of the variables according to our box plots from earlier :)
lda()
uses the opposite formula of MANOVA:
And we can use the .
notation in R to not have to list
each variable other than the grouping variable!
nba_lda <-
MASS::lda(
Pos ~ .,
data = nba
)
nba_lda
## Call:
## lda(Pos ~ ., data = nba)
##
## Prior probabilities of groups:
## C PF SF SG PG
## 0.189 0.175 0.167 0.250 0.219
##
## Group means:
## Efficiency True_shot TotRebound Assist Steal Block Turnover
## C 20.3 60.6 17.08 11.5 1.46 4.142 12.3
## PF 14.7 57.1 11.52 11.2 1.34 1.985 11.7
## SF 13.7 55.7 9.14 12.1 1.55 1.484 11.1
## SG 13.6 54.9 7.02 14.8 1.51 1.040 11.1
## PG 15.8 54.4 7.05 27.1 1.74 0.862 13.6
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4
## Efficiency -0.00918 -0.202 -0.4053 -0.20608
## True_shot 0.07751 0.100 0.1556 0.08010
## TotRebound 0.30102 0.141 0.3356 0.14695
## Assist -0.05090 0.216 0.1242 0.04711
## Steal -0.40292 -0.147 -0.3204 2.00166
## Block 0.41952 0.421 -0.3145 0.00273
## Turnover 0.03665 -0.126 -0.0958 -0.18150
##
## Proportion of trace:
## LD1 LD2 LD3 LD4
## 0.8744 0.1127 0.0106 0.0023
Finding the objects we can get from lda
names(nba_lda)
## [1] "prior" "counts" "means" "scaling" "lev" "svd" "N"
## [8] "call" "terms" "xlevels"
The discriminant functions will be under “scaling”
nba_lda |>
pluck("scaling") |>
data.frame() |>
round(digits = 3)
## LD1 LD2 LD3 LD4
## Efficiency -0.009 -0.202 -0.405 -0.206
## True_shot 0.078 0.100 0.156 0.080
## TotRebound 0.301 0.141 0.336 0.147
## Assist -0.051 0.216 0.124 0.047
## Steal -0.403 -0.147 -0.320 2.002
## Block 0.420 0.421 -0.315 0.003
## Turnover 0.037 -0.126 -0.096 -0.181
The “standard deviation for each LD”
nba_lda$svd
## [1] 15.095 5.420 1.662 0.768
We can square them to find the proportion of groups differences explained by each LD
LD_per <-
nba_lda$svd^2/sum(nba_lda$svd^2) |>
round(3)
LD_per
## [1] 0.87442 0.11272 0.01060 0.00226
LDA is scale invariant, meaning we don’t need to rescale the data beforehand.
But if we want to be able to use the discriminant functions to rank the variables or interpret what the discriminants are in context of the original variables, we need to rescale the data.
So let’s see what happens if we rescale them first. Make sure we use the Pooled Covariance Matrix to rescale the data!
# Calculating the pooled standard deviation from the common Error Matrix from MANOVA
nba_man |>
summary() |>
pluck("SS") |>
pluck("Residuals") |>
diag() |>
sqrt() |>
divide_by(sqrt(N-k)) ->
sd_nba
sd_nba
## Efficiency True_shot TotRebound Assist Steal Block Turnover
## 4.031 3.745 2.607 7.218 0.522 1.063 2.974
Now we need to divide each variable by the pooled sd:
nba_sc <-
nba |>
dplyr::select(where(is.numeric)) |>
scale(center = T, scale = sd_nba) |>
data.frame() |>
mutate(Pos = nba$Pos)
nba_lda_sc <-
MASS::lda(
Pos ~ .,
data = nba_sc
)
nba_lda_sc
## Call:
## lda(Pos ~ ., data = nba_sc)
##
## Prior probabilities of groups:
## C PF SF SG PG
## 0.189 0.175 0.167 0.250 0.219
##
## Group means:
## Efficiency True_shot TotRebound Assist Steal Block Turnover
## C 1.1710 1.135 2.689 -0.591 -0.1384 2.179 0.1027
## PF -0.2119 0.180 0.558 -0.634 -0.3507 0.150 -0.0842
## SF -0.4542 -0.191 -0.353 -0.511 0.0420 -0.321 -0.2810
## SG -0.4906 -0.400 -1.168 -0.139 -0.0403 -0.739 -0.3087
## PG 0.0669 -0.519 -1.158 1.563 0.4136 -0.907 0.5445
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4
## Efficiency -0.037 -0.8146 -1.634 -0.8308
## True_shot 0.290 0.3758 0.583 0.3000
## TotRebound 0.785 0.3676 0.875 0.3831
## Assist -0.367 1.5563 0.897 0.3400
## Steal -0.210 -0.0766 -0.167 1.0450
## Block 0.446 0.4472 -0.334 0.0029
## Turnover 0.109 -0.3749 -0.285 -0.5397
##
## Proportion of trace:
## LD1 LD2 LD3 LD4
## 0.8744 0.1127 0.0106 0.0023
The numbers in the Discriminant functions change, but the results will be exactly the same! See the “Proportion of trace” (eigenvalues)
# Scaled discriminant functions
nba_lda_sc |>
pluck("scaling") |>
data.frame() |>
dplyr::select(1) |>
# Dividing by the unscaled discriminant functions
divide_by(
nba_lda |>
pluck("scaling") |>
data.frame() |>
dplyr::select(1)
)
## LD1
## Efficiency 4.031
## True_shot 3.745
## TotRebound 2.607
## Assist 7.218
## Steal 0.522
## Block 1.063
## Turnover 2.974
# The difference between scaled and unscaled should just be
# the sd of the variable:
data.frame(sd_nba)
## sd_nba
## Efficiency 4.031
## True_shot 3.745
## TotRebound 2.607
## Assist 7.218
## Steal 0.522
## Block 1.063
## Turnover 2.974
And the values of the discriminants will be the same as well!
data.frame(
unscaled_lda = predict(nba_lda) |>
pluck("x") |>
data.frame() |>
dplyr::select(1),
scaled_lda = predict(nba_lda_sc) |>
pluck("x") |>
data.frame() |>
dplyr::select(1)
) |>
head(n = 10)
## LD1 LD1.1
## Aaron Gordon 0.42815 0.42815
## Al Horford 1.61895 1.61895
## Alex Len 2.88816 2.88816
## Al-Farouq Aminu 1.50236 1.50236
## Alfonzo McKinnie 1.52425 1.52425
## Allen Crabbe -0.95323 -0.95323
## Allonzo Trier -0.75072 -0.75072
## Andre Drummond 5.61651 5.61651
## Andre Iguodala -0.00399 -0.00399
## Andrew Wiggins -1.17453 -1.17453
Instead of rescaling the data then calculating the linear discriminant functions, we can just rescale the LDs directly!
# Manually rescaling the discriminant functions
stan_nba_lda <- nba_lda$scaling*sd_nba
# Comparing the results:
cbind(nba_lda_sc$scaling[,1],stan_nba_lda[,1])
## [,1] [,2]
## Efficiency -0.037 -0.037
## True_shot 0.290 0.290
## TotRebound 0.785 0.785
## Assist -0.367 -0.367
## Steal -0.210 -0.210
## Block 0.446 0.446
## Turnover 0.109 0.109
Same result!
We can use a partial F test to determine which variable does the least to describe the difference between the groups when the other variables are already included
Need 2 inputs to use the Partial_F function:
Y <- with(nba,cbind(Efficiency, True_shot, TotRebound,
Assist, Steal, Block, Turnover))
x <- nba$Pos
# Using the function:
Partial_F(Y=Y,x=x)
## # A tibble: 7 × 4
## variables F_lam F_stat p_val
## <chr> <dbl> <dbl> <dbl>
## 1 Turnover 0.976 1.33 0.259
## 2 Steal 0.960 2.27 0.0631
## 3 Efficiency 0.943 3.30 0.012
## 4 True_shot 0.935 3.74 0.0057
## 5 Assist 0.807 13.0 0
## 6 Block 0.805 13.2 0
## 7 TotRebound 0.700 23.2 0
Seems we don’t need turnover when all others are included
Let’s exclude it and rerun it again
Partial_F(
Y = nba |>
dplyr::select(Efficiency, True_shot, TotRebound, Assist, Steal, Block),
x = nba$Pos
)
## # A tibble: 6 × 4
## variables F_lam F_stat p_val
## <chr> <dbl> <dbl> <dbl>
## 1 Steal 0.960 2.25 0.0649
## 2 Efficiency 0.951 2.83 0.0256
## 3 True_shot 0.932 3.97 0.004
## 4 Block 0.797 13.9 0
## 5 Assist 0.745 18.6 0
## 6 TotRebound 0.584 38.8 0
Now it looks like steal is unnecessary
Partial_F(
Y = nba |>
dplyr::select(Efficiency, True_shot, TotRebound, Assist, Block),
x = nba$Pos
)
## # A tibble: 5 × 4
## variables F_lam F_stat p_val
## <chr> <dbl> <dbl> <dbl>
## 1 Efficiency 0.948 3.00 0.0193
## 2 True_shot 0.912 5.26 0.0005
## 3 Block 0.813 12.6 0
## 4 Assist 0.737 19.5 0
## 5 TotRebound 0.592 37.7 0
The remaining 5 variables are all significant at a 5% significance level
Let’s compare the rank via LDA vs Partial F test
# Scaled LDA in order of LD1 importance
MASS::lda(
Pos ~ . - Turnover - Steal,
data = nba_sc
) |>
pluck("scaling") |>
data.frame() |>
arrange(desc(abs(LD1)))
## LD1 LD2 LD3 LD4
## TotRebound -0.821 0.198 -0.786 -0.1507
## Block -0.432 0.385 0.392 -0.3104
## True_shot -0.371 0.276 -0.562 0.9366
## Assist 0.312 1.168 -0.642 0.0165
## Efficiency 0.166 -0.434 1.428 0.0233
# Partial F, naturally ordered by importance
Partial_F(
Y = nba |>
dplyr::select(Efficiency, True_shot, TotRebound, Assist, Block),
x = nba$Pos
)
## # A tibble: 5 × 4
## variables F_lam F_stat p_val
## <chr> <dbl> <dbl> <dbl>
## 1 Efficiency 0.948 3.00 0.0193
## 2 True_shot 0.912 5.26 0.0005
## 3 Block 0.813 12.6 0
## 4 Assist 0.737 19.5 0
## 5 TotRebound 0.592 37.7 0
If we plot the lda object, it will create a scatterplot matrix of the discriminant functions
nba_lda2 <-
MASS::lda(
Pos ~ . - Turnover - Steal,
data = nba
)
plot(nba_lda2)
They don’t look great
So how do we improve it?
We can use the predict function to get the values of the discriminants!
?predict.lda
It will give us:
Let’s create a dataframe with the actual position and the discriminant functions
nba2 <-
data.frame(
nba,
predict(nba_lda2)$x
) |>
dplyr::select(-Turnover,-Steal)
head(nba2)
## Pos Efficiency True_shot TotRebound Assist Block LD1 LD2
## Aaron Gordon PF 15.1 53.8 11.7 16.6 1.8 -0.231 0.102
## Al Horford C 20.2 60.5 12.4 21.2 3.9 -1.561 1.605
## Alex Len C 17.2 57.5 14.4 8.6 3.9 -2.561 -0.181
## Al-Farouq Aminu PF 13.2 56.8 14.2 6.0 1.2 -1.609 -1.216
## Alfonzo McKinnie SF 11.2 56.9 13.1 4.0 1.3 -1.482 -1.364
## Allen Crabbe SG 7.7 51.9 6.9 5.8 0.9 1.063 -1.680
## LD3 LD4
## Aaron Gordon -0.343 -0.7335
## Al Horford 0.612 0.3284
## Alex Len 0.517 -0.5837
## Al-Farouq Aminu -1.499 0.0122
## Alfonzo McKinnie -1.676 0.0554
## Allen Crabbe -0.603 -0.7361
Now we can plot the results!
# Creating a vector with the percentage of group seperation explained
ld_sep_pct <- round(nba_lda2$svd^2/sum(nba_lda2$svd^2)*100,
digits = 1)
gg_lda_scatter <-
nba2 |>
ggplot(mapping = aes(x = LD1,
y = LD2,
color = Pos)) +
theme(legend.position = "bottom") +
labs(x = paste0("LD1 (Percent Explained: ", ld_sep_pct[1], "%)"),
y = paste0("LD2 (Percent Explained: ", ld_sep_pct[2], "%)"))
gg_lda_scatter +
geom_point(size = 1.5,
alpha = 0.75)
We can even add an ellipse around each group!
gg_lda_scatter +
geom_point(
size = 1.5,
alpha = 0.75
) +
stat_ellipse(
linetype = "dashed",
linewidth = 1,
show.legend = F
)
nba2 |>
dplyr::select(Pos,LD1,LD2) |>
ggpairs(
columns=2:3,
mapping = aes(
color = Pos,
alpha=0.50
)
)
We can use a custom function (no warantee) to create a few different graphs:
# source('lda plot creator function.R')
# gg_lda(df_ld = nba |> dplyr::select(-Steal, -Turnover),
# ld_formula = Pos ~ .,
# q = 4,
# ellipse = T)