Chapters 6 & 8 Practice: Skulls

knitr::opts_chunk$set(echo = TRUE,
                      fig.width = 10, 
                      fig.height = 6, 
                      fig.align = "center")


# Load the needed package(s) below:
pacman::p_load(heplots, readxl, tidyverse, 
               magrittr, MVN, rstatix, broom)


# Change the default theme below:
theme_set(theme_bw())
theme_update(
  axis.title = element_text(size = 12),
  axis.text = element_text(size = 10),
  plot.title = element_text(size = 16, hjust = 0.5),
  plot.subtitle = element_text(size = 10, hjust = 0.5),
  plot.caption = element_text(size = 8)
)

# Changing the default number of digits to print to 3
options(digits=3)

Data Description:

The data set Skulls contains data collected on 150 Egyptian skulls from 5 different epochs (eras).

The five variables measured for each skull are

epoch: The epoch the skull is believed to be from
breadth (mb): Skull width
height (bh): Skull height
length (bl): Skull length (front to back)
nasal height (nh): Height of the nose cavity

For more details, type ?Skulls in the console area to bring up the help menu for the data set.

If the skull measurements change over time, it would indicate the native Egyptians mingled with foreign populations.

# Adding the skulls data set to the global environment
Skulls <- 
  heplots::Skulls |> 
  rename(breadth = mb,
         height = bh,
         length = bl,
         nasal_height = nh) 

unique(Skulls$epoch)

## [1] c4000BC c3300BC c1850BC c200BC  cAD150 
## Levels: c4000BC < c3300BC < c1850BC < c200BC < cAD150

# And skimming the data set to get a quick overview:
skimr::skim(Skulls)

Data summary
Name	Skulls
Number of rows	150
Number of columns	5
_______________________
Column type frequency:
factor	1
numeric	4
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
epoch	0	1	TRUE	5	c40: 30, c33: 30, c18: 30, c20: 30

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
breadth	1	134.0	4.89	119	131	134	137	148	▁▃▇▅▁
height	1	132.6	4.94	120	129	133	136	145	▂▅▇▅▁
length	1	96.5	5.38	81	93	96	100	114	▁▆▇▃▁
nasal_height	1	50.9	3.21	44	49	51	53	60	▃▇▇▅▁

## Calculating the basic information for the data
# Number of rows
N <- nrow(Skulls) 

# Number of columns
p <- Skulls |> 
     dplyr::select(where(is.numeric)) |> 
     ncol()

# Number of epochs
k <- n_distinct(Skulls$epoch)

# Number of skulls per epoch
n <- Skulls |> 
  count(epoch) |> 
  dplyr::select(n) |> 
  unlist()

epoch_means <- 
  Skulls |> 
  group_by(epoch) |> 
  summarize(across(.cols = breadth:nasal_height,
                   .fns = mean))

epoch_means

## # A tibble: 5 × 5
##   epoch   breadth height length nasal_height
##   <ord>     <dbl>  <dbl>  <dbl>        <dbl>
## 1 c4000BC    131.   134.   99.2         50.5
## 2 c3300BC    132.   133.   99.1         50.2
## 3 c1850BC    134.   134.   96.0         50.6
## 4 c200BC     136.   132.   94.5         52.0
## 5 cAD150     136.   130.   93.5         51.4

Question 1: What are the null and alternative hypotheses?

Write the null and alternative hypotheses in words and in context of the data

Null: All four skull measurements are the same across all 5 epochs or All four skull measurements did not change across the 5 epochs

Alternative: At least one skull measurement is different for at least one of the five epochs

Question 2: Side-by-side Boxplots

Create a group of side-by-side boxplots for each of the 4 different skull measurements by epoch. Arrange them in a way that compares each individual measurement across epochs the easiest.

Skulls |> 
  
  pivot_longer(
    cols = where(is.numeric),
               names_to = "Feature",
               values_to = "Measure"
    ) |> 
  
  mutate(Feature = as_factor(Feature)) |> 
  
  ggplot(
    mapping = aes(
      x = Measure,
                       y = epoch
      )
    ) + 
  
  geom_boxplot(
    mapping = aes(fill = epoch),
               show.legend = F
    ) +
  
  facet_wrap(
    facets = ~ Feature,
             ncol = 2,
             scales = "free_x"
    ) + 
  
  labs(x = NULL)

Using the side-by-side boxplots, breadth and length appear to have changed the most over the 5 epochs.

height and nasal height (nh) appear to be somewhat similar across all 5 epochs.

Question 3: Hypothesis Test

Perform an appropriate hypothesis test and state the appropriate conclusion.

# Performing Wilks Manova test
skull_man <- manova(cbind(breadth, height, length, nasal_height) ~ epoch, 
                    data = Skulls)

# Checking the Wilks summary stat
summary(skull_man, test="Wilks") |> 
  pluck("stats") |> 
  data.frame() |> 
  gt::gt()

Df	Wilks	approx.F	num.Df	den.Df	Pr..F.
4	0.664	3.9	16	434	7.01e-07
145	NA	NA	NA	NA	NA

# Checking the Pillai summary stat
summary(skull_man) |> 
  pluck("stats") |> 
  data.frame() |> 
  gt::gt()

Df	Pillai	approx.F	num.Df	den.Df	Pr..F.
4	0.353	3.51	16	580	4.68e-06
145	NA	NA	NA	NA	NA

According to both test statistics, there does appear to be at least one different skull measurement for at least one epoch

Question 4: Checking Assumptions

Check the MANOVA assumptions for the Skulls data. Which assumptions appear to be true and which are false?

## Equal covariance matrices:
# Using boxM() function in heplots to determine if the covariance matrices for each epoch are equal
box_m(
  data = Skulls |> dplyr::select(where(is.numeric)),
  group = Skulls$epoch
) |> 
  dplyr::select(method, statistic, parameter, p.value) |> 
  rename(df = parameter) |> 
  gt::gt()

method	statistic	df	p.value
Box's M-test for Homogeneity of Covariance Matrices	45.7	40	0.248

## Residuals are MVN:
# Using the mvn() function in the MVN package on the residuals of the MANOVA model: skull_man$res
mvn(
  skull_man$res, 
  multivariatePlot = "qq", 
  univariateTest = "SW",
  mvnTest = "mardia",
  desc = F
)

## $multivariateNormality
##              Test         Statistic           p value Result
## 1 Mardia Skewness  19.2570567874466 0.505177925496236    YES
## 2 Mardia Kurtosis 0.267418576192474 0.789146898351176    YES
## 3             MVN              <NA>              <NA>    YES
## 
## $univariateNormality
##           Test     Variable Statistic   p value Normality
## 1 Shapiro-Wilk   breadth        0.993     0.712    YES   
## 2 Shapiro-Wilk    height        0.994     0.749    YES   
## 3 Shapiro-Wilk    length        0.996     0.935    YES   
## 4 Shapiro-Wilk nasal_height     0.993     0.719    YES

Both tests for Equal Covariance Matrices and MVN of the residuals had large p-values, indicating there isn’t evidence that either of the assumptions have been violated.

Examining the Multivariate Q-Q Plot, none of the dots are significantly far from the black line, indicating no evidence of impactful MV outliers

Question 5: What measurements changed across epochs using a 5% overall error rate?

# Using anova_test() in the rstatix package to conduct an anova test for each skull measurement
Skulls |> 
  
  pivot_longer(
    cols = breadth:nasal_height,
    names_to = "Feature", 
    values_to = "Measure"
  ) |>
  
  group_by(Feature) |> 
  
  anova_test(Measure ~ epoch) |> 
  
  data.frame() |> 
  
  mutate(p = round(p, digits = 4)) |> 
  
  dplyr::select(Feature:p) |> 
  
  arrange(p)

##        Feature Effect DFn DFd    F      p
## 1       length  epoch   4 145 8.31 0.0000
## 2      breadth  epoch   4 145 5.96 0.0002
## 3       height  epoch   4 145 2.45 0.0490
## 4 nasal_height  epoch   4 145 1.51 0.2030

There is very strong evidence that bl and mb (skull length and maximum skull width) have changed across the epochs. There is some evidence that the skull height has changed, but with a p-value of 0.049, however, if we use Bonferroni’s correction, our significance level is 0.05/5 = 0.01.

Question 6

For the variable that is the most statistically significant in question 4, across which epochs is the measurement different?

# using the tukey_hsd() function in rstatix
Skulls |> 
  dplyr::select(epoch, length) |> 
  
  tukey_hsd(formula = length ~ epoch) |> 
  
  filter(p.adj < 0.05/p) |> 
  
  gt::gt()

term	group1	group2	estimate	conf.low	conf.high	p.adj	p.adj.signif
epoch	c4000BC	c200BC	-4.63	-8.14	-1.13	0.003320	**
epoch	c4000BC	cAD150	-5.67	-9.17	-2.16	0.000155	***
epoch	c3300BC	c200BC	-4.53	-8.04	-1.03	0.004350	**
epoch	c3300BC	cAD150	-5.57	-9.07	-2.06	0.000213	***

# Creating a line graph to display the confidence intervals
Skulls |> 
  dplyr::select(epoch, length) |> 
  
  tukey_hsd(formula = length ~ epoch) |> 
  
  filter(p.adj < 0.05/p) |> 
  
  mutate(
    vs = paste(group2, group1, sep = " - "),
    vs = str_remove_all(vs, "c")
  ) |> 
  
  pivot_longer(cols = conf.low:conf.high) |> 
  
  mutate(value = abs(value)) |> 
  
  ggplot(
    mapping = aes(
      x = value, 
      y = vs
    )
  ) + 
  
  geom_line(
    mapping = aes(color = vs),
    linewidth = 1,
    show.legend = F
  ) + 
  
  labs(
    x = "Mean Length of Skull",
    y = NULL,
    title = "Confidence Intervals Comparing Skull Length Across Epochs"
  )

Since each ANOVA model should be tested at a 5/4% significance level, our confidence level will be at 1-0.05/4 to keep the results consistent.

It appears that the two earliest epochs are different than the two last epochs for the changes in skull length. The further apart the epochs, the stronger the evidence of a difference!

Question 7: Discriminant Functions

Calculate the discriminant functions.

How many discriminants are needed?

Include any figures you used to justify your answer.

# Using lda() in the MASS package to find the discriminant functions.
skull_lda <- 
  MASS::lda(
    epoch ~ ., 
    data = Skulls
  )

skull_lda

## Call:
## lda(epoch ~ ., data = Skulls)
## 
## Prior probabilities of groups:
## c4000BC c3300BC c1850BC  c200BC  cAD150 
##     0.2     0.2     0.2     0.2     0.2 
## 
## Group means:
##         breadth height length nasal_height
## c4000BC     131    134   99.2         50.5
## c3300BC     132    133   99.1         50.2
## c1850BC     134    134   96.0         50.6
## c200BC      136    132   94.5         52.0
## cAD150      136    130   93.5         51.4
## 
## Coefficients of linear discriminants:
##                  LD1     LD2     LD3      LD4
## breadth       0.1267  0.0387 -0.0928 -0.14884
## height       -0.0370  0.2101  0.0246  0.00042
## length       -0.1451 -0.0681 -0.0147 -0.13250
## nasal_height  0.0829 -0.0773  0.2946 -0.06686
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4 
## 0.8823 0.0809 0.0326 0.0042

It appears we only need 1 discriminant function, since LD1 accounts for over 88% of the differences between the 5 epochs

Question 8

Using the first discriminant function from the previous question, rank the variables from most to least important in separating the epochs

Need to calculate the standardized discriminant function:

\[a_i^* = s_i * a_i\]

We need to calculate the pooled standard deviation:

\[s_i = \sqrt{ \left( \frac{e_{ii}}{N-K} \right)} \]

# Standardizing the data assuming that the covariance matrices are equal across epochs:
# First finding the standard deviation assuming equal variances:
skull_sd <- 
  summary(skull_man)$SS$Residuals |>  # Getting the residual matrix
  diag() |>                           # Extracting the diagonal to capture the variances
  divide_by(N-k) |>                   # dividing by N-k to convert to a variance
  sqrt()                              # Taking the square root of the variance to get the sd

# Discriminant function is in the $scaling name
ldf1 <- 
  skull_lda |> 
  pluck("scaling") |> 
  data.frame() |> 
  dplyr::select(1)

# multiplying the discriminant function by the pooled standard deviation
ldf1 * skull_sd

##                 LD1
## breadth       0.582
## height       -0.179
## length       -0.714
## nasal_height  0.264

The rank of the variables in describing the differences in the skulls across epochs is:

skull length
skull breadth
nose height
skull height

Question 9: Partial F tests

Use partial F tests to determine which variables are useful in describing the differences in epochs when the other variables are included.

Then using the result of the partial F-tests with all 4 variables, does the first discriminant function agree with the partial F-tests in the importance of the 4 variables in describing how the epochs change?

# Make sure that the R Script with the partial F test function is in the same folder as this practice, then use the line below to read it into R:
source("Partial F Test function.R")

resp_vars <- 
  Skulls |> 
  dplyr::select(-epoch) |> 
  cbind()

# Using the function to perform a partial F test for all 4 variables
Partial_F(Y = resp_vars, 
          x = Skulls$epoch)

## # A tibble: 4 × 4
##   variables    F_lam F_stat  p_val
##   <chr>        <dbl>  <dbl>  <dbl>
## 1 nasal_height 0.965   1.28 0.281 
## 2 height       0.955   1.67 0.160 
## 3 breadth      0.896   4.12 0.0035
## 4 length       0.851   6.24 0.0001

# Removing nose height from the resp_vars
resp_vars1 <- Skulls |> 
  dplyr::select(-epoch, -nasal_height) |> 
  cbind()

# Partial F tests for bl, mb, and bh 
Partial_F(Y = resp_vars1, 
          x = Skulls$epoch)

## # A tibble: 3 × 4
##   variables F_lam F_stat  p_val
##   <chr>     <dbl>  <dbl>  <dbl>
## 1 height    0.960   1.50 0.204 
## 2 breadth   0.881   4.83 0.0011
## 3 length    0.851   6.25 0.0001

# Removing breadth from the resp_vars
resp_vars2 <- 
  Skulls |> 
  dplyr::select(-epoch, -nasal_height, -height) |> 
  cbind()


# Partial F test for bl and mb
Partial_F(Y = resp_vars2, 
          x = Skulls$epoch)

## # A tibble: 2 × 4
##   variables  F_lam F_stat p_val
##   <chr>      <dbl>  <dbl> <dbl>
## 1 length    0.120    263.     0
## 2 breadth   0.0863   381.     0

From the partial F test, only mb (skull breadth) and bl (skull length) are necessary to describe the differences across epochs.

Using the partial F-test results for all 4 variables, the rank of importance is:

skull length (bl)
skull width (mb),
skull height (bh)
nose height (nh)

Both methods rank 1 & 2 the same (the important variables), but switch the order of bh and nh (the unimportant variables)

Question 10: Visualize the differences across epochs

Create an appropriate plot (based on your answer to question 6) to visualize the differences in the epochs. Which ones appear to be different, and how are they different?

Since we only need 1 discriminant function, We can make a boxplot separated by the epoch the skull is from!

# Use predict to get the discriminants
skull_discrim <- 
  data.frame(epoch = Skulls$epoch,
             predict(skull_lda)$x)

head(skull_discrim)

##     epoch    LD1    LD2      LD3     LD4
## 1 c4000BC  0.344  1.688 -0.04970  1.5626
## 2 c4000BC -0.675 -0.142 -0.00391  2.1220
## 3 c4000BC -0.802 -0.331 -0.05001  0.1682
## 4 c4000BC -2.384 -0.128 -0.66008  2.7529
## 5 c4000BC -0.390  1.797  0.92001 -0.9713
## 6 c4000BC  1.848  1.208  1.33847  0.0522

# Creating the box plots
skull_discrim |> 
  ggplot(mapping = aes(x = LD1)) + 
  
  geom_boxplot(mapping = aes(fill = epoch)) + 
  
  labs(title = "Boxplots of Skulls for the First Discriminant",
       x = paste0("LD1 (Percent Explained: ", 
                      round(skull_lda$svd[1]^2/sum(skull_lda$svd^2),3)*100, "%)"),
       fill = "Epoch") + 
  
  scale_fill_discrete(labels = str_remove(levels(Skulls$epoch), "c"))

# You can also make a scatterplot using 2 discriminant functions
# But the second won't show much a difference between epochs (~8%)
skull_discrim |> 
  ggplot(mapping = aes(x = LD1, 
                       y = LD2)) +
  
  geom_point(mapping = aes(color = epoch)) + 
  
  stat_ellipse(mapping = aes(color = epoch)) + 
  
  labs(title = "Scatterplot of Skulls with the First Two Discriminants",
       x = paste0("LD1 (Percent Explained: ", 
                  round(skull_lda$svd[1]^2/sum(skull_lda$svd^2),3)*100, "%)"),
       y = paste0("LD2 (Percent Explained: ", 
                  round(skull_lda$svd[2]^2/sum(skull_lda$svd^2),3)*100, "%)")) +
  
  labs(color = "Epoch") + 
  
  scale_color_discrete(labels = str_remove(levels(Skulls$epoch), "c"))

The earliest 2 epochs looks very similar and the last 2 epochs appear very similar, and the middle epoch is about half-way between. It appear that there may have been a transition in skull features around 1850 BC.