Calculating the mahalanobis distance to look for outliers

data.frame(
  x = 1:nrow(bones),
  y = mahalanobis(bones, center = colMeans(bones), cov = var(bones))
) |>
   
   ggplot(
     mapping = aes(x = x, y = y)
   ) + 
   
   geom_col() + 
   
   geom_hline(
     yintercept = qchisq((nrow(bones)-0.5)/nrow(bones), df = p),
     color = "red"
   ) + 
   
   scale_y_continuous(expand = c(0, 0, 0.05, 0)) +
   
   scale_x_continuous(expand = c(0, 2)) + 
   
   labs(y = "Mahalanobix Distance",
        x = "Row Number")

Need to run twice to remove all the outliers, since removing the original outliers changes the mean vector and covariance matrix

bones <- 
   bones |> 
   filter(
     mahalanobis(bones,center = colMeans(bones), cov = var(bones)) < 
       qchisq((nrow(bones)-0.5)/nrow(bones), df = p)
   )


data.frame(
  x = 1:nrow(bones),
  y = mahalanobis(bones, center = colMeans(bones), cov = var(bones))
) |>
   
   ggplot(
     mapping = aes(x = x, y = y)
   ) + 
   
   geom_col() + 
   
   geom_hline(
     yintercept = qchisq((nrow(bones)-0.5)/nrow(bones), df = p),
              color = "red"
     ) + 
   
   scale_y_continuous(expand = c(0, 0, 0.05, 0)) +
   
   scale_x_continuous(expand = c(0, 2)) + 
   
   labs(y = "Mahalanobix Distance",
        x = "Row Number")

Recording the sample info and summary statistics

# Recording the sample size and number of numeric variables
n <- nrow(bones)

# Calculating the mean vector and covariance matrix
ybar <- colMeans(bones)
covmat <- cov(bones)

Correlation plot

We can rearrange the order to find sets of highly correlated variables. Orders them based on the First PC: order = “FPC”

corrplot::corrplot(
  cor(bones),
  method = "circle",
  order = "FPC"
)

Scatterplot Matrix for arm and leg bones

# Arm  and leg bones
bones |> 
   dplyr::select(HumL:TibR) |>
   
   ggpairs(
     lower = list(continuous = wrap("smooth", 
                                    method = "loess", 
                                    se = F,
                                    alpha = 0.5, 
                                    color="steelblue")),
     
     diag = list(continuous = wrap("densityDiag", 
                                   fill = "steelblue" ))
   ) + 
   theme_bw()

## Hip bones
bones |> 
   dplyr::select(IBL:BIB) |>
   
   ggpairs(
     lower = list(continuous = wrap("smooth", 
                                    method = "loess", 
                                    se = F,
                                    alpha = 0.5, 
                                    color="darkred")),
     
     diag = list(continuous = wrap("densityDiag", 
                                   fill = "darkred" ))
     ) + 
   
   theme_bw()

KMO: Measuring the overall correlation of the data

KMO can be used to check if the data are correlated enough for factor analysis

?KMO
KMO(bones)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = bones)
## Overall MSA =  0.89
## MSA for each item = 
## HumL HumR RadL RadR FemL FemR TibL TibR  IBL  IBR AceL AceR  BIB 
## 0.90 0.90 0.88 0.88 0.89 0.89 0.87 0.88 0.86 0.87 0.88 0.88 0.98

Since all the variables are above 0.70, factor analysis should work well!

Checking Multivariate Normality of the Data

Need to see if the data are MVN to determine if we need to use the PCA method or Likelihood Method

mvn(
  data = bones, 
  mvnTest = "mardia", 
  multivariatePlot = "qq",
  univariateTest = "SW",
  desc = F
)

## $multivariateNormality
##              Test        Statistic              p value Result
## 1 Mardia Skewness 578.399294789812 7.47719995663404e-05     NO
## 2 Mardia Kurtosis 8.26316727098036 2.22044604925031e-16     NO
## 3             MVN             <NA>                 <NA>     NO
## 
## $univariateNormality
##            Test  Variable Statistic   p value Normality
## 1  Shapiro-Wilk   HumL        0.997    0.2810    YES   
## 2  Shapiro-Wilk   HumR        0.997    0.3971    YES   
## 3  Shapiro-Wilk   RadL        0.996    0.1426    YES   
## 4  Shapiro-Wilk   RadR        0.996    0.1677    YES   
## 5  Shapiro-Wilk   FemL        0.998    0.6776    YES   
## 6  Shapiro-Wilk   FemR        0.998    0.7913    YES   
## 7  Shapiro-Wilk   TibL        0.996    0.1787    YES   
## 8  Shapiro-Wilk   TibR        0.996    0.1470    YES   
## 9  Shapiro-Wilk    IBL        0.993    0.0078    NO    
## 10 Shapiro-Wilk    IBR        0.994    0.0176    NO    
## 11 Shapiro-Wilk   AceL        0.995    0.0499    NO    
## 12 Shapiro-Wilk   AceR        0.997    0.2920    YES   
## 13 Shapiro-Wilk    BIB        0.994    0.0191    NO

While the data doesn’t appear to be MVN, let’s use both methods to fit the model and see how they compare

Fitting the Factor Analysis Models

Factor Analysis: PCA Method

Deciding the number of factors

Use principal() in the psych package

Need to specify how many factors there are. When using the PCA method, scree plots can be beneficial

screeplot(
  prcomp(bones, 
         scale. = T), 
  type = "l"
)

Scree plot recommends 2 factors

An alternative to the screeplot for factor analysis models is the parallel plot:

#?fa.parallel

fa.parallel(
  bones, 
  fm = "pa", 
  fa = "fa"
)

## Parallel analysis suggests that the number of factors =  3  and the number of components =  NA

To use the parallel plot, we should choose the number of factors above the red dotted line.

The red line represents what we would expect to see if there no factors in our data!

It recommends 2 or 3 factors when using the PC method for Factor Analysis!

Model Fitting: PC method

Let’s compare when m = 1, m = 2, and m = 3

# Fitting the FA model using PC with 1 factor
bones_pcfa1 <- 
  fa(
    r = cor(bones), 
    nfactors = 1, 
    fm = "pa",
    rotate = "none",
    n.obs = n
  )



# Fitting the FA model using PC with 2 factors
bones_pcfa2 <- 
  fa(r = cor(bones), 
     nfactors = 2, 
     fm = "pa",
     rotate = "none",
     n.obs = n
  )


# Fitting the FA model using PC with 3 factors
bones_pcfa3 <- 
  fa(r = cor(bones), 
     nfactors = 3, 
     fm = "pa",
     rotate = "none",
     n.obs = n
  )

We can get the residual matrix by adding $residuals.

But what is the residual matrix given by the fa() function? It is the \(R - LL'\) matrix!

Ideally the Residual matrix should be small: Most are around 0 - 0.10

# m = 1
ggcorr(
  data = NULL,
  cor_matrix = bones_pcfa1$residual, 
  low = "red",
  mid = "white",
  high = "red"
)

# m = 2
ggcorr(
  data = NULL,
  cor_matrix = bones_pcfa2$residual, 
  low = "red",
  mid = "white",
  high = "red"
)

# m = 3
ggcorr(
  data = NULL,
  cor_matrix = bones_pcfa3$residual, 
  low = "red",
  mid = "white",
  high = "red"
)

From the correlation plot, it looks like there is a large decrease in the residuals going from 1 to 2 factors, but not much of a reduction going from 2 to 3. Reinforces our earlier conclusion that m = 2

Additionally, the $fit will show how much of the correlation the FA model correlation matrix is retained:

\[\textrm{fit} = 1 - \sum\left( \frac{r_{ij}^2 - \hat{r}_{ij}^2}{r_{ij}^2} \right)\]

c(
  "m = 1" = bones_pcfa1$fit,
  "m = 2" = bones_pcfa2$fit,
  "m = 3" = bones_pcfa3$fit
)
## m = 1 m = 2 m = 3 
## 0.963 0.993 0.997
# Reduction in residuals between m = 1 and m = 2
ggcorr(
  data = NULL,
  cor_matrix = bones_pcfa1$residual - bones_pcfa2$residual, 
  low = "red",
  mid = "white",
  high = "red"
) + 
   
  labs(title = "Reduction in Residual Matrix from m = 1 to m = 2")

# Reduction in residuals between m = 2 and m = 3
ggcorr(
  data = NULL,
  cor_matrix = bones_pcfa2$residual - bones_pcfa3$residual, 
  low = "red",
  mid = "white",
  high = "red"
) + 
  
  labs(title = "Reduction in Residual Matrix from m = 2 to m = 3")

Rotating the factors: PC Method

We want to rotate the factors using varimax

bones_pcfa <- fa(
  r = cor(bones), 
  nfactors = 3, 
  fm = "pa",
  rotate = "varimax",
  n.obs = n
)


# We can get just the rotated Loadings using loadings()
loadings(bones_pcfa)
## 
## Loadings:
##      PA1   PA2   PA3  
## HumL 0.816 0.367 0.312
## HumR 0.807 0.394 0.305
## RadL 0.890 0.193 0.253
## RadR 0.897 0.196 0.243
## FemL 0.852 0.335 0.303
## FemR 0.844 0.341 0.312
## TibL 0.928 0.231 0.130
## TibR 0.927 0.233 0.137
## IBL  0.301 0.894 0.283
## IBR  0.303 0.895 0.287
## AceL 0.382 0.447 0.782
## AceR 0.377 0.462 0.772
## BIB  0.223 0.732 0.177
## 
##                  PA1   PA2   PA3
## SS loadings    6.594 3.251 1.939
## Proportion Var 0.507 0.250 0.149
## Cumulative Var 0.507 0.757 0.906

Visualizing the Factor Analysis Rotated Loadings

# Convert the loadings to a dataframe
bones_pcfa$loadings[,] |> 
data.frame() |>
   
   # Changing the variables from a row name to a column named variable
   rownames_to_column(var = "Variable") |>
   
   # Extending it so the loadings are 1 column and the Factor has an indicator column
   pivot_longer(
     cols = where(is.numeric), 
     names_to = "Factor", 
     values_to ="Loadings"
   ) |>
   
   # Changing the order of the bones to match the order in the FA model
  mutate(
    Variable = factor(Variable, levels = rev(colnames(bones)))
  ) |>
   
   # Removing any loadings below a chosen limit
   filter(Loadings > 0.40) |> 
   
   # Call on ggplot() and specify x = 1st and y = 2nd factor
   ggplot(
     mapping = aes(
       x = Factor, 
       y = Variable
     )
   ) + 
   
  geom_tile(
    mapping = aes(fill = Loadings),
    color = "white"
  ) + 
   
   scale_fill_gradient(
     low = "white", 
     high = "darkblue",
     limits = c(0, 1)
   ) + 
   
   labs(fill = "Loading Values") + 
   
   theme_classic() 

## Or we can use corrplot to make the process a little simpler

corrplot(bones_pcfa$loadings[,])

# Compare it to the correlation plot of the data
corrplot(cor(bones), order="hclust")

data.frame(communality = bones_pcfa$communality) |> 
  mutate(specific = 1-communality) |> 
  rownames_to_column(var = "variable") |> 
  gt::gt()
variable communality specific
HumL 0.898 0.1019
HumR 0.899 0.1007
RadL 0.894 0.1062
RadR 0.902 0.0976
FemL 0.929 0.0710
FemR 0.925 0.0747
TibL 0.932 0.0678
TibR 0.933 0.0673
IBL 0.971 0.0295
IBR 0.974 0.0260
AceL 0.957 0.0433
AceR 0.952 0.0481
BIB 0.617 0.3829