knitr::opts_chunk$set(echo = TRUE,
                      warning = F,
                      message = F,
                      fig.align = "center")
options(digits=3)

# Loading packages: tidyverse & moderndive
pacman::p_load(tidyverse, corrplot, psych, MVN, GGally, ggrepel)

# Changing the default theme to theme_bw()
theme_set(theme_classic())
theme_update(axis.title = element_text(size = 12),
             axis.text = element_text(size = 10),
             plot.title = element_text(size = 12, hjust = 0.5),
             plot.subtitle = element_text(size = 10, hjust = 0.5),
             plot.caption = element_text(size = 8))

# Reading in the dataset
bones <- 
   read.csv("Bones.csv") %>% 
   filter(complete.cases(.)) |> 
   dplyr::select(-sex) 



p <- ncol(bones)

Removing outliers

Need to run twice to remove all the outliers, since removing the original outliers changes the mean vector and covariance matrix

bones <- 
   bones |>  
   filter(
     mahalanobis(bones, center = colMeans(bones), cov = var(bones)) < 
       qchisq((nrow(bones)-0.5)/nrow(bones), df = p)
     )

bones <- 
   bones |> 
   filter(
     mahalanobis(bones, center = colMeans(bones), cov = var(bones)) < 
       qchisq((nrow(bones)-0.5)/nrow(bones), df = p)
     )

n <- nrow(bones)

Factor Analysis: Maximum Likelihood Estimation Method

Since the QQ plot showed that the data are aren’t MVN, but let’s use the MLE method to estimate the Factor Analysis Model using the fa() function in the psych package anyway. We can use MLE method by specifying fm = "ml"

Let’s compare when m = 1, m = 2, and m = 3

# Fitting the FA model using PC with 1 factor
bones_mlfa1 <- 
  fa(
    r = cor(bones), 
    nfactors = 1, 
    fm = "ml",
    rotate = "none",
    n.obs = n
  )



# Fitting the FA model using PC with 2 factors
bones_mlfa2 <- 
  fa(
    r = cor(bones), 
    nfactors = 2, 
    fm = "ml",
    rotate = "none",
    n.obs = n
  )


# Fitting the FA model using PC with 3 factors
bones_mlfa3 <- 
  fa(
    r = cor(bones), 
    nfactors = 3, 
    fm = "ml",
    rotate = "none",
    n.obs = n
  )

Examining the FA scree plot

fa.parallel(
  bones, 
  fm = "ml", 
  fa = "fa"
)

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA

Checking the “fit” of the estimated correlation matrix

data.frame(
  corr_fit = c("m = 1" = bones_mlfa1$fit,
               "m = 2" = bones_mlfa2$fit,
               "m = 3" = bones_mlfa3$fit),
  
  off_diag = c("m = 1" = bones_mlfa1$fit.off,
               "m = 2" = bones_mlfa2$fit.off,
               "m = 3" = bones_mlfa3$fit.off)
)
##       corr_fit off_diag
## m = 1    0.950    0.964
## m = 2    0.992    0.996
## m = 3    0.993    0.997
# Reduction in residuals between m = 1 and m = 2
ggcorr(
  data = NULL,
  cor_matrix = bones_mlfa1$residual - bones_mlfa2$residual, 
  low = "red",
  mid = "white",
  high = "red"
) + 
   labs(title = "Reduction in Residual Matrix from m = 1 to m = 2")

# Reduction in residuals between m = 2 and m = 3
ggcorr(
  data = NULL,
  cor_matrix = bones_mlfa2$residual - bones_mlfa3$residual, 
  low = "red",
  mid = "white",
  high = "red"
) + 
   labs(title = "Reduction in Residual Matrix from m = 2 to m = 3")

The fit improves noticeably from m = 1 to m = 2, but not much of an increase from m = 2 to m = 3

Objective Methods: Information Criterion

If the data are multivariate normal, we can use either AIC or BIC/SBC to help determine the “best” choice for the number of factors.

The “FA AIC function.R” file has a function that will calculate the AIC (default) or BIC (penalty = log(n)) for a FA model.

Best choice using AIC

Let’s look at the AIC value for m = 1 to m = 5

source("FA AIC function.R")
c(  
  "m = 1" = fa_IC(Y = bones, m = 1),
  "m = 2" = fa_IC(Y = bones, m = 2),
  "m = 3" = fa_IC(Y = bones, m = 3),
  "m = 4" = fa_IC(Y = bones, m = 4),
  "m = 5" = fa_IC(Y = bones, m = 5)
)
## m = 1 m = 2 m = 3 m = 4 m = 5 
## 15823 15492 15482 15486 15502

The AIC suggests three factors, then four factors, then two factors.

What if we use BIC?

Best choice using BIC

The difference between AIC and BIC is the penalty for the additional model terms.

AIC uses a penalty of 2: complexity term = 2 * k

where k = the number of model terms (p * m for a FA model)

BIC uses a penalty of \(\log(n)\) where \(n\) is the sample size: complexity term = \(\log(n) * k\)

c(  
  "m = 1" = fa_IC(Y = bones, m = 1, penalty = log(n)),
  "m = 2" = fa_IC(Y = bones, m = 2, penalty = log(n)),
  "m = 3" = fa_IC(Y = bones, m = 3, penalty = log(n)),
  "m = 4" = fa_IC(Y = bones, m = 4, penalty = log(n)),
  "m = 5" = fa_IC(Y = bones, m = 5, penalty = log(n))
)
## m = 1 m = 2 m = 3 m = 4 m = 5 
## 15880 15606 15654 15715 15789

BIC will penalize larger, more complex models more than AIC does, and we see that for our example. BIC suggests two factors, then three factors, then 4 factors.

Factor Scores

Two options:

  1. Either specify how you want the scores estimated using the original data when fitting the fa model.
### Need to give it the data, not the correlation matrix
bones_fa3 <- 
  fa(
    r = bones, 
    nfactors = 3, 
    fm = "ml",
    rotate = "varimax",
    n.obs = n,
    scores = "regression"
  )

bones_fa3$scores |> 
  data.frame() |> 
  tibble()
## # A tibble: 604 × 3
##       ML1    ML2     ML3
##     <dbl>  <dbl>   <dbl>
##  1 -1.96   1.14   0.685 
##  2  0.268  0.511 -0.243 
##  3 -0.853  1.28  -1.11  
##  4 -1.82   1.47  -0.655 
##  5 -2.08   0.223 -0.435 
##  6 -1.29  -0.173  0.664 
##  7 -0.993  1.55   0.247 
##  8 -1.73   0.547  1.14  
##  9 -1.63   2.05  -0.0673
## 10  0.824  1.26   4.29  
## # ℹ 594 more rows
  1. Second method: If using fa() and the correlation matrix to fit the factor model, you can use factor.scores() to retrieve the factor scores.
  • x = data matrix
  • f = factor analysis object created by fa()
  • method = the method you want to estimate the scores
    • “Bartlett” for Bartlett’s method
    • “Thurstone” for regression
factor.scores(
  x = bones |> head(10),
  f = bones_fa3,
  method = "Thurstone"
)
## $scores
##       ML1    ML2    ML3
## 1  -4.849  4.951  2.986
## 2   0.343  1.048 -0.924
## 3   1.218 -2.166 -1.551
## 4   4.272 -5.605 -3.916
## 5  -2.545  0.186  3.658
## 6  -0.376 -0.607 -0.283
## 7   1.814 -0.150 -1.659
## 8   2.985 -4.266 -2.999
## 9  -4.771  6.215  2.275
## 10  1.909  0.394  2.414
## 
## $weights
##          ML1     ML2    ML3
## HumL  -0.977   2.455  -2.19
## HumR -17.939  17.932  19.09
## RadL  -1.481   0.175   2.10
## RadR  20.509 -25.717 -13.47
## FemL   8.973 -10.120  -4.98
## FemR  -6.522   7.535   6.74
## TibL   0.481   1.042  -2.59
## TibR  11.617 -10.818 -12.64
## IBL   -9.983  13.845   2.31
## IBR   -2.877   0.439   7.49
## AceL  11.336 -11.352 -12.40
## AceR -21.759  25.316  18.64
## BIB   10.562 -10.545  -9.64
## 
## $r.scores
##        ML1    ML2    ML3
## ML1  1.000 -0.899 -0.810
## ML2 -0.899  1.000  0.782
## ML3 -0.810  0.782  1.000
## 
## $missing
## [1] FALSE
## 
## $R2
## [1] NA NA NA

Final Solution

It looks like that we should remove the Ace vars since those 2 aren’t that strongly correlated with the other 11 bones and rerun the analysis with just 2 factors: length and width

# PC Method
(bones_pc <- 
   fa(
     r = cor(bones |> select(-AceR, -AceL)), 
     nfactors = 2, 
     fm = "pa",
     rotate = "varimax",
     n.obs = n
   )
 )
## Factor Analysis using method =  pa
## Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n, 
##     rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##       PA1  PA2   h2    u2 com
## HumL 0.85 0.42 0.90 0.100 1.5
## HumR 0.84 0.45 0.90 0.098 1.5
## RadL 0.92 0.23 0.89 0.107 1.1
## RadR 0.92 0.24 0.90 0.098 1.1
## FemL 0.88 0.38 0.93 0.073 1.4
## FemR 0.88 0.39 0.92 0.077 1.4
## TibL 0.93 0.24 0.92 0.084 1.1
## TibR 0.93 0.24 0.92 0.082 1.1
## IBL  0.32 0.93 0.97 0.032 1.2
## IBR  0.32 0.94 0.98 0.022 1.2
## BIB  0.23 0.75 0.61 0.393 1.2
## 
##                        PA1  PA2
## SS loadings           6.64 3.20
## Proportion Var        0.60 0.29
## Cumulative Var        0.60 0.89
## Proportion Explained  0.68 0.32
## Cumulative Proportion 0.68 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 2 factors are sufficient.
## 
## df null model =  55  with the objective function =  26.6 with Chi Square =  15892
## df of  the model are 34  and the objective function was  7.2 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  604 with the empirical chi square  46.1  with prob <  0.081 
## The total n.obs was  604  with Likelihood Chi Square =  4301  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.563
## RMSEA index =  0.456  and the 90 % confidence intervals are  0.445 0.468
## BIC =  4083
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    PA1  PA2
## Correlation of (regression) scores with factors   0.99 0.99
## Multiple R square of scores with factors          0.98 0.98
## Minimum correlation of possible factor scores     0.96 0.96
summary(bones_pc)
## 
## Factor analysis with Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n, 
##     rotate = "varimax", fm = "pa")
## 
## Test of the hypothesis that 2 factors are sufficient.
## The degrees of freedom for the model is 34  and the objective function was  7.2 
## The number of observations was  604  with Chi Square =  4301  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.03 
## The df corrected root mean square of the residuals is  0.03 
## 
## Tucker Lewis Index of factoring reliability =  0.563
## RMSEA index =  0.456  and the 10 % confidence intervals are  0.445 0.468
## BIC =  4083
tibble(bones_pc$loadings) |> head(n = 100)
## # A tibble: 11 × 1
##    `bones_pc$loadings`[,"PA1"] [,"PA2"]
##                          <dbl>    <dbl>
##  1                       0.849    0.423
##  2                       0.838    0.447
##  3                       0.916    0.235
##  4                       0.920    0.235
##  5                       0.883    0.384
##  6                       0.878    0.391
##  7                       0.928    0.236
##  8                       0.928    0.236
##  9                       0.321    0.930
## 10                       0.322    0.935
## 11                       0.228    0.745
corrplot(bones_pc$loadings[,], cl.pos = "n")

# ML Method
(bones_ml <- 
   fa(
     r = cor(bones |> select(-AceR, -AceL)), 
     nfactors = 2, 
     fm="ml",
     rotate = "varimax",
     n.obs = n
   )
)
## Factor Analysis using method =  ml
## Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n, 
##     rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##       ML2  ML1   h2    u2 com
## HumL 0.85 0.42 0.89 0.107 1.5
## HumR 0.84 0.44 0.89 0.105 1.5
## RadL 0.90 0.26 0.87 0.131 1.2
## RadR 0.90 0.26 0.88 0.123 1.2
## FemL 0.89 0.38 0.94 0.058 1.4
## FemR 0.89 0.39 0.94 0.062 1.4
## TibL 0.93 0.24 0.92 0.078 1.1
## TibR 0.93 0.24 0.92 0.077 1.1
## IBL  0.31 0.93 0.97 0.030 1.2
## IBR  0.31 0.94 0.98 0.015 1.2
## BIB  0.22 0.75 0.61 0.392 1.2
## 
##                        ML2  ML1
## SS loadings           6.58 3.24
## Proportion Var        0.60 0.29
## Cumulative Var        0.60 0.89
## Proportion Explained  0.67 0.33
## Cumulative Proportion 0.67 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 2 factors are sufficient.
## 
## df null model =  55  with the objective function =  26.6 with Chi Square =  15892
## df of  the model are 34  and the objective function was  7.14 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  604 with the empirical chi square  49.5  with prob <  0.042 
## The total n.obs was  604  with Likelihood Chi Square =  4266  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.567
## RMSEA index =  0.454  and the 90 % confidence intervals are  0.443 0.466
## BIC =  4048
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML2  ML1
## Correlation of (regression) scores with factors   0.99 0.99
## Multiple R square of scores with factors          0.98 0.98
## Minimum correlation of possible factor scores     0.96 0.97
summary(bones_ml)
## 
## Factor analysis with Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n, 
##     rotate = "varimax", fm = "ml")
## 
## Test of the hypothesis that 2 factors are sufficient.
## The degrees of freedom for the model is 34  and the objective function was  7.14 
## The number of observations was  604  with Chi Square =  4266  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.03 
## The df corrected root mean square of the residuals is  0.03 
## 
## Tucker Lewis Index of factoring reliability =  0.567
## RMSEA index =  0.454  and the 10 % confidence intervals are  0.443 0.466
## BIC =  4048
bones_ml$loadings
## 
## Loadings:
##      ML2   ML1  
## HumL 0.845 0.423
## HumR 0.835 0.444
## RadL 0.896 0.255
## RadR 0.901 0.255
## FemL 0.891 0.384
## FemR 0.886 0.391
## TibL 0.930 0.241
## TibR 0.929 0.243
## IBL  0.314 0.934
## IBR  0.313 0.942
## BIB  0.223 0.747
## 
##                  ML2   ML1
## SS loadings    6.582 3.240
## Proportion Var 0.598 0.295
## Cumulative Var 0.598 0.893
corrplot(bones_ml$loadings[,], cl.pos = "n")