knitr::opts_chunk$set(echo = TRUE,
warning = F,
message = F,
fig.align = "center")
options(digits=3)
# Loading packages: tidyverse & moderndive
pacman::p_load(tidyverse, corrplot, psych, MVN, GGally, ggrepel)
# Changing the default theme to theme_bw()
theme_set(theme_classic())
theme_update(axis.title = element_text(size = 12),
axis.text = element_text(size = 10),
plot.title = element_text(size = 12, hjust = 0.5),
plot.subtitle = element_text(size = 10, hjust = 0.5),
plot.caption = element_text(size = 8))
# Reading in the dataset
bones <-
read.csv("Bones.csv") %>%
filter(complete.cases(.)) |>
dplyr::select(-sex)
p <- ncol(bones)
Need to run twice to remove all the outliers, since removing the original outliers changes the mean vector and covariance matrix
bones <-
bones |>
filter(
mahalanobis(bones, center = colMeans(bones), cov = var(bones)) <
qchisq((nrow(bones)-0.5)/nrow(bones), df = p)
)
bones <-
bones |>
filter(
mahalanobis(bones, center = colMeans(bones), cov = var(bones)) <
qchisq((nrow(bones)-0.5)/nrow(bones), df = p)
)
n <- nrow(bones)
Since the QQ plot showed that the data are aren’t MVN, but let’s use
the MLE method to estimate the Factor Analysis Model using the
fa()
function in the psych package anyway. We can use MLE
method by specifying fm = "ml"
Let’s compare when m = 1, m = 2, and m = 3
# Fitting the FA model using PC with 1 factor
bones_mlfa1 <-
fa(
r = cor(bones),
nfactors = 1,
fm = "ml",
rotate = "none",
n.obs = n
)
# Fitting the FA model using PC with 2 factors
bones_mlfa2 <-
fa(
r = cor(bones),
nfactors = 2,
fm = "ml",
rotate = "none",
n.obs = n
)
# Fitting the FA model using PC with 3 factors
bones_mlfa3 <-
fa(
r = cor(bones),
nfactors = 3,
fm = "ml",
rotate = "none",
n.obs = n
)
Examining the FA scree plot
fa.parallel(
bones,
fm = "ml",
fa = "fa"
)
## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
Checking the “fit” of the estimated correlation matrix
data.frame(
corr_fit = c("m = 1" = bones_mlfa1$fit,
"m = 2" = bones_mlfa2$fit,
"m = 3" = bones_mlfa3$fit),
off_diag = c("m = 1" = bones_mlfa1$fit.off,
"m = 2" = bones_mlfa2$fit.off,
"m = 3" = bones_mlfa3$fit.off)
)
## corr_fit off_diag
## m = 1 0.950 0.964
## m = 2 0.992 0.996
## m = 3 0.993 0.997
# Reduction in residuals between m = 1 and m = 2
ggcorr(
data = NULL,
cor_matrix = bones_mlfa1$residual - bones_mlfa2$residual,
low = "red",
mid = "white",
high = "red"
) +
labs(title = "Reduction in Residual Matrix from m = 1 to m = 2")
# Reduction in residuals between m = 2 and m = 3
ggcorr(
data = NULL,
cor_matrix = bones_mlfa2$residual - bones_mlfa3$residual,
low = "red",
mid = "white",
high = "red"
) +
labs(title = "Reduction in Residual Matrix from m = 2 to m = 3")
The fit improves noticeably from m = 1 to m = 2, but not much of an increase from m = 2 to m = 3
If the data are multivariate normal, we can use either AIC or BIC/SBC to help determine the “best” choice for the number of factors.
The “FA AIC function.R” file has a function that will
calculate the AIC (default) or BIC (penalty = log(n)
) for a
FA model.
Let’s look at the AIC value for m = 1 to m = 5
source("FA AIC function.R")
c(
"m = 1" = fa_IC(Y = bones, m = 1),
"m = 2" = fa_IC(Y = bones, m = 2),
"m = 3" = fa_IC(Y = bones, m = 3),
"m = 4" = fa_IC(Y = bones, m = 4),
"m = 5" = fa_IC(Y = bones, m = 5)
)
## m = 1 m = 2 m = 3 m = 4 m = 5
## 15823 15492 15482 15486 15502
The AIC suggests three factors, then four factors, then two factors.
What if we use BIC?
The difference between AIC and BIC is the penalty for the additional model terms.
AIC uses a penalty of 2: complexity term = 2 * k
where k = the number of model terms (p * m for a FA model)
BIC uses a penalty of \(\log(n)\) where \(n\) is the sample size: complexity term = \(\log(n) * k\)
c(
"m = 1" = fa_IC(Y = bones, m = 1, penalty = log(n)),
"m = 2" = fa_IC(Y = bones, m = 2, penalty = log(n)),
"m = 3" = fa_IC(Y = bones, m = 3, penalty = log(n)),
"m = 4" = fa_IC(Y = bones, m = 4, penalty = log(n)),
"m = 5" = fa_IC(Y = bones, m = 5, penalty = log(n))
)
## m = 1 m = 2 m = 3 m = 4 m = 5
## 15880 15606 15654 15715 15789
BIC will penalize larger, more complex models more than AIC does, and we see that for our example. BIC suggests two factors, then three factors, then 4 factors.
Two options:
### Need to give it the data, not the correlation matrix
bones_fa3 <-
fa(
r = bones,
nfactors = 3,
fm = "ml",
rotate = "varimax",
n.obs = n,
scores = "regression"
)
bones_fa3$scores |>
data.frame() |>
tibble()
## # A tibble: 604 × 3
## ML1 ML2 ML3
## <dbl> <dbl> <dbl>
## 1 -1.96 1.14 0.685
## 2 0.268 0.511 -0.243
## 3 -0.853 1.28 -1.11
## 4 -1.82 1.47 -0.655
## 5 -2.08 0.223 -0.435
## 6 -1.29 -0.173 0.664
## 7 -0.993 1.55 0.247
## 8 -1.73 0.547 1.14
## 9 -1.63 2.05 -0.0673
## 10 0.824 1.26 4.29
## # ℹ 594 more rows
fa()
and the correlation matrix
to fit the factor model, you can use factor.scores()
to
retrieve the factor scores.x =
data matrixf =
factor analysis object created by fa()method =
the method you want to estimate the scores
factor.scores(
x = bones |> head(10),
f = bones_fa3,
method = "Thurstone"
)
## $scores
## ML1 ML2 ML3
## 1 -4.849 4.951 2.986
## 2 0.343 1.048 -0.924
## 3 1.218 -2.166 -1.551
## 4 4.272 -5.605 -3.916
## 5 -2.545 0.186 3.658
## 6 -0.376 -0.607 -0.283
## 7 1.814 -0.150 -1.659
## 8 2.985 -4.266 -2.999
## 9 -4.771 6.215 2.275
## 10 1.909 0.394 2.414
##
## $weights
## ML1 ML2 ML3
## HumL -0.977 2.455 -2.19
## HumR -17.939 17.932 19.09
## RadL -1.481 0.175 2.10
## RadR 20.509 -25.717 -13.47
## FemL 8.973 -10.120 -4.98
## FemR -6.522 7.535 6.74
## TibL 0.481 1.042 -2.59
## TibR 11.617 -10.818 -12.64
## IBL -9.983 13.845 2.31
## IBR -2.877 0.439 7.49
## AceL 11.336 -11.352 -12.40
## AceR -21.759 25.316 18.64
## BIB 10.562 -10.545 -9.64
##
## $r.scores
## ML1 ML2 ML3
## ML1 1.000 -0.899 -0.810
## ML2 -0.899 1.000 0.782
## ML3 -0.810 0.782 1.000
##
## $missing
## [1] FALSE
##
## $R2
## [1] NA NA NA
It looks like that we should remove the Ace vars since those 2 aren’t that strongly correlated with the other 11 bones and rerun the analysis with just 2 factors: length and width
# PC Method
(bones_pc <-
fa(
r = cor(bones |> select(-AceR, -AceL)),
nfactors = 2,
fm = "pa",
rotate = "varimax",
n.obs = n
)
)
## Factor Analysis using method = pa
## Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n,
## rotate = "varimax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 h2 u2 com
## HumL 0.85 0.42 0.90 0.100 1.5
## HumR 0.84 0.45 0.90 0.098 1.5
## RadL 0.92 0.23 0.89 0.107 1.1
## RadR 0.92 0.24 0.90 0.098 1.1
## FemL 0.88 0.38 0.93 0.073 1.4
## FemR 0.88 0.39 0.92 0.077 1.4
## TibL 0.93 0.24 0.92 0.084 1.1
## TibR 0.93 0.24 0.92 0.082 1.1
## IBL 0.32 0.93 0.97 0.032 1.2
## IBR 0.32 0.94 0.98 0.022 1.2
## BIB 0.23 0.75 0.61 0.393 1.2
##
## PA1 PA2
## SS loadings 6.64 3.20
## Proportion Var 0.60 0.29
## Cumulative Var 0.60 0.89
## Proportion Explained 0.68 0.32
## Cumulative Proportion 0.68 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 2 factors are sufficient.
##
## df null model = 55 with the objective function = 26.6 with Chi Square = 15892
## df of the model are 34 and the objective function was 7.2
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 604 with the empirical chi square 46.1 with prob < 0.081
## The total n.obs was 604 with Likelihood Chi Square = 4301 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.563
## RMSEA index = 0.456 and the 90 % confidence intervals are 0.445 0.468
## BIC = 4083
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA2
## Correlation of (regression) scores with factors 0.99 0.99
## Multiple R square of scores with factors 0.98 0.98
## Minimum correlation of possible factor scores 0.96 0.96
summary(bones_pc)
##
## Factor analysis with Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n,
## rotate = "varimax", fm = "pa")
##
## Test of the hypothesis that 2 factors are sufficient.
## The degrees of freedom for the model is 34 and the objective function was 7.2
## The number of observations was 604 with Chi Square = 4301 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.03
## The df corrected root mean square of the residuals is 0.03
##
## Tucker Lewis Index of factoring reliability = 0.563
## RMSEA index = 0.456 and the 10 % confidence intervals are 0.445 0.468
## BIC = 4083
tibble(bones_pc$loadings) |> head(n = 100)
## # A tibble: 11 × 1
## `bones_pc$loadings`[,"PA1"] [,"PA2"]
## <dbl> <dbl>
## 1 0.849 0.423
## 2 0.838 0.447
## 3 0.916 0.235
## 4 0.920 0.235
## 5 0.883 0.384
## 6 0.878 0.391
## 7 0.928 0.236
## 8 0.928 0.236
## 9 0.321 0.930
## 10 0.322 0.935
## 11 0.228 0.745
corrplot(bones_pc$loadings[,], cl.pos = "n")
# ML Method
(bones_ml <-
fa(
r = cor(bones |> select(-AceR, -AceL)),
nfactors = 2,
fm="ml",
rotate = "varimax",
n.obs = n
)
)
## Factor Analysis using method = ml
## Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n,
## rotate = "varimax", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML2 ML1 h2 u2 com
## HumL 0.85 0.42 0.89 0.107 1.5
## HumR 0.84 0.44 0.89 0.105 1.5
## RadL 0.90 0.26 0.87 0.131 1.2
## RadR 0.90 0.26 0.88 0.123 1.2
## FemL 0.89 0.38 0.94 0.058 1.4
## FemR 0.89 0.39 0.94 0.062 1.4
## TibL 0.93 0.24 0.92 0.078 1.1
## TibR 0.93 0.24 0.92 0.077 1.1
## IBL 0.31 0.93 0.97 0.030 1.2
## IBR 0.31 0.94 0.98 0.015 1.2
## BIB 0.22 0.75 0.61 0.392 1.2
##
## ML2 ML1
## SS loadings 6.58 3.24
## Proportion Var 0.60 0.29
## Cumulative Var 0.60 0.89
## Proportion Explained 0.67 0.33
## Cumulative Proportion 0.67 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 2 factors are sufficient.
##
## df null model = 55 with the objective function = 26.6 with Chi Square = 15892
## df of the model are 34 and the objective function was 7.14
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 604 with the empirical chi square 49.5 with prob < 0.042
## The total n.obs was 604 with Likelihood Chi Square = 4266 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.567
## RMSEA index = 0.454 and the 90 % confidence intervals are 0.443 0.466
## BIC = 4048
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML2 ML1
## Correlation of (regression) scores with factors 0.99 0.99
## Multiple R square of scores with factors 0.98 0.98
## Minimum correlation of possible factor scores 0.96 0.97
summary(bones_ml)
##
## Factor analysis with Call: fa(r = cor(select(bones, -AceR, -AceL)), nfactors = 2, n.obs = n,
## rotate = "varimax", fm = "ml")
##
## Test of the hypothesis that 2 factors are sufficient.
## The degrees of freedom for the model is 34 and the objective function was 7.14
## The number of observations was 604 with Chi Square = 4266 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.03
## The df corrected root mean square of the residuals is 0.03
##
## Tucker Lewis Index of factoring reliability = 0.567
## RMSEA index = 0.454 and the 10 % confidence intervals are 0.443 0.466
## BIC = 4048
bones_ml$loadings
##
## Loadings:
## ML2 ML1
## HumL 0.845 0.423
## HumR 0.835 0.444
## RadL 0.896 0.255
## RadR 0.901 0.255
## FemL 0.891 0.384
## FemR 0.886 0.391
## TibL 0.930 0.241
## TibR 0.929 0.243
## IBL 0.314 0.934
## IBR 0.313 0.942
## BIB 0.223 0.747
##
## ML2 ML1
## SS loadings 6.582 3.240
## Proportion Var 0.598 0.295
## Cumulative Var 0.598 0.893
corrplot(bones_ml$loadings[,], cl.pos = "n")