data.frame(
x = 1:nrow(bones),
y = mahalanobis(bones, center = colMeans(bones), cov = var(bones))
) |>
ggplot(
mapping = aes(x = x, y = y)
) +
geom_col() +
geom_hline(
yintercept = qchisq((nrow(bones)-0.5)/nrow(bones), df = p),
color = "red"
) +
scale_y_continuous(expand = c(0, 0, 0.05, 0)) +
scale_x_continuous(expand = c(0, 2)) +
labs(y = "Mahalanobix Distance",
x = "Row Number")
Need to run twice to remove all the outliers, since removing the original outliers changes the mean vector and covariance matrix
bones <-
bones |>
filter(
mahalanobis(bones,center = colMeans(bones), cov = var(bones)) <
qchisq((nrow(bones)-0.5)/nrow(bones), df = p)
)
data.frame(
x = 1:nrow(bones),
y = mahalanobis(bones, center = colMeans(bones), cov = var(bones))
) |>
ggplot(
mapping = aes(x = x, y = y)
) +
geom_col() +
geom_hline(
yintercept = qchisq((nrow(bones)-0.5)/nrow(bones), df = p),
color = "red"
) +
scale_y_continuous(expand = c(0, 0, 0.05, 0)) +
scale_x_continuous(expand = c(0, 2)) +
labs(y = "Mahalanobix Distance",
x = "Row Number")
# Recording the sample size and number of numeric variables
n <- nrow(bones)
# Calculating the mean vector and covariance matrix
ybar <- colMeans(bones)
covmat <- cov(bones)
We can rearrange the order to find sets of highly correlated variables. Orders them based on the First PC: order = “FPC”
corrplot::corrplot(
cor(bones),
method = "circle",
order = "FPC"
)
# Arm and leg bones
bones |>
dplyr::select(HumL:TibR) |>
ggpairs(
lower = list(continuous = wrap("smooth",
method = "loess",
se = F,
alpha = 0.5,
color="steelblue")),
diag = list(continuous = wrap("densityDiag",
fill = "steelblue" ))
) +
theme_bw()
## Hip bones
bones |>
dplyr::select(IBL:BIB) |>
ggpairs(
lower = list(continuous = wrap("smooth",
method = "loess",
se = F,
alpha = 0.5,
color="darkred")),
diag = list(continuous = wrap("densityDiag",
fill = "darkred" ))
) +
theme_bw()
KMO can be used to check if the data are correlated enough for factor analysis
?KMO
KMO(bones)
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = bones)
## Overall MSA = 0.89
## MSA for each item =
## HumL HumR RadL RadR FemL FemR TibL TibR IBL IBR AceL AceR BIB
## 0.90 0.90 0.88 0.88 0.89 0.89 0.87 0.88 0.86 0.87 0.88 0.88 0.98
Since all the variables are above 0.70, factor analysis should work well!
Need to see if the data are MVN to determine if we need to use the PCA method or Likelihood Method
mvn(
data = bones,
mvnTest = "mardia",
multivariatePlot = "qq",
univariateTest = "SW",
desc = F
)
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 578.399294789812 7.47719995663404e-05 NO
## 2 Mardia Kurtosis 8.26316727098036 2.22044604925031e-16 NO
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk HumL 0.997 0.2810 YES
## 2 Shapiro-Wilk HumR 0.997 0.3971 YES
## 3 Shapiro-Wilk RadL 0.996 0.1426 YES
## 4 Shapiro-Wilk RadR 0.996 0.1677 YES
## 5 Shapiro-Wilk FemL 0.998 0.6776 YES
## 6 Shapiro-Wilk FemR 0.998 0.7913 YES
## 7 Shapiro-Wilk TibL 0.996 0.1787 YES
## 8 Shapiro-Wilk TibR 0.996 0.1470 YES
## 9 Shapiro-Wilk IBL 0.993 0.0078 NO
## 10 Shapiro-Wilk IBR 0.994 0.0176 NO
## 11 Shapiro-Wilk AceL 0.995 0.0499 NO
## 12 Shapiro-Wilk AceR 0.997 0.2920 YES
## 13 Shapiro-Wilk BIB 0.994 0.0191 NO
While the data doesn’t appear to be MVN, let’s use both methods to fit the model and see how they compare
Use principal() in the psych package
Need to specify how many factors there are. When using the PCA method, scree plots can be beneficial
screeplot(
prcomp(bones,
scale. = T),
type = "l"
)
Scree plot recommends 2 factors
An alternative to the screeplot for factor analysis models is the parallel plot:
#?fa.parallel
fa.parallel(
bones,
fm = "pa",
fa = "fa"
)
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
To use the parallel plot, we should choose the number of factors above the red dotted line.
The red line represents what we would expect to see if there no factors in our data!
It recommends 2 or 3 factors when using the PC method for Factor Analysis!
Let’s compare when m = 1, m = 2, and m = 3
# Fitting the FA model using PC with 1 factor
bones_pcfa1 <-
fa(
r = cor(bones),
nfactors = 1,
fm = "pa",
rotate = "none",
n.obs = n
)
# Fitting the FA model using PC with 2 factors
bones_pcfa2 <-
fa(r = cor(bones),
nfactors = 2,
fm = "pa",
rotate = "none",
n.obs = n
)
# Fitting the FA model using PC with 3 factors
bones_pcfa3 <-
fa(r = cor(bones),
nfactors = 3,
fm = "pa",
rotate = "none",
n.obs = n
)
We can get the residual matrix by adding $residuals.
But what is the residual matrix given by the fa()
function? It is the \(R - LL'\)
matrix!
Ideally the Residual matrix should be small: Most are around 0 - 0.10
# m = 1
ggcorr(
data = NULL,
cor_matrix = bones_pcfa1$residual,
low = "red",
mid = "white",
high = "red"
)
# m = 2
ggcorr(
data = NULL,
cor_matrix = bones_pcfa2$residual,
low = "red",
mid = "white",
high = "red"
)
# m = 3
ggcorr(
data = NULL,
cor_matrix = bones_pcfa3$residual,
low = "red",
mid = "white",
high = "red"
)
From the correlation plot, it looks like there is a large decrease in the residuals going from 1 to 2 factors, but not much of a reduction going from 2 to 3. Reinforces our earlier conclusion that m = 2
Additionally, the $fit will show how much of the correlation the FA model correlation matrix is retained:
\[\textrm{fit} = 1 - \sum\left( \frac{r_{ij}^2 - \hat{r}_{ij}^2}{r_{ij}^2} \right)\]
c(
"m = 1" = bones_pcfa1$fit,
"m = 2" = bones_pcfa2$fit,
"m = 3" = bones_pcfa3$fit
)
## m = 1 m = 2 m = 3
## 0.963 0.993 0.997
# Reduction in residuals between m = 1 and m = 2
ggcorr(
data = NULL,
cor_matrix = bones_pcfa1$residual - bones_pcfa2$residual,
low = "red",
mid = "white",
high = "red"
) +
labs(title = "Reduction in Residual Matrix from m = 1 to m = 2")
# Reduction in residuals between m = 2 and m = 3
ggcorr(
data = NULL,
cor_matrix = bones_pcfa2$residual - bones_pcfa3$residual,
low = "red",
mid = "white",
high = "red"
) +
labs(title = "Reduction in Residual Matrix from m = 2 to m = 3")
We want to rotate the factors using varimax
bones_pcfa <- fa(
r = cor(bones),
nfactors = 3,
fm = "pa",
rotate = "varimax",
n.obs = n
)
# We can get just the rotated Loadings using loadings()
loadings(bones_pcfa)
##
## Loadings:
## PA1 PA2 PA3
## HumL 0.816 0.367 0.312
## HumR 0.807 0.394 0.305
## RadL 0.890 0.193 0.253
## RadR 0.897 0.196 0.243
## FemL 0.852 0.335 0.303
## FemR 0.844 0.341 0.312
## TibL 0.928 0.231 0.130
## TibR 0.927 0.233 0.137
## IBL 0.301 0.894 0.283
## IBR 0.303 0.895 0.287
## AceL 0.382 0.447 0.782
## AceR 0.377 0.462 0.772
## BIB 0.223 0.732 0.177
##
## PA1 PA2 PA3
## SS loadings 6.594 3.251 1.939
## Proportion Var 0.507 0.250 0.149
## Cumulative Var 0.507 0.757 0.906
# Convert the loadings to a dataframe
bones_pcfa$loadings[,] |>
data.frame() |>
# Changing the variables from a row name to a column named variable
rownames_to_column(var = "Variable") |>
# Extending it so the loadings are 1 column and the Factor has an indicator column
pivot_longer(
cols = where(is.numeric),
names_to = "Factor",
values_to ="Loadings"
) |>
# Changing the order of the bones to match the order in the FA model
mutate(
Variable = factor(Variable, levels = rev(colnames(bones)))
) |>
# Removing any loadings below a chosen limit
filter(Loadings > 0.40) |>
# Call on ggplot() and specify x = 1st and y = 2nd factor
ggplot(
mapping = aes(
x = Factor,
y = Variable
)
) +
geom_tile(
mapping = aes(fill = Loadings),
color = "white"
) +
scale_fill_gradient(
low = "white",
high = "darkblue",
limits = c(0, 1)
) +
labs(fill = "Loading Values") +
theme_classic()
## Or we can use corrplot to make the process a little simpler
corrplot(bones_pcfa$loadings[,])
# Compare it to the correlation plot of the data
corrplot(cor(bones), order="hclust")
data.frame(communality = bones_pcfa$communality) |>
mutate(specific = 1-communality) |>
rownames_to_column(var = "variable") |>
gt::gt()
variable | communality | specific |
---|---|---|
HumL | 0.898 | 0.1019 |
HumR | 0.899 | 0.1007 |
RadL | 0.894 | 0.1062 |
RadR | 0.902 | 0.0976 |
FemL | 0.929 | 0.0710 |
FemR | 0.925 | 0.0747 |
TibL | 0.932 | 0.0678 |
TibR | 0.933 | 0.0673 |
IBL | 0.971 | 0.0295 |
IBR | 0.974 | 0.0260 |
AceL | 0.957 | 0.0433 |
AceR | 0.952 | 0.0481 |
BIB | 0.617 | 0.3829 |