Derivation: https://people.stat.sc.edu/hitchcock/stat530ch4slides_extra.pdf
##########################################################################
###################### #
# R example code for factor analysis: #
###################### #
##########################################################################
## Example 1(a):
# The 13 observed variables in the WAIS study are as follows:
#X1 = Information
#X2 = Comprehension
#X3 = Arithmetic
#X4 = Similarities
#X5 = Digit.span
#X6 = Vocabulary
#X7 = Digit.symbol
#X8 = Picture.completion
#X9 = Block.design
#X10 = Picture.arrangement
#X11 = Object.assembly
#X12 = Age
#X13 = Education
# The observed correlation matrix is:
wais.corr <- matrix( c(
1,0.67,0.62,0.66,0.47,0.81,0.47,0.60,0.49,0.51,0.41,-0.07,0.66,
.67,1,0.54,0.60,0.39,0.72,0.40,0.54,0.45,0.49,0.38,-0.08,0.52,
.62,.54,1,0.51,0.51,0.58,0.41,0.46,0.48,0.43,0.37,-0.08,0.49,
.66,.60,.51,1,0.41,0.68,0.49,0.56,0.50,0.50,0.41,-0.19,0.55,
.47,.39,.51,.41,1,0.45,0.45,0.42,0.39,0.42,0.31,-0.19,0.43,
.81,.72,.58,.68,.45,1,0.49,0.57,0.46,0.52,0.40,-0.02,0.62,
.47,.40,.41,.49,.45,.49,1,0.50,0.50,0.52,0.46,-0.46,0.57,
.60,.54,.46,.56,.42,.57,.50,1,0.61,0.59,0.51,-0.28,0.48,
.49,.45,.48,.50,.39,.46,.50,.61,1,0.54,0.59,-0.32,0.44,
.51,.49,.43,.50,.42,.52,.52,.59,.54,1,0.46,-0.37,0.49,
.41,.38,.37,.41,.31,.40,.46,.51,.59,.46,1,-0.28,0.40,
-.07,-.08,-.08,-.19,-.19,-.02,-.46,-.28,-.32,-.37,-.28,1,-0.29,
.66,.52,.49,.55,.43,.62,.57,.48,.44,.49,.40,-.29,1
), nrow=13, ncol=13, byrow=T)
#Using the built-in R function factanal (which uses the maximum likelihood method):
# If we enter a covariance matrix (or correlation matrix) instead of a raw data matrix,
# we must tell R the number of observations in the data set with n.obs
factanal(covmat=wais.corr,factors=4, rotation="none", n.obs=933)
##
## Call:
## factanal(factors = 4, covmat = wais.corr, n.obs = 933, rotation = "none")
##
## Uniquenesses:
## [1] 0.215 0.394 0.177 0.414 0.615 0.141 0.386 0.379 0.327 0.465 0.523 0.420
## [13] 0.389
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## [1,] 0.860 -0.194
## [2,] 0.756 -0.150
## [3,] 0.744 -0.115 0.504
## [4,] 0.757 -0.110
## [5,] 0.580 0.163 -0.118
## [6,] 0.867 -0.266 -0.191
## [7,] 0.644 0.381 -0.223
## [8,] 0.723 0.222 0.207
## [9,] 0.665 0.365 0.301
## [10,] 0.667 0.290
## [11,] 0.562 0.330 0.229
## [12,] -0.229 -0.685 0.235
## [13,] 0.728 -0.102 -0.253
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 6.250 1.149 0.378 0.377
## Proportion Var 0.481 0.088 0.029 0.029
## Cumulative Var 0.481 0.569 0.598 0.627
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 95.54 on 32 degrees of freedom.
## The p-value is 3.04e-08
# Apparently 4 factors are not enough (P-value too small).
factanal(covmat=wais.corr,factors=5, rotation="none", n.obs=933)
##
## Call:
## factanal(factors = 5, covmat = wais.corr, n.obs = 933, rotation = "none")
##
## Uniquenesses:
## [1] 0.206 0.378 0.005 0.412 0.638 0.148 0.407 0.377 0.297 0.438 0.504 0.381
## [13] 0.304
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5
## [1,] 0.650 0.554 -0.242
## [2,] 0.567 0.494 -0.193 -0.122
## [3,] 0.996
## [4,] 0.539 0.540
## [5,] 0.525 0.266
## [6,] 0.613 0.605 -0.329
## [7,] 0.436 0.508 0.325 -0.199
## [8,] 0.489 0.557 0.166 0.201
## [9,] 0.504 0.454 0.351 0.325 0.121
## [10,] 0.456 0.517 0.245 -0.157
## [11,] 0.392 0.428 0.297 0.228 0.137
## [12,] -0.300 -0.664 0.253 0.125
## [13,] 0.518 0.541 -0.311 0.191
##
## Factor1 Factor2 Factor3 Factor4 Factor5
## SS loadings 4.000 2.888 1.064 0.411 0.141
## Proportion Var 0.308 0.222 0.082 0.032 0.011
## Cumulative Var 0.308 0.530 0.612 0.643 0.654
##
## Test of the hypothesis that 5 factors are sufficient.
## The chi square statistic is 58.16 on 23 degrees of freedom.
## The p-value is 7.02e-05
# Apparently 5 factors are not enough (P-value still too small).
factanal(covmat=wais.corr,factors=6, rotation="none", n.obs=933)
##
## Call:
## factanal(factors = 6, covmat = wais.corr, n.obs = 933, rotation = "none")
##
## Uniquenesses:
## [1] 0.005 0.378 0.060 0.420 0.629 0.102 0.327 0.362 0.313 0.424 0.483 0.429
## [13] 0.398
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## [1,] 0.996
## [2,] 0.690 0.137 0.188 0.269 -0.120
## [3,] 0.658 -0.227 0.674
## [4,] 0.680 0.243 0.183 0.154
## [5,] 0.492 0.131 0.318 -0.100
## [6,] 0.829 0.133 0.117 0.421
## [7,] 0.495 0.500 0.287 -0.272 0.127
## [8,] 0.620 0.388 0.211 0.210 -0.120
## [9,] 0.516 0.418 0.348 -0.135 0.310
## [10,] 0.533 0.433 0.262 -0.185
## [11,] 0.432 0.409 0.265 0.232 0.173
## [12,] -0.567 -0.204 0.362 0.236 0.119
## [13,] 0.675 0.255 0.150 -0.235
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## SS loadings 5.121 1.481 1.092 0.441 0.396 0.138
## Proportion Var 0.394 0.114 0.084 0.034 0.030 0.011
## Cumulative Var 0.394 0.508 0.592 0.626 0.656 0.667
##
## Test of the hypothesis that 6 factors are sufficient.
## The chi square statistic is 20.82 on 15 degrees of freedom.
## The p-value is 0.143
# Apparently 6 factors are enough (P-value = 0.143).
# Let's save the final factor analysis solution as WAIS.fa.6 :
WAIS.fa.6 <- factanal(covmat=wais.corr,factors=6, rotation="none", n.obs=933)
# The specific variances for each variable are labeled "uniquenesses".
# We can request them by:
WAIS.fa.6$uniquenesses
## [1] 0.00500000 0.37845687 0.06013834 0.41968933 0.62876449 0.10224316
## [7] 0.32696262 0.36192374 0.31302999 0.42366536 0.48327624 0.42871629
## [13] 0.39790090
# Since we are using a correlation matrix, the communalities are 1 minus the specific variances:
1-WAIS.fa.6$uniquenesses
## [1] 0.9950000 0.6215431 0.9398617 0.5803107 0.3712355 0.8977568 0.6730374
## [8] 0.6380763 0.6869700 0.5763346 0.5167238 0.5712837 0.6020991
# We see that X1 (= Information) shares very much of its variance with the other 12 variables via the factors.
# It is similar for X3 (= Arithmetic).
# The factors are not too interpretable.
# We will try a varimax rotation to aid interpretation:
WAIS.fa.6.v <- factanal(covmat=wais.corr,factors=6, rotation="varimax", n.obs=933)
WAIS.fa.6.v
##
## Call:
## factanal(factors = 6, covmat = wais.corr, n.obs = 933, rotation = "varimax")
##
## Uniquenesses:
## [1] 0.005 0.378 0.060 0.420 0.629 0.102 0.327 0.362 0.313 0.424 0.483 0.429
## [13] 0.398
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## [1,] 0.794 0.256 0.274 0.470
## [2,] 0.682 0.286 0.249
## [3,] 0.373 0.245 0.857
## [4,] 0.625 0.323 0.193 0.217
## [5,] 0.355 0.211 0.260 0.362
## [6,] 0.882 0.246 0.223
## [7,] 0.387 0.298 0.592 0.189 0.219
## [8,] 0.471 0.541 0.259 0.157 -0.167
## [9,] 0.264 0.708 0.251 0.224
## [10,] 0.436 0.408 0.402 0.171 -0.168
## [11,] 0.235 0.614 0.236 0.138
## [12,] -0.201 -0.725
## [13,] 0.574 0.190 0.374 0.236 0.147 0.138
##
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## SS loadings 3.548 1.921 1.477 1.314 0.257 0.152
## Proportion Var 0.273 0.148 0.114 0.101 0.020 0.012
## Cumulative Var 0.273 0.421 0.534 0.635 0.655 0.667
##
## Test of the hypothesis that 6 factors are sufficient.
## The chi square statistic is 20.82 on 15 degrees of freedom.
## The p-value is 0.143
# The first factor seems to be a combination of Information, Comprehension, and Vocabulary.
# Maybe a "Word ability" factor?
# The fourth factor seems to be a more mathematical factor.
## Example 1(b):
# The 11-variable WAIS correlation matrix:
# This 11-variable data set involves only the "test scores",
# and drops the demographic variables.
wais.corr.11 <- wais.corr[1:11,1:11]
factanal(covmat=wais.corr.11,factors=4, rotation="none", n.obs=933)
##
## Call:
## factanal(factors = 4, covmat = wais.corr.11, n.obs = 933, rotation = "none")
##
## Uniquenesses:
## [1] 0.229 0.387 0.005 0.416 0.645 0.137 0.005 0.375 0.331 0.492 0.519
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## [1,] 0.666 0.546 0.140
## [2,] 0.576 0.513 0.130
## [3,] 0.840 0.537
## [4,] 0.610 0.460
## [5,] 0.577 0.131
## [6,] 0.656 0.623 -0.195
## [7,] 0.835 -0.545
## [8,] 0.586 0.422 0.320
## [9,] 0.594 0.266 0.495
## [10,] 0.576 0.322 0.255
## [11,] 0.503 0.237 0.407
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 4.595 1.587 0.648 0.629
## Proportion Var 0.418 0.144 0.059 0.057
## Cumulative Var 0.418 0.562 0.621 0.678
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 35.4 on 17 degrees of freedom.
## The p-value is 0.00551
# Apparently 4 factors are not enough (P-value too small).
factanal(covmat=wais.corr.11,factors=5, rotation="none", n.obs=933)
##
## Call:
## factanal(factors = 5, covmat = wais.corr.11, n.obs = 933, rotation = "none")
##
## Uniquenesses:
## [1] 0.235 0.389 0.117 0.419 0.600 0.109 0.277 0.308 0.334 0.472 0.456
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5
## [1,] 0.851 -0.172
## [2,] 0.756 -0.141 0.102
## [3,] 0.780 -0.523
## [4,] 0.748 0.140
## [5,] 0.581 0.122 -0.112 -0.143 0.122
## [6,] 0.875 -0.282 0.207
## [7,] 0.630 0.373 0.150 -0.405
## [8,] 0.717 0.273 0.175 0.195 0.188
## [9,] 0.655 0.430 0.194 -0.110
## [10,] 0.651 0.268 0.138 0.110
## [11,] 0.553 0.400 0.103 0.137 -0.222
##
## Factor1 Factor2 Factor3 Factor4 Factor5
## SS loadings 5.638 0.774 0.452 0.291 0.129
## Proportion Var 0.513 0.070 0.041 0.026 0.012
## Cumulative Var 0.513 0.583 0.624 0.650 0.662
##
## Test of the hypothesis that 5 factors are sufficient.
## The chi square statistic is 12.46 on 10 degrees of freedom.
## The p-value is 0.256
# Apparently 5 factors are enough (P-value = 0.256).
# With a varimax rotation:
WAIS.11.fa.5.v<-factanal(covmat=wais.corr.11,factors=5, rotation="varimax", n.obs=933)
WAIS.11.fa.5.v
##
## Call:
## factanal(factors = 5, covmat = wais.corr.11, n.obs = 933, rotation = "varimax")
##
## Uniquenesses:
## [1] 0.235 0.389 0.117 0.419 0.600 0.109 0.277 0.308 0.334 0.472 0.456
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4 Factor5
## [1,] 0.745 0.264 0.301 0.192 0.118
## [2,] 0.667 0.278 0.244 0.129 0.111
## [3,] 0.378 0.236 0.814 0.145
## [4,] 0.591 0.332 0.207 0.252 0.121
## [5,] 0.288 0.208 0.366 0.341 0.155
## [6,] 0.865 0.216 0.207 0.229
## [7,] 0.251 0.364 0.153 0.708
## [8,] 0.425 0.548 0.156 0.216 0.375
## [9,] 0.246 0.708 0.230 0.201 0.107
## [10,] 0.355 0.457 0.163 0.325 0.245
## [11,] 0.211 0.664 0.128 0.205
##
## Factor1 Factor2 Factor3 Factor4 Factor5
## SS loadings 2.799 1.986 1.176 1.043 0.280
## Proportion Var 0.254 0.181 0.107 0.095 0.025
## Cumulative Var 0.254 0.435 0.542 0.637 0.662
##
## Test of the hypothesis that 5 factors are sufficient.
## The chi square statistic is 12.46 on 10 degrees of freedom.
## The p-value is 0.256
# The first factor still seems to be a verbal factor, and the third is the math factor.
## Example 2:
######################################################################################################
######################################################################################################
##
## Entering the raw data for the life expectancy example:
##
"life" <-
structure(.Data = list(c(63., 34., 38., 59., 56., 62., 50., 65., 56., 69., 65., 64., 56., 60., 61., 49., 59., 63., 59., 65., 65., 64.,
64., 67., 61., 68., 67., 65., 59., 58., 57.)
, c(51., 29., 30., 42., 38., 44., 39., 44., 46., 47., 48., 50., 44., 44., 45., 40., 42., 44., 44., 48., 48., 63.,
43., 45., 40., 46., 45., 46., 43., 44., 46.)
, c(30., 13., 17., 20., 18., 24., 20., 22., 24., 24., 26., 28., 25., 22., 22., 22., 22., 23., 24., 28., 26., 21.,
21., 23., 21., 23., 23., 24., 23., 24., 28.)
, c(13., 5., 7., 6., 7., 7., 7., 7., 11., 8., 9., 11., 10., 6., 8., 9., 6., 8., 8., 14., 9., 7., 6., 8., 10., 8.,
8., 9., 10., 9., 9.)
, c(67., 38., 38., 64., 62., 69., 55., 72., 63., 75., 68., 66., 61., 65., 65., 51., 61., 67., 63., 68., 67., 68.,
68., 74., 67., 75., 74., 71., 66., 62., 60.)
, c(54., 32., 34., 46., 46., 50., 43., 50., 54., 53., 50., 51., 48., 45., 49., 41., 43., 48., 46., 51., 49., 47.,
47., 51., 46., 52., 51., 51., 49., 47., 49.)
, c(34., 17., 20., 25., 25., 28., 23., 27., 33., 29., 27., 29., 27., 25., 27., 23., 22., 26., 25., 29., 27., 25.,
24., 28., 25., 29., 28., 28., 27., 25., 28.)
, c(15., 6., 7., 8., 10., 14., 8., 9., 19., 10., 10., 11., 12., 9., 10., 8., 7., 9., 8., 13., 10., 9., 8., 10., 11.,
10., 10., 10., 12., 10., 11.)
)
, class = "data.frame"
, names = c("m0", "m25", "m50", "m75", "w0", "w25", "w50", "w75")
, row.names = c("Algeria", "Cameroon", "Madagascar", "Mauritius", "Reunion", "Seychelles", "South Africa(C)", "South Africa(W)",
"Tunisia", "Canada", "Costa Rica", "Dominican Rep", "El Salvador", "Greenland", "Grenada", "Guatemala",
"Honduras", "Jamaica", "Mexico", "Nicaragua", "Panama", "Trinidad(62)", "Trinidad (67)",
"United States (66)", "United States (NW66)", "United States (W66)", "United States (67)", "Argentina",
"Chile", "Columbia", "Ecuador")
)
##
##
######################################################################################################
######################################################################################################
# A data set called life has been created.
factanal(life,factors=1, rotation="varimax")
##
## Call:
## factanal(x = life, factors = 1, rotation = "varimax")
##
## Uniquenesses:
## m0 m25 m50 m75 w0 w25 w50 w75
## 0.238 0.470 0.399 0.696 0.217 0.005 0.117 0.532
##
## Loadings:
## Factor1
## m0 0.873
## m25 0.728
## m50 0.776
## m75 0.552
## w0 0.885
## w25 0.998
## w50 0.940
## w75 0.684
##
## Factor1
## SS loadings 5.329
## Proportion Var 0.666
##
## Test of the hypothesis that 1 factor is sufficient.
## The chi square statistic is 163.11 on 20 degrees of freedom.
## The p-value is 1.88e-24
# One factor is clearly not enough (tiny P-value).
factanal(life,factors=2, rotation="varimax")
##
## Call:
## factanal(x = life, factors = 2, rotation = "varimax")
##
## Uniquenesses:
## m0 m25 m50 m75 w0 w25 w50 w75
## 0.024 0.442 0.346 0.408 0.015 0.011 0.015 0.178
##
## Loadings:
## Factor1 Factor2
## m0 0.972 0.179
## m25 0.670 0.329
## m50 0.480 0.651
## m75 0.122 0.760
## w0 0.973 0.194
## w25 0.790 0.603
## w50 0.567 0.815
## w75 0.185 0.888
##
## Factor1 Factor2
## SS loadings 3.567 2.994
## Proportion Var 0.446 0.374
## Cumulative Var 0.446 0.820
##
## Test of the hypothesis that 2 factors are sufficient.
## The chi square statistic is 45.24 on 13 degrees of freedom.
## The p-value is 1.91e-05
# Two factors are clearly not enough (tiny P-value).
factanal(life,factors=3, rotation="varimax")
##
## Call:
## factanal(x = life, factors = 3, rotation = "varimax")
##
## Uniquenesses:
## m0 m25 m50 m75 w0 w25 w50 w75
## 0.005 0.362 0.066 0.288 0.005 0.011 0.020 0.146
##
## Loadings:
## Factor1 Factor2 Factor3
## m0 0.964 0.122 0.226
## m25 0.646 0.169 0.438
## m50 0.430 0.354 0.790
## m75 0.525 0.656
## w0 0.970 0.217
## w25 0.764 0.556 0.310
## w50 0.536 0.729 0.401
## w75 0.156 0.867 0.280
##
## Factor1 Factor2 Factor3
## SS loadings 3.375 2.082 1.640
## Proportion Var 0.422 0.260 0.205
## Cumulative Var 0.422 0.682 0.887
##
## Test of the hypothesis that 3 factors are sufficient.
## The chi square statistic is 6.73 on 7 degrees of freedom.
## The p-value is 0.458
# Three factors may be enough (P-value = 0.458).
# How could we interpret the three factors?
# Saving the result as life.fa.3.v:
life.fa.3.v <- factanal(life,factors=3, rotation="varimax")
# The communalities:
1-life.fa.3.v$uniquenesses
## m0 m25 m50 m75 w0 w25 w50 w75
## 0.9950000 0.6383261 0.9337228 0.7122064 0.9950000 0.9889330 0.9798799 0.8540204
# We see that m0, m50, w0, w25, w50 share very much of their variances with the other variables via the factors.
# m25 and m75 are more "unique".
# What does this mean?
# Estimating factor scores for the life expectancy data set:
life.fa.3.scores <- factanal(life,factors=3, rotation="varimax",scores="regression")$scores
#### Plotting the 3 factor scores for this data set using a 3-D plot:
# Creating a data frame with the original data and the scores:
life.df <- data.frame(life, life.fa.3.scores)
attach(life.df)
### Doing a 3-D plot:
library(lattice) # loading the lattice package
cloud(Factor3 ~ Factor1 * Factor2, xlim=range(Factor1), ylim=range(Factor2), zlim=range(Factor3),
pch=row.names(life.df),
scales = list(distance = rep(1, 3), arrows = FALSE))
### A nicer 3-D plot of the factor scores, including labels of the country names:
library(scatterplot3d)
with(life.df, {
s3d <- scatterplot3d(Factor1,Factor2,Factor3, # x y and z axis
color="blue", pch=19, # filled blue circles
type="h", # vertical lines to the x-y plane
main="3-D Scatterplot of Factor Scores",
xlab="Factor 1",
ylab="Factor 2",
zlab="Factor 3")
s3d.coords <- s3d$xyz.convert(Factor1,Factor2,Factor3) # convert 3D coords to 2D projection
text(s3d.coords$x, s3d.coords$y, # x and y coordinates
labels=row.names(life.df), # text to plot
cex=.5, pos=4) # shrink text 50% and place to right of points)
})
# Example code borrowed from:
# https://statmethods.wordpress.com/2012/01/30/getting-fancy-with-3-d-scatterplots/
# Looking back at the factor scores:
life.df[,c("Factor1","Factor2","Factor3")]
## Factor1 Factor2 Factor3
## Algeria -0.258062561 1.90095771 1.91581631
## Cameroon -2.782495791 -0.72340014 -1.84772224
## Madagascar -2.806428187 -0.81158820 -0.01210318
## Mauritius 0.141004934 -0.29028454 -0.85862443
## Reunion -0.196352142 0.47429917 -1.55046466
## Seychelles 0.367371307 0.82902375 -0.55214085
## South Africa(C) -1.028567629 -0.08065792 -0.65421971
## South Africa(W) 0.946193522 0.06400408 -0.91995289
## Tunisia -0.862493550 3.59177195 -0.36442148
## Canada 1.245304248 0.29564122 -0.27342781
## Costa Rica 0.508736247 -0.50500435 1.01328707
## Dominican Rep 0.106044085 0.01111171 1.83871599
## El Salvador -0.608155779 0.65100820 0.48836431
## Greenland 0.235114220 -0.69123901 -0.38558654
## Grenada 0.132008172 0.25241049 -0.15220645
## Guatemala -1.450336359 -0.67765804 0.65911906
## Honduras 0.043253249 -1.85175707 0.30633182
## Jamaica 0.462124701 -0.51918493 0.08032855
## Mexico -0.052332675 -0.72020002 0.44417800
## Nicaragua 0.268974443 0.08407227 1.70568388
## Panama 0.442333434 -0.73778272 1.25218728
## Trinidad(62) 0.711367053 -0.95989475 -0.21545329
## Trinidad (67) 0.787286051 -1.10729029 -0.51958264
## United States (66) 1.128331259 0.16389896 -0.68177046
## United States (NW66) 0.400058903 -0.36230253 -0.74299137
## United States (W66) 1.214345385 0.40877239 -0.69225320
## United States (67) 1.128331259 0.16389896 -0.68177046
## Argentina 0.731344988 0.24811968 -0.12817725
## Chile 0.009751528 0.75222637 -0.49198911
## Columbia -0.240602517 -0.29543613 0.42919600
## Ecuador -0.723451797 0.44246371 1.59164974
row.names(life.df)[order(Factor1)]
## [1] "Madagascar" "Cameroon" "Guatemala"
## [4] "South Africa(C)" "Tunisia" "Ecuador"
## [7] "El Salvador" "Algeria" "Columbia"
## [10] "Reunion" "Mexico" "Chile"
## [13] "Honduras" "Dominican Rep" "Grenada"
## [16] "Mauritius" "Greenland" "Nicaragua"
## [19] "Seychelles" "United States (NW66)" "Panama"
## [22] "Jamaica" "Costa Rica" "Trinidad(62)"
## [25] "Argentina" "Trinidad (67)" "South Africa(W)"
## [28] "United States (66)" "United States (67)" "United States (W66)"
## [31] "Canada"
# Recall factor 1 basically measures life expectancy at birth.
# We see Madagascar and Cameroon have the smallest values for Factor 1.
# The U.S .and Canada have high values for Factor 1.
row.names(life.df)[order(Factor2)]
## [1] "Honduras" "Trinidad (67)" "Trinidad(62)"
## [4] "Madagascar" "Panama" "Cameroon"
## [7] "Mexico" "Greenland" "Guatemala"
## [10] "Jamaica" "Costa Rica" "United States (NW66)"
## [13] "Columbia" "Mauritius" "South Africa(C)"
## [16] "Dominican Rep" "South Africa(W)" "Nicaragua"
## [19] "United States (66)" "United States (67)" "Argentina"
## [22] "Grenada" "Canada" "United States (W66)"
## [25] "Ecuador" "Reunion" "El Salvador"
## [28] "Chile" "Seychelles" "Algeria"
## [31] "Tunisia"
# Recall factor 2 basically measures life expectancy for older women.
# Cameroon also has a low score for this factor.
row.names(life.df)[order(Factor3)]
## [1] "Cameroon" "Reunion" "South Africa(W)"
## [4] "Mauritius" "United States (NW66)" "United States (W66)"
## [7] "United States (66)" "United States (67)" "South Africa(C)"
## [10] "Seychelles" "Trinidad (67)" "Chile"
## [13] "Greenland" "Tunisia" "Canada"
## [16] "Trinidad(62)" "Grenada" "Argentina"
## [19] "Madagascar" "Jamaica" "Honduras"
## [22] "Columbia" "Mexico" "El Salvador"
## [25] "Guatemala" "Costa Rica" "Panama"
## [28] "Ecuador" "Nicaragua" "Dominican Rep"
## [31] "Algeria"
# Factor 3 reflects (mostly) life expectancy for older men.
# Cameroon has the lowest score for this factor.
# Algeria has the highest score for Factor 3.
### Doing separate 2-D scatterplots of the factor scores:
# Factor 2 vs. Factor 1:
plot(Factor1, Factor2, type='n', xlab='Factor 1 scores', ylab='Factor 2 scores')
text(Factor1, Factor2, labels = row.names(life.df), cex = 0.7 )
# Factor 3 vs. Factor 1:
plot(Factor1, Factor3, type='n', xlab='Factor 1 scores', ylab='Factor 3 scores')
text(Factor1, Factor3, labels = row.names(life.df), cex = 0.7 )
# Factor 3 vs. Factor 2:
plot(Factor2, Factor3, type='n', xlab='Factor 2 scores', ylab='Factor 3 scores')
text(Factor2, Factor3, labels = row.names(life.df), cex = 0.7 )
#########################################################################################################
# A separate R function (and some auxilary functions) to do a variety of types of factor analysis:
# (Originally written by Dr. Brian Habing)
# Copy this long section of code into R first:
########################################### BEGIN CODE ##################################################
#########################################################################################################
#########################################################################################################
#########################################################################################################
#########################################################################################################
factpca<-function(x=NULL,r=cor(x)){
#Performs Principal Components Factor Analysis using the#
#correlation matrix. Can be given either the raw data or the#
#correlation matrix.#
xeig<-eigen(r);xval<-xeig$values;xvec<-xeig$vectors
for (i in 1:length(xval)){
xvec[,i]<-xvec[,i]*sqrt(xval[i])}
rownames(xvec)<-colnames(r)
return(xvec)
}
factpf<-function(x=NULL,r=cor(x)){
#Performs Principal Factor Factor Analysis using the#
#correlation matrix. Can be given either the raw data or the#
#correlation matrix.#
n<-ncol(r);redcormat<-r;diag(redcormat)<-apply(abs(r-diag(1,nrow=n,ncol=n)),2,max)
xeig<-eigen(redcormat);xval<-xeig$values;xvec<-xeig$vectors
for (i in 1:length(xval[xval>0])){
xvec[,i]<-xvec[,i]*sqrt(xval[i])}
rownames(xvec)<-colnames(x)
return(xvec[,xval>0])
}
factiter<-function(x=NULL,r=cor(x),niter=100,maxfactors=ncol(r),diag=FALSE){
#Performs Iterated Principal Factor Factor Analysis using the#
#correlation matrix. Can be given either the raw data or the#
#correlation matrix.#
n<-ncol(r);temp<-matrix(0,nrow=n,ncol=n);comm<-matrix(0,nrow=niter+2,ncol=n);y<-factpf(r=r);m<-ncol(y)
temp[1:n,1:m]<-y;comm[1,]<-apply(abs(r-diag(1,nrow=n,ncol=n)),2,max);comm[2,]<-apply(as.matrix(temp[,1:maxfactors])^2,1,sum)
for (i in 3:(niter+2)){
redcormat<-r;diag(redcormat)<-comm[i-1,];xeig<-eigen(redcormat);m<-min(maxfactors,length(xeig$values[xeig$values>0]))
for (j in 1:m){
xeig$vectors[,j]<-xeig$vectors[,j]*sqrt(xeig$values[j])}
temp<-matrix(0,nrow=n,ncol=n);temp[1:n,1:m]<-xeig$vectors[1:n,1:m];comm[i,]<-apply(as.matrix(temp[1:n,1:m])^2,1,sum)
}
loadings<-as.matrix(temp[1:n,1:m]);rownames(loadings)<-colnames(r);rownames(comm)<-(-1:niter)
if (diag!=FALSE)
{list(comm=comm,loadings=loadings)}
else
{loadings}
}
fact<-function(x=NULL,r=cor(x),maxfactors=floor((2*ncol(r)+1-sqrt(8*ncol(r)+5))/2),
method=c("iter","pf","pca","norm"),rotation=c("none","varimax","quartimax"),
niter=100,full=FALSE,plot=TRUE,dec=3){
method<-match.arg(method);rotation<-match.arg(rotation)
if (rotation=="none"){rot="- no rotation"}
if (rotation=="varimax"){rot="- varimax rotation"}
if (rotation=="quartimax"){loaded<-library(GPArotation,logical.return=T)
rot<-" - quartimax"
if (loaded==F){
rot<-"- varimax rotation"
rotation<-"varimax"
warning("GPArotation library is needed for",
" quartimax rotation. Varimax was used instead",
call.=FALSE)}}
if (method=="iter"){ftemp<-factiter(r=r,niter=niter,maxfactors=maxfactors,diag=T)
loadings<-ftemp$loadings
concheck<-max(abs(ftemp$comm[niter+2,]-ftemp$comm[niter+1,]))
if (concheck>=5e-4){
warning("Convergence not achieved, difference of ",
as.character(round(concheck,5)),
" after ",as.character(niter)," iterations.",
call.=FALSE)}
maxcon<-apply(ftemp$comm,1,max); maxlegal<-max(as.integer(names(maxcon[maxcon<1])))
if (as.integer(maxlegal)<niter){
warning("Communality greater than one, ",maxlegal,
" was last legal iteration.",call.=FALSE)}
meth<-paste("iterated principal factor",rot)}
if (method=="pf"){loadings<-factpf(r=r)
loadings<-loadings[,1:min(ncol(loadings),maxfactors)]
meth<-paste("principal factor",rot)}
if (method=="pca"){loadings<-factpca(r=r)
loadings<-loadings[,1:min(ncol(loadings),maxfactors)]
meth<-paste("principal components",rot)}
if (method=="norm"){q<-ncol(r)
maxfactors<-min(maxfactors,
floor((2*ncol(r)+1-sqrt(8*ncol(r)+5))/2))
faout<-factanal(covmat=r,factors=maxfactors,rotation="none")
loadings<-diag(rep(1,ncol(r)))%*%faout$loadings
meth<-paste("multivariate normal",rot)}
loadings<-as.matrix(loadings)
if ((rotation=="varimax")&&(ncol(loadings)>1)){
loadings<-diag(rep(1,ncol(r)))%*%varimax(loadings)$loadings}
if ((rotation=="quartimax")&&(ncol(loadings)>1)){
loadings<-quartimax(loadings,normalize=TRUE)$loadings}
loadings<-as.matrix(loadings)
loadings<-loadings[,order(apply(-loadings^2,2,sum))]
loadings<-as.matrix(loadings)
colnames(loadings)<-paste("Factor",as.character(1:ncol(loadings)),sep="")
rownames(loadings)<-colnames(r)
resid<-r-loadings%*%t(loadings)-diag(1-apply(loadings^2,1,sum))
cortri<-r[upper.tri(r)]
restri<-resid[upper.tri(resid)]
predtri<-cortri-restri
if (plot==TRUE){
par(mfrow=c(2,2))
plot(princomp(covmat=r),main="")
plot(cortri,predtri,xlab="Original Correlation",
ylab="Predicted Correlation",xlim=c(-1,1),ylim=c(-1,1))
lines(c(-1,1),c(-1,1))
plot(predtri,restri,xlab="Predicted Correlation",ylab="Residual Correlation",
main=paste("r-squared=",as.character(round(cor(predtri,restri)^2,3))))
lines(c(-1,1),c(0,0))
abline(lm(restri~predtri))
hist(restri,xlab="Residual Correlations",
main=paste("mean=",as.character(round(mean(restri),3)),
" s.d.=",as.character(round(sd(restri),3))))
par(mfrow=c(1,1))
}
loadings<-t(t(loadings)*sign(apply(sign(loadings),2,sum)+.001))
evs<-eigen(r)$values
comm<-apply(loadings^2,1,sum)
names(comm)<-colnames(r)
vexpl<-apply(loadings^2,2,sum)
vexpl<-rbind(vexpl,vexpl/ncol(r))
colnames(vexpl)<-colnames(loadings)
rownames(vexpl)<-c("variance explained","percent explained")
resid<-resid
if (full!=TRUE){resid<-summary(resid[upper.tri(resid)])}
list(eigen.values=round(evs,dec),
method=meth,
loadings=round(loadings,dec),
communalities=round(comm,dec),
importance=round(vexpl,dec),
residuals=round(resid,dec))
}
#########################################################################################################
#########################################################################################################
#########################################################################################################
#########################################################################################################
############################################# END CODE ##################################################
# Note this can do both ML factor analysis and principal factor analysis.
# It also does either no rotation, varimax rotation,
# or (if you install the GPArotation package first) quartimax rotation.
# By default, it gives some excellent graphics for checking model assumptions:
# Example on the WAIS data:
# method = "norm" means ML using the normality assumption.
fact(r=wais.corr,method="norm",rotation="none",maxfactors=6)
## $eigen.values
## [1] 6.696 1.414 0.798 0.710 0.553 0.461 0.436 0.402 0.386 0.372 0.341 0.262
## [13] 0.170
##
## $method
## [1] "multivariate normal - no rotation"
##
## $loadings
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## [1,] 0.996 -0.015 -0.057 0.016 0.000 0.000
## [2,] 0.690 0.137 0.188 -0.269 0.068 -0.120
## [3,] 0.658 -0.227 0.674 0.034 -0.011 -0.003
## [4,] 0.680 0.243 0.183 -0.154 0.006 -0.047
## [5,] 0.492 0.131 0.318 0.032 -0.100 -0.012
## [6,] 0.829 0.133 0.117 -0.421 -0.031 0.022
## [7,] 0.495 0.500 0.287 0.078 -0.272 0.127
## [8,] 0.620 0.388 0.211 0.024 0.210 -0.120
## [9,] 0.516 0.418 0.348 0.135 0.310 0.100
## [10,] 0.533 0.433 0.262 0.021 0.035 -0.185
## [11,] 0.432 0.409 0.265 0.093 0.232 0.173
## [12,] -0.085 -0.567 -0.204 -0.362 0.236 0.119
## [13,] 0.675 0.255 0.150 -0.002 -0.235 0.052
##
## $communalities
## [1] 0.995 0.622 0.940 0.580 0.371 0.898 0.673 0.638 0.687 0.576 0.517 0.571
## [13] 0.602
##
## $importance
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## variance explained 5.121 1.481 1.092 0.441 0.396 0.138
## percent explained 0.394 0.114 0.084 0.034 0.030 0.011
##
## $residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.021 -0.002 0.000 0.000 0.001 0.027
# method = "pf" means principal factor analysis.
fact(r=wais.corr,method="pf",rotation="none",maxfactors=6)
## $eigen.values
## [1] 6.696 1.414 0.798 0.710 0.553 0.461 0.436 0.402 0.386 0.372 0.341 0.262
## [13] 0.170
##
## $method
## [1] "principal factor - no rotation"
##
## $loadings
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## [1,] 0.830 0.321 0.044 0.040 0.095 -0.002
## [2,] 0.743 0.282 -0.083 0.113 -0.150 -0.136
## [3,] 0.691 0.182 0.003 -0.324 0.016 0.000
## [4,] 0.761 0.118 0.007 0.135 -0.047 0.247
## [5,] 0.594 -0.017 0.165 -0.342 -0.086 0.015
## [6,] 0.821 0.365 0.022 0.115 0.020 -0.014
## [7,] 0.683 -0.301 0.208 0.018 0.045 0.011
## [8,] 0.748 -0.118 -0.174 0.044 -0.099 0.018
## [9,] 0.701 -0.256 -0.258 -0.076 0.038 0.035
## [10,] 0.707 -0.215 -0.020 0.047 -0.181 -0.073
## [11,] 0.611 -0.282 -0.300 -0.014 0.208 -0.043
## [12,] -0.312 0.592 -0.186 -0.089 0.061 0.003
## [13,] 0.736 0.002 0.286 0.091 0.176 -0.067
##
## $communalities
## [1] 0.805 0.693 0.615 0.675 0.505 0.822 0.603 0.616 0.631 0.587 0.587 0.494
## [13] 0.668
##
## $importance
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## variance explained 6.355 1.010 0.383 0.294 0.166 0.093
## percent explained 0.489 0.078 0.029 0.023 0.013 0.007
##
## $residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.031 -0.007 -0.001 0.000 0.006 0.033
# ML with varimax rotation:
fact(r=wais.corr,method="norm",rotation="varimax",maxfactors=6)
## $eigen.values
## [1] 6.696 1.414 0.798 0.710 0.553 0.461 0.436 0.402 0.386 0.372 0.341 0.262
## [13] 0.170
##
## $method
## [1] "multivariate normal - varimax rotation"
##
## $loadings
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## [1,] 0.794 0.256 0.059 0.274 0.470 0.002
## [2,] 0.682 0.286 0.055 0.249 -0.034 0.092
## [3,] 0.373 0.245 0.053 0.857 0.049 0.004
## [4,] 0.625 0.323 0.193 0.217 0.022 0.015
## [5,] 0.355 0.211 0.260 0.362 0.042 -0.025
## [6,] 0.882 0.246 0.019 0.223 -0.032 -0.092
## [7,] 0.387 0.298 0.592 0.189 0.000 -0.219
## [8,] 0.471 0.541 0.259 0.157 0.062 0.167
## [9,] 0.264 0.708 0.251 0.224 0.049 0.006
## [10,] 0.436 0.408 0.402 0.171 -0.024 0.168
## [11,] 0.235 0.614 0.236 0.138 0.033 -0.093
## [12,] 0.043 -0.201 -0.725 -0.008 -0.016 -0.058
## [13,] 0.574 0.190 0.374 0.236 0.147 -0.138
##
## $communalities
## [1] 0.995 0.622 0.940 0.580 0.371 0.898 0.673 0.638 0.687 0.576 0.517 0.571
## [13] 0.602
##
## $importance
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## variance explained 3.548 1.921 1.477 1.314 0.257 0.152
## percent explained 0.273 0.148 0.114 0.101 0.020 0.012
##
## $residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.021 -0.002 0.000 0.000 0.001 0.027
# principal factor analysis with varimax rotation:
fact(r=wais.corr,method="pf",rotation="varimax",maxfactors=6)
## $eigen.values
## [1] 6.696 1.414 0.798 0.710 0.553 0.461 0.436 0.402 0.386 0.372 0.341 0.262
## [13] 0.170
##
## $method
## [1] "principal factor - varimax rotation"
##
## $loadings
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## [1,] 0.789 0.260 0.080 0.321 -0.074 0.009
## [2,] 0.724 0.258 0.064 0.212 0.223 -0.056
## [3,] 0.457 0.281 0.039 0.570 -0.006 0.007
## [4,] 0.652 0.274 0.221 0.206 0.014 0.289
## [5,] 0.291 0.155 0.239 0.580 0.034 0.028
## [6,] 0.835 0.230 0.059 0.258 0.006 0.019
## [7,] 0.350 0.272 0.573 0.268 -0.075 0.020
## [8,] 0.459 0.486 0.293 0.214 0.173 0.091
## [9,] 0.287 0.623 0.294 0.252 0.065 0.077
## [10,] 0.399 0.362 0.440 0.229 0.223 0.008
## [11,] 0.232 0.668 0.256 0.129 -0.061 -0.032
## [12,] 0.048 -0.185 -0.674 -0.040 -0.027 -0.023
## [13,] 0.600 0.164 0.408 0.256 -0.203 -0.088
##
## $communalities
## [1] 0.805 0.693 0.615 0.675 0.505 0.822 0.603 0.616 0.631 0.587 0.587 0.494
## [13] 0.668
##
## $importance
## Factor1 Factor2 Factor3 Factor4 Factor5 Factor6
## variance explained 3.555 1.704 1.502 1.236 0.192 0.112
## percent explained 0.273 0.131 0.116 0.095 0.015 0.009
##
## $residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.031 -0.007 -0.001 0.000 0.006 0.033
# Example on the life expectancy data:
# ML with varimax rotation:
fact(x=life,method="norm",rotation="varimax",maxfactors=3)
## $eigen.values
## [1] 5.602 1.358 0.499 0.308 0.155 0.059 0.013 0.006
##
## $method
## [1] "multivariate normal - varimax rotation"
##
## $loadings
## Factor1 Factor2 Factor3
## m0 0.964 0.122 0.226
## m25 0.646 0.169 0.438
## m50 0.430 0.354 0.790
## m75 0.080 0.525 0.656
## w0 0.970 0.217 0.081
## w25 0.764 0.556 0.310
## w50 0.536 0.729 0.401
## w75 0.156 0.867 0.280
##
## $communalities
## m0 m25 m50 m75 w0 w25 w50 w75
## 0.995 0.638 0.934 0.712 0.995 0.989 0.980 0.854
##
## $importance
## Factor1 Factor2 Factor3
## variance explained 3.375 2.082 1.640
## percent explained 0.422 0.260 0.205
##
## $residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.037 -0.002 0.000 0.001 0.002 0.059
# principal factor analysis with varimax rotation:
fact(x=life,method="pf",rotation="varimax",maxfactors=3)
## $eigen.values
## [1] 5.602 1.358 0.499 0.308 0.155 0.059 0.013 0.006
##
## $method
## [1] "principal factor - varimax rotation"
##
## $loadings
## Factor1 Factor2 Factor3
## m0 0.956 0.143 0.225
## m25 0.665 0.222 0.448
## m50 0.435 0.540 0.595
## m75 0.068 0.718 0.496
## w0 0.969 0.206 0.044
## w25 0.778 0.576 0.185
## w50 0.565 0.771 0.217
## w75 0.189 0.890 0.098
##
## $communalities
## m0 m25 m50 m75 w0 w25 w50 w75
## 0.986 0.692 0.835 0.766 0.984 0.971 0.961 0.838
##
## $importance
## Factor1 Factor2 Factor3
## variance explained 3.451 2.638 0.944
## percent explained 0.431 0.330 0.118
##
## $residuals
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.035 -0.009 0.005 0.001 0.011 0.040