# Course: EDF 6436 - Theory of Measurement
# Assignment: EFA
# Author: Nguyet Hoang | hoangt@ufl.edu

# Set up R
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(conflicted)
conflict_prefer("filter", "dplyr")
## [conflicted] Will prefer dplyr::filter over any other package.
library(psych)
library(scales)
library(GPArotation)

# Read data
data <- read.csv("ToMassignment1data.csv", sep = ";", header = TRUE)
view(data)

# Univariate descriptives
describe(data)
##     vars   n mean   sd median trimmed mad min max range  skew kurtosis   se
## V1     1 225 0.67 0.47      1    0.71   0   0   1     1 -0.72    -1.48 0.03
## V2     2 225 0.66 0.47      1    0.70   0   0   1     1 -0.68    -1.54 0.03
## V3     3 225 0.46 0.50      0    0.45   0   0   1     1  0.17    -1.98 0.03
## V4     4 225 0.39 0.49      0    0.36   0   0   1     1  0.46    -1.79 0.03
## V5     5 225 0.48 0.50      0    0.48   0   0   1     1  0.06    -2.01 0.03
## V6     6 225 0.54 0.50      1    0.55   0   0   1     1 -0.15    -1.99 0.03
## V7     7 225 0.74 0.44      1    0.80   0   0   1     1 -1.10    -0.79 0.03
## V8     8 225 0.78 0.42      1    0.85   0   0   1     1 -1.33    -0.24 0.03
## V9     9 225 0.36 0.48      0    0.32   0   0   1     1  0.60    -1.65 0.03
## V10   10 225 0.28 0.45      0    0.23   0   0   1     1  0.97    -1.06 0.03
## V11   11 225 0.70 0.46      1    0.75   0   0   1     1 -0.88    -1.23 0.03
## V12   12 225 0.77 0.42      1    0.83   0   0   1     1 -1.27    -0.40 0.03
## V13   13 225 0.71 0.45      1    0.76   0   0   1     1 -0.93    -1.15 0.03
## V14   14 225 0.48 0.50      0    0.48   0   0   1     1  0.06    -2.01 0.03
## V15   15 225 0.72 0.45      1    0.78   0   0   1     1 -1.00    -1.01 0.03
## V16   16 225 0.70 0.46      1    0.75   0   0   1     1 -0.88    -1.23 0.03
## V17   17 225 0.41 0.49      0    0.39   0   0   1     1  0.35    -1.89 0.03
## V18   18 225 0.37 0.48      0    0.34   0   0   1     1  0.52    -1.74 0.03
## V19   19 225 0.61 0.49      1    0.64   0   0   1     1 -0.44    -1.81 0.03
## V20   20 225 0.47 0.50      0    0.46   0   0   1     1  0.11    -2.00 0.03
# Step 1: Extract factors - report unrotated number of factors
# 1.1. Using principal components method for factor extraction
tet_matrix <- tetrachoric(data)$rho
options(mc.cores = 1)
pc_result <- fa.parallel(tet_matrix, fa = "pc")
## Warning in fa.parallel(tet_matrix, fa = "pc"): It seems as if you are using a
## correlation matrix, but have not specified the number of cases. The number of
## subjects is arbitrarily set to be 100

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  2
pc_actual_eigenvalues <- pc_result$pc.values
pc_simulated_eigenvalues <- pc_result$pc.sim
pc_eigenvalues_table <- data.frame(
  Component = 1:length(pc_actual_eigenvalues),
  Actual_Eigenvalue = pc_actual_eigenvalues,
  Simulated_Eigenvalue = pc_simulated_eigenvalues
)
pc_eigenvalues_table
##    Component Actual_Eigenvalue Simulated_Eigenvalue
## 1          1        5.25410717            1.8476353
## 2          2        2.07757639            1.7028986
## 3          3        1.57314867            1.5856240
## 4          4        1.44709248            1.4767673
## 5          5        1.37308416            1.3784481
## 6          6        1.21261177            1.2747409
## 7          7        0.99725182            1.2195438
## 8          8        0.88970057            1.1332806
## 9          9        0.83930492            1.0514939
## 10        10        0.75976102            0.9682943
## 11        11        0.73935061            0.9051813
## 12        12        0.58867605            0.8351319
## 13        13        0.55321288            0.7913555
## 14        14        0.44494445            0.7184168
## 15        15        0.40192293            0.6647296
## 16        16        0.34363186            0.6085670
## 17        17        0.20053937            0.5512506
## 18        18        0.16968126            0.4975449
## 19        19        0.08229803            0.4267078
## 20        20        0.05210361            0.3623879
# With the principal components method, it suggests that the number of factors = 2
# 1.2. Using common factors model for factor extraction
cf_result <- fa.parallel(tet_matrix, fa = "fa")
## Warning in fa.parallel(tet_matrix, fa = "fa"): It seems as if you are using a
## correlation matrix, but have not specified the number of cases. The number of
## subjects is arbitrarily set to be 100

## Parallel analysis suggests that the number of factors =  6  and the number of components =  NA
cf_actual_eigenvalues <- cf_result$fa.values
cf_simulated_eigenvalues <- cf_result$fa.sim
cf_eigenvalues_table <- data.frame(
  Factor = 1:length(cf_actual_eigenvalues),
  Actual_Eigenvalue = cf_actual_eigenvalues,
  Simulated_Eigenvalue = cf_simulated_eigenvalues
)
cf_eigenvalues_table
##    Factor Actual_Eigenvalue Simulated_Eigenvalue
## 1       1        4.53021531           1.05225411
## 2       2        1.27337346           0.76624560
## 3       3        0.75793914           0.61553041
## 4       4        0.69951360           0.51830026
## 5       5        0.62917335           0.42056453
## 6       6        0.39972223           0.32406979
## 7       7        0.25091458           0.25193111
## 8       8        0.10331685           0.17645156
## 9       9        0.07595653           0.09274877
## 10     10       -0.03877759           0.02055269
## 11     11       -0.12832065          -0.03974797
## 12     12       -0.18119492          -0.11310174
## 13     13       -0.22869788          -0.17095139
## 14     14       -0.32602132          -0.22225797
## 15     15       -0.36530473          -0.28626087
## 16     16       -0.45151054          -0.35074428
## 17     17       -0.53364304          -0.40411937
## 18     18       -0.60505757          -0.46819774
## 19     19       -0.62789201          -0.53278039
## 20     20       -0.70348954          -0.59823449
# With the common factors model, it suggests that the number of factors = 5

# Step 2: Rotating the factors
# 2.1. Orthogonal rotation (Varimax)
ortho_result <- fa(data, nfactors = 2, rotate = "varimax")
print(ortho_result, cut = 0.4, sort = TRUE)
## Factor Analysis using method =  minres
## Call: fa(r = data, nfactors = 2, rotate = "varimax")
## Standardized loadings (pattern matrix) based upon correlation matrix
##     item   MR1   MR2    h2   u2 com
## V13   13  0.67       0.479 0.52 1.1
## V7     7  0.42       0.264 0.74 1.8
## V8     8  0.42       0.190 0.81 1.2
## V17   17  0.41       0.192 0.81 1.2
## V14   14  0.41       0.208 0.79 1.4
## V20   20  0.41       0.268 0.73 1.9
## V12   12  0.41       0.179 0.82 1.2
## V19   19             0.140 0.86 1.6
## V2     2             0.105 0.90 1.1
## V3     3             0.141 0.86 1.8
## V1     1             0.160 0.84 2.0
## V10   10             0.024 0.98 1.8
## V11   11        0.59 0.348 0.65 1.0
## V9     9        0.52 0.273 0.73 1.0
## V6     6             0.214 0.79 1.8
## V5     5             0.146 0.85 1.1
## V16   16             0.144 0.86 1.3
## V15   15             0.121 0.88 1.5
## V18   18             0.110 0.89 2.0
## V4     4             0.026 0.97 1.0
## 
##                        MR1  MR2
## SS loadings           2.05 1.68
## Proportion Var        0.10 0.08
## Cumulative Var        0.10 0.19
## Proportion Explained  0.55 0.45
## Cumulative Proportion 0.55 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 2 factors are sufficient.
## 
## df null model =  190  with the objective function =  2.89 with Chi Square =  626.1
## df of  the model are 151  and the objective function was  1 
## 
## The root mean square of the residuals (RMSR) is  0.06 
## The df corrected root mean square of the residuals is  0.07 
## 
## The harmonic n.obs is  225 with the empirical chi square  148.71  with prob <  0.54 
## The total n.obs was  225  with Likelihood Chi Square =  215.93  with prob <  0.00042 
## 
## Tucker Lewis Index of factoring reliability =  0.811
## RMSEA index =  0.043  and the 90 % confidence intervals are  0.03 0.056
## BIC =  -601.9
## Fit based upon off diagonal values = 0.85
## Measures of factor score adequacy             
##                                                    MR1  MR2
## Correlation of (regression) scores with factors   0.84 0.81
## Multiple R square of scores with factors          0.71 0.66
## Minimum correlation of possible factor scores     0.43 0.32
# 2.2. Oblique rotation (Oblimin)
oblique_result <- fa(data, nfactors = 5, rotate = "oblimin")
print(oblique_result, cut = 0.4, sort = TRUE)
## Factor Analysis using method =  minres
## Call: fa(r = data, nfactors = 5, rotate = "oblimin")
## Standardized loadings (pattern matrix) based upon correlation matrix
##     item   MR3   MR1   MR4   MR2   MR5    h2     u2 com
## V7     7  1.00                         0.996 0.0039 1.0
## V2     2                               0.142 0.8579 2.7
## V3     3                               0.139 0.8615 4.1
## V18   18        0.55                   0.292 0.7078 1.1
## V20   20        0.55                   0.414 0.5855 1.3
## V1     1                               0.241 0.7593 2.7
## V6     6                               0.220 0.7797 2.6
## V19   19              0.63             0.414 0.5862 1.1
## V8     8              0.45             0.287 0.7134 1.5
## V16   16                               0.235 0.7655 2.3
## V13   13                    0.61       0.483 0.5174 1.3
## V17   17                    0.43       0.291 0.7090 1.8
## V14   14                               0.270 0.7298 2.2
## V12   12                               0.337 0.6628 2.9
## V5     5                          0.45 0.236 0.7639 1.2
## V9     9                          0.44 0.291 0.7088 1.7
## V11   11                               0.355 0.6453 2.4
## V15   15                               0.180 0.8205 2.3
## V4     4                               0.071 0.9286 1.4
## V10   10                               0.084 0.9163 2.1
## 
##                        MR3  MR1  MR4  MR2  MR5
## SS loadings           1.41 1.21 1.20 1.09 1.07
## Proportion Var        0.07 0.06 0.06 0.05 0.05
## Cumulative Var        0.07 0.13 0.19 0.25 0.30
## Proportion Explained  0.24 0.20 0.20 0.18 0.18
## Cumulative Proportion 0.24 0.44 0.64 0.82 1.00
## 
##  With factor correlations of 
##      MR3  MR1  MR4   MR2   MR5
## MR3 1.00 0.29 0.21  0.20  0.16
## MR1 0.29 1.00 0.25  0.18  0.27
## MR4 0.21 0.25 1.00  0.26  0.22
## MR2 0.20 0.18 0.26  1.00 -0.01
## MR5 0.16 0.27 0.22 -0.01  1.00
## 
## Mean item complexity =  2
## Test of the hypothesis that 5 factors are sufficient.
## 
## df null model =  190  with the objective function =  2.89 with Chi Square =  626.1
## df of  the model are 100  and the objective function was  0.46 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic n.obs is  225 with the empirical chi square  55.5  with prob <  1 
## The total n.obs was  225  with Likelihood Chi Square =  97.39  with prob <  0.56 
## 
## Tucker Lewis Index of factoring reliability =  1.012
## RMSEA index =  0  and the 90 % confidence intervals are  0 0.033
## BIC =  -444.22
## Fit based upon off diagonal values = 0.95
## Measures of factor score adequacy             
##                                                    MR3  MR1  MR4  MR2  MR5
## Correlation of (regression) scores with factors   0.98 0.72 0.73 0.74 0.72
## Multiple R square of scores with factors          0.96 0.52 0.54 0.55 0.52
## Minimum correlation of possible factor scores     0.93 0.03 0.08 0.11 0.04