library(psych)

## Warning: package 'psych' was built under R version 4.2.3

library(corrplot)

## corrplot 0.92 loaded

library("psych")
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

Data Overview

# Dataset

url <- "https://raw.githubusercontent.com/housecricket/data/main/efa/sample1.csv"
data_survey <- read.csv(url, sep = ",")

Describing the data

describe(data_survey)

##     vars   n   mean    sd median trimmed   mad min max range  skew kurtosis
## ID     1 259 130.00 74.91    130  130.00 96.37   1 259   258  0.00    -1.21
## KM1    2 259   3.31  1.29      3    3.39  1.48   1   5     4 -0.27    -0.89
## KM2    3 259   3.36  1.37      3    3.45  1.48   1   5     4 -0.35    -1.02
## KM3    4 259   3.40  1.31      3    3.49  1.48   1   5     4 -0.38    -0.90
## QC1    5 259   3.78  1.35      4    3.96  1.48   1   5     4 -0.93    -0.50
## QC2    6 259   3.41  1.17      3    3.48  1.48   1   5     4 -0.30    -0.62
## QC3    7 259   3.34  1.23      3    3.42  1.48   1   5     4 -0.27    -0.81
## CT1    8 259   3.69  1.12      4    3.80  1.48   1   5     4 -0.54    -0.24
## CT2    9 259   3.73  1.08      4    3.84  1.48   1   5     4 -0.56    -0.21
## CT3   10 259   3.36  1.23      3    3.44  1.48   1   5     4 -0.28    -0.73
## PC1   11 259   3.42  1.20      3    3.51  1.48   1   5     4 -0.40    -0.59
## PC2   12 259   3.09  1.34      3    3.11  1.48   1   5     4 -0.13    -1.05
## PC3   13 259   3.71  1.27      4    3.87  1.48   1   5     4 -0.72    -0.49
## QD    14 259   3.55  1.05      4    3.61  1.48   1   5     4 -0.33    -0.39
##       se
## ID  4.65
## KM1 0.08
## KM2 0.09
## KM3 0.08
## QC1 0.08
## QC2 0.07
## QC3 0.08
## CT1 0.07
## CT2 0.07
## CT3 0.08
## PC1 0.07
## PC2 0.08
## PC3 0.08
## QD  0.07

dim(data_survey)

## [1] 259  14

Cleaning data

dat <- data_survey[ , -1] 
head(dat)

##   KM1 KM2 KM3 QC1 QC2 QC3 CT1 CT2 CT3 PC1 PC2 PC3 QD
## 1   5   5   5   5   2   1   1   3   1   4   1   3  4
## 2   3   3   3   4   5   3   4   5   4   2   2   2  4
## 3   2   2   2   2   2   1   3   3   3   4   3   5  2
## 4   4   3   3   4   3   4   4   4   4   1   1   3  3
## 5   4   4   4   2   3   4   4   4   4   3   3   5  4
## 6   1   1   1   2   5   3   5   5   5   4   3   5  3

Correlation matrix

datamatrix <- cor(dat[,c(-13)])
corrplot(datamatrix, method="number")

The Factorability of the Data

X <- dat[,-c(13)]
Y <- dat[,13]

KMO (Kaiser-Meyer-Olkin)

KMO(r=cor(X))

## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(X))
## Overall MSA =  0.83
## MSA for each item = 
##  KM1  KM2  KM3  QC1  QC2  QC3  CT1  CT2  CT3  PC1  PC2  PC3 
## 0.87 0.84 0.82 0.88 0.86 0.86 0.83 0.85 0.85 0.70 0.78 0.80

Bartlett’s Test of Sphericity

cortest.bartlett(X)

## R was not square, finding R from data

## $chisq
## [1] 1595.75
## 
## $p.value
## [1] 8.846246e-290
## 
## $df
## [1] 66

det(cor(X))

## [1] 0.00183051

The Number of Factors to Extract

Scree Pilot

library(ggplot2)
fafitfree <- fa(dat,nfactors = ncol(X), rotate = "none")
n_factors <- length(fafitfree$e.values)
scree     <- data.frame(
  Factor_n =  as.factor(1:n_factors), 
  Eigenvalue = fafitfree$e.values)
ggplot(scree, aes(x = Factor_n, y = Eigenvalue, group = 1)) + 
  geom_point() + geom_line() +
  xlab("Number of factors") +
  ylab("Initial eigenvalue") +
  labs( title = "Scree Plot", 
        subtitle = "(Based on the unreduced correlation matrix)")

Parallel Analysis

parallel <- fa.parallel(X)

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  4  and the number of components =  3

Conducting the Factor Analysis

Factor analysis using fa method

fa.none <- fa(r=X, 
 nfactors = 4, 
 # covar = FALSE, SMC = TRUE,
 fm="pa", # type of factor analysis we want to use ("pa" is principal axis factoring)
 max.iter=100, # (50 is the default, but we have changed it to 100
 rotate="varimax") # none rotation
print(fa.none)

## Factor Analysis using method =  pa
## Call: fa(r = X, nfactors = 4, rotate = "varimax", max.iter = 100, fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      PA1   PA3  PA2   PA4   h2   u2 com
## KM1 0.80  0.13 0.15  0.24 0.73 0.27 1.3
## KM2 0.82  0.22 0.08  0.20 0.76 0.24 1.3
## KM3 0.88  0.21 0.17  0.18 0.88 0.12 1.3
## QC1 0.31  0.06 0.16  0.56 0.43 0.57 1.8
## QC2 0.23  0.34 0.01  0.74 0.71 0.29 1.6
## QC3 0.13  0.35 0.13  0.66 0.59 0.41 1.7
## CT1 0.15  0.72 0.08  0.14 0.56 0.44 1.2
## CT2 0.19  0.75 0.10  0.21 0.65 0.35 1.3
## CT3 0.15  0.69 0.15  0.25 0.59 0.41 1.5
## PC1 0.18 -0.03 0.82  0.10 0.71 0.29 1.1
## PC2 0.13  0.11 0.78  0.22 0.68 0.32 1.3
## PC3 0.04  0.23 0.67 -0.03 0.51 0.49 1.2
## 
##                        PA1  PA3  PA2  PA4
## SS loadings           2.38 1.97 1.86 1.60
## Proportion Var        0.20 0.16 0.16 0.13
## Cumulative Var        0.20 0.36 0.52 0.65
## Proportion Explained  0.30 0.25 0.24 0.20
## Cumulative Proportion 0.30 0.56 0.80 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 4 factors are sufficient.
## 
## df null model =  66  with the objective function =  6.3 with Chi Square =  1595.75
## df of  the model are 24  and the objective function was  0.15 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  259 with the empirical chi square  10.85  with prob <  0.99 
## The total n.obs was  259  with Likelihood Chi Square =  38.29  with prob <  0.032 
## 
## Tucker Lewis Index of factoring reliability =  0.974
## RMSEA index =  0.048  and the 90 % confidence intervals are  0.014 0.075
## BIC =  -95.07
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    PA1  PA3  PA2  PA4
## Correlation of (regression) scores with factors   0.95 0.87 0.91 0.84
## Multiple R square of scores with factors          0.89 0.76 0.83 0.71
## Minimum correlation of possible factor scores     0.79 0.53 0.65 0.43

Factor analysis using the factanal method

factanal.none <- factanal(X, factors=4, scores = c("regression"), rotation = "varimax")
print(factanal.none)

## 
## Call:
## factanal(x = X, factors = 4, scores = c("regression"), rotation = "varimax")
## 
## Uniquenesses:
##   KM1   KM2   KM3   QC1   QC2   QC3   CT1   CT2   CT3   PC1   PC2   PC3 
## 0.272 0.238 0.117 0.577 0.278 0.403 0.451 0.355 0.405 0.269 0.332 0.497 
## 
## Loadings:
##     Factor1 Factor2 Factor3 Factor4
## KM1  0.794   0.132   0.152   0.238 
## KM2  0.820   0.212           0.197 
## KM3  0.885   0.209   0.165   0.169 
## QC1  0.321           0.162   0.538 
## QC2  0.232   0.335           0.745 
## QC3  0.125   0.338   0.138   0.669 
## CT1  0.146   0.704           0.155 
## CT2  0.189   0.747           0.205 
## CT3  0.149   0.703   0.140   0.242 
## PC1  0.179           0.829   0.104 
## PC2  0.132   0.121   0.770   0.206 
## PC3          0.234   0.668         
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.388   1.958   1.864   1.595
## Proportion Var   0.199   0.163   0.155   0.133
## Cumulative Var   0.199   0.362   0.517   0.650
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 37.66 on 24 degrees of freedom.
## The p-value is 0.0376

Graph Factor Loading Matrices

fa.diagram(fa.none)

Regression analysis

Scores for all the rows

head(fa.none$scores)

##              PA1        PA3         PA2         PA4
## [1,]  2.01452365 -1.9781194 -0.47865733 -0.99954894
## [2,] -0.38088283  0.7697328 -1.27812282  0.71255859
## [3,] -0.88620583 -0.3131598  0.69972250 -1.42078948
## [4,] -0.02336401  0.5722214 -1.64788160 -0.02484005
## [5,]  0.45117677  0.5996335  0.01897338 -0.59929794
## [6,] -2.36524692  1.6224651  0.50346364  0.30393048

Labeling the data

regdata <- cbind(dat["QD"], fa.none$scores)
#Labeling the data
names(regdata) <- c("QD", "F1", "F2",
 "F3", "F4")
head(regdata)

##   QD          F1         F2          F3          F4
## 1  4  2.01452365 -1.9781194 -0.47865733 -0.99954894
## 2  4 -0.38088283  0.7697328 -1.27812282  0.71255859
## 3  2 -0.88620583 -0.3131598  0.69972250 -1.42078948
## 4  3 -0.02336401  0.5722214 -1.64788160 -0.02484005
## 5  4  0.45117677  0.5996335  0.01897338 -0.59929794
## 6  3 -2.36524692  1.6224651  0.50346364  0.30393048

Splitting the data to train and test set

#Splitting the data 70:30
#Random number generator, set seed.
set.seed(100)
indices= sample(1:nrow(regdata), 0.7*nrow(regdata))
train=regdata[indices,]
test = regdata[-indices,]

Regression Model using train data

model.fa.score = lm(QD ~ ., data = train)
summary(model.fa.score)

## 
## Call:
## lm(formula = QD ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2041 -0.2225 -0.0894  0.2571  1.4851 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.55854    0.02932 121.387  < 2e-16 ***
## F1           0.72846    0.03028  24.058  < 2e-16 ***
## F2           0.28709    0.03382   8.489 8.43e-15 ***
## F3           0.07357    0.03061   2.403   0.0173 *  
## F4           0.62655    0.03494  17.932  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3935 on 176 degrees of freedom
## Multiple R-squared:  0.8703, Adjusted R-squared:  0.8674 
## F-statistic: 295.4 on 4 and 176 DF,  p-value: < 2.2e-16

Check vif

vif(model.fa.score)

##       F1       F2       F3       F4 
## 1.009786 1.056645 1.001647 1.063940

Check prediction of the model in the test dataset

#Model Performance metrics:
pred_test <- predict(model.fa.score, newdata = test, type = "response")
pred_test

##        3        6        9       17       19       23       28       29 
## 1.984360 2.528829 4.143840 3.323313 3.476094 3.246743 4.366091 5.200139 
##       33       34       36       38       54       56       57       59 
## 1.462812 3.579119 2.046164 4.822811 3.260452 3.703757 2.226866 3.281367 
##       61       63       66       67       83       86       89       90 
## 3.279521 3.994264 3.638951 3.638951 4.204144 4.071425 2.381443 3.260452 
##       96       99      101      102      104      109      113      114 
## 3.200268 4.123159 3.105286 3.776780 4.300589 2.672036 5.145143 4.049866 
##      117      120      122      125      128      129      134      136 
## 3.161295 3.125512 3.071635 3.718316 3.257887 3.161295 3.260452 1.267189 
##      138      139      141      144      145      149      161      162 
## 3.260452 3.149997 3.161576 4.073711 3.554128 4.252041 4.559893 4.368536 
##      166      168      173      176      177      178      181      184 
## 4.111750 3.768509 3.260452 3.685362 2.249302 3.371394 4.354512 4.732477 
##      186      190      195      201      204      208      209      217 
## 2.260460 3.735488 1.739988 3.198181 5.089403 3.260452 3.836492 4.001907 
##      220      221      223      233      240      241      244      245 
## 4.237114 3.298169 5.089403 4.444631 3.323962 5.187997 3.652735 2.992014 
##      248      250      252      255      257      258 
## 2.719640 3.625425 3.149997 2.984902 4.771091 2.630307

test$QD_Predicted <- pred_test
head(test[c("QD","QD_Predicted")], 10)

##    QD QD_Predicted
## 3   2     1.984360
## 6   3     2.528829
## 9   4     4.143840
## 17  3     3.323313
## 19  3     3.476094
## 23  3     3.246743
## 28  4     4.366091
## 29  5     5.200139
## 33  1     1.462812
## 34  4     3.579119

Factor Analysis in R

Kyle Kenneth Ruaya

2023-05-06

Data Overview

Describing the data

Cleaning data

Correlation matrix

The Factorability of the Data

KMO (Kaiser-Meyer-Olkin)

Bartlett’s Test of Sphericity

The Number of Factors to Extract

Parallel Analysis

Conducting the Factor Analysis

Regression analysis

Labeling the data

Splitting the data to train and test set

Regression Model using train data

Check vif

Check prediction of the model in the test dataset