library(psych)
## Warning: package 'psych' was built under R version 4.2.3
library(corrplot)
## corrplot 0.92 loaded
library("psych")
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit

Data Overview

# Dataset

url <- "https://raw.githubusercontent.com/housecricket/data/main/efa/sample1.csv"
data_survey <- read.csv(url, sep = ",")

Describing the data

describe(data_survey)
##     vars   n   mean    sd median trimmed   mad min max range  skew kurtosis
## ID     1 259 130.00 74.91    130  130.00 96.37   1 259   258  0.00    -1.21
## KM1    2 259   3.31  1.29      3    3.39  1.48   1   5     4 -0.27    -0.89
## KM2    3 259   3.36  1.37      3    3.45  1.48   1   5     4 -0.35    -1.02
## KM3    4 259   3.40  1.31      3    3.49  1.48   1   5     4 -0.38    -0.90
## QC1    5 259   3.78  1.35      4    3.96  1.48   1   5     4 -0.93    -0.50
## QC2    6 259   3.41  1.17      3    3.48  1.48   1   5     4 -0.30    -0.62
## QC3    7 259   3.34  1.23      3    3.42  1.48   1   5     4 -0.27    -0.81
## CT1    8 259   3.69  1.12      4    3.80  1.48   1   5     4 -0.54    -0.24
## CT2    9 259   3.73  1.08      4    3.84  1.48   1   5     4 -0.56    -0.21
## CT3   10 259   3.36  1.23      3    3.44  1.48   1   5     4 -0.28    -0.73
## PC1   11 259   3.42  1.20      3    3.51  1.48   1   5     4 -0.40    -0.59
## PC2   12 259   3.09  1.34      3    3.11  1.48   1   5     4 -0.13    -1.05
## PC3   13 259   3.71  1.27      4    3.87  1.48   1   5     4 -0.72    -0.49
## QD    14 259   3.55  1.05      4    3.61  1.48   1   5     4 -0.33    -0.39
##       se
## ID  4.65
## KM1 0.08
## KM2 0.09
## KM3 0.08
## QC1 0.08
## QC2 0.07
## QC3 0.08
## CT1 0.07
## CT2 0.07
## CT3 0.08
## PC1 0.07
## PC2 0.08
## PC3 0.08
## QD  0.07
dim(data_survey)
## [1] 259  14

Cleaning data

dat <- data_survey[ , -1] 
head(dat)
##   KM1 KM2 KM3 QC1 QC2 QC3 CT1 CT2 CT3 PC1 PC2 PC3 QD
## 1   5   5   5   5   2   1   1   3   1   4   1   3  4
## 2   3   3   3   4   5   3   4   5   4   2   2   2  4
## 3   2   2   2   2   2   1   3   3   3   4   3   5  2
## 4   4   3   3   4   3   4   4   4   4   1   1   3  3
## 5   4   4   4   2   3   4   4   4   4   3   3   5  4
## 6   1   1   1   2   5   3   5   5   5   4   3   5  3

Correlation matrix

datamatrix <- cor(dat[,c(-13)])
corrplot(datamatrix, method="number")

The Factorability of the Data

X <- dat[,-c(13)]
Y <- dat[,13]

KMO (Kaiser-Meyer-Olkin)

KMO(r=cor(X))
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(X))
## Overall MSA =  0.83
## MSA for each item = 
##  KM1  KM2  KM3  QC1  QC2  QC3  CT1  CT2  CT3  PC1  PC2  PC3 
## 0.87 0.84 0.82 0.88 0.86 0.86 0.83 0.85 0.85 0.70 0.78 0.80

Bartlett’s Test of Sphericity

cortest.bartlett(X)
## R was not square, finding R from data
## $chisq
## [1] 1595.75
## 
## $p.value
## [1] 8.846246e-290
## 
## $df
## [1] 66
det(cor(X))
## [1] 0.00183051

The Number of Factors to Extract

Scree Pilot

library(ggplot2)
fafitfree <- fa(dat,nfactors = ncol(X), rotate = "none")
n_factors <- length(fafitfree$e.values)
scree     <- data.frame(
  Factor_n =  as.factor(1:n_factors), 
  Eigenvalue = fafitfree$e.values)
ggplot(scree, aes(x = Factor_n, y = Eigenvalue, group = 1)) + 
  geom_point() + geom_line() +
  xlab("Number of factors") +
  ylab("Initial eigenvalue") +
  labs( title = "Scree Plot", 
        subtitle = "(Based on the unreduced correlation matrix)")

Parallel Analysis

parallel <- fa.parallel(X)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

## Parallel analysis suggests that the number of factors =  4  and the number of components =  3

Conducting the Factor Analysis

Factor analysis using fa method

fa.none <- fa(r=X, 
 nfactors = 4, 
 # covar = FALSE, SMC = TRUE,
 fm="pa", # type of factor analysis we want to use ("pa" is principal axis factoring)
 max.iter=100, # (50 is the default, but we have changed it to 100
 rotate="varimax") # none rotation
print(fa.none)
## Factor Analysis using method =  pa
## Call: fa(r = X, nfactors = 4, rotate = "varimax", max.iter = 100, fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      PA1   PA3  PA2   PA4   h2   u2 com
## KM1 0.80  0.13 0.15  0.24 0.73 0.27 1.3
## KM2 0.82  0.22 0.08  0.20 0.76 0.24 1.3
## KM3 0.88  0.21 0.17  0.18 0.88 0.12 1.3
## QC1 0.31  0.06 0.16  0.56 0.43 0.57 1.8
## QC2 0.23  0.34 0.01  0.74 0.71 0.29 1.6
## QC3 0.13  0.35 0.13  0.66 0.59 0.41 1.7
## CT1 0.15  0.72 0.08  0.14 0.56 0.44 1.2
## CT2 0.19  0.75 0.10  0.21 0.65 0.35 1.3
## CT3 0.15  0.69 0.15  0.25 0.59 0.41 1.5
## PC1 0.18 -0.03 0.82  0.10 0.71 0.29 1.1
## PC2 0.13  0.11 0.78  0.22 0.68 0.32 1.3
## PC3 0.04  0.23 0.67 -0.03 0.51 0.49 1.2
## 
##                        PA1  PA3  PA2  PA4
## SS loadings           2.38 1.97 1.86 1.60
## Proportion Var        0.20 0.16 0.16 0.13
## Cumulative Var        0.20 0.36 0.52 0.65
## Proportion Explained  0.30 0.25 0.24 0.20
## Cumulative Proportion 0.30 0.56 0.80 1.00
## 
## Mean item complexity =  1.4
## Test of the hypothesis that 4 factors are sufficient.
## 
## df null model =  66  with the objective function =  6.3 with Chi Square =  1595.75
## df of  the model are 24  and the objective function was  0.15 
## 
## The root mean square of the residuals (RMSR) is  0.02 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  259 with the empirical chi square  10.85  with prob <  0.99 
## The total n.obs was  259  with Likelihood Chi Square =  38.29  with prob <  0.032 
## 
## Tucker Lewis Index of factoring reliability =  0.974
## RMSEA index =  0.048  and the 90 % confidence intervals are  0.014 0.075
## BIC =  -95.07
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    PA1  PA3  PA2  PA4
## Correlation of (regression) scores with factors   0.95 0.87 0.91 0.84
## Multiple R square of scores with factors          0.89 0.76 0.83 0.71
## Minimum correlation of possible factor scores     0.79 0.53 0.65 0.43

Factor analysis using the factanal method

factanal.none <- factanal(X, factors=4, scores = c("regression"), rotation = "varimax")
print(factanal.none)
## 
## Call:
## factanal(x = X, factors = 4, scores = c("regression"), rotation = "varimax")
## 
## Uniquenesses:
##   KM1   KM2   KM3   QC1   QC2   QC3   CT1   CT2   CT3   PC1   PC2   PC3 
## 0.272 0.238 0.117 0.577 0.278 0.403 0.451 0.355 0.405 0.269 0.332 0.497 
## 
## Loadings:
##     Factor1 Factor2 Factor3 Factor4
## KM1  0.794   0.132   0.152   0.238 
## KM2  0.820   0.212           0.197 
## KM3  0.885   0.209   0.165   0.169 
## QC1  0.321           0.162   0.538 
## QC2  0.232   0.335           0.745 
## QC3  0.125   0.338   0.138   0.669 
## CT1  0.146   0.704           0.155 
## CT2  0.189   0.747           0.205 
## CT3  0.149   0.703   0.140   0.242 
## PC1  0.179           0.829   0.104 
## PC2  0.132   0.121   0.770   0.206 
## PC3          0.234   0.668         
## 
##                Factor1 Factor2 Factor3 Factor4
## SS loadings      2.388   1.958   1.864   1.595
## Proportion Var   0.199   0.163   0.155   0.133
## Cumulative Var   0.199   0.362   0.517   0.650
## 
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 37.66 on 24 degrees of freedom.
## The p-value is 0.0376

Graph Factor Loading Matrices

fa.diagram(fa.none)

Regression analysis

Scores for all the rows

head(fa.none$scores)
##              PA1        PA3         PA2         PA4
## [1,]  2.01452365 -1.9781194 -0.47865733 -0.99954894
## [2,] -0.38088283  0.7697328 -1.27812282  0.71255859
## [3,] -0.88620583 -0.3131598  0.69972250 -1.42078948
## [4,] -0.02336401  0.5722214 -1.64788160 -0.02484005
## [5,]  0.45117677  0.5996335  0.01897338 -0.59929794
## [6,] -2.36524692  1.6224651  0.50346364  0.30393048

Labeling the data

regdata <- cbind(dat["QD"], fa.none$scores)
#Labeling the data
names(regdata) <- c("QD", "F1", "F2",
 "F3", "F4")
head(regdata)
##   QD          F1         F2          F3          F4
## 1  4  2.01452365 -1.9781194 -0.47865733 -0.99954894
## 2  4 -0.38088283  0.7697328 -1.27812282  0.71255859
## 3  2 -0.88620583 -0.3131598  0.69972250 -1.42078948
## 4  3 -0.02336401  0.5722214 -1.64788160 -0.02484005
## 5  4  0.45117677  0.5996335  0.01897338 -0.59929794
## 6  3 -2.36524692  1.6224651  0.50346364  0.30393048

Splitting the data to train and test set

#Splitting the data 70:30
#Random number generator, set seed.
set.seed(100)
indices= sample(1:nrow(regdata), 0.7*nrow(regdata))
train=regdata[indices,]
test = regdata[-indices,]

Regression Model using train data

model.fa.score = lm(QD ~ ., data = train)
summary(model.fa.score)
## 
## Call:
## lm(formula = QD ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2041 -0.2225 -0.0894  0.2571  1.4851 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.55854    0.02932 121.387  < 2e-16 ***
## F1           0.72846    0.03028  24.058  < 2e-16 ***
## F2           0.28709    0.03382   8.489 8.43e-15 ***
## F3           0.07357    0.03061   2.403   0.0173 *  
## F4           0.62655    0.03494  17.932  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3935 on 176 degrees of freedom
## Multiple R-squared:  0.8703, Adjusted R-squared:  0.8674 
## F-statistic: 295.4 on 4 and 176 DF,  p-value: < 2.2e-16

Check vif

vif(model.fa.score)
##       F1       F2       F3       F4 
## 1.009786 1.056645 1.001647 1.063940

Check prediction of the model in the test dataset

#Model Performance metrics:
pred_test <- predict(model.fa.score, newdata = test, type = "response")
pred_test
##        3        6        9       17       19       23       28       29 
## 1.984360 2.528829 4.143840 3.323313 3.476094 3.246743 4.366091 5.200139 
##       33       34       36       38       54       56       57       59 
## 1.462812 3.579119 2.046164 4.822811 3.260452 3.703757 2.226866 3.281367 
##       61       63       66       67       83       86       89       90 
## 3.279521 3.994264 3.638951 3.638951 4.204144 4.071425 2.381443 3.260452 
##       96       99      101      102      104      109      113      114 
## 3.200268 4.123159 3.105286 3.776780 4.300589 2.672036 5.145143 4.049866 
##      117      120      122      125      128      129      134      136 
## 3.161295 3.125512 3.071635 3.718316 3.257887 3.161295 3.260452 1.267189 
##      138      139      141      144      145      149      161      162 
## 3.260452 3.149997 3.161576 4.073711 3.554128 4.252041 4.559893 4.368536 
##      166      168      173      176      177      178      181      184 
## 4.111750 3.768509 3.260452 3.685362 2.249302 3.371394 4.354512 4.732477 
##      186      190      195      201      204      208      209      217 
## 2.260460 3.735488 1.739988 3.198181 5.089403 3.260452 3.836492 4.001907 
##      220      221      223      233      240      241      244      245 
## 4.237114 3.298169 5.089403 4.444631 3.323962 5.187997 3.652735 2.992014 
##      248      250      252      255      257      258 
## 2.719640 3.625425 3.149997 2.984902 4.771091 2.630307
test$QD_Predicted <- pred_test
head(test[c("QD","QD_Predicted")], 10)
##    QD QD_Predicted
## 3   2     1.984360
## 6   3     2.528829
## 9   4     4.143840
## 17  3     3.323313
## 19  3     3.476094
## 23  3     3.246743
## 28  4     4.366091
## 29  5     5.200139
## 33  1     1.462812
## 34  4     3.579119