library(psych)
## Warning: package 'psych' was built under R version 4.2.3
library(corrplot)
## corrplot 0.92 loaded
library("psych")
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Dataset
url <- "https://raw.githubusercontent.com/housecricket/data/main/efa/sample1.csv"
data_survey <- read.csv(url, sep = ",")
describe(data_survey)
## vars n mean sd median trimmed mad min max range skew kurtosis
## ID 1 259 130.00 74.91 130 130.00 96.37 1 259 258 0.00 -1.21
## KM1 2 259 3.31 1.29 3 3.39 1.48 1 5 4 -0.27 -0.89
## KM2 3 259 3.36 1.37 3 3.45 1.48 1 5 4 -0.35 -1.02
## KM3 4 259 3.40 1.31 3 3.49 1.48 1 5 4 -0.38 -0.90
## QC1 5 259 3.78 1.35 4 3.96 1.48 1 5 4 -0.93 -0.50
## QC2 6 259 3.41 1.17 3 3.48 1.48 1 5 4 -0.30 -0.62
## QC3 7 259 3.34 1.23 3 3.42 1.48 1 5 4 -0.27 -0.81
## CT1 8 259 3.69 1.12 4 3.80 1.48 1 5 4 -0.54 -0.24
## CT2 9 259 3.73 1.08 4 3.84 1.48 1 5 4 -0.56 -0.21
## CT3 10 259 3.36 1.23 3 3.44 1.48 1 5 4 -0.28 -0.73
## PC1 11 259 3.42 1.20 3 3.51 1.48 1 5 4 -0.40 -0.59
## PC2 12 259 3.09 1.34 3 3.11 1.48 1 5 4 -0.13 -1.05
## PC3 13 259 3.71 1.27 4 3.87 1.48 1 5 4 -0.72 -0.49
## QD 14 259 3.55 1.05 4 3.61 1.48 1 5 4 -0.33 -0.39
## se
## ID 4.65
## KM1 0.08
## KM2 0.09
## KM3 0.08
## QC1 0.08
## QC2 0.07
## QC3 0.08
## CT1 0.07
## CT2 0.07
## CT3 0.08
## PC1 0.07
## PC2 0.08
## PC3 0.08
## QD 0.07
dim(data_survey)
## [1] 259 14
dat <- data_survey[, -1]  # drop the ID column
head(dat)
## KM1 KM2 KM3 QC1 QC2 QC3 CT1 CT2 CT3 PC1 PC2 PC3 QD
## 1 5 5 5 5 2 1 1 3 1 4 1 3 4
## 2 3 3 3 4 5 3 4 5 4 2 2 2 4
## 3 2 2 2 2 2 1 3 3 3 4 3 5 2
## 4 4 3 3 4 3 4 4 4 4 1 1 3 3
## 5 4 4 4 2 3 4 4 4 4 3 3 5 4
## 6 1 1 1 2 5 3 5 5 5 4 3 5 3
# Correlation matrix of the predictors (drop the outcome QD in column 13)
datamatrix <- cor(dat[, -13])
corrplot(datamatrix, method = "number")
X <- dat[, -13]  # predictors
Y <- dat[, 13]   # outcome (QD)
KMO(r=cor(X))
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cor(X))
## Overall MSA = 0.83
## MSA for each item =
## KM1 KM2 KM3 QC1 QC2 QC3 CT1 CT2 CT3 PC1 PC2 PC3
## 0.87 0.84 0.82 0.88 0.86 0.86 0.83 0.85 0.85 0.70 0.78 0.80
cortest.bartlett(X)
## R was not square, finding R from data
## $chisq
## [1] 1595.75
##
## $p.value
## [1] 8.846246e-290
##
## $df
## [1] 66
det(cor(X))
## [1] 0.00183051
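The adequacy checks above can also be summarised programmatically. Below is a minimal sketch, assuming the usual rules of thumb (overall KMO above 0.5, Bartlett p-value below 0.05, determinant of the correlation matrix above 0.00001); the thresholds are conventions, not outputs of the original analysis.
kmo_res <- KMO(cor(X))                             # overall and per-item MSA
bart_res <- cortest.bartlett(cor(X), n = nrow(X))  # Bartlett's test of sphericity
c(
  KMO_ok         = kmo_res$MSA > 0.5,        # assumed threshold: overall MSA > 0.5
  Bartlett_ok    = bart_res$p.value < 0.05,  # correlations differ from the identity matrix
  Determinant_ok = det(cor(X)) > 0.00001     # assumed threshold: no extreme multicollinearity
)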
Scree Plot
fafitfree <- fa(dat,nfactors = ncol(X), rotate = "none")
n_factors <- length(fafitfree$e.values)
scree <- data.frame(
Factor_n = as.factor(1:n_factors),
Eigenvalue = fafitfree$e.values)
ggplot(scree, aes(x = Factor_n, y = Eigenvalue, group = 1)) +
geom_point() + geom_line() +
xlab("Number of factors") +
ylab("Initial eigenvalue") +
labs( title = "Scree Plot",
subtitle = "(Based on the unreduced correlation matrix)")
parallel <- fa.parallel(X)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Parallel analysis suggests that the number of factors = 4 and the number of components = 3
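Parallel analysis points to four factors. As a quick cross-check, the Kaiser criterion (eigenvalues greater than 1) can be counted from the unrotated solution already stored in fafitfree; this is a sketch, not part of the original output.
sum(fafitfree$e.values > 1)   # number of eigenvalues above 1 (Kaiser criterion)
round(fafitfree$e.values, 2)  # the eigenvalues themselves, for reporting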
Factor analysis using the fa method
fa.none <- fa(r = X,
              nfactors = 4,
              # covar = FALSE, SMC = TRUE,
              fm = "pa",          # type of factor analysis ("pa" is principal axis factoring)
              max.iter = 100,     # 50 is the default; raised to 100
              rotate = "varimax") # varimax rotation
print(fa.none)
## Factor Analysis using method = pa
## Call: fa(r = X, nfactors = 4, rotate = "varimax", max.iter = 100, fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA3 PA2 PA4 h2 u2 com
## KM1 0.80 0.13 0.15 0.24 0.73 0.27 1.3
## KM2 0.82 0.22 0.08 0.20 0.76 0.24 1.3
## KM3 0.88 0.21 0.17 0.18 0.88 0.12 1.3
## QC1 0.31 0.06 0.16 0.56 0.43 0.57 1.8
## QC2 0.23 0.34 0.01 0.74 0.71 0.29 1.6
## QC3 0.13 0.35 0.13 0.66 0.59 0.41 1.7
## CT1 0.15 0.72 0.08 0.14 0.56 0.44 1.2
## CT2 0.19 0.75 0.10 0.21 0.65 0.35 1.3
## CT3 0.15 0.69 0.15 0.25 0.59 0.41 1.5
## PC1 0.18 -0.03 0.82 0.10 0.71 0.29 1.1
## PC2 0.13 0.11 0.78 0.22 0.68 0.32 1.3
## PC3 0.04 0.23 0.67 -0.03 0.51 0.49 1.2
##
## PA1 PA3 PA2 PA4
## SS loadings 2.38 1.97 1.86 1.60
## Proportion Var 0.20 0.16 0.16 0.13
## Cumulative Var 0.20 0.36 0.52 0.65
## Proportion Explained 0.30 0.25 0.24 0.20
## Cumulative Proportion 0.30 0.56 0.80 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 4 factors are sufficient.
##
## df null model = 66 with the objective function = 6.3 with Chi Square = 1595.75
## df of the model are 24 and the objective function was 0.15
##
## The root mean square of the residuals (RMSR) is 0.02
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 259 with the empirical chi square 10.85 with prob < 0.99
## The total n.obs was 259 with Likelihood Chi Square = 38.29 with prob < 0.032
##
## Tucker Lewis Index of factoring reliability = 0.974
## RMSEA index = 0.048 and the 90 % confidence intervals are 0.014 0.075
## BIC = -95.07
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA1 PA3 PA2 PA4
## Correlation of (regression) scores with factors 0.95 0.87 0.91 0.84
## Multiple R square of scores with factors 0.89 0.76 0.83 0.71
## Minimum correlation of possible factor scores 0.79 0.53 0.65 0.43
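For a more readable view of the rotated loadings, small values can be suppressed when printing. A minimal sketch, using a conventional 0.3 cutoff (the cutoff is an assumption, not part of the analysis above); the fit itself is unchanged.
print(fa.none$loadings, cutoff = 0.3, sort = TRUE)  # hide loadings below 0.3, sort items by factor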
Factor analysis using the factanal method
factanal.none <- factanal(X, factors=4, scores = c("regression"), rotation = "varimax")
print(factanal.none)
##
## Call:
## factanal(x = X, factors = 4, scores = c("regression"), rotation = "varimax")
##
## Uniquenesses:
## KM1 KM2 KM3 QC1 QC2 QC3 CT1 CT2 CT3 PC1 PC2 PC3
## 0.272 0.238 0.117 0.577 0.278 0.403 0.451 0.355 0.405 0.269 0.332 0.497
##
## Loadings:
## Factor1 Factor2 Factor3 Factor4
## KM1 0.794 0.132 0.152 0.238
## KM2 0.820 0.212 0.197
## KM3 0.885 0.209 0.165 0.169
## QC1 0.321 0.162 0.538
## QC2 0.232 0.335 0.745
## QC3 0.125 0.338 0.138 0.669
## CT1 0.146 0.704 0.155
## CT2 0.189 0.747 0.205
## CT3 0.149 0.703 0.140 0.242
## PC1 0.179 0.829 0.104
## PC2 0.132 0.121 0.770 0.206
## PC3 0.234 0.668
##
## Factor1 Factor2 Factor3 Factor4
## SS loadings 2.388 1.958 1.864 1.595
## Proportion Var 0.199 0.163 0.155 0.133
## Cumulative Var 0.199 0.362 0.517 0.650
##
## Test of the hypothesis that 4 factors are sufficient.
## The chi square statistic is 37.66 on 24 degrees of freedom.
## The p-value is 0.0376
Graph the factor loading matrix
fa.diagram(fa.none)
Scores for all the rows
head(fa.none$scores)
## PA1 PA3 PA2 PA4
## [1,] 2.01452365 -1.9781194 -0.47865733 -0.99954894
## [2,] -0.38088283 0.7697328 -1.27812282 0.71255859
## [3,] -0.88620583 -0.3131598 0.69972250 -1.42078948
## [4,] -0.02336401 0.5722214 -1.64788160 -0.02484005
## [5,] 0.45117677 0.5996335 0.01897338 -0.59929794
## [6,] -2.36524692 1.6224651 0.50346364 0.30393048
regdata <- cbind(dat["QD"], fa.none$scores)
# Labeling the data
names(regdata) <- c("QD", "F1", "F2", "F3", "F4")
head(regdata)
## QD F1 F2 F3 F4
## 1 4 2.01452365 -1.9781194 -0.47865733 -0.99954894
## 2 4 -0.38088283 0.7697328 -1.27812282 0.71255859
## 3 2 -0.88620583 -0.3131598 0.69972250 -1.42078948
## 4 3 -0.02336401 0.5722214 -1.64788160 -0.02484005
## 5 4 0.45117677 0.5996335 0.01897338 -0.59929794
## 6 3 -2.36524692 1.6224651 0.50346364 0.30393048
# Split the data 70:30
# Set the random seed for reproducibility
set.seed(100)
indices <- sample(1:nrow(regdata), 0.7 * nrow(regdata))
train <- regdata[indices, ]
test <- regdata[-indices, ]
model.fa.score <- lm(QD ~ ., data = train)
summary(model.fa.score)
##
## Call:
## lm(formula = QD ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2041 -0.2225 -0.0894 0.2571 1.4851
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.55854 0.02932 121.387 < 2e-16 ***
## F1 0.72846 0.03028 24.058 < 2e-16 ***
## F2 0.28709 0.03382 8.489 8.43e-15 ***
## F3 0.07357 0.03061 2.403 0.0173 *
## F4 0.62655 0.03494 17.932 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3935 on 176 degrees of freedom
## Multiple R-squared: 0.8703, Adjusted R-squared: 0.8674
## F-statistic: 295.4 on 4 and 176 DF, p-value: < 2.2e-16
vif(model.fa.score)
## F1 F2 F3 F4
## 1.009786 1.056645 1.001647 1.063940
# Model performance metrics
pred_test <- predict(model.fa.score, newdata = test, type = "response")
pred_test
## 3 6 9 17 19 23 28 29
## 1.984360 2.528829 4.143840 3.323313 3.476094 3.246743 4.366091 5.200139
## 33 34 36 38 54 56 57 59
## 1.462812 3.579119 2.046164 4.822811 3.260452 3.703757 2.226866 3.281367
## 61 63 66 67 83 86 89 90
## 3.279521 3.994264 3.638951 3.638951 4.204144 4.071425 2.381443 3.260452
## 96 99 101 102 104 109 113 114
## 3.200268 4.123159 3.105286 3.776780 4.300589 2.672036 5.145143 4.049866
## 117 120 122 125 128 129 134 136
## 3.161295 3.125512 3.071635 3.718316 3.257887 3.161295 3.260452 1.267189
## 138 139 141 144 145 149 161 162
## 3.260452 3.149997 3.161576 4.073711 3.554128 4.252041 4.559893 4.368536
## 166 168 173 176 177 178 181 184
## 4.111750 3.768509 3.260452 3.685362 2.249302 3.371394 4.354512 4.732477
## 186 190 195 201 204 208 209 217
## 2.260460 3.735488 1.739988 3.198181 5.089403 3.260452 3.836492 4.001907
## 220 221 223 233 240 241 244 245
## 4.237114 3.298169 5.089403 4.444631 3.323962 5.187997 3.652735 2.992014
## 248 250 252 255 257 258
## 2.719640 3.625425 3.149997 2.984902 4.771091 2.630307
test$QD_Predicted <- pred_test
head(test[c("QD","QD_Predicted")], 10)
## QD QD_Predicted
## 3 2 1.984360
## 6 3 2.528829
## 9 4 4.143840
## 17 3 3.323313
## 19 3 3.476094
## 23 3 3.246743
## 28 4 4.366091
## 29 5 5.200139
## 33 1 1.462812
## 34 4 3.579119
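The model performance comment above is followed only by the raw predictions. A minimal sketch for summarising hold-out accuracy with RMSE, MAE, and test-set R-squared (the choice of metrics is an assumption, not part of the original code):
rmse <- sqrt(mean((test$QD - pred_test)^2))  # root mean squared error on the test set
mae  <- mean(abs(test$QD - pred_test))       # mean absolute error
r2   <- 1 - sum((test$QD - pred_test)^2) / sum((test$QD - mean(test$QD))^2)  # test-set R-squared
data.frame(RMSE = rmse, MAE = mae, R2_test = r2)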