d <- read.csv("/home/ekaterina/Rstudio/data_for_fa.csv")
Это данные по оценки собственных качеств студентами самиx у себя по девятибальной шкале.
summary(d)
## X distant talkatv carelss
## Min. : 1.00 Min. :1.000 Min. :2.000 Min. :1.000
## 1st Qu.: 60.75 1st Qu.:2.000 1st Qu.:5.000 1st Qu.:2.000
## Median :120.50 Median :3.000 Median :6.000 Median :3.000
## Mean :120.50 Mean :3.867 Mean :5.883 Mean :3.413
## 3rd Qu.:180.25 3rd Qu.:5.000 3rd Qu.:7.000 3rd Qu.:5.000
## Max. :240.00 Max. :8.000 Max. :9.000 Max. :9.000
## hardwrk anxious agreebl tense
## Min. :2.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:6.000 1st Qu.:4.000 1st Qu.:6.000 1st Qu.:3.000
## Median :7.000 Median :5.000 Median :7.000 Median :5.000
## Mean :6.925 Mean :5.129 Mean :6.629 Mean :4.617
## 3rd Qu.:8.000 3rd Qu.:7.000 3rd Qu.:8.000 3rd Qu.:6.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.000
## kind opposng relaxed disorgn
## Min. :2.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:6.000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:2.000
## Median :7.000 Median :4.000 Median :5.000 Median :4.000
## Mean :6.971 Mean :3.858 Mean :5.475 Mean :4.083
## 3rd Qu.:8.000 3rd Qu.:5.000 3rd Qu.:7.000 3rd Qu.:6.000
## Max. :9.000 Max. :8.000 Max. :9.000 Max. :9.000
## outgoin approvn shy discipl
## Min. :2.000 Min. :2.000 Min. :1.000 Min. :1.000
## 1st Qu.:5.000 1st Qu.:5.000 1st Qu.:3.000 1st Qu.:5.000
## Median :6.000 Median :6.000 Median :5.000 Median :7.000
## Mean :6.021 Mean :5.858 Mean :4.558 Mean :6.308
## 3rd Qu.:7.000 3rd Qu.:7.000 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.000
## harsh persevr friendl worryin
## Min. :1.0 Min. :2.000 Min. :2.00 Min. :1.000
## 1st Qu.:2.0 1st Qu.:6.000 1st Qu.:7.00 1st Qu.:3.000
## Median :3.0 Median :7.000 Median :7.00 Median :6.000
## Mean :3.6 Mean :6.804 Mean :7.25 Mean :5.213
## 3rd Qu.:5.0 3rd Qu.:8.000 3rd Qu.:8.00 3rd Qu.:7.000
## Max. :8.0 Max. :9.000 Max. :9.00 Max. :9.000
## respnsi contrar sociabl lazy
## Min. :1.000 Min. :1.000 Min. :2.000 Min. :1.000
## 1st Qu.:7.000 1st Qu.:3.000 1st Qu.:5.000 1st Qu.:3.000
## Median :8.000 Median :4.000 Median :7.000 Median :4.000
## Mean :7.292 Mean :3.771 Mean :6.446 Mean :4.179
## 3rd Qu.:8.000 3rd Qu.:5.000 3rd Qu.:8.000 3rd Qu.:5.000
## Max. :9.000 Max. :8.000 Max. :9.000 Max. :9.000
## coopera quiet organiz criticl
## Min. :3.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:6.000 1st Qu.:3.000 1st Qu.:5.000 1st Qu.:4.000
## Median :7.000 Median :5.000 Median :6.000 Median :5.000
## Mean :6.696 Mean :4.604 Mean :6.154 Mean :5.171
## 3rd Qu.:7.000 3rd Qu.:6.000 3rd Qu.:8.000 3rd Qu.:6.000
## Max. :9.000 Max. :9.000 Max. :9.000 Max. :9.000
## lax laidbck withdrw givinup
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:1.750
## Median :4.000 Median :5.000 Median :3.000 Median :2.000
## Mean :4.083 Mean :5.246 Mean :3.754 Mean :2.675
## 3rd Qu.:5.000 3rd Qu.:7.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :9.000 Max. :9.000 Max. :7.000 Max. :8.000
## easygon
## Min. :2.000
## 1st Qu.:5.000
## Median :6.000
## Mean :6.067
## 3rd Qu.:7.000
## Max. :9.000
corrplot(cor(d), order = "hclust", tl.col='black', tl.cex=.75)
На этом графике можно выявить корреляционные плеяды признаков, которые синонимичны между собой.
Корреляционные плеяды на основе модуля корреляции
corrplot(abs(cor(d)), order = "hclust", tl.col='black', tl.cex=.75,cl.lim=c(0,1))
d_stan = as.data.frame(scale(d))
Можно выделить 5 кластеров (сверху вниз):
Социальная активность
Тревожность
Отношение к труду
Дружелюбие
Критичность
#Bartlett's test
sigma <- cor(d_stan)
cortest.bartlett(sigma, n = 100)
## $chisq
## [1] 1551.592
##
## $p.value
## [1] 2.636052e-101
##
## $df
## [1] 528
Гипотеза о незначимости факторного анализа отвергается
# scree plot
scree(d_stan)
Возьмем 5 факторов.
Модель факторного анализа
\[\mathbb{S} = \mathbb{F}\mathbb{F}^T + \mathbb{\Psi}\] где \(S\) - ковариционная матрица.
Ordinary Least Squares
\[ ||\mathbb{S} - (\mathbb{F}\mathbb{F}^T + \mathbb{\Psi}) ||^2 = ||\mathbb{S} - \mathbb{\tilde{S}}|| \rightarrow \min_{\mathbb{F}, \mathbb{\Psi}}\]
Задача разбивается на две подзадачи:
\[\sum_{i \ne j} (s_{ij} - \sum_{k=1}^r f_{ik} f_{jk})^2 \rightarrow \min_{\mathbb{F}}\]
\[(\mathbb{F} \mathbb{F}^T)_{ij} \leqslant 1 \to \sigma_i^2 = 1 -(\mathbb{F} \mathbb{F}^T)_{ij} \]
### FA_MINRES
fa_minres <- fa(d_stan, nfactors = 5, rotate = "varimax", fm = "minres")
fa_minres$loadings
##
## Loadings:
## MR1 MR2 MR3 MR4 MR5
## X -0.171
## distant 0.566 -0.117 0.332
## talkatv -0.771 0.119 0.136
## carelss 0.620 0.276
## hardwrk -0.204 -0.634 0.139 0.107
## anxious 0.156 0.710 0.146 0.228
## agreebl 0.636 -0.162
## tense 0.147 0.779 0.278
## kind -0.317 0.621 -0.185
## opposng 0.103 -0.125 0.633
## relaxed 0.137 -0.681 0.349
## disorgn 0.745 0.146
## outgoin -0.827 0.219
## approvn -0.271 -0.126 -0.106 0.501 -0.121
## shy 0.736 0.193 0.160
## discipl -0.618
## harsh 0.121 -0.234 0.645
## persevr -0.164 -0.527 0.108 0.194
## friendl -0.491 -0.167 0.548 -0.178
## worryin 0.174 0.742 0.141
## respnsi -0.723 0.234
## contrar 0.138 0.140 -0.151 0.713
## sociabl -0.709 0.271 -0.122
## lazy 0.190 0.659 0.138
## coopera -0.178 -0.109 0.555 -0.282
## quiet 0.800 0.167 0.163
## organiz -0.776
## criticl -0.179 0.131 -0.108 0.577
## lax 0.469 -0.211 0.231 0.101
## laidbck 0.254 -0.567 0.289
## withdrw 0.709 0.143 0.121 0.300
## givinup 0.355 0.441 0.218 -0.107 0.163
## easygon -0.136 0.128 -0.429 0.432
##
## MR1 MR2 MR3 MR4 MR5
## SS loadings 4.467 4.416 2.933 2.537 2.455
## Proportion Var 0.135 0.134 0.089 0.077 0.074
## Cumulative Var 0.135 0.269 0.358 0.435 0.509
circle_cor_plot(scores = fa_minres$scores)
Первый фактор можно инерпретировать как общительность, второй как трудоспособность.
circle_cor_plot(scores = fa_minres$scores, num_factor1 = 2, num_factor2 = 3)
Третий фактор можно интерпретировать как уровень тревожности.
Функция правдоподобия
\[L(X; \mathbb{F}, \mathbb{\Psi}) = \prod_{i = 1}^n P_{\xi}(X; \mathbb{F}, \mathbb{\Psi})\] где \(P_{\xi}(X; \mathbb{F}, \mathbb{\Psi})\) - плотность нормального распределения
Оценки максимального правдоподобия примерно можно найти из WLS (взвешенный метод наименьших квадратов)
\[\sum_{i \ne j} \frac{(s_{ij} - \tilde{s}_{ij})^2}{\hat{\sigma_i}^2 \hat{\sigma_j}^2} \rightarrow \min_{\mathbb{F}, \mathbb{\Psi}}\]
где \(\hat{\sigma_i}^2 \hat{\sigma_j}^2\) - специфичности
### FA_LIKLIHOOD
fa_like <- fa(d_stan, nfactors = 5, rotate = "varimax", fm = "ml")
fa_like$loadings
##
## Loadings:
## ML1 ML2 ML3 ML4 ML5
## X -0.170
## distant 0.565 -0.118 0.331
## talkatv -0.771 0.119 0.136
## carelss 0.620 0.275
## hardwrk -0.204 -0.634 0.139 0.107
## anxious 0.156 0.711 0.146 0.227
## agreebl 0.636 -0.162
## tense 0.147 0.780 0.277
## kind -0.317 0.622 -0.184
## opposng 0.103 -0.124 0.635
## relaxed 0.137 -0.680 0.349
## disorgn 0.744 0.146
## outgoin -0.827 0.219
## approvn -0.271 -0.126 -0.106 0.500 -0.121
## shy 0.737 0.193 0.160
## discipl -0.618
## harsh 0.121 -0.234 0.645
## persevr -0.164 -0.526 0.108 0.194
## friendl -0.491 -0.167 0.548 -0.178
## worryin 0.174 0.742 0.141
## respnsi -0.723 0.235
## contrar 0.138 0.139 -0.151 0.713
## sociabl -0.709 0.272 -0.121
## lazy 0.191 0.659 0.139
## coopera -0.178 -0.109 0.555 -0.282
## quiet 0.801 0.167 0.164
## organiz -0.776
## criticl -0.179 0.130 -0.107 0.578
## lax 0.469 -0.210 0.230 0.100
## laidbck 0.254 -0.566 0.289
## withdrw 0.709 0.143 0.121 -0.100 0.299
## givinup 0.355 0.441 0.218 -0.107 0.162
## easygon -0.137 0.128 -0.429 0.432
##
## ML1 ML2 ML3 ML4 ML5
## SS loadings 4.468 4.417 2.933 2.537 2.457
## Proportion Var 0.135 0.134 0.089 0.077 0.074
## Cumulative Var 0.135 0.269 0.358 0.435 0.509
circle_cor_plot(scores = fa_like$scores)
### PCA
pca_d1 <- prcomp(d_stan)
summary(pca_d1)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.694 2.1277 1.76828 1.52768 1.38067 1.09316
## Proportion of Variance 0.220 0.1372 0.09475 0.07072 0.05777 0.03621
## Cumulative Proportion 0.220 0.3572 0.45193 0.52265 0.58042 0.61663
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 1.03138 0.94004 0.92211 0.89123 0.84326 0.83163
## Proportion of Variance 0.03223 0.02678 0.02577 0.02407 0.02155 0.02096
## Cumulative Proportion 0.64887 0.67564 0.70141 0.72548 0.74703 0.76798
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 0.79950 0.79153 0.7290 0.71204 0.6869 0.67906
## Proportion of Variance 0.01937 0.01899 0.0161 0.01536 0.0143 0.01397
## Cumulative Proportion 0.78735 0.80634 0.8224 0.83781 0.8521 0.86608
## PC19 PC20 PC21 PC22 PC23 PC24
## Standard deviation 0.6675 0.64631 0.63929 0.62240 0.60718 0.57137
## Proportion of Variance 0.0135 0.01266 0.01238 0.01174 0.01117 0.00989
## Cumulative Proportion 0.8796 0.89224 0.90462 0.91636 0.92753 0.93743
## PC25 PC26 PC27 PC28 PC29 PC30
## Standard deviation 0.54631 0.54127 0.52885 0.50142 0.48372 0.46857
## Proportion of Variance 0.00904 0.00888 0.00848 0.00762 0.00709 0.00665
## Cumulative Proportion 0.94647 0.95535 0.96382 0.97144 0.97853 0.98519
## PC31 PC32 PC33
## Standard deviation 0.44696 0.39529 0.36441
## Proportion of Variance 0.00605 0.00474 0.00402
## Cumulative Proportion 0.99124 0.99598 1.00000
print(pca_d1$rotation[,1:5])
## PC1 PC2 PC3 PC4 PC5
## X 0.05501454 0.02468081 -0.036025525 0.013188545 -0.17306184
## distant -0.22712442 -0.08086093 -0.113957098 0.023604116 0.21925625
## talkatv 0.18323852 0.09815015 0.353982313 -0.057099871 0.01370370
## carelss -0.16988322 0.19464723 0.170366259 -0.133502959 0.03484188
## hardwrk 0.16087303 -0.26855760 0.066375752 0.001006559 0.12057579
## anxious -0.13901592 -0.17408589 0.117407284 -0.389627482 -0.03873559
## agreebl 0.12236284 0.08278574 -0.164651998 -0.356703205 0.14347781
## tense -0.15679977 -0.22913105 0.162641914 -0.312721314 -0.06663743
## kind 0.19193670 -0.06612463 -0.140712646 -0.301325620 0.12097715
## opposng -0.12665266 -0.03038469 0.282927975 0.039812454 0.29377639
## relaxed 0.11283845 0.26284056 -0.186124294 0.089190975 0.27151295
## disorgn -0.14632833 0.27626744 0.121680408 -0.152229523 -0.01269195
## outgoin 0.25310837 0.08456953 0.282190849 -0.072982948 -0.03477943
## approvn 0.19616051 0.05593536 -0.045024011 -0.226902790 0.14198327
## shy -0.22198178 -0.04005282 -0.284507428 -0.104572769 -0.01759515
## discipl 0.11003214 -0.26911565 -0.052891653 0.053979471 0.19546930
## harsh -0.16034525 -0.03082163 0.257692815 0.092566062 0.28258350
## persevr 0.14967377 -0.22563864 0.042218256 -0.055583249 0.16595708
## friendl 0.24516434 0.02164098 0.070073774 -0.292735154 0.01417563
## worryin -0.14469466 -0.19762607 0.107165145 -0.342285142 -0.12735604
## respnsi 0.16449968 -0.28016762 -0.087076218 -0.016082117 0.16245950
## contrar -0.16642837 -0.04282923 0.287568345 0.015291110 0.30543565
## sociabl 0.23400732 0.12793480 0.215252665 -0.106831061 -0.05593506
## lazy -0.19644215 0.21942730 0.039434019 -0.169534214 -0.01116168
## coopera 0.18955584 0.02639798 -0.186096622 -0.257304256 0.08679683
## quiet -0.21112550 -0.08217043 -0.307746436 -0.165562632 0.11254385
## organiz 0.17485015 -0.27297708 -0.078541930 0.112932030 0.09552217
## criticl -0.09554815 -0.16292737 0.202718307 0.054093123 0.33326523
## lax -0.05579541 0.25993943 -0.015385507 -0.128961752 0.18189358
## laidbck 0.05419826 0.27492121 -0.094713349 0.063909599 0.33573281
## withdrw -0.26831903 -0.08050879 -0.163187276 -0.021917169 0.18042504
## givinup -0.23814370 0.06780928 -0.008492163 -0.128784795 -0.03856263
## easygon 0.12250732 0.22262830 -0.098362943 -0.070254283 0.27895687
circle_cor_plot(scores = pca_d1$x)
Сравним PCA c факторным анализом без вращений
pca_d2 <- fa(d_stan, nfactors = 5, rotate = "none", fm = "minres")
pca_d2$loadings
##
## Loadings:
## MR1 MR2 MR3 MR4 MR5
## X -0.137 -0.122
## distant 0.619 -0.102 0.230
## talkatv -0.581 -0.140 0.516
## carelss 0.353 -0.448 0.348 0.130
## hardwrk -0.348 0.574 0.152
## anxious 0.400 0.364 0.374 0.419
## agreebl -0.265 -0.213 0.527 0.194
## tense 0.450 0.463 0.449 0.293
## kind -0.418 0.255 -0.229 0.449 0.186
## opposng 0.264 0.426 -0.186 0.386
## relaxed -0.316 -0.493 -0.410 0.314
## disorgn 0.289 -0.607 0.280 0.217
## outgoin -0.752 0.405
## approvn -0.476 0.322 0.172
## shy 0.663 -0.353 0.206
## discipl -0.187 0.542 -0.131 -0.127 0.191
## harsh 0.354 0.392 -0.282 0.361
## persevr -0.318 0.474 0.186
## friendl -0.635 0.422
## worryin 0.417 0.404 0.347 0.368 -0.105
## respnsi -0.322 0.623 -0.201 0.220
## contrar 0.373 0.470 -0.186 0.419
## sociabl -0.684 -0.158 0.283 0.160
## lazy 0.441 -0.467 0.160 0.241
## coopera -0.422 -0.295 0.400 0.103
## quiet 0.661 0.128 -0.395 0.282 0.131
## organiz -0.371 0.618 -0.220 -0.185 0.102
## criticl 0.233 0.259 0.279 -0.216 0.391
## lax -0.491 0.208 0.192
## laidbck -0.183 -0.521 -0.218 0.354
## withdrw 0.751 -0.170 0.205
## givinup 0.584 -0.184 0.114 0.134
## easygon -0.336 -0.360 -0.198 0.183 0.303
##
## MR1 MR2 MR3 MR4 MR5
## SS loadings 6.685 4.059 2.730 1.949 1.386
## Proportion Var 0.203 0.123 0.083 0.059 0.042
## Cumulative Var 0.203 0.326 0.408 0.467 0.509
circle_cor_plot(scores = pca_d2$scores)
PCA и факторный анализ без вращений выдали похожие результаты с точнотью до знака. Их не очень просто проинтерпретировать.
Объединим некоторые признаки по смыслу и проведем факторный анализ для таких данных.
###Creating composite variables
shy = rowMeans(cbind(d_stan$distant, d_stan$shy, d_stan$withdrw, d_stan$quiet))
outgoing = rowMeans(cbind(d_stan$talkatv, d_stan$outgoin, d_stan$sociabl))
hardworking = rowMeans(cbind(d_stan$hardwrk, d_stan$persevr, d_stan$discipl))
friendly = rowMeans(cbind(d_stan$friendl, d_stan$kind, d_stan$coopera, d_stan$agreebl, d_stan$approvn, d_stan$sociabl))
anxious = rowMeans(cbind(d_stan$tense, d_stan$anxious, d_stan$worryin))
combined_data = cbind(shy,outgoing,hardworking,friendly,anxious)
combined_data = as.data.frame(combined_data)
pairs.panels(combined_data)
scree(combined_data)
Выберем 2 фактора
fa_comb = fa(combined_data, nfactors = 2, rotate = "varimax")
fa_comb$loadings
##
## Loadings:
## MR1 MR2
## shy -0.499 0.736
## outgoing 0.705 -0.555
## hardworking 0.312
## friendly 0.616 -0.213
## anxious 0.443
##
## MR1 MR2
## SS loadings 1.223 1.093
## Proportion Var 0.245 0.219
## Cumulative Var 0.245 0.463
circle_cor_plot(n=5,scores = fa_comb$scores, data = combined_data)
Факторы получились не очень хорошими. Первый описывает человека по уровню его социальной активности и трудолюбию. Второй показывает разницу между тревожностью и скромностью и дружелюбностью.