事例:①年齢、②学歴、③財産、④職業、⑤性別、⑥人種による不公不公平感についてきいたデータがあるとする。不平等感を規定する潜在因子を抽出したい。
dat <- data.frame(
age_un = c(4,5,2,3,3,2,2,2,4,3,1,1,2,5,2,4,1,3,3,2),
edu_un = c(4,2,1,1,2,0,2,1,3,1,1,1,2,4,2,3,2,2,1,1),
wealth_un = c(3,4,2,1,3,1,1,1,2,2,1,1,1,4,1,2,2,2,4,2),
job_un = c(3,4,3,2,3,4,2,2,4,3,3,3,3,4,4,3,5,2,3,2),
sex_un = c(5,2,1,1,2,1,2,1,4,1,1,1,2,4,2,4,2,2,1,1),
race_un = c(5,5,4,2,5,2,1,2,4,4,1,2,1,5,2,4,3,4,5,4)
)
# IDをrownamesとしてつける
rownames(dat) <- 1:20
ひとまず記述統計を見てみる(良い習慣として)
library(psych)
describe(dat) # 記述統計
## vars n mean sd median trimmed mad min max range skew kurtosis
## age_un 1 20 2.70 1.22 2.5 2.62 0.74 1 5 4 0.38 -0.93
## edu_un 2 20 1.80 1.06 2.0 1.69 1.48 0 4 4 0.63 -0.40
## wealth_un 3 20 2.00 1.08 2.0 1.88 1.48 1 4 3 0.72 -0.84
## job_un 4 20 3.10 0.85 3.0 3.06 1.48 2 5 3 0.31 -0.76
## sex_un 5 20 2.00 1.26 2.0 1.81 1.48 1 5 4 1.06 -0.23
## race_un 6 20 3.25 1.48 4.0 3.31 1.48 1 5 4 -0.22 -1.57
## se
## age_un 0.27
## edu_un 0.24
## wealth_un 0.24
## job_un 0.19
## sex_un 0.28
## race_un 0.33
pairs(dat) # 変数ペアのプロット
fa.out <- fa(scale(dat), # データ(標準化したもの)
nfactors = 2, # 因子数
fm = "gls", # 一般化最小二乗法
rotate = "Promax", # オブリミン回転
scores = T) # 因子得点を保存
# 結果
fa.out # 結果の表示(全体)
## Factor Analysis using method = gls
## Call: fa(r = scale(dat), nfactors = 2, rotate = "Promax", scores = T,
## fm = "gls")
## Standardized loadings (pattern matrix) based upon correlation matrix
## GLS1 GLS2 h2 u2 com
## age_un 0.81 0.21 0.87 0.13 1.1
## edu_un 0.20 0.72 0.70 0.30 1.2
## wealth_un 0.84 0.02 0.73 0.27 1.0
## job_un -0.23 0.69 0.36 0.64 1.2
## sex_un 0.15 0.74 0.70 0.30 1.1
## race_un 0.89 -0.04 0.76 0.24 1.0
##
## GLS1 GLS2
## SS loadings 2.41 1.73
## Proportion Var 0.40 0.29
## Cumulative Var 0.40 0.69
## Proportion Explained 0.58 0.42
## Cumulative Proportion 0.58 1.00
##
## With factor correlations of
## GLS1 GLS2
## GLS1 1.00 0.52
## GLS2 0.52 1.00
##
## Mean item complexity = 1.1
## Test of the hypothesis that 2 factors are sufficient.
##
## df null model = 15 with the objective function = 5.92 with Chi Square = 95.66
## df of the model are 4 and the objective function was 2.27
##
## The root mean square of the residuals (RMSR) is 0.12
## The df corrected root mean square of the residuals is 0.23
##
## The harmonic n.obs is 20 with the empirical chi square 8.31 with prob < 0.081
## The total n.obs was 20 with Likelihood Chi Square = 33.67 with prob < 8.7e-07
##
## Tucker Lewis Index of factoring reliability = -0.529
## RMSEA index = 0.607 and the 90 % confidence intervals are 0.441 0.828
## BIC = 21.69
## Fit based upon off diagonal values = 0.95
## Measures of factor score adequacy
## GLS1 GLS2
## Correlation of (regression) scores with factors 0.97 0.92
## Multiple R square of scores with factors 0.95 0.84
## Minimum correlation of possible factor scores 0.90 0.69
fa.out$scores # 因子得点
## GLS1 GLS2
## 1 1.03621576 1.371611074
## 2 1.76979056 0.839442683
## 3 -0.12338461 -0.651642376
## 4 -0.17171788 -0.779998699
## 5 0.71599129 0.004733827
## 6 -0.93647271 -0.559346535
## 7 -0.92872422 -0.500888707
## 8 -0.67688914 -1.028742736
## 9 0.66947593 1.461301133
## 10 0.38178665 -0.402898338
## 11 -1.46961948 -0.800104339
## 12 -1.21459559 -0.780164550
## 13 -0.96125942 -0.003566483
## 14 1.77616816 1.935030321
## 15 -0.73877073 0.513695531
## 16 0.70201112 0.963978909
## 17 -0.94546135 0.622112064
## 18 0.41751064 -0.352426744
## 19 0.78879444 -0.703161435
## 20 -0.09084941 -1.148964600
plot(fa.out$e.values, type = "o") # スクリープロット
結果のまとめ
以下のものを個別に書き出しEXCELファイル等で統合するのが最も柔軟性のあるやり方。
cbind(fa.out$loadings, # 因子負荷量
communality = fa.out$communality) # 共通性
## GLS1 GLS2 communality
## age_un 0.8051380 0.21459497 0.8736755
## edu_un 0.1981348 0.71693137 0.7007223
## wealth_un 0.8442813 0.02434673 0.7347442
## job_un -0.2281871 0.68941400 0.3640373
## sex_un 0.1545549 0.74451446 0.6976521
## race_un 0.8920669 -0.03670727 0.7631347
# 共通性
fa.out$Vaccounted[1:3,] # 固有値および分散説明率
## GLS1 GLS2
## SS loadings 2.4072232 1.7267431
## Proportion Var 0.4012039 0.2877905
## Cumulative Var 0.4012039 0.6889944
fa.out$Phi # 因子間相関
## GLS1 GLS2
## GLS1 1.0000000 0.5190963
## GLS2 0.5190963 1.0000000
以下の平行分析に基づいた場合、2因子を採用する意義があるか怪しいことになる。最終的には、理論的関心とあわせて分析者が判断する必要がある。
library(psych)
fa.parallel(scale(dat), # データ
fm = "gls", # 一般化最小二乗法
fa = "fa") # 因子分析(paにすると主成分分析)
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
## Warning in fac(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, : An
## ultra-Heywood case was detected. Examine the results carefully
## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
datデータをもとに、以下のようなカテゴリデータに再コード化したデータを作成する。
年齢に関する不平等感:平均以上を1、以下を0とする(二値データ)
教育に関する不平等感:平均以上を1、以下を0とする(二値データ)
収入に関する不平等感:もとの回答のうち1と2を「1」, 3を「2」, 4と5を「3」に分ける(三値の順序データ)
なお、仕事に対する不公平感、性別に関する不公平感、人種に関する不公平感のデータは元のままとする。
age_un.bin <- ifelse(dat$age_un > mean(dat$age_un), yes = 1, no = 0)
edu_un.bin <- ifelse(dat$edu_un > mean(dat$edu_un), yes = 1, no = 0)
wealth_un.3cat <- cut(dat$wealth,
breaks = c(-Inf, 2.5, 3.5, Inf),
labels = c("low","middle","high"),
right = F, ordered_result = T)
# 新しいデータセットをつくる
dat2 <- data.frame(age_un.bin = age_un.bin,
edu_un.bin = edu_un.bin,
wealth_un.3cat = as.numeric(wealth_un.3cat), # 数値にすることに注意
dat[,4:6])
# パッケージを読み込む
library(mirt)
# カテゴリカル因子分析
fa.out2 <- mirt(dat2, # データ(標準化しない, 4~6列目は順序データとしてみなされる)
2, # 因子数
method = "EM", # 推定方法(EMアルゴリズム)
rotate = "promax")
# 結果
summary(fa.out2)
##
## Rotation: oblimin
##
## Rotated factor loadings:
##
## F1 F2 h2
## age_un.bin 0.7733 0.251 0.862
## edu_un.bin 0.0101 0.995 1.000
## wealth_un.3cat 1.0647 -0.139 1.000
## job_un 0.1373 0.299 0.151
## sex_un 0.0131 0.993 1.000
## race_un 0.9401 0.106 0.998
##
## Rotated SS loadings: 2.635 2.159
##
## Factor correlations:
##
## F1 F2
## F1 1.000 0.516
## F2 0.516 1.000
# 因子得点
fa.score <- fscores(fa.out2)
なお参考のため、通常の因子分析をもとのデータに当てはめた場合の結果を再掲する。
fa.out
## Factor Analysis using method = gls
## Call: fa(r = scale(dat), nfactors = 2, rotate = "Promax", scores = T,
## fm = "gls")
## Standardized loadings (pattern matrix) based upon correlation matrix
## GLS1 GLS2 h2 u2 com
## age_un 0.81 0.21 0.87 0.13 1.1
## edu_un 0.20 0.72 0.70 0.30 1.2
## wealth_un 0.84 0.02 0.73 0.27 1.0
## job_un -0.23 0.69 0.36 0.64 1.2
## sex_un 0.15 0.74 0.70 0.30 1.1
## race_un 0.89 -0.04 0.76 0.24 1.0
##
## GLS1 GLS2
## SS loadings 2.41 1.73
## Proportion Var 0.40 0.29
## Cumulative Var 0.40 0.69
## Proportion Explained 0.58 0.42
## Cumulative Proportion 0.58 1.00
##
## With factor correlations of
## GLS1 GLS2
## GLS1 1.00 0.52
## GLS2 0.52 1.00
##
## Mean item complexity = 1.1
## Test of the hypothesis that 2 factors are sufficient.
##
## df null model = 15 with the objective function = 5.92 with Chi Square = 95.66
## df of the model are 4 and the objective function was 2.27
##
## The root mean square of the residuals (RMSR) is 0.12
## The df corrected root mean square of the residuals is 0.23
##
## The harmonic n.obs is 20 with the empirical chi square 8.31 with prob < 0.081
## The total n.obs was 20 with Likelihood Chi Square = 33.67 with prob < 8.7e-07
##
## Tucker Lewis Index of factoring reliability = -0.529
## RMSEA index = 0.607 and the 90 % confidence intervals are 0.441 0.828
## BIC = 21.69
## Fit based upon off diagonal values = 0.95
## Measures of factor score adequacy
## GLS1 GLS2
## Correlation of (regression) scores with factors 0.97 0.92
## Multiple R square of scores with factors 0.95 0.84
## Minimum correlation of possible factor scores 0.90 0.69
参考文献
永吉希久子,2016,『行動科学の統計学:社会調査のデータ分析』共立出版.