# Load the data (the bfi data set ships with the psych package)
library(psych)
data(bfi)
# Reverse-score the negatively keyed items (A1, C4, C5, E1, E2, O2)
bfi[, c(1, 9, 10, 11, 12, 22)] <- 7 - bfi[, c(1, 9, 10, 11, 12, 22)]
# Keep the 25 personality items (drop gender, education, age)
dta <- bfi[, 1:25]
# Descriptives for one item: mean, sd, skewness, kurtosis
my_summary <- function(x) {
  require(moments)  # provides skewness() and kurtosis()
  funs <- c(mean, sd, skewness, kurtosis)
  sapply(funs, function(f) f(x, na.rm = TRUE))
}
# Apply to every item (column-wise)
dta_desc <- apply(dta, 2, my_summary)
head(dta_desc)
##              A1        A2        A3        A4        A5         C1         C2
## [1,]  4.5865661  4.802380  4.603821  4.699748  4.560345  4.5023390  4.3699568
## [2,]  1.4077372  1.172020  1.301834  1.479633  1.258512  1.2413465  1.3183465
## [3,] -0.8254883 -1.124894 -0.998997 -1.031499 -0.847690 -0.8551631 -0.7422207
## [4,]  2.6942957  4.057765  3.444524  3.042640  3.161176  3.3068088  2.8656243
##              C3         C4          C5         E1         E2         E3
## [1,]  4.3039568  4.4466474  3.70330460  4.0255672  3.8581178  4.0007207
## [2,]  1.2885518  1.3751181  1.62854187  1.6315055  1.6052103  1.3527188
## [3,] -0.6918287 -0.5964955 -0.06620282 -0.3736569 -0.2209396 -0.4706335
## [4,]  2.8697332  2.3802970  1.78461246  1.9090390  1.8526925  2.5367154
##              E4        E5        N1          N2        N3        N4        N5
## [1,]  4.4224292  4.416337 2.9290857  3.50773660 3.2165651 3.1856006 2.9696860
## [2,]  1.4575174  1.334768 1.5709175  1.52594359 1.6029021 1.5696851 1.6186474
## [3,] -0.8241831 -0.777486 0.3714298 -0.07698521 0.1506797 0.1969966 0.3744599
## [4,]  2.6977079  2.908401 1.9885722  1.95035250 1.8227046 1.9090309 1.9401121
##              O1        O2         O3        O4        O5
## [1,]  4.8160547  4.286786  4.4383117  4.892319 2.4895683
## [2,]  1.1295303  1.565152  1.2209011  1.221250 1.3279590
## [3,] -0.8973669 -0.585679 -0.7730516 -1.218247 0.7384818
## [4,]  3.4277033  2.188889  3.3043641  4.082686 2.7630094
# Rename the rows of the descriptives matrix
rownames(dta_desc) <- c("mean", "sd", "skewness", "kurtosis")

# Convert to a data frame with items as rows
rslt1 <- as.data.frame(t(dta_desc))

# Tabulate each item's mean, sd, skewness, and kurtosis
rslt1 |> knitr::kable()
|   |     mean|       sd|   skewness| kurtosis|
|:--|--------:|--------:|----------:|--------:|
|A1 | 4.586566| 1.407737| -0.8254883| 2.694296|
|A2 | 4.802380| 1.172020| -1.1248938| 4.057765|
|A3 | 4.603821| 1.301834| -0.9989970| 3.444524|
|A4 | 4.699748| 1.479633| -1.0314991| 3.042640|
|A5 | 4.560345| 1.258512| -0.8476900| 3.161176|
|C1 | 4.502339| 1.241346| -0.8551631| 3.306809|
|C2 | 4.369957| 1.318347| -0.7422207| 2.865624|
|C3 | 4.303957| 1.288552| -0.6918287| 2.869733|
|C4 | 4.446647| 1.375118| -0.5964955| 2.380297|
|C5 | 3.703305| 1.628542| -0.0662028| 1.784612|
|E1 | 4.025567| 1.631506| -0.3736569| 1.909039|
|E2 | 3.858118| 1.605210| -0.2209396| 1.852693|
|E3 | 4.000721| 1.352719| -0.4706335| 2.536715|
|E4 | 4.422429| 1.457517| -0.8241831| 2.697708|
|E5 | 4.416337| 1.334768| -0.7774860| 2.908401|
|N1 | 2.929086| 1.570917|  0.3714298| 1.988572|
|N2 | 3.507737| 1.525944| -0.0769852| 1.950352|
|N3 | 3.216565| 1.602902|  0.1506797| 1.822705|
|N4 | 3.185601| 1.569685|  0.1969966| 1.909031|
|N5 | 2.969686| 1.618647|  0.3744599| 1.940112|
|O1 | 4.816055| 1.129530| -0.8973669| 3.427703|
|O2 | 4.286786| 1.565152| -0.5856790| 2.188889|
|O3 | 4.438312| 1.220901| -0.7730516| 3.304364|
|O4 | 4.892319| 1.221250| -1.2182471| 4.082686|
|O5 | 2.489568| 1.327959|  0.7384818| 2.763009|
# Reshape the descriptives to long format for plotting
library(reshape2)  # for melt()
library(ggplot2)
dtal_desc <- melt(dta_desc)
names(dtal_desc)[1:2] <- c("moments", "items")

Interpreting the plots below:

The middle dashed line marks the average of the plotted statistic across all items, e.g., the mean of the item means, the mean of the item standard deviations, and so on.

The dashed lines on either side sit 1.5 standard deviations above and below that average (the c(-1.5, 0, 1.5) multiplier in the code).

The more tightly the points cluster around the middle line, the less the items vary on that statistic; tightly packed points mean the items all behave alike.
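
As a minimal check, the three reference values for the mean panel can be computed directly (the same quantities the geom_hline() calls below use; m and s are throwaway helper names):

# centre line and the +/- 1.5 sd lines for the panel of item means
m <- mean(dta_desc["mean", ])
s <- sd(dta_desc["mean", ])
m + c(-1.5, 0, 1.5) * s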

# mean
ggplot(data = subset(dtal_desc, moments == "mean"),
       aes(x = reorder(items, value, max), y = value, group = moments)) +
  geom_point(size = 2) +
  geom_hline(yintercept = mean(t(dta_desc["mean", ])) +      # mean of the item means as a reference
               c(-1.5, 0, 1.5) * sd(t(dta_desc["mean", ])),  # +/- 1.5 sd of the item means
             linetype = "dashed") +
  coord_flip() +
  labs(x = "items", y = "mean") +
  theme_bw()

The highest item mean belongs to O4 and the lowest to O5.

O5, N1, and N5 have means far below the average of the item means (beyond the lower reference line): all three are low-scoring items, and O5 is the lowest of that low group.
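
As a quick numeric check, the three lowest item means can be read off directly:

# three lowest item means; per the descriptives above these are O5, N1, N5
sort(dta_desc["mean", ])[1:3]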

# sd
ggplot(data = subset(dtal_desc, moments == "sd"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 2) +
 geom_hline(yintercept = mean(t(dta_desc["sd",])) +  # mean of sd as reference
 c(-1.5, 0, 1.5) * sd(t(dta_desc["sd", ])), linetype = "dashed") + #sd of the sd
 coord_flip() +
 labs(x = "items",  y = "sd") +
  theme_bw()

The standard deviation is highest for E1 and lowest for O1.

O1's standard deviation falls far below the average of the item standard deviations.

# skewness
ggplot(data = subset(dtal_desc, moments == "skewness"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 2) +
 geom_hline(yintercept = mean(t(dta_desc["skewness",])) +
 c(-1.5, 0, 1.5) * sd(t(dta_desc["skewness", ])), linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "skewness") +
  theme_bw()

Skewness is highest for O5 and lowest for O4.

O5, N5, and N1 lie far above the average skewness (beyond the upper reference line).

# kurtosis
ggplot(data = subset(dtal_desc, moments == "kurtosis"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 2) +
 geom_hline(yintercept = mean(t(dta_desc["kurtosis",])) +
 c(-1.5, 0, 1.5) * sd(t(dta_desc["kurtosis", ])), linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "kurtosis") +
  theme_bw()

Kurtosis is highest for O4 and lowest for C5.

O4 and A2 lie far above the average kurtosis.
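
As an aside, the four dot plots above can also be drawn in a single call by faceting on the moment; a minimal sketch, without the per-panel reference lines and per-panel item reordering of the versions above (items are mapped to the y axis directly instead of using coord_flip, since flipped coordinates do not support free facet scales):

# all four descriptive statistics in one faceted figure
ggplot(dtal_desc, aes(x = value, y = items)) +
  geom_point(size = 2) +
  facet_wrap(~ moments, scales = "free_x") +
  theme_bw()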

Discrimination Index

# Remove incomplete cases
dta <- na.omit(dta)
# Total score for each respondent
dta$tot <- apply(dta, 1, sum)
dta$grp <- NA
# Flag the bottom and top 27% of total scorers (2436 = number of complete cases)
dta$grp[rank(dta$tot) < 2436*.27] <- "L" # bottom 27%
dta$grp[rank(dta$tot) > 2436*.73] <- "H" # top 27%
dta$grp <- factor(dta$grp)
head(dta)
##       A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2 O3 O4
## 61617  5  4  3  4  4  2  3  3  3  3  4  4  3  4  4  3  4  2  2  3  3  1  3  4
## 61618  5  4  5  2  5  5  4  4  4  3  6  6  6  4  3  3  3  3  5  5  4  5  4  3
## 61620  2  4  5  4  4  4  5  4  5  2  5  3  4  4  5  4  5  4  2  3  4  5  5  5
## 61621  3  4  6  5  5  4  4  3  2  2  2  4  4  4  4  2  5  2  4  1  3  4  4  3
## 61622  5  3  3  4  5  4  4  5  4  5  5  5  5  4  5  2  3  4  4  3  3  4  4  3
## 61623  1  6  5  6  5  6  6  6  6  4  5  6  6  5  6  3  5  2  2  3  4  4  5  6
##       O5 tot  grp
## 61617  3  81    L
## 61618  3 104 <NA>
## 61620  2  99 <NA>
## 61621  5  89    L
## 61622  3  99 <NA>
## 61623  1 114    H
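
The 27% split above hard-codes 2436, which is simply the number of complete cases; a sketch of the same split without the magic number:

# same H/L split, computing the sample size instead of hard-coding it
n <- nrow(dta)                           # 2436 after na.omit()
dta$grp <- NA
dta$grp[rank(dta$tot) < n * .27] <- "L"  # bottom 27%
dta$grp[rank(dta$tot) > n * .73] <- "H"  # top 27%
dta$grp <- factor(dta$grp)
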
# Calculate each item's mean score within the H and L groups
dtam <- aggregate(dta[, 1:25], by = list(dta$grp), mean)
print(dtam)
##   Group.1       A1       A2       A3       A4       A5       C1       C2
## 1       H 5.007407 5.438519 5.364444 5.325926 5.237037 5.136296 5.102222
## 2       L 4.202080 4.017831 3.693908 3.918276 3.796434 3.897474 3.632987
##         C3       C4       C5       E1       E2       E3       E4       E5
## 1 4.885926 5.040000 4.358519 4.746667 4.617778 4.856296 5.145185 5.229630
## 2 3.742942 3.832095 3.005944 3.077266 2.921248 3.062407 3.401189 3.423477
##         N1       N2       N3       N4       N5       O1       O2       O3
## 1 3.364444 3.955556 3.680000 3.269630 3.330370 5.299259 4.691852 5.091852
## 2 2.585438 3.200594 2.783061 3.194651 2.649331 4.326895 4.031204 3.790490
##         O4       O5
## 1 5.272593 2.334815
## 2 4.679049 2.563150
# Drop the grouping column and transpose so items are rows (column 1 = H, column 2 = L)
dtam <- t(dtam[, -1])
# Welch t-test of the H vs. L group difference on each item
item_t <- sapply(dta[, 1:25], function(x) t.test(x ~ dta$grp)$statistic)
print(item_t)
##       A1.t       A2.t       A3.t       A4.t       A5.t       C1.t       C2.t 
## 10.8435846 24.1029485 26.7156154 18.7839003 23.0699742 19.1247670 22.3832065 
##       C3.t       C4.t       C5.t       E1.t       E2.t       E3.t       E4.t 
## 17.1453481 16.9457542 16.0464821 20.4585020 21.2538284 28.5111542 23.8114957 
##       E5.t       N1.t       N2.t       N3.t       N4.t       N5.t       O1.t 
## 28.1321438  9.2082776  9.2587359 10.4863674  0.8665417  7.6373698 16.2839271 
##       O2.t       O3.t       O4.t       O5.t 
##  7.7166981 20.8345645  9.2886876 -3.1290125
# Combine the group means, mean differences, and t values into one table
rslt2 <- data.frame(Item = rownames(dtam),
                    low.mean.score = dtam[, 2],
                    high.mean.score = dtam[, 1],
                    mean.dif = dtam[, 1] - dtam[, 2],
                    t.value = item_t)
print(rslt2)
##    Item low.mean.score high.mean.score    mean.dif    t.value
## A1   A1       4.202080        5.007407  0.80532717 10.8435846
## A2   A2       4.017831        5.438519  1.42068791 24.1029485
## A3   A3       3.693908        5.364444  1.67053657 26.7156154
## A4   A4       3.918276        5.325926  1.40764955 18.7839003
## A5   A5       3.796434        5.237037  1.44060316 23.0699742
## C1   C1       3.897474        5.136296  1.23882230 19.1247670
## C2   C2       3.632987        5.102222  1.46923560 22.3832065
## C3   C3       3.742942        4.885926  1.14298388 17.1453481
## C4   C4       3.832095        5.040000  1.20790490 16.9457542
## C5   C5       3.005944        4.358519  1.35257498 16.0464821
## E1   E1       3.077266        4.746667  1.66940069 20.4585020
## E2   E2       2.921248        4.617778  1.69652964 21.2538284
## E3   E3       3.062407        4.856296  1.79388916 28.5111542
## E4   E4       3.401189        5.145185  1.74399648 23.8114957
## E5   E5       3.423477        5.229630  1.80615266 28.1321438
## N1   N1       2.585438        3.364444  0.77900611  9.2082776
## N2   N2       3.200594        3.955556  0.75496120  9.2587359
## N3   N3       2.783061        3.680000  0.89693908 10.4863674
## N4   N4       3.194651        3.269630  0.07497881  0.8665417
## N5   N5       2.649331        3.330370  0.68103902  7.6373698
## O1   O1       4.326895        5.299259  0.97236476 16.2839271
## O2   O2       4.031204        4.691852  0.66064829  7.7166981
## O3   O3       3.790490        5.091852  1.30136151 20.8345645
## O4   O4       4.679049        5.272593  0.59354356  9.2886876
## O5   O5       2.563150        2.334815 -0.22833526 -3.1290125
# Plot the discrimination (t) values with a reference line at t = 2
ggplot(data = rslt2, aes(x = reorder(Item, t.value, max), y = t.value)) +
  geom_point() +
  geom_hline(yintercept = 2, linetype = "dashed") +
  coord_flip() +
  labs(x = "Items", y = "t-value") +
  theme_bw()

Items at or below the dashed line discriminate poorly between the high and low groups: N4 (t = 0.87) fails the cutoff, and O5's negative t value (-3.13) indicates it still runs in the wrong direction (it was not among the items reverse-scored at the start).

Item correlations

# Corrected item-total correlations (r.drop = correlation of each item with the total, item removed)
itotr <- psych::alpha(dta[, 1:25])$item.stats[, "r.drop"]
## Warning in psych::alpha(dta[, 1:25]): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( N1 N2 N3 N4 N5 O4 O5 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
# Split the 25 items into the five subscales
ldta <- list(A = dta[, 1:5], C = dta[, 6:10], E = dta[, 11:15], N = dta[, 16:20], O = dta[, 21:25])
# Cronbach's alpha for each subscale
isubalpha <- lapply(ldta, psych::alpha)
## Warning in FUN(X[[i]], ...): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( O5 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
isubr <- c(isubalpha$A$item.stats[, "r.drop"],
           isubalpha$C$item.stats[, "r.drop"],
           isubalpha$E$item.stats[, "r.drop"],
           isubalpha$N$item.stats[, "r.drop"],
           isubalpha$O$item.stats[, "r.drop"])
rslt3 <- as.data.frame(t(rbind(itotr, isubr)))
names(rslt3) <- c("Item-total correlation without item", "Item-subscale correlation without item")
print(rslt3)
##    Item-total correlation without item Item-subscale correlation without item
## 1                           0.12129886                              0.3190962
## 2                           0.44014085                              0.5759233
## 3                           0.46294030                              0.6035693
## 4                           0.31532083                              0.4145254
## 5                           0.40108276                              0.5004352
## 6                           0.31617880                              0.4654163
## 7                           0.36605572                              0.5128535
## 8                           0.26908306                              0.4769297
## 9                           0.24405601                              0.5731250
## 10                          0.21036732                              0.4860793
## 11                          0.29545305                              0.5153693
## 12                          0.31820168                              0.6142087
## 13                          0.46367816                              0.5049821
## 14                          0.40479750                              0.5827738
## 15                          0.46538882                              0.4634332
## 16                          0.07803633                              0.6778437
## 17                          0.07567906                              0.6548330
## 18                          0.11094446                              0.6781411
## 19                         -0.10220864                              0.5485366
## 20                          0.04465205                              0.4874632
## 21                          0.27231436                              0.2924873
## 22                          0.03491339                              0.1077996
## 23                          0.36270021                              0.2883799
## 24                          0.08713837                              0.1138836
## 25                         -0.18664142                             -0.4197456

Item reliability

# Total-scale alpha when each item in turn is dropped
itotalpha <- psych::alpha(dta[, 1:25])$alpha.drop[, "raw_alpha"]
## Warning in psych::alpha(dta[, 1:25]): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( N1 N2 N3 N4 N5 O4 O5 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option
ialphad <- c(isubalpha$A$alpha.drop[, "raw_alpha"],
             isubalpha$C$alpha.drop[, "raw_alpha"],
             isubalpha$E$alpha.drop[, "raw_alpha"],
             isubalpha$N$alpha.drop[, "raw_alpha"],
             isubalpha$O$alpha.drop[, "raw_alpha"])
rslt4 <- as.data.frame(t(rbind(itotalpha, ialphad)))
names(rslt4) <- c("Main Reliability(item drop)", "Sub Reliability (item drop)")
print(rslt4)
##    Main Reliability(item drop) Sub Reliability (item drop)
## 1                    0.6693317                  0.73146066
## 2                    0.6434533                  0.63320031
## 3                    0.6391788                  0.61508422
## 4                    0.6511088                  0.69631360
## 5                    0.6452547                  0.65824215
## 6                    0.6527621                  0.70449132
## 7                    0.6477104                  0.68698743
## 8                    0.6563284                  0.70008993
## 9                    0.6582432                  0.66308532
## 10                   0.6617175                  0.70318182
## 11                   0.6525855                  0.73127333
## 12                   0.6501819                  0.69249537
## 13                   0.6383950                  0.73291957
## 14                   0.6423673                  0.70564233
## 15                   0.6383858                  0.74573665
## 16                   0.6748697                  0.75981673
## 17                   0.6746403                  0.76737324
## 18                   0.6717707                  0.75946602
## 19                   0.6921333                  0.79821383
## 20                   0.6787839                  0.81676522
## 21                   0.6569754                 -0.23250289
## 22                   0.6788003                 -0.05384144
## 23                   0.6492884                 -0.25466269
## 24                   0.6707930                 -0.02721134
## 25                   0.6938900                  0.52184548

Judging from the subscale column, dropping item 20 (N5) yields the highest alpha-if-dropped (0.817), so removing that item would be defensible.

Dropping item 23 (O3), by contrast, lowers the subscale alpha to the lowest value in the table (-0.255), so that item should not be deleted.
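
The warnings printed above recommend re-running with check.keys = TRUE so that psych reverse-keys the flagged items (N1 N2 N3 N4 N5 O4 O5) before computing alpha; a quick check along those lines:

# alpha for the total scale with automatic reverse-keying of negative items
psych::alpha(dta[, 1:25], check.keys = TRUE)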

Exploratory factor analysis

Factor loadings

The higher an item's loading, the better the item reflects its sub-construct.

print.psych(fa(dta[, 1:25],          # the data
               nfactors = 5,         # number of factors specified by the researcher
               fm = "pa",            # extraction method: principal axis
               rotate = "Promax"),   # oblique Promax rotation
            cut = .3)                # suppress loadings below .3
## Factor Analysis using method =  pa
## Call: fa(r = dta[, 1:25], nfactors = 5, rotate = "Promax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      PA2   PA1   PA3   PA5   PA4   h2   u2 com
## A1                    0.44       0.20 0.80 1.8
## A2                    0.63       0.46 0.54 1.1
## A3                    0.65       0.54 0.46 1.2
## A4                    0.43       0.30 0.70 1.9
## A5        0.31        0.51       0.47 0.53 1.8
## C1              0.57             0.35 0.65 1.2
## C2              0.69             0.45 0.55 1.2
## C3              0.59             0.32 0.68 1.1
## C4              0.65             0.48 0.52 1.1
## C5              0.56             0.44 0.56 1.3
## E1        0.62                   0.35 0.65 1.2
## E2        0.71                   0.55 0.45 1.0
## E3        0.50                   0.44 0.56 2.0
## E4        0.66                   0.54 0.46 1.3
## E5        0.47                   0.41 0.59 2.2
## N1  0.86                         0.68 0.32 1.2
## N2  0.81                         0.61 0.39 1.1
## N3  0.72                         0.54 0.46 1.0
## N4  0.47 -0.36                   0.51 0.49 2.2
## N5  0.48                         0.35 0.65 1.8
## O1                          0.51 0.32 0.68 1.2
## O2                          0.48 0.27 0.73 1.6
## O3                          0.61 0.47 0.53 1.3
## O4                          0.37 0.25 0.75 2.6
## O5                         -0.54 0.30 0.70 1.2
## 
##                        PA2  PA1  PA3  PA5  PA4
## SS loadings           2.62 2.50 2.03 1.86 1.57
## Proportion Var        0.10 0.10 0.08 0.07 0.06
## Cumulative Var        0.10 0.21 0.29 0.36 0.42
## Proportion Explained  0.25 0.24 0.19 0.18 0.15
## Cumulative Proportion 0.25 0.48 0.68 0.85 1.00
## 
##  With factor correlations of 
##       PA2   PA1   PA3  PA5  PA4
## PA2  1.00 -0.33 -0.24 0.02 0.00
## PA1 -0.33  1.00  0.37 0.29 0.15
## PA3 -0.24  0.37  1.00 0.21 0.23
## PA5  0.02  0.29  0.21 1.00 0.19
## PA4  0.00  0.15  0.23 0.19 1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 5 factors are sufficient.
## 
## The degrees of freedom for the null model are  300  and the objective function was  7.48 with Chi Square of  18146.07
## The degrees of freedom for the model are 185  and the objective function was  0.64 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.04 
## 
## The harmonic number of observations is  2436 with the empirical chi square  1131.91  with prob <  1.1e-135 
## The total number of observations was  2436  with Likelihood Chi Square =  1538.69  with prob <  8e-212 
## 
## Tucker Lewis Index of factoring reliability =  0.877
## RMSEA index =  0.055  and the 90 % confidence intervals are  0.052 0.057
## BIC =  96.03
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy             
##                                                    PA2  PA1  PA3  PA5  PA4
## Correlation of (regression) scores with factors   0.93 0.91 0.89 0.87 0.84
## Multiple R square of scores with factors          0.86 0.82 0.79 0.76 0.71
## Minimum correlation of possible factor scores     0.72 0.64 0.57 0.52 0.42

A5 and N4 each show loadings above .3 on two factors, which means these two items discriminate less cleanly between constructs.
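
A sketch of flagging such cross-loading items programmatically, re-fitting the same model and counting pattern loadings above .3 per item (this should flag A5 and N4 here):

efa <- fa(dta[, 1:25], nfactors = 5, fm = "pa", rotate = "Promax")
# names of items with absolute loadings > .3 on more than one factor
names(which(rowSums(abs(unclass(efa$loadings)) > .3) > 1))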

Because higher loadings are better, pick the three highest-loading items within each subscale (a short-form sketch follows the list):

A: A1, A2, A3
C: C2, C3, C4
E: E1, E2, E3
N: N1, N2, N3
O: O1, O3, O5
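
Under this selection, a hypothetical 15-item short form can be assembled and re-checked (keep and dta15 are illustrative names; check.keys = TRUE because O5 is still negatively keyed):

# 15-item short form based on the selection above
keep <- c("A1", "A2", "A3", "C2", "C3", "C4", "E1", "E2", "E3",
          "N1", "N2", "N3", "O1", "O3", "O5")
dta15 <- dta[, keep]
# e.g., reliability of the shortened O subscale
psych::alpha(dta15[, c("O1", "O3", "O5")], check.keys = TRUE)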

Parallel analysis

A component is worth retaining when its observed eigenvalue exceeds the corresponding eigenvalue from random data (and, under the Kaiser rule, exceeds 1).

# Parallel analysis of principal-component eigenvalues
fa.parallel(dta[, 1:25], fa = "pc", show.legend = FALSE)

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  5

Five components exceed the reference eigenvalues, consistent with the inventory's five-subscale (Big Five) structure.
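
The "number of factors = NA" in the message reflects fa = "pc", which evaluates components only; a variant that also compares common-factor eigenvalues against random data:

# evaluate both component and common-factor eigenvalues
fa.parallel(dta[, 1:25], fa = "both", show.legend = FALSE)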