data(bfi)
dta <-bfi
dta <- na.omit(dta)
消除DATA中的NA
str(bfi)
## 'data.frame': 2800 obs. of 28 variables:
## $ A1 : int 2 2 5 4 2 6 2 4 4 2 ...
## $ A2 : int 4 4 4 4 3 6 5 3 3 5 ...
## $ A3 : int 3 5 5 6 3 5 5 1 6 6 ...
## $ A4 : int 4 2 4 5 4 6 3 5 3 6 ...
## $ A5 : int 4 5 4 5 5 5 5 1 3 5 ...
## $ C1 : int 2 5 4 4 4 6 5 3 6 6 ...
## $ C2 : int 3 4 5 4 4 6 4 2 6 5 ...
## $ C3 : int 3 4 4 3 5 6 4 4 3 6 ...
## $ C4 : int 4 3 2 5 3 1 2 2 4 2 ...
## $ C5 : int 4 4 5 5 2 3 3 4 5 1 ...
## $ E1 : int 3 1 2 5 2 2 4 3 5 2 ...
## $ E2 : int 3 1 4 3 2 1 3 6 3 2 ...
## $ E3 : int 3 6 4 4 5 6 4 4 NA 4 ...
## $ E4 : int 4 4 4 4 4 5 5 2 4 5 ...
## $ E5 : int 4 3 5 4 5 6 5 1 3 5 ...
## $ N1 : int 3 3 4 2 2 3 1 6 5 5 ...
## $ N2 : int 4 3 5 5 3 5 2 3 5 5 ...
## $ N3 : int 2 3 4 2 4 2 2 2 2 5 ...
## $ N4 : int 2 5 2 4 4 2 1 6 3 2 ...
## $ N5 : int 3 5 3 1 3 3 1 4 3 4 ...
## $ O1 : int 3 4 4 3 3 4 5 3 6 5 ...
## $ O2 : int 6 2 2 3 3 3 2 2 6 1 ...
## $ O3 : int 3 4 5 4 4 5 5 4 6 5 ...
## $ O4 : int 4 3 5 3 3 6 6 5 6 5 ...
## $ O5 : int 3 3 2 5 3 1 1 3 1 2 ...
## $ gender : int 1 2 2 2 1 2 1 1 1 2 ...
## $ education: int NA NA NA NA NA 3 NA 2 1 NA ...
## $ age : int 16 18 17 17 17 21 18 19 19 17 ...
dta <- dta[, -c(26:28)]
只將特定的資料(題目的分數)納入資料中,以便後續的分析
dta[, c("A1", "C4", "C5", "E1", "E2", "O2", "O5")] <- 7 - dta[,c("A1","C4","C5","E1","E2", "O2", "O5")]
將負向題的分數轉為正向題,以便後續計算與分析
my_summary <- function(x) {
require(moments)
funs <- c(mean, sd, skewness, kurtosis)
sapply(funs, function(f) f(x, na.rm = T))
}
dta_desc <- apply(dta, 2, my_summary)
將my_summary function的結果加至各個columns中
head(dta_desc)
## A1 A2 A3 A4 A5 C1 C2
## [1,] 4.6346154 4.834079 4.629249 4.749553 4.5849732 4.5697674 4.4011628
## [2,] 1.3919681 1.156915 1.289373 1.447941 1.2558329 1.2166107 1.3113098
## [3,] -0.8849273 -1.149616 -1.035184 -1.094809 -0.8806471 -0.8947177 -0.7697916
## [4,] 2.8355975 4.144108 3.567472 3.237885 3.2405729 3.4186321 2.9099922
## C3 C4 C5 E1 E2 E3
## [1,] 4.3228980 4.4991055 3.74463327 4.0304114 3.8788014 4.009839
## [2,] 1.2871535 1.3628169 1.62959014 1.6181205 1.6056605 1.342438
## [3,] -0.6925833 -0.6410005 -0.09191721 -0.3825874 -0.2544784 -0.479646
## [4,] 2.8926454 2.4399914 1.77002240 1.9336243 1.8752405 2.570188
## E4 E5 N1 N2 N3 N4 N5
## [1,] 4.4306798 4.4186047 2.9083184 3.48568873 3.1985689 3.1753131 2.952147
## [2,] 1.4591002 1.3301168 1.5644549 1.53476376 1.5963935 1.5605995 1.621980
## [3,] -0.8461735 -0.8112306 0.3898181 -0.06325393 0.1653818 0.2170194 0.399517
## [4,] 2.7347239 2.9743508 2.0115978 1.93343058 1.8219399 1.9423127 1.946799
## O1 O2 O3 O4 O5
## [1,] 4.8215564 4.310823 4.483005 4.948122 4.5447227
## [2,] 1.1200433 1.545865 1.193261 1.175435 1.3295014
## [3,] -0.9081616 -0.606504 -0.792629 -1.260182 -0.7840773
## [4,] 3.4683128 2.234007 3.404031 4.264682 2.8377712
rownames(dta_desc) <- c("mean", "sd", "skewness", "kurtosis")
將每個row進行命名
rslt1 <- as.data.frame(t(dta_desc))
轉置數據(從matrix到data frame)
rslt1 |> knitr::kable()
| mean | sd | skewness | kurtosis | |
|---|---|---|---|---|
| A1 | 4.634615 | 1.391968 | -0.8849273 | 2.835597 |
| A2 | 4.834079 | 1.156915 | -1.1496157 | 4.144108 |
| A3 | 4.629249 | 1.289373 | -1.0351837 | 3.567472 |
| A4 | 4.749553 | 1.447941 | -1.0948090 | 3.237885 |
| A5 | 4.584973 | 1.255833 | -0.8806471 | 3.240573 |
| C1 | 4.569767 | 1.216611 | -0.8947177 | 3.418632 |
| C2 | 4.401163 | 1.311310 | -0.7697916 | 2.909992 |
| C3 | 4.322898 | 1.287154 | -0.6925833 | 2.892645 |
| C4 | 4.499105 | 1.362817 | -0.6410005 | 2.439991 |
| C5 | 3.744633 | 1.629590 | -0.0919172 | 1.770022 |
| E1 | 4.030411 | 1.618121 | -0.3825874 | 1.933624 |
| E2 | 3.878801 | 1.605660 | -0.2544784 | 1.875241 |
| E3 | 4.009839 | 1.342438 | -0.4796460 | 2.570188 |
| E4 | 4.430680 | 1.459100 | -0.8461735 | 2.734724 |
| E5 | 4.418605 | 1.330117 | -0.8112306 | 2.974351 |
| N1 | 2.908318 | 1.564455 | 0.3898181 | 2.011598 |
| N2 | 3.485689 | 1.534764 | -0.0632539 | 1.933431 |
| N3 | 3.198569 | 1.596394 | 0.1653818 | 1.821940 |
| N4 | 3.175313 | 1.560599 | 0.2170194 | 1.942313 |
| N5 | 2.952147 | 1.621980 | 0.3995170 | 1.946799 |
| O1 | 4.821556 | 1.120043 | -0.9081616 | 3.468313 |
| O2 | 4.310823 | 1.545865 | -0.6065040 | 2.234007 |
| O3 | 4.483005 | 1.193261 | -0.7926290 | 3.404031 |
| O4 | 4.948122 | 1.175435 | -1.2601825 | 4.264682 |
| O5 | 4.544723 | 1.329501 | -0.7840773 | 2.837771 |
展示出每題的平均數、標準差、峰值以及偏態
dtal_desc <- melt(dta_desc)
reshape資料,將wide data轉為long data
names(dtal_desc)[1:2] <- c("moments", "items")
將dtal_desc中的cloumn進行命名
ggplot(data = subset(dtal_desc, moments == "mean"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["mean",])) + c(-1.5, 0, 1.5) * sd(t(dta_desc["mean", ])),
linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "mean") +
theme_bw()
code的意義: data = subset(dtal_desc, moments ==
“mean”):指定data為何
aes(x = reorder(items, value, max), y = value, group = moments))
:對於軸的指定
geom_hline(yintercept = mean(t(dta_desc[“mean”,])+ c(-1.5, 0, 1.5) *
sd(t(dta_desc[“mean”,
])):線的使用,而從這個CODE中也可以看到此圖將會以正負1.5個標準差呈現虛線與區間
theme_bw():圖表以黑白呈現
從圖中我們可以看見位於圖的4.0~4.55之間有數據的平均數的存在,而在其左右兩側的虛線,代表著以平均數正負1.5個標準差的值。而我們也可以看見圖中有超出這個區間的點(N3、N4、N5、N1),此時我們可以說這些題目與大部分的題目可能不太一樣(其平均數與其他平均數有差異)。
ggplot(data = subset(dtal_desc, moments == "sd"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["sd",])) + # mean of sd as reference
c(-1.5, 0, 1.5) * sd(t(dta_desc["sd", ])), linetype = "dashed") + #sd of the sd
coord_flip() +
labs(x = "items", y = "sd") +
theme_bw()
看此筆資料的SD 從圖中我們可以看見其整體的標準差平均位於1.4的附近。
而左右兩條虛線表示出了其標準差平均數正負1.5個標準差的值。此外,在圖中我們也可以看見有超出這兩個虛線的區域的點(O1),這就代表這題跟其他題目的sd相差很遠。
ggplot(data = subset(dtal_desc, moments == "skewness"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["skewness",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["skewness", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "skewness") +
theme_bw()
此圖為對於偏態的分析,從圖中我們可以看見這筆DATA中的偏態平均數位於-0.5的附近,而左右兩條虛線表示出了其偏態平均數正負1.5個標準差的值。但同時我們也可以看見有幾個點位在這個區間的外面(N5、N1、N4、N3),而結果代表他們可能與其他題有所差異(不同)。
ggplot(data = subset(dtal_desc, moments == "kurtosis"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["kurtosis",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["kurtosis", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "kurtosis") +
theme_bw()
從此圖中,我們可以看見峰度的平均數落在2.5~3.0之間,而其正負1.5個標準差為兩側虛線。在圖中我們可以看見有點超出於區間的點(O4、A2),所以也許這兩題與其他題目有些不同。
dta<- na.omit(bfi)
dta <- dta[, -c(26:28)]
dta[, c("A1", "C4", "C5", "E1", "E2", "O2", "O5")] <- 7 - dta[,c("A1","C4","C5","E1","E2", "O2", "O5")]
str(dta)
## 'data.frame': 2236 obs. of 25 variables:
## $ A1: num 1 3 3 3 6 5 3 6 5 5 ...
## $ A2: int 6 3 4 5 5 6 5 6 4 5 ...
## $ A3: int 5 1 5 2 6 5 5 6 4 1 ...
## $ A4: int 6 5 6 2 5 6 6 1 4 3 ...
## $ A5: int 5 1 5 1 6 5 5 6 3 5 ...
## $ C1: int 6 3 4 5 4 3 5 5 6 5 ...
## $ C2: int 6 2 3 5 3 5 5 2 5 4 ...
## $ C3: int 6 4 5 5 2 6 4 5 6 5 ...
## $ C4: num 6 5 4 5 3 4 6 6 6 5 ...
## $ C5: num 4 3 5 5 2 1 6 6 6 2 ...
## $ E1: num 5 4 6 4 5 5 4 6 5 6 ...
## $ E2: num 6 1 4 3 6 5 5 6 3 5 ...
## $ E3: int 6 4 2 3 2 4 5 6 4 6 ...
## $ E4: int 5 2 5 6 5 6 5 6 2 5 ...
## $ E5: int 6 1 4 5 2 6 6 6 6 4 ...
## $ N1: int 3 6 3 2 2 4 2 2 3 1 ...
## $ N2: int 5 3 3 4 2 4 3 3 3 4 ...
## $ N3: int 2 2 4 2 2 4 3 1 5 2 ...
## $ N4: int 2 6 2 2 2 6 1 2 3 2 ...
## $ N5: int 3 4 3 3 2 6 1 1 2 5 ...
## $ O1: int 4 3 5 5 6 6 6 6 5 2 ...
## $ O2: num 4 5 4 5 6 6 5 3 5 3 ...
## $ O3: int 5 4 5 5 5 5 5 5 6 5 ...
## $ O4: int 6 5 6 5 5 6 6 5 6 4 ...
## $ O5: num 6 4 4 2 5 6 5 4 6 6 ...
dta$tot <- apply(dta, 1, sum)
把每個column進行加總,然後把結果放入一個新的column(tot)中
dta$grp <- NA
dta$grp[rank(dta$tot) < 2236*.27] <- "L"
dta$grp[rank(dta$tot) > 2236*.73] <- "H"
dta$grp <- factor(dta$grp)
head(dta)
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2 O3 O4
## 61623 1 6 5 6 5 6 6 6 6 4 5 6 6 5 6 3 5 2 2 3 4 4 5 6
## 61629 3 3 1 5 1 3 2 4 5 3 4 1 4 2 1 6 3 2 6 4 3 5 4 5
## 61634 3 4 5 6 5 4 3 5 4 5 6 4 2 5 4 3 3 4 2 3 5 4 5 6
## 61640 3 5 2 2 1 5 5 5 5 5 4 3 3 6 5 2 4 2 2 3 5 5 5 5
## 61661 6 5 6 5 6 4 3 2 3 2 5 6 2 5 2 2 2 2 2 2 6 6 5 5
## 61664 5 6 5 6 5 3 5 6 4 1 5 5 4 6 6 4 4 4 6 6 6 6 5 6
## O5 tot grp
## 61623 6 119 H
## 61629 4 84 L
## 61634 4 104 <NA>
## 61640 2 94 L
## 61661 5 99 <NA>
## 61664 6 125 H
dta\(grp <- NA :在dta這個data中多出一個grp的row,並且都顯示NA 計算用於區辨高低組的分數 在上方的code中我們的定義為低分組為其分數低於27%;高分組為分數高於73%。 dta\)grp <- factor(dta$grp):避免R以類別變項的方式進行分析
dtam <- aggregate(dta[, 1:25], by=list(dta$grp), mean)
print(dtam)
## Group.1 A1 A2 A3 A4 A5 C1 C2
## 1 H 5.107084 5.505766 5.373970 5.352554 5.257002 5.154860 5.116969
## 2 L 4.227575 4.126246 3.777409 4.041528 3.850498 3.923588 3.704319
## C3 C4 C5 E1 E2 E3 E4 E5
## 1 4.848435 5.095552 4.342669 4.741351 4.611203 4.904448 5.093904 5.227348
## 2 3.765781 3.813953 3.008306 3.031561 2.892027 3.033223 3.446844 3.430233
## N1 N2 N3 N4 N5 O1 O2 O3
## 1 3.293245 3.927512 3.660626 3.299835 3.271829 5.359143 4.771005 5.187809
## 2 2.602990 3.202658 2.805648 3.274086 2.774086 4.269103 3.877076 3.762458
## O4 O5
## 1 5.331137 5.064250
## 2 4.710963 4.074751
dtam <- aggregate(dta[, 1:25], by=list(dta$grp), mean):取高分組、低分組的平均
dtam <- t(dtam[, -1])
刪除第一個column
item_t <- sapply(dta[, 1:25], function(x) t.test(x ~ dta$grp)$statistic)
print(item_t)
## A1.t A2.t A3.t A4.t A5.t C1.t C2.t
## 11.6048485 23.0772247 24.8535301 16.8480716 21.4153701 18.3288421 20.3115177
## C3.t C4.t C5.t E1.t E2.t E3.t E4.t
## 14.9526697 17.2490960 14.8831014 20.1982621 20.5972118 28.7359620 21.1694572
## E5.t N1.t N2.t N3.t N4.t N5.t O1.t
## 26.4867355 7.6465266 8.3210388 9.4012398 0.2790589 5.2513414 17.6681816
## O2.t O3.t O4.t O5.t
## 10.1969028 22.6670752 9.3472473 13.5240334
用t-test去看高低分組的平均數是否有所差異。
rslt2 <- data.frame(Item = rownames(dtam), low.mean.score = dtam[, 2], high.mean.score = dtam[, 1], mean.dif = dtam[, 1]-dtam[, 2], t.value = item_t)
print(rslt2)
## Item low.mean.score high.mean.score mean.dif t.value
## A1 A1 4.227575 5.107084 0.87950927 11.6048485
## A2 A2 4.126246 5.505766 1.37952022 23.0772247
## A3 A3 3.777409 5.373970 1.59656171 24.8535301
## A4 A4 4.041528 5.352554 1.31102530 16.8480716
## A5 A5 3.850498 5.257002 1.40650331 21.4153701
## C1 C1 3.923588 5.154860 1.23127193 18.3288421
## C2 C2 3.704319 5.116969 1.41264976 20.3115177
## C3 C3 3.765781 4.848435 1.08265419 14.9526697
## C4 C4 3.813953 5.095552 1.28159841 17.2490960
## C5 C5 3.008306 4.342669 1.33436322 14.8831014
## E1 E1 3.031561 4.741351 1.70978944 20.1982621
## E2 E2 2.892027 4.611203 1.71917606 20.5972118
## E3 E3 3.033223 4.904448 1.87122551 28.7359620
## E4 E4 3.446844 5.093904 1.64706059 21.1694572
## E5 E5 3.430233 5.227348 1.79711505 26.4867355
## N1 N1 2.602990 3.293245 0.69025544 7.6465266
## N2 N2 3.202658 3.927512 0.72485455 8.3210388
## N3 N3 2.805648 3.660626 0.85497819 9.4012398
## N4 N4 3.274086 3.299835 0.02574888 0.2790589
## N5 N5 2.774086 3.271829 0.49774229 5.2513414
## O1 O1 4.269103 5.359143 1.09004034 17.6681816
## O2 O2 3.877076 4.771005 0.89392853 10.1969028
## O3 O3 3.762458 5.187809 1.42535042 22.6670752
## O4 O4 4.710963 5.331137 0.62017328 9.3472473
## O5 O5 4.074751 5.064250 0.98949958 13.5240334
將資料整理成表格 low.mean.score = dtam[, 2] : cloumn名字=dtam的第2cloumn
ggplot(data = rslt2, aes(x = reorder(Item, t.value, max), y = t.value)) +
geom_point() +
geom_hline(yintercept = 2, linetype = "dashed") +
coord_flip() +
labs(x = "Items", y = "t-value") +
theme_bw()
若顯著:item有區辨度
N4可能不具有良好的區辨度
itotr <- psych::alpha(dta[, 1:25])$item.stats[, "r.drop"]
## Warning in psych::alpha(dta[, 1:25]): Some items were negatively correlated with the total scale and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( N1 N2 N3 N4 N5 O4 ) were negatively correlated with the total scale and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
psych::alpha -> 指定用paych中的alpha
用以計算總分的alpha,顯示出的分數為去除特定題目後的
ldta <- list(A = dta[, 1:5], C = dta[, 6:10], E = dta[ ,11:15],
N = dta[, 16:20], O = dta[, 21:25])
製造一個subscale
isubalpha <- lapply(ldta, psych::alpha)
isubr <- c(isubalpha$A$item.stats[, "r.drop"],
isubalpha$C$item.stats[, "r.drop"],
isubalpha$E$item.stats[, "r.drop"],
isubalpha$N$item.stats[, "r.drop"],
isubalpha$O$item.stats[, "r.drop"])
rslt3 <- as.data.frame(t(rbind(itotr, isubr)))
names(rslt3) <- c("Item-total correlation without item", "Item-subscale correlation without item")
print(rslt3)
## Item-total correlation without item Item-subscale correlation without item
## 1 0.13243666 0.3077162
## 2 0.42943569 0.5618331
## 3 0.44375121 0.5909941
## 4 0.28316705 0.4036667
## 5 0.38350120 0.4887087
## 6 0.32437095 0.4552472
## 7 0.35341507 0.5018052
## 8 0.25678142 0.4775709
## 9 0.26479190 0.5645098
## 10 0.20805656 0.4830116
## 11 0.31471245 0.5128046
## 12 0.32331085 0.6160002
## 13 0.48035284 0.5213901
## 14 0.37723273 0.5845455
## 15 0.47605288 0.4694015
## 16 0.06791869 0.6771958
## 17 0.08147559 0.6508838
## 18 0.10555562 0.6767828
## 19 -0.10731086 0.5428429
## 20 0.01078638 0.4886151
## 21 0.30525432 0.3982523
## 22 0.09618200 0.3593390
## 23 0.40883437 0.4578104
## 24 0.11107410 0.2143355
## 25 0.18707793 0.4281881
itotalpha <- psych::alpha(dta[, 1:25])$alpha.drop[, "raw_alpha"]
## Warning in psych::alpha(dta[, 1:25]): Some items were negatively correlated with the total scale and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( N1 N2 N3 N4 N5 O4 ) were negatively correlated with the total scale and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
ialphad <- c(isubalpha$A$alpha.drop[, "raw_alpha"],
isubalpha$C$alpha.drop[, "raw_alpha"],
isubalpha$E$alpha.drop[, "raw_alpha"],
isubalpha$N$alpha.drop[, "raw_alpha"],
isubalpha$O$alpha.drop[, "raw_alpha"])
rslt4 <- as.data.frame(t(rbind(itotalpha, ialphad)))
names(rslt4) <- c("Main Reliability(item drop)", "Sub Reliability (item drop)")
print(rslt4)
## Main Reliability(item drop) Sub Reliability (item drop)
## 1 0.6945414 0.7220895
## 2 0.6730193 0.6217082
## 3 0.6700579 0.6023930
## 4 0.6821019 0.6846047
## 5 0.6751560 0.6463128
## 6 0.6799121 0.7000553
## 7 0.6769802 0.6826336
## 8 0.6846139 0.6916822
## 9 0.6838192 0.6576527
## 10 0.6891334 0.6961548
## 11 0.6788212 0.7375935
## 12 0.6780166 0.6976668
## 13 0.6662841 0.7329283
## 14 0.6737941 0.7105255
## 15 0.6668361 0.7488675
## 16 0.7015049 0.7578407
## 17 0.6999881 0.7662320
## 18 0.6984647 0.7575138
## 19 0.7165875 0.7977181
## 20 0.7073799 0.8144963
## 21 0.6819352 0.5450899
## 22 0.6988113 0.5687471
## 23 0.6739993 0.5128569
## 24 0.6949209 0.6269339
## 25 0.6899707 0.5233128
當factor loading越高:越能解釋因素
fa.parallel(dta[, 1:25], fa = "pc", show.legend = FALSE)
## Parallel analysis suggests that the number of factors = NA and the number of components = 5
在此結果中我們可以看見,當factor為5個或是小於5時,其eigen value皆不大於1。故較為良好的factor數為5個。