SSP_5/10

data(bfi)
dta <-bfi

dta <- na.omit(dta)

消除DATA中的NA

str(bfi)

## 'data.frame':    2800 obs. of  28 variables:
##  $ A1       : int  2 2 5 4 2 6 2 4 4 2 ...
##  $ A2       : int  4 4 4 4 3 6 5 3 3 5 ...
##  $ A3       : int  3 5 5 6 3 5 5 1 6 6 ...
##  $ A4       : int  4 2 4 5 4 6 3 5 3 6 ...
##  $ A5       : int  4 5 4 5 5 5 5 1 3 5 ...
##  $ C1       : int  2 5 4 4 4 6 5 3 6 6 ...
##  $ C2       : int  3 4 5 4 4 6 4 2 6 5 ...
##  $ C3       : int  3 4 4 3 5 6 4 4 3 6 ...
##  $ C4       : int  4 3 2 5 3 1 2 2 4 2 ...
##  $ C5       : int  4 4 5 5 2 3 3 4 5 1 ...
##  $ E1       : int  3 1 2 5 2 2 4 3 5 2 ...
##  $ E2       : int  3 1 4 3 2 1 3 6 3 2 ...
##  $ E3       : int  3 6 4 4 5 6 4 4 NA 4 ...
##  $ E4       : int  4 4 4 4 4 5 5 2 4 5 ...
##  $ E5       : int  4 3 5 4 5 6 5 1 3 5 ...
##  $ N1       : int  3 3 4 2 2 3 1 6 5 5 ...
##  $ N2       : int  4 3 5 5 3 5 2 3 5 5 ...
##  $ N3       : int  2 3 4 2 4 2 2 2 2 5 ...
##  $ N4       : int  2 5 2 4 4 2 1 6 3 2 ...
##  $ N5       : int  3 5 3 1 3 3 1 4 3 4 ...
##  $ O1       : int  3 4 4 3 3 4 5 3 6 5 ...
##  $ O2       : int  6 2 2 3 3 3 2 2 6 1 ...
##  $ O3       : int  3 4 5 4 4 5 5 4 6 5 ...
##  $ O4       : int  4 3 5 3 3 6 6 5 6 5 ...
##  $ O5       : int  3 3 2 5 3 1 1 3 1 2 ...
##  $ gender   : int  1 2 2 2 1 2 1 1 1 2 ...
##  $ education: int  NA NA NA NA NA 3 NA 2 1 NA ...
##  $ age      : int  16 18 17 17 17 21 18 19 19 17 ...

dta <- dta[, -c(26:28)]

只將特定的資料(題目的分數)納入資料中，以便後續的分析

dta[, c("A1", "C4", "C5", "E1", "E2", "O2", "O5")] <- 7 - dta[,c("A1","C4","C5","E1","E2", "O2", "O5")]

將負向題的分數轉為正向題，以便後續計算與分析

my_summary <- function(x) {
 require(moments)
 funs <- c(mean, sd, skewness, kurtosis)
 sapply(funs, function(f) f(x, na.rm = T))
 }

dta_desc <- apply(dta, 2, my_summary)

將my_summary function的結果加至各個columns中

head(dta_desc)

##              A1        A2        A3        A4         A5         C1         C2
## [1,]  4.6346154  4.834079  4.629249  4.749553  4.5849732  4.5697674  4.4011628
## [2,]  1.3919681  1.156915  1.289373  1.447941  1.2558329  1.2166107  1.3113098
## [3,] -0.8849273 -1.149616 -1.035184 -1.094809 -0.8806471 -0.8947177 -0.7697916
## [4,]  2.8355975  4.144108  3.567472  3.237885  3.2405729  3.4186321  2.9099922
##              C3         C4          C5         E1         E2        E3
## [1,]  4.3228980  4.4991055  3.74463327  4.0304114  3.8788014  4.009839
## [2,]  1.2871535  1.3628169  1.62959014  1.6181205  1.6056605  1.342438
## [3,] -0.6925833 -0.6410005 -0.09191721 -0.3825874 -0.2544784 -0.479646
## [4,]  2.8926454  2.4399914  1.77002240  1.9336243  1.8752405  2.570188
##              E4         E5        N1          N2        N3        N4       N5
## [1,]  4.4306798  4.4186047 2.9083184  3.48568873 3.1985689 3.1753131 2.952147
## [2,]  1.4591002  1.3301168 1.5644549  1.53476376 1.5963935 1.5605995 1.621980
## [3,] -0.8461735 -0.8112306 0.3898181 -0.06325393 0.1653818 0.2170194 0.399517
## [4,]  2.7347239  2.9743508 2.0115978  1.93343058 1.8219399 1.9423127 1.946799
##              O1        O2        O3        O4         O5
## [1,]  4.8215564  4.310823  4.483005  4.948122  4.5447227
## [2,]  1.1200433  1.545865  1.193261  1.175435  1.3295014
## [3,] -0.9081616 -0.606504 -0.792629 -1.260182 -0.7840773
## [4,]  3.4683128  2.234007  3.404031  4.264682  2.8377712

rownames(dta_desc) <- c("mean", "sd", "skewness", "kurtosis")

將每個row進行命名

rslt1 <- as.data.frame(t(dta_desc))

轉置數據(從matrix到data frame)

rslt1 |> knitr::kable()

	mean	sd	skewness	kurtosis
A1	4.634615	1.391968	-0.8849273	2.835597
A2	4.834079	1.156915	-1.1496157	4.144108
A3	4.629249	1.289373	-1.0351837	3.567472
A4	4.749553	1.447941	-1.0948090	3.237885
A5	4.584973	1.255833	-0.8806471	3.240573
C1	4.569767	1.216611	-0.8947177	3.418632
C2	4.401163	1.311310	-0.7697916	2.909992
C3	4.322898	1.287154	-0.6925833	2.892645
C4	4.499105	1.362817	-0.6410005	2.439991
C5	3.744633	1.629590	-0.0919172	1.770022
E1	4.030411	1.618121	-0.3825874	1.933624
E2	3.878801	1.605660	-0.2544784	1.875241
E3	4.009839	1.342438	-0.4796460	2.570188
E4	4.430680	1.459100	-0.8461735	2.734724
E5	4.418605	1.330117	-0.8112306	2.974351
N1	2.908318	1.564455	0.3898181	2.011598
N2	3.485689	1.534764	-0.0632539	1.933431
N3	3.198569	1.596394	0.1653818	1.821940
N4	3.175313	1.560599	0.2170194	1.942313
N5	2.952147	1.621980	0.3995170	1.946799
O1	4.821556	1.120043	-0.9081616	3.468313
O2	4.310823	1.545865	-0.6065040	2.234007
O3	4.483005	1.193261	-0.7926290	3.404031
O4	4.948122	1.175435	-1.2601825	4.264682
O5	4.544723	1.329501	-0.7840773	2.837771

展示出每題的平均數、標準差、峰值以及偏態

dtal_desc <- melt(dta_desc)

reshape資料，將wide data轉為long data

names(dtal_desc)[1:2] <- c("moments", "items")

將dtal_desc中的cloumn進行命名

ggplot(data = subset(dtal_desc, moments == "mean"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["mean",])) +  c(-1.5, 0, 1.5) * sd(t(dta_desc["mean", ])), 
 linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "mean") +
  theme_bw()

code的意義: data = subset(dtal_desc, moments == “mean”):指定data為何
aes(x = reorder(items, value, max), y = value, group = moments)) :對於軸的指定
geom_hline(yintercept = mean(t(dta_desc[“mean”,])+ c(-1.5, 0, 1.5) * sd(t(dta_desc[“mean”, ])):線的使用，而從這個CODE中也可以看到此圖將會以正負1.5個標準差呈現虛線與區間
theme_bw():圖表以黑白呈現

從圖中我們可以看見位於圖的4.0~4.55之間有數據的平均數的存在，而在其左右兩側的虛線，代表著以平均數正負1.5個標準差的值。而我們也可以看見圖中有超出這個區間的點(N3、N4、N5、N1)，此時我們可以說這些題目與大部分的題目可能不太一樣(其平均數與其他平均數有差異)。

ggplot(data = subset(dtal_desc, moments == "sd"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["sd",])) +  # mean of sd as reference
 c(-1.5, 0, 1.5) * sd(t(dta_desc["sd", ])), linetype = "dashed") + #sd of the sd
 coord_flip() +
 labs(x = "items",  y = "sd") +
  theme_bw()

看此筆資料的SD 從圖中我們可以看見其整體的標準差平均位於1.4的附近。而左右兩條虛線表示出了其標準差平均數正負1.5個標準差的值。此外，在圖中我們也可以看見有超出這兩個虛線的區域的點(O1)，這就代表這題跟其他題目的sd相差很遠。

ggplot(data = subset(dtal_desc, moments == "skewness"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["skewness",])) +
 c(-1.5, 0, 1.5) * sd(t(dta_desc["skewness", ])), linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "skewness") +
  theme_bw()

此圖為對於偏態的分析，從圖中我們可以看見這筆DATA中的偏態平均數位於-0.5的附近，而左右兩條虛線表示出了其偏態平均數正負1.5個標準差的值。但同時我們也可以看見有幾個點位在這個區間的外面(N5、N1、N4、N3)，而結果代表他們可能與其他題有所差異(不同)。

ggplot(data = subset(dtal_desc, moments == "kurtosis"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["kurtosis",])) +
 c(-1.5, 0, 1.5) * sd(t(dta_desc["kurtosis", ])), linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "kurtosis") +
  theme_bw()

從此圖中，我們可以看見峰度的平均數落在2.5~3.0之間，而其正負1.5個標準差為兩側虛線。在圖中我們可以看見有點超出於區間的點(O4、A2)，所以也許這兩題與其他題目有些不同。

區辨指數

dta<- na.omit(bfi)
dta <- dta[, -c(26:28)]
dta[, c("A1", "C4", "C5", "E1", "E2", "O2", "O5")] <- 7 - dta[,c("A1","C4","C5","E1","E2", "O2", "O5")]
str(dta)

## 'data.frame':    2236 obs. of  25 variables:
##  $ A1: num  1 3 3 3 6 5 3 6 5 5 ...
##  $ A2: int  6 3 4 5 5 6 5 6 4 5 ...
##  $ A3: int  5 1 5 2 6 5 5 6 4 1 ...
##  $ A4: int  6 5 6 2 5 6 6 1 4 3 ...
##  $ A5: int  5 1 5 1 6 5 5 6 3 5 ...
##  $ C1: int  6 3 4 5 4 3 5 5 6 5 ...
##  $ C2: int  6 2 3 5 3 5 5 2 5 4 ...
##  $ C3: int  6 4 5 5 2 6 4 5 6 5 ...
##  $ C4: num  6 5 4 5 3 4 6 6 6 5 ...
##  $ C5: num  4 3 5 5 2 1 6 6 6 2 ...
##  $ E1: num  5 4 6 4 5 5 4 6 5 6 ...
##  $ E2: num  6 1 4 3 6 5 5 6 3 5 ...
##  $ E3: int  6 4 2 3 2 4 5 6 4 6 ...
##  $ E4: int  5 2 5 6 5 6 5 6 2 5 ...
##  $ E5: int  6 1 4 5 2 6 6 6 6 4 ...
##  $ N1: int  3 6 3 2 2 4 2 2 3 1 ...
##  $ N2: int  5 3 3 4 2 4 3 3 3 4 ...
##  $ N3: int  2 2 4 2 2 4 3 1 5 2 ...
##  $ N4: int  2 6 2 2 2 6 1 2 3 2 ...
##  $ N5: int  3 4 3 3 2 6 1 1 2 5 ...
##  $ O1: int  4 3 5 5 6 6 6 6 5 2 ...
##  $ O2: num  4 5 4 5 6 6 5 3 5 3 ...
##  $ O3: int  5 4 5 5 5 5 5 5 6 5 ...
##  $ O4: int  6 5 6 5 5 6 6 5 6 4 ...
##  $ O5: num  6 4 4 2 5 6 5 4 6 6 ...

dta$tot <- apply(dta, 1, sum)

把每個column進行加總，然後把結果放入一個新的column(tot)中

dta$grp <- NA
dta$grp[rank(dta$tot) < 2236*.27] <- "L"
dta$grp[rank(dta$tot) > 2236*.73] <- "H"
dta$grp <- factor(dta$grp)
head(dta)

##       A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2 O3 O4
## 61623  1  6  5  6  5  6  6  6  6  4  5  6  6  5  6  3  5  2  2  3  4  4  5  6
## 61629  3  3  1  5  1  3  2  4  5  3  4  1  4  2  1  6  3  2  6  4  3  5  4  5
## 61634  3  4  5  6  5  4  3  5  4  5  6  4  2  5  4  3  3  4  2  3  5  4  5  6
## 61640  3  5  2  2  1  5  5  5  5  5  4  3  3  6  5  2  4  2  2  3  5  5  5  5
## 61661  6  5  6  5  6  4  3  2  3  2  5  6  2  5  2  2  2  2  2  2  6  6  5  5
## 61664  5  6  5  6  5  3  5  6  4  1  5  5  4  6  6  4  4  4  6  6  6  6  5  6
##       O5 tot  grp
## 61623  6 119    H
## 61629  4  84    L
## 61634  4 104 <NA>
## 61640  2  94    L
## 61661  5  99 <NA>
## 61664  6 125    H

dta$grp <- NA :在dta這個data中多出一個grp的row，並且都顯示NA 計算用於區辨高低組的分數在上方的code中我們的定義為低分組為其分數低於27%；高分組為分數高於73%。 dta$grp <- factor(dta$grp):避免R以類別變項的方式進行分析

dtam <- aggregate(dta[, 1:25], by=list(dta$grp), mean)
print(dtam)

##   Group.1       A1       A2       A3       A4       A5       C1       C2
## 1       H 5.107084 5.505766 5.373970 5.352554 5.257002 5.154860 5.116969
## 2       L 4.227575 4.126246 3.777409 4.041528 3.850498 3.923588 3.704319
##         C3       C4       C5       E1       E2       E3       E4       E5
## 1 4.848435 5.095552 4.342669 4.741351 4.611203 4.904448 5.093904 5.227348
## 2 3.765781 3.813953 3.008306 3.031561 2.892027 3.033223 3.446844 3.430233
##         N1       N2       N3       N4       N5       O1       O2       O3
## 1 3.293245 3.927512 3.660626 3.299835 3.271829 5.359143 4.771005 5.187809
## 2 2.602990 3.202658 2.805648 3.274086 2.774086 4.269103 3.877076 3.762458
##         O4       O5
## 1 5.331137 5.064250
## 2 4.710963 4.074751

dtam <- aggregate(dta[, 1:25], by=list(dta$grp), mean):取高分組、低分組的平均

dtam <- t(dtam[, -1])

刪除第一個column

item_t <- sapply(dta[, 1:25], function(x) t.test(x ~ dta$grp)$statistic)
print(item_t)

##       A1.t       A2.t       A3.t       A4.t       A5.t       C1.t       C2.t 
## 11.6048485 23.0772247 24.8535301 16.8480716 21.4153701 18.3288421 20.3115177 
##       C3.t       C4.t       C5.t       E1.t       E2.t       E3.t       E4.t 
## 14.9526697 17.2490960 14.8831014 20.1982621 20.5972118 28.7359620 21.1694572 
##       E5.t       N1.t       N2.t       N3.t       N4.t       N5.t       O1.t 
## 26.4867355  7.6465266  8.3210388  9.4012398  0.2790589  5.2513414 17.6681816 
##       O2.t       O3.t       O4.t       O5.t 
## 10.1969028 22.6670752  9.3472473 13.5240334

用t-test去看高低分組的平均數是否有所差異。

rslt2 <- data.frame(Item = rownames(dtam), low.mean.score = dtam[, 2], high.mean.score = dtam[, 1], mean.dif = dtam[, 1]-dtam[, 2], t.value = item_t)
print(rslt2)

##    Item low.mean.score high.mean.score   mean.dif    t.value
## A1   A1       4.227575        5.107084 0.87950927 11.6048485
## A2   A2       4.126246        5.505766 1.37952022 23.0772247
## A3   A3       3.777409        5.373970 1.59656171 24.8535301
## A4   A4       4.041528        5.352554 1.31102530 16.8480716
## A5   A5       3.850498        5.257002 1.40650331 21.4153701
## C1   C1       3.923588        5.154860 1.23127193 18.3288421
## C2   C2       3.704319        5.116969 1.41264976 20.3115177
## C3   C3       3.765781        4.848435 1.08265419 14.9526697
## C4   C4       3.813953        5.095552 1.28159841 17.2490960
## C5   C5       3.008306        4.342669 1.33436322 14.8831014
## E1   E1       3.031561        4.741351 1.70978944 20.1982621
## E2   E2       2.892027        4.611203 1.71917606 20.5972118
## E3   E3       3.033223        4.904448 1.87122551 28.7359620
## E4   E4       3.446844        5.093904 1.64706059 21.1694572
## E5   E5       3.430233        5.227348 1.79711505 26.4867355
## N1   N1       2.602990        3.293245 0.69025544  7.6465266
## N2   N2       3.202658        3.927512 0.72485455  8.3210388
## N3   N3       2.805648        3.660626 0.85497819  9.4012398
## N4   N4       3.274086        3.299835 0.02574888  0.2790589
## N5   N5       2.774086        3.271829 0.49774229  5.2513414
## O1   O1       4.269103        5.359143 1.09004034 17.6681816
## O2   O2       3.877076        4.771005 0.89392853 10.1969028
## O3   O3       3.762458        5.187809 1.42535042 22.6670752
## O4   O4       4.710963        5.331137 0.62017328  9.3472473
## O5   O5       4.074751        5.064250 0.98949958 13.5240334

將資料整理成表格 low.mean.score = dtam[, 2] : cloumn名字=dtam的第2cloumn

ggplot(data = rslt2, aes(x = reorder(Item, t.value, max), y = t.value)) +
 geom_point() +
 geom_hline(yintercept = 2, linetype = "dashed") +
 coord_flip() +
 labs(x = "Items", y = "t-value") +
 theme_bw()

若顯著:item有區辨度
N4可能不具有良好的區辨度

Items correlation

itotr <- psych::alpha(dta[, 1:25])$item.stats[, "r.drop"]

## Warning in psych::alpha(dta[, 1:25]): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option

## Some items ( N1 N2 N3 N4 N5 O4 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option

psych::alpha -> 指定用paych中的alpha
用以計算總分的alpha，顯示出的分數為去除特定題目後的

 ldta <- list(A = dta[, 1:5], C = dta[, 6:10], E = dta[ ,11:15], 
              N = dta[, 16:20], O = dta[, 21:25])

製造一個subscale

isubalpha <- lapply(ldta, psych::alpha)
isubr <- c(isubalpha$A$item.stats[, "r.drop"],
             isubalpha$C$item.stats[, "r.drop"],
             isubalpha$E$item.stats[, "r.drop"],
           isubalpha$N$item.stats[, "r.drop"],
           isubalpha$O$item.stats[, "r.drop"])

rslt3 <- as.data.frame(t(rbind(itotr, isubr)))
names(rslt3) <- c("Item-total correlation without item", "Item-subscale correlation without item")

print(rslt3)

##    Item-total correlation without item Item-subscale correlation without item
## 1                           0.13243666                              0.3077162
## 2                           0.42943569                              0.5618331
## 3                           0.44375121                              0.5909941
## 4                           0.28316705                              0.4036667
## 5                           0.38350120                              0.4887087
## 6                           0.32437095                              0.4552472
## 7                           0.35341507                              0.5018052
## 8                           0.25678142                              0.4775709
## 9                           0.26479190                              0.5645098
## 10                          0.20805656                              0.4830116
## 11                          0.31471245                              0.5128046
## 12                          0.32331085                              0.6160002
## 13                          0.48035284                              0.5213901
## 14                          0.37723273                              0.5845455
## 15                          0.47605288                              0.4694015
## 16                          0.06791869                              0.6771958
## 17                          0.08147559                              0.6508838
## 18                          0.10555562                              0.6767828
## 19                         -0.10731086                              0.5428429
## 20                          0.01078638                              0.4886151
## 21                          0.30525432                              0.3982523
## 22                          0.09618200                              0.3593390
## 23                          0.40883437                              0.4578104
## 24                          0.11107410                              0.2143355
## 25                          0.18707793                              0.4281881

Items reliability

itotalpha <- psych::alpha(dta[, 1:25])$alpha.drop[, "raw_alpha"]

## Warning in psych::alpha(dta[, 1:25]): Some items were negatively correlated with the total scale and probably 
## should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option

## Some items ( N1 N2 N3 N4 N5 O4 ) were negatively correlated with the total scale and 
## probably should be reversed.  
## To do this, run the function again with the 'check.keys=TRUE' option

ialphad <- c(isubalpha$A$alpha.drop[, "raw_alpha"],
             isubalpha$C$alpha.drop[, "raw_alpha"],
             isubalpha$E$alpha.drop[, "raw_alpha"],
             isubalpha$N$alpha.drop[, "raw_alpha"],
             isubalpha$O$alpha.drop[, "raw_alpha"])

rslt4 <- as.data.frame(t(rbind(itotalpha, ialphad)))
names(rslt4) <- c("Main Reliability(item drop)", "Sub Reliability (item drop)")
print(rslt4)

##    Main Reliability(item drop) Sub Reliability (item drop)
## 1                    0.6945414                   0.7220895
## 2                    0.6730193                   0.6217082
## 3                    0.6700579                   0.6023930
## 4                    0.6821019                   0.6846047
## 5                    0.6751560                   0.6463128
## 6                    0.6799121                   0.7000553
## 7                    0.6769802                   0.6826336
## 8                    0.6846139                   0.6916822
## 9                    0.6838192                   0.6576527
## 10                   0.6891334                   0.6961548
## 11                   0.6788212                   0.7375935
## 12                   0.6780166                   0.6976668
## 13                   0.6662841                   0.7329283
## 14                   0.6737941                   0.7105255
## 15                   0.6668361                   0.7488675
## 16                   0.7015049                   0.7578407
## 17                   0.6999881                   0.7662320
## 18                   0.6984647                   0.7575138
## 19                   0.7165875                   0.7977181
## 20                   0.7073799                   0.8144963
## 21                   0.6819352                   0.5450899
## 22                   0.6988113                   0.5687471
## 23                   0.6739993                   0.5128569
## 24                   0.6949209                   0.6269339
## 25                   0.6899707                   0.5233128

factor loading

當factor loading越高:越能解釋因素

Parallel analysis

fa.parallel(dta[, 1:25], fa = "pc", show.legend = FALSE)

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  5

在此結果中我們可以看見，當factor為5個或是小於5時，其eigen value皆不大於1。故較為良好的factor數為5個。