keys <-
list(agree=c("-A1","A2","A3","A4","A5"),conscientious=c("C1","C2","C3","-C4","-C5"),
extraversion=c("-E1","-E2","E3","E4","E5"),neuroticism=c("N1","N2","N3","N4","N5"),
openness = c("O1","-O2","O3","O4","-O5"))
head(keys)
## $agree
## [1] "-A1" "A2" "A3" "A4" "A5"
##
## $conscientious
## [1] "C1" "C2" "C3" "-C4" "-C5"
##
## $extraversion
## [1] "-E1" "-E2" "E3" "E4" "E5"
##
## $neuroticism
## [1] "N1" "N2" "N3" "N4" "N5"
##
## $openness
## [1] "O1" "-O2" "O3" "O4" "-O5"
data("bfi")
dta_bfi <- bfi
dta_bfi[,c(1,9,10,11,12,22)]= 7- dta_bfi[,c(1,9,10,11,12,22)]
head(bfi)
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2 O3 O4
## 61617 2 4 3 4 4 2 3 3 4 4 3 3 3 4 4 3 4 2 2 3 3 6 3 4
## 61618 2 4 5 2 5 5 4 4 3 4 1 1 6 4 3 3 3 3 5 5 4 2 4 3
## 61620 5 4 5 4 4 4 5 4 2 5 2 4 4 4 5 4 5 4 2 3 4 2 5 5
## 61621 4 4 6 5 5 4 4 3 5 5 5 3 4 4 4 2 5 2 4 1 3 3 4 3
## 61622 2 3 3 4 5 4 4 5 3 2 2 2 5 4 5 2 3 4 4 3 3 3 4 3
## 61623 6 6 5 6 5 6 6 6 1 3 2 1 6 5 6 3 5 2 2 3 4 3 5 6
## O5 gender education age
## 61617 3 1 NA 16
## 61618 3 2 NA 18
## 61620 2 2 NA 17
## 61621 5 2 NA 17
## 61622 3 1 NA 17
## 61623 1 2 3 21
head(dta_bfi)
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2 O3 O4
## 61617 5 4 3 4 4 2 3 3 3 3 4 4 3 4 4 3 4 2 2 3 3 1 3 4
## 61618 5 4 5 2 5 5 4 4 4 3 6 6 6 4 3 3 3 3 5 5 4 5 4 3
## 61620 2 4 5 4 4 4 5 4 5 2 5 3 4 4 5 4 5 4 2 3 4 5 5 5
## 61621 3 4 6 5 5 4 4 3 2 2 2 4 4 4 4 2 5 2 4 1 3 4 4 3
## 61622 5 3 3 4 5 4 4 5 4 5 5 5 5 4 5 2 3 4 4 3 3 4 4 3
## 61623 1 6 5 6 5 6 6 6 6 4 5 6 6 5 6 3 5 2 2 3 4 4 5 6
## O5 gender education age
## 61617 3 1 NA 16
## 61618 3 2 NA 18
## 61620 2 2 NA 17
## 61621 5 2 NA 17
## 61622 3 1 NA 17
## 61623 1 2 3 21
str(dta_bfi)
## 'data.frame': 2800 obs. of 28 variables:
## $ A1 : num 5 5 2 3 5 1 5 3 3 5 ...
## $ A2 : int 4 4 4 4 3 6 5 3 3 5 ...
## $ A3 : int 3 5 5 6 3 5 5 1 6 6 ...
## $ A4 : int 4 2 4 5 4 6 3 5 3 6 ...
## $ A5 : int 4 5 4 5 5 5 5 1 3 5 ...
## $ C1 : int 2 5 4 4 4 6 5 3 6 6 ...
## $ C2 : int 3 4 5 4 4 6 4 2 6 5 ...
## $ C3 : int 3 4 4 3 5 6 4 4 3 6 ...
## $ C4 : num 3 4 5 2 4 6 5 5 3 5 ...
## $ C5 : num 3 3 2 2 5 4 4 3 2 6 ...
## $ E1 : num 4 6 5 2 5 5 3 4 2 5 ...
## $ E2 : num 4 6 3 4 5 6 4 1 4 5 ...
## $ E3 : int 3 6 4 4 5 6 4 4 NA 4 ...
## $ E4 : int 4 4 4 4 4 5 5 2 4 5 ...
## $ E5 : int 4 3 5 4 5 6 5 1 3 5 ...
## $ N1 : int 3 3 4 2 2 3 1 6 5 5 ...
## $ N2 : int 4 3 5 5 3 5 2 3 5 5 ...
## $ N3 : int 2 3 4 2 4 2 2 2 2 5 ...
## $ N4 : int 2 5 2 4 4 2 1 6 3 2 ...
## $ N5 : int 3 5 3 1 3 3 1 4 3 4 ...
## $ O1 : int 3 4 4 3 3 4 5 3 6 5 ...
## $ O2 : num 1 5 5 4 4 4 5 5 1 6 ...
## $ O3 : int 3 4 5 4 4 5 5 4 6 5 ...
## $ O4 : int 4 3 5 3 3 6 6 5 6 5 ...
## $ O5 : int 3 3 2 5 3 1 1 3 1 2 ...
## $ gender : int 1 2 2 2 1 2 1 1 1 2 ...
## $ education: int NA NA NA NA NA 3 NA 2 1 NA ...
## $ age : int 16 18 17 17 17 21 18 19 19 17 ...
summary(dta_bfi)
## A1 A2 A3 A4 A5
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.00
## 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.0 1st Qu.:4.00
## Median :5.000 Median :5.000 Median :5.000 Median :5.0 Median :5.00
## Mean :4.587 Mean :4.802 Mean :4.604 Mean :4.7 Mean :4.56
## 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:6.0 3rd Qu.:5.00
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.0 Max. :6.00
## NA's :16 NA's :27 NA's :26 NA's :19 NA's :16
## C1 C2 C3 C4 C5
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:4.00 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000
## Median :5.000 Median :5.00 Median :5.000 Median :5.000 Median :4.000
## Mean :4.502 Mean :4.37 Mean :4.304 Mean :4.447 Mean :3.703
## 3rd Qu.:5.000 3rd Qu.:5.00 3rd Qu.:5.000 3rd Qu.:6.000 3rd Qu.:5.000
## Max. :6.000 Max. :6.00 Max. :6.000 Max. :6.000 Max. :6.000
## NA's :21 NA's :24 NA's :20 NA's :26 NA's :16
## E1 E2 E3 E4
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:3.000 1st Qu.:4.000
## Median :4.000 Median :4.000 Median :4.000 Median :5.000
## Mean :4.026 Mean :3.858 Mean :4.001 Mean :4.422
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:6.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## NA's :23 NA's :16 NA's :25 NA's :9
## E5 N1 N2 N3
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :5.000 Median :3.000 Median :4.000 Median :3.000
## Mean :4.416 Mean :2.929 Mean :3.508 Mean :3.217
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :6.000 Max. :6.000 Max. :6.000 Max. :6.000
## NA's :21 NA's :22 NA's :21 NA's :11
## N4 N5 O1 O2 O3
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.00 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:4.000
## Median :3.000 Median :3.00 Median :5.000 Median :5.000 Median :5.000
## Mean :3.186 Mean :2.97 Mean :4.816 Mean :4.287 Mean :4.438
## 3rd Qu.:4.000 3rd Qu.:4.00 3rd Qu.:6.000 3rd Qu.:6.000 3rd Qu.:5.000
## Max. :6.000 Max. :6.00 Max. :6.000 Max. :6.000 Max. :6.000
## NA's :36 NA's :29 NA's :22 NA's :28
## O4 O5 gender education age
## Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.00 Min. : 3.00
## 1st Qu.:4.000 1st Qu.:1.00 1st Qu.:1.000 1st Qu.:3.00 1st Qu.:20.00
## Median :5.000 Median :2.00 Median :2.000 Median :3.00 Median :26.00
## Mean :4.892 Mean :2.49 Mean :1.672 Mean :3.19 Mean :28.78
## 3rd Qu.:6.000 3rd Qu.:3.00 3rd Qu.:2.000 3rd Qu.:4.00 3rd Qu.:35.00
## Max. :6.000 Max. :6.00 Max. :2.000 Max. :5.00 Max. :86.00
## NA's :14 NA's :20 NA's :223
從summary中檢視是否有極端值
dta_bfi <- dta_bfi[, 1:25]
dta_bfi <- na.omit(dta_bfi)
擷取我們要的資料,其他的不要
str(dta_bfi)
## 'data.frame': 2436 obs. of 25 variables:
## $ A1: num 5 5 2 3 5 1 5 3 5 3 ...
## $ A2: int 4 4 4 4 3 6 5 3 5 4 ...
## $ A3: int 3 5 5 6 3 5 5 1 6 5 ...
## $ A4: int 4 2 4 5 4 6 3 5 6 6 ...
## $ A5: int 4 5 4 5 5 5 5 1 5 5 ...
## $ C1: int 2 5 4 4 4 6 5 3 6 4 ...
## $ C2: int 3 4 5 4 4 6 4 2 5 3 ...
## $ C3: int 3 4 4 3 5 6 4 4 6 5 ...
## $ C4: num 3 4 5 2 4 6 5 5 5 4 ...
## $ C5: num 3 3 2 2 5 4 4 3 6 5 ...
## $ E1: num 4 6 5 2 5 5 3 4 5 6 ...
## $ E2: num 4 6 3 4 5 6 4 1 5 4 ...
## $ E3: int 3 6 4 4 5 6 4 4 4 2 ...
## $ E4: int 4 4 4 4 4 5 5 2 5 5 ...
## $ E5: int 4 3 5 4 5 6 5 1 5 4 ...
## $ N1: int 3 3 4 2 2 3 1 6 5 3 ...
## $ N2: int 4 3 5 5 3 5 2 3 5 3 ...
## $ N3: int 2 3 4 2 4 2 2 2 5 4 ...
## $ N4: int 2 5 2 4 4 2 1 6 2 2 ...
## $ N5: int 3 5 3 1 3 3 1 4 4 3 ...
## $ O1: int 3 4 4 3 3 4 5 3 5 5 ...
## $ O2: num 1 5 5 4 4 4 5 5 6 4 ...
## $ O3: int 3 4 5 4 4 5 5 4 5 5 ...
## $ O4: int 4 3 5 3 3 6 6 5 5 6 ...
## $ O5: int 3 3 2 5 3 1 1 3 2 3 ...
## - attr(*, "na.action")= 'omit' Named int [1:364] 9 12 35 42 63 66 72 90 101 107 ...
## ..- attr(*, "names")= chr [1:364] "61630" "61636" "61684" "61693" ...
去掉所有NA,剩下1140筆資料
head(dta_bfi)
## A1 A2 A3 A4 A5 C1 C2 C3 C4 C5 E1 E2 E3 E4 E5 N1 N2 N3 N4 N5 O1 O2 O3 O4
## 61617 5 4 3 4 4 2 3 3 3 3 4 4 3 4 4 3 4 2 2 3 3 1 3 4
## 61618 5 4 5 2 5 5 4 4 4 3 6 6 6 4 3 3 3 3 5 5 4 5 4 3
## 61620 2 4 5 4 4 4 5 4 5 2 5 3 4 4 5 4 5 4 2 3 4 5 5 5
## 61621 3 4 6 5 5 4 4 3 2 2 2 4 4 4 4 2 5 2 4 1 3 4 4 3
## 61622 5 3 3 4 5 4 4 5 4 5 5 5 5 4 5 2 3 4 4 3 3 4 4 3
## 61623 1 6 5 6 5 6 6 6 6 4 5 6 6 5 6 3 5 2 2 3 4 4 5 6
## O5
## 61617 3
## 61618 3
## 61620 2
## 61621 5
## 61622 3
## 61623 1
my_summary <- function(x) {
require(moments)
funs <- c(mean, sd, skewness, kurtosis)
sapply(funs, function(f) f(x, na.rm = T))
}
dta_desc <- apply(dta_bfi, 2, my_summary)
2:colume 1:row 同時計算mean, sd, skewness, kurtosis 會給出一個矩陣
head(dta_desc)
## A1 A2 A3 A4 A5 C1 C2
## [1,] 4.5935961 4.797209 4.598522 4.687603 4.5435140 4.5250411 4.3723317
## [2,] 1.4071773 1.179535 1.311355 1.485213 1.2708043 1.2352576 1.3191521
## [3,] -0.8384803 -1.119490 -1.014388 -1.024196 -0.8444385 -0.8669062 -0.7382632
## [4,] 2.7223329 4.009703 3.467126 3.018136 3.1294876 3.3250076 2.8503002
## C3 C4 C5 E1 E2 E3
## [1,] 4.3000821 4.4503284 3.6941708 4.0213465 3.8456486 3.9844007
## [2,] 1.2912023 1.3766891 1.6327195 1.6314277 1.6138471 1.3517662
## [3,] -0.6780534 -0.6152687 -0.0605661 -0.3743337 -0.2267567 -0.4665917
## [4,] 2.8549709 2.4130413 1.7692151 1.9082372 1.8498982 2.5422154
## E4 E5 N1 N2 N3 N4 N5
## [1,] 4.408867 4.3908046 2.9437603 3.51765189 3.224548 3.2023810 2.971264
## [2,] 1.467060 1.3433163 1.5759089 1.53323764 1.594674 1.5696331 1.623491
## [3,] -0.821241 -0.7789745 0.3691979 -0.08255376 0.141210 0.1973966 0.377573
## [4,] 2.682392 2.8914674 1.9807563 1.94293789 1.820778 1.9165557 1.933888
## O1 O2 O3 O4 O5
## [1,] 4.812808 4.315271 4.4499179 4.925287 2.468801
## [2,] 1.126613 1.552883 1.2052060 1.193136 1.324021
## [3,] -0.904962 -0.616251 -0.7700466 -1.238690 0.764451
## [4,] 3.460237 2.238317 3.3245519 4.178908 2.817845
1:mean 2:sd 3:skewness 4:kurtosis
rownames(dta_desc) <- c("mean", "sd", "skewness", "kurtosis")
rownames(dta_desc) <- c("mean", "sd", "skewness", "kurtosis")
幫每個row取名字
rslt1 <- as.data.frame(t(dta_desc))
rslt1 |> knitr::kable()
| mean | sd | skewness | kurtosis | |
|---|---|---|---|---|
| A1 | 4.593596 | 1.407177 | -0.8384803 | 2.722333 |
| A2 | 4.797208 | 1.179535 | -1.1194896 | 4.009703 |
| A3 | 4.598522 | 1.311355 | -1.0143878 | 3.467126 |
| A4 | 4.687603 | 1.485213 | -1.0241965 | 3.018137 |
| A5 | 4.543514 | 1.270804 | -0.8444385 | 3.129488 |
| C1 | 4.525041 | 1.235258 | -0.8669062 | 3.325008 |
| C2 | 4.372332 | 1.319152 | -0.7382632 | 2.850300 |
| C3 | 4.300082 | 1.291202 | -0.6780534 | 2.854971 |
| C4 | 4.450328 | 1.376689 | -0.6152687 | 2.413041 |
| C5 | 3.694171 | 1.632720 | -0.0605661 | 1.769215 |
| E1 | 4.021346 | 1.631428 | -0.3743337 | 1.908237 |
| E2 | 3.845649 | 1.613847 | -0.2267567 | 1.849898 |
| E3 | 3.984401 | 1.351766 | -0.4665917 | 2.542215 |
| E4 | 4.408867 | 1.467060 | -0.8212410 | 2.682392 |
| E5 | 4.390805 | 1.343316 | -0.7789745 | 2.891467 |
| N1 | 2.943760 | 1.575909 | 0.3691979 | 1.980756 |
| N2 | 3.517652 | 1.533238 | -0.0825538 | 1.942938 |
| N3 | 3.224548 | 1.594674 | 0.1412100 | 1.820778 |
| N4 | 3.202381 | 1.569633 | 0.1973966 | 1.916556 |
| N5 | 2.971264 | 1.623491 | 0.3775730 | 1.933888 |
| O1 | 4.812808 | 1.126613 | -0.9049620 | 3.460237 |
| O2 | 4.315271 | 1.552883 | -0.6162510 | 2.238317 |
| O3 | 4.449918 | 1.205206 | -0.7700466 | 3.324552 |
| O4 | 4.925287 | 1.193136 | -1.2386895 | 4.178908 |
| O5 | 2.468801 | 1.324021 | 0.7644510 | 2.817845 |
把矩陣轉為data frame
t的意思為轉置,讓看資料的形式變得不一樣(變成直的)
dtal_desc <- melt(dta_desc)
來自reshape package,讓資料變成長形的(l)
names(dtal_desc)[1:2] <- c("moments", "items")
命名第一個column為moments,第二個為items
head(dtal_desc)
## moments items value
## 1 mean A1 4.5935961
## 2 sd A1 1.4071773
## 3 skewness A1 -0.8384803
## 4 kurtosis A1 2.7223329
## 5 mean A2 4.7972085
## 6 sd A2 1.1795348
ggplot(data = subset(dtal_desc, moments == "mean"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["mean",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["mean", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "mean") +
theme_bw()
告訴ggplot資料為dtal_desc,只擷取moments 中的mean。
aes告知ggplot的x跟y為何。x的部分reorder(由大到小) group=指定
geom_point:如何呈現點(point)
geom_hline:如何呈現直線(根據y軸的intercept)
根據mean劃出標準差
coord_flip():將圖轉90度
theme_bw():讓圖為黑白,若沒有的話圖就是灰色的
圖有超過1.5個標準差的點:有outlier(N5、N1、O5)
ggplot(data = subset(dtal_desc, moments == "sd"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["sd",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["sd", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "sd") +
theme_bw()
此圖為將標準差抽出來看,從這個圖中我們可以看見個別題目的標準差與標準差的平均數的差異。 我們可以從圖中看見O1為Outlier
ggplot(data = subset(dtal_desc, moments == "skewness"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["skewness",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["skewness", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "skewness") +
theme_bw()
此圖為將標準差抽出來看,從這個圖中我們可以看見個別題目的skewness與skewness的平均數的差異。
從圖中,我們可以得知O5、N5、N1為Outlier
ggplot(data = subset(dtal_desc, moments == "kurtosis"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["kurtosis",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["kurtosis", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "kurtosis") +
theme_bw()
此圖為將標準差抽出來看,從這個圖中我們可以看見個別題目的kurtosis與kurtosis的平均數的差異。
從圖中我們可以得知O4、A2為outlier
計算區變度,以總分為主,選取低分組與高分組,比較各題在兩組上的差異。
dta_bfi <- apply(dta_bfi, 1, sum)
將每個row加起來
dta_bfi$grp <- NA
## Warning in dta_bfi$grp <- NA: 把公式左側強迫變成串列
在dta這個data中多出一個grp的row,並且都顯示NA
dta_bfi$grp[rank(dta_bfi$tot) < 2436*.27] <- "L"
dta_bfi$grp[rank(dta_bfi$tot) > 2436*.73] <- "H"
dta_bfi$grp <- factor(dta_bfi$grp)
head(dta_bfi)
## $`61617`
## [1] 81
##
## $`61618`
## [1] 104
##
## $`61620`
## [1] 99
##
## $`61621`
## [1] 89
##
## $`61622`
## [1] 99
##
## $`61623`
## [1] 114
算高低分組平均數 dta_bfi\(grp[rank(dta_bfi\)tot) < 1140.27] <- “L”:只要小於1140.27的顯示為L
dta_bfi\(grp[rank(dta_bfi\)tot) > 1140.73] <- “H”: 只要大於1140.73的顯示為H
若不是高分組也不是低分組:顯示為NA
dtam <- aggregate(dta_bfi[, 1:25], by=list(dta_bfi$grp), mean) print(dtam)
dtam <- aggregate(dta_bfi[, 1:9], by=list(dta_bfi$grp),
mean):計算高分組跟低分組的MEAN
print(dtam):將第一欄刪除
dtam <- t(dtam[, -1])
t-test 用T-test看差別(高於1.96或2:可以區分)
item_t <- sapply(data_omit[, 1:25], function(x) t.test(x ~ data_omit\(grp)\)statistic) print(item_t)
若數值大於2,則統計檢定結果為顯著