# load data (PoliticalDemocracy ships with the lavaan package)
library(lavaan)
data("PoliticalDemocracy")
str(PoliticalDemocracy)
## 'data.frame': 75 obs. of 11 variables:
## $ y1: num 2.5 1.25 7.5 8.9 10 7.5 7.5 7.5 2.5 10 ...
## $ y2: num 0 0 8.8 8.8 3.33 ...
## $ y3: num 3.33 3.33 10 10 10 ...
## $ y4: num 0 0 9.2 9.2 6.67 ...
## $ y5: num 1.25 6.25 8.75 8.91 7.5 ...
## $ y6: num 0 1.1 8.09 8.13 3.33 ...
## $ y7: num 3.73 6.67 10 10 10 ...
## $ y8: num 3.333 0.737 8.212 4.615 6.667 ...
## $ x1: num 4.44 5.38 5.96 6.29 5.86 ...
## $ x2: num 3.64 5.06 6.26 7.57 6.82 ...
## $ x3: num 2.56 3.57 5.22 6.27 4.57 ...
dta <- PoliticalDemocracy
my_summary <- function(x) {
  require(moments)  # skewness() and kurtosis() come from the moments package
  funs <- c(mean, sd, skewness, kurtosis)
  sapply(funs, function(f) f(x, na.rm = TRUE))
}
dta_desc <- apply(dta, 2, my_summary)
head(dta_desc)
## y1 y2 y3 y4 y5 y6 y7
## [1,] 5.46466667 4.2564429 6.563110 4.4525330 5.1362519 2.9780741 6.1962639
## [2,] 2.62270200 3.9471276 3.280891 3.3494674 2.6126022 3.3727326 3.2862398
## [3,] -0.09330775 0.3251758 -0.606108 0.1201215 -0.2326106 0.9111375 -0.5645539
## [4,] 1.89574603 1.5744267 2.342953 1.8359537 2.2822132 2.6000463 2.3279219
## y8 x1 x2 x3
## [1,] 4.0433897 5.0543838 4.7921946 3.55768979
## [2,] 3.2455927 0.7329043 1.5106644 1.40571124
## [3,] 0.4546483 0.2590885 -0.3528184 0.08551615
## [4,] 2.0939704 2.3072658 2.4952354 2.12005702
#rename rows
rownames(dta_desc) <- c("mean", "sd", "skewness", "kurtosis")
dta_desc
## y1 y2 y3 y4 y5 y6
## mean 5.46466667 4.2564429 6.563110 4.4525330 5.1362519 2.9780741
## sd 2.62270200 3.9471276 3.280891 3.3494674 2.6126022 3.3727326
## skewness -0.09330775 0.3251758 -0.606108 0.1201215 -0.2326106 0.9111375
## kurtosis 1.89574603 1.5744267 2.342953 1.8359537 2.2822132 2.6000463
## y7 y8 x1 x2 x3
## mean 6.1962639 4.0433897 5.0543838 4.7921946 3.55768979
## sd 3.2862398 3.2455927 0.7329043 1.5106644 1.40571124
## skewness -0.5645539 0.4546483 0.2590885 -0.3528184 0.08551615
## kurtosis 2.3279219 2.0939704 2.3072658 2.4952354 2.12005702
rslt1 <- as.data.frame(t(dta_desc))
rslt1 |> knitr::kable()
| | mean | sd | skewness | kurtosis |
|---|---|---|---|---|
| y1 | 5.464667 | 2.6227020 | -0.0933077 | 1.895746 |
| y2 | 4.256443 | 3.9471276 | 0.3251758 | 1.574427 |
| y3 | 6.563110 | 3.2808912 | -0.6061080 | 2.342953 |
| y4 | 4.452533 | 3.3494674 | 0.1201215 | 1.835954 |
| y5 | 5.136252 | 2.6126022 | -0.2326106 | 2.282213 |
| y6 | 2.978074 | 3.3727326 | 0.9111375 | 2.600046 |
| y7 | 6.196264 | 3.2862398 | -0.5645539 | 2.327922 |
| y8 | 4.043390 | 3.2455927 | 0.4546483 | 2.093970 |
| x1 | 5.054384 | 0.7329043 | 0.2590885 | 2.307266 |
| x2 | 4.792195 | 1.5106644 | -0.3528184 | 2.495235 |
| x3 | 3.557690 | 1.4057112 | 0.0855161 | 2.120057 |
# reshape the wide descriptive table to long format for plotting
library(reshape2)  # melt()
library(ggplot2)
dtal_desc <- melt(dta_desc)
# name the first two columns
names(dtal_desc)[1:2] <- c("moments", "items")
# detect extreme means
ggplot(data = subset(dtal_desc, moments == "mean"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["mean",])) + # mean of the means as a reference
c(-1.5, 0, 1.5) * sd(t(dta_desc["mean", ])), # sd of the mean values
linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "mean") +
theme_bw()
The means of y6 and y3 appear to deviate noticeably from those of the other items.
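As a numeric counterpart to the plot, the same ±1.5 sd rule can be applied directly to the item means; this is only a sketch mirroring the dashed reference lines in the figure.
m <- dta_desc["mean", ]                 # the 11 item means
cut_lo <- mean(m) - 1.5 * sd(m)         # lower reference line
cut_hi <- mean(m) + 1.5 * sd(m)         # upper reference line
names(m)[m < cut_lo | m > cut_hi]       # items with unusually low or high means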
# sd: the larger, the better (more variability to distinguish between cases)
ggplot(data = subset(dtal_desc, moments == "sd"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["sd",])) + # mean of sd as reference
c(-1.5, 0, 1.5) * sd(t(dta_desc["sd", ])), linetype = "dashed") + #sd of the sd
coord_flip() +
labs(x = "items", y = "sd") +
theme_bw()
The middle dashed line marks the average sd of the 11 items; the outer lines mark ±1.5 sd around that average.
# Ideally, skewness should be close to 0 for normally distributed items
ggplot(data = subset(dtal_desc, moments == "skewness"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["skewness",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["skewness", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "skewness") +
theme_bw()
# the larger the kurtosis, the more the item responses concentrate on a single option
ggplot(data = subset(dtal_desc, moments == "kurtosis"),
aes(x = reorder(items, value, max), y = value, group = moments)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(t(dta_desc["kurtosis",])) +
c(-1.5, 0, 1.5) * sd(t(dta_desc["kurtosis", ])), linetype = "dashed") +
coord_flip() +
labs(x = "items", y = "kurtosis") +
theme_bw()
library(psych)  # fa.parallel() and fa()
fa.parallel(dta[, 1:11], fa = "pc", show.legend = FALSE)
## Parallel analysis suggests that the number of factors = NA and the number of components = 2
Because fa = "pc" only evaluates principal components, the number of factors is reported as NA; the analysis suggests 2 components. Both a 2-factor and a 6-factor solution are examined below.
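A hedged sketch: re-running the parallel analysis with fa = "both" also returns a suggested number of factors; the suggestion can vary slightly because the procedure simulates random data, so a seed is set here.
library(psych)
set.seed(123)  # parallel analysis simulates random data, so fix the seed for reproducibility
fa.parallel(dta[, 1:11], fa = "both", show.legend = FALSE)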
The higher an item's factor loading, the better it reflects its sub-construct.
# from the psych package
print.psych(fa(dta[, 1:11],          # the data
               nfactors = 2,         # number of factors based on parallel analysis
               fm = "pa",            # principal axis factoring
               rotate = "Promax"),   # oblique rotation
            cut = .3)                # do not print loadings below .3
## Factor Analysis using method = pa
## Call: fa(r = dta[, 1:11], nfactors = 2, rotate = "Promax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA1 PA2 h2 u2 com
## y1 0.89 0.72 0.277 1.0
## y2 0.83 0.59 0.411 1.1
## y3 0.70 0.48 0.523 1.0
## y4 0.83 0.74 0.257 1.0
## y5 0.68 0.66 0.344 1.2
## y6 0.79 0.62 0.385 1.0
## y7 0.81 0.67 0.327 1.0
## y8 0.80 0.70 0.295 1.0
## x1 0.91 0.87 0.131 1.0
## x2 0.97 0.94 0.064 1.0
## x3 0.88 0.75 0.247 1.0
##
## PA1 PA2
## SS loadings 5.08 2.66
## Proportion Var 0.46 0.24
## Cumulative Var 0.46 0.70
## Proportion Explained 0.66 0.34
## Cumulative Proportion 0.66 1.00
##
## With factor correlations of
## PA1 PA2
## PA1 1.00 0.48
## PA2 0.48 1.00
##
## Mean item complexity = 1
## Test of the hypothesis that 2 factors are sufficient.
##
## The degrees of freedom for the null model are 55 and the objective function was 9.74 with Chi Square of 677.07
## The degrees of freedom for the model are 34 and the objective function was 0.84
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.05
##
## The harmonic number of observations is 75 with the empirical chi square 14.95 with prob < 1
## The total number of observations was 75 with Likelihood Chi Square = 57.06 with prob < 0.0079
##
## Tucker Lewis Index of factoring reliability = 0.939
## RMSEA index = 0.094 and the 90 % confidence intervals are 0.049 0.138
## BIC = -89.74
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## PA1 PA2
## Correlation of (regression) scores with factors 0.97 0.98
## Multiple R square of scores with factors 0.94 0.96
## Minimum correlation of possible factor scores 0.88 0.92
print.psych(fa(dta[, 1:11],
               nfactors = 6,         # refit with 6 factors for comparison
               fm = "pa",
               rotate = "Promax"),
            cut = .3)
## maximum iteration exceeded
## Factor Analysis using method = pa
## Call: fa(r = dta[, 1:11], nfactors = 6, rotate = "Promax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
## PA2 PA1 PA3 PA4 PA6 PA5 h2 u2 com
## y1 1.07 0.96 0.043 1.1
## y2 0.82 0.79 0.214 1.2
## y3 0.51 0.59 0.408 1.9
## y4 0.35 0.64 0.88 0.120 1.7
## y5 0.55 0.69 0.313 1.6
## y6 0.70 0.81 0.195 1.4
## y7 0.86 0.92 0.083 1.1
## y8 0.64 0.88 0.118 1.4
## x1 0.91 0.89 0.106 1.1
## x2 0.98 0.93 0.072 1.0
## x3 0.92 0.78 0.222 1.1
##
## PA2 PA1 PA3 PA4 PA6 PA5
## SS loadings 2.70 2.08 1.85 0.99 0.74 0.74
## Proportion Var 0.25 0.19 0.17 0.09 0.07 0.07
## Cumulative Var 0.25 0.43 0.60 0.69 0.76 0.83
## Proportion Explained 0.30 0.23 0.20 0.11 0.08 0.08
## Cumulative Proportion 0.30 0.52 0.73 0.84 0.92 1.00
##
## With factor correlations of
## PA2 PA1 PA3 PA4 PA6 PA5
## PA2 1.00 0.45 0.35 0.45 0.48 0.40
## PA1 0.45 1.00 0.63 0.73 0.61 0.68
## PA3 0.35 0.63 1.00 0.58 0.60 0.50
## PA4 0.45 0.73 0.58 1.00 0.57 0.70
## PA6 0.48 0.61 0.60 0.57 1.00 0.40
## PA5 0.40 0.68 0.50 0.70 0.40 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 6 factors are sufficient.
##
## The degrees of freedom for the null model are 55 and the objective function was 9.74 with Chi Square of 677.07
## The degrees of freedom for the model are 4 and the objective function was 0.07
##
## The root mean square of the residuals (RMSR) is 0.01
## The df corrected root mean square of the residuals is 0.02
##
## The harmonic number of observations is 75 with the empirical chi square 0.31 with prob < 0.99
## The total number of observations was 75 with Likelihood Chi Square = 4.58 with prob < 0.33
##
## Tucker Lewis Index of factoring reliability = 0.986
## RMSEA index = 0.042 and the 90 % confidence intervals are 0 0.186
## BIC = -12.69
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## PA2 PA1 PA3 PA4 PA6 PA5
## Correlation of (regression) scores with factors 0.98 0.98 0.94 0.96 0.91 0.90
## Multiple R square of scores with factors 0.96 0.97 0.88 0.91 0.82 0.81
## Minimum correlation of possible factor scores 0.93 0.93 0.76 0.83 0.65 0.61
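For a quick side-by-side comparison of the two solutions, the global fit indices printed above can also be pulled directly from the fitted objects. A minimal sketch, assuming the psych fa object exposes TLI, RMSEA, and BIC elements:
library(psych)
fit2 <- fa(dta[, 1:11], nfactors = 2, fm = "pa", rotate = "Promax")
fit6 <- fa(dta[, 1:11], nfactors = 6, fm = "pa", rotate = "Promax")
# collect a few global fit indices from each solution
data.frame(model = c("2 factors", "6 factors"),
           TLI   = c(fit2$TLI, fit6$TLI),
           RMSEA = c(fit2$RMSEA[1], fit6$RMSEA[1]),
           BIC   = c(fit2$BIC, fit6$BIC))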
My selection process:
Step 1: eliminate items that load on more than one factor.
Step 2: within each sub-scale, select the 3 items with the highest h2 (and hence the lowest u2).
Step 3: check that the com values of the selected items are close to 1.
Eliminate y3 because its h2 is relatively low and its com is not close to 1; even with y3 removed, there are still two items loading on PA1.
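To make these steps reproducible, here is a rough sketch (not part of the original analysis) that flags cross-loading items and ranks the remaining items by h2 within their dominant factor; it assumes the psych fa object exposes loadings, communality, and complexity elements.
library(psych)
fit6 <- fa(dta[, 1:11], nfactors = 6, fm = "pa", rotate = "Promax")
L <- unclass(fit6$loadings)                              # pattern matrix as a plain matrix
tab <- data.frame(item        = rownames(L),
                  main_factor = colnames(L)[apply(abs(L), 1, which.max)],
                  n_salient   = rowSums(abs(L) >= .3),   # Step 1: count salient loadings
                  h2          = fit6$communality,        # Step 2: communality
                  com         = fit6$complexity)         # Step 3: item complexity
# keep items with exactly one salient loading, then take the 3 highest h2 per factor
keep <- subset(tab, n_salient == 1)
keep <- do.call(rbind, lapply(split(keep, keep$main_factor),
                              function(d) head(d[order(-d$h2), ], 3)))
keep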