#load data
data("PoliticalDemocracy")
str(PoliticalDemocracy)
## 'data.frame':    75 obs. of  11 variables:
##  $ y1: num  2.5 1.25 7.5 8.9 10 7.5 7.5 7.5 2.5 10 ...
##  $ y2: num  0 0 8.8 8.8 3.33 ...
##  $ y3: num  3.33 3.33 10 10 10 ...
##  $ y4: num  0 0 9.2 9.2 6.67 ...
##  $ y5: num  1.25 6.25 8.75 8.91 7.5 ...
##  $ y6: num  0 1.1 8.09 8.13 3.33 ...
##  $ y7: num  3.73 6.67 10 10 10 ...
##  $ y8: num  3.333 0.737 8.212 4.615 6.667 ...
##  $ x1: num  4.44 5.38 5.96 6.29 5.86 ...
##  $ x2: num  3.64 5.06 6.26 7.57 6.82 ...
##  $ x3: num  2.56 3.57 5.22 6.27 4.57 ...
dta <- PoliticalDemocracy
my_summary <- function(x) {
 require(moments)
 funs <- c(mean, sd, skewness, kurtosis)
 sapply(funs, function(f) f(x, na.rm = T))}
dta_desc <- apply(dta, 2, my_summary)
head(dta_desc)
##               y1        y2        y3        y4         y5        y6         y7
## [1,]  5.46466667 4.2564429  6.563110 4.4525330  5.1362519 2.9780741  6.1962639
## [2,]  2.62270200 3.9471276  3.280891 3.3494674  2.6126022 3.3727326  3.2862398
## [3,] -0.09330775 0.3251758 -0.606108 0.1201215 -0.2326106 0.9111375 -0.5645539
## [4,]  1.89574603 1.5744267  2.342953 1.8359537  2.2822132 2.6000463  2.3279219
##             y8        x1         x2         x3
## [1,] 4.0433897 5.0543838  4.7921946 3.55768979
## [2,] 3.2455927 0.7329043  1.5106644 1.40571124
## [3,] 0.4546483 0.2590885 -0.3528184 0.08551615
## [4,] 2.0939704 2.3072658  2.4952354 2.12005702
#rename rows
rownames(dta_desc) <- c("mean", "sd", "skewness", "kurtosis")
dta_desc
##                   y1        y2        y3        y4         y5        y6
## mean      5.46466667 4.2564429  6.563110 4.4525330  5.1362519 2.9780741
## sd        2.62270200 3.9471276  3.280891 3.3494674  2.6126022 3.3727326
## skewness -0.09330775 0.3251758 -0.606108 0.1201215 -0.2326106 0.9111375
## kurtosis  1.89574603 1.5744267  2.342953 1.8359537  2.2822132 2.6000463
##                  y7        y8        x1         x2         x3
## mean      6.1962639 4.0433897 5.0543838  4.7921946 3.55768979
## sd        3.2862398 3.2455927 0.7329043  1.5106644 1.40571124
## skewness -0.5645539 0.4546483 0.2590885 -0.3528184 0.08551615
## kurtosis  2.3279219 2.0939704 2.3072658  2.4952354 2.12005702
rslt1 <- as.data.frame(t(dta_desc))
rslt1 |> knitr::kable()
mean sd skewness kurtosis
y1 5.464667 2.6227020 -0.0933077 1.895746
y2 4.256443 3.9471276 0.3251758 1.574427
y3 6.563110 3.2808912 -0.6061080 2.342953
y4 4.452533 3.3494674 0.1201215 1.835954
y5 5.136252 2.6126022 -0.2326106 2.282213
y6 2.978074 3.3727326 0.9111375 2.600046
y7 6.196264 3.2862398 -0.5645539 2.327922
y8 4.043390 3.2455927 0.4546483 2.093970
x1 5.054384 0.7329043 0.2590885 2.307266
x2 4.792195 1.5106644 -0.3528184 2.495235
x3 3.557690 1.4057112 0.0855161 2.120057
# visualize and reshaping the wide data to long data
dtal_desc <- melt(dta_desc)
# assigning name to each column
names(dtal_desc)[1:2] <- c("moments", "items")
# detect extreme means
ggplot(data = subset(dtal_desc, moments == "mean"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["mean",])) + # mean of the means as a reference
 c(-1.5, 0, 1.5) * sd(t(dta_desc["mean", ])), # sd of the mean values
 linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "mean") +
  theme_bw()

The means of y6,y3 seem to be different from others.

# sd: the larger the better
ggplot(data = subset(dtal_desc, moments == "sd"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["sd",])) +  # mean of sd as reference
 c(-1.5, 0, 1.5) * sd(t(dta_desc["sd", ])), linetype = "dashed") + #sd of the sd
 coord_flip() +
 labs(x = "items",  y = "sd") +
  theme_bw()

Vertical line is the average sd of 11 items.

# Ideally skewness should close to 0 for normally distributed
ggplot(data = subset(dtal_desc, moments == "skewness"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["skewness",])) +
 c(-1.5, 0, 1.5) * sd(t(dta_desc["skewness", ])), linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "skewness") +
  theme_bw()

# the larger the kurtosis = item responses focus on a single option.
ggplot(data = subset(dtal_desc, moments == "kurtosis"),
 aes(x = reorder(items, value, max), y = value, group = moments)) +
 geom_point(size = 3) +
 geom_hline(yintercept = mean(t(dta_desc["kurtosis",])) +
 c(-1.5, 0, 1.5) * sd(t(dta_desc["kurtosis", ])), linetype = "dashed") +
 coord_flip() +
 labs(x = "items",  y = "kurtosis") +
  theme_bw()

Parallel analysis

fa.parallel(dta[, 1:11], fa = "pc", show.legend = FALSE)

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  2

Parallel analysis suggests that the number of factors = NA and the number of components = 2 or 6

Exploratory factor analysis

Factor loading

因素負載量越高越好,越能反映子構念=factor loading the higher, the better.

# from psych
print.psych(fa(dta[, 1:11],         # define the data
               nfactor = 2,         # number of factors based on parallel analysis 
               fm = "pa",           
               rotate = "Promax"),  
               cut = .3)            #不列印.3以下的因素負載量
## Factor Analysis using method =  pa
## Call: fa(r = dta[, 1:11], nfactors = 2, rotate = "Promax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      PA1   PA2   h2    u2 com
## y1  0.89       0.72 0.277 1.0
## y2  0.83       0.59 0.411 1.1
## y3  0.70       0.48 0.523 1.0
## y4  0.83       0.74 0.257 1.0
## y5  0.68       0.66 0.344 1.2
## y6  0.79       0.62 0.385 1.0
## y7  0.81       0.67 0.327 1.0
## y8  0.80       0.70 0.295 1.0
## x1        0.91 0.87 0.131 1.0
## x2        0.97 0.94 0.064 1.0
## x3        0.88 0.75 0.247 1.0
## 
##                        PA1  PA2
## SS loadings           5.08 2.66
## Proportion Var        0.46 0.24
## Cumulative Var        0.46 0.70
## Proportion Explained  0.66 0.34
## Cumulative Proportion 0.66 1.00
## 
##  With factor correlations of 
##      PA1  PA2
## PA1 1.00 0.48
## PA2 0.48 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  55  and the objective function was  9.74 with Chi Square of  677.07
## The degrees of freedom for the model are 34  and the objective function was  0.84 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic number of observations is  75 with the empirical chi square  14.95  with prob <  1 
## The total number of observations was  75  with Likelihood Chi Square =  57.06  with prob <  0.0079 
## 
## Tucker Lewis Index of factoring reliability =  0.939
## RMSEA index =  0.094  and the 90 % confidence intervals are  0.049 0.138
## BIC =  -89.74
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    PA1  PA2
## Correlation of (regression) scores with factors   0.97 0.98
## Multiple R square of scores with factors          0.94 0.96
## Minimum correlation of possible factor scores     0.88 0.92
print.psych(fa(dta[, 1:11],        
               nfactor = 6,         
               fm = "pa",           
               rotate = "Promax"),  
               cut = .3)           
## maximum iteration exceeded
## Factor Analysis using method =  pa
## Call: fa(r = dta[, 1:11], nfactors = 6, rotate = "Promax", fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      PA2   PA1   PA3   PA4   PA6   PA5   h2    u2 com
## y1        1.07                         0.96 0.043 1.1
## y2              0.82                   0.79 0.214 1.2
## y3        0.51                         0.59 0.408 1.9
## y4              0.35              0.64 0.88 0.120 1.7
## y5        0.55                         0.69 0.313 1.6
## y6              0.70                   0.81 0.195 1.4
## y7                    0.86             0.92 0.083 1.1
## y8                          0.64       0.88 0.118 1.4
## x1  0.91                               0.89 0.106 1.1
## x2  0.98                               0.93 0.072 1.0
## x3  0.92                               0.78 0.222 1.1
## 
##                        PA2  PA1  PA3  PA4  PA6  PA5
## SS loadings           2.70 2.08 1.85 0.99 0.74 0.74
## Proportion Var        0.25 0.19 0.17 0.09 0.07 0.07
## Cumulative Var        0.25 0.43 0.60 0.69 0.76 0.83
## Proportion Explained  0.30 0.23 0.20 0.11 0.08 0.08
## Cumulative Proportion 0.30 0.52 0.73 0.84 0.92 1.00
## 
##  With factor correlations of 
##      PA2  PA1  PA3  PA4  PA6  PA5
## PA2 1.00 0.45 0.35 0.45 0.48 0.40
## PA1 0.45 1.00 0.63 0.73 0.61 0.68
## PA3 0.35 0.63 1.00 0.58 0.60 0.50
## PA4 0.45 0.73 0.58 1.00 0.57 0.70
## PA6 0.48 0.61 0.60 0.57 1.00 0.40
## PA5 0.40 0.68 0.50 0.70 0.40 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 6 factors are sufficient.
## 
## The degrees of freedom for the null model are  55  and the objective function was  9.74 with Chi Square of  677.07
## The degrees of freedom for the model are 4  and the objective function was  0.07 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.02 
## 
## The harmonic number of observations is  75 with the empirical chi square  0.31  with prob <  0.99 
## The total number of observations was  75  with Likelihood Chi Square =  4.58  with prob <  0.33 
## 
## Tucker Lewis Index of factoring reliability =  0.986
## RMSEA index =  0.042  and the 90 % confidence intervals are  0 0.186
## BIC =  -12.69
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    PA2  PA1  PA3  PA4  PA6  PA5
## Correlation of (regression) scores with factors   0.98 0.98 0.94 0.96 0.91 0.90
## Multiple R square of scores with factors          0.96 0.97 0.88 0.91 0.82 0.81
## Minimum correlation of possible factor scores     0.93 0.93 0.76 0.83 0.65 0.61

Items Selcetion

My selection process:
Step 1: eliminate items that cover more than one factor.
Step 2: select 3 items having highest h2 and lower u2 within each sub-scale.
Step 3: check com number to make sure the selected items are closed to 1.

Eliminate y3 because the h2 is relatively low and the com is relatively not closed to 1. Also, even if y3 is removed, there are still two items for the component PA1.