HW10

Read data

data(PoliticalDemocracy) # the data
dim(PoliticalDemocracy) # columns and rows

## [1] 75 11

head(PoliticalDemocracy) # the first six data

##      y1       y2       y3       y4       y5       y6       y7       y8       x1
## 1  2.50 0.000000 3.333333 0.000000 1.250000 0.000000 3.726360 3.333333 4.442651
## 2  1.25 0.000000 3.333333 0.000000 6.250000 1.100000 6.666666 0.736999 5.384495
## 3  7.50 8.800000 9.999998 9.199991 8.750000 8.094061 9.999998 8.211809 5.961005
## 4  8.90 8.800000 9.999998 9.199991 8.907948 8.127979 9.999998 4.615086 6.285998
## 5 10.00 3.333333 9.999998 6.666666 7.500000 3.333333 9.999998 6.666666 5.863631
## 6  7.50 3.333333 6.666666 6.666666 6.250000 1.100000 6.666666 0.368500 5.533389
##         x2       x3
## 1 3.637586 2.557615
## 2 5.062595 3.568079
## 3 6.255750 5.224433
## 4 7.567863 6.267495
## 5 6.818924 4.573679
## 6 5.135798 3.892270

str(PoliticalDemocracy) # structure of data

## 'data.frame':    75 obs. of  11 variables:
##  $ y1: num  2.5 1.25 7.5 8.9 10 7.5 7.5 7.5 2.5 10 ...
##  $ y2: num  0 0 8.8 8.8 3.33 ...
##  $ y3: num  3.33 3.33 10 10 10 ...
##  $ y4: num  0 0 9.2 9.2 6.67 ...
##  $ y5: num  1.25 6.25 8.75 8.91 7.5 ...
##  $ y6: num  0 1.1 8.09 8.13 3.33 ...
##  $ y7: num  3.73 6.67 10 10 10 ...
##  $ y8: num  3.333 0.737 8.212 4.615 6.667 ...
##  $ x1: num  4.44 5.38 5.96 6.29 5.86 ...
##  $ x2: num  3.64 5.06 6.26 7.57 6.82 ...
##  $ x3: num  2.56 3.57 5.22 6.27 4.57 ...

summary(PoliticalDemocracy) # summary of data

##        y1               y2               y3               y4        
##  Min.   : 1.250   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 2.900   1st Qu.: 0.000   1st Qu.: 3.767   1st Qu.: 1.581  
##  Median : 5.400   Median : 3.333   Median : 6.667   Median : 3.333  
##  Mean   : 5.465   Mean   : 4.256   Mean   : 6.563   Mean   : 4.453  
##  3rd Qu.: 7.500   3rd Qu.: 8.283   3rd Qu.:10.000   3rd Qu.: 6.667  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##        y5               y6               y7               y8        
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.692   1st Qu.: 0.000   1st Qu.: 3.478   1st Qu.: 1.301  
##  Median : 5.000   Median : 2.233   Median : 6.667   Median : 3.333  
##  Mean   : 5.136   Mean   : 2.978   Mean   : 6.196   Mean   : 4.043  
##  3rd Qu.: 7.500   3rd Qu.: 4.207   3rd Qu.:10.000   3rd Qu.: 6.667  
##  Max.   :10.000   Max.   :10.000   Max.   :10.000   Max.   :10.000  
##        x1              x2              x3       
##  Min.   :3.784   Min.   :1.386   Min.   :1.002  
##  1st Qu.:4.477   1st Qu.:3.663   1st Qu.:2.300  
##  Median :5.075   Median :4.963   Median :3.568  
##  Mean   :5.054   Mean   :4.792   Mean   :3.558  
##  3rd Qu.:5.515   3rd Qu.:5.830   3rd Qu.:4.523  
##  Max.   :6.737   Max.   :7.872   Max.   :6.425

View(PoliticalDemocracy)
sum(is.na(PoliticalDemocracy)) # sum of na in he data

## [1] 0

Description of the data:

The ‘famous’ Industrialization and Political Democracy dataset. This dataset is used throughout Bollen’s 1989 book (see pages 12, 17, 36 in chapter 2, pages 228 and following in chapter 7, pages 321 and following in chapter 8). The dataset contains various measures of political democracy and industrialization in developing countries.

A data frame with 75 observations on the following 11 variables.

y1:Expert ratings of the freedom of the press in 1960

y2:The freedom of political opposition in 1960

y3:The fairness of elections in 1960

y4:The effectiveness of the elected legislature in 1960

y5:Expert ratings of the freedom of the press in 1965

y6:The freedom of political opposition in 1965

y7:The fairness of elections in 1965

y8:The effectiveness of the elected legislature in 1965

x1:The gross national product (GNP) per capita in 1960

x2:The inanimate energy consumption per capita in 1960

x3:The percentage of the labor force in industry in 1960

NA

We can find out that there are no NAs in education columns.

Descriptive stat of data

describe(PoliticalDemocracy) # summary the items

##    vars  n mean   sd median trimmed  mad  min   max range  skew kurtosis   se
## y1    1 75 5.46 2.62   5.40    5.46 3.11 1.25 10.00  8.75 -0.09    -1.15 0.30
## y2    2 75 4.26 3.95   3.33    4.09 4.94 0.00 10.00 10.00  0.32    -1.47 0.46
## y3    3 75 6.56 3.28   6.67    6.92 4.94 0.00 10.00 10.00 -0.59    -0.72 0.38
## y4    4 75 4.45 3.35   3.33    4.33 4.94 0.00 10.00 10.00  0.12    -1.21 0.39
## y5    5 75 5.14 2.61   5.00    5.23 3.71 0.00 10.00 10.00 -0.23    -0.78 0.30
## y6    6 75 2.98 3.37   2.23    2.51 3.31 0.00 10.00 10.00  0.89    -0.47 0.39
## y7    7 75 6.20 3.29   6.67    6.47 4.94 0.00 10.00 10.00 -0.55    -0.73 0.38
## y8    8 75 4.04 3.25   3.33    3.82 4.40 0.00 10.00 10.00  0.45    -0.96 0.37
## x1    9 75 5.05 0.73   5.08    5.03 0.82 3.78  6.74  2.95  0.25    -0.75 0.08
## x2   10 75 4.79 1.51   4.96    4.85 1.53 1.39  7.87  6.49 -0.35    -0.57 0.17
## x3   11 75 3.56 1.41   3.57    3.53 1.51 1.00  6.42  5.42  0.08    -0.94 0.16

my_summary <- function(x) {
 require(moments)
 funs <- c(mean, sd, skewness, kurtosis)
 sapply(funs, function(f) f(x, na.rm = T))
}
data_desc <- apply(PoliticalDemocracy, 2, my_summary) # applying my_summary function to each columns
rownames(data_desc) <- c("mean", "sd", "skewness", "kurtosis")
result1 <- as.data.frame(t(data_desc))
result1 |> knitr::kable()

	mean	sd	skewness	kurtosis
y1	5.464667	2.6227020	-0.0933077	1.895746
y2	4.256443	3.9471276	0.3251758	1.574427
y3	6.563110	3.2808912	-0.6061080	2.342953
y4	4.452533	3.3494674	0.1201215	1.835954
y5	5.136252	2.6126022	-0.2326106	2.282213
y6	2.978074	3.3727326	0.9111375	2.600046
y7	6.196264	3.2862398	-0.5645539	2.327922
y8	4.043390	3.2455927	0.4546483	2.093970
x1	5.054384	0.7329043	0.2590885	2.307266
x2	4.792195	1.5106644	-0.3528184	2.495235
x3	3.557690	1.4057112	0.0855161	2.120057

datal_desc <- melt(data_desc)
names(datal_desc)[1:2] <- c("moments", "items")

Plot

From these output plots, we can clearly see the descriptive stat(mean,sd,skew,kurtosis) difference between each items.

Discrimination index

# calculate sum for each column and assign the result to a new column(tot)
PoliticalDemocracy$tot <- apply(PoliticalDemocracy, 1, sum)
PoliticalDemocracy$grp <- NA

# using rank() to show the high and low points are in data
PoliticalDemocracy$grp[rank(PoliticalDemocracy$tot) < 75*.27] <- "L" # below 27%
PoliticalDemocracy$grp[rank(PoliticalDemocracy$tot) > 75*.73] <- "H" # upper 73%
PoliticalDemocracy$grp <- factor(PoliticalDemocracy$grp)
head(PoliticalDemocracy)

##      y1       y2       y3       y4       y5       y6       y7       y8       x1
## 1  2.50 0.000000 3.333333 0.000000 1.250000 0.000000 3.726360 3.333333 4.442651
## 2  1.25 0.000000 3.333333 0.000000 6.250000 1.100000 6.666666 0.736999 5.384495
## 3  7.50 8.800000 9.999998 9.199991 8.750000 8.094061 9.999998 8.211809 5.961005
## 4  8.90 8.800000 9.999998 9.199991 8.907948 8.127979 9.999998 4.615086 6.285998
## 5 10.00 3.333333 9.999998 6.666666 7.500000 3.333333 9.999998 6.666666 5.863631
## 6  7.50 3.333333 6.666666 6.666666 6.250000 1.100000 6.666666 0.368500 5.533389
##         x2       x3      tot  grp
## 1 3.637586 2.557615 24.78088    L
## 2 5.062595 3.568079 33.35217    L
## 3 6.255750 5.224433 87.99704    H
## 4 7.567863 6.267495 88.67236    H
## 5 6.818924 4.573679 74.75623    H
## 6 5.135798 3.892270 53.11329 <NA>

# calculate mean score for each group (H & L)
dtam <- aggregate(PoliticalDemocracy[, 1:11], by=list(PoliticalDemocracy$grp), mean)
print(dtam)

##   Group.1      y1        y2       y3       y4       y5        y6       y7
## 1       H 8.27619 8.1507924 9.409522 8.316808 7.686093 7.1589060 9.263757
## 2       L 2.82750 0.5733333 3.336666 1.036850 2.533362 0.5911064 2.341416
##         y8       x1       x2       x3
## 1 8.163568 5.641155 5.968026 4.481955
## 2 1.147683 4.619440 3.947979 2.875530

# delete the first column to keep the data frame neat then transpose it
dtam <- t(dtam[, -1])

# using t-test to exam the different between two mean values (high\low)
item_t <- sapply(PoliticalDemocracy[, 1:11], function(x) t.test(x ~ PoliticalDemocracy$grp)$statistic)
print(item_t)

##      y1.t      y2.t      y3.t      y4.t      y5.t      y6.t      y7.t      y8.t 
## 11.570674 12.436318  8.311272 14.797554  8.781918  9.520979 10.606767 13.850309 
##      x1.t      x2.t      x3.t 
##  4.828984  4.622609  4.137548

# assigning the t-test results to rslt2 and create three more necessary columns
result2 <- data.frame(Item = rownames(dtam), low.mean.score = dtam[, 2], high.mean.score = dtam[, 1] , mean.dif = dtam[, 1]-dtam[, 2],t.value = item_t)
print(result2)

##    Item low.mean.score high.mean.score mean.dif   t.value
## y1   y1      2.8275000        8.276190 5.448690 11.570674
## y2   y2      0.5733333        8.150792 7.577459 12.436318
## y3   y3      3.3366665        9.409522 6.072856  8.311272
## y4   y4      1.0368499        8.316808 7.279958 14.797554
## y5   y5      2.5333620        7.686093 5.152731  8.781918
## y6   y6      0.5911064        7.158906 6.567800  9.520979
## y7   y7      2.3414164        9.263757 6.922341 10.606767
## y8   y8      1.1476830        8.163568 7.015885 13.850309
## x1   x1      4.6194405        5.641155 1.021714  4.828984
## x2   x2      3.9479788        5.968026 2.020047  4.622609
## x3   x3      2.8755298        4.481955 1.606425  4.137548

Factor loading

print.psych(fa(PoliticalDemocracy[, 1:11],nfactor = 2, 
 fm = "pa",  rotate = "promax"),  cut = .3)

## Factor Analysis using method =  pa
## Call: fa(r = PoliticalDemocracy[, 1:11], nfactors = 2, rotate = "promax", 
##     fm = "pa")
## Standardized loadings (pattern matrix) based upon correlation matrix
##      PA1   PA2   h2    u2 com
## y1  0.88       0.72 0.277 1.0
## y2  0.83       0.59 0.411 1.1
## y3  0.70       0.48 0.523 1.0
## y4  0.83       0.74 0.257 1.0
## y5  0.68       0.66 0.344 1.2
## y6  0.79       0.62 0.385 1.0
## y7  0.80       0.67 0.327 1.0
## y8  0.79       0.70 0.295 1.0
## x1        0.92 0.87 0.131 1.0
## x2        0.98 0.94 0.064 1.0
## x3        0.89 0.75 0.247 1.0
## 
##                        PA1  PA2
## SS loadings           5.04 2.70
## Proportion Var        0.46 0.25
## Cumulative Var        0.46 0.70
## Proportion Explained  0.65 0.35
## Cumulative Proportion 0.65 1.00
## 
##  With factor correlations of 
##      PA1  PA2
## PA1 1.00 0.49
## PA2 0.49 1.00
## 
## Mean item complexity =  1
## Test of the hypothesis that 2 factors are sufficient.
## 
## The degrees of freedom for the null model are  55  and the objective function was  9.74 with Chi Square of  677.07
## The degrees of freedom for the model are 34  and the objective function was  0.84 
## 
## The root mean square of the residuals (RMSR) is  0.04 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic number of observations is  75 with the empirical chi square  14.95  with prob <  1 
## The total number of observations was  75  with Likelihood Chi Square =  57.06  with prob <  0.0079 
## 
## Tucker Lewis Index of factoring reliability =  0.939
## RMSEA index =  0.094  and the 90 % confidence intervals are  0.049 0.138
## BIC =  -89.74
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    PA1  PA2
## Correlation of (regression) scores with factors   0.97 0.98
## Multiple R square of scores with factors          0.94 0.96
## Minimum correlation of possible factor scores     0.88 0.93

fa.pd<-fa(PoliticalDemocracy[, 1:11],         
               nfactors  = 2,        
               fm = "pa", max.iter = 100,          
               rotate = "promax")     
fa.diagram(fa.pd)

h2: the amount of variance in the item/variable explained by the (retained) factors.

u2: residual variance

com: how much an item reflects a single construct.

Correlation of (regression) scores with factors: y=0.97 x=0.98

Multiple R square of scores with factors: y=0.94 x=0.96

Minimum correlation of possible factor scores: y=0.88 x=0.93

And for me, I just choose 2 items in each components.

Following the output, we can choose the items! For x: Choose x2,x1. For y: Choose y1,y2.

Parallel analysis

fa.parallel(PoliticalDemocracy[, 1:11], fa = "pc", show.legend = FALSE)

## Parallel analysis suggests that the number of factors =  NA  and the number of components =  2

Parallel analysis suggests that the number of factors = NA and the number of components = 2