if (!require("ggplot2")) install.packages("ggplot2"); library(ggplot2)
## Loading required package: ggplot2
# `base` ships with every R installation and is always attached, so there is
# nothing to install or load for it.
# Install lavaan only when it is missing, then attach it. library() stops with
# an error if the package is still unavailable, whereas require() only returns
# FALSE and lets the script continue without it.
if (!requireNamespace("lavaan", quietly = TRUE)) {
  install.packages("lavaan")
}
library(lavaan)
## Loading required package: lavaan
## This is lavaan 0.5-23.1097
## lavaan is BETA software! Please report any bugs.
if (!require("factoextra")) install.packages("factoextra"); library(factoextra)
## Loading required package: factoextra
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

Reading and cleaning the data names:

# Load the student file. FullePIRLSData0 stays untouched as the raw import;
# the recoding below is applied to the working copy FullePIRLSData1.
FullePIRLSData0 <- read.csv("~/Dropbox/Zhao/ePIRL16USA/FullePIRLS.csv")
#View(FullePIRLSData0)
FullePIRLSData1 <- FullePIRLSData0
#View(FullePIRLSData1)
#attach(FullePIRLSData1)
#The "masked" message comes from attach().
#table(FullePIRLSData1$IDCLASS)
#describeBy(FullePIRLSData1, group = IDSCHOOL)

# Recode the agreement items in columns 72 to 77 from text labels to the
# scores 1-4. The position of a label in agree_labels is its score; any
# response that is not one of the four labels (or is missing) becomes NA.
agree_labels <- c("Disagree a lot", "Disagree a little",
                  "Agree a little", "Agree a lot")
item_cols <- 72:77
FullePIRLSData1[item_cols] <- lapply(
  FullePIRLSData1[item_cols],
  function(responses) as.numeric(match(as.character(responses), agree_labels))
)

# List all column names of the working copy.
names(FullePIRLSData1)
##   [1] "X"        "IDCNTRY"  "IDBOOK"   "IDSCHOOL" "IDCLASS"  "IDSTUD"  
##   [7] "IDGRADE"  "ITSEX"    "ITADMINI" "ITLANG"   "ASBG01"   "ASBG03"  
##  [13] "ASBG04"   "ASBG05A"  "ASBG05B"  "ASBG05C"  "ASBG05D"  "ASBG05E" 
##  [19] "ASBG05F"  "ASBG05G"  "ASBG05H"  "ASBG06"   "ASBG07A"  "ASBG07B" 
##  [25] "ASBG08"   "ASBG09A"  "ASBG09B"  "ASBG09C"  "ASBG10A"  "ASBG10B" 
##  [31] "ASBG11A"  "ASBG11B"  "ASBG11C"  "ASBG11D"  "ASBG12A"  "ASBG12B" 
##  [37] "ASBG12C"  "ASBG12D"  "ASBG12E"  "ASBG13A"  "ASBG13B"  "ASBG13C" 
##  [43] "ASBG13D"  "ASBG13E"  "ASBG13F"  "ASBG13G"  "ASBG13H"  "ASBR01A" 
##  [49] "ASBR01B"  "ASBR01C"  "ASBR01D"  "ASBR01E"  "ASBR01F"  "ASBR01G" 
##  [55] "ASBR01H"  "ASBR01I"  "ASBR02A"  "ASBR02B"  "ASBR02C"  "ASBR03"  
##  [61] "ASBR04"   "ASBR05A"  "ASBR05B"  "ASBR06A"  "ASBR06B"  "ASBR06C" 
##  [67] "ASBR06D"  "ASBR06E"  "ASBR06F"  "ASBR06G"  "ASBR06H"  "ASBR07A" 
##  [73] "ASBR07B"  "ASBR07C"  "ASBR07D"  "ASBR07E"  "ASBR07F"  "IDPOP"   
##  [79] "IDGRADER" "ASDAGE"   "HOUWGT"   "TOTWGT"   "SENWGT"   "WGTADJ1" 
##  [85] "WGTADJ2"  "WGTADJ3"  "WGTFAC1"  "WGTFAC2"  "WGTFAC3"  "JKZONE"  
##  [91] "JKREP"    "ASEREA01" "ASEREA02" "ASEREA03" "ASEREA04" "ASEREA05"
##  [97] "ASERSI01" "ASERSI02" "ASERSI03" "ASERSI04" "ASERSI05" "ASEIIE01"
## [103] "ASEIIE02" "ASEIIE03" "ASEIIE04" "ASEIIE05" "ASEIBM01" "ASEIBM02"
## [109] "ASEIBM03" "ASEIBM04" "ASEIBM05" "ASRREA01" "ASRREA02" "ASRREA03"
## [115] "ASRREA04" "ASRREA05" "ASRLIT01" "ASRLIT02" "ASRLIT03" "ASRLIT04"
## [121] "ASRLIT05" "ASRINF01" "ASRINF02" "ASRINF03" "ASRINF04" "ASRINF05"
## [127] "ASRIIE01" "ASRIIE02" "ASRIIE03" "ASRIIE04" "ASRIIE05" "ASRRSI01"
## [133] "ASRRSI02" "ASRRSI03" "ASRRSI04" "ASRRSI05" "ASRIBM01" "ASRIBM02"
## [139] "ASRIBM03" "ASRIBM04" "ASRIBM05" "ASBGSSB"  "ASDGSSB"  "ASBGSB"  
## [145] "ASDGSB"   "ASBGERL"  "ASDGERL"  "ASBGSLR"  "ASDGSLR"  "ASBGSCR" 
## [151] "ASDGSCR"  "ASBGHRL"  "ASDGHRL"  "ASBGDDH"  "ASDGDDH"  "ASBGSEC" 
## [157] "ASDGSEC"  "ASDELOWP" "VERSION"  "SCOPE"
#View(FullePIRLSData1)
#summary(FullePIRLSData1)
# Copy the six confidence items (ASBR07A-ASBR07F) to shorter names.
# The columns are read through the data frame instead of attach(): attach()
# puts a copy of the columns on the search path, so later changes to
# FullePIRLSData1 are not visible through the bare names, and it produces
# "masked" messages when other objects share a column name.
# NOTE: the trailing "r" marks the negatively worded items (3 to 6); their
# values are copied as they are and are NOT reverse-scored at this point.
FullePIRLSData1$con1 <- FullePIRLSData1$ASBR07A
FullePIRLSData1$con2 <- FullePIRLSData1$ASBR07B
FullePIRLSData1$con3r <- FullePIRLSData1$ASBR07C
FullePIRLSData1$con4r <- FullePIRLSData1$ASBR07D
FullePIRLSData1$con5r <- FullePIRLSData1$ASBR07E
FullePIRLSData1$con6r <- FullePIRLSData1$ASBR07F

#names(FullePIRLSData1)
#View(FullePIRLSData1)
# 
# FullePIRLSData1$con3 = con3r
# FullePIRLSData1$con4 = con4r
# FullePIRLSData1$con5 = con5r
# FullePIRLSData1$con6 = con6r
# #Reversing the negative items (3,4,5, and 6):
# FullePIRLSData1$con3r = 5 - FullePIRLSData1$con3
# FullePIRLSData1$con4r = 5 - FullePIRLSData1$con4
# FullePIRLSData1$con5r = 5 - FullePIRLSData1$con5
# FullePIRLSData1$con6r = 5 - FullePIRLSData1$con6
# 
# View(FullePIRLSData1)
# # FullePIRLSData2 =  na.omit(FullePIRLSData1)
# #View(FullePIRLSData2)
# names(FullePIRLSData1)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:lavaan':
## 
##     cor2cov
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# describeBy(FullePIRLSData1$con6r)
# table(FullePIRLSData1$con6r)
# dat=FullePIRLSData1[]
#View(dat)
# library(factoextra)
# 
# sol_clust<- factoextra::eclust(x = dat[,72:77],FUNcluster = "kmeans")
# 
# dat$cluster <- sol_clust$cluster
# 
# dat[,72:77]<- scale(dat[,72:77])
# 
# for(i in 72:77){# i =1
#   p<- ggplot(data = dat)+
#     geom_boxplot(aes(x = cluster, y = dat[,i], group = cluster)) 
#   
# print(p)  
#   }
# 
# dat$cluster <- as.factor(dat$cluster)
# 
# ggplot(data = dat) +
#   geom_point(aes(x = con1, y = con2, group = cluster, color = cluster)) +
#   geom_smooth(aes(x = con1, y = con2, group = cluster, color = cluster), method = "lm",se = F)
# #View(dat)
# datScaled = scale(dat[,72:77])
# datScaled
# 
# fitK=kmeans(datScaled, 8)
# fitK
# str(fitK)
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
# Start the clustering data set from columns 72 to 77 of the untouched
# import (the six confidence items, still coded as text labels).
dat1 <- FullePIRLSData0[72:77]

# Print the type and the first values of every imported column.
str(FullePIRLSData0)
## 'data.frame':    4090 obs. of  160 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ IDCNTRY : logi  NA NA NA NA NA NA ...
##  $ IDBOOK  : Factor w/ 16 levels "Booklet 01","Booklet 02",..: 10 16 11 12 13 15 16 1 3 6 ...
##  $ IDSCHOOL: int  37 37 37 37 37 37 37 37 37 37 ...
##  $ IDCLASS : int  3701 3701 3701 3701 3701 3701 3701 3701 3701 3701 ...
##  $ IDSTUD  : int  370101 370102 370103 370104 370105 370107 370108 370109 370111 370115 ...
##  $ IDGRADE : Factor w/ 1 level "Grade 4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ITSEX   : Factor w/ 2 levels "Boy","Girl": 2 1 2 1 1 2 2 2 1 2 ...
##  $ ITADMINI: Factor w/ 1 level "Other": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ITLANG  : Factor w/ 1 level "English": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASBG01  : Factor w/ 2 levels "Boy","Girl": 2 1 2 1 1 2 2 2 1 2 ...
##  $ ASBG03  : Factor w/ 5 levels "I almost always speak <language of test> at home",..: 2 2 4 2 5 2 2 2 2 2 ...
##  $ ASBG04  : Factor w/ 6 levels "Enough to fill one bookcase (26–100 books)",..: 5 5 4 2 1 4 1 1 5 5 ...
##  $ ASBG05A : Factor w/ 3 levels "No","Omitted or invalid",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ ASBG05B : Factor w/ 3 levels "No","Omitted or invalid",..: 1 3 3 3 3 1 1 3 1 1 ...
##  $ ASBG05C : Factor w/ 3 levels "No","Omitted or invalid",..: 1 1 1 3 3 1 1 1 3 3 ...
##  $ ASBG05D : Factor w/ 3 levels "No","Omitted or invalid",..: 3 3 3 3 3 3 1 3 3 1 ...
##  $ ASBG05E : Factor w/ 3 levels "No","Omitted or invalid",..: 3 1 1 3 1 1 1 1 3 1 ...
##  $ ASBG05F : Factor w/ 3 levels "No","Omitted or invalid",..: 3 3 3 3 3 3 1 3 3 1 ...
##  $ ASBG05G : Factor w/ 3 levels "No","Omitted or invalid",..: 3 1 3 1 3 3 1 3 3 1 ...
##  $ ASBG05H : logi  NA NA NA NA NA NA ...
##  $ ASBG06  : Factor w/ 5 levels "Never or almost never",..: 1 1 1 1 1 1 4 1 1 3 ...
##  $ ASBG07A : Factor w/ 5 levels "Almost every day",..: 2 5 5 3 2 4 5 2 3 4 ...
##  $ ASBG07B : Factor w/ 5 levels "Almost every day",..: 1 1 5 5 5 5 4 1 3 2 ...
##  $ ASBG08  : Factor w/ 5 levels "Every day","Most days",..: 5 1 5 3 2 2 5 2 3 3 ...
##  $ ASBG09A : Factor w/ 5 levels "Every day or almost every day",..: 1 1 1 1 5 1 5 1 5 1 ...
##  $ ASBG09B : Factor w/ 5 levels "Every day or almost every day",..: 1 3 1 5 5 3 3 2 4 2 ...
##  $ ASBG09C : Factor w/ 5 levels "Every day or almost every day",..: 1 3 5 1 1 3 3 1 4 2 ...
##  $ ASBG10A : Factor w/ 4 levels "30 minutes or less",..: 3 2 2 2 1 1 1 3 1 2 ...
##  $ ASBG10B : Factor w/ 4 levels "30 minutes or less",..: 1 3 2 1 2 4 4 3 1 4 ...
##  $ ASBG11A : Factor w/ 6 levels "2 hours or more",..: 3 2 2 1 1 6 6 2 5 6 ...
##  $ ASBG11B : Factor w/ 6 levels "2 hours or more",..: 3 3 2 3 4 1 2 4 4 1 ...
##  $ ASBG11C : Factor w/ 6 levels "2 hours or more",..: 3 1 1 3 2 6 6 5 4 6 ...
##  $ ASBG11D : Factor w/ 6 levels "2 hours or more",..: 3 5 1 5 4 6 6 2 4 6 ...
##  $ ASBG12A : Factor w/ 5 levels "Agree a little",..: 2 1 1 1 1 5 2 1 1 1 ...
##  $ ASBG12B : Factor w/ 5 levels "Agree a little",..: 2 2 1 1 3 2 5 2 4 5 ...
##  $ ASBG12C : Factor w/ 5 levels "Agree a little",..: 2 2 1 1 1 5 5 3 1 5 ...
##  $ ASBG12D : Factor w/ 5 levels "Agree a little",..: 2 1 3 2 1 5 5 2 2 5 ...
##  $ ASBG12E : Factor w/ 5 levels "Agree a little",..: 2 1 1 2 2 5 5 1 1 5 ...
##  $ ASBG13A : Factor w/ 5 levels "A few times a year",..: 2 3 1 3 1 4 4 3 3 3 ...
##  $ ASBG13B : Factor w/ 5 levels "A few times a year",..: 2 3 1 3 3 4 4 3 2 4 ...
##  $ ASBG13C : Factor w/ 5 levels "A few times a year",..: 2 2 1 3 5 4 4 3 3 4 ...
##  $ ASBG13D : Factor w/ 5 levels "A few times a year",..: 2 3 1 3 5 3 4 3 3 4 ...
##  $ ASBG13E : Factor w/ 5 levels "A few times a year",..: 2 1 3 3 2 4 4 3 2 4 ...
##  $ ASBG13F : Factor w/ 5 levels "A few times a year",..: 2 2 3 3 2 4 2 3 3 4 ...
##  $ ASBG13G : Factor w/ 5 levels "A few times a year",..: 2 3 1 3 5 4 4 3 3 4 ...
##  $ ASBG13H : Factor w/ 5 levels "A few times a year",..: 2 3 3 3 2 4 4 3 3 4 ...
##  $ ASBR01A : Factor w/ 5 levels "Agree a little",..: 2 3 2 1 2 5 1 2 2 5 ...
##  $ ASBR01B : Factor w/ 5 levels "Agree a little",..: 2 2 2 2 2 5 5 2 5 5 ...
##  $ ASBR01C : Factor w/ 5 levels "Agree a little",..: 2 2 2 2 1 2 5 2 1 2 ...
##  $ ASBR01D : Factor w/ 5 levels "Agree a little",..: 2 1 2 2 1 5 5 2 1 5 ...
##  $ ASBR01E : Factor w/ 5 levels "Agree a little",..: 2 2 2 2 2 5 5 2 2 5 ...
##  $ ASBR01F : Factor w/ 5 levels "Agree a little",..: 2 5 2 2 2 5 5 2 2 5 ...
##  $ ASBR01G : Factor w/ 5 levels "Agree a little",..: 2 3 2 1 3 5 5 2 2 5 ...
##  $ ASBR01H : Factor w/ 5 levels "Agree a little",..: 2 1 2 2 1 5 5 2 1 5 ...
##  $ ASBR01I : Factor w/ 5 levels "Agree a little",..: 2 2 2 2 5 5 5 2 2 5 ...
##  $ ASBR02A : Factor w/ 5 levels "Every day or almost every day",..: 1 5 1 1 1 1 3 1 1 3 ...
##  $ ASBR02B : Factor w/ 5 levels "Every day or almost every day",..: 1 2 5 5 1 3 1 1 5 3 ...
##  $ ASBR02C : Factor w/ 5 levels "Every day or almost every day",..: 1 2 5 1 4 3 3 5 1 1 ...
##  $ ASBR03  : Factor w/ 5 levels "A few times a year",..: 4 2 1 1 2 5 2 5 5 5 ...
##  $ ASBR04  : Factor w/ 5 levels "2 hours or more",..: 4 2 2 2 4 2 1 4 4 1 ...
##  $ ASBR05A : Factor w/ 5 levels "Every day or almost every day",..: 1 4 1 5 5 3 5 5 2 1 ...
##  $ ASBR05B : Factor w/ 5 levels "Every day or almost every day",..: 5 5 1 1 5 5 3 1 2 3 ...
##  $ ASBR06A : Factor w/ 5 levels "Agree a little",..: 2 4 2 2 1 5 1 4 2 2 ...
##  $ ASBR06B : Factor w/ 5 levels "Agree a little",..: 1 1 1 1 2 5 5 4 1 5 ...
##  $ ASBR06C : Factor w/ 5 levels "Agree a little",..: 1 3 4 3 4 5 5 1 4 5 ...
##  $ ASBR06D : Factor w/ 5 levels "Agree a little",..: 1 1 1 1 2 5 5 4 1 5 ...
##  $ ASBR06E : Factor w/ 5 levels "Agree a little",..: 1 2 2 2 2 5 5 1 5 5 ...
##  $ ASBR06F : Factor w/ 5 levels "Agree a little",..: 1 3 2 2 2 1 5 2 2 5 ...
##  $ ASBR06G : Factor w/ 5 levels "Agree a little",..: 1 1 2 2 2 5 5 3 1 5 ...
##  $ ASBR06H : Factor w/ 5 levels "Agree a little",..: 1 1 2 2 2 5 5 1 2 5 ...
##  $ ASBR07A : Factor w/ 5 levels "Agree a little",..: 5 1 2 1 2 2 5 2 1 5 ...
##  $ ASBR07B : Factor w/ 5 levels "Agree a little",..: 1 3 1 2 1 5 2 1 2 5 ...
##  $ ASBR07C : Factor w/ 5 levels "Agree a little",..: 2 2 1 1 1 5 5 2 4 5 ...
##  $ ASBR07D : Factor w/ 5 levels "Agree a little",..: 2 1 4 3 1 5 5 1 4 5 ...
##  $ ASBR07E : Factor w/ 5 levels "Agree a little",..: 2 2 3 4 3 5 5 3 4 5 ...
##  $ ASBR07F : Factor w/ 5 levels "Agree a little",..: 2 1 4 4 3 5 5 4 4 2 ...
##  $ IDPOP   : Factor w/ 1 level "Pop 1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ IDGRADER: Factor w/ 1 level "Upper Grade": 1 1 1 1 1 1 1 1 1 1 ...
##  $ ASDAGE  : num  9.25 8.67 9.17 8.58 9.08 8.5 9.25 8.42 8.75 9.17 ...
##  $ HOUWGT  : num  0.587 0.587 0.587 0.587 0.587 ...
##  $ TOTWGT  : num  513 513 513 513 513 ...
##  $ SENWGT  : num  0.0717 0.0717 0.0717 0.0717 0.0717 ...
##  $ WGTADJ1 : num  1.14 1.14 1.14 1.14 1.14 ...
##  $ WGTADJ2 : logi  NA NA NA NA NA NA ...
##  $ WGTADJ3 : num  1.44 1.44 1.44 1.44 1.44 ...
##  $ WGTFAC1 : num  312 312 312 312 312 ...
##  $ WGTFAC2 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ WGTFAC3 : logi  NA NA NA NA NA NA ...
##  $ JKZONE  : int  49 49 49 49 49 49 49 49 49 49 ...
##  $ JKREP   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ASEREA01: num  495 468 593 519 601 ...
##  $ ASEREA02: num  537 501 573 530 571 ...
##  $ ASEREA03: num  514 491 571 518 572 ...
##  $ ASEREA04: num  503 500 552 547 565 ...
##  $ ASEREA05: num  489 500 541 540 580 ...
##  $ ASERSI01: num  494 472 578 518 559 ...
##  $ ASERSI02: num  472 534 578 508 564 ...
##  $ ASERSI03: num  496 469 612 546 586 ...
##   [list output truncated]
# ---- Build the data set used for clustering ----

# Convert text responses to scores: the score of a response is the position
# of its label in `ordered_labels`. Labels that are not listed (for example
# "Omitted or invalid") and missing responses become NA.
score_by_label <- function(x, ordered_labels) {
  as.numeric(match(as.character(x), ordered_labels))
}

# Four-point agreement scale of the six confidence items held in the first
# six columns of dat1: 1 = "Disagree a lot" ... 4 = "Agree a lot".
agree_levels <- c("Disagree a lot", "Disagree a little",
                  "Agree a little", "Agree a lot")
dat1[1:6] <- lapply(dat1[1:6], score_by_label, ordered_labels = agree_levels)

# Add gender and the three background items. The columns are added by name;
# the earlier positional form targeted four positions (8:11) with only three
# source columns.
dat1$sex <- FullePIRLSData0$ITSEX
background_items <- c("ASBG04", "ASBG10A", "ASBR04")
dat1[background_items] <- FullePIRLSData0[background_items]

# Drop students with a missing value in any of the columns collected so far.
dat2 <- na.omit(dat1)

# Confidence scores. Items C to F are the negatively worded ones, so their
# 1-4 scores are mirrored (1 <-> 4, 2 <-> 3); after that a high score means
# high confidence on every item.
dat2$con1 <- dat2$ASBR07A
dat2$con2 <- dat2$ASBR07B
dat2$con3 <- 5 - dat2$ASBR07C
dat2$con4 <- 5 - dat2$ASBR07D
dat2$con5 <- 5 - dat2$ASBR07E
dat2$con6 <- 5 - dat2$ASBR07F

# Background items as ordered scores (1 = lowest category).
dat2$books <- score_by_label(dat2$ASBG04, c(
  "None or very few (0–10 books)",
  "Enough to fill one shelf (11–25 books)",
  "Enough to fill one bookcase (26–100 books)",
  "Enough to fill two bookcases (101–200 books)",
  "Enough to fill three or more bookcases (more than 200)"
))
dat2$time_computer <- score_by_label(dat2$ASBG10A, c(
  "No time",
  "30 minutes or less",
  "More than 30 minutes"
))
dat2$time_reading <- score_by_label(dat2$ASBR04, c(
  "Less than 30 minutes",
  "30 minutes up to 1 hour",
  "From 1 hour up to 2 hours",
  "2 hours or more"
))

# Keep gender plus the nine clustering variables, selected by name so the
# selection does not depend on column positions, and drop the students whose
# background answers were recoded to NA.
cluster_vars <- c(paste0("con", 1:6), "books", "time_computer", "time_reading")
dat3 <- na.omit(dat2[, c("sex", cluster_vars)])

# Standardize the nine variables (mean 0, sd 1) so they are on a common scale;
# gender is carried along unscaled for the later cross-tabulation.
dat4 <- data.frame(scale(dat3[, cluster_vars]))
dat4$sex <- dat3$sex

dat5 <- dat4

# K-means clustering on the nine standardized variables; no number of
# clusters is supplied in this call.
# NOTE(review): the recorded run of this call printed repeated
# "did not converge in 10 iterations" warnings - check whether a larger
# kmeans `iter.max` can be passed through eclust() before relying on the
# solution.
sol_clust <- factoextra::eclust(x = dat5[, cluster_vars], FUNcluster = "kmeans")
## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

## Warning: did not converge in 10 iterations

# This plot shows the eight clusters on the two most discriminatory dimensions.
# Store each student's cluster membership next to the clustering variables.
dat5[["cluster"]] <- sol_clust[["cluster"]]

#changing from wide to long, just for plotting the cluster means

library(reshape2)
library(pander)

Number of Students in Each Cluster by Gender:

# Cross-tabulate cluster membership (rows) by gender (columns) and print it
# as a formatted table.
cluster_by_sex <- table(dat5$cluster, dat5$sex)
pander(cluster_by_sex)
Boy Girl
374 357
315 288
215 294
158 157
261 236
86 70
118 131
230 299
# these clusters do not differ in gender.
# ggplot(data = plot_dat) +
#   geom_bar(aes(x = cluster, y = value, fill = cluster ), stat = "summary", fun.y = "mean") +
#   facet_grid(sex ~ variable)
# Reshape to long format: one row per student and variable, with cluster
# membership and gender kept as identifier columns.
plot_dat <- melt(data = dat5, id.vars = c("cluster", "sex"))

plot_dat$cluster <- as.factor(plot_dat$cluster)

# Average every variable within each cluster before plotting. Computing the
# means here, instead of inside the plot layer, avoids the `fun.y` summary
# argument, which newer ggplot2 releases deprecate in favour of `fun`.
cluster_means <- aggregate(value ~ cluster + variable,
                           data = plot_dat, FUN = mean)

# One panel per variable, one bar per cluster showing the cluster mean.
cluster_plot <- ggplot(data = cluster_means) +
  geom_bar(aes(x = cluster, y = value, fill = cluster), stat = "identity") +
  facet_grid(. ~ variable)
print(cluster_plot)

# Save this specific plot rather than whatever plot was displayed last.
ggsave("cluster.pdf", plot = cluster_plot)
## Saving 7 x 5 in image

In the Beginning:
- The goal of this project was to find evidence for a grouping variable based on the six confidence items (con1, con2, con3, con4, con5, and con6). Finding a set of grouping variables can reduce the number of variables needed in future analyses. To start, an exploratory K-means cluster analysis was used to find the optimal number of homogeneous groups. However, only two clusters (low and high) emerged. This is likely because the six items measure a similar construct: one of the extracted clusters was high on confidence, while the other was low on confidence. A contingency table was created from the extracted clusters and student gender. The distribution was almost even, which suggests that the high or low confidence measured by the six items is not contingent on gender. In an effort to find a more diverse clustering solution, three more variables were entered into the model: time spent on reading, number of books at home, and time spent using a computer at home.

The Nine Variables Included in the Cluster Analyses:

The ePIRLS Confidence Scale Items:
  • How well do you read? Tell how much you agree with each of these statements:
  • con1) I usually do well in reading.
  • con2) Reading is easy for me.
  • con3) I have trouble reading stories with difficult words.
  • con4) Reading is harder for me than for many of my classmates.
  • con5) Reading is harder for me than any other subject.
  • con6) I am just not good at reading.

The Newly Added Three Items:
Time Spent on Reading (which is coded “ASBR04” in ePIRLS dataset): How much time do you spend reading outside of school on a normal school day?
Number of Books at Home (which is coded “ASBG04” in ePIRLS dataset): About how many books are there in your home? (Do not count magazines, newspapers, or your school books).
Time Spent Computer: (which is coded “ASBG10A” in ePIRLS dataset): How much time do you spend using a computer or tablet to find and read information for your schoolwork on a normal school day?

Interpreting the Cluster Analyses Results:
The second clustering attempt provided an eight cluster solution. By standardizing the data and plotting the results, we can create a profile of the clusters (see plot 2).
Cluster One: Cluster One is high on confidence (con1, con2, con3, con4, con5, and con6), and it is one standard deviation above the average in terms of time spent on computers. However, this cluster tends to be below average in terms of the number of books at home and the time spent reading outside of school on a normal school day.

Cluster Two: Cluster Two tends to be low on confidence on all of the variables, especially confidence Item Two which asks about how easy reading is.

Cluster Three: Cluster Three tends to be moderately above average on confidence; however, this cluster is outstandingly high on item nine, which asks how much time fourth-grade students spend using a computer or tablet to find and read information for their schoolwork on a normal school day. In addition, this is the only cluster that is high on both confidence and the number of books at home; all of the other clusters are a little below average on the number of books at home.

Cluster Four: Cluster Four is about average on the items about doing well in reading and the easiness of reading. Interestingly, this cluster is far below average on the items that were reverse coded (con3, con4, con5, and con6). It seems that the students in this cluster did not pay attention to the negative wording of these items and, therefore, the cluster was way below average on the confidence scale items. It is worth noting that all of the confidence scale items are highly correlated.

Cluster Five: Cluster Five tends to be about average on almost all of the variables, except that it is three standard deviations below average on the item asking how much time participants spend using a computer or tablet to find and read information for their schoolwork on a normal school day.

Cluster Six: The results for Cluster Six indicate that this cluster does not do well in reading and, therefore, thinks that reading is not as easy for Cluster Six as it is for the other clusters.

Cluster Seven: Cluster Seven is way below average on all of the confidence scale items. On the other hand, it was slightly below average on the other three items (Time Spent on Reading, Number of Books at Home, and Time Spent Computer).
Cluster Eight: Finally, Cluster Eight is about average on almost all of the items except for the third item: these students do not have trouble reading stories with difficult words.

What is Next?
- Apparently, as mentioned earlier, Cluster Four is only below average on the negatively worded items. This cluster consists of 315 participants, 158 boys and 157 girls.
- I think it is worth investigating whether more variables need to be included to understand these eight clusters. Including the factors that affect confidence would help explain these eight clusters more clearly. But the question becomes: Who are those clusters? And what theory can explain them?