Reading and loading packages
#install.packages("foreign")#load an SPSSS file
#install.packages("psych")
#install.packages("broom")
#install.packages("gclus")
library(broom)
library(psych)
library(foreign)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## %+%(): ggplot2, psych
## alpha(): ggplot2, psych
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringr)
library(gclus)
## Loading required package: cluster
Reading the data set
dataset = read.spss("/Users/levi.brackman/Downloads/GG T0 Motivation Scale (1).sav", to.data.frame=TRUE)
head(dataset, 3)
## StudentID Semester t0.MvEn_1 t0.MvEn_2
## 1 2016Yr9001 Semester 2 Agree a lot - 6 Agree a lot - 6
## 2 2016Yr9003 Semester 1 Agree - 5 Agree - 5
## 3 2016Yr9003 Semester 2 Agree - 5 Agree - 5
## t0.MvEn_3 t0.MvEn_4 t0.MvEn_5
## 1 Agree a lot - 6 Agree a lot - 6 Agree a lot - 6
## 2 Agree a little - 4 Disagree a little - 3 Agree - 5
## 3 Agree - 5 Agree a little - 4 Agree a little - 4
## t0.MvEn_6 t0.MvEn_7r t0.MvEn_8r t0.MvEn_9r t0.MvEn_10r
## 1 Disagree a little - 3 1 1 1 1
## 2 Agree a little - 4 2 3 4 4
## 3 Agree - 5 3 4 4 3
## t0.MvEn_11r t0.MvEnThought_Tot t0.MvEnBeh_Tot t0.MvEnMuff_Tot
## 1 1 6.000000 5.000000 1.000000
## 2 5 4.666667 4.000000 3.000000
## 3 5 5.000000 4.333333 3.666667
## t0.MvEnGuzz_Tot
## 1 1.0
## 2 4.5
## 3 4.0
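read.spss() converts labelled SPSS values to factors by default, which is where strings like "Agree a lot - 6" come from. Passing use.value.labels = FALSE (a standard argument of foreign::read.spss) would typically return the underlying numeric codes instead, which could make the string cleaning in the next step unnecessary. A sketch:
#alternative import: keep the numeric codes instead of the value labels
dataset_num <- read.spss("/Users/levi.brackman/Downloads/GG T0 Motivation Scale (1).sav",
                         to.data.frame = TRUE, use.value.labels = FALSE)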
There are extra characters in the columns that R won't be able to read, as well as columns that are not needed for the analysis. So I now select the correct variables, remove the characters from the values, and save the result as newdataset.
newdataset<-dataset %>%
select(-c(StudentID, Semester, t0.MvEnThought_Tot, t0.MvEnBeh_Tot, t0.MvEnMuff_Tot,
t0.MvEnGuzz_Tot)) %>%
mutate(t0.MvEn_1 = gsub("[^0-9]", "", t0.MvEn_1),
t0.MvEn_2 = gsub("[^0-9]", "", t0.MvEn_2),
t0.MvEn_3 = gsub("[^0-9]", "", t0.MvEn_3),
t0.MvEn_4 = gsub("[^0-9]", "", t0.MvEn_4),
t0.MvEn_5 = gsub("[^0-9]", "", t0.MvEn_5),
t0.MvEn_6 = gsub("[^0-9]", "", t0.MvEn_6),
t0.MvEn_7r = gsub("[^0-9]", "", t0.MvEn_7r),
t0.MvEn_8r = gsub("[^0-9]", "", t0.MvEn_8r),
t0.MvEn_9r = gsub("[^0-9]", "", t0.MvEn_9r),
t0.MvEn_10r = gsub("[^0-9]", "", t0.MvEn_10r),
t0.MvEn_11r = gsub("[^0-9]", "", t0.MvEn_11r))
#write_csv(newdataset, "derdataset.csv")
#names(dataset)
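The eleven mutate() calls above all apply the same gsub() to an item column. An equivalent, more compact sketch, assuming a dplyr version recent enough to provide across() (>= 1.0):
#strip everything except digits from every item column in one step;
#the *_Tot columns do not match the "t0.MvEn_" prefix, so only the 11 items are kept
newdataset <- dataset %>%
  select(starts_with("t0.MvEn_")) %>%
  mutate(across(everything(), ~ gsub("[^0-9]", "", .x)))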
Now view the dataset again
head(newdataset, 3)
## t0.MvEn_1 t0.MvEn_2 t0.MvEn_3 t0.MvEn_4 t0.MvEn_5 t0.MvEn_6 t0.MvEn_7r
## 1 6 6 6 6 6 3 1
## 2 5 5 4 3 5 4 2
## 3 5 5 5 4 4 5 3
## t0.MvEn_8r t0.MvEn_9r t0.MvEn_10r t0.MvEn_11r
## 1 1 1 1 1
## 2 3 4 4 5
## 3 4 4 3 5
Column names have dots in them (not a good convention), so I want to rename the variables without the dots.
newdataset<- newdataset %>% rename(MvEn_1 = t0.MvEn_1, MvEn_2 = t0.MvEn_2, MvEn_3 = t0.MvEn_3,
MvEn_4 = t0.MvEn_4, MvEn_5 = t0.MvEn_5, MvEn_6 = t0.MvEn_6,
MvEn_7r = t0.MvEn_7r, MvEn_8r = t0.MvEn_8r, MvEn_9r = t0.MvEn_9r,
MvEn_10r = t0.MvEn_10r, MvEn_11r = t0.MvEn_11r)
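Because every column shares the same "t0." prefix, the renaming can also be done programmatically; an equivalent sketch using base R's sub(), so it works regardless of package versions:
#strip the leading "t0." from every column name at once
names(newdataset) <- sub("^t0\\.", "", names(newdataset))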
Make all items numeric
newdataset<- data.frame(apply(newdataset,2, as.numeric))
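apply() first coerces the data frame to a character matrix and data.frame() then rebuilds it; an equivalent sketch that converts the character columns in place, without the matrix round trip:
#convert every column (character after the gsub step) to numeric in place
newdataset[] <- lapply(newdataset, as.numeric)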
Find the maximum in order to reverse score the items
max(newdataset$MvEn_1, na.rm = TRUE)
## [1] 6
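It is also worth confirming the observed range of every item, not just MvEn_1, before reverse scoring; a quick check (not part of the original analysis):
#minimum and maximum of each item (row 1 = min, row 2 = max)
sapply(newdataset, range, na.rm = TRUE)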
Get the list of variable names without quotes
noquote(names(newdataset))
## [1] MvEn_1 MvEn_2 MvEn_3 MvEn_4 MvEn_5 MvEn_6 MvEn_7r
## [8] MvEn_8r MvEn_9r MvEn_10r MvEn_11r
Reverse score columns
cols<- c("MvEn_7r", "MvEn_8r", "MvEn_9r", "MvEn_10r", "MvEn_11r")
#reverse the negatively worded items by subtracting each response from the scale maximum (6)
newdataset[,cols] <- lapply(cols, function(x) 6 - newdataset[, x])
#check
str(newdataset)
## 'data.frame': 346 obs. of 11 variables:
## $ MvEn_1 : num 6 5 5 5 4 6 5 5 5 6 ...
## $ MvEn_2 : num 6 5 5 6 3 5 5 4 5 6 ...
## $ MvEn_3 : num 6 4 5 4 4 6 2 3 5 6 ...
## $ MvEn_4 : num 6 3 4 3 3 5 5 2 4 6 ...
## $ MvEn_5 : num 6 5 4 3 3 5 4 2 5 6 ...
## $ MvEn_6 : num 3 4 5 5 4 5 5 4 4 6 ...
## $ MvEn_7r : num 5 4 3 3 2 0 5 4 3 5 ...
## $ MvEn_8r : num 5 3 2 0 2 1 5 3 4 5 ...
## $ MvEn_9r : num 5 2 2 0 2 1 1 1 4 5 ...
## $ MvEn_10r: num 5 2 3 3 3 1 2 4 4 5 ...
## $ MvEn_11r: num 5 1 1 0 3 0 2 2 4 0 ...
In the correlation plot below, the darker squares show the stronger correlations.
#first remove incomplete cases
newdataset<- newdataset[complete.cases(newdataset),]
#look at the correlations (as absolute values)
obs<-abs(cor(newdataset))
obs
## MvEn_1 MvEn_2 MvEn_3 MvEn_4 MvEn_5 MvEn_6
## MvEn_1 1.00000000 0.37941338 0.3762868 0.36680462 0.40900350 0.50961810
## MvEn_2 0.37941338 1.00000000 0.4868319 0.32994776 0.40589754 0.44848627
## MvEn_3 0.37628675 0.48683190 1.0000000 0.52939225 0.48555172 0.40617355
## MvEn_4 0.36680462 0.32994776 0.5293923 1.00000000 0.75324414 0.54633631
## MvEn_5 0.40900350 0.40589754 0.4855517 0.75324414 1.00000000 0.63413629
## MvEn_6 0.50961810 0.44848627 0.4061736 0.54633631 0.63413629 1.00000000
## MvEn_7r 0.05987714 0.08146159 0.1306251 0.18066838 0.12596689 0.08172025
## MvEn_8r 0.07838723 0.09889988 0.1093530 0.08484913 0.07685958 0.13114125
## MvEn_9r 0.34956800 0.11606995 0.0702703 0.01210666 0.03315291 0.20448360
## MvEn_10r 0.15499155 0.09247661 0.1275561 0.14758115 0.19277654 0.26231508
## MvEn_11r 0.30022373 0.19640993 0.1306738 0.14082585 0.16309319 0.35105917
## MvEn_7r MvEn_8r MvEn_9r MvEn_10r MvEn_11r
## MvEn_1 0.05987714 0.07838723 0.34956800 0.15499155 0.3002237
## MvEn_2 0.08146159 0.09889988 0.11606995 0.09247661 0.1964099
## MvEn_3 0.13062514 0.10935300 0.07027030 0.12755613 0.1306738
## MvEn_4 0.18066838 0.08484913 0.01210666 0.14758115 0.1408259
## MvEn_5 0.12596689 0.07685958 0.03315291 0.19277654 0.1630932
## MvEn_6 0.08172025 0.13114125 0.20448360 0.26231508 0.3510592
## MvEn_7r 1.00000000 0.26752603 0.12676569 0.17960163 0.2159479
## MvEn_8r 0.26752603 1.00000000 0.36954401 0.31415069 0.3854279
## MvEn_9r 0.12676569 0.36954401 1.00000000 0.46665383 0.5130856
## MvEn_10r 0.17960163 0.31415069 0.46665383 1.00000000 0.5049357
## MvEn_11r 0.21594791 0.38542795 0.51308560 0.50493567 1.0000000
#some strong correlations, which indicates that there are items that hang together well
colors<-dmat.color(obs)
ordered<-order.single(cor(newdataset))
cpairs(newdataset, order = ordered, panel.colors = colors)
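The same pattern can also be viewed as a correlation heat map. psych (already loaded) provides corPlot() for this; in older psych versions the equivalent function is cor.plot(). A sketch:
#heat map of the item correlations; stronger correlations get more strongly coloured cells
corPlot(cor(newdataset), numbers = TRUE)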
Parallel analysis suggests that the number of factors is 2, perhaps 3, and the scree plot suggests 2. We also look at the eigenvalues: two components have an eigenvalue greater than 1, which according to the Kaiser criterion means at least two factors.
parallel<-fa.parallel(newdataset, fm="ml",fa="fa")
## Parallel analysis suggests that the number of factors = 3 and the number of components = NA
parallel$fa.values
## [1] 3.20741698 1.31986563 0.26388954 0.11659036 -0.01813535
## [6] -0.09772622 -0.15827505 -0.21374820 -0.30943751 -0.45869718
## [11] -0.52552008
parallel2<-princomp(newdataset, cor = TRUE)
summary(parallel2)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.9416218 1.4729576 0.96520913 0.91179133
## Proportion of Variance 0.3427178 0.1972367 0.08469352 0.07557849
## Cumulative Proportion 0.3427178 0.5399545 0.62464800 0.70022650
## Comp.5 Comp.6 Comp.7 Comp.8
## Standard deviation 0.85539696 0.79420330 0.74458489 0.66526095
## Proportion of Variance 0.06651854 0.05734172 0.05040061 0.04023383
## Cumulative Proportion 0.76674504 0.82408675 0.87448736 0.91472119
## Comp.9 Comp.10 Comp.11
## Standard deviation 0.61227675 0.58401667 0.47128403
## Proportion of Variance 0.03408026 0.03100686 0.02019169
## Cumulative Proportion 0.94880144 0.97980831 1.00000000
plot(parallel2)##results show at least two factors
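With princomp(..., cor = TRUE) the eigenvalues of the correlation matrix are the squared component standard deviations, so the Kaiser criterion can be checked directly; a quick sketch:
eigenvalues <- parallel2$sdev^2   #eigenvalues of the correlation matrix
sum(eigenvalues > 1)              #number of components with an eigenvalue greater than 1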
Let's now see which items make up the individual factors, using exploratory factor analysis (EFA).
TLI and CFI range from 0 to 1, where values > .90 are considered an acceptable fit and values > .95 an excellent fit to the data.
Marsh and colleagues (2010) suggest that while no golden rule exists for RMSEA, a value < .06 reflects a reasonable fit, while an RMSEA > .10 represents a poor fit to the data.
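fa() reports TLI and RMSEA directly but not CFI. CFI = 1 - (model chi-square - model df) / (null-model chi-square - null-model df), which is what the paste() calls further down compute; the same calculation wrapped in a small helper (the name cfi_from_fa is mine, not part of psych):
#CFI from the chi-square statistics stored on a psych::fa() result
cfi_from_fa <- function(fit) {
  1 - ((fit$STATISTIC - fit$dof) / (fit$null.chisq - fit$null.dof))
}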
twofactor<-fa(newdataset, nfactors=2, rotate="oblimin", fm="ml")
## Loading required namespace: GPArotation
twofactor
## Factor Analysis using method = ml
## Call: fa(r = newdataset, nfactors = 2, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 h2 u2 com
## MvEn_1 0.45 -0.27 0.34 0.66 1.6
## MvEn_2 0.48 -0.11 0.27 0.73 1.1
## MvEn_3 0.61 -0.01 0.37 0.63 1.0
## MvEn_4 0.85 0.09 0.69 0.31 1.0
## MvEn_5 0.89 0.05 0.76 0.24 1.0
## MvEn_6 0.67 -0.22 0.58 0.42 1.2
## MvEn_7r 0.25 0.32 0.13 0.87 1.9
## MvEn_8r 0.02 0.50 0.25 0.75 1.0
## MvEn_9r 0.08 0.73 0.51 0.49 1.0
## MvEn_10r -0.07 0.61 0.40 0.60 1.0
## MvEn_11r -0.06 0.75 0.59 0.41 1.0
##
## ML1 ML2
## SS loadings 2.87 2.01
## Proportion Var 0.26 0.18
## Cumulative Var 0.26 0.44
## Proportion Explained 0.59 0.41
## Cumulative Proportion 0.59 1.00
##
## With factor correlations of
## ML1 ML2
## ML1 1.00 -0.24
## ML2 -0.24 1.00
##
## Mean item complexity = 1.2
## Test of the hypothesis that 2 factors are sufficient.
##
## The degrees of freedom for the null model are 55 and the objective function was 3.89 with Chi Square of 1298.49
## The degrees of freedom for the model are 34 and the objective function was 0.43
##
## The root mean square of the residuals (RMSR) is 0.05
## The df corrected root mean square of the residuals is 0.07
##
## The harmonic number of observations is 339 with the empirical chi square 105.47 with prob < 3e-09
## The total number of observations was 339 with Likelihood Chi Square = 142.3 with prob < 3.3e-15
##
## Tucker Lewis Index of factoring reliability = 0.859
## RMSEA index = 0.098 and the 90 % confidence intervals are 0.081 0.114
## BIC = -55.79
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## ML1 ML2
## Correlation of scores with factors 0.94 0.89
## Multiple R square of scores with factors 0.89 0.79
## Minimum correlation of possible factor scores 0.78 0.59
paste("CFI:", round(1-((twofactor$STATISTIC - twofactor$dof)/(twofactor$null.chisq- twofactor$null.dof)),4))
## [1] "CFI: 0.9129"
threefactor<-fa(newdataset, nfactors=3, rotate="oblimin", fm="ml")
threefactor
## Factor Analysis using method = ml
## Call: fa(r = newdataset, nfactors = 3, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3 h2 u2 com
## MvEn_1 0.15 -0.06 0.67 0.59 0.41 1.1
## MvEn_2 0.29 0.01 0.38 0.33 0.67 1.9
## MvEn_3 0.48 0.04 0.25 0.38 0.62 1.5
## MvEn_4 0.85 0.02 -0.02 0.71 0.29 1.0
## MvEn_5 0.89 -0.01 0.00 0.80 0.20 1.0
## MvEn_6 0.54 -0.16 0.28 0.59 0.41 1.7
## MvEn_7r 0.15 0.38 0.18 0.16 0.84 1.8
## MvEn_8r -0.05 0.56 0.12 0.28 0.72 1.1
## MvEn_9r 0.19 0.64 -0.27 0.55 0.45 1.6
## MvEn_10r -0.15 0.70 0.15 0.48 0.52 1.2
## MvEn_11r -0.03 0.71 -0.09 0.57 0.43 1.0
##
## ML1 ML2 ML3
## SS loadings 2.42 1.92 1.09
## Proportion Var 0.22 0.17 0.10
## Cumulative Var 0.22 0.39 0.49
## Proportion Explained 0.45 0.35 0.20
## Cumulative Proportion 0.45 0.80 1.00
##
## With factor correlations of
## ML1 ML2 ML3
## ML1 1.00 -0.16 0.43
## ML2 -0.16 1.00 -0.34
## ML3 0.43 -0.34 1.00
##
## Mean item complexity = 1.4
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 55 and the objective function was 3.89 with Chi Square of 1298.49
## The degrees of freedom for the model are 25 and the objective function was 0.23
##
## The root mean square of the residuals (RMSR) is 0.04
## The df corrected root mean square of the residuals is 0.05
##
## The harmonic number of observations is 339 with the empirical chi square 48.11 with prob < 0.0036
## The total number of observations was 339 with Likelihood Chi Square = 77.68 with prob < 2.6e-07
##
## Tucker Lewis Index of factoring reliability = 0.906
## RMSEA index = 0.08 and the 90 % confidence intervals are 0.059 0.099
## BIC = -67.97
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy
## ML1 ML2 ML3
## Correlation of scores with factors 0.94 0.89 0.83
## Multiple R square of scores with factors 0.89 0.79 0.69
## Minimum correlation of possible factor scores 0.78 0.59 0.39
paste("CFI:", round(1-((threefactor$STATISTIC - threefactor$dof)/(threefactor$null.chisq- threefactor$null.dof)),4))
## [1] "CFI: 0.9576"