Setup

library(pacman); p_load(lavaan, psych)

# Experiment 1

lowerExp1 <- '
1                   
0.82    1               
0.42    0.42    1           
0.5 0.6 0.4 1       
0.19    0.22    0.24    0.39    1   
0.46    0.51    0.33    0.5 0.26    1'

lowerCon1 <- '
1                   
0.73    1               
0.49    0.38    1           
0.37    0.42    0.39    1       
0.35    0.44    0.44    0.34    1   
0.45    0.37    0.41    0.57    0.36    1'

Names1 = c("MatCon", "MatDis", "LPS2GK", "LPS2NumSeq", "LPS2MenRot", "LPS2Add")

Exp1Cor = getCov(lowerExp1, names = Names1)
Con1Cor = getCov(lowerCon1, names = Names1)

Exp1SDs <- c(4.39, 4.10, 7.86, 3.65, 6.69, 6)
Con1SDs <- c(5.48, 4.83, 9.66, 3.37, 7.54, 5.98)

Exp1Cov = lavaan::cor2cov(Exp1Cor, Exp1SDs)
Con1Cov = lavaan::cor2cov(Con1Cor, Con1SDs)

Exp1Means <- c(13.59, 14.16, 34.59, 20.59, 20.98, 14.46)
Con1Means <- c(7.21, 9.07, 35.91, 20.89, 22.95, 14.79)

ExpCon1N <- 56

# Experiment 2

lowerExp2 <- '
1               
0.45    1           
0.37    0.8 1       
0.5 0.67    0.44    1   
0.22    0.73    0.24    0.32    1'

lowerCon2 <- '
1               
0.38    1           
0.23    0.76    1       
0.46    0.57    0.22    1   
0.24    0.81    0.31    0.35    1'

Names2 = c("FigMat", "LPS2Sum", "LPS2GK", "LPS2NumSeq", "LPS2MenRot")

Exp2Cor = getCov(lowerExp2, names = Names2)
Con2Cor = getCov(lowerCon2, names = Names2)

Exp2SDs <- c(6.02, 15.79, 8.87, 4.1, 8.12)
Con2SDs <- c(7.46, 14.53, 7.77, 3.77, 8.03)

Exp2Cov = lavaan::cor2cov(Exp2Cor, Exp2SDs)
Con2Cov = lavaan::cor2cov(Con2Cor, Con2SDs)

Exp2Means <- c(17.63, 79.87, 36.04, 21.57, 22.26)
Con2Means <- c(9.43, 79.22, 35.73, 21.37, 22.12)

Exp2N <- 114
Con2N <- 115

# Consolidate

Exp1Covs <- list(Exp1Cov, Con1Cov)
Exp2Covs <- list(Exp2Cov, Con2Cov)

Exp1Means <- list(Exp1Means, Con1Means)
Exp2Means <- list(Exp2Means, Con2Means)

Exp1Ns <- list(ExpCon1N, ExpCon1N)
Exp2Ns <- list(Exp2N, Con2N)

# Fit Measures

FITM <- c("chisq", "df", "npar", "cfi", "rmsea", "rmsea.ci.lower", "rmsea.ci.upper", "aic", "bic")

# Bias Metrics

# SDI2/UDI2 effect sizes for measurement bias, computed by numerical integration
# over the group-2 latent distribution. Group 2 supplies the reference item SDs
# (stdev2) and the latent mean/SD (fmean2, fsd2) used to weight the differences.
SDI2.UDI2 = function(p, loading1, loading2, intercept1, intercept2, stdev2, fmean2, fsd2, nodewidth = 0.01, lowerLV = -5, upperLV = 5) {
  LV = seq(lowerLV, upperLV, nodewidth)   # quadrature grid for the latent variable
  DiffExpItem12 = matrix(NA, length(LV), p)
  pdfLV2 = matrix(NA, length(LV), 1)
  SDI2numerator = matrix(NA, length(LV), p)
  UDI2numerator = matrix(NA, length(LV), p)
  SDI2 = matrix(NA, p, 1)
  UDI2 = matrix(NA, p, 1)
  for(j in 1:p){
    for(k in 1:length(LV)){
      # Expected item-score difference between the groups at this latent value
      DiffExpItem12[k,j] <- (intercept1[j]-intercept2[j]) + (loading1[j]-loading2[j])*LV[k]
      pdfLV2[k] = dnorm(LV[k], mean=fmean2, sd=fsd2)
      SDI2numerator[k,j] = DiffExpItem12[k,j]*pdfLV2[k]*nodewidth
      UDI2numerator[k,j] = abs(SDI2numerator[k,j])
    }
    # Scale by the group-2 observed SD to put the index on a d-like metric
    SDI2[j] <- sum(SDI2numerator[,j])/stdev2[j]
    UDI2[j] <- sum(UDI2numerator[,j])/stdev2[j]
  }
  colnames(SDI2) = "SDI2"
  colnames(UDI2) = "UDI2"
  return(list(SDI2=round(SDI2,4), UDI2=round(UDI2,4)))}

# SUDI: attach the sign of SDI2 to UDI2, so magnitude (with no cancellation) and
# direction can be read from a single number
SUDI <- function(SDI, UDI){
  D <- ifelse(SDI < 0, -1, 1)
  H = D * UDI
  return(H)}
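
In equation form, what SDI2.UDI2 approximates by numerical integration (reconstructed directly from the code above, with $\nu$ the intercepts, $\lambda$ the loadings, $\sigma_{2j}$ the observed SD of item $j$ in group 2, and $\phi$ the group-2 latent density) is

$$\mathrm{SDI2}_j = \frac{1}{\sigma_{2j}} \int \left[(\nu_{1j} - \nu_{2j}) + (\lambda_{1j} - \lambda_{2j})\eta\right] \phi(\eta)\, d\eta, \qquad \mathrm{UDI2}_j = \frac{1}{\sigma_{2j}} \int \left|(\nu_{1j} - \nu_{2j}) + (\lambda_{1j} - \lambda_{2j})\eta\right| \phi(\eta)\, d\eta,$$

so SDI2 lets bias in opposite directions across the latent range cancel, while UDI2 does not.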

# Cohen's d Function

Cohensd <- function(M1, M2, SD1, SD2, N1 = 1, N2 = 1, rnd = 3){
  SDP  = sqrt((SD1^2 + SD2^2)/2)                                    # simple pooled SD
  SDPW = sqrt((((N1 - 1) * SD1^2) + ((N2 - 1) * SD2^2))/(N1 + N2))  # n-weighted pooled SD (divides by N1 + N2, not the usual N1 + N2 - 2)
  d = (M2 - M1)/SDP
  delta = (M2 - M1)/SD1   # Glass' delta: group 1 SD as the reference
  g = (M2 - M1)/SDPW
  if (N1 <= 1 && N2 <= 1) {   # no sample sizes supplied: report d and delta only
    cat(paste0("With group means of ", M1, " and ", M2, " with SDs of ", SD1, " and ", SD2,
               ", Cohen's d is ", round(d, rnd), " Glass' Delta is ", round(delta, rnd), ". \n"))
  } else {
    cat(paste0("With group means of ", M1, " and ", M2, " with SDs of ", SD1, " and ", SD2,
               " Cohen's d is ", round(d, rnd), " Glass' Delta is ", round(delta, rnd),
               " and Hedge's g is ", round(g, rnd), ". \n"))}}

Rationale

Schneider et al. (2020) performed two experiments in which they taught samples the rules underlying figural matrices, and their test scores rose considerably. In both the experimental and control groups, figural matrix performance was related to intelligence before and after the experiment; the experimental groups' scores were simply boosted by more than one Cohen's d.

Experimental manipulations that raise scores are especially interesting for theory: an experiment that genuinely boosts intelligence should leave the measurement model invariant, whereas one that merely changes measurement properties will bias a model's loadings, intercepts, and/or residual variances. If an intervention raises a score heterogeneously, by changing how examinees interpret an item, it should change the loadings, altering the item's scaling with respect to intelligence between control and intervention groups. If an intervention raises a score homogeneously, it will shift intercepts up. In the former case, but not always the latter, the residual variance should also change, since the intervention introduces additional forces acting on and differentiating people within the intervention group. Residual variance changes could also reflect effects on item reliability, but they are harder to detect than intercept changes for the same reason that mean differences require smaller samples to detect than variance differences. I've simulated this for variance ratios in general here: https://rpubs.com/JLLJ/SDVR.
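
To make the distinction concrete, here is a minimal simulation sketch (my own illustration, not data or code from Schneider et al.; all numbers are made up) of how a homogeneous boost shows up in an item's intercept while an ability-dependent boost shows up in its loading:

set.seed(1)
n <- 5000
ability <- rnorm(n)                                     # latent ability
item_control <- 10 + 2.0 * ability + rnorm(n, sd = 2)   # item without training
item_homog   <- 15 + 2.0 * ability + rnorm(n, sd = 2)   # everyone gains ~5 points: intercept shifts
item_heterog <- 10 + 3.5 * ability + rnorm(n, sd = 2)   # gains depend on ability: loading changes

round(rbind(Control       = coef(lm(item_control ~ ability)),
            Homogeneous   = coef(lm(item_homog   ~ ability)),
            Heterogeneous = coef(lm(item_heterog ~ ability))), 2)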

All that said, it could be useful to see whether Schneider et al.'s intervention really altered intelligence or merely caused measurement bias when it boosted scores. Presumably, the answer is not a change to intelligence, since the non-matrix tests were not altered.

Analysis

Initial Model Fits

Exp1Mod <- '
g =~ MatCon + MatDis + LPS2GK + LPS2NumSeq + LPS2MenRot + LPS2Add'

Exp1InitFit <- cfa(Exp1Mod, 
                   sample.cov = Exp1Cov, 
                   sample.mean = Exp1Means, 
                   sample.nobs = ExpCon1N, 
                   std.lv = T)

Con1InitFit <- cfa(Exp1Mod, 
                   sample.cov = Con1Cov, 
                   sample.mean = Con1Means, 
                   sample.nobs = ExpCon1N, 
                   std.lv = T)

fitMeasures(Exp1InitFit, FITM)
##          chisq             df           npar            cfi          rmsea 
##         12.518          9.000         18.000          0.971          0.084 
## rmsea.ci.lower rmsea.ci.upper            aic            bic 
##          0.000          0.184       1974.637       2011.093
fitMeasures(Con1InitFit, FITM)
##          chisq             df           npar            cfi          rmsea 
##         22.346          9.000         18.000          0.876          0.163 
## rmsea.ci.lower rmsea.ci.upper            aic            bic 
##          0.079          0.249       2066.590       2103.047
Exp2Mod <- '
g =~ FigMat + LPS2GK + LPS2NumSeq + LPS2MenRot'

Exp2InitFit <- cfa(Exp2Mod, 
                   sample.cov = Exp2Cov, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2N, 
                   std.lv = T)

Con2InitFit <- cfa(Exp2Mod, 
                   sample.cov = Con2Cov, 
                   sample.mean = Con2Means, 
                   sample.nobs = Con2N, 
                   std.lv = T)

fitMeasures(Exp2InitFit, FITM)
##          chisq             df           npar            cfi          rmsea 
##          0.340          2.000         12.000          1.000          0.000 
## rmsea.ci.lower rmsea.ci.upper            aic            bic 
##          0.000          0.104       2944.704       2977.539
fitMeasures(Con2InitFit, FITM)
##          chisq             df           npar            cfi          rmsea 
##          4.571          2.000         12.000          0.951          0.106 
## rmsea.ci.lower rmsea.ci.upper            aic            bic 
##          0.000          0.237       2989.309       3022.248

I cannot explain why the initial fit is worse for the control group in both cases, but it is consistent with sampling error in these small samples, so it should not be overinterpreted.

Multigroup Fits

Experiment 1

Exp1Config <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = T)

Exp1Metric <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = F,
                  group.equal = "loadings")

Exp1Scalar <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts"))

Exp1ScalarP <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts"),
                  group.partial = c("MatCon ~ 1", "MatDis ~ 1"))

Exp1Strict <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts", "residuals"),
                  group.partial = c("MatCon ~ 1", "MatDis ~ 1"))

Exp1StrictP <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts", "residuals"),
                  group.partial = c("MatCon ~ 1", "MatDis ~ 1",
                                    "MatDis ~~ MatDis")) 

#Freeing MatCon ~~ MatCon alone does not produce an acceptable model, but freeing it along with MatDis ~~ MatDis actually makes the model fit better than the configural one (!). Because I did not want to overfit, I only freed what was sufficient.
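
One way to see which equality constraints are responsible for misfit, rather than relying on trial and error, is lavaan's univariate score test on the fully constrained model (a diagnostic sketch; not necessarily how the freed parameters above were chosen):

# One row per equality constraint in the strict model; large chi-squares flag
# constraints worth freeing.
lavTestScore(Exp1Strict)$uni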

Exp1LVars <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts", "residuals", "lv.variances"),
                  group.partial = c("MatCon ~ 1", "MatDis ~ 1",
                                    "MatDis ~~ MatDis"))

# Distinct name so the Exp1Means list above is not overwritten
Exp1MeansFit <- cfa(Exp1Mod, 
                   sample.cov = Exp1Covs, 
                   sample.mean = Exp1Means, 
                   sample.nobs = Exp1Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts", "residuals", "lv.variances", "means"),
                  group.partial = c("MatCon ~ 1", "MatDis ~ 1",
                                    "MatDis ~~ MatDis"))

round(cbind(Configural       = fitMeasures(Exp1Config, FITM),
            Metric           = fitMeasures(Exp1Metric, FITM),
            Scalar           = fitMeasures(Exp1Scalar, FITM),
            "Partial Scalar" = fitMeasures(Exp1ScalarP, FITM),
            Strict           = fitMeasures(Exp1Strict, FITM),
            "Partial Strict" = fitMeasures(Exp1StrictP, FITM),
            Variances        = fitMeasures(Exp1LVars, FITM),
            Means            = fitMeasures(Exp1MeansFit, FITM)), 3)
##                Configural   Metric   Scalar Partial Scalar   Strict
## chisq              34.865   40.156  103.483         41.686   54.317
## df                 18.000   23.000   28.000         26.000   32.000
## npar               36.000   31.000   26.000         28.000   22.000
## cfi                 0.926    0.925    0.668          0.931    0.902
## rmsea               0.129    0.115    0.219          0.104    0.112
## rmsea.ci.lower      0.062    0.051    0.175          0.036    0.057
## rmsea.ci.upper      0.193    0.174    0.265          0.160    0.161
## aic              4041.227 4036.518 4089.846       4032.049 4032.680
## bic              4139.093 4120.792 4160.527       4108.167 4092.487
##                Partial Strict Variances    Means
## chisq                  46.938    47.514   48.440
## df                     31.000    32.000   33.000
## npar                   23.000    22.000   21.000
## cfi                     0.930     0.932    0.932
## rmsea                   0.096     0.093    0.091
## rmsea.ci.lower          0.028     0.023    0.020
## rmsea.ci.upper          0.149     0.146    0.144
## aic                  4027.301  4025.877 4024.802
## bic                  4089.826  4085.684 4081.891
pchisq(40.156 - 34.865, 5, lower.tail = F) #Configural to Metric. Pass.
## [1] 0.3814078
pchisq(103.483 - 40.156, 5, lower.tail = F) #Metric to Scalar. Abysmal fail.
## [1] 2.490724e-12
pchisq(41.686 - 40.156, 3, lower.tail = F) #Metric to Partial Scalar. Pass.
## [1] 0.6753639
pchisq(54.317 - 41.686, 6, lower.tail = F) #Partial Scalar to Strict. Fail.
## [1] 0.04928463
pchisq(46.938 - 41.686, 5, lower.tail = F) #Partial Scalar to Partial Strict. Pass
## [1] 0.3859062
pchisq(47.514 - 46.938, 1, lower.tail = F) #Partial Strict to Latent Variances. Pass
## [1] 0.4478845
pchisq(48.440 - 47.514, 1, lower.tail = F) #Latent Variances to Means. Pass
## [1] 0.3359045
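
The same chain of comparisons for the retained (partial) models can be obtained in a single call (a convenience sketch; with plain ML these are the same chi-square differences as the manual pchisq calls above):

lavTestLRT(Exp1Config, Exp1Metric, Exp1ScalarP, Exp1StrictP, Exp1LVars, Exp1MeansFit)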

How much bias was detected? The control group will be the SD anchor.

summary(Exp1MeansFit, stand = T, fit = T)
## lavaan 0.6.14 ended normally after 50 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        36
##   Number of equality constraints                    15
## 
##   Number of observations per group:                   
##     Group 1                                         56
##     Group 2                                         56
## 
## Model Test User Model:
##                                                       
##   Test statistic                                48.440
##   Degrees of freedom                                33
##   P-value (Chi-square)                           0.041
##   Test statistic for each group:
##     Group 1                                     19.889
##     Group 2                                     28.551
## 
## Model Test Baseline Model:
## 
##   Test statistic                               257.333
##   Degrees of freedom                                30
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.932
##   Tucker-Lewis Index (TLI)                       0.938
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -1991.401
##   Loglikelihood unrestricted model (H1)      -1967.181
##                                                       
##   Akaike (AIC)                                4024.802
##   Bayesian (BIC)                              4081.891
##   Sample-size adjusted Bayesian (SABIC)       4015.523
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.091
##   90 Percent confidence interval - lower         0.020
##   90 Percent confidence interval - upper         0.144
##   P-value H_0: RMSEA <= 0.050                    0.123
##   P-value H_0: RMSEA >= 0.080                    0.658
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.121
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## 
## Group 1 [Group 1]:
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     MatCon            1.000                               4.211    0.854
##     MatDis  (.p2.)    0.932    0.088   10.535    0.000    3.924    0.938
##     LPS2GK  (.p3.)    1.113    0.192    5.812    0.000    4.689    0.536
##     LPS2NmS (.p4.)    0.492    0.075    6.583    0.000    2.071    0.594
##     LPS2MnR (.p5.)    0.684    0.162    4.221    0.000    2.880    0.404
##     LPS2Add (.p6.)    0.799    0.129    6.211    0.000    3.364    0.567
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .MatCon           13.889    0.580   23.957    0.000   13.889    2.818
##    .MatDis           14.439    0.477   30.261    0.000   14.439    3.451
##    .LPS2GK  (.16.)   35.250    0.827   42.624    0.000   35.250    4.028
##    .LPS2NmS (.17.)   20.740    0.329   62.991    0.000   20.740    5.952
##    .LPS2MnR (.18.)   21.965    0.674   32.593    0.000   21.965    3.080
##    .LPS2Add (.19.)   14.625    0.561   26.063    0.000   14.625    2.463
##     g                 0.000                               0.000    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .MatCon  (.p7.)    6.565    1.456    4.508    0.000    6.565    0.270
##    .MatDis            2.106    1.149    1.834    0.067    2.106    0.120
##    .LPS2GK  (.p9.)   54.615    7.682    7.110    0.000   54.615    0.713
##    .LPS2NmS (.10.)    7.853    1.126    6.972    0.000    7.853    0.647
##    .LPS2MnR (.11.)   42.574    5.829    7.305    0.000   42.574    0.837
##    .LPS2Add (.12.)   23.948    3.400    7.043    0.000   23.948    0.679
##     g       (.13.)   17.736    3.336    5.317    0.000    1.000    1.000
## 
## 
## Group 2 [Group 2]:
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     MatCon            1.000                               4.211    0.854
##     MatDis  (.p2.)    0.932    0.088   10.535    0.000    3.924    0.814
##     LPS2GK  (.p3.)    1.113    0.192    5.812    0.000    4.689    0.536
##     LPS2NmS (.p4.)    0.492    0.075    6.583    0.000    2.071    0.594
##     LPS2MnR (.p5.)    0.684    0.162    4.221    0.000    2.880    0.404
##     LPS2Add (.p6.)    0.799    0.129    6.211    0.000    3.364    0.567
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .MatCon            6.911    0.580   11.920    0.000    6.911    1.402
##    .MatDis            8.791    0.575   15.296    0.000    8.791    1.823
##    .LPS2GK  (.16.)   35.250    0.827   42.624    0.000   35.250    4.028
##    .LPS2NmS (.17.)   20.740    0.329   62.991    0.000   20.740    5.952
##    .LPS2MnR (.18.)   21.965    0.674   32.593    0.000   21.965    3.080
##    .LPS2Add (.19.)   14.625    0.561   26.063    0.000   14.625    2.463
##     g                 0.000                               0.000    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .MatCon  (.p7.)    6.565    1.456    4.508    0.000    6.565    0.270
##    .MatDis            7.853    2.134    3.681    0.000    7.853    0.338
##    .LPS2GK  (.p9.)   54.615    7.682    7.110    0.000   54.615    0.713
##    .LPS2NmS (.10.)    7.853    1.126    6.972    0.000    7.853    0.647
##    .LPS2MnR (.11.)   42.574    5.829    7.305    0.000   42.574    0.837
##    .LPS2Add (.12.)   23.948    3.400    7.043    0.000   23.948    0.679
##     g       (.13.)   17.736    3.336    5.317    0.000    1.000    1.000
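
The intercepts plugged in below were transcribed from the summary above; they can also be pulled from the fitted object programmatically (a sketch using lavInspect; group 1 is the experimental group and group 2 the control group, matching the order of Exp1Covs):

EST <- lavInspect(Exp1MeansFit, "est")   # per-group parameter matrices
EST[[1]]$nu; EST[[2]]$nu                 # item intercepts by group
EST[[1]]$lambda; EST[[2]]$lambda         # loadings (constrained equal here)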
Exp1p = 6
Exp1l1 = c(0, 0, 0, 0, 0, 0); Exp1l2 = c(0, 0, 0, 0, 0, 0) # No loading differences
Exp1i1 = c(6.911, 8.791, 0, 0, 0, 0); Exp1i2 = c(13.889, 14.439, 0, 0, 0, 0) # Matrix test intercept differences
Exp1sd = c(5.48, 4.83, 9.66, 3.37, 7.54, 5.98)
Exp1fm = 0
Exp1fsd = 1

Exp1ES <- SDI2.UDI2(Exp1p, Exp1l1, Exp1l2, Exp1i1, Exp1i2, Exp1sd, Exp1fm, Exp1fsd)

SUDI(Exp1ES$SDI2, Exp1ES$UDI2)
##         SDI2
## [1,] -1.2734
## [2,] -1.1694
## [3,]  0.0000
## [4,]  0.0000
## [5,]  0.0000
## [6,]  0.0000

The bias induced in favor of the experimental group can be thought of as akin to Hedges' g, given where the summary statistics involved in its calculation come from (i.e., the SEM). Plainly, the experimental group received intercepts 1.27 and 1.17 g higher than expected given their latent ability levels. This compares to the observed total score gaps:

Cohensd(13.59, 7.21, 4.39, 5.48, 56, 56)
## With group means of 13.59 and 7.21 with SDs of 4.39 and 5.48 Cohen's d is -1.285 Glass' Delta is -1.453 and Hedge's g is -1.297.
Cohensd(14.16, 9.07, 4.10, 4.83, 56, 56)
## With group means of 14.16 and 9.07 with SDs of 4.1 and 4.83 Cohen's d is -1.136 Glass' Delta is -1.241 and Hedge's g is -1.146.
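
As a rough arithmetic check on the magnitudes above, the intercept bias accounts for essentially all of each observed gap:

round(c(MatCon = 1.2734 / 1.297, MatDis = 1.1694 / 1.146), 2)   # roughly 0.98 and 1.02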

Randomization probably worked for this experimental comparison. Great job, Schneider et al. (2020)!

Now onto the larger experiment.

Experiment 2

Exp2Config <- cfa(Exp2Mod, 
                   sample.cov = Exp2Covs, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2Ns, 
                   std.lv = T)

Exp2Metric <- cfa(Exp2Mod, 
                   sample.cov = Exp2Covs, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2Ns, 
                   std.lv = F,
                  group.equal = "loadings")

Exp2Scalar <- cfa(Exp2Mod, 
                   sample.cov = Exp2Covs, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts"))
## Warning in lav_object_post_check(object): lavaan WARNING: some estimated ov
## variances are negative
Exp2ScalarP <- cfa(Exp2Mod, 
                   sample.cov = Exp2Covs, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts"),
                  group.partial = "FigMat ~ 1")

Exp2Strict <- cfa(Exp2Mod, 
                   sample.cov = Exp2Covs, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts", "residuals"),
                  group.partial = "FigMat ~ 1")

Exp2LVars <- cfa(Exp2Mod, 
                   sample.cov = Exp2Covs, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts", "residuals", "lv.variances"),
                  group.partial = "FigMat ~ 1")

# Distinct name so the Exp2Means list above is not overwritten
Exp2MeansFit <- cfa(Exp2Mod, 
                   sample.cov = Exp2Covs, 
                   sample.mean = Exp2Means, 
                   sample.nobs = Exp2Ns, 
                   std.lv = F,
                  group.equal = c("loadings", "intercepts", "residuals", "lv.variances", "means"),
                  group.partial = "FigMat ~ 1")

round(cbind(Configural       = fitMeasures(Exp2Config, FITM),
            Metric           = fitMeasures(Exp2Metric, FITM),
            Scalar           = fitMeasures(Exp2Scalar, FITM),
            "Partial Scalar" = fitMeasures(Exp2ScalarP, FITM),
            Strict           = fitMeasures(Exp2Strict, FITM),
            Variances        = fitMeasures(Exp2LVars, FITM),
            Means            = fitMeasures(Exp2MeansFit, FITM)), 3)
##                Configural   Metric   Scalar Partial Scalar   Strict Variances
## chisq               4.911    9.619   61.008          9.632   17.320    17.610
## df                  4.000    7.000   10.000          9.000   13.000    14.000
## npar               24.000   21.000   18.000         19.000   15.000    14.000
## cfi                 0.993    0.979    0.584          0.995    0.965     0.971
## rmsea               0.045    0.057    0.211          0.025    0.054     0.047
## rmsea.ci.lower      0.000    0.000    0.162          0.000    0.000     0.000
## rmsea.ci.upper      0.154    0.137    0.263          0.110    0.114     0.108
## aic              5934.014 5932.722 5978.111       5928.735 5928.423  5926.713
## bic              6016.423 6004.830 6039.918       5993.976 5979.928  5974.786
##                   Means
## chisq            17.771
## df               15.000
## npar             13.000
## cfi               0.977
## rmsea             0.040
## rmsea.ci.lower    0.000
## rmsea.ci.upper    0.101
## aic            5924.874
## bic            5969.512
pchisq(9.619 - 4.911, 3, lower.tail = F) #Configural to Metric. Pass.
## [1] 0.1944707

This is a very dubious “pass” that I assume was down to low power. The CFI change is bad (a drop from 0.993 to 0.979, beyond the conventional 0.01), the RMSEA increase is borderline (but not significantly so), the AIC and BIC both improve, and the p-value is of course fine.

pchisq(61.008 - 9.619, 3, lower.tail = F) #Metric to Scalar. Abysmal fail.
## [1] 4.042153e-11
pchisq(9.632 - 9.619, 2, lower.tail = F) #Metric to Partial Scalar. Pass.
## [1] 0.9935211
pchisq(17.320 - 9.632, 4, lower.tail = F) #Partial Scalar to Strict. Pass.
## [1] 0.1036994

This is another very dubious “pass” that I also assume was down to low power. The CFI change is horrible (a drop from 0.995 to 0.965), the RMSEA increase is also large (but not significantly so), the AIC and BIC both improve, and the p-value is of course fine.

pchisq(17.610 - 17.320, 1, lower.tail = F) #Strict to Latent Variances. Pass
## [1] 0.5902205
pchisq(17.771 - 17.610, 1, lower.tail = F) #Latent Variances to Means. Pass
## [1] 0.6882375

Now to quantify the bias.

summary(Exp2MeansFit, stand = T, fit = T)
## lavaan 0.6.14 ended normally after 55 iterations
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                        24
##   Number of equality constraints                    11
## 
##   Number of observations per group:                   
##     Group 1                                        114
##     Group 2                                        115
## 
## Model Test User Model:
##                                                       
##   Test statistic                                17.771
##   Degrees of freedom                                15
##   P-value (Chi-square)                           0.275
##   Test statistic for each group:
##     Group 1                                      7.321
##     Group 2                                     10.450
## 
## Model Test Baseline Model:
## 
##   Test statistic                               134.619
##   Degrees of freedom                                12
##   P-value                                        0.000
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.977
##   Tucker-Lewis Index (TLI)                       0.982
## 
## Loglikelihood and Information Criteria:
## 
##   Loglikelihood user model (H0)              -2949.437
##   Loglikelihood unrestricted model (H1)      -2940.551
##                                                       
##   Akaike (AIC)                                5924.874
##   Bayesian (BIC)                              5969.512
##   Sample-size adjusted Bayesian (SABIC)       5928.310
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.040
##   90 Percent confidence interval - lower         0.000
##   90 Percent confidence interval - upper         0.101
##   P-value H_0: RMSEA <= 0.050                    0.544
##   P-value H_0: RMSEA >= 0.080                    0.168
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.090
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## 
## Group 1 [Group 1]:
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     FigMat            1.000                               4.123    0.611
##     LPS2GK  (.p2.)    0.962    0.182    5.285    0.000    3.967    0.478
##     LPS2NmS (.p3.)    0.719    0.124    5.801    0.000    2.964    0.756
##     LPS2MnR (.p4.)    0.861    0.173    4.981    0.000    3.550    0.442
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .FigMat           17.542    0.593   29.601    0.000   17.542    2.598
##    .LPS2GK  (.11.)   35.884    0.549   65.420    0.000   35.884    4.323
##    .LPS2NmS (.12.)   21.470    0.259   82.843    0.000   21.470    5.474
##    .LPS2MnR (.13.)   22.190    0.531   41.766    0.000   22.190    2.760
##     g                 0.000                               0.000    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .FigMat  (.p5.)   28.600    3.798    7.529    0.000   28.600    0.627
##    .LPS2GK  (.p6.)   53.168    5.722    9.291    0.000   53.168    0.772
##    .LPS2NmS (.p7.)    6.597    1.475    4.472    0.000    6.597    0.429
##    .LPS2MnR (.p8.)   52.035    5.439    9.568    0.000   52.035    0.805
##     g       (.p9.)   16.995    4.278    3.973    0.000    1.000    1.000
## 
## 
## Group 2 [Group 2]:
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##   g =~                                                                  
##     FigMat            1.000                               4.123    0.611
##     LPS2GK  (.p2.)    0.962    0.182    5.285    0.000    3.967    0.478
##     LPS2NmS (.p3.)    0.719    0.124    5.801    0.000    2.964    0.756
##     LPS2MnR (.p4.)    0.861    0.173    4.981    0.000    3.550    0.442
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .FigMat            9.518    0.590   16.121    0.000    9.518    1.410
##    .LPS2GK  (.11.)   35.884    0.549   65.420    0.000   35.884    4.323
##    .LPS2NmS (.12.)   21.470    0.259   82.843    0.000   21.470    5.474
##    .LPS2MnR (.13.)   22.190    0.531   41.766    0.000   22.190    2.760
##     g                 0.000                               0.000    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)   Std.lv  Std.all
##    .FigMat  (.p5.)   28.600    3.798    7.529    0.000   28.600    0.627
##    .LPS2GK  (.p6.)   53.168    5.722    9.291    0.000   53.168    0.772
##    .LPS2NmS (.p7.)    6.597    1.475    4.472    0.000    6.597    0.429
##    .LPS2MnR (.p8.)   52.035    5.439    9.568    0.000   52.035    0.805
##     g       (.p9.)   16.995    4.278    3.973    0.000    1.000    1.000
Exp2p = 4
Exp2l1 = c(0, 0, 0, 0); Exp2l2 = c(0, 0, 0, 0) # No loading differences
Exp2i1 = c(9.518, 0, 0, 0); Exp2i2 = c(17.542, 0, 0, 0) # Matrix test intercept differences
Exp2sd = c(7.46, 7.77, 3.77, 8.03)
Exp2fm = 0
Exp2fsd = 1

Exp2ES <- SDI2.UDI2(Exp2p, Exp2l1, Exp2l2, Exp2i1, Exp2i2, Exp2sd, Exp2fm, Exp2fsd)

SUDI(Exp2ES$SDI2, Exp2ES$UDI2)
##         SDI2
## [1,] -1.0756
## [2,]  0.0000
## [3,]  0.0000
## [4,]  0.0000
Cohensd(17.63, 9.43, 6.02, 7.46, 114, 115)
## With group means of 17.63 and 9.43 with SDs of 6.02 and 7.46 Cohen's d is -1.21 Glass' Delta is -1.362 and Hedge's g is -1.214.

Again, almost all of the gap is explained by measurement bias. What a consistent finding.

Discussion

Everything came out as expected if measurement invariance testing and bias quantification work: the experimental groups became better at matrix tasks, but not at other tasks, and their gains on the matrix tasks were almost entirely attributable to bias in the intercepts, with the residual gaps small enough to be dismissed or attributed to little more than random variation and modest power.

The first experiment's model, but not the second's, significantly supported differences in the residual variances. The first model had more power to detect such differences, and the second model's estimates also looked consistent with residual variance differences; the significance tests simply failed to bear them out. Many factors influence statistical power, and this is a good example of how the number of indicators can sometimes matter more than the sample size.

This experiment provided good data for supplementing the paper detailed in https://rpubs.com/JLLJ/PMI22. That paper featured a different questionnaire and the finding that bias was detected where expected. The present analysis shows that an intervention with a mostly or completely homogeneous effect in a sample also produces bias that can be discovered where expected (i.e., in the intercepts) and quantified accurately.

Two related studies, one described clearly and the other much less so, also support the idea that interventions or formatting differences are detectable by assessing measurement invariance. The first, and better documented, of the two is Arendasy & Sommer (2013). This study used a higher-order model with the group factors Crystallized, Fluid, and Quantitative intelligence, and showed that variations in the rules involved in figural matrices tasks resulted in measurement non-invariance in the loadings, intercepts, and residual variances of the figural matrices items alone.

[The] weak measurement invariance model exhibited a significantly worse model fit than the configural invariance model. To test whether the misfit can be attributed to differences in the Gf/g-saturation of the figural matrices test between the latent classes, we allowed the factor loading of the figural matrices test on Gf to be estimated separately for both latent classes. This partial weak measurement invariance model exhibited a good fit and fitted no worse than the configural invariance model. [… The] Gf/g-saturation of the figural matrices test was lower in the latent class assumed to use response elimination more often to solve figural matrices items. […] Next, we restricted the intercepts and error variances of our cognitive ability tests to be equal across both latent classes. Unfortunately, these models failed to fit the data no worse than their less restrictive precursor. However, allowing the intercept and error variance of the figural matrices test to be estimated separately for the two latent classes resolved the problem.

Becker et al. (2016) did something similar and found essentially the same result, concluding, like Arendasy & Sommer (2013), that preventing response elimination strategies improves the validity of figural matrices tests. Unlike Arendasy & Sommer (2013), their model seems to have involved correlated group factors, with intelligence mistreated as if it were not a higher-order factor. They fit three models, one of which is difficult to interpret.

The first, hard-to-interpret model grouped the matrices tests into a factor that covaried with a working memory factor, and it seemed to show evidence for up to strict invariance despite the rule differences. Unfortunately, strict invariance is not really tested in a correlated group factor model without also testing the equality of the latent covariances, a test usually treated as structural: things like measurement error can reside there (and may have driven their results) when every other measurement parameter is constrained. As Millsap & Olivera-Aguilar (2012) implied, the correlated group factor model can turn some parameters from structural into measurement ones. The same interpretive issue applies to higher-order factor loadings.

In the models in which the matrices factor was covaried with intelligence, or with both intelligence and working memory, measurement invariance was rejected at every level. Unfortunately, without their raw data, it is uncertain what really went on here. Intelligence was obviously misidentified if it did not include both working memory and matrices as indicators. Ideally, their data could be reacquired and modeled properly, but in any case, their results seem at least largely, if not totally, consistent with those of Arendasy & Sommer (2013) and the results presented here.

Dr. Becker has been contacted for his data and this post will be updated with results in the post-script if I receive it.

References

Schneider, B., Becker, N., Krieger, F., Spinath, F. M., & Sparfeldt, J. R. (2020). Teaching the underlying rules of figural matrices in a short video increases test scores. Intelligence, 82, 101473. https://doi.org/10.1016/j.intell.2020.101473

Arendasy, M. E., & Sommer, M. (2013). Reducing response elimination strategies enhances the construct validity of figural matrices. Intelligence, 41(4), 234–243. https://doi.org/10.1016/j.intell.2013.03.006

Becker, N., Schmitz, F., Falk, A. M., Feldbrügge, J., Recktenwald, D. R., Wilhelm, O., Preckel, F., & Spinath, F. M. (2016). Preventing Response Elimination Strategies Improves the Convergent Validity of Figural Matrices. Journal of Intelligence, 4(1), Article 1. https://doi.org/10.3390/jintelligence4010002

Millsap, R. E., & Olivera-Aguilar, M. (2012). Investigating measurement invariance using confirmatory factor analysis. In R. H. Hoyle (Ed.), Handbook of structural equation modeling (pp. 380–392). New York, NY: Guilford Press.

sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] psych_2.2.9   lavaan_0.6-14 pacman_0.5.1 
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.14 knitr_1.42      MASS_7.3-58.1   mnormt_2.1.1   
##  [5] pbivnorm_0.6.0  lattice_0.20-45 R6_2.5.1        rlang_1.0.6    
##  [9] quadprog_1.5-8  fastmap_1.1.0   tools_4.2.2     parallel_4.2.2 
## [13] grid_4.2.2      nlme_3.1-160    xfun_0.37       cli_3.6.0      
## [17] jquerylib_0.1.4 htmltools_0.5.4 yaml_2.3.7      digest_0.6.31  
## [21] sass_0.4.5      cachem_1.0.6    evaluate_0.20   rmarkdown_2.20 
## [25] compiler_4.2.2  bslib_0.4.2     stats4_4.2.2    jsonlite_1.8.4