# Load packages, installing any that are missing
library(pacman)
p_load(lavaan, dplyr, knitr)
# Subtest names: the WISC-R (first 11) followed by the K-ABC (remaining 13)
Names = c("I", "S", "A", "V", "C", "DS", "PC", "PA", "BD", "OA", "CO", "HM", "GC", "NR", "T", "WO", "MA", "MS", "PS", "FP", "AR", "R", "RD", "RU")
lowerw = '
1
0.6 1
0.37 0.37 1
0.71 0.69 0.38 1
0.59 0.51 0.25 0.63 1
0.2 0.23 0.52 0.11 0.1 1
0.29 0.38 0.09 0.28 0.34 0.14 1
0.24 0.17 0.23 0.28 0.11 0.14 0.18 1
0.38 0.41 0.34 0.56 0.23 0.25 0.43 0.4 1
0.29 0.37 0.24 0.45 0.24 0.23 0.44 0.29 0.55 1
0.03 0.15 0.1 0.05 0.05 0.14 0.15 0.22 0.16 0.13 1
0.19 0.36 0.45 0.26 0.31 0.4 0.16 0.05 0.21 0.25 0.13 1
0.39 0.31 0 0.28 0.3 0.11 0.37 0.13 0.36 0.26 0.14 0 1
0.12 0.12 0.29 0.13 0.11 0.63 0.13 0.14 0.3 0.18 0 0.35 -0.08 1
0.31 0.29 0.44 0.37 0.21 0.26 0.4 0.37 0.675 0.53 0.27 0.31 0.07 0.25 1
0.11 0.18 0.39 0.12 0.17 0.64 0.19 0.04 0.29 0.21 0.22 0.36 -0.06 0.59 0.31 1
0.23 0.36 0.39 0.41 0.38 0.26 0.36 0.32 0.48 0.4 0.24 0.39 0.17 0.21 0.44 0.25 1
0.02 0.14 0.09 0.07 0.09 0.14 0.29 0.2 0.28 0.32 0.45 0.3 0.17 0.09 0.39 0.25 0.29 1
0.31 0.4 0.32 0.3 0.31 0.09 0.17 0.1 0.29 0.32 0.21 0.33 0.32 -0.02 0.31 0.1 0.14 0.29 1
0.72 0.68 0.2 0.75 0.54 0.09 0.34 0.2 0.4 0.35 -0.09 0.24 0.35 0.08 0.24 0.07 0.26 -0.03 0.32 1
0.49 0.44 0.57 0.54 0.49 0.43 0.15 0.24 0.44 0.24 0.11 0.46 0.13 0.34 0.35 0.35 0.42 0.29 0.33 0.41 1
0.66 0.64 0.33 0.73 0.49 0.15 0.38 0.24 0.46 0.45 0 0.34 0.39 0.1 0.35 0.14 0.33 0.12 0.42 0.7 0.48 1
0.53 0.5 0.41 0.63 0.48 0.41 0.35 0.14 0.42 0.3 -0.04 0.4 0.12 0.35 0.3 0.29 0.51 0.1 0.17 0.53 0.65 0.55 1
0.55 0.51 0.39 0.59 0.46 0.27 0.23 0.18 0.33 0.29 0.1 0.34 0.09 0.16 0.36 0.25 0.37 0.14 0.3 0.47 0.53 0.6 0.68 1'
lowerb = '
1
0.71 1
0.48 0.46 1
0.7 0.65 0.44 1
0.68 0.67 0.53 0.76 1
0.11 0.19 0.15 0.25 0.23 1
0.39 0.43 0.23 0.26 0.39 0.1 1
0.36 0.41 0.3 0.33 0.39 0.17 0.39 1
0.45 0.56 0.44 0.43 0.49 0.22 0.58 0.44 1
0.2 0.25 0.06 0.16 0.19 0.12 0.44 0.37 0.51 1
0.11 0.04 0.21 0.13 0.18 0.24 0.09 0.21 0.25 0.27 1
0.15 0.18 0.27 0.11 0.21 0.38 0.16 0.23 0.29 0.15 0.26 1
0.25 0.33 0.18 0.26 0.27 0.18 0.41 0.36 0.42 0.35 0.1 0.13 1
0.16 0.17 0.19 0.27 0.23 0.52 0.14 0.06 0.23 0.01 0.08 0.19 0.06 1
0.27 0.4 0.27 0.31 0.33 0.19 0.51 0.39 0.66 0.53 0.12 0.19 0.5 0.22 1
0.24 0.2 0.25 0.1 0.28 0.29 0.25 0.23 0.18 0.08 0.12 0.26 0.08 0.4 0.1 1
0.44 0.48 0.35 0.43 0.44 0.33 0.45 0.26 0.59 0.38 0.22 0.26 0.34 0.24 0.41 0.32 1
0.26 0.26 0.18 0.22 0.28 0.31 0.33 0.35 0.44 0.48 0.3 0.22 0.27 0.18 0.48 0.3 0.41 1
0.19 0.18 0.2 0.14 0.26 0.12 0.38 0.44 0.45 0.35 0.39 0.28 0.21 0.09 0.28 0.11 0.35 0.38 1
0.66 0.53 0.25 0.69 0.55 0.18 0.35 0.34 0.38 0.27 0.09 0.13 0.29 0.09 0.26 0.09 0.35 0.24 0.12 1
0.44 0.45 0.59 0.47 0.57 0.14 0.27 0.26 0.51 0.06 0.07 0.28 0.11 0.18 0.3 0.23 0.41 0.12 0.21 0.29 1
0.73 0.64 0.43 0.69 0.67 0.24 0.4 0.58 0.55 0.28 0.17 0.2 0.37 0.26 0.39 0.18 0.47 0.3 0.31 0.69 0.48 1
0.55 0.43 0.35 0.6 0.48 0.3 0.17 0.1 0.34 0.11 0.12 0.16 0.22 0.23 0.12 0.12 0.52 0.24 0.1 0.59 0.39 0.52 1
0.59 0.56 0.41 0.67 0.61 0.15 0.27 0.27 0.39 0.21 0.13 0.25 0.33 0.24 0.23 0.13 0.46 0.21 0.19 0.67 0.45 0.63 0.73 1'
# Build the full correlation matrices from the lower triangles
NJW.cor = getCov(lowerw, names = Names)
NJB.cor = getCov(lowerb, names = Names)
# Published subtest standard deviations
NJWSDs <- c(2.11, 2.91, 2.36, 2.4, 2.82, 2.9, 2.53, 2.12, 2.71, 3.02, 2.77, 2.34, 2.72, 2.52, 2.69, 2.12, 2.76, 2.51, 2.04, 12.55, 12.17, 11.08, 12.69, 9.02)
NJBSDs <- c(2.47, 2.66, 2.4, 2.49, 2.41, 2.72, 2.38, 2.22, 3.16, 2.98, 2.39, 2.24, 2.95, 2.47, 2.32, 1.67, 2.32, 2.43, 2.21, 12.44, 9.4, 11.42, 11.05, 8.45)
# Rescale the correlations to covariances
NJW.cov = lavaan::cor2cov(R = NJW.cor, sds = NJWSDs)
NJB.cov = lavaan::cor2cov(R = NJB.cor, sds = NJBSDs)
# Published subtest means
Wmeans = c(9.83, 10.73, 9.53, 10.35, 10.29, 8.93, 10.09, 11.38, 10.13, 9.98, 10.37, 9.09, 10.07, 9.84, 10.56, 9.19, 9.53, 9.53, 10.07, 99.63, 99.27, 99.8, 100.8, 96.66)
Bmeans = c(8.58, 8.87, 8.51, 9.05, 9.05, 8.52, 9.08, 10.55, 7.78, 8, 10.2, 8.1, 9.4, 9.79, 9.03, 8.87, 9.03, 8.42, 9.55, 96.06, 90.53, 91.95, 95.5, 92.74)
# Group-wise summary statistics for the multi-group models (n = 86 per group)
NJCovs <- list(NJW.cov, NJB.cov)
NJMeans <- list(Wmeans, Bmeans)
NJNs <- list(86, 86)
# Fit measures to report for each model
FITM <- c("chisq", "df", "npar", "cfi", "rmsea", "rmsea.ci.lower", "rmsea.ci.upper", "aic", "bic")
People often claim that “generally lower-scoring group X is catching up to generally higher-scoring group Y” on a measure of some psychological construct. These claims are usually based on observed rather than latent scores. I contend that this leads to improper inferences about the state of gaps, because observed scores are not derived entirely from the constructs that actually interest researchers. For example, suppose two groups differed in their mean levels of neuroticism and this explained a disparity in the depression symptoms those groups exhibited. It would do no good to teach the more neurotic group which answers to bubble in so that their observed scores would suggest lower neuroticism: doing so changes only the observed score and leaves the trait unaltered, rendering the groups psychometrically incomparable in terms of their responses (barring an effect of the observed score itself on the trait). This is like equating groups on height by asking the taller group to sit down: it does not address real differences, only ones that exist in that situation.
To make this concrete for psychological constructs, I show below that the mean levels of general intelligence (g) measured by two different assessments, which show different observed score gaps, are virtually identical despite the differences in observed test outcomes. The dataset comes from Naglieri & Jensen (1987) and was previously found to exhibit strict factorial invariance by Dolan & Hamaker (2001; see also https://rpubs.com/JLLJ/SH, where I obtain a similar result). The samples consist of fourth- and fifth-grade students, equal in size and matched on age, sex, school, and socioeconomic status, which reduces the differences from their typical level (about 1 Hedges’ g) by a modest amount (as reported by Jensen, 1998, matching is expected to reduce them by \(\frac{1}{3}\); see also Kane & Oakland, 2010, p. 328). Notably, it would be irresponsible to treat this reduction as evidence of a causal effect of the matching variables, since they reflect both independent effects and omitted sources of variance; in fact, the gap tends to increase at higher levels of, e.g., socioeconomic status, to remain unchanged with age, and to be consistent across sexes barring a focus on strongly sex-differentiated group factors like spatial and mechanical abilities and skills.
Before showing that the groups have the same gaps in a latent construct measured by different tests, it should be confirmed that the tests measure the constructs identically. I have performed EFAs of the WISC-R and the K-ABC separately elsewhere (to be included in the supplement of a paper forthcoming as of May 13, 2020), and the following confirmatory models are based on them. Because these tests are measurement invariant across the groups, I use the data from the White group for this part of the analysis.
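For anyone who wants to reproduce that exploratory step, here is a minimal sketch, not the original EFA: the factor counts (three for the WISC-R, four for the K-ABC) are taken from the confirmatory models below, and the promax rotation is my assumption.
#EFA sketch of each battery separately in the White group
wisc <- c("I", "S", "A", "V", "C", "DS", "PC", "PA", "BD", "OA", "CO")
kabc <- setdiff(Names, wisc)
factanal(covmat = NJW.cov[wisc, wisc], factors = 3, n.obs = 86, rotation = "promax")
factanal(covmat = NJW.cov[kabc, kabc], factors = 4, n.obs = 86, rotation = "promax")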
#WISC-R Model
HOFWHISKER.model <- '
VIQ =~ I + S + V + C
PIQ =~ PC + PA + BD + OA + CO
FD =~ A + DS
gWI =~ VIQ + PIQ + FD'
BFWHISKER.model <- '
VIQ =~ I + S + V + C
PIQ =~ PC + PA + BD + OA + CO
A ~~ DS
gWI =~ I + S + V + C + PC + PA + BD + OA + CO + A + DS
'
HOFWHISKER.fit <- cfa(HOFWHISKER.model, sample.cov = NJW.cov, sample.nobs = 86, std.lv = T, orthogonal = T)
BFWHISKER.fit <- cfa(BFWHISKER.model, sample.cov = NJW.cov, sample.nobs = 86, std.lv = T, orthogonal = T)
round(cbind(HOF = fitMeasures(HOFWHISKER.fit, FITM),
BF = fitMeasures(BFWHISKER.fit, FITM)),3)
## HOF BF
## chisq 45.557 37.018
## df 41.000 34.000
## npar 25.000 32.000
## cfi 0.985 0.990
## rmsea 0.036 0.032
## rmsea.ci.lower 0.000 0.000
## rmsea.ci.upper 0.085 0.087
## aic 4203.526 4208.986
## bic 4264.885 4287.526
#K-ABC Model
HOFKABC.model <- '
Gfle =~ MA + AR + RD + RU
Gv =~ GC + PS + FP + R
Gsm =~ 1*NR + 1*WO
Gmot =~ HM + T + MA + MS
gKA =~ Gfle + Gv + Gsm + Gmot' #NR and WO loadings fixed equal to identify the two-indicator Gsm factor; this doesn't meaningfully affect fit, which is similar to the model fitted by Dolan & Hamaker anyway.
BFKABC.model <- '
Gfle =~ MA + AR + RD + RU
Gv =~ GC + PS + FP + R
NR ~~ WO
Gmot =~ HM + T + MA + MS
gKA =~ MA + AR + RD + RU + GC + PS + FP + R + NR + WO + HM + T + MS
RD ~~ 0*RD' #residual variance fixed to zero, required because of RD/RU collinearity; not an issue
HOFKABC.fit <- cfa(HOFKABC.model, sample.cov = NJW.cov, sample.nobs = 86, std.lv = T, orthogonal = T)
BFKABC.fit <- cfa(BFKABC.model, sample.cov = NJW.cov, sample.nobs = 86, std.lv = T, orthogonal = T)
round(cbind(HOF = fitMeasures(HOFKABC.fit, FITM),
BF = fitMeasures(BFKABC.fit, FITM)),3)
## HOF BF
## chisq 97.334 69.020
## df 62.000 53.000
## npar 29.000 38.000
## cfi 0.907 0.958
## rmsea 0.081 0.059
## rmsea.ci.lower 0.048 0.000
## rmsea.ci.upper 0.111 0.096
## aic 6182.016 6171.701
## bic 6253.192 6264.966
In both cases, the higher-order model is the better description of the data by BIC. The principal argument for using higher-order models to compare factors is that they allow stronger control of the sampling error arising from the selection of tests in the battery being modeled. For that reason, I assess the relationship between the higher-order g factors; the relationship is virtually identical for the bifactor models (which fit better in absolute terms but are less parsimonious and atheoretical). Anyone who wants to mix and match is free to do so, but must justify it (which is easy in the first case, and of varying difficulty in the latter).
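Because the higher-order model can be treated as a constrained bifactor model, a chi-square difference test is one way to supplement the BIC comparison; a quick check, not part of the original output:
#Likelihood-ratio tests of the higher-order against the bifactor models
anova(HOFWHISKER.fit, BFWHISKER.fit)
anova(HOFKABC.fit, BFKABC.fit)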
The steps in comparing these factors are: first, fit a model with no relationship between the g factors; then a model in which that relationship is freed; then a model in which related group factors are allowed to covary; then a model in which large indicator-level residual covariances are freed; and finally a model in which the g factors are constrained to be identical. The covariances are freed only to assess the “true” extent of the relationship between the tests, i.e., whether it is mediated by g alone, whether it is inflated by related group factors and indicators, and so on; others performing the same analysis have explained this. To save space, the first few models are not written out, but their fits are reported and a sketch of their setup follows this paragraph; the combined model simply joins the battery models specified above, and the freed covariances appear in the parameter table below.
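The following is a minimal sketch of how the unillustrated models can be set up, not the exact specification behind the fits below; in particular, the covariances freed in the GCOVS and RECOVS steps are not reproduced here, but they are the ones listed in the parameter table further down.
#Combined model of both batteries, fitted to the White covariance matrix
HOFJOINT <- '
VIQ =~ I + S + V + C
PIQ =~ PC + PA + BD + OA + CO
FD =~ A + DS
gWI =~ VIQ + PIQ + FD
Gfle =~ MA + AR + RD + RU
Gv =~ GC + PS + FP + R
Gsm =~ 1*NR + 1*WO
Gmot =~ HM + T + MA + MS
gKA =~ Gfle + Gv + Gsm + Gmot'
#NOREL: orthogonal = T leaves every factor covariance, including gWI with gKA, fixed to zero
HOFNO.fit <- cfa(HOFJOINT, sample.cov = NJW.cov, sample.nobs = 86, std.lv = T, orthogonal = T)
#FREEREL: free only the g-g covariance (a correlation, given std.lv = T)
HOFSOME.fit <- cfa(paste(HOFJOINT, 'gWI ~~ gKA', sep = '\n'), sample.cov = NJW.cov, sample.nobs = 86, std.lv = T, orthogonal = T)
#GCOVS and RECOVS additionally free the group-factor and indicator residual covariances shown in the table below; SAME then fixes the g correlation to 1 (gWI ~~ 1*gKA)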
#How related are the g factors?
round(cbind(NOREL = fitMeasures(HOFNO.fit, FITM),
FREEREL = fitMeasures(HOFSOME.fit, FITM),
GCOVS = fitMeasures(HOFGCOV.fit, FITM),
RECOVS = fitMeasures(HOFRECOV.fit, FITM),
SAME = fitMeasures(HOFSAME.fit, FITM)),3)
## NOREL FREEREL GCOVS RECOVS SAME
## chisq 600.813 477.948 388.882 318.493 319.152
## df 246.000 245.000 244.000 232.000 233.000
## npar 54.000 55.000 56.000 68.000 67.000
## cfi 0.647 0.768 0.856 0.914 0.914
## rmsea 0.130 0.105 0.083 0.066 0.066
## rmsea.ci.lower 0.116 0.091 0.067 0.047 0.046
## rmsea.ci.upper 0.143 0.119 0.098 0.083 0.083
## aic 10385.542 10264.677 10177.611 10131.222 10129.881
## bic 10518.076 10399.667 10315.054 10298.118 10294.323
parameterEstimates(HOFRECOV.fit, stand = T) %>%
filter(op == "~~") %>%
select(Variable = lhs, Target = rhs, SE = se, Z = z, 'p-value' = pvalue, "Standardized Variance/Covariance" = std.all) %>%
kable(digits = 3, format = "pandoc")
| Variable | Target | SE | Z | p-value | Standardized Variance/Covariance |
|---|---|---|---|---|---|
| gWI | gKA | 0.020 | 49.966 | 0.000 | 1.018 |
| VIQ | Gv | 0.000 | NA | NA | 1.000 |
| FD | Gsm | 0.156 | 6.419 | 0.000 | 0.999 |
| PIQ | Gmot | 0.000 | NA | NA | 1.000 |
| A | AR | 1.791 | 3.137 | 0.002 | 0.345 |
| CO | RD | 2.067 | -2.427 | 0.015 | -0.330 |
| CO | MS | 0.634 | 2.944 | 0.003 | 0.323 |
| PC | GC | 0.575 | 2.600 | 0.009 | 0.284 |
| DS | GC | 0.521 | 2.935 | 0.003 | 0.535 |
| V | PS | 0.244 | -2.402 | 0.016 | -0.289 |
| A | PS | 0.374 | 2.959 | 0.003 | 0.303 |
| BD | HM | 0.439 | -2.532 | 0.011 | -0.334 |
| I | MA | 0.301 | -2.221 | 0.026 | -0.254 |
| C | MA | 0.489 | 2.022 | 0.043 | 0.227 |
| A | MA | 0.396 | 2.170 | 0.030 | 0.206 |
| A | MS | 0.419 | -2.520 | 0.012 | -0.243 |
| I | I | 0.266 | 5.872 | 0.000 | 0.362 |
| S | S | 0.558 | 5.980 | 0.000 | 0.399 |
| V | V | 0.246 | 4.908 | 0.000 | 0.212 |
| C | C | 0.688 | 6.226 | 0.000 | 0.534 |
| PC | PC | 0.729 | 6.282 | 0.000 | 0.728 |
| PA | PA | 0.568 | 6.394 | 0.000 | 0.817 |
| BD | BD | 0.533 | 4.715 | 0.000 | 0.340 |
| OA | OA | 0.863 | 6.003 | 0.000 | 0.575 |
| CO | CO | 1.054 | 6.554 | 0.000 | 0.943 |
| A | A | 0.613 | 6.372 | 0.000 | 0.743 |
| DS | DS | 1.022 | 1.320 | 0.187 | 0.165 |
| MA | MA | 0.712 | 6.259 | 0.000 | 0.609 |
| AR | AR | 11.804 | 5.762 | 0.000 | 0.501 |
| RD | RD | 9.759 | 3.430 | 0.001 | 0.210 |
| RU | RU | 6.373 | 5.573 | 0.000 | 0.442 |
| GC | GC | 0.934 | 6.491 | 0.000 | 0.790 |
| PS | PS | 0.526 | 6.467 | 0.000 | 0.856 |
| FP | FP | 8.383 | 5.170 | 0.000 | 0.280 |
| R | R | 7.264 | 5.513 | 0.000 | 0.331 |
| NR | NR | 0.591 | 5.949 | 0.000 | 0.663 |
| WO | WO | 0.424 | 5.402 | 0.000 | 0.562 |
| HM | HM | 0.710 | 6.220 | 0.000 | 0.817 |
| T | T | 0.568 | 4.173 | 0.000 | 0.331 |
| MS | MS | 0.770 | 6.287 | 0.000 | 0.753 |
| VIQ | VIQ | 0.000 | NA | NA | 0.361 |
| PIQ | PIQ | 0.000 | NA | NA | 0.462 |
| FD | FD | 0.000 | NA | NA | 0.675 |
| gWI | gWI | 0.000 | NA | NA | 1.000 |
| Gfle | Gfle | 0.000 | NA | NA | 0.119 |
| Gv | Gv | 0.000 | NA | NA | 0.483 |
| Gsm | Gsm | 0.000 | NA | NA | 0.560 |
| Gmot | Gmot | 0.000 | NA | NA | 0.641 |
| gKA | gKA | 0.000 | NA | NA | 1.000 |
This model fits similarly to Dolan & Hamaker’s (the small sample can explain its poor initial fit relative to analyses of other tests); a bifactor model in which the batteries are modeled as one, with the confirmatory structure derived from the same EFA, fits much better and shows congruent g loadings, providing further evidence that the tests measure the same g factor. Regardless, the case that the g factors measured by the WISC-R and the K-ABC are identical and interchangeable is tenable, so it is reasonable to expect them to yield approximately the same-sized gap (minor differences could be accounted for in a combined model of the batteries).
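As a sketch of what that combined bifactor model might look like, the specification below simply joins the two battery bifactor models given earlier; it is an assumption for illustration, not the exact model derived from the combined EFA.
#Hypothetical combined bifactor model of both batteries
BFJOINT.model <- '
gBI =~ I + S + V + C + PC + PA + BD + OA + CO + A + DS + MA + AR + RD + RU + GC + PS + FP + R + NR + WO + HM + T + MS
VIQ =~ I + S + V + C
PIQ =~ PC + PA + BD + OA + CO
A ~~ DS
Gfle =~ MA + AR + RD + RU
Gv =~ GC + PS + FP + R
NR ~~ WO
Gmot =~ HM + T + MA + MS
RD ~~ 0*RD'
BFJOINT.fit <- cfa(BFJOINT.model, sample.cov = NJW.cov, sample.nobs = 86, std.lv = T, orthogonal = T)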
As already mentioned, MGCFA of these batteries has already been conducted. Modifications to improve initial fit are possible, and, within reason, they will not alter the substantive result (unless the bifactor model is used, since it is contaminated by the subtest sampling variance that makes the observed total scores differ in the first place). For this analysis, all I assess is whether the g factors from the simplest between-group models of the separate WISC-R and K-ABC batteries produce the same mean difference despite dissimilar observed differences. Below are the intercepts from models of the separate WISC-R and K-ABC higher-order structures in which loadings, intercepts, residuals, and latent variances have been constrained to equality across groups.
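The exact calls behind these fits are not shown in the original; the following is a minimal sketch in which group.equal implements the constraints just described (lavaan frees the latent means in the second group automatically once the intercepts are constrained).
#Strict factorial invariance models with equal latent variances
WHISKERDIFF.fit <- cfa(HOFWHISKER.model, sample.cov = NJCovs, sample.mean = NJMeans, sample.nobs = NJNs, std.lv = T, orthogonal = T, group.equal = c("loadings", "intercepts", "residuals", "lv.variances"))
KABCDIFF.fit <- cfa(HOFKABC.model, sample.cov = NJCovs, sample.mean = NJMeans, sample.nobs = NJNs, std.lv = T, orthogonal = T, group.equal = c("loadings", "intercepts", "residuals", "lv.variances"))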
parameterEstimates(WHISKERDIFF.fit, stand = T) %>%
filter(op == "~1") %>%
select(Indicator = lhs, "Unstandardized Intercept" = est, SE = se, Z = z, 'p-value' = pvalue, "Standardized Intercept" = std.all) %>%
kable(digits = 3, format = "pandoc")
| Indicator | Unstandardized Intercept | SE | Z | p-value | Standardized Intercept |
|---|---|---|---|---|---|
| I | 9.839 | 0.229 | 43.015 | 0.000 | 4.369 |
| S | 10.565 | 0.280 | 37.670 | 0.000 | 3.790 |
| V | 10.408 | 0.247 | 42.152 | 0.000 | 4.372 |
| C | 10.345 | 0.254 | 40.677 | 0.000 | 4.033 |
| PC | 10.218 | 0.220 | 46.402 | 0.000 | 4.355 |
| PA | 11.447 | 0.191 | 59.822 | 0.000 | 5.447 |
| BD | 10.096 | 0.292 | 34.587 | 0.000 | 3.657 |
| OA | 9.852 | 0.280 | 35.172 | 0.000 | 3.369 |
| CO | 10.539 | 0.217 | 48.670 | 0.000 | 4.131 |
| A | 9.502 | 0.264 | 35.942 | 0.000 | 3.831 |
| DS | 9.045 | 0.256 | 35.303 | 0.000 | 3.201 |
| VIQ | 0.000 | 0.000 | NA | NA | 0.000 |
| PIQ | 0.000 | 0.000 | NA | NA | 0.000 |
| FD | 0.000 | 0.000 | NA | NA | 0.000 |
| gWI | 0.000 | 0.000 | NA | NA | 0.000 |
| I | 9.839 | 0.229 | 43.015 | 0.000 | 4.259 |
| S | 10.565 | 0.280 | 37.670 | 0.000 | 3.699 |
| V | 10.408 | 0.247 | 42.152 | 0.000 | 4.250 |
| C | 10.345 | 0.254 | 40.677 | 0.000 | 3.944 |
| PC | 10.218 | 0.220 | 46.402 | 0.000 | 4.123 |
| PA | 11.447 | 0.191 | 59.822 | 0.000 | 5.233 |
| BD | 10.096 | 0.292 | 34.587 | 0.000 | 3.243 |
| OA | 9.852 | 0.280 | 35.172 | 0.000 | 3.158 |
| CO | 10.539 | 0.217 | 48.670 | 0.000 | 4.099 |
| A | 9.502 | 0.264 | 35.942 | 0.000 | 4.191 |
| DS | 9.045 | 0.256 | 35.303 | 0.000 | 3.295 |
| VIQ | -0.125 | 0.159 | -0.787 | 0.431 | -0.069 |
| PIQ | -0.629 | 0.198 | -3.172 | 0.002 | -0.347 |
| FD | 0.006 | 0.196 | 0.032 | 0.975 | 0.005 |
| gWI | -0.759 | 0.165 | -4.590 | 0.000 | -0.680 |
parameterEstimates(KABCDIFF.fit, stand = T) %>%
filter(op == "~1") %>%
select(Indicator = lhs, "Unstandardized Intercept" = est, SE = se, Z = z, 'p-value' = pvalue, "Standardized Intercept" = std.all) %>%
kable(digits = 3, format = "pandoc")
| Indicator | Unstandardized Intercept | SE | Z | p-value | Standardized Intercept |
|---|---|---|---|---|---|
| MA | 9.877 | 0.243 | 40.667 | 0.000 | 3.904 |
| AR | 97.336 | 1.135 | 85.743 | 0.000 | 8.336 |
| RD | 101.174 | 1.259 | 80.381 | 0.000 | 8.330 |
| RU | 96.890 | 0.920 | 105.359 | 0.000 | 10.857 |
| GC | 10.128 | 0.247 | 40.952 | 0.000 | 3.561 |
| PS | 10.090 | 0.184 | 54.789 | 0.000 | 4.731 |
| FP | 100.909 | 1.249 | 80.819 | 0.000 | 8.007 |
| R | 99.481 | 1.279 | 77.758 | 0.000 | 8.338 |
| NR | 9.932 | 0.220 | 45.235 | 0.000 | 4.241 |
| WO | 9.147 | 0.190 | 48.116 | 0.000 | 4.954 |
| HM | 9.016 | 0.217 | 41.529 | 0.000 | 3.872 |
| T | 10.458 | 0.267 | 39.127 | 0.000 | 4.010 |
| MS | 9.500 | 0.243 | 39.098 | 0.000 | 3.777 |
| Gfle | 0.000 | 0.000 | NA | NA | 0.000 |
| Gv | 0.000 | 0.000 | NA | NA | 0.000 |
| Gsm | 0.000 | 0.000 | NA | NA | 0.000 |
| Gmot | 0.000 | 0.000 | NA | NA | 0.000 |
| gKA | 0.000 | 0.000 | NA | NA | 0.000 |
| MA | 9.877 | 0.243 | 40.667 | 0.000 | 4.079 |
| AR | 97.336 | 1.135 | 85.743 | 0.000 | 8.779 |
| RD | 101.174 | 1.259 | 80.381 | 0.000 | 8.982 |
| RU | 96.890 | 0.920 | 105.359 | 0.000 | 11.681 |
| GC | 10.128 | 0.247 | 40.952 | 0.000 | 3.631 |
| PS | 10.090 | 0.184 | 54.789 | 0.000 | 4.815 |
| FP | 100.909 | 1.249 | 80.819 | 0.000 | 8.528 |
| R | 99.481 | 1.279 | 77.758 | 0.000 | 9.224 |
| NR | 9.932 | 0.220 | 45.235 | 0.000 | 4.265 |
| WO | 9.147 | 0.190 | 48.116 | 0.000 | 5.000 |
| HM | 9.016 | 0.217 | 41.529 | 0.000 | 4.002 |
| T | 10.458 | 0.267 | 39.127 | 0.000 | 4.291 |
| MS | 9.500 | 0.243 | 39.098 | 0.000 | 3.949 |
| Gfle | -0.128 | 0.139 | -0.925 | 0.355 | -0.058 |
| Gv | -0.289 | 0.168 | -1.716 | 0.086 | -0.195 |
| Gsm | 0.202 | 0.235 | 0.862 | 0.389 | 0.166 |
| Gmot | -0.425 | 0.205 | -2.077 | 0.038 | -0.378 |
| gKA | -0.594 | 0.153 | -3.890 | 0.000 | -0.650 |
As printed above, the higher-order g differences on the WISC-R and the K-ABC amounted to 0.68 and 0.65 g, respectively, for this sample. As Naglieri & Jensen reported, the FSIQ differences (based on raw scores) for the same batteries were 0.73 g (0.77 based on the standardized WISC-R scores) and 0.56 g (there was no standardized-score basis for the K-ABC, but one presumably would not change the result much). Consistent with expectations, the differences were around \(\frac{1}{3}\) smaller than usual (though note that Jensen, 1998 based part of this prediction, made for other data, on these data). This result adds evidence to the claim, which bears constant repeating, that group differences in observed scores should not be interpreted without an analysis of their psychometric properties; if such an analysis cannot be done for some reason, groups should not be compared without noting the limitation. Finally, there are instances where observed differences on assessments derive from bias, as in comparisons of men and women on an automotive knowledge section of a test otherwise intended to measure spatial abilities, and there are situations where bias reduces group differences, as in the analysis by Cockcroft et al. (2015; see https://rpubs.com/JLLJ/frontierssaukmgcfa). It is always advisable to try to understand and account for these scenarios with models rather than speculating from observed scores.
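For reference, the observed subtest gaps implied by the published summary statistics can be computed directly; the hedges_g helper below is mine, added for illustration only.
#Observed-score gaps (Hedges' g with pooled SD and small-sample correction)
hedges_g <- function(m1, m2, s1, s2, n1 = 86, n2 = 86) {
  sp <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))
  ((m1 - m2) / sp) * (1 - 3 / (4 * (n1 + n2) - 9))
}
round(setNames(mapply(hedges_g, Wmeans, Bmeans, NJWSDs, NJBSDs), Names), 2)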
Stop interpreting observed gaps without an analysis of latent ones, or at least some theory explaining why the observed gaps are meaningful. Similarly, avoid exploratory methods for understanding latent gaps: if, say, a group lagging in the general factor gains on a multidimensional test because of group factors, exploratory methods can show an apparent gain in the general factor that is illusory. Psychometricians discussing group differences have a duty to account for such possibilities with psychometrically appropriate methods like multi-group confirmatory factor analysis. This analysis also demonstrates that even when a construct “explains” differences on one test, it need not explain differences in the same proportions on another test that measures it identically; hence, for a culture-free interpretation of hypotheses like Spearman’s via specific test modalities like elementary cognitive tasks, the hypothesis must be tested for those specific tests, not inferred from the fact that both they and a test confirming it are g-saturated.
Naglieri, J. A., & Jensen, A. R. (1987). Comparison of black-white differences on the WISC-R and the K-ABC: Spearman’s hypothesis. Intelligence, 11(1), 21-43. https://doi.org/10.1016/0160-2896(87)90024-9
Dolan, C. V., & Hamaker, E. L. (2001). Investigating Black-White differences in psychometric IQ: Multi-group confirmatory factor analyses of the WISC-R and K-ABC and a critique of the method of correlated vectors. In Advances in psychology research, Vol. 6. (pp. 31-59). Nova Science Publishers.
Jensen, A. R. (1998). The g Factor: The Science of Mental Ability. Praeger Publishers/Greenwood Publishing Group.
Kane, H. D., & Oakland, T. D. (2010). Group Differences in Cognitive Ability: A CHC Theory Framework. Mankind Quarterly, 50(4). http://mankindquarterly.org/archive/issue/50-4/4
Cockcroft, K., Alloway, T., Copello, E., & Milligan, R. (2015). A cross-cultural comparison between South African and British students on the Wechsler Adult Intelligence Scales Third Edition (WAIS-III). Frontiers in Psychology, 6. https://doi.org/10.3389/fpsyg.2015.00297
Some of the advice offered here should be set aside in other lines of research: for instance, many types of research into adverse impact, which is defined solely in terms of observed scores, or the assessment of basic skills like algebra, where, even if groups are truly matched on mathematical ability, it is still useful to know that one lags because its members have not been taught a skill the test requires. This advice is thus most relevant to the study of group differences and test bias themselves.