Init
options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, caret, parallel, doParallel)
#start parallel
doParallel::registerDoParallel(7)
Data
d = read_sav("data/ncds1958/UKDA-8288-spss/spss/spss19/bcs1986_reading_matrices.sav")
#variables table
d_table = tibble(
colname = colnames(d),
label = map_chr(d, ~attr(., "label"))
)
#dataset
dim(d)
## [1] 3651 183
#item names
item_names = names(d) %>% str_subset("ANS_")
item_binary_names = names(d) %>% str_subset("SCR.+?\\d$")
#recode
d$sex = plyr::mapvalues(d$SEX, 1:3, c("Male", "Female", NA)) %>% factor()
## The following `from` values were not present in `x`: 3
#recode scores into numeric
for (v in str_subset(names(d), "SCR_")) d[[v]] = d[[v]] %>% as.numeric()
Traditional psychometrics
#sumscore
d$sumscore = d$SCRTOTAL %>% as.numeric()
#IRT
irt_fit = irt.fa(d %>% select(!!item_binary_names))
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was
## done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs
## = np.obs, : The estimated weights for the factor scores are probably
## incorrect. Try a different factor extraction method.

irt_fit
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = d %>% select(!!item_binary_names))
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## SCR_A1 0.25 0.33 0.23 0.10 0.04 0.01 0.00
## SCR_A2 0.22 0.34 0.28 0.13 0.05 0.02 0.00
## SCR_A3 0.36 0.43 0.23 0.08 0.02 0.01 0.00
## SCR_A4 0.10 0.20 0.27 0.21 0.11 0.05 0.02
## SCR_A5 0.17 0.35 0.36 0.19 0.07 0.02 0.01
## SCR_A6 0.20 0.34 0.30 0.15 0.05 0.02 0.01
## SCR_A7 0.19 0.37 0.36 0.18 0.06 0.02 0.01
## SCR_A8 0.27 0.36 0.24 0.10 0.03 0.01 0.00
## SCR_A9 0.12 0.25 0.33 0.23 0.10 0.04 0.01
## SCR_A10 0.17 0.31 0.30 0.17 0.07 0.02 0.01
## SCR_B1 0.22 0.37 0.31 0.14 0.05 0.02 0.00
## SCR_B2 0.36 1.35 0.53 0.06 0.01 0.00 0.00
## SCR_B3 0.10 0.27 0.40 0.28 0.11 0.04 0.01
## SCR_B4 0.22 0.70 0.63 0.17 0.03 0.01 0.00
## SCR_B5 0.34 0.88 0.48 0.10 0.02 0.00 0.00
## SCR_B6 0.22 0.62 0.54 0.17 0.04 0.01 0.00
## SCR_B7 0.39 1.08 0.47 0.07 0.01 0.00 0.00
## SCR_B8 0.20 0.59 0.58 0.19 0.04 0.01 0.00
## SCR_B9 0.22 0.70 0.60 0.16 0.03 0.01 0.00
## SCR_B10 0.08 0.30 0.59 0.39 0.12 0.03 0.01
## SCR_B11 0.07 0.23 0.45 0.38 0.16 0.05 0.01
## SCR_B12 0.15 0.51 0.66 0.25 0.06 0.01 0.00
## SCR_B13 0.04 0.15 0.36 0.45 0.24 0.08 0.02
## SCR_B14 0.04 0.16 0.48 0.56 0.23 0.06 0.01
## SCR_B15 0.05 0.20 0.56 0.53 0.19 0.04 0.01
## SCR_B16 0.07 0.22 0.46 0.40 0.16 0.05 0.01
## SCR_B17 0.05 0.19 0.44 0.45 0.20 0.06 0.01
## SCR_B18 0.06 0.27 0.68 0.50 0.14 0.03 0.01
## SCR_B19 0.05 0.17 0.41 0.46 0.22 0.06 0.02
## SCR_B20 0.08 0.21 0.33 0.29 0.14 0.05 0.02
## SCR_C1 0.04 0.12 0.27 0.35 0.23 0.10 0.03
## SCR_C2 0.28 0.35 0.23 0.09 0.03 0.01 0.00
## SCR_C3 0.10 0.16 0.18 0.14 0.08 0.04 0.02
## SCR_C4 0.08 0.12 0.13 0.11 0.08 0.05 0.03
## SCR_C5 0.20 0.29 0.25 0.13 0.05 0.02 0.01
## SCR_C6 0.05 0.07 0.09 0.10 0.09 0.07 0.05
## SCR_C7 0.08 0.13 0.16 0.14 0.10 0.06 0.03
## SCR_C8 0.09 0.17 0.23 0.21 0.12 0.06 0.02
## SCR_C9 0.10 0.13 0.14 0.11 0.07 0.04 0.02
## SCR_C10 0.15 0.25 0.26 0.16 0.07 0.03 0.01
## SCR_C11 0.10 0.15 0.16 0.13 0.08 0.04 0.02
## SCR_C12 0.07 0.12 0.17 0.17 0.12 0.07 0.03
## SCR_C13 0.07 0.12 0.15 0.15 0.11 0.06 0.03
## SCR_C14 0.10 0.15 0.17 0.14 0.08 0.04 0.02
## SCR_C15 0.14 0.27 0.30 0.20 0.08 0.03 0.01
## SCR_D1 0.07 0.14 0.22 0.23 0.15 0.07 0.03
## SCR_D2 0.06 0.16 0.30 0.32 0.18 0.07 0.02
## SCR_D3 0.07 0.12 0.17 0.17 0.13 0.07 0.04
## SCR_D4 0.07 0.19 0.36 0.35 0.18 0.06 0.02
## SCR_D6 0.06 0.14 0.26 0.29 0.19 0.08 0.03
## SCR_D7 0.04 0.09 0.15 0.19 0.17 0.11 0.05
## SCR_D8 0.07 0.18 0.33 0.31 0.17 0.06 0.02
## SCR_D10 0.06 0.17 0.33 0.34 0.18 0.07 0.02
## SCR_D11 0.14 0.35 0.45 0.25 0.08 0.02 0.01
## SCR_D12 0.07 0.15 0.23 0.23 0.15 0.07 0.03
## SCR_D13 0.05 0.18 0.42 0.45 0.20 0.06 0.02
## SCR_D14 0.07 0.14 0.20 0.19 0.13 0.07 0.03
## SCR_D15 0.07 0.14 0.21 0.21 0.14 0.07 0.03
## SCR_D16 0.04 0.09 0.16 0.20 0.17 0.11 0.05
## SCR_D17 0.04 0.07 0.11 0.13 0.12 0.09 0.05
## SCR_E1 0.16 0.29 0.30 0.18 0.07 0.03 0.01
## SCR_E2 0.08 0.31 0.65 0.42 0.12 0.03 0.01
## SCR_E3 0.09 0.15 0.19 0.17 0.11 0.05 0.03
## SCR_E5 0.07 0.13 0.19 0.18 0.13 0.07 0.03
## SCR_E6 0.20 0.34 0.30 0.15 0.05 0.02 0.01
## SCR_E7 0.16 0.35 0.39 0.21 0.07 0.02 0.01
## SCR_E8 0.14 0.33 0.41 0.24 0.08 0.03 0.01
## SCR_E9 0.03 0.09 0.24 0.39 0.31 0.13 0.04
## SCR_E10 0.14 0.40 0.53 0.26 0.08 0.02 0.00
## SCR_E11 0.08 0.22 0.41 0.35 0.15 0.05 0.01
## SCR_E12 0.13 0.24 0.29 0.20 0.09 0.03 0.01
## SCR_E13 0.08 0.16 0.24 0.23 0.14 0.07 0.03
## SCR_M1 0.07 0.07 0.07 0.05 0.04 0.03 0.02
## SCR_M2 0.18 0.20 0.15 0.09 0.04 0.02 0.01
## SCR_M3 0.25 0.31 0.22 0.10 0.04 0.01 0.00
## SCR_M4 0.21 0.22 0.15 0.08 0.03 0.01 0.01
## SCR_M5 0.14 0.22 0.22 0.14 0.07 0.03 0.01
## SCR_M6 0.18 0.34 0.33 0.18 0.06 0.02 0.01
## SCR_M7 0.11 0.15 0.16 0.12 0.07 0.04 0.02
## SCR_M8 0.09 0.14 0.17 0.14 0.09 0.05 0.02
## SCR_M9 0.07 0.12 0.17 0.16 0.12 0.07 0.03
## SCR_M10 0.04 0.06 0.08 0.09 0.08 0.07 0.04
## Test Info 10.37 22.15 25.48 17.88 8.57 3.47 1.36
## SEM 0.31 0.21 0.20 0.24 0.34 0.54 0.86
## Reliability 0.90 0.95 0.96 0.94 0.88 0.71 0.27
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 3569 and the objective function was 87.5
## The number of observations was 3651 with Chi Square = 316567 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.06
## The df corrected root mean square of the residuals is 0.06
##
## Tucker Lewis Index of factoring reliability = 0.233
## RMSEA index = 0.156 and the 10 % confidence intervals are 0.155 NA
## BIC = 287291
irt_fit$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## SCR_A1 0.56 0.31279 0.69 1
## SCR_A2 0.57 0.32178 0.68 1
## SCR_A3 0.62 0.38231 0.62 1
## SCR_A4 0.52 0.26827 0.73 1
## SCR_A5 0.59 0.34834 0.65 1
## SCR_A6 0.57 0.32512 0.67 1
## SCR_A7 0.60 0.35843 0.64 1
## SCR_A8 0.58 0.33103 0.67 1
## SCR_A9 0.56 0.31577 0.68 1
## SCR_A10 0.56 0.31358 0.69 1
## SCR_B1 0.59 0.34329 0.66 1
## SCR_B2 0.81 0.65333 0.35 1
## SCR_B3 0.60 0.35428 0.65 1
## SCR_B4 0.73 0.52822 0.47 1
## SCR_B5 0.74 0.55101 0.45 1
## SCR_B6 0.70 0.48890 0.51 1
## SCR_B7 0.77 0.59837 0.40 1
## SCR_B8 0.70 0.48763 0.51 1
## SCR_B9 0.72 0.52332 0.48 1
## SCR_B10 0.67 0.44994 0.55 1
## SCR_B11 0.63 0.39324 0.61 1
## SCR_B12 0.70 0.49130 0.51 1
## SCR_B13 0.63 0.39149 0.61 1
## SCR_B14 0.68 0.45699 0.54 1
## SCR_B15 0.68 0.46869 0.53 1
## SCR_B16 0.63 0.40237 0.60 1
## SCR_B17 0.64 0.41191 0.59 1
## SCR_B18 0.71 0.49801 0.50 1
## SCR_B19 0.64 0.40667 0.59 1
## SCR_B20 0.57 0.31961 0.68 1
## SCR_C1 0.57 0.32961 0.67 1
## SCR_C2 0.57 0.32990 0.67 1
## SCR_C3 0.44 0.19690 0.80 1
## SCR_C4 0.39 0.15231 0.85 1
## SCR_C5 0.54 0.29084 0.71 1
## SCR_C6 0.36 0.12661 0.87 1
## SCR_C7 0.42 0.17956 0.82 1
## SCR_C8 0.50 0.24534 0.75 1
## SCR_C9 0.41 0.16410 0.84 1
## SCR_C10 0.53 0.27608 0.72 1
## SCR_C11 0.43 0.18277 0.82 1
## SCR_C12 0.44 0.19794 0.80 1
## SCR_C13 0.42 0.17622 0.82 1
## SCR_C14 0.43 0.18909 0.81 1
## SCR_C15 0.55 0.30111 0.70 1
## SCR_D1 0.50 0.24697 0.75 1
## SCR_D2 0.56 0.31736 0.68 1
## SCR_D3 0.44 0.19668 0.80 1
## SCR_D4 0.59 0.35063 0.65 1
## SCR_D5 0.23 0.05339 0.95 1
## SCR_D6 0.54 0.29462 0.71 1
## SCR_D7 0.46 0.20921 0.79 1
## SCR_D8 0.57 0.32486 0.68 1
## SCR_D9 0.02 0.00062 1.00 1
## SCR_D10 0.58 0.33668 0.66 1
## SCR_D11 0.62 0.38616 0.61 1
## SCR_D12 0.50 0.25096 0.75 1
## SCR_D13 0.63 0.40254 0.60 1
## SCR_D14 0.47 0.21981 0.78 1
## SCR_D15 0.48 0.23340 0.77 1
## SCR_D16 0.47 0.21676 0.78 1
## SCR_D17 0.40 0.15782 0.84 1
## SCR_E1 0.55 0.30767 0.69 1
## SCR_E2 0.69 0.47451 0.53 1
## SCR_E3 0.46 0.20908 0.79 1
## SCR_E4 0.21 0.04299 0.96 1
## SCR_E5 0.46 0.21228 0.79 1
## SCR_E6 0.57 0.32594 0.67 1
## SCR_E7 0.60 0.36361 0.64 1
## SCR_E8 0.60 0.36551 0.63 1
## SCR_E9 0.60 0.35534 0.64 1
## SCR_E10 0.66 0.43074 0.57 1
## SCR_E11 0.61 0.37101 0.63 1
## SCR_E12 0.54 0.28671 0.71 1
## SCR_E13 0.50 0.25464 0.75 1
## SCR_M1 0.30 0.08969 0.91 1
## SCR_M2 0.47 0.22050 0.78 1
## SCR_M3 0.55 0.30279 0.70 1
## SCR_M4 0.49 0.23611 0.76 1
## SCR_M5 0.49 0.24118 0.76 1
## SCR_M6 0.58 0.33559 0.66 1
## SCR_M7 0.43 0.18108 0.82 1
## SCR_M8 0.43 0.18757 0.81 1
## SCR_M9 0.44 0.19115 0.81 1
## SCR_M10 0.33 0.11097 0.89 1
## SCR_M11 0.21 0.04550 0.95 1
##
## MR1
## SS loadings 26.2
## Proportion Var 0.3
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 3655 and the objective function was 117 with Chi Square of 421819
## The degrees of freedom for the model are 3569 and the objective function was 87.5
##
## The root mean square of the residuals (RMSR) is 0.06
## The df corrected root mean square of the residuals is 0.06
##
## The harmonic number of observations is 3651 with the empirical chi square 101042 with prob < 0
## The total number of observations was 3651 with Likelihood Chi Square = 316567 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.233
## RMSEA index = 0.156 and the 90 % confidence intervals are 0.155 NA
## BIC = 287291
## Fit based upon off diagonal values = 0.96
Prediction
#fit enet
enet_sex = train(as.formula(str_glue("sex ~ {str_c(item_names, collapse = ' + ')}")),
method = "glmnet",
trControl = trainControl(allowParallel = T, method = "repeatedcv", repeats = 10),
data = d)
enet_sex
## glmnet
##
## 3651 samples
## 86 predictor
## 2 classes: 'Female', 'Male'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 3286, 3286, 3285, 3286, 3286, 3286, ...
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0.10 0.00012 0.635 0.261
## 0.10 0.00120 0.639 0.268
## 0.10 0.01202 0.650 0.289
## 0.55 0.00012 0.636 0.262
## 0.55 0.00120 0.643 0.276
## 0.55 0.01202 0.656 0.301
## 1.00 0.00012 0.636 0.262
## 1.00 0.00120 0.646 0.282
## 1.00 0.01202 0.651 0.289
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.012.
#logistic model for comparison
logistic_model = rms::lrm(sex ~ SCR_A + SCR_B + SCR_C + SCR_D + SCR_E + SCR_M, data = d)
logistic_model
## Frequencies of Missing Values Due to Each Variable
## sex SCR_A SCR_B SCR_C SCR_D SCR_E SCR_M
## 0 444 445 473 486 487 443
##
## Logistic Regression Model
##
## rms::lrm(formula = sex ~ SCR_A + SCR_B + SCR_C + SCR_D + SCR_E +
## SCR_M, data = d)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 3068 LR chi2 162.39 R2 0.069 C 0.637
## Female 1615 d.f. 6 g 0.530 Dxy 0.274
## Male 1453 Pr(> chi2) <0.0001 gr 1.699 gamma 0.275
## max |deriv| 2e-12 gp 0.126 tau-a 0.137
## Brier 0.236
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 1.0641 0.2337 4.55 <0.0001
## SCR_A -0.1910 0.0224 -8.52 <0.0001
## SCR_B 0.1250 0.0133 9.39 <0.0001
## SCR_C 0.0256 0.0187 1.37 0.1716
## SCR_D -0.0584 0.0145 -4.02 <0.0001
## SCR_E -0.0872 0.0215 -4.05 <0.0001
## SCR_M -0.0340 0.0265 -1.28 0.1997
##
#accuracy
#base rate
mean(d$sex == "Female", na.rm=T)
## [1] 0.534
logistic_model$preds_binary = predict(logistic_model) < 0
mean(logistic_model$preds_binary == (d$sex == "Female"), na.rm = T)
## [1] 0.6