Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, caret, parallel, doParallel)

#start parallel
doParallel::registerDoParallel(7)

Data

d = read_sav("data/ncds1958/UKDA-8288-spss/spss/spss19/bcs1986_reading_matrices.sav")

#variables table
d_table = tibble(
  colname = colnames(d),
  label = map_chr(d, ~attr(., "label"))
)

#dataset
dim(d)
## [1] 3651  183
#item names
item_names = names(d) %>% str_subset("ANS_")
item_binary_names = names(d) %>% str_subset("SCR.+?\\d$")

#recode
d$sex = plyr::mapvalues(d$SEX, 1:3, c("Male", "Female", NA)) %>% factor()
## The following `from` values were not present in `x`: 3
#recode scores into numeric
for (v in str_subset(names(d), "SCR_")) d[[v]] = d[[v]] %>% as.numeric()

Traditional psychometrics

#sumscore
d$sumscore = d$SCRTOTAL %>% as.numeric()

#IRT
irt_fit = irt.fa(d %>% select(!!item_binary_names))
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was
## done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs
## = np.obs, : The estimated weights for the factor scores are probably
## incorrect. Try a different factor extraction method.

irt_fit
## Item Response Analysis using Factor Analysis 
## 
## Call: irt.fa(x = d %>% select(!!item_binary_names))
## Item Response Analysis using Factor Analysis  
## 
##  Summary information by factor and item
##  Factor =  1 
##                -3    -2    -1     0    1    2    3
## SCR_A1       0.25  0.33  0.23  0.10 0.04 0.01 0.00
## SCR_A2       0.22  0.34  0.28  0.13 0.05 0.02 0.00
## SCR_A3       0.36  0.43  0.23  0.08 0.02 0.01 0.00
## SCR_A4       0.10  0.20  0.27  0.21 0.11 0.05 0.02
## SCR_A5       0.17  0.35  0.36  0.19 0.07 0.02 0.01
## SCR_A6       0.20  0.34  0.30  0.15 0.05 0.02 0.01
## SCR_A7       0.19  0.37  0.36  0.18 0.06 0.02 0.01
## SCR_A8       0.27  0.36  0.24  0.10 0.03 0.01 0.00
## SCR_A9       0.12  0.25  0.33  0.23 0.10 0.04 0.01
## SCR_A10      0.17  0.31  0.30  0.17 0.07 0.02 0.01
## SCR_B1       0.22  0.37  0.31  0.14 0.05 0.02 0.00
## SCR_B2       0.36  1.35  0.53  0.06 0.01 0.00 0.00
## SCR_B3       0.10  0.27  0.40  0.28 0.11 0.04 0.01
## SCR_B4       0.22  0.70  0.63  0.17 0.03 0.01 0.00
## SCR_B5       0.34  0.88  0.48  0.10 0.02 0.00 0.00
## SCR_B6       0.22  0.62  0.54  0.17 0.04 0.01 0.00
## SCR_B7       0.39  1.08  0.47  0.07 0.01 0.00 0.00
## SCR_B8       0.20  0.59  0.58  0.19 0.04 0.01 0.00
## SCR_B9       0.22  0.70  0.60  0.16 0.03 0.01 0.00
## SCR_B10      0.08  0.30  0.59  0.39 0.12 0.03 0.01
## SCR_B11      0.07  0.23  0.45  0.38 0.16 0.05 0.01
## SCR_B12      0.15  0.51  0.66  0.25 0.06 0.01 0.00
## SCR_B13      0.04  0.15  0.36  0.45 0.24 0.08 0.02
## SCR_B14      0.04  0.16  0.48  0.56 0.23 0.06 0.01
## SCR_B15      0.05  0.20  0.56  0.53 0.19 0.04 0.01
## SCR_B16      0.07  0.22  0.46  0.40 0.16 0.05 0.01
## SCR_B17      0.05  0.19  0.44  0.45 0.20 0.06 0.01
## SCR_B18      0.06  0.27  0.68  0.50 0.14 0.03 0.01
## SCR_B19      0.05  0.17  0.41  0.46 0.22 0.06 0.02
## SCR_B20      0.08  0.21  0.33  0.29 0.14 0.05 0.02
## SCR_C1       0.04  0.12  0.27  0.35 0.23 0.10 0.03
## SCR_C2       0.28  0.35  0.23  0.09 0.03 0.01 0.00
## SCR_C3       0.10  0.16  0.18  0.14 0.08 0.04 0.02
## SCR_C4       0.08  0.12  0.13  0.11 0.08 0.05 0.03
## SCR_C5       0.20  0.29  0.25  0.13 0.05 0.02 0.01
## SCR_C6       0.05  0.07  0.09  0.10 0.09 0.07 0.05
## SCR_C7       0.08  0.13  0.16  0.14 0.10 0.06 0.03
## SCR_C8       0.09  0.17  0.23  0.21 0.12 0.06 0.02
## SCR_C9       0.10  0.13  0.14  0.11 0.07 0.04 0.02
## SCR_C10      0.15  0.25  0.26  0.16 0.07 0.03 0.01
## SCR_C11      0.10  0.15  0.16  0.13 0.08 0.04 0.02
## SCR_C12      0.07  0.12  0.17  0.17 0.12 0.07 0.03
## SCR_C13      0.07  0.12  0.15  0.15 0.11 0.06 0.03
## SCR_C14      0.10  0.15  0.17  0.14 0.08 0.04 0.02
## SCR_C15      0.14  0.27  0.30  0.20 0.08 0.03 0.01
## SCR_D1       0.07  0.14  0.22  0.23 0.15 0.07 0.03
## SCR_D2       0.06  0.16  0.30  0.32 0.18 0.07 0.02
## SCR_D3       0.07  0.12  0.17  0.17 0.13 0.07 0.04
## SCR_D4       0.07  0.19  0.36  0.35 0.18 0.06 0.02
## SCR_D6       0.06  0.14  0.26  0.29 0.19 0.08 0.03
## SCR_D7       0.04  0.09  0.15  0.19 0.17 0.11 0.05
## SCR_D8       0.07  0.18  0.33  0.31 0.17 0.06 0.02
## SCR_D10      0.06  0.17  0.33  0.34 0.18 0.07 0.02
## SCR_D11      0.14  0.35  0.45  0.25 0.08 0.02 0.01
## SCR_D12      0.07  0.15  0.23  0.23 0.15 0.07 0.03
## SCR_D13      0.05  0.18  0.42  0.45 0.20 0.06 0.02
## SCR_D14      0.07  0.14  0.20  0.19 0.13 0.07 0.03
## SCR_D15      0.07  0.14  0.21  0.21 0.14 0.07 0.03
## SCR_D16      0.04  0.09  0.16  0.20 0.17 0.11 0.05
## SCR_D17      0.04  0.07  0.11  0.13 0.12 0.09 0.05
## SCR_E1       0.16  0.29  0.30  0.18 0.07 0.03 0.01
## SCR_E2       0.08  0.31  0.65  0.42 0.12 0.03 0.01
## SCR_E3       0.09  0.15  0.19  0.17 0.11 0.05 0.03
## SCR_E5       0.07  0.13  0.19  0.18 0.13 0.07 0.03
## SCR_E6       0.20  0.34  0.30  0.15 0.05 0.02 0.01
## SCR_E7       0.16  0.35  0.39  0.21 0.07 0.02 0.01
## SCR_E8       0.14  0.33  0.41  0.24 0.08 0.03 0.01
## SCR_E9       0.03  0.09  0.24  0.39 0.31 0.13 0.04
## SCR_E10      0.14  0.40  0.53  0.26 0.08 0.02 0.00
## SCR_E11      0.08  0.22  0.41  0.35 0.15 0.05 0.01
## SCR_E12      0.13  0.24  0.29  0.20 0.09 0.03 0.01
## SCR_E13      0.08  0.16  0.24  0.23 0.14 0.07 0.03
## SCR_M1       0.07  0.07  0.07  0.05 0.04 0.03 0.02
## SCR_M2       0.18  0.20  0.15  0.09 0.04 0.02 0.01
## SCR_M3       0.25  0.31  0.22  0.10 0.04 0.01 0.00
## SCR_M4       0.21  0.22  0.15  0.08 0.03 0.01 0.01
## SCR_M5       0.14  0.22  0.22  0.14 0.07 0.03 0.01
## SCR_M6       0.18  0.34  0.33  0.18 0.06 0.02 0.01
## SCR_M7       0.11  0.15  0.16  0.12 0.07 0.04 0.02
## SCR_M8       0.09  0.14  0.17  0.14 0.09 0.05 0.02
## SCR_M9       0.07  0.12  0.17  0.16 0.12 0.07 0.03
## SCR_M10      0.04  0.06  0.08  0.09 0.08 0.07 0.04
## Test Info   10.37 22.15 25.48 17.88 8.57 3.47 1.36
## SEM          0.31  0.21  0.20  0.24 0.34 0.54 0.86
## Reliability  0.90  0.95  0.96  0.94 0.88 0.71 0.27
## 
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## 
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 3569  and the objective function was  87.5 
## The number of observations was  3651  with Chi Square =  316567  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.06 
## The df corrected root mean square of the residuals is  0.06 
## 
## Tucker Lewis Index of factoring reliability =  0.233
## RMSEA index =  0.156  and the 10 % confidence intervals are  0.155 NA
## BIC =  287291
irt_fit$fa
## Factor Analysis using method =  minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
##          MR1      h2   u2 com
## SCR_A1  0.56 0.31279 0.69   1
## SCR_A2  0.57 0.32178 0.68   1
## SCR_A3  0.62 0.38231 0.62   1
## SCR_A4  0.52 0.26827 0.73   1
## SCR_A5  0.59 0.34834 0.65   1
## SCR_A6  0.57 0.32512 0.67   1
## SCR_A7  0.60 0.35843 0.64   1
## SCR_A8  0.58 0.33103 0.67   1
## SCR_A9  0.56 0.31577 0.68   1
## SCR_A10 0.56 0.31358 0.69   1
## SCR_B1  0.59 0.34329 0.66   1
## SCR_B2  0.81 0.65333 0.35   1
## SCR_B3  0.60 0.35428 0.65   1
## SCR_B4  0.73 0.52822 0.47   1
## SCR_B5  0.74 0.55101 0.45   1
## SCR_B6  0.70 0.48890 0.51   1
## SCR_B7  0.77 0.59837 0.40   1
## SCR_B8  0.70 0.48763 0.51   1
## SCR_B9  0.72 0.52332 0.48   1
## SCR_B10 0.67 0.44994 0.55   1
## SCR_B11 0.63 0.39324 0.61   1
## SCR_B12 0.70 0.49130 0.51   1
## SCR_B13 0.63 0.39149 0.61   1
## SCR_B14 0.68 0.45699 0.54   1
## SCR_B15 0.68 0.46869 0.53   1
## SCR_B16 0.63 0.40237 0.60   1
## SCR_B17 0.64 0.41191 0.59   1
## SCR_B18 0.71 0.49801 0.50   1
## SCR_B19 0.64 0.40667 0.59   1
## SCR_B20 0.57 0.31961 0.68   1
## SCR_C1  0.57 0.32961 0.67   1
## SCR_C2  0.57 0.32990 0.67   1
## SCR_C3  0.44 0.19690 0.80   1
## SCR_C4  0.39 0.15231 0.85   1
## SCR_C5  0.54 0.29084 0.71   1
## SCR_C6  0.36 0.12661 0.87   1
## SCR_C7  0.42 0.17956 0.82   1
## SCR_C8  0.50 0.24534 0.75   1
## SCR_C9  0.41 0.16410 0.84   1
## SCR_C10 0.53 0.27608 0.72   1
## SCR_C11 0.43 0.18277 0.82   1
## SCR_C12 0.44 0.19794 0.80   1
## SCR_C13 0.42 0.17622 0.82   1
## SCR_C14 0.43 0.18909 0.81   1
## SCR_C15 0.55 0.30111 0.70   1
## SCR_D1  0.50 0.24697 0.75   1
## SCR_D2  0.56 0.31736 0.68   1
## SCR_D3  0.44 0.19668 0.80   1
## SCR_D4  0.59 0.35063 0.65   1
## SCR_D5  0.23 0.05339 0.95   1
## SCR_D6  0.54 0.29462 0.71   1
## SCR_D7  0.46 0.20921 0.79   1
## SCR_D8  0.57 0.32486 0.68   1
## SCR_D9  0.02 0.00062 1.00   1
## SCR_D10 0.58 0.33668 0.66   1
## SCR_D11 0.62 0.38616 0.61   1
## SCR_D12 0.50 0.25096 0.75   1
## SCR_D13 0.63 0.40254 0.60   1
## SCR_D14 0.47 0.21981 0.78   1
## SCR_D15 0.48 0.23340 0.77   1
## SCR_D16 0.47 0.21676 0.78   1
## SCR_D17 0.40 0.15782 0.84   1
## SCR_E1  0.55 0.30767 0.69   1
## SCR_E2  0.69 0.47451 0.53   1
## SCR_E3  0.46 0.20908 0.79   1
## SCR_E4  0.21 0.04299 0.96   1
## SCR_E5  0.46 0.21228 0.79   1
## SCR_E6  0.57 0.32594 0.67   1
## SCR_E7  0.60 0.36361 0.64   1
## SCR_E8  0.60 0.36551 0.63   1
## SCR_E9  0.60 0.35534 0.64   1
## SCR_E10 0.66 0.43074 0.57   1
## SCR_E11 0.61 0.37101 0.63   1
## SCR_E12 0.54 0.28671 0.71   1
## SCR_E13 0.50 0.25464 0.75   1
## SCR_M1  0.30 0.08969 0.91   1
## SCR_M2  0.47 0.22050 0.78   1
## SCR_M3  0.55 0.30279 0.70   1
## SCR_M4  0.49 0.23611 0.76   1
## SCR_M5  0.49 0.24118 0.76   1
## SCR_M6  0.58 0.33559 0.66   1
## SCR_M7  0.43 0.18108 0.82   1
## SCR_M8  0.43 0.18757 0.81   1
## SCR_M9  0.44 0.19115 0.81   1
## SCR_M10 0.33 0.11097 0.89   1
## SCR_M11 0.21 0.04550 0.95   1
## 
##                 MR1
## SS loadings    26.2
## Proportion Var  0.3
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  3655  and the objective function was  117 with Chi Square of  421819
## The degrees of freedom for the model are 3569  and the objective function was  87.5 
## 
## The root mean square of the residuals (RMSR) is  0.06 
## The df corrected root mean square of the residuals is  0.06 
## 
## The harmonic number of observations is  3651 with the empirical chi square  101042  with prob <  0 
## The total number of observations was  3651  with Likelihood Chi Square =  316567  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.233
## RMSEA index =  0.156  and the 90 % confidence intervals are  0.155 NA
## BIC =  287291
## Fit based upon off diagonal values = 0.96

Prediction

#fit enet
enet_sex = train(as.formula(str_glue("sex ~ {str_c(item_names, collapse = ' + ')}")), 
                          method = "glmnet", 
                          trControl = trainControl(allowParallel = T, method = "repeatedcv", repeats = 10), 
                          data = d)
enet_sex
## glmnet 
## 
## 3651 samples
##   86 predictor
##    2 classes: 'Female', 'Male' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 3286, 3286, 3285, 3286, 3286, 3286, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda   Accuracy  Kappa
##   0.10   0.00012  0.635     0.261
##   0.10   0.00120  0.639     0.268
##   0.10   0.01202  0.650     0.289
##   0.55   0.00012  0.636     0.262
##   0.55   0.00120  0.643     0.276
##   0.55   0.01202  0.656     0.301
##   1.00   0.00012  0.636     0.262
##   1.00   0.00120  0.646     0.282
##   1.00   0.01202  0.651     0.289
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0.55 and lambda = 0.012.
#logistic model for comparison
logistic_model = rms::lrm(sex ~ SCR_A + SCR_B + SCR_C + SCR_D + SCR_E + SCR_M, data = d)
logistic_model
## Frequencies of Missing Values Due to Each Variable
##   sex SCR_A SCR_B SCR_C SCR_D SCR_E SCR_M 
##     0   444   445   473   486   487   443 
## 
## Logistic Regression Model
##  
##  rms::lrm(formula = sex ~ SCR_A + SCR_B + SCR_C + SCR_D + SCR_E + 
##      SCR_M, data = d)
##  
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs          3068    LR chi2     162.39    R2       0.069    C       0.637    
##   Female      1615    d.f.             6    g        0.530    Dxy     0.274    
##   Male        1453    Pr(> chi2) <0.0001    gr       1.699    gamma   0.275    
##  max |deriv| 2e-12                          gp       0.126    tau-a   0.137    
##                                             Brier    0.236                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept  1.0641 0.2337  4.55  <0.0001 
##  SCR_A     -0.1910 0.0224 -8.52  <0.0001 
##  SCR_B      0.1250 0.0133  9.39  <0.0001 
##  SCR_C      0.0256 0.0187  1.37  0.1716  
##  SCR_D     -0.0584 0.0145 -4.02  <0.0001 
##  SCR_E     -0.0872 0.0215 -4.05  <0.0001 
##  SCR_M     -0.0340 0.0265 -1.28  0.1997  
## 
#accuracy
#base rate
mean(d$sex == "Female", na.rm=T)
## [1] 0.534
logistic_model$preds_binary = predict(logistic_model) < 0
mean(logistic_model$preds_binary == (d$sex == "Female"), na.rm = T)
## [1] 0.6