Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, caret, parallel, doParallel, psych, weights) #psych for irt.fa/scoreIrt, weights for wtd.cors

#start parallel
doParallel::registerDoParallel(7)

Data

#read
d = read_spss("data/wls_bl_13_06.sav")
d_orig = d #for fast reload in interactive session
#reload fast: d = d_orig

Recode

#filter by waves of responding (keep only people who responded in rounds 4-6)
d$R4 = d$z_ra016rey != -2
d$R5 = d$z_ga016rey != -2
d$R6 = d$z_ha016rey != -2
d["R"+4:6] %>% miss_by_var()
##   R4   R5   R6 
## 5226 7515 9366
d = d %>% filter(R4, R5, R6)

#recode function: all negative codes (missing/invalid responses) become NA
recoder = function(x) {
  plyr::mapvalues(x, from = c(-(1:99)), to = rep(NA, 99), warn_missing = F)
}
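
#quick check of the recoder on made-up values (not WLS codes): negative codes become NA, valid values pass through
recoder(c(2, 0, -1, -97)) #returns 2 0 NA NA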

#cognitive items
#similarities R4
similaritiesR4_names = str_glue("z_ri00{c(2:5, 7:9)}re") %>% as.character()
similarityR4_items = suppressWarnings({d[similaritiesR4_names] %>% map_df(recoder) %>% map_df(as.character)})
names(similarityR4_items) = "similaritiesR4_" + 1:ncol(similarityR4_items)
#similarities R5
similaritiesR5_names = str_glue("z_gi11{1:9}re") %>% as.character()
similarityR5_items = suppressWarnings({d[similaritiesR5_names] %>% map_df(recoder) %>% map_df(as.character)})
names(similarityR5_items) = "similaritiesR5_" + 1:ncol(similarityR5_items)
similarity_items = cbind(similarityR4_items, similarityR5_items)
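
#optional sanity check: after recoding, the similarity items should only contain the codes 0-2 (plus NA)
similarity_items %>% map(~table(., useNA = "ifany"))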

#recall tests R5
recallR5_names = str_glue("z_gi4{str_pad(3:12, width = 2, side = 'left', pad = '0')}re") %>% as.character()
delayed_recallR5_names = str_glue("z_gi4{47:56}re") %>% as.character()
recallR5_items = suppressWarnings({d[c(recallR5_names, delayed_recallR5_names)] %>% map_df(recoder) %>% map_df(~.==1)})
names(recallR5_items) = "recallR5_" + 1:ncol(recallR5_items)
recall_items = recallR5_items

#number series R6
number_seriesR6_names = str_glue("z_hi6{str_pad(0:14, width = 2, side = 'left', pad = '0')}re") %>% as.character()
number_seriesR6_items = suppressWarnings({d[number_seriesR6_names] %>% map_df(recoder) %>% map_df(~.==1)})
names(number_seriesR6_items) = "number_seriesR6_" + 1:ncol(number_seriesR6_items)
number_series_items = number_seriesR6_items

#for similarities, score 2 as TRUE and 0-1 as FALSE to create binary items
similarity_items_binary = similarity_items %>% map_df(~.==2)
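
#note: the similarity columns are stored as character, so the comparison above relies on R coercing 2 to "2"
c("0", "1", "2", NA) == 2 #toy vector: FALSE FALSE TRUE NA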

#all items
all_items = cbind(similarity_items, recall_items, number_series_items)
binary_items = cbind(similarity_items_binary, recall_items, number_series_items)

#other variables
d$age = d$z_brdxdy %>% recoder() %>% as.numeric()
d$education = d$z_hb103red %>% recoder() %>% as.numeric()
d$income = d$z_hp260hec %>% recoder() %>% as.numeric() %>% plyr::mapvalues(0, NA) %>% log10()
d$fertility = d$z_hd301kd %>% recoder() %>% as.numeric()
d$health = d$z_jxsf1rec %>% recoder() %>% as.numeric()

#outcomes
outcomes = c("age", "income", "education") #fertility and health relate too weakly to be useful

#missing data by var
miss_by_var(d[c(outcomes)])
##       age    income education 
##         0       792       222
miss_by_var(all_items)
##   similaritiesR4_1   similaritiesR4_2   similaritiesR4_3 
##                 38                 38                 38 
##   similaritiesR4_4   similaritiesR4_5   similaritiesR4_6 
##                 38                 38                 38 
##   similaritiesR4_7   similaritiesR5_1   similaritiesR5_2 
##                 38                507               2267 
##   similaritiesR5_3   similaritiesR5_4   similaritiesR5_5 
##                562               2336               1053 
##   similaritiesR5_6   similaritiesR5_7   similaritiesR5_8 
##                558               3591               2220 
##   similaritiesR5_9         recallR5_1         recallR5_2 
##               1148               2129               2129 
##         recallR5_3         recallR5_4         recallR5_5 
##               2129               2129               2129 
##         recallR5_6         recallR5_7         recallR5_8 
##               2129               2129               2129 
##         recallR5_9        recallR5_10        recallR5_11 
##               2129               2129               2360 
##        recallR5_12        recallR5_13        recallR5_14 
##               2360               2360               2360 
##        recallR5_15        recallR5_16        recallR5_17 
##               2360               2360               2360 
##        recallR5_18        recallR5_19        recallR5_20 
##               2360               2360               2360 
##  number_seriesR6_1  number_seriesR6_2  number_seriesR6_3 
##                425                776               1189 
##  number_seriesR6_4  number_seriesR6_5  number_seriesR6_6 
##               7764               7772               7783 
##  number_seriesR6_7  number_seriesR6_8  number_seriesR6_9 
##               6179               6301               6429 
## number_seriesR6_10 number_seriesR6_11 number_seriesR6_12 
##               5285               5221               5353 
## number_seriesR6_13 number_seriesR6_14 number_seriesR6_15 
##               6816               6820               7224
#keep items with fewer than 3600 missing values
keep_items = miss_by_var(all_items) < 3600
sum(keep_items)
## [1] 39
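
#per the missingness counts above, the 12 dropped items are all number series items (number_seriesR6_4 to _15)
names(all_items)[!keep_items]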

Traditional psychometrics

#kept items
binary_items_ok = binary_items[, keep_items]

#sum score (number of correct answers; with na.rm = T, NA counts as incorrect)
d$sumscore = binary_items_ok %>% rowSums(na.rm = T)
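
#the raw sum conflates ability with how many items a person answered; a simple alternative (not used below)
#would be the fraction correct of the items actually answered
fraction_correct = rowMeans(binary_items_ok, na.rm = T)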

#distribution of sum scores
d$sumscore %>% GG_denhist()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#people with very low scores have mostly missing or invalid data
#remove them since they are useless for our purposes
keep_cases = d$sumscore > 2
d = d[keep_cases, ]
#apply the same case filter to the item matrices
all_items_ok = all_items[keep_cases, keep_items]
binary_items = binary_items_ok[keep_cases, ]
binary_items_num = binary_items %>% map_df(as.numeric)

#IRT (item response analysis via factor analysis of tetrachoric correlations)
(irt_fit = irt.fa(binary_items_num))

## Item Response Analysis using Factor Analysis 
## 
## Call: irt.fa(x = binary_items_num)
## Item Response Analysis using Factor Analysis  
## 
##  Summary information by factor and item
##  Factor =  1 
##                     -3   -2   -1    0    1    2     3
## similaritiesR4_1  0.09 0.11 0.12 0.10 0.07 0.04  0.02
## similaritiesR4_2  0.04 0.09 0.14 0.18 0.16 0.10  0.05
## similaritiesR4_3  0.04 0.05 0.07 0.07 0.07 0.06  0.04
## similaritiesR4_6  0.02 0.05 0.12 0.20 0.23 0.17  0.09
## similaritiesR5_1  0.11 0.14 0.14 0.11 0.06 0.03  0.02
## similaritiesR5_2  0.11 0.21 0.26 0.20 0.10 0.04  0.02
## similaritiesR5_3  0.04 0.05 0.06 0.07 0.06 0.05  0.04
## similaritiesR5_4  0.23 0.31 0.24 0.11 0.04 0.01  0.00
## similaritiesR5_5  0.03 0.05 0.07 0.08 0.08 0.07  0.05
## similaritiesR5_6  0.04 0.10 0.18 0.24 0.20 0.11  0.05
## similaritiesR5_7  0.05 0.10 0.17 0.20 0.16 0.09  0.04
## similaritiesR5_8  0.03 0.07 0.12 0.17 0.17 0.12  0.07
## recallR5_3        0.04 0.06 0.07 0.07 0.06 0.05  0.03
## recallR5_11       0.05 0.06 0.07 0.07 0.06 0.05  0.03
## recallR5_15       0.04 0.05 0.07 0.07 0.07 0.06  0.04
## recallR5_18       0.03 0.05 0.06 0.06 0.06 0.05  0.04
## number_seriesR6_1 0.11 0.15 0.16 0.12 0.08 0.04  0.02
## number_seriesR6_3 0.04 0.07 0.10 0.12 0.12 0.09  0.06
## Test Info         1.15 1.78 2.22 2.25 1.85 1.24  0.73
## SEM               0.93 0.75 0.67 0.67 0.73 0.90  1.17
## Reliability       0.13 0.44 0.55 0.56 0.46 0.20 -0.37
## 
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## 
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 702  and the objective function was  18.1 
## The number of observations was  8204  with Chi Square =  147980  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.12 
## The df corrected root mean square of the residuals is  0.12 
## 
## Tucker Lewis Index of factoring reliability =  0.08
## RMSEA index =  0.16  and the 10 % confidence intervals are  0.159 0.161
## BIC =  141653
irt_fit$fa
## Factor Analysis using method =  minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                    MR1     h2   u2 com
## similaritiesR4_1  0.38 0.1415 0.86   1
## similaritiesR4_2  0.44 0.1961 0.80   1
## similaritiesR4_3  0.30 0.0920 0.91   1
## similaritiesR4_4  0.28 0.0778 0.92   1
## similaritiesR4_5  0.04 0.0014 1.00   1
## similaritiesR4_6  0.50 0.2464 0.75   1
## similaritiesR4_7  0.19 0.0378 0.96   1
## similaritiesR5_1  0.41 0.1699 0.83   1
## similaritiesR5_2  0.52 0.2666 0.73   1
## similaritiesR5_3  0.29 0.0835 0.92   1
## similaritiesR5_4  0.55 0.3029 0.70   1
## similaritiesR5_5  0.32 0.1031 0.90   1
## similaritiesR5_6  0.50 0.2509 0.75   1
## similaritiesR5_7  0.47 0.2170 0.78   1
## similaritiesR5_8  0.44 0.1953 0.80   1
## similaritiesR5_9  0.07 0.0047 1.00   1
## recallR5_1        0.25 0.0648 0.94   1
## recallR5_2        0.24 0.0565 0.94   1
## recallR5_3        0.30 0.0871 0.91   1
## recallR5_4        0.25 0.0628 0.94   1
## recallR5_5        0.28 0.0797 0.92   1
## recallR5_6        0.27 0.0703 0.93   1
## recallR5_7        0.20 0.0395 0.96   1
## recallR5_8        0.25 0.0611 0.94   1
## recallR5_9        0.17 0.0294 0.97   1
## recallR5_10       0.07 0.0049 1.00   1
## recallR5_11       0.30 0.0892 0.91   1
## recallR5_12       0.24 0.0562 0.94   1
## recallR5_13       0.28 0.0779 0.92   1
## recallR5_14       0.28 0.0767 0.92   1
## recallR5_15       0.31 0.0936 0.91   1
## recallR5_16       0.28 0.0803 0.92   1
## recallR5_17       0.24 0.0585 0.94   1
## recallR5_18       0.29 0.0828 0.92   1
## recallR5_19       0.24 0.0594 0.94   1
## recallR5_20       0.18 0.0323 0.97   1
## number_seriesR6_1 0.43 0.1818 0.82   1
## number_seriesR6_2 0.22 0.0506 0.95   1
## number_seriesR6_3 0.38 0.1465 0.85   1
## 
##                 MR1
## SS loadings    4.03
## Proportion Var 0.10
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  741  and the objective function was  20.7 with Chi Square of  169686
## The degrees of freedom for the model are 702  and the objective function was  18.1 
## 
## The root mean square of the residuals (RMSR) is  0.12 
## The df corrected root mean square of the residuals is  0.12 
## 
## The harmonic number of observations is  8204 with the empirical chi square  171019  with prob <  0 
## The total number of observations was  8204  with Likelihood Chi Square =  147980  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.08
## RMSEA index =  0.16  and the 90 % confidence intervals are  0.159 0.161
## BIC =  141653
## Fit based upon off diagonal values = 0.43
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.91
## Multiple R square of scores with factors          0.83
## Minimum correlation of possible factor scores     0.66
irt_fit_scores = scoreIrt(irt_fit, binary_items_num)
d$g = irt_fit_scores$theta1 %>% standardize()

#plot sum score against the IRT g score
GG_scatter(d, "sumscore", "g")

#TERRIBLE!
#probably related to odd missing data patterns...
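
#a quick check of this explanation: count the items each person answered and see how strongly that tracks the two scores
n_answered = rowSums(!is.na(binary_items_num))
wtd.cors(cbind(n_answered, g = d$g, sumscore = d$sumscore))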

Machine learning

#convert items to factors with an explicit level for missing values
all_items_ok_na = all_items_ok %>% map_df(~as.factor(.) %>% fct_explicit_na())
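
#fct_explicit_na() turns NA into its own "(Missing)" level, so missingness itself becomes a usable predictor category
levels(fct_explicit_na(factor(c("0", "2", NA)))) #toy example: "0" "2" "(Missing)"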

#train an elastic net (glmnet) model for each outcome
model_list = list()
for (o in outcomes) {
  
  #combine outcome with the item predictors
  o_subset = cbind(y = d[[o]], all_items_ok_na)
  names(o_subset)[1] = o
  
  #drop cases with a missing outcome (the items have no NAs after the explicit-NA recoding)
  o_subset = na.omit(o_subset)
  
  #train
  model_list[[o]] = train(as.formula(str_glue("{o} ~ {str_c(names(all_items_ok_na), collapse = ' + ')}")), 
                          method = "glmnet", 
                          trControl = trainControl(allowParallel = T, method = "repeatedcv", repeats = 10), 
                          data = o_subset)
}
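
#if of interest, the selected elastic net hyperparameters (alpha, lambda) for each outcome
map(model_list, "bestTune")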

#mere correlations
d[c("g", "sumscore", outcomes)] %>% wtd.cors() %>% .^2 #square to get r2 as in models
##                g sumscore     age  income education
## g         1.0000  0.17595 0.01070 0.02095   0.18088
## sumscore  0.1759  1.00000 0.00775 0.00637   0.06286
## age       0.0107  0.00775 1.00000 0.01501   0.00325
## income    0.0210  0.00637 0.01501 1.00000   0.06284
## education 0.1809  0.06286 0.00325 0.06284   1.00000
#best cross-validated R2 across hyperparameter combinations for each model
map_dbl(model_list, ~.$results$Rsquared %>% max())
##       age    income education 
##    0.0269    0.0366    0.2635
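
#alternatively, the resampled R2 at the hyperparameters caret actually selected (by RMSE), which can differ slightly from the max above
map_dbl(model_list, ~getTrainPerf(.)$TrainRsquared)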

I think this dataset is too sparse, and its missing data patterns too odd, to be useful for this kind of analysis.