options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, caret, parallel, doParallel)
#start parallel
doParallel::registerDoParallel(7)
#read
d = read_spss("data/wls_bl_13_06.sav")
d_orig = d #for fast reload in interactive session
#reload fast: d = d_orig
#filter by waves of responding
d$R4 = d$z_ra016rey != -2
d$R5 = d$z_ga016rey != -2
d$R6 = d$z_ha016rey != -2
d["R"+4:6] %>% miss_by_var()
## R4 R5 R6
## 5226 7515 9366
d = d %>% filter(R4, R5, R6)
#recode function
recoder = function(x) {
plyr::mapvalues(x, from = c(-(1:99)), to = rep(NA, 99), warn_missing = F)
}
#cognitive items
#similarities R4
similaritiesR4_names = str_glue("z_ri00{c(2:5, 7:9)}re") %>% as.character()
similarityR4_items = suppressWarnings({d[similaritiesR4_names] %>% map_df(recoder) %>% map_df(as.character)})
names(similarityR4_items) = "similaritiesR4_" + 1:ncol(similarityR4_items)
#similarities R5
similaritiesR5_names = str_glue("z_gi11{1:9}re") %>% as.character()
similarityR5_items = suppressWarnings({d[similaritiesR5_names] %>% map_df(recoder) %>% map_df(as.character)})
names(similarityR5_items) = "similaritiesR5_" + 1:ncol(similarityR5_items)
similarity_items = cbind(similarityR4_items, similarityR5_items)
#recall tests R5
recallR5_names = str_glue("z_gi4{str_pad(3:12, width = 2, side = 'left', pad = '0')}re") %>% as.character()
delayed_recallR5_names = str_glue("z_gi4{47:56}re") %>% as.character()
recallR5_items = suppressWarnings({d[c(recallR5_names, delayed_recallR5_names)] %>% map_df(recoder) %>% map_df(~.==1)})
names(recallR5_items) = "recallR5_" + 1:ncol(recallR5_items)
recall_items = recallR5_items
#number series R6
number_seriesR6_names = str_glue("z_hi6{str_pad(0:14, width = 2, side = 'left', pad = '0')}re") %>% as.character()
number_seriesR6_items = suppressWarnings({d[number_seriesR6_names] %>% map_df(recoder) %>% map_df(~.==1)})
names(number_seriesR6_items) = "number_seriesR6_" + 1:ncol(number_seriesR6_items)
number_series_items = number_seriesR6_items
#for similarities, score 2 as 1, 0-1 as false to create binary item
similarity_items_binary = similarity_items %>% map_df(~.==2)
#all items
all_items = cbind(similarity_items, recall_items, number_series_items)
binary_items = cbind(similarity_items_binary, recall_items, number_series_items)
#others
d$age = d$z_brdxdy %>% recoder() %>% as.numeric()
d$education = d$z_hb103red %>% recoder() %>% as.numeric()
d$income = d$z_hp260hec %>% recoder() %>% as.numeric() %>% plyr::mapvalues(0, NA) %>% log10()
d$fertility = d$z_hd301kd %>% recoder() %>% as.numeric()
d$health = d$z_jxsf1rec %>% recoder() %>% as.numeric()
#outcomes
outcomes = c("age", "income", "education") #others have too weak relations
#missing data by var
miss_by_var(d[c(outcomes)])
## age income education
## 0 792 222
miss_by_var(all_items)
## similaritiesR4_1 similaritiesR4_2 similaritiesR4_3
## 38 38 38
## similaritiesR4_4 similaritiesR4_5 similaritiesR4_6
## 38 38 38
## similaritiesR4_7 similaritiesR5_1 similaritiesR5_2
## 38 507 2267
## similaritiesR5_3 similaritiesR5_4 similaritiesR5_5
## 562 2336 1053
## similaritiesR5_6 similaritiesR5_7 similaritiesR5_8
## 558 3591 2220
## similaritiesR5_9 recallR5_1 recallR5_2
## 1148 2129 2129
## recallR5_3 recallR5_4 recallR5_5
## 2129 2129 2129
## recallR5_6 recallR5_7 recallR5_8
## 2129 2129 2129
## recallR5_9 recallR5_10 recallR5_11
## 2129 2129 2360
## recallR5_12 recallR5_13 recallR5_14
## 2360 2360 2360
## recallR5_15 recallR5_16 recallR5_17
## 2360 2360 2360
## recallR5_18 recallR5_19 recallR5_20
## 2360 2360 2360
## number_seriesR6_1 number_seriesR6_2 number_seriesR6_3
## 425 776 1189
## number_seriesR6_4 number_seriesR6_5 number_seriesR6_6
## 7764 7772 7783
## number_seriesR6_7 number_seriesR6_8 number_seriesR6_9
## 6179 6301 6429
## number_seriesR6_10 number_seriesR6_11 number_seriesR6_12
## 5285 5221 5353
## number_seriesR6_13 number_seriesR6_14 number_seriesR6_15
## 6816 6820 7224
keep_items = miss_by_var(all_items) < 3600
sum(keep_items)
## [1] 39
#kept items
binary_items_ok = binary_items[, keep_items]
#sum score
d$sumscore = binary_items_ok %>% rowSums(na.rm = T)
#dist
d$sumscore %>% GG_denhist()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#low score people = all missing data or invalid
#remove them since useless for our purposes
keep_cases = d$sumscore > 2
d = d[keep_cases, ]
all_items_ok = all_items[keep_cases, keep_items]
binary_items = binary_items_ok[keep_cases, ]
binary_items_num = binary_items %>% map_df(as.numeric)
#IRT
(irt_fit = irt.fa(binary_items_num))
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = binary_items_num)
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## similaritiesR4_1 0.09 0.11 0.12 0.10 0.07 0.04 0.02
## similaritiesR4_2 0.04 0.09 0.14 0.18 0.16 0.10 0.05
## similaritiesR4_3 0.04 0.05 0.07 0.07 0.07 0.06 0.04
## similaritiesR4_6 0.02 0.05 0.12 0.20 0.23 0.17 0.09
## similaritiesR5_1 0.11 0.14 0.14 0.11 0.06 0.03 0.02
## similaritiesR5_2 0.11 0.21 0.26 0.20 0.10 0.04 0.02
## similaritiesR5_3 0.04 0.05 0.06 0.07 0.06 0.05 0.04
## similaritiesR5_4 0.23 0.31 0.24 0.11 0.04 0.01 0.00
## similaritiesR5_5 0.03 0.05 0.07 0.08 0.08 0.07 0.05
## similaritiesR5_6 0.04 0.10 0.18 0.24 0.20 0.11 0.05
## similaritiesR5_7 0.05 0.10 0.17 0.20 0.16 0.09 0.04
## similaritiesR5_8 0.03 0.07 0.12 0.17 0.17 0.12 0.07
## recallR5_3 0.04 0.06 0.07 0.07 0.06 0.05 0.03
## recallR5_11 0.05 0.06 0.07 0.07 0.06 0.05 0.03
## recallR5_15 0.04 0.05 0.07 0.07 0.07 0.06 0.04
## recallR5_18 0.03 0.05 0.06 0.06 0.06 0.05 0.04
## number_seriesR6_1 0.11 0.15 0.16 0.12 0.08 0.04 0.02
## number_seriesR6_3 0.04 0.07 0.10 0.12 0.12 0.09 0.06
## Test Info 1.15 1.78 2.22 2.25 1.85 1.24 0.73
## SEM 0.93 0.75 0.67 0.67 0.73 0.90 1.17
## Reliability 0.13 0.44 0.55 0.56 0.46 0.20 -0.37
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 702 and the objective function was 18.1
## The number of observations was 8204 with Chi Square = 147980 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.12
## The df corrected root mean square of the residuals is 0.12
##
## Tucker Lewis Index of factoring reliability = 0.08
## RMSEA index = 0.16 and the 10 % confidence intervals are 0.159 0.161
## BIC = 141653
irt_fit$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## similaritiesR4_1 0.38 0.1415 0.86 1
## similaritiesR4_2 0.44 0.1961 0.80 1
## similaritiesR4_3 0.30 0.0920 0.91 1
## similaritiesR4_4 0.28 0.0778 0.92 1
## similaritiesR4_5 0.04 0.0014 1.00 1
## similaritiesR4_6 0.50 0.2464 0.75 1
## similaritiesR4_7 0.19 0.0378 0.96 1
## similaritiesR5_1 0.41 0.1699 0.83 1
## similaritiesR5_2 0.52 0.2666 0.73 1
## similaritiesR5_3 0.29 0.0835 0.92 1
## similaritiesR5_4 0.55 0.3029 0.70 1
## similaritiesR5_5 0.32 0.1031 0.90 1
## similaritiesR5_6 0.50 0.2509 0.75 1
## similaritiesR5_7 0.47 0.2170 0.78 1
## similaritiesR5_8 0.44 0.1953 0.80 1
## similaritiesR5_9 0.07 0.0047 1.00 1
## recallR5_1 0.25 0.0648 0.94 1
## recallR5_2 0.24 0.0565 0.94 1
## recallR5_3 0.30 0.0871 0.91 1
## recallR5_4 0.25 0.0628 0.94 1
## recallR5_5 0.28 0.0797 0.92 1
## recallR5_6 0.27 0.0703 0.93 1
## recallR5_7 0.20 0.0395 0.96 1
## recallR5_8 0.25 0.0611 0.94 1
## recallR5_9 0.17 0.0294 0.97 1
## recallR5_10 0.07 0.0049 1.00 1
## recallR5_11 0.30 0.0892 0.91 1
## recallR5_12 0.24 0.0562 0.94 1
## recallR5_13 0.28 0.0779 0.92 1
## recallR5_14 0.28 0.0767 0.92 1
## recallR5_15 0.31 0.0936 0.91 1
## recallR5_16 0.28 0.0803 0.92 1
## recallR5_17 0.24 0.0585 0.94 1
## recallR5_18 0.29 0.0828 0.92 1
## recallR5_19 0.24 0.0594 0.94 1
## recallR5_20 0.18 0.0323 0.97 1
## number_seriesR6_1 0.43 0.1818 0.82 1
## number_seriesR6_2 0.22 0.0506 0.95 1
## number_seriesR6_3 0.38 0.1465 0.85 1
##
## MR1
## SS loadings 4.03
## Proportion Var 0.10
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 741 and the objective function was 20.7 with Chi Square of 169686
## The degrees of freedom for the model are 702 and the objective function was 18.1
##
## The root mean square of the residuals (RMSR) is 0.12
## The df corrected root mean square of the residuals is 0.12
##
## The harmonic number of observations is 8204 with the empirical chi square 171019 with prob < 0
## The total number of observations was 8204 with Likelihood Chi Square = 147980 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.08
## RMSEA index = 0.16 and the 90 % confidence intervals are 0.159 0.161
## BIC = 141653
## Fit based upon off diagonal values = 0.43
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.91
## Multiple R square of scores with factors 0.83
## Minimum correlation of possible factor scores 0.66
irt_fit_scores = scoreIrt(irt_fit, binary_items_num)
d$g = irt_fit_scores$theta1 %>% standardize()
#plot
GG_scatter(d, "sumscore", "g")
#TERRIBLE!
#probably related to odd missing data patterns...
#impute explicit NA
all_items_ok_na = all_items_ok %>% map_df(~as.factor(.) %>% fct_explicit_na())
#train glmnet for each
model_list = list()
for (o in outcomes) {
#subset without missing data
o_subset = cbind(y = d[[o]], all_items_ok_na)
names(o_subset)[1] = o
#filter missing
o_subset = na.omit(o_subset)
#train
model_list[[o]] = train(as.formula(str_glue("{o} ~ {str_c(names(all_items_ok_na), collapse = ' + ')}")),
method = "glmnet",
trControl = trainControl(allowParallel = T, method = "repeatedcv", repeats = 10),
data = o_subset)
}
#mere correlations
d[c("g", "sumscore", outcomes)] %>% wtd.cors() %>% .^2 #square to get r2 as in models
## g sumscore age income education
## g 1.0000 0.17595 0.01070 0.02095 0.18088
## sumscore 0.1759 1.00000 0.00775 0.00637 0.06286
## age 0.0107 0.00775 1.00000 0.01501 0.00325
## income 0.0210 0.00637 0.01501 1.00000 0.06284
## education 0.1809 0.06286 0.00325 0.06284 1.00000
#best param model fits
map_dbl(model_list, ~.$results$Rsquared %>% max())
## age income education
## 0.0269 0.0366 0.2635
This dataset is too sparse with odd missing data to be useful for this kind of analysis I think.