Data origin

Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, mirt, rms, caret, future, doFuture)

#parallel
registerDoFuture()
plan(multiprocess)

Data

d = read_csv2("data/FE_1933_2006_data.csv", col_names = F) %>% as_tibble()
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Parsed with column specification:
## cols(
##   .default = col_double()
## )
## See spec(...) for full column specifications.
coding = read_csv2("data/FE_1933_2006_variables_and_coding.csv")
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   Variables = col_character(),
##   Coding = col_character(),
##   Subtests = col_character()
## )
names(d) = coding$Variables

coding

Prep

#item prep
items = d %>% select(a1.1:a4.40, b1.1:b5.50) %>% map_df(function(x) {x[x == -9] = NA; x})
itemsNA = d %>% select(a1.1:a4.40, b1.1:b5.50)

IRT score

#fit single factor model
fit_g = mirt(items, 1)
## Item re-scored so that all values are within a distance of 1
## Item re-scored so that all values are within a distance of 1
## 
Iteration: 1, Log-Lik: -163652.651, Max-Change: 8.67358
Iteration: 2, Log-Lik: -155109.148, Max-Change: 4.38435
Iteration: 3, Log-Lik: -153561.129, Max-Change: 2.71341
Iteration: 4, Log-Lik: -153065.916, Max-Change: 1.24756
Iteration: 5, Log-Lik: -152814.470, Max-Change: 0.93387
Iteration: 6, Log-Lik: -152712.266, Max-Change: 0.62086
Iteration: 7, Log-Lik: -152655.800, Max-Change: 0.51834
Iteration: 8, Log-Lik: -152619.016, Max-Change: 0.39072
Iteration: 9, Log-Lik: -152591.418, Max-Change: 0.36920
Iteration: 10, Log-Lik: -152568.807, Max-Change: 0.30980
Iteration: 11, Log-Lik: -152549.614, Max-Change: 0.31839
Iteration: 12, Log-Lik: -152532.953, Max-Change: 0.28563
Iteration: 13, Log-Lik: -152518.403, Max-Change: 0.28459
Iteration: 14, Log-Lik: -152505.609, Max-Change: 0.25460
Iteration: 15, Log-Lik: -152494.369, Max-Change: 0.25840
Iteration: 16, Log-Lik: -152484.467, Max-Change: 0.23271
Iteration: 17, Log-Lik: -152475.693, Max-Change: 0.23391
Iteration: 18, Log-Lik: -152467.951, Max-Change: 0.20927
Iteration: 19, Log-Lik: -152461.044, Max-Change: 0.20750
Iteration: 20, Log-Lik: -152454.919, Max-Change: 0.18394
Iteration: 21, Log-Lik: -152449.421, Max-Change: 0.18271
Iteration: 22, Log-Lik: -152444.534, Max-Change: 0.16221
Iteration: 23, Log-Lik: -152440.131, Max-Change: 0.16076
Iteration: 24, Log-Lik: -152436.209, Max-Change: 0.14222
Iteration: 25, Log-Lik: -152433.786, Max-Change: 0.06840
Iteration: 26, Log-Lik: -152430.242, Max-Change: 0.11407
Iteration: 27, Log-Lik: -152427.253, Max-Change: 0.13957
Iteration: 28, Log-Lik: -152425.540, Max-Change: 0.04698
Iteration: 29, Log-Lik: -152422.860, Max-Change: 0.09444
Iteration: 30, Log-Lik: -152420.690, Max-Change: 0.10279
Iteration: 31, Log-Lik: -152419.477, Max-Change: 0.04193
Iteration: 32, Log-Lik: -152417.441, Max-Change: 0.07168
Iteration: 33, Log-Lik: -152415.809, Max-Change: 0.08446
Iteration: 34, Log-Lik: -152414.927, Max-Change: 0.03009
Iteration: 35, Log-Lik: -152413.195, Max-Change: 0.02860
Iteration: 36, Log-Lik: -152411.917, Max-Change: 0.03428
Iteration: 37, Log-Lik: -152411.160, Max-Change: 0.01989
Iteration: 38, Log-Lik: -152409.874, Max-Change: 0.02733
Iteration: 39, Log-Lik: -152408.934, Max-Change: 0.02254
Iteration: 40, Log-Lik: -152408.235, Max-Change: 0.01032
Iteration: 41, Log-Lik: -152407.346, Max-Change: 0.02572
Iteration: 42, Log-Lik: -152406.522, Max-Change: 0.01020
Iteration: 43, Log-Lik: -152405.863, Max-Change: 0.02291
Iteration: 44, Log-Lik: -152405.190, Max-Change: 0.02045
Iteration: 45, Log-Lik: -152404.596, Max-Change: 0.00906
Iteration: 46, Log-Lik: -152404.034, Max-Change: 0.00892
Iteration: 47, Log-Lik: -152403.586, Max-Change: 0.00709
Iteration: 48, Log-Lik: -152403.184, Max-Change: 0.00812
Iteration: 49, Log-Lik: -152402.096, Max-Change: 0.00780
Iteration: 50, Log-Lik: -152401.826, Max-Change: 0.00552
Iteration: 51, Log-Lik: -152401.613, Max-Change: 0.00478
Iteration: 52, Log-Lik: -152400.850, Max-Change: 0.00918
Iteration: 53, Log-Lik: -152400.724, Max-Change: 0.00367
Iteration: 54, Log-Lik: -152400.631, Max-Change: 0.00338
Iteration: 55, Log-Lik: -152400.270, Max-Change: 0.00403
Iteration: 56, Log-Lik: -152400.229, Max-Change: 0.00353
Iteration: 57, Log-Lik: -152400.184, Max-Change: 0.00320
Iteration: 58, Log-Lik: -152400.085, Max-Change: 0.00452
Iteration: 59, Log-Lik: -152400.062, Max-Change: 0.00461
Iteration: 60, Log-Lik: -152400.022, Max-Change: 0.00309
Iteration: 61, Log-Lik: -152400.007, Max-Change: 0.00176
Iteration: 62, Log-Lik: -152399.991, Max-Change: 0.00210
Iteration: 63, Log-Lik: -152399.975, Max-Change: 0.00147
Iteration: 64, Log-Lik: -152399.966, Max-Change: 0.00095
Iteration: 65, Log-Lik: -152399.956, Max-Change: 0.00097
Iteration: 66, Log-Lik: -152399.948, Max-Change: 0.00355
Iteration: 67, Log-Lik: -152399.931, Max-Change: 0.00125
Iteration: 68, Log-Lik: -152399.921, Max-Change: 0.00106
Iteration: 69, Log-Lik: -152399.913, Max-Change: 0.00086
Iteration: 70, Log-Lik: -152399.896, Max-Change: 0.00141
Iteration: 71, Log-Lik: -152399.893, Max-Change: 0.00097
Iteration: 72, Log-Lik: -152399.891, Max-Change: 0.00066
Iteration: 73, Log-Lik: -152399.889, Max-Change: 0.00197
Iteration: 74, Log-Lik: -152399.883, Max-Change: 0.00271
Iteration: 75, Log-Lik: -152399.872, Max-Change: 0.00070
Iteration: 76, Log-Lik: -152399.871, Max-Change: 0.00066
Iteration: 77, Log-Lik: -152399.868, Max-Change: 0.00048
Iteration: 78, Log-Lik: -152399.866, Max-Change: 0.00034
Iteration: 79, Log-Lik: -152399.864, Max-Change: 0.00053
Iteration: 80, Log-Lik: -152399.863, Max-Change: 0.00037
Iteration: 81, Log-Lik: -152399.861, Max-Change: 0.00134
Iteration: 82, Log-Lik: -152399.856, Max-Change: 0.00177
Iteration: 83, Log-Lik: -152399.854, Max-Change: 0.00119
Iteration: 84, Log-Lik: -152399.853, Max-Change: 0.00079
Iteration: 85, Log-Lik: -152399.852, Max-Change: 0.00082
Iteration: 86, Log-Lik: -152399.848, Max-Change: 0.00079
Iteration: 87, Log-Lik: -152399.844, Max-Change: 0.00142
Iteration: 88, Log-Lik: -152399.846, Max-Change: 0.00423
Iteration: 89, Log-Lik: -152399.833, Max-Change: 0.00146
Iteration: 90, Log-Lik: -152399.832, Max-Change: 0.00099
Iteration: 91, Log-Lik: -152399.831, Max-Change: 0.00023
Iteration: 92, Log-Lik: -152399.831, Max-Change: 0.00082
Iteration: 93, Log-Lik: -152399.829, Max-Change: 0.00095
Iteration: 94, Log-Lik: -152399.829, Max-Change: 0.00041
Iteration: 95, Log-Lik: -152399.828, Max-Change: 0.00028
Iteration: 96, Log-Lik: -152399.828, Max-Change: 0.00098
Iteration: 97, Log-Lik: -152399.827, Max-Change: 0.00140
Iteration: 98, Log-Lik: -152399.826, Max-Change: 0.00094
Iteration: 99, Log-Lik: -152399.826, Max-Change: 0.00062
Iteration: 100, Log-Lik: -152399.825, Max-Change: 0.00061
Iteration: 101, Log-Lik: -152399.824, Max-Change: 0.00072
Iteration: 102, Log-Lik: -152399.824, Max-Change: 0.00048
Iteration: 103, Log-Lik: -152399.823, Max-Change: 0.00061
Iteration: 104, Log-Lik: -152399.822, Max-Change: 0.00030
Iteration: 105, Log-Lik: -152399.822, Max-Change: 0.00020
Iteration: 106, Log-Lik: -152399.822, Max-Change: 0.00070
Iteration: 107, Log-Lik: -152399.821, Max-Change: 0.00086
Iteration: 108, Log-Lik: -152399.821, Max-Change: 0.00058
Iteration: 109, Log-Lik: -152399.821, Max-Change: 0.00066
Iteration: 110, Log-Lik: -152399.820, Max-Change: 0.00081
Iteration: 111, Log-Lik: -152399.820, Max-Change: 0.00054
Iteration: 112, Log-Lik: -152399.819, Max-Change: 0.00063
Iteration: 113, Log-Lik: -152399.819, Max-Change: 0.00077
Iteration: 114, Log-Lik: -152399.818, Max-Change: 0.00052
Iteration: 115, Log-Lik: -152399.818, Max-Change: 0.00060
Iteration: 116, Log-Lik: -152399.818, Max-Change: 0.00075
Iteration: 117, Log-Lik: -152399.818, Max-Change: 0.00050
Iteration: 118, Log-Lik: -152399.817, Max-Change: 0.00060
Iteration: 119, Log-Lik: -152399.817, Max-Change: 0.00075
Iteration: 120, Log-Lik: -152399.817, Max-Change: 0.00050
Iteration: 121, Log-Lik: -152399.817, Max-Change: 0.00063
Iteration: 122, Log-Lik: -152399.816, Max-Change: 0.00081
Iteration: 123, Log-Lik: -152399.816, Max-Change: 0.00054
Iteration: 124, Log-Lik: -152399.816, Max-Change: 0.00073
Iteration: 125, Log-Lik: -152399.816, Max-Change: 0.00096
Iteration: 126, Log-Lik: -152399.815, Max-Change: 0.00064
Iteration: 127, Log-Lik: -152399.815, Max-Change: 0.00019
Iteration: 128, Log-Lik: -152399.815, Max-Change: 0.00065
Iteration: 129, Log-Lik: -152399.815, Max-Change: 0.00087
Iteration: 130, Log-Lik: -152399.815, Max-Change: 0.00017
Iteration: 131, Log-Lik: -152399.815, Max-Change: 0.00057
Iteration: 132, Log-Lik: -152399.814, Max-Change: 0.00075
Iteration: 133, Log-Lik: -152399.814, Max-Change: 0.00016
Iteration: 134, Log-Lik: -152399.814, Max-Change: 0.00055
Iteration: 135, Log-Lik: -152399.814, Max-Change: 0.00073
Iteration: 136, Log-Lik: -152399.814, Max-Change: 0.00015
Iteration: 137, Log-Lik: -152399.814, Max-Change: 0.00054
Iteration: 138, Log-Lik: -152399.813, Max-Change: 0.00069
Iteration: 139, Log-Lik: -152399.813, Max-Change: 0.00015
Iteration: 140, Log-Lik: -152399.813, Max-Change: 0.00053
Iteration: 141, Log-Lik: -152399.813, Max-Change: 0.00065
Iteration: 142, Log-Lik: -152399.813, Max-Change: 0.00014
Iteration: 143, Log-Lik: -152399.813, Max-Change: 0.00053
Iteration: 144, Log-Lik: -152399.813, Max-Change: 0.00062
Iteration: 145, Log-Lik: -152399.812, Max-Change: 0.00013
Iteration: 146, Log-Lik: -152399.812, Max-Change: 0.00052
Iteration: 147, Log-Lik: -152399.812, Max-Change: 0.00059
Iteration: 148, Log-Lik: -152399.812, Max-Change: 0.00013
Iteration: 149, Log-Lik: -152399.812, Max-Change: 0.00052
Iteration: 150, Log-Lik: -152399.812, Max-Change: 0.00056
Iteration: 151, Log-Lik: -152399.812, Max-Change: 0.00012
Iteration: 152, Log-Lik: -152399.812, Max-Change: 0.00051
Iteration: 153, Log-Lik: -152399.812, Max-Change: 0.00054
Iteration: 154, Log-Lik: -152399.812, Max-Change: 0.00011
Iteration: 155, Log-Lik: -152399.812, Max-Change: 0.00051
Iteration: 156, Log-Lik: -152399.811, Max-Change: 0.00052
Iteration: 157, Log-Lik: -152399.811, Max-Change: 0.00011
Iteration: 158, Log-Lik: -152399.811, Max-Change: 0.00050
Iteration: 159, Log-Lik: -152399.811, Max-Change: 0.00049
Iteration: 160, Log-Lik: -152399.811, Max-Change: 0.00010
Iteration: 161, Log-Lik: -152399.811, Max-Change: 0.00050
Iteration: 162, Log-Lik: -152399.811, Max-Change: 0.00047
Iteration: 163, Log-Lik: -152399.811, Max-Change: 0.00010
Iteration: 164, Log-Lik: -152399.811, Max-Change: 0.00049
Iteration: 165, Log-Lik: -152399.811, Max-Change: 0.00045
Iteration: 166, Log-Lik: -152399.811, Max-Change: 0.00010
#score
fit_g_scores = fscores(object = fit_g)
d$g_score = fit_g_scores[, 1] %>% standardize()

#distribution
GG_denhist(d$g_score)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#predict outcomes
wtd.cors(d %>% select(g_score, sex:age))
##         g_score     sex  grade     age
## g_score  1.0000  0.0219 0.5303  0.0757
## sex      0.0219  1.0000 0.0171 -0.0048
## grade    0.5303  0.0171 1.0000  0.3603
## age      0.0757 -0.0048 0.3603  1.0000
lrm(sex ~ g_score, data = d)
## Logistic Regression Model
##  
##  lrm(formula = sex ~ g_score, data = d)
##  
##                       Model Likelihood     Discrimination    Rank Discrim.    
##                          Ratio Test           Indexes           Indexes       
##  Obs          1803    LR chi2      0.97    R2       0.001    C       0.521    
##   1            815    d.f.            1    g        0.053    Dxy     0.042    
##   2            986    Pr(> chi2) 0.3235    gr       1.055    gamma   0.042    
##   3              2                         gp       0.013    tau-a   0.021    
##  max |deriv| 6e-14                         Brier    0.248                     
##  
##          Coef    S.E.   Wald Z Pr(>|Z|)
##  y>=2     0.1925 0.0473  4.07  <0.0001 
##  y>=3    -6.8041 0.7075 -9.62  <0.0001 
##  g_score  0.0468 0.0474  0.99  0.3237  
## 

Machine learning

#fit MLs
(glmnet_grade = caret::train(y = d$grade, x = itemsNA %>% as.data.frame(), method = "glmnet", tuneLength = 5))
## glmnet 
## 
## 1803 samples
##  284 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1803, 1803, 1803, 1803, 1803, 1803, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda    RMSE   Rsquared  MAE  
##   0.100  0.000649  0.919  0.509     0.727
##   0.100  0.003010  0.910  0.516     0.720
##   0.100  0.013973  0.876  0.540     0.698
##   0.100  0.064855  0.824  0.581     0.662
##   0.100  0.301031  0.808  0.599     0.655
##   0.325  0.000649  0.917  0.511     0.725
##   0.325  0.003010  0.894  0.527     0.710
##   0.325  0.013973  0.844  0.565     0.675
##   0.325  0.064855  0.808  0.596     0.653
##   0.325  0.301031  0.851  0.570     0.697
##   0.550  0.000649  0.913  0.514     0.722
##   0.550  0.003010  0.882  0.536     0.701
##   0.550  0.013973  0.827  0.579     0.664
##   0.550  0.064855  0.810  0.596     0.658
##   0.550  0.301031  0.894  0.544     0.738
##   0.775  0.000649  0.909  0.516     0.720
##   0.775  0.003010  0.871  0.544     0.694
##   0.775  0.013973  0.818  0.586     0.658
##   0.775  0.064855  0.817  0.590     0.666
##   0.775  0.301031  0.935  0.523     0.776
##   1.000  0.000649  0.906  0.519     0.717
##   1.000  0.003010  0.862  0.551     0.688
##   1.000  0.013973  0.814  0.590     0.656
##   1.000  0.064855  0.826  0.584     0.675
##   1.000  0.301031  0.980  0.500     0.814
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.325 and lambda = 0.0649.
(glmnet_age = caret::train(y = d$age, x = itemsNA %>% as.data.frame(), method = "glmnet", tuneLength = 5))
## glmnet 
## 
## 1803 samples
##  284 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1803, 1803, 1803, 1803, 1803, 1803, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda    RMSE  Rsquared  MAE  
##   0.100  0.000203  1.68  0.00815   1.131
##   0.100  0.000941  1.67  0.00823   1.128
##   0.100  0.004369  1.65  0.00863   1.111
##   0.100  0.020277  1.58  0.01009   1.058
##   0.100  0.094119  1.46  0.01520   0.961
##   0.325  0.000203  1.68  0.00816   1.130
##   0.325  0.000941  1.67  0.00834   1.123
##   0.325  0.004369  1.62  0.00917   1.091
##   0.325  0.020277  1.52  0.01223   1.006
##   0.325  0.094119  1.39  0.02377   0.910
##   0.550  0.000203  1.67  0.00818   1.129
##   0.550  0.000941  1.66  0.00845   1.118
##   0.550  0.004369  1.60  0.00970   1.073
##   0.550  0.020277  1.47  0.01466   0.972
##   0.550  0.094119  1.38  0.02820   0.902
##   0.775  0.000203  1.67  0.00821   1.128
##   0.775  0.000941  1.65  0.00857   1.113
##   0.775  0.004369  1.58  0.01015   1.057
##   0.775  0.020277  1.45  0.01696   0.950
##   0.775  0.094119  1.37  0.02940   0.904
##   1.000  0.000203  1.67  0.00823   1.127
##   1.000  0.000941  1.65  0.00869   1.108
##   1.000  0.004369  1.56  0.01057   1.043
##   1.000  0.020277  1.43  0.01897   0.934
##   1.000  0.094119  1.37  0.02834   0.908
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.0941.
(glmnet_sex = caret::train(y = d$sex, x = itemsNA %>% as.data.frame(), method = "glmnet", tuneLength = 5))
## glmnet 
## 
## 1803 samples
##  284 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 1803, 1803, 1803, 1803, 1803, 1803, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda    RMSE   Rsquared  MAE  
##   0.100  8.58e-05  0.546  0.0725    0.450
##   0.100  3.98e-04  0.545  0.0731    0.449
##   0.100  1.85e-03  0.536  0.0768    0.445
##   0.100  8.58e-03  0.513  0.0894    0.434
##   0.100  3.98e-02  0.478  0.1190    0.422
##   0.325  8.58e-05  0.546  0.0726    0.450
##   0.325  3.98e-04  0.542  0.0743    0.448
##   0.325  1.85e-03  0.527  0.0817    0.441
##   0.325  8.58e-03  0.493  0.1055    0.425
##   0.325  3.98e-02  0.464  0.1391    0.429
##   0.550  8.58e-05  0.545  0.0728    0.450
##   0.550  3.98e-04  0.539  0.0755    0.447
##   0.550  1.85e-03  0.519  0.0864    0.436
##   0.550  8.58e-03  0.481  0.1165    0.422
##   0.550  3.98e-02  0.466  0.1375    0.443
##   0.775  8.58e-05  0.545  0.0730    0.450
##   0.775  3.98e-04  0.537  0.0766    0.446
##   0.775  1.85e-03  0.512  0.0908    0.433
##   0.775  8.58e-03  0.474  0.1242    0.422
##   0.775  3.98e-02  0.472  0.1257    0.456
##   1.000  8.58e-05  0.544  0.0732    0.449
##   1.000  3.98e-04  0.535  0.0777    0.445
##   1.000  1.85e-03  0.506  0.0948    0.431
##   1.000  8.58e-03  0.470  0.1297    0.423
##   1.000  3.98e-02  0.478  0.1092    0.468
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 0.325 and lambda = 0.0398.