Init

options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, caret, parallel, doParallel, relaimpo, sjstats)

#start parallel
doParallel::registerDoParallel(7)

Data

Download data from https://electionstudies.org/project/2012-time-series-study/

#load
anes2012 = read_sav("data/anes_timeseries_2012.sav")

#recode function
recode_na = function(x) {
  plyr::mapvalues(x, from = c(-c(1:10)), to = rep(NA, 10), warn_missing = F)
}

#recode
anes2012$age = anes2012$dem_age_r_x %>% recode_na() %>% as.numeric()
anes2012$education = anes2012$dem_edu %>% recode_na() %>% as.numeric() %>% plyr::mapvalues(95, NA)
anes2012$income = anes2012$inc_incgroup_pre %>% recode_na() %>% as.numeric()
anes2012$sex = anes2012$gender_respondent_x %>% plyr::mapvalues(1:2, c("Male", "Female")) %>% factor()

Wordsum

#items
wordsum_items = anes2012 %>% df_subset_by_pattern("wordsum_") %>% map_df(as.character)
wordsum_score_keys = c(5, 3, 1, 3, 5, 4, 1, 1, 4, 2)
wordsum_items_scored = score_items(wordsum_items, key = wordsum_score_keys)
anes2012$wordsum = rowSums(wordsum_items_scored)
anes2012$wordsum %>% GG_denhist()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#IRT
wordsum_irt = irt.fa(wordsum_items_scored)

wordsum_irt

## Item Response Analysis using Factor Analysis 
## 
## Call: irt.fa(x = wordsum_items_scored)
## Item Response Analysis using Factor Analysis  
## 
##  Summary information by factor and item
##  Factor =  1 
##                -3   -2   -1    0    1     2     3
## wordsum_setb 0.32 0.93 0.53 0.10 0.02  0.00  0.00
## wordsum_setd 0.48 0.90 0.35 0.06 0.01  0.00  0.00
## wordsum_sete 0.13 0.44 0.61 0.28 0.07  0.02  0.00
## wordsum_setf 0.48 1.13 0.38 0.05 0.01  0.00  0.00
## wordsum_setg 0.04 0.09 0.16 0.21 0.18  0.11  0.06
## wordsum_seth 0.02 0.07 0.22 0.43 0.37  0.16  0.05
## wordsum_setj 0.02 0.07 0.26 0.57 0.44  0.14  0.03
## wordsum_setk 0.05 0.25 0.74 0.56 0.14  0.03  0.00
## wordsum_setl 0.05 0.22 0.56 0.51 0.18  0.04  0.01
## wordsum_seto 0.05 0.17 0.42 0.46 0.21  0.06  0.02
## Test Info    1.63 4.26 4.23 3.24 1.62  0.56  0.17
## SEM          0.78 0.48 0.49 0.56 0.78  1.33  2.43
## Reliability  0.39 0.77 0.76 0.69 0.38 -0.78 -4.90
## 
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## 
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 35  and the objective function was  0.77 
## The number of observations was  5914  with Chi Square =  4552  with prob <  0 
## 
## The root mean square of the residuals (RMSA) is  0.08 
## The df corrected root mean square of the residuals is  0.09 
## 
## Tucker Lewis Index of factoring reliability =  0.799
## RMSEA index =  0.148  and the 10 % confidence intervals are  0.144 0.151
## BIC =  4248

wordsum_irt$rho

##              wordsum_setb wordsum_setd wordsum_sete wordsum_setf
## wordsum_setb        1.000        0.734        0.622        0.620
## wordsum_setd        0.734        1.000        0.647        0.665
## wordsum_sete        0.622        0.647        1.000        0.552
## wordsum_setf        0.620        0.665        0.552        1.000
## wordsum_setg        0.269        0.228        0.179        0.273
## wordsum_seth        0.377        0.411        0.408        0.398
## wordsum_setj        0.474        0.406        0.414        0.514
## wordsum_setk        0.499        0.509        0.451        0.618
## wordsum_setl        0.480        0.451        0.382        0.469
## wordsum_seto        0.408        0.368        0.422        0.518
##              wordsum_setg wordsum_seth wordsum_setj wordsum_setk
## wordsum_setb        0.269        0.377        0.474        0.499
## wordsum_setd        0.228        0.411        0.406        0.509
## wordsum_sete        0.179        0.408        0.414        0.451
## wordsum_setf        0.273        0.398        0.514        0.618
## wordsum_setg        1.000        0.454        0.396        0.391
## wordsum_seth        0.454        1.000        0.493        0.446
## wordsum_setj        0.396        0.493        1.000        0.463
## wordsum_setk        0.391        0.446        0.463        1.000
## wordsum_setl        0.454        0.490        0.462        0.549
## wordsum_seto        0.380        0.413        0.526        0.466
##              wordsum_setl wordsum_seto
## wordsum_setb        0.480        0.408
## wordsum_setd        0.451        0.368
## wordsum_sete        0.382        0.422
## wordsum_setf        0.469        0.518
## wordsum_setg        0.454        0.380
## wordsum_seth        0.490        0.413
## wordsum_setj        0.462        0.526
## wordsum_setk        0.549        0.466
## wordsum_setl        1.000        0.463
## wordsum_seto        0.463        1.000

wordsum_irt$fa

## Factor Analysis using method =  minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
##               MR1   h2   u2 com
## wordsum_setb 0.75 0.57 0.43   1
## wordsum_setd 0.75 0.56 0.44   1
## wordsum_sete 0.68 0.46 0.54   1
## wordsum_setf 0.78 0.61 0.39   1
## wordsum_setg 0.47 0.22 0.78   1
## wordsum_seth 0.62 0.38 0.62   1
## wordsum_setj 0.67 0.45 0.55   1
## wordsum_setk 0.72 0.52 0.48   1
## wordsum_setl 0.68 0.46 0.54   1
## wordsum_seto 0.64 0.41 0.59   1
## 
##                 MR1
## SS loadings    4.65
## Proportion Var 0.47
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  45  and the objective function was  4.9 with Chi Square of  28930
## The degrees of freedom for the model are 35  and the objective function was  0.77 
## 
## The root mean square of the residuals (RMSR) is  0.08 
## The df corrected root mean square of the residuals is  0.09 
## 
## The harmonic number of observations is  5914 with the empirical chi square  3255  with prob <  0 
## The total number of observations was  5914  with Likelihood Chi Square =  4552  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.799
## RMSEA index =  0.148  and the 90 % confidence intervals are  0.144 0.151
## BIC =  4248
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.95
## Multiple R square of scores with factors          0.90
## Minimum correlation of possible factor scores     0.81

wordsum_irt_scores = scoreIrt(wordsum_irt, wordsum_items_scored)
anes2012$g = wordsum_irt_scores$theta1 %>% standardize()

#compare simple and complex
GG_scatter(anes2012, "wordsum", "g")

Machine learning

Use elastic net to predict outcomes using caret package, and compare accuracy with mere correlations.

#outcomes
outcomes = c("age", "income", "education")

#train glmnet for each
model_list = list()
for (o in outcomes) {
  #subset without missing data
  o_subset = cbind(wordsum_items, y = anes2012[[o]])
  names(o_subset)[11] = o
  
  #filter missing
  o_subset = na.omit(o_subset)
  
  #train
  model_list[[o]] = train(as.formula(str_glue("{o} ~ {str_c(names(wordsum_items), collapse = ' + ')}")), method = "glmnet", trControl = trainControl(allowParallel = T, method = "repeatedcv", repeats = 10), data = o_subset)
}

#mere correlations
anes2012[c("g", "wordsum", outcomes)] %>% wtd.cors() %>% .^2 #square to get r2 as in models

##                g wordsum      age  income education
## g         1.0000  0.9910 0.034889 0.12336  0.173166
## wordsum   0.9910  1.0000 0.038769 0.12084  0.170932
## age       0.0349  0.0388 1.000000 0.00358  0.000224
## income    0.1234  0.1208 0.003584 1.00000  0.165203
## education 0.1732  0.1709 0.000224 0.16520  1.000000

#best param model fits
map_dbl(model_list, ~.$results$Rsquared %>% max())

##       age    income education 
##    0.0927    0.1330    0.2041

Thus, one can get nontrivial increase in validity of prediction from supervised machine learning vs. traditional psychometrics, especially for age (not so useful perhaps), but also education (probably useful for selection purposes).

Regression for income: understanding some of the relative importance metrics

Based on:

#predict income
#unstandardized betas -- not very interpretable
fit_income = lm(income ~ age + I(age^2) + sex + education + g, data = anes2012)
fit_income %>% summary()

## 
## Call:
## lm(formula = income ~ age + I(age^2) + sex + education + g, data = anes2012)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.835  -5.232   0.327   5.515  26.810 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.119060   0.911790   -6.71  2.1e-11 ***
## age          0.366723   0.033774   10.86  < 2e-16 ***
## I(age^2)    -0.003600   0.000337  -10.69  < 2e-16 ***
## sexMale      1.082731   0.197203    5.49  4.2e-08 ***
## education    1.010732   0.044193   22.87  < 2e-16 ***
## g            1.767644   0.112573   15.70  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.16 on 5298 degrees of freedom
##   (610 observations deleted due to missingness)
## Multiple R-squared:  0.229,  Adjusted R-squared:  0.229 
## F-statistic:  316 on 5 and 5298 DF,  p-value: <2e-16

#standardized betas
anes2012_std = anes2012 %>% dplyr::select(income, age, education, g) %>% map_df(~standardize(.))
anes2012_std$sex = anes2012$sex %>% as.numeric() %>% standardize()
fit_income_std = lm(income ~ age + I(age^2) + sex + education + g, data = anes2012_std)
fit_income_std %>% summary()

## 
## Call:
## lm(formula = income ~ age + I(age^2) + sex + education + g, data = anes2012_std)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -2.554 -0.641  0.040  0.676  3.287 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.1027     0.0166    6.18  6.7e-10 ***
## age           0.0222     0.0125    1.78    0.076 .  
## I(age^2)     -0.1249     0.0117  -10.69  < 2e-16 ***
## sex           0.0663     0.0121    5.49  4.2e-08 ***
## education     0.3062     0.0134   22.87  < 2e-16 ***
## g             0.2167     0.0138   15.70  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.878 on 5298 degrees of freedom
##   (610 observations deleted due to missingness)
## Multiple R-squared:  0.229,  Adjusted R-squared:  0.229 
## F-statistic:  316 on 5 and 5298 DF,  p-value: <2e-16

#note the funky patterns for age and age² -- there's no trivial way to deal with standardized self-interactions
#but overall model R2 are identical

#relative importance package
calc.relimp(fit_income, rela = T, type = "lmg")

## Response variable: income 
## Total response variance: 66.5 
## Analysis based on 5304 observations 
## 
## 5 Regressors: 
## age I(age^2) sex education g 
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##              lmg
## age       0.0510
## I(age^2)  0.0454
## sex       0.0277
## education 0.5286
## g         0.3472
## 
## Average coefficients for different model sizes: 
## 
##                 1X      2Xs      3Xs      4Xs     5Xs
## age       0.027847  0.13982  0.22564  0.29746  0.3667
## I(age^2)  0.000123 -0.00117 -0.00212 -0.00288 -0.0036
## sex       1.566428  1.38858  1.24792  1.14754  1.0827
## education 1.350808  1.27386  1.18952  1.10069  1.0107
## g         2.944363  2.68016  2.37997  2.06802  1.7676

calc.relimp(fit_income, rela = T, type = "betasq") #odd results for age!

## Response variable: income 
## Total response variance: 66.5 
## Analysis based on 5304 observations 
## 
## 5 Regressors: 
## age I(age^2) sex education g 
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##            betasq
## age       0.45003
## I(age^2)  0.43479
## sex       0.00356
## education 0.07523
## g         0.03638

calc.relimp(fit_income, rela = T, type = "pratt")

## Response variable: income 
## Total response variance: 66.5 
## Analysis based on 5304 observations 
## 
## 5 Regressors: 
## age I(age^2) sex education g 
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##             pratt
## age        0.1841
## I(age^2)  -0.0802
## sex        0.0278
## education  0.5417
## g          0.3265

calc.relimp(fit_income, rela = T, type = "genizi")

## Response variable: income 
## Total response variance: 66.5 
## Analysis based on 5304 observations 
## 
## 5 Regressors: 
## age I(age^2) sex education g 
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE). 
## 
## Relative importance metrics: 
## 
##           genizi
## age       0.0460
## I(age^2)  0.0402
## sex       0.0282
## education 0.5361
## g         0.3496

#ANOVA variance approach
fit_income %>% sjstats::anova_stats()

fit_income_std %>% sjstats::anova_stats() #invariant to standardization

#beware! different order matters depending on which ANOVA errors are used!
#R defaults to type 1 ANOVA, which is hierarchical, so order matters
fit_income_rev = lm(income ~ g + education + sex + age + I(age^2), data = anes2012)
fit_income_rev %>% sjstats::anova_stats()

#one can get SS type 2 (order invariant) with car package
fit_income %>% car::Anova() %>% sjstats::anova_stats()

fit_income_rev %>% car::Anova() %>% sjstats::anova_stats() #same except for order

#to get SS type 3, R is stupid and one has to change an option
#SS 2 vs. SS 3 only matters when non-self interactions are used
#https://rcompanion.org/rcompanion/d_04.html
#https://stats.stackexchange.com/questions/4544/how-does-one-do-a-type-iii-ss-anova-in-r-with-contrast-codes

#these etas type metrics do not necessarily sum up to the complete model R2, which happens when variance could not be uniquely identified to a variable
fit_income %>% car::Anova() %>% sjstats::anova_stats() %>% .$etasq %>% sum(na.rm = T) #this value is not .229!

## [1] 0.164

ANES 2012 wordsum example

Emil O. W. Kirkegaard

Init

Data

Wordsum

Machine learning

Regression for income: understanding some of the relative importance metrics