options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, caret, parallel, doParallel, relaimpo, sjstats)
#start parallel
doParallel::registerDoParallel(7)
Download data from https://electionstudies.org/project/2012-time-series-study/
#load
anes2012 = read_sav("data/anes_timeseries_2012.sav")
#recode function
recode_na = function(x) {
plyr::mapvalues(x, from = c(-c(1:10)), to = rep(NA, 10), warn_missing = F)
}
#recode
anes2012$age = anes2012$dem_age_r_x %>% recode_na() %>% as.numeric()
anes2012$education = anes2012$dem_edu %>% recode_na() %>% as.numeric() %>% plyr::mapvalues(95, NA)
anes2012$income = anes2012$inc_incgroup_pre %>% recode_na() %>% as.numeric()
anes2012$sex = anes2012$gender_respondent_x %>% plyr::mapvalues(1:2, c("Male", "Female")) %>% factor()
#items
wordsum_items = anes2012 %>% df_subset_by_pattern("wordsum_") %>% map_df(as.character)
wordsum_score_keys = c(5, 3, 1, 3, 5, 4, 1, 1, 4, 2)
wordsum_items_scored = score_items(wordsum_items, key = wordsum_score_keys)
anes2012$wordsum = rowSums(wordsum_items_scored)
anes2012$wordsum %>% GG_denhist()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#IRT
wordsum_irt = irt.fa(wordsum_items_scored)
wordsum_irt
## Item Response Analysis using Factor Analysis
##
## Call: irt.fa(x = wordsum_items_scored)
## Item Response Analysis using Factor Analysis
##
## Summary information by factor and item
## Factor = 1
## -3 -2 -1 0 1 2 3
## wordsum_setb 0.32 0.93 0.53 0.10 0.02 0.00 0.00
## wordsum_setd 0.48 0.90 0.35 0.06 0.01 0.00 0.00
## wordsum_sete 0.13 0.44 0.61 0.28 0.07 0.02 0.00
## wordsum_setf 0.48 1.13 0.38 0.05 0.01 0.00 0.00
## wordsum_setg 0.04 0.09 0.16 0.21 0.18 0.11 0.06
## wordsum_seth 0.02 0.07 0.22 0.43 0.37 0.16 0.05
## wordsum_setj 0.02 0.07 0.26 0.57 0.44 0.14 0.03
## wordsum_setk 0.05 0.25 0.74 0.56 0.14 0.03 0.00
## wordsum_setl 0.05 0.22 0.56 0.51 0.18 0.04 0.01
## wordsum_seto 0.05 0.17 0.42 0.46 0.21 0.06 0.02
## Test Info 1.63 4.26 4.23 3.24 1.62 0.56 0.17
## SEM 0.78 0.48 0.49 0.56 0.78 1.33 2.43
## Reliability 0.39 0.77 0.76 0.69 0.38 -0.78 -4.90
##
## Factor analysis with Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
##
## Test of the hypothesis that 1 factor is sufficient.
## The degrees of freedom for the model is 35 and the objective function was 0.77
## The number of observations was 5914 with Chi Square = 4552 with prob < 0
##
## The root mean square of the residuals (RMSA) is 0.08
## The df corrected root mean square of the residuals is 0.09
##
## Tucker Lewis Index of factoring reliability = 0.799
## RMSEA index = 0.148 and the 10 % confidence intervals are 0.144 0.151
## BIC = 4248
wordsum_irt$rho
## wordsum_setb wordsum_setd wordsum_sete wordsum_setf
## wordsum_setb 1.000 0.734 0.622 0.620
## wordsum_setd 0.734 1.000 0.647 0.665
## wordsum_sete 0.622 0.647 1.000 0.552
## wordsum_setf 0.620 0.665 0.552 1.000
## wordsum_setg 0.269 0.228 0.179 0.273
## wordsum_seth 0.377 0.411 0.408 0.398
## wordsum_setj 0.474 0.406 0.414 0.514
## wordsum_setk 0.499 0.509 0.451 0.618
## wordsum_setl 0.480 0.451 0.382 0.469
## wordsum_seto 0.408 0.368 0.422 0.518
## wordsum_setg wordsum_seth wordsum_setj wordsum_setk
## wordsum_setb 0.269 0.377 0.474 0.499
## wordsum_setd 0.228 0.411 0.406 0.509
## wordsum_sete 0.179 0.408 0.414 0.451
## wordsum_setf 0.273 0.398 0.514 0.618
## wordsum_setg 1.000 0.454 0.396 0.391
## wordsum_seth 0.454 1.000 0.493 0.446
## wordsum_setj 0.396 0.493 1.000 0.463
## wordsum_setk 0.391 0.446 0.463 1.000
## wordsum_setl 0.454 0.490 0.462 0.549
## wordsum_seto 0.380 0.413 0.526 0.466
## wordsum_setl wordsum_seto
## wordsum_setb 0.480 0.408
## wordsum_setd 0.451 0.368
## wordsum_sete 0.382 0.422
## wordsum_setf 0.469 0.518
## wordsum_setg 0.454 0.380
## wordsum_seth 0.490 0.413
## wordsum_setj 0.462 0.526
## wordsum_setk 0.549 0.466
## wordsum_setl 1.000 0.463
## wordsum_seto 0.463 1.000
wordsum_irt$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## wordsum_setb 0.75 0.57 0.43 1
## wordsum_setd 0.75 0.56 0.44 1
## wordsum_sete 0.68 0.46 0.54 1
## wordsum_setf 0.78 0.61 0.39 1
## wordsum_setg 0.47 0.22 0.78 1
## wordsum_seth 0.62 0.38 0.62 1
## wordsum_setj 0.67 0.45 0.55 1
## wordsum_setk 0.72 0.52 0.48 1
## wordsum_setl 0.68 0.46 0.54 1
## wordsum_seto 0.64 0.41 0.59 1
##
## MR1
## SS loadings 4.65
## Proportion Var 0.47
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 45 and the objective function was 4.9 with Chi Square of 28930
## The degrees of freedom for the model are 35 and the objective function was 0.77
##
## The root mean square of the residuals (RMSR) is 0.08
## The df corrected root mean square of the residuals is 0.09
##
## The harmonic number of observations is 5914 with the empirical chi square 3255 with prob < 0
## The total number of observations was 5914 with Likelihood Chi Square = 4552 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.799
## RMSEA index = 0.148 and the 90 % confidence intervals are 0.144 0.151
## BIC = 4248
## Fit based upon off diagonal values = 0.97
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.95
## Multiple R square of scores with factors 0.90
## Minimum correlation of possible factor scores 0.81
wordsum_irt_scores = scoreIrt(wordsum_irt, wordsum_items_scored)
anes2012$g = wordsum_irt_scores$theta1 %>% standardize()
#compare simple and complex
GG_scatter(anes2012, "wordsum", "g")
Use elastic net to predict outcomes using caret package, and compare accuracy with mere correlations.
#outcomes
outcomes = c("age", "income", "education")
#train glmnet for each
model_list = list()
for (o in outcomes) {
#subset without missing data
o_subset = cbind(wordsum_items, y = anes2012[[o]])
names(o_subset)[11] = o
#filter missing
o_subset = na.omit(o_subset)
#train
model_list[[o]] = train(as.formula(str_glue("{o} ~ {str_c(names(wordsum_items), collapse = ' + ')}")), method = "glmnet", trControl = trainControl(allowParallel = T, method = "repeatedcv", repeats = 10), data = o_subset)
}
#mere correlations
anes2012[c("g", "wordsum", outcomes)] %>% wtd.cors() %>% .^2 #square to get r2 as in models
## g wordsum age income education
## g 1.0000 0.9910 0.034889 0.12336 0.173166
## wordsum 0.9910 1.0000 0.038769 0.12084 0.170932
## age 0.0349 0.0388 1.000000 0.00358 0.000224
## income 0.1234 0.1208 0.003584 1.00000 0.165203
## education 0.1732 0.1709 0.000224 0.16520 1.000000
#best param model fits
map_dbl(model_list, ~.$results$Rsquared %>% max())
## age income education
## 0.0927 0.1330 0.2041
Thus, one can get nontrivial increase in validity of prediction from supervised machine learning vs. traditional psychometrics, especially for age (not so useful perhaps), but also education (probably useful for selection purposes).
Based on:
#predict income
#unstandardized betas -- not very interpretable
fit_income = lm(income ~ age + I(age^2) + sex + education + g, data = anes2012)
fit_income %>% summary()
##
## Call:
## lm(formula = income ~ age + I(age^2) + sex + education + g, data = anes2012)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.835 -5.232 0.327 5.515 26.810
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.119060 0.911790 -6.71 2.1e-11 ***
## age 0.366723 0.033774 10.86 < 2e-16 ***
## I(age^2) -0.003600 0.000337 -10.69 < 2e-16 ***
## sexMale 1.082731 0.197203 5.49 4.2e-08 ***
## education 1.010732 0.044193 22.87 < 2e-16 ***
## g 1.767644 0.112573 15.70 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.16 on 5298 degrees of freedom
## (610 observations deleted due to missingness)
## Multiple R-squared: 0.229, Adjusted R-squared: 0.229
## F-statistic: 316 on 5 and 5298 DF, p-value: <2e-16
#standardized betas
anes2012_std = anes2012 %>% dplyr::select(income, age, education, g) %>% map_df(~standardize(.))
anes2012_std$sex = anes2012$sex %>% as.numeric() %>% standardize()
fit_income_std = lm(income ~ age + I(age^2) + sex + education + g, data = anes2012_std)
fit_income_std %>% summary()
##
## Call:
## lm(formula = income ~ age + I(age^2) + sex + education + g, data = anes2012_std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.554 -0.641 0.040 0.676 3.287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1027 0.0166 6.18 6.7e-10 ***
## age 0.0222 0.0125 1.78 0.076 .
## I(age^2) -0.1249 0.0117 -10.69 < 2e-16 ***
## sex 0.0663 0.0121 5.49 4.2e-08 ***
## education 0.3062 0.0134 22.87 < 2e-16 ***
## g 0.2167 0.0138 15.70 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.878 on 5298 degrees of freedom
## (610 observations deleted due to missingness)
## Multiple R-squared: 0.229, Adjusted R-squared: 0.229
## F-statistic: 316 on 5 and 5298 DF, p-value: <2e-16
#note the funky patterns for age and age² -- there's no trivial way to deal with standardized self-interactions
#but overall model R2 are identical
#relative importance package
calc.relimp(fit_income, rela = T, type = "lmg")
## Response variable: income
## Total response variance: 66.5
## Analysis based on 5304 observations
##
## 5 Regressors:
## age I(age^2) sex education g
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## age 0.0510
## I(age^2) 0.0454
## sex 0.0277
## education 0.5286
## g 0.3472
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs 5Xs
## age 0.027847 0.13982 0.22564 0.29746 0.3667
## I(age^2) 0.000123 -0.00117 -0.00212 -0.00288 -0.0036
## sex 1.566428 1.38858 1.24792 1.14754 1.0827
## education 1.350808 1.27386 1.18952 1.10069 1.0107
## g 2.944363 2.68016 2.37997 2.06802 1.7676
calc.relimp(fit_income, rela = T, type = "betasq") #odd results for age!
## Response variable: income
## Total response variance: 66.5
## Analysis based on 5304 observations
##
## 5 Regressors:
## age I(age^2) sex education g
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## betasq
## age 0.45003
## I(age^2) 0.43479
## sex 0.00356
## education 0.07523
## g 0.03638
calc.relimp(fit_income, rela = T, type = "pratt")
## Response variable: income
## Total response variance: 66.5
## Analysis based on 5304 observations
##
## 5 Regressors:
## age I(age^2) sex education g
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## pratt
## age 0.1841
## I(age^2) -0.0802
## sex 0.0278
## education 0.5417
## g 0.3265
calc.relimp(fit_income, rela = T, type = "genizi")
## Response variable: income
## Total response variance: 66.5
## Analysis based on 5304 observations
##
## 5 Regressors:
## age I(age^2) sex education g
## Proportion of variance explained by model: 22.9%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## genizi
## age 0.0460
## I(age^2) 0.0402
## sex 0.0282
## education 0.5361
## g 0.3496
#ANOVA variance approach
fit_income %>% sjstats::anova_stats()
fit_income_std %>% sjstats::anova_stats() #invariant to standardization
#beware! different order matters depending on which ANOVA errors are used!
#R defaults to type 1 ANOVA, which is hierarchical, so order matters
fit_income_rev = lm(income ~ g + education + sex + age + I(age^2), data = anes2012)
fit_income_rev %>% sjstats::anova_stats()
#one can get SS type 2 (order invariant) with car package
fit_income %>% car::Anova() %>% sjstats::anova_stats()
fit_income_rev %>% car::Anova() %>% sjstats::anova_stats() #same except for order
#to get SS type 3, R is stupid and one has to change an option
#SS 2 vs. SS 3 only matters when non-self interactions are used
#https://rcompanion.org/rcompanion/d_04.html
#https://stats.stackexchange.com/questions/4544/how-does-one-do-a-type-iii-ss-anova-in-r-with-contrast-codes
#these etas type metrics do not necessarily sum up to the complete model R2, which happens when variance could not be uniquely identified to a variable
fit_income %>% car::Anova() %>% sjstats::anova_stats() %>% .$etasq %>% sum(na.rm = T) #this value is not .229!
## [1] 0.164