options(digits = 3)
library(pacman)
p_load(kirkegaard, haven, rms)
# Recode reserve (missing-data) codes to NA.
#
# Arguments:
#   x     A vector (numeric, labelled, character, or factor).
#   codes Values to treat as missing. Defaults to the integers -1 to -99,
#         which is what the original hard-coded range covered.
#
# Returns `x` with every element equal to one of `codes` replaced by NA.
# For a factor, the matching levels are removed (their elements become NA),
# which is what the earlier plyr::mapvalues() implementation did.
# Values not in `codes` (including existing NAs) are left untouched.
recode_na <- function(x, codes = -(1:99)) {
  if (is.factor(x)) {
    lv <- levels(x)
    # Setting a level to NA drops it and turns its elements into NA.
    lv[lv %in% codes] <- NA
    levels(x) <- lv
    return(x)
  }
  x[x %in% codes] <- NA
  x
}
# Earlier, commented-out load from an .rdata file; kept for reference.
# load("data/els_02_12_byf3stubrr_v1_0.rdata")
# els = els_02_12_byf3stubrr_v1_0
# Read the two SPSS exports. The analysis code in this section only uses
# `els`; `els2` is read but not referenced again here.
els2 <- read_spss("data/els_02_12_byf3stubrr_v1_0.sav")
els <- read_spss("data/els_02_12_byf3pststu_v1_0.sav")
# Build a variable table for `els` and print it for browsing.
els_vars <- kirkegaard::df_var_table(els)
els_vars
#sex
# Drop reserve codes, turn the labelled values into a factor, drop unused
# levels and make "Male" the reference level.
els$sex <- els$BYSEX %>% recode_na() %>% as_factor() %>% fct_drop() %>% fct_relevel("Male")

#race: reserve codes become NA, then the long value labels are shortened

# Map long race/ethnicity value labels to short group names.
#
# The mapping is done by matching on the label text rather than by the order
# in which labels first appear in the data, so it does not depend on row
# order. Every non-missing label must match exactly one pattern; otherwise
# the function stops, instead of silently mislabelling a group.
#
# Arguments:
#   labels Character vector of value labels (NA allowed).
# Returns a character vector of the same length with the short names.
# NOTE(review): patterns assume the labels start with / contain the words
# below (e.g. "Hispanic, ...", "White, non-Hispanic") - confirm against the
# value labels of BYRACE.
shorten_race_labels <- function(labels) {
  patterns <- c(
    "Hispanic" = "^Hispanic",        # anchored: other labels contain "non-Hispanic"
    "Asian (broad)" = "Asian",
    "White" = "White",
    "Black" = "Black",
    "Multiracial" = "More than one|Multi",
    "Amerindian" = "Indian"
  )
  short <- rep(NA_character_, length(labels))
  n_matched <- integer(length(labels))
  for (short_name in names(patterns)) {
    hit <- grepl(patterns[[short_name]], labels, ignore.case = TRUE)
    short[hit] <- short_name
    n_matched <- n_matched + hit
  }
  unresolved <- unique(labels[!is.na(labels) & n_matched != 1L])
  if (length(unresolved) > 0) {
    stop(
      "Cannot map race label(s) to exactly one short name: ",
      paste(unresolved, collapse = "; "),
      call. = FALSE
    )
  }
  short
}

# "White" is made the reference level for the regressions below.
els$SIRE <- els$BYRACE %>%
  recode_na() %>%
  as_factor() %>%
  as.character() %>%
  shorten_race_labels() %>%
  fct_relevel("White")
#use their composite score for g
# Reserve codes are set to NA first (as is done for the reading and math
# scores below) so they cannot be standardized as if they were real scores.
# The score is scaled so that the White group has mean 0 and SD 1.
els$g <- els$BYTXCSTD %>%
  recode_na() %>%
  as.numeric() %>%
  standardize(focal_group = (els$SIRE == "White"))
#tilt
# Verbal tilt = standardized reading score minus standardized math score,
# then re-standardized against the White group.
els$verbal_tilt <- (
  (els$BYTXRSTD %>% recode_na() %>% standardize()) -
    (els$BYTXMSTD %>% recode_na() %>% standardize())
) %>%
  standardize(focal_group = (els$SIRE == "White"))
#SAT ACT
#data restricted
# els$SAT = els$TXSATC %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
# els$ACT = els$TXACTC %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
#SES
# els$SES = els$TXACTC %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
# Parental education and respondent earnings, each standardized against the
# White group. Reserve codes are set to NA before conversion; without this,
# negative codes would enter the analysis as real education levels or
# earnings. The output recorded further down reports 0 missing values for
# income, which indicates it was produced without this recode and should be
# regenerated.
els$mom_edu <- els$BYMOTHED %>% recode_na() %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White"))
els$dad_edu <- els$BYFATHED %>% recode_na() %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White"))
els$income <- els$F3ERN2011 %>% recode_na() %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White"))
# els$family_income = els$F3EMPINC2011 %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
# els$occupation = els$BYOCC30 %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
# TRUE when F3D24 equals 1, FALSE for any other valid code, NA for reserve
# codes. NOTE(review): presumably 1 = "yes" - confirm against the codebook.
els$welfare <- els$F3D24 %>% recode_na() %>% {. == 1}
#politics
# Same coding as welfare: TRUE when F3D38 equals 1.
els$voted_pres_2008 <- els$F3D38 %>% recode_na() %>% {. == 1}
# Pairwise correlations among the derived analysis variables
# (no weight argument is supplied to wtd.cors).
wtd.cors(
  els[c("g", "verbal_tilt", "income", "mom_edu", "dad_edu", "welfare", "voted_pres_2008")]
)
## g verbal_tilt income mom_edu dad_edu welfare
## g 1.000000 -0.000517 0.2099 0.4633 0.47515 -0.2428
## verbal_tilt -0.000517 1.000000 -0.0544 0.0162 0.00133 0.0617
## income 0.209898 -0.054420 1.0000 0.1191 0.12131 -0.2467
## mom_edu 0.463295 0.016151 0.1191 1.0000 0.75682 -0.1456
## dad_edu 0.475151 0.001331 0.1213 0.7568 1.00000 -0.1585
## welfare -0.242778 0.061671 -0.2467 -0.1456 -0.15849 1.0000
## voted_pres_2008 0.163879 0.066776 0.0797 0.1293 0.13337 -0.1081
## voted_pres_2008
## g 0.1639
## verbal_tilt 0.0668
## income 0.0797
## mom_edu 0.1293
## dad_edu 0.1334
## welfare -0.1081
## voted_pres_2008 1.0000
#plot distributions
# Distribution of g by SIRE group. `vline` is switched off; it is spelled
# FALSE rather than F because F is an ordinary variable that can be reassigned.
GG_denhist(els, "g", "SIRE", vline = FALSE)
## Warning in GG_denhist(els, "g", "SIRE", vline = F): Grouping variable
## contained missing values. These were removed. If you want an NA group,
## convert to explicit value.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#sum stats
# Descriptive statistics of g within each SIRE group.
describeBy(x = els$g, group = els$SIRE)
##
## Descriptive statistics by group
## group: White
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 8682 0 1 0.07 0.03 1.01 -3.52 3.02 6.54 -0.3 -0.11
## se
## X1 0.01
## --------------------------------------------------------
## group: Amerindian
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 130 -0.84 0.86 -0.8 -0.82 0.91 -3.1 1.37 4.47 -0.14 -0.38
## se
## X1 0.08
## --------------------------------------------------------
## group: Asian (broad)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 1460 -0.12 1.1 -0.16 -0.13 1.17 -3.22 2.9 6.12 0.09 -0.46
## se
## X1 0.03
## --------------------------------------------------------
## group: Black
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2020 -0.95 0.92 -0.96 -0.97 0.92 -3.43 2.45 5.88 0.18 -0.16
## se
## X1 0.02
## --------------------------------------------------------
## group: Hispanic
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2217 -0.85 1.04 -0.88 -0.86 1.12 -3.45 2.37 5.83 0.17 -0.42
## se
## X1 0.02
## --------------------------------------------------------
## group: Multiracial
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 735 -0.29 1.05 -0.24 -0.27 1.07 -3.02 2.43 5.45 -0.13 -0.26
## se
## X1 0.04
#plot distributions
# Distribution of verbal tilt by SIRE group. `vline` is switched off; it is
# spelled FALSE rather than F because F is an ordinary variable that can be
# reassigned.
GG_denhist(els, "verbal_tilt", "SIRE", vline = FALSE)
## Warning in GG_denhist(els, "verbal_tilt", "SIRE", vline = F): Grouping
## variable contained missing values. These were removed. If you want an NA
## group, convert to explicit value.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#sum stats
# Descriptive statistics of verbal tilt within each SIRE group.
describeBy(x = els$verbal_tilt, group = els$SIRE)
##
## Descriptive statistics by group
## group: White
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 8682 0 1 0.01 0.01 0.94 -4.6 3.81 8.4 -0.11 0.52
## se
## X1 0.01
## --------------------------------------------------------
## group: Amerindian
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 130 -0.01 0.97 -0.04 -0.01 0.94 -2.62 2.63 5.25 0.01 -0.22
## se
## X1 0.08
## --------------------------------------------------------
## group: Asian (broad)
## vars n mean sd median trimmed mad min max range skew
## X1 1 1460 -0.51 1.18 -0.42 -0.45 1.03 -6.01 2.52 8.52 -0.66
## kurtosis se
## X1 1.16 0.03
## --------------------------------------------------------
## group: Black
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2020 0.14 0.94 0.15 0.14 0.92 -3.68 3.22 6.9 0.01 0.29
## se
## X1 0.02
## --------------------------------------------------------
## group: Hispanic
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2217 0.02 0.97 0.05 0.03 0.94 -5.25 3.38 8.63 -0.19 0.72
## se
## X1 0.02
## --------------------------------------------------------
## group: Multiracial
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 735 0.07 1.02 0.05 0.08 0.95 -3.75 3.28 7.03 -0.15 0.57
## se
## X1 0.04
#regressions
# Income model 1: sex and SIRE only.
ols(formula = income ~ sex + SIRE, data = els)
## Frequencies of Missing Values Due to Each Variable
## income sex SIRE
## 0 827 953
##
## Linear Regression Model
##
## ols(formula = income ~ sex + SIRE, data = els)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 15244 LR chi2 321.15 R2 0.021
## sigma0.9884 d.f. 6 R2 adj 0.020
## d.f. 15237 Pr(> chi2) 0.0000 g 0.160
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.0664 -0.7618 -0.1491 0.4914 9.7989
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0709 0.0133 5.31 <0.0001
## sex=Female -0.1403 0.0160 -8.76 <0.0001
## SIRE=Amerindian -0.4161 0.0873 -4.76 <0.0001
## SIRE=Asian (broad) -0.0934 0.0280 -3.34 0.0008
## SIRE=Black -0.3044 0.0244 -12.47 <0.0001
## SIRE=Hispanic -0.2615 0.0235 -11.12 <0.0001
## SIRE=Multiracial -0.1364 0.0380 -3.59 0.0003
##
# Income model 2: adds g to sex and SIRE.
ols(formula = income ~ sex + SIRE + g, data = els)
## Frequencies of Missing Values Due to Each Variable
## income sex SIRE g
## 0 827 953 0
##
## Linear Regression Model
##
## ols(formula = income ~ sex + SIRE + g, data = els)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 15244 LR chi2 938.51 R2 0.060
## sigma0.9686 d.f. 7 R2 adj 0.059
## d.f. 15236 Pr(> chi2) 0.0000 g 0.278
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.5700 -0.6873 -0.1584 0.4791 10.0570
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0731 0.0131 5.59 <0.0001
## sex=Female -0.1447 0.0157 -9.22 <0.0001
## SIRE=Amerindian -0.2528 0.0858 -2.95 0.0032
## SIRE=Asian (broad) -0.0709 0.0274 -2.59 0.0097
## SIRE=Black -0.1183 0.0251 -4.72 <0.0001
## SIRE=Hispanic -0.0960 0.0240 -4.00 <0.0001
## SIRE=Multiracial -0.0798 0.0373 -2.14 0.0323
## g 0.1954 0.0078 25.09 <0.0001
##
# Income model 3: adds verbal tilt on top of sex, SIRE and g.
ols(formula = income ~ sex + SIRE + g + verbal_tilt, data = els)
## Frequencies of Missing Values Due to Each Variable
## income sex SIRE g verbal_tilt
## 0 827 953 0 305
##
## Linear Regression Model
##
## ols(formula = income ~ sex + SIRE + g + verbal_tilt, data = els)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 15244 LR chi2 969.61 R2 0.062
## sigma0.9677 d.f. 8 R2 adj 0.061
## d.f. 15235 Pr(> chi2) 0.0000 g 0.282
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.5781 -0.6859 -0.1545 0.4770 10.1651
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0651 0.0131 4.96 <0.0001
## sex=Female -0.1289 0.0159 -8.09 <0.0001
## SIRE=Amerindian -0.2515 0.0858 -2.93 0.0034
## SIRE=Asian (broad) -0.0933 0.0277 -3.37 0.0008
## SIRE=Black -0.1111 0.0251 -4.43 <0.0001
## SIRE=Hispanic -0.0944 0.0240 -3.94 <0.0001
## SIRE=Multiracial -0.0765 0.0372 -2.05 0.0400
## g 0.1964 0.0078 25.24 <0.0001
## verbal_tilt -0.0441 0.0079 -5.58 <0.0001
##
#relative importance
# Effect-size statistics from the car::Anova() table of the full income
# model, fitted here with lm().
sjstats::anova_stats(
  car::Anova(
    lm(formula = income ~ sex + SIRE + g + verbal_tilt, data = els)
  )
)
#rms method
# Same model fitted with ols(); plots each predictor's share of R2.
plot(
  anova(ols(income ~ sex + SIRE + g + verbal_tilt, data = els)),
  what = "proportion R2"
)
#regressions
# Welfare model 1 (logistic): sex and SIRE only.
lrm(formula = welfare ~ sex + SIRE, data = els)
## Frequencies of Missing Values Due to Each Variable
## welfare sex SIRE
## 4390 827 953
##
## Logistic Regression Model
##
## lrm(formula = welfare ~ sex + SIRE, data = els)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 11192 LR chi2 585.21 R2 0.086 C 0.663
## FALSE 9353 d.f. 6 g 0.672 Dxy 0.326
## TRUE 1839 Pr(> chi2) <0.0001 gr 1.959 gamma 0.394
## max |deriv| 5e-11 gp 0.091 tau-a 0.089
## Brier 0.129
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -2.3654 0.0521 -45.39 <0.0001
## sex=Female 0.7274 0.0552 13.19 <0.0001
## SIRE=Amerindian 1.4098 0.2274 6.20 <0.0001
## SIRE=Asian (broad) -0.5442 0.1233 -4.41 <0.0001
## SIRE=Black 1.1955 0.0697 17.16 <0.0001
## SIRE=Hispanic 0.6506 0.0734 8.86 <0.0001
## SIRE=Multiracial 0.6773 0.1129 6.00 <0.0001
##
# Welfare model 2 (logistic): adds g to sex and SIRE.
lrm(formula = welfare ~ sex + SIRE + g, data = els)
## Frequencies of Missing Values Due to Each Variable
## welfare sex SIRE g
## 4390 827 953 0
##
## Logistic Regression Model
##
## lrm(formula = welfare ~ sex + SIRE + g, data = els)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 11192 LR chi2 1200.35 R2 0.172 C 0.743
## FALSE 9353 d.f. 7 g 1.083 Dxy 0.486
## TRUE 1839 Pr(> chi2) <0.0001 gr 2.954 gamma 0.486
## max |deriv| 1e-10 gp 0.133 tau-a 0.133
## Brier 0.121
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -2.4965 0.0554 -45.10 <0.0001
## sex=Female 0.7924 0.0572 13.85 <0.0001
## SIRE=Amerindian 0.9104 0.2345 3.88 0.0001
## SIRE=Asian (broad) -0.6540 0.1262 -5.18 <0.0001
## SIRE=Black 0.6221 0.0750 8.29 <0.0001
## SIRE=Hispanic 0.1230 0.0789 1.56 0.1192
## SIRE=Multiracial 0.5052 0.1176 4.30 <0.0001
## g -0.6823 0.0286 -23.84 <0.0001
##
# Welfare model 3 (logistic): adds verbal tilt on top of sex, SIRE and g.
lrm(formula = welfare ~ sex + SIRE + g + verbal_tilt, data = els)
## Frequencies of Missing Values Due to Each Variable
## welfare sex SIRE g verbal_tilt
## 4390 827 953 0 305
##
## Logistic Regression Model
##
## lrm(formula = welfare ~ sex + SIRE + g + verbal_tilt, data = els)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 11192 LR chi2 1209.74 R2 0.173 C 0.744
## FALSE 9353 d.f. 8 g 1.088 Dxy 0.488
## TRUE 1839 Pr(> chi2) <0.0001 gr 2.968 gamma 0.488
## max |deriv| 1e-10 gp 0.133 tau-a 0.134
## Brier 0.121
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -2.4836 0.0555 -44.76 <0.0001
## sex=Female 0.7640 0.0579 13.19 <0.0001
## SIRE=Amerindian 0.9101 0.2345 3.88 0.0001
## SIRE=Asian (broad) -0.6202 0.1269 -4.89 <0.0001
## SIRE=Black 0.6112 0.0751 8.14 <0.0001
## SIRE=Hispanic 0.1198 0.0790 1.52 0.1292
## SIRE=Multiracial 0.4966 0.1177 4.22 <0.0001
## g -0.6836 0.0286 -23.89 <0.0001
## verbal_tilt 0.0871 0.0285 3.06 0.0022
##
#regressions
# Voting model 1 (logistic): sex and SIRE only.
lrm(formula = voted_pres_2008 ~ sex + SIRE, data = els)
## Frequencies of Missing Values Due to Each Variable
## voted_pres_2008 sex SIRE
## 4375 827 953
##
## Logistic Regression Model
##
## lrm(formula = voted_pres_2008 ~ sex + SIRE, data = els)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 11207 LR chi2 371.15 R2 0.044 C 0.601
## FALSE 4196 d.f. 6 g 0.417 Dxy 0.202
## TRUE 7011 Pr(> chi2) <0.0001 gr 1.517 gamma 0.251
## max |deriv| 2e-11 gp 0.096 tau-a 0.095
## Brier 0.227
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 0.4466 0.0327 13.66 <0.0001
## sex=Female 0.3570 0.0397 8.99 <0.0001
## SIRE=Amerindian -0.5293 0.2158 -2.45 0.0142
## SIRE=Asian (broad) -0.6902 0.0679 -10.17 <0.0001
## SIRE=Black 0.4310 0.0677 6.36 <0.0001
## SIRE=Hispanic -0.6348 0.0581 -10.93 <0.0001
## SIRE=Multiracial -0.1869 0.0934 -2.00 0.0454
##
# Voting model 2 (logistic): adds g to sex and SIRE.
lrm(formula = voted_pres_2008 ~ sex + SIRE + g, data = els)
## Frequencies of Missing Values Due to Each Variable
## voted_pres_2008 sex SIRE g
## 4375 827 953 0
##
## Logistic Regression Model
##
## lrm(formula = voted_pres_2008 ~ sex + SIRE + g, data = els)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 11207 LR chi2 769.67 R2 0.090 C 0.652
## FALSE 4196 d.f. 7 g 0.634 Dxy 0.304
## TRUE 7011 Pr(> chi2) <0.0001 gr 1.885 gamma 0.304
## max |deriv| 7e-11 gp 0.142 tau-a 0.143
## Brier 0.219
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 0.4170 0.0334 12.50 <0.0001
## sex=Female 0.3738 0.0405 9.23 <0.0001
## SIRE=Amerindian -0.1703 0.2200 -0.77 0.4391
## SIRE=Asian (broad) -0.6850 0.0694 -9.87 <0.0001
## SIRE=Black 0.8381 0.0720 11.64 <0.0001
## SIRE=Hispanic -0.3292 0.0611 -5.39 <0.0001
## SIRE=Multiracial -0.0696 0.0955 -0.73 0.4662
## g 0.4034 0.0207 19.51 <0.0001
##
# Voting model 3 (logistic): adds verbal tilt on top of sex, SIRE and g.
lrm(formula = voted_pres_2008 ~ sex + SIRE + g + verbal_tilt, data = els)
## Frequencies of Missing Values Due to Each Variable
## voted_pres_2008 sex SIRE g
## 4375 827 953 0
## verbal_tilt
## 305
##
## Logistic Regression Model
##
## lrm(formula = voted_pres_2008 ~ sex + SIRE + g + verbal_tilt,
## data = els)
##
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 11207 LR chi2 784.94 R2 0.092 C 0.653
## FALSE 4196 d.f. 8 g 0.641 Dxy 0.306
## TRUE 7011 Pr(> chi2) <0.0001 gr 1.899 gamma 0.306
## max |deriv| 8e-11 gp 0.143 tau-a 0.143
## Brier 0.218
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 0.4324 0.0336 12.86 <0.0001
## sex=Female 0.3463 0.0411 8.43 <0.0001
## SIRE=Amerindian -0.1709 0.2202 -0.78 0.4375
## SIRE=Asian (broad) -0.6469 0.0701 -9.23 <0.0001
## SIRE=Black 0.8269 0.0721 11.47 <0.0001
## SIRE=Hispanic -0.3331 0.0611 -5.45 <0.0001
## SIRE=Multiracial -0.0774 0.0955 -0.81 0.4179
## g 0.4023 0.0207 19.45 <0.0001
## verbal_tilt 0.0808 0.0207 3.90 <0.0001
##
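Somehow, Black respondents report having voted MORE often than White respondents, and the pattern persists (the coefficient even grows) after controlling for g and verbal tilt. Odd. Possibly a self-report issue?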