Init

#print numbers with 3 significant digits by default
options(digits = 3)
#pacman supplies p_load(), used on the next line to load the analysis packages
library(pacman)
#NOTE(review): later code also calls %>%, fct_drop/fct_relevel, describeBy and wtd.cors,
#which are not loaded by name here -- presumably attached through kirkegaard; confirm
p_load(kirkegaard, haven, rms)

Functions

#recode survey missing-data codes to NA
#x:     atomic vector (including haven labelled vectors) or factor
#codes: values to treat as missing; defaults to the integers -1 to -99
#returns x with every value found in codes replaced by NA, everything else unchanged
#for factors the matching levels are set to NA, which also removes them from the level set
recode_na = function(x, codes = -(1:99)) {
  if (!is.atomic(x)) stop("`x` must be an atomic vector.")

  if (is.factor(x)) {
    #match on the level labels so the missing-code levels themselves disappear
    levels(x)[levels(x) %in% codes] = NA
    return(x)
  }

  #%in% never returns NA, so values that are already NA are simply left alone
  x[x %in% codes] = NA
  x
}

Data

#alternative load from the .rdata version of the file (disabled)
# load("data/els_02_12_byf3stubrr_v1_0.rdata")
# els = els_02_12_byf3stubrr_v1_0
#NOTE(review): els2 is not referenced again in the visible code -- confirm it is still needed
els2 = read_spss("data/els_02_12_byf3stubrr_v1_0.sav")
#analysis data: every recode, plot and model below works on els
els = read_spss("data/els_02_12_byf3pststu_v1_0.sav")
#variable table for els (presumably names and labels -- see kirkegaard::df_var_table), printed for lookup
els_vars = kirkegaard::df_var_table(els)
els_vars

Recode

#sex
#strip the negative missing codes, turn the value labels into a factor,
#drop levels that are no longer used and put "Male" first so it is the reference level
els$sex = els$BYSEX %>%
  recode_na() %>%
  as_factor() %>%
  fct_drop() %>%
  fct_relevel("Male")

#recode race into short group names, keyed on the numeric BYRACE codes
#(pairing unique() of the labels with a hand-ordered vector of names would make the
#result depend on the order in which the categories first appear in the data)
#NOTE(review): the meaning of codes 1-7 is taken from the ELS:2002 codebook, not from
#this file -- confirm against attr(els$BYRACE, "labels")
race_short_names = c(
  "1" = "Amerindian",
  "2" = "Asian (broad)",
  "3" = "Black",
  "4" = "Hispanic",
  "5" = "Hispanic",
  "6" = "Multiracial",
  "7" = "White"
)

#guard: stop if code 7 is not labelled as the White category in this data file
race_value_labels = attr(els$BYRACE, "labels")
stopifnot(isTRUE(grepl("White", names(race_value_labels)[race_value_labels == 7])))

#missing codes become NA first; NA and any unknown code stay NA after the lookup
els$SIRE = race_short_names[els$BYRACE %>% recode_na() %>% as.numeric() %>% as.character()] %>%
  unname() %>%
  factor() %>%
  fct_relevel("White")

#use their composite score for g
#recode_na() is applied first, as for the reading and math scores below: without it g
#had no missing values at all while verbal_tilt had 305, i.e. the negative missing
#codes were being standardized as if they were real scores
#NOTE: numbers from earlier runs that used g will change once this is re-run
els$g = els$BYTXCSTD %>%
  recode_na() %>%
  as.numeric() %>%
  standardize(focal_group = (els$SIRE == "White"))

#tilt
#reading z-score minus math z-score, then re-standardized against the White group
els$verbal_tilt = (
  (els$BYTXRSTD %>% recode_na() %>% standardize()) -
    (els$BYTXMSTD %>% recode_na() %>% standardize())
) %>% standardize(focal_group = (els$SIRE == "White"))

#SAT ACT
#data restricted
# els$SAT = els$TXSATC %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
# els$ACT = els$TXACTC %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))

#SES
#every raw variable goes through recode_na() before it is standardized, so negative
#missing-data codes are not treated as real education levels or earnings
#(without it income had 0 missing values while welfare, also an F3 variable, had 4390)
#NOTE: numbers from earlier runs that used these variables will change once this is re-run
# els$SES = els$TXACTC %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
els$mom_edu = els$BYMOTHED %>% recode_na() %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White"))
els$dad_edu = els$BYFATHED %>% recode_na() %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White"))
els$income = els$F3ERN2011 %>% recode_na() %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White"))
# els$family_income = els$F3EMPINC2011 %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
# els$occupation = els$BYOCC30 %>% as.numeric() %>% standardize(focal_group = (els$SIRE == "White, non-Hispanic"))
#TRUE when F3D24 equals 1, FALSE for other valid answers, NA for missing codes
els$welfare = els$F3D24 %>% recode_na() %>% {. == 1}

#politics
#TRUE when F3D38 equals 1, FALSE for other valid answers, NA for missing codes
#(presumably 1 = voted in the 2008 presidential election -- confirm in the codebook)
els$voted_pres_2008 = recode_na(els$F3D38) == 1

Correlations

#correlation matrix of the derived variables; no weight argument is supplied
wtd.cors(
  els[c("g", "verbal_tilt", "income", "mom_edu", "dad_edu", "welfare", "voted_pres_2008")]
)

##                         g verbal_tilt  income mom_edu  dad_edu welfare
## g                1.000000   -0.000517  0.2099  0.4633  0.47515 -0.2428
## verbal_tilt     -0.000517    1.000000 -0.0544  0.0162  0.00133  0.0617
## income           0.209898   -0.054420  1.0000  0.1191  0.12131 -0.2467
## mom_edu          0.463295    0.016151  0.1191  1.0000  0.75682 -0.1456
## dad_edu          0.475151    0.001331  0.1213  0.7568  1.00000 -0.1585
## welfare         -0.242778    0.061671 -0.2467 -0.1456 -0.15849  1.0000
## voted_pres_2008  0.163879    0.066776  0.0797  0.1293  0.13337 -0.1081
##                 voted_pres_2008
## g                        0.1639
## verbal_tilt              0.0668
## income                   0.0797
## mom_edu                  0.1293
## dad_edu                  0.1334
## welfare                 -0.1081
## voted_pres_2008          1.0000

Gap size

Test scores

#plot distributions
#g by SIRE group; GG_denhist drops rows with a missing SIRE and warns about it
GG_denhist(els, "g", "SIRE", vline = FALSE)

## Warning in GG_denhist(els, "g", "SIRE", vline = F): Grouping variable
## contained missing values. These were removed. If you want an NA group,
## convert to explicit value.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#sum stats
#descriptive statistics of g within each SIRE group
describeBy(els$g, group = els$SIRE)

## 
##  Descriptive statistics by group 
## group: White
##    vars    n mean sd median trimmed  mad   min  max range skew kurtosis
## X1    1 8682    0  1   0.07    0.03 1.01 -3.52 3.02  6.54 -0.3    -0.11
##      se
## X1 0.01
## -------------------------------------------------------- 
## group: Amerindian
##    vars   n  mean   sd median trimmed  mad  min  max range  skew kurtosis
## X1    1 130 -0.84 0.86   -0.8   -0.82 0.91 -3.1 1.37  4.47 -0.14    -0.38
##      se
## X1 0.08
## -------------------------------------------------------- 
## group: Asian (broad)
##    vars    n  mean  sd median trimmed  mad   min max range skew kurtosis
## X1    1 1460 -0.12 1.1  -0.16   -0.13 1.17 -3.22 2.9  6.12 0.09    -0.46
##      se
## X1 0.03
## -------------------------------------------------------- 
## group: Black
##    vars    n  mean   sd median trimmed  mad   min  max range skew kurtosis
## X1    1 2020 -0.95 0.92  -0.96   -0.97 0.92 -3.43 2.45  5.88 0.18    -0.16
##      se
## X1 0.02
## -------------------------------------------------------- 
## group: Hispanic
##    vars    n  mean   sd median trimmed  mad   min  max range skew kurtosis
## X1    1 2217 -0.85 1.04  -0.88   -0.86 1.12 -3.45 2.37  5.83 0.17    -0.42
##      se
## X1 0.02
## -------------------------------------------------------- 
## group: Multiracial
##    vars   n  mean   sd median trimmed  mad   min  max range  skew kurtosis
## X1    1 735 -0.29 1.05  -0.24   -0.27 1.07 -3.02 2.43  5.45 -0.13    -0.26
##      se
## X1 0.04

#plot distributions
#verbal_tilt by SIRE group; GG_denhist drops rows with a missing SIRE and warns about it
GG_denhist(els, "verbal_tilt", "SIRE", vline = FALSE)

## Warning in GG_denhist(els, "verbal_tilt", "SIRE", vline = F): Grouping
## variable contained missing values. These were removed. If you want an NA
## group, convert to explicit value.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#sum stats
#descriptive statistics of verbal_tilt within each SIRE group
describeBy(els$verbal_tilt, group = els$SIRE)

## 
##  Descriptive statistics by group 
## group: White
##    vars    n mean sd median trimmed  mad  min  max range  skew kurtosis
## X1    1 8682    0  1   0.01    0.01 0.94 -4.6 3.81   8.4 -0.11     0.52
##      se
## X1 0.01
## -------------------------------------------------------- 
## group: Amerindian
##    vars   n  mean   sd median trimmed  mad   min  max range skew kurtosis
## X1    1 130 -0.01 0.97  -0.04   -0.01 0.94 -2.62 2.63  5.25 0.01    -0.22
##      se
## X1 0.08
## -------------------------------------------------------- 
## group: Asian (broad)
##    vars    n  mean   sd median trimmed  mad   min  max range  skew
## X1    1 1460 -0.51 1.18  -0.42   -0.45 1.03 -6.01 2.52  8.52 -0.66
##    kurtosis   se
## X1     1.16 0.03
## -------------------------------------------------------- 
## group: Black
##    vars    n mean   sd median trimmed  mad   min  max range skew kurtosis
## X1    1 2020 0.14 0.94   0.15    0.14 0.92 -3.68 3.22   6.9 0.01     0.29
##      se
## X1 0.02
## -------------------------------------------------------- 
## group: Hispanic
##    vars    n mean   sd median trimmed  mad   min  max range  skew kurtosis
## X1    1 2217 0.02 0.97   0.05    0.03 0.94 -5.25 3.38  8.63 -0.19     0.72
##      se
## X1 0.02
## -------------------------------------------------------- 
## group: Multiracial
##    vars   n mean   sd median trimmed  mad   min  max range  skew kurtosis
## X1    1 735 0.07 1.02   0.05    0.08 0.95 -3.75 3.28  7.03 -0.15     0.57
##      se
## X1 0.04

Regressions

Predict own income

#regressions
#baseline: income on sex and SIRE only
#income is standardized against the White group; reference levels are Male and White
ols(income ~ sex + SIRE, data = els)

## Frequencies of Missing Values Due to Each Variable
## income    sex   SIRE 
##      0    827    953 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ sex + SIRE, data = els)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   15244    LR chi2    321.15    R2       0.021    
##  sigma0.9884    d.f.            6    R2 adj   0.020    
##  d.f.  15237    Pr(> chi2) 0.0000    g        0.160    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.0664 -0.7618 -0.1491  0.4914  9.7989 
##  
##  
##                     Coef    S.E.   t      Pr(>|t|)
##  Intercept           0.0709 0.0133   5.31 <0.0001 
##  sex=Female         -0.1403 0.0160  -8.76 <0.0001 
##  SIRE=Amerindian    -0.4161 0.0873  -4.76 <0.0001 
##  SIRE=Asian (broad) -0.0934 0.0280  -3.34 0.0008  
##  SIRE=Black         -0.3044 0.0244 -12.47 <0.0001 
##  SIRE=Hispanic      -0.2615 0.0235 -11.12 <0.0001 
##  SIRE=Multiracial   -0.1364 0.0380  -3.59 0.0003  
##

#baseline plus g
ols(income ~ sex + SIRE + g, data = els)

## Frequencies of Missing Values Due to Each Variable
## income    sex   SIRE      g 
##      0    827    953      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ sex + SIRE + g, data = els)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   15244    LR chi2    938.51    R2       0.060    
##  sigma0.9686    d.f.            7    R2 adj   0.059    
##  d.f.  15236    Pr(> chi2) 0.0000    g        0.278    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.5700 -0.6873 -0.1584  0.4791 10.0570 
##  
##  
##                     Coef    S.E.   t     Pr(>|t|)
##  Intercept           0.0731 0.0131  5.59 <0.0001 
##  sex=Female         -0.1447 0.0157 -9.22 <0.0001 
##  SIRE=Amerindian    -0.2528 0.0858 -2.95 0.0032  
##  SIRE=Asian (broad) -0.0709 0.0274 -2.59 0.0097  
##  SIRE=Black         -0.1183 0.0251 -4.72 <0.0001 
##  SIRE=Hispanic      -0.0960 0.0240 -4.00 <0.0001 
##  SIRE=Multiracial   -0.0798 0.0373 -2.14 0.0323  
##  g                   0.1954 0.0078 25.09 <0.0001 
##

#baseline plus g and verbal_tilt
ols(income ~ sex + SIRE + g + verbal_tilt, data = els)

## Frequencies of Missing Values Due to Each Variable
##      income         sex        SIRE           g verbal_tilt 
##           0         827         953           0         305 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ sex + SIRE + g + verbal_tilt, data = els)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   15244    LR chi2    969.61    R2       0.062    
##  sigma0.9677    d.f.            8    R2 adj   0.061    
##  d.f.  15235    Pr(> chi2) 0.0000    g        0.282    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.5781 -0.6859 -0.1545  0.4770 10.1651 
##  
##  
##                     Coef    S.E.   t     Pr(>|t|)
##  Intercept           0.0651 0.0131  4.96 <0.0001 
##  sex=Female         -0.1289 0.0159 -8.09 <0.0001 
##  SIRE=Amerindian    -0.2515 0.0858 -2.93 0.0034  
##  SIRE=Asian (broad) -0.0933 0.0277 -3.37 0.0008  
##  SIRE=Black         -0.1111 0.0251 -4.43 <0.0001 
##  SIRE=Hispanic      -0.0944 0.0240 -3.94 <0.0001 
##  SIRE=Multiracial   -0.0765 0.0372 -2.05 0.0400  
##  g                   0.1964 0.0078 25.24 <0.0001 
##  verbal_tilt        -0.0441 0.0079 -5.58 <0.0001 
##

#relative importance
#refit the full income model with lm(), build its ANOVA table with car::Anova()
#and pass that table to sjstats::anova_stats()
lm(income ~ sex + SIRE + g + verbal_tilt, data = els) %>% 
  car::Anova() %>% 
  sjstats::anova_stats()

#rms method
#same model fitted with ols(); the anova plot is drawn with what = 'proportion R2'
plot(anova(ols(income ~ sex + SIRE + g + verbal_tilt, data = els)), what = 'proportion R2')

Predict own welfare use

#regressions
#baseline logistic model: welfare (TRUE/FALSE) on sex and SIRE only
#reference levels are Male and White
lrm(welfare ~ sex + SIRE, data = els)

## Frequencies of Missing Values Due to Each Variable
## welfare     sex    SIRE 
##    4390     827     953 
## 
## Logistic Regression Model
##  
##  lrm(formula = welfare ~ sex + SIRE, data = els)
##  
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs         11192    LR chi2     585.21    R2       0.086    C       0.663    
##   FALSE       9353    d.f.             6    g        0.672    Dxy     0.326    
##   TRUE        1839    Pr(> chi2) <0.0001    gr       1.959    gamma   0.394    
##  max |deriv| 5e-11                          gp       0.091    tau-a   0.089    
##                                             Brier    0.129                     
##  
##                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept          -2.3654 0.0521 -45.39 <0.0001 
##  sex=Female          0.7274 0.0552  13.19 <0.0001 
##  SIRE=Amerindian     1.4098 0.2274   6.20 <0.0001 
##  SIRE=Asian (broad) -0.5442 0.1233  -4.41 <0.0001 
##  SIRE=Black          1.1955 0.0697  17.16 <0.0001 
##  SIRE=Hispanic       0.6506 0.0734   8.86 <0.0001 
##  SIRE=Multiracial    0.6773 0.1129   6.00 <0.0001 
##

#baseline plus g
lrm(welfare ~ sex + SIRE + g, data = els)

## Frequencies of Missing Values Due to Each Variable
## welfare     sex    SIRE       g 
##    4390     827     953       0 
## 
## Logistic Regression Model
##  
##  lrm(formula = welfare ~ sex + SIRE + g, data = els)
##  
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs         11192    LR chi2    1200.35    R2       0.172    C       0.743    
##   FALSE       9353    d.f.             7    g        1.083    Dxy     0.486    
##   TRUE        1839    Pr(> chi2) <0.0001    gr       2.954    gamma   0.486    
##  max |deriv| 1e-10                          gp       0.133    tau-a   0.133    
##                                             Brier    0.121                     
##  
##                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept          -2.4965 0.0554 -45.10 <0.0001 
##  sex=Female          0.7924 0.0572  13.85 <0.0001 
##  SIRE=Amerindian     0.9104 0.2345   3.88 0.0001  
##  SIRE=Asian (broad) -0.6540 0.1262  -5.18 <0.0001 
##  SIRE=Black          0.6221 0.0750   8.29 <0.0001 
##  SIRE=Hispanic       0.1230 0.0789   1.56 0.1192  
##  SIRE=Multiracial    0.5052 0.1176   4.30 <0.0001 
##  g                  -0.6823 0.0286 -23.84 <0.0001 
##

#baseline plus g and verbal_tilt
lrm(welfare ~ sex + SIRE + g + verbal_tilt, data = els)

## Frequencies of Missing Values Due to Each Variable
##     welfare         sex        SIRE           g verbal_tilt 
##        4390         827         953           0         305 
## 
## Logistic Regression Model
##  
##  lrm(formula = welfare ~ sex + SIRE + g + verbal_tilt, data = els)
##  
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs         11192    LR chi2    1209.74    R2       0.173    C       0.744    
##   FALSE       9353    d.f.             8    g        1.088    Dxy     0.488    
##   TRUE        1839    Pr(> chi2) <0.0001    gr       2.968    gamma   0.488    
##  max |deriv| 1e-10                          gp       0.133    tau-a   0.134    
##                                             Brier    0.121                     
##  
##                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept          -2.4836 0.0555 -44.76 <0.0001 
##  sex=Female          0.7640 0.0579  13.19 <0.0001 
##  SIRE=Amerindian     0.9101 0.2345   3.88 0.0001  
##  SIRE=Asian (broad) -0.6202 0.1269  -4.89 <0.0001 
##  SIRE=Black          0.6112 0.0751   8.14 <0.0001 
##  SIRE=Hispanic       0.1198 0.0790   1.52 0.1292  
##  SIRE=Multiracial    0.4966 0.1177   4.22 <0.0001 
##  g                  -0.6836 0.0286 -23.89 <0.0001 
##  verbal_tilt         0.0871 0.0285   3.06 0.0022  
##

Predict voter status in 2008

#regressions
#baseline logistic model: voted_pres_2008 (TRUE/FALSE) on sex and SIRE only
#reference levels are Male and White
lrm(voted_pres_2008 ~ sex + SIRE, data = els)

## Frequencies of Missing Values Due to Each Variable
## voted_pres_2008             sex            SIRE 
##            4375             827             953 
## 
## Logistic Regression Model
##  
##  lrm(formula = voted_pres_2008 ~ sex + SIRE, data = els)
##  
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs         11207    LR chi2     371.15    R2       0.044    C       0.601    
##   FALSE       4196    d.f.             6    g        0.417    Dxy     0.202    
##   TRUE        7011    Pr(> chi2) <0.0001    gr       1.517    gamma   0.251    
##  max |deriv| 2e-11                          gp       0.096    tau-a   0.095    
##                                             Brier    0.227                     
##  
##                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept           0.4466 0.0327  13.66 <0.0001 
##  sex=Female          0.3570 0.0397   8.99 <0.0001 
##  SIRE=Amerindian    -0.5293 0.2158  -2.45 0.0142  
##  SIRE=Asian (broad) -0.6902 0.0679 -10.17 <0.0001 
##  SIRE=Black          0.4310 0.0677   6.36 <0.0001 
##  SIRE=Hispanic      -0.6348 0.0581 -10.93 <0.0001 
##  SIRE=Multiracial   -0.1869 0.0934  -2.00 0.0454  
##

#baseline plus g
lrm(voted_pres_2008 ~ sex + SIRE + g, data = els)

## Frequencies of Missing Values Due to Each Variable
## voted_pres_2008             sex            SIRE               g 
##            4375             827             953               0 
## 
## Logistic Regression Model
##  
##  lrm(formula = voted_pres_2008 ~ sex + SIRE + g, data = els)
##  
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs         11207    LR chi2     769.67    R2       0.090    C       0.652    
##   FALSE       4196    d.f.             7    g        0.634    Dxy     0.304    
##   TRUE        7011    Pr(> chi2) <0.0001    gr       1.885    gamma   0.304    
##  max |deriv| 7e-11                          gp       0.142    tau-a   0.143    
##                                             Brier    0.219                     
##  
##                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept           0.4170 0.0334 12.50  <0.0001 
##  sex=Female          0.3738 0.0405  9.23  <0.0001 
##  SIRE=Amerindian    -0.1703 0.2200 -0.77  0.4391  
##  SIRE=Asian (broad) -0.6850 0.0694 -9.87  <0.0001 
##  SIRE=Black          0.8381 0.0720 11.64  <0.0001 
##  SIRE=Hispanic      -0.3292 0.0611 -5.39  <0.0001 
##  SIRE=Multiracial   -0.0696 0.0955 -0.73  0.4662  
##  g                   0.4034 0.0207 19.51  <0.0001 
##

#baseline plus g and verbal_tilt
lrm(voted_pres_2008 ~ sex + SIRE + g + verbal_tilt, data = els)

## Frequencies of Missing Values Due to Each Variable
## voted_pres_2008             sex            SIRE               g 
##            4375             827             953               0 
##     verbal_tilt 
##             305 
## 
## Logistic Regression Model
##  
##  lrm(formula = voted_pres_2008 ~ sex + SIRE + g + verbal_tilt, 
##      data = els)
##  
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs         11207    LR chi2     784.94    R2       0.092    C       0.653    
##   FALSE       4196    d.f.             8    g        0.641    Dxy     0.306    
##   TRUE        7011    Pr(> chi2) <0.0001    gr       1.899    gamma   0.306    
##  max |deriv| 8e-11                          gp       0.143    tau-a   0.143    
##                                             Brier    0.218                     
##  
##                     Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept           0.4324 0.0336 12.86  <0.0001 
##  sex=Female          0.3463 0.0411  8.43  <0.0001 
##  SIRE=Amerindian    -0.1709 0.2202 -0.78  0.4375  
##  SIRE=Asian (broad) -0.6469 0.0701 -9.23  <0.0001 
##  SIRE=Black          0.8269 0.0721 11.47  <0.0001 
##  SIRE=Hispanic      -0.3331 0.0611 -5.45  <0.0001 
##  SIRE=Multiracial   -0.0774 0.0955 -0.81  0.4179  
##  g                   0.4023 0.0207 19.45  <0.0001 
##  verbal_tilt         0.0808 0.0207  3.90  <0.0001 
##

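Somehow, Black respondents report having voted MORE often than White respondents? The pattern persists after controls: the Black coefficient is positive in the baseline model and becomes larger once g (and verbal tilt) are added. Odd. Self-report issues?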

Education Longitudinal Study of 2002: example analyses

Init

Functions

Data

Recode

Correlations

Gap size

Test scores

Regressions

Predict own income

Predict own welfare use

Predict voter status in 2008