What is this

This is the R notebook for the titled study. It features the analysis code (except parts which were generated by the NLSY explorer) and the results. This serves multiple purposes including: 1) anyone can review the analysis code for errors and researcher degrees of freedom, 2) anyone can reuse the whole or parts of the code for any purpose without seeking approval, 3) anyone can gather additional calculated statistics that were not put in the written paper.

Start

options(digits = 2)
library(pacman)
p_load(kirkegaard, readr, haven, polycor, rms)

Ad hoc

A few ad hoc functions.

#psych's breaks rnbs due to a custom class, we just remove that
describe = function(...) {
  x = psych::describe(...)
  class(x) = "data.frame"
  x
}

NLSY79

Below follows the analysis code for the NLSY79 analyses.

Data

#we used the NLSY supplied recoding script, with minor changes
source("data/NLSY79/nlsy79.R")
d79 = new_data; rm(new_data)

#var table
d79_var_table = data_frame(
  var = names(d79),
  label = varlabels
)

Recode

#scrrener perceived race
d79$OPRE1 = d79$R0214700 %>% plyr::mapvalues(1:3, c("Hispanic", "Black", "White")) %>% factor() %>% fct_relevel("White")
d79$OPRE1 %>% table2()

#interviewer perceived
d79$OPRE2 = d79$R0172700 %>% plyr::mapvalues(1:3, c("White", "Black", "Other")) %>% factor() %>% fct_relevel("White")
d79$OPRE2 %>% table2()

#combined
table(d79$OPRE1, d79$OPRE2)

##           
##            White Black Other
##   White     7260    38   161
##   Black       48  3094    17
##   Hispanic  1428    31   533

#we use OPRE1
d79$OPRE = d79$OPRE1

#self-perceived in 2002
SPRE2002 = d79 %>% df_subset_by_pattern("^R709310") %>% 
  set_colnames(c("White", "Black", "Asian", "Pacific Islander", "American Indian", "Other", "Hispanic")) %>% 
  map_df(~as.logical(.))

#recode into standard coding
d79$SPRE = plyr::alply(SPRE2002, .margins = 1, function(r) {
  #all NA?
  if (all(is.na(r))) return(NA)
  
  #partial missing into F
  r[is.na(r)] = F
  
  #Hispanic?
  if (r$Hispanic) return("Hispanic")
  
  #any of the other first 6?
  if (sum(r[1:6] %>% unlist()) == 1) {
    return(names(which(r %>% unlist())))
  }
  
  "Multi-racial"
}) %>% unlist() %>% factor() %>% fct_relevel("White")

table2(d79$SPRE)

#compare
table(d79$SPRE, d79$OPRE) %>% prop.table(margin = 1) %>% unclass() %>% write_clipboard()

##                  White Black Hispanic
## White             0.85  0.01     0.14
## American Indian   0.76  0.07     0.17
## Asian             0.65  0.17     0.17
## Black             0.00  0.99     0.01
## Hispanic          0.01  0.01     0.98
## Multi-racial      0.37  0.43     0.21
## Other             0.08  0.04     0.87
## Pacific Islander  0.40  0.10     0.50

#income vars
d79_income_vars = d79[str_detect(d79_var_table$label, "^(TOT INC)|AMT") %>% which()] %>% 
  map_df(~standardize(.))

#years
income_vars_years = d79_var_table %>% 
  filter(str_detect(label, "^(TOT INC)|AMT")) %>% 
  pull(label) %>% 
  str_match("\\d+") %>% 
  as.vector() %>% 
  map_chr(~{if (str_length(.) == 2) return("19" + .) else .}) %>% 
  as.numeric()

#IQ
d79$AFQT = d79$R0618200 %>% divide_by(100) %>% qnorm()
d79$g = d79$AFQT #synonym

#sex
d79$sex = d79$R0214800 %>% plyr::mapvalues(1:2, c("Male", "Female")) %>% factor()

#age then
d79$age_1979 = d79$R0000600

#born
d79$born = 1979 - d79$age_1979

#age at last follow-up
d79$age = max(income_vars_years) - d79$born

Recode income

#correlations
d79_income_vars %>% wtd.cors() %>% MAT_half() %>% describe()

#missing data?
miss_plot(d79_income_vars)

#impute those with 7<=missing
#how many % is that?
mean(miss_by_case(d79_income_vars) <= 17)

## [1] 0.86

#load from file if possible to avoid waiting time!
imp_inc_filename = "data/NLSY79/imp_inc.rds"
if (file.exists(imp_inc_filename)) {
  d79_income_vars_imp = read_rds(imp_inc_filename)
} else {
 d79_income_vars_imp = d79_income_vars %>% miss_impute(max_na = 17)
 write_rds(d79_income_vars_imp, imp_inc_filename)
}

#average in multiple ways
d79$income_simple = rowMeans(d79_income_vars, na.rm = T)
d79$income_simple_log = log10(d79$income + 1)

## Warning: NaNs produced

d79$income_imp = rowMeans(d79_income_vars_imp)
d79$income_imp_log = log10(d79$income_imp + 1)

#plot
GG_denhist(d79$income_simple)

## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 32 rows containing non-finite values (stat_density).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 32 rows containing non-finite values (stat_bin).

## Warning: Removed 32 rows containing non-finite values (stat_density).

GG_denhist(d79$income_simple_log)

## Warning: Removed 37 rows containing non-finite values (stat_bin).

## Warning: Removed 37 rows containing non-finite values (stat_density).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 37 rows containing non-finite values (stat_bin).

## Warning: Removed 37 rows containing non-finite values (stat_density).

GG_denhist(d79$income_imp)

## Warning: Removed 1726 rows containing non-finite values (stat_bin).

## Warning: Removed 1726 rows containing non-finite values (stat_density).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1726 rows containing non-finite values (stat_bin).

## Warning: Removed 1726 rows containing non-finite values (stat_density).

GG_denhist(d79$income_imp_log)

## Warning: Removed 1726 rows containing non-finite values (stat_bin).

## Warning: Removed 1726 rows containing non-finite values (stat_density).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1726 rows containing non-finite values (stat_bin).

## Warning: Removed 1726 rows containing non-finite values (stat_density).

Correlations

#cors
hetcor(d79[c("g", "income_simple", "income_simple_log", "income_imp","income_imp_log", "age", "sex")], use = "pairwise")

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##                         g income_simple income_simple_log income_imp
## g                       1       Pearson           Pearson    Pearson
## income_simple       0.405             1           Pearson    Pearson
## income_simple_log   0.407         0.878                 1    Pearson
## income_imp          0.467         0.985             0.856          1
## income_imp_log      0.482         0.894             0.977      0.898
## age                 0.216         0.166             0.127      0.194
## sex               -0.0114         0.397             0.356      0.421
##                   income_imp_log     age        sex
## g                        Pearson Pearson Polyserial
## income_simple            Pearson Pearson Polyserial
## income_simple_log        Pearson Pearson Polyserial
## income_imp               Pearson Pearson Polyserial
## income_imp_log                 1 Pearson Polyserial
## age                        0.147       1 Polyserial
## sex                        0.392 -0.0104          1
## 
## Standard Errors/Numbers of Observations:
##                         g income_simple income_simple_log income_imp
## g                   11878         11868             11865      10430
## income_simple     0.00767         12654             12649      10960
## income_simple_log 0.00766       0.00203             12649      10960
## income_imp        0.00766      0.000278           0.00255      10960
## income_imp_log    0.00751       0.00192          0.000437    0.00185
## age               0.00875       0.00865           0.00875    0.00919
## sex                0.0115        0.0101           0.00981     0.0107
##                   income_imp_log    age   sex
## g                          10430  11878 11878
## income_simple              10960  12654 12654
## income_simple_log          10960  12649 12649
## income_imp                 10960  10960 10960
## income_imp_log             10960  10960 10960
## age                      0.00935  12686 12686
## sex                       0.0102 0.0111 12686
## 
## P-values for Tests of Bivariate Normality:
##                           g income_simple income_simple_log income_imp
## g                                                                     
## income_simple       3.4e-74                                           
## income_simple_log  8.55e-81             0                             
## income_imp         9.85e-69     5.77e-169                 0           
## income_imp_log     3.69e-71             0         2.28e-269          0
## age               6.54e-294             0                 0          0
## sex                3.14e-11      6.93e-82          1.88e-71   1.64e-76
##                          income_imp_log                   age
## g                                                            
## income_simple                                                
## income_simple_log                                            
## income_imp                                                   
## income_imp_log                                               
## age               7.54999999999904e-313                      
## sex                             4.8e-42 7.21999999999999e-310

#use the best version
d79$income = d79$income_imp

#income x IQ by age
d79_age_iq_inc = map_df(seq_along(unique(income_vars_years)), function(b) {
  data_frame(
    age = income_vars_years[b] - mean(d79$born),
    r = wtd.cors(d79_income_vars_imp[[b]], d79$AFQT) %>% as.vector()
  )
})
d79_age_iq_inc

ggplot(d79_age_iq_inc, aes(age, r)) +
  geom_path() +
  theme_bw() +
  scale_y_continuous("r g x income") +
  ggtitle("Income and IQ by age in NLSY79")

GG_save("figs/79_age_iq_inc.png")

#for whites
hetcor(d79 %>% filter(SPRE == "White") %>% .[c("g", "income", "age", "sex")], use = "pairwise")

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##             g  income     age        sex
## g           1 Pearson Pearson Polyserial
## income  0.405       1 Pearson Polyserial
## age     0.183   0.194       1 Polyserial
## sex    0.0205   0.525 -0.0451          1
## 
## Standard Errors/Numbers of Observations:
##             g income    age  sex
## g        4265   4254   4265 4265
## income 0.0128   4433   4433 4433
## age    0.0148 0.0145   4450 4450
## sex    0.0192  0.015 0.0187 4450
## 
## P-values for Tests of Bivariate Normality:
##                g    income       age
## g                                   
## income  3.47e-37                    
## age    8.75e-101 1.99e-140          
## sex      0.00427  1.58e-41 2.45e-103

#for blacks
hetcor(d79 %>% filter(SPRE == "Black") %>% .[c("g", "income", "age", "sex")], use = "pairwise")

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##              g  income     age        sex
## g            1 Pearson Pearson Polyserial
## income   0.452       1 Pearson Polyserial
## age      0.117   0.173       1 Polyserial
## sex    -0.0623   0.284 -0.0152          1
## 
## Standard Errors/Numbers of Observations:
##             g income    age  sex
## g        2204   2196   2204 2204
## income  0.017   2278   2278 2278
## age     0.021 0.0203   2287 2287
## sex    0.0266 0.0253 0.0262 2287
## 
## P-values for Tests of Bivariate Normality:
##               g   income      age
## g                                
## income 9.64e-07                  
## age    4.57e-50 1.71e-59         
## sex      0.0225 9.84e-12 3.71e-50

#hispanic
hetcor(d79 %>% filter(SPRE == "Hispanic") %>% .[c("g", "income", "age", "sex")], use = "pairwise")

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##             g  income      age        sex
## g           1 Pearson  Pearson Polyserial
## income  0.495       1  Pearson Polyserial
## age    0.0306   0.182        1 Polyserial
## sex     0.107   0.494 -0.00874          1
## 
## Standard Errors/Numbers of Observations:
##             g income    age sex
## g         464    464    464 464
## income 0.0351    492    492 492
## age    0.0464 0.0436    492 492
## sex    0.0579 0.0469 0.0566 492
## 
## P-values for Tests of Bivariate Normality:
##               g   income     age
## g                               
## income     0.12                 
## age    2.93e-06 2.07e-06        
## sex       0.103   0.0436 1.7e-06

Models

#subset and standardize
d79b = d79[c("g", "income", "OPRE", "SPRE", "sex", "age")] %>% df_standardize()

## Skipped OPRE because it is a factor.

## Skipped SPRE because it is a factor.

## Skipped sex because it is a factor.

#baseline
(m79_o = ols(income ~ OPRE + sex * rcs(age), data = d79b))

## Frequencies of Missing Values Due to Each Variable
## income   OPRE    sex    age 
##   1726      0      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + sex * rcs(age), data = d79b)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs   10960    LR chi2    2340.40    R2       0.192    
##  sigma0.8992    d.f.            11    R2 adj   0.191    
##  d.f.  10948    Pr(> chi2)  0.0000    g        0.496    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.2268 -0.6045 -0.1420  0.4488  5.8321 
##  
##  
##                    Coef    S.E.   t      Pr(>|t|)
##  Intercept         -0.0480 0.1161  -0.41 0.6794  
##  OPRE=Black        -0.4751 0.0203 -23.42 <0.0001 
##  OPRE=Hispanic     -0.2462 0.0238 -10.35 <0.0001 
##  sex=Male           0.5394 0.1617   3.34 0.0009  
##  age                0.1853 0.0866   2.14 0.0323  
##  age'              -0.2728 0.3670  -0.74 0.4574  
##  age''              0.7213 1.1279   0.64 0.5225  
##  age'''            -0.6669 1.6536  -0.40 0.6867  
##  sex=Male * age     0.0640 0.1204   0.53 0.5947  
##  sex=Male * age'    0.0943 0.5140   0.18 0.8544  
##  sex=Male * age''   0.6085 1.5840   0.38 0.7009  
##  sex=Male * age''' -3.0526 2.3315  -1.31 0.1905  
##

(m79_s = ols(income ~ SPRE + sex * rcs(age), data = d79b))

## Frequencies of Missing Values Due to Each Variable
## income   SPRE    sex    age 
##   1726   4968      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ SPRE + sex * rcs(age), data = d79b)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    7692    LR chi2    1880.79    R2       0.217    
##  sigma0.9014    d.f.            16    R2 adj   0.215    
##  d.f.   7675    Pr(> chi2)  0.0000    g        0.537    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.2966 -0.5930 -0.1405  0.4334  5.6981 
##  
##  
##                        Coef    S.E.   t      Pr(>|t|)
##  Intercept             -0.0702 0.1342  -0.52 0.6010  
##  SPRE=American Indian  -0.4225 0.1337  -3.16 0.0016  
##  SPRE=Asian             0.1947 0.1885   1.03 0.3016  
##  SPRE=Black            -0.5653 0.0233 -24.31 <0.0001 
##  SPRE=Hispanic         -0.3816 0.0428  -8.91 <0.0001 
##  SPRE=Multi-racial     -0.2267 0.0908  -2.50 0.0125  
##  SPRE=Other            -0.3867 0.0531  -7.29 <0.0001 
##  SPRE=Pacific Islander  0.0256 0.2856   0.09 0.9287  
##  sex=Male               0.7249 0.1892   3.83 0.0001  
##  age                    0.1232 0.0998   1.23 0.2173  
##  age'                  -0.0057 0.4276  -0.01 0.9893  
##  age''                 -0.1353 1.3240  -0.10 0.9186  
##  age'''                 0.5820 1.9707   0.30 0.7678  
##  sex=Male * age         0.1794 0.1402   1.28 0.2005  
##  sex=Male * age'       -0.3695 0.6047  -0.61 0.5412  
##  sex=Male * age''       2.0449 1.8757   1.09 0.2757  
##  sex=Male * age'''     -5.2242 2.7970  -1.87 0.0618  
##

#individually
(m79_og = ols(income ~ OPRE + g + sex * rcs(age), data = d79b))

## Frequencies of Missing Values Due to Each Variable
## income   OPRE      g    sex    age 
##   1726      0    808      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + g + sex * rcs(age), data = d79b)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs   10430    LR chi2    4426.94    R2       0.346    
##  sigma0.8112    d.f.            12    R2 adj   0.345    
##  d.f.  10417    Pr(> chi2)  0.0000    g        0.664    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.81798 -0.50481 -0.08837  0.39704  5.60762 
##  
##  
##                    Coef    S.E.   t     Pr(>|t|)
##  Intercept         -0.2848 0.1065 -2.67 0.0075  
##  OPRE=Black        -0.0312 0.0207 -1.50 0.1327  
##  OPRE=Hispanic      0.0960 0.0232  4.14 <0.0001 
##  g                  0.4442 0.0090 49.09 <0.0001 
##  sex=Male           0.4996 0.1483  3.37 0.0008  
##  age                0.0730 0.0793  0.92 0.3571  
##  age'               0.0358 0.3371  0.11 0.9154  
##  age''             -0.2496 1.0383 -0.24 0.8101  
##  age'''             0.6567 1.5294  0.43 0.6677  
##  sex=Male * age     0.0024 0.1103  0.02 0.9827  
##  sex=Male * age'    0.2857 0.4729  0.60 0.5458  
##  sex=Male * age''  -0.0991 1.4620 -0.07 0.9459  
##  sex=Male * age''' -1.8085 2.1646 -0.84 0.4035  
##

(m79_sg = ols(income ~ SPRE + g + sex * rcs(age), data = d79b))

## Frequencies of Missing Values Due to Each Variable
## income   SPRE      g    sex    age 
##   1726   4968    808      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ SPRE + g + sex * rcs(age), data = d79b)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    7372    LR chi2    3305.26    R2       0.361    
##  sigma0.8165    d.f.            17    R2 adj   0.360    
##  d.f.   7354    Pr(> chi2)  0.0000    g        0.693    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.93823 -0.50917 -0.08716  0.39576  5.51322 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.2459 0.1231 -2.00 0.0458  
##  SPRE=American Indian  -0.2248 0.1268 -1.77 0.0762  
##  SPRE=Asian             0.0956 0.1786  0.53 0.5927  
##  SPRE=Black            -0.1195 0.0241 -4.95 <0.0001 
##  SPRE=Hispanic          0.0061 0.0410  0.15 0.8812  
##  SPRE=Multi-racial     -0.1018 0.0839 -1.21 0.2253  
##  SPRE=Other            -0.0046 0.0506 -0.09 0.9282  
##  SPRE=Pacific Islander  0.0504 0.2587  0.19 0.8455  
##  g                      0.4424 0.0109 40.53 <0.0001 
##  sex=Male               0.5552 0.1739  3.19 0.0014  
##  age                    0.0561 0.0915  0.61 0.5396  
##  age'                   0.1225 0.3926  0.31 0.7551  
##  age''                 -0.5178 1.2177 -0.43 0.6707  
##  age'''                 1.0541 1.8199  0.58 0.5625  
##  sex=Male * age         0.0290 0.1287  0.23 0.8216  
##  sex=Male * age'        0.1549 0.5571  0.28 0.7811  
##  sex=Male * age''       0.4825 1.7328  0.28 0.7807  
##  sex=Male * age'''     -3.2087 2.5974 -1.24 0.2167  
##

#both
(m79_osg = ols(income ~ OPRE + SPRE + g + sex * rcs(age), data = d79b))

## Frequencies of Missing Values Due to Each Variable
## income   OPRE   SPRE      g    sex    age 
##   1726      0   4968    808      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + SPRE + g + sex * rcs(age), data = d79b)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    7372    LR chi2    3316.07    R2       0.362    
##  sigma0.8160    d.f.            19    R2 adj   0.361    
##  d.f.   7352    Pr(> chi2)  0.0000    g        0.693    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.01786 -0.50719 -0.08702  0.38881  5.52638 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.2753 0.1233 -2.23 0.0256  
##  OPRE=Black             0.1391 0.0798  1.74 0.0813  
##  OPRE=Hispanic          0.1069 0.0348  3.07 0.0021  
##  SPRE=American Indian  -0.2362 0.1268 -1.86 0.0625  
##  SPRE=Asian             0.0695 0.1791  0.39 0.6979  
##  SPRE=Black            -0.2340 0.0803 -2.92 0.0036  
##  SPRE=Hispanic         -0.0771 0.0493 -1.57 0.1176  
##  SPRE=Multi-racial     -0.1643 0.0902 -1.82 0.0686  
##  SPRE=Other            -0.0820 0.0561 -1.46 0.1440  
##  SPRE=Pacific Islander -0.0003 0.2590  0.00 0.9990  
##  g                      0.4500 0.0112 40.33 <0.0001 
##  sex=Male               0.5686 0.1738  3.27 0.0011  
##  age                    0.0481 0.0915  0.53 0.5987  
##  age'                   0.1554 0.3925  0.40 0.6921  
##  age''                 -0.6078 1.2173 -0.50 0.6176  
##  age'''                 1.1366 1.8190  0.62 0.5321  
##  sex=Male * age         0.0370 0.1287  0.29 0.7738  
##  sex=Male * age'        0.1147 0.5570  0.21 0.8368  
##  sex=Male * age''       0.5853 1.7321  0.34 0.7354  
##  sex=Male * age'''     -3.2756 2.5959 -1.26 0.2070  
##

#relative importance
#use linear age bc eta² does not support nonlinear
(m79_osg2 = ols(income ~ OPRE + SPRE + g + sex * age, data = d79b))

## Frequencies of Missing Values Due to Each Variable
## income   OPRE   SPRE      g    sex    age 
##   1726      0   4968    808      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + SPRE + g + sex * age, data = d79b)
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    7372    LR chi2    3304.72    R2       0.361    
##  sigma0.8163    d.f.            13    R2 adj   0.360    
##  d.f.   7358    Pr(> chi2)  0.0000    g        0.693    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.94757 -0.50962 -0.08683  0.38861  5.51652 
##  
##  
##                        Coef    S.E.   t      Pr(>|t|)
##  Intercept             -0.2434 0.0171 -14.24 <0.0001 
##  OPRE=Black             0.1406 0.0798   1.76 0.0781  
##  OPRE=Hispanic          0.1086 0.0348   3.12 0.0018  
##  SPRE=American Indian  -0.2429 0.1268  -1.92 0.0553  
##  SPRE=Asian             0.0710 0.1792   0.40 0.6918  
##  SPRE=Black            -0.2368 0.0802  -2.95 0.0032  
##  SPRE=Hispanic         -0.0785 0.0493  -1.59 0.1111  
##  SPRE=Multi-racial     -0.1704 0.0902  -1.89 0.0590  
##  SPRE=Other            -0.0852 0.0561  -1.52 0.1290  
##  SPRE=Pacific Islander  0.0042 0.2589   0.02 0.9871  
##  g                      0.4497 0.0112  40.32 <0.0001 
##  sex=Male               0.6998 0.0193  36.30 <0.0001 
##  age                    0.0677 0.0137   4.93 <0.0001 
##  sex=Male * age         0.1455 0.0195   7.47 <0.0001 
##

m79_osg2 %>% aov() %>% lsr::etaSquared() %>% sqrt()

##         eta.sq eta.sq.part
## OPRE     0.031       0.039
## SPRE     0.036       0.045
## g        0.376       0.425
## sex      0.331       0.383
## age      0.132       0.163
## sex:age  0.070       0.087

NLSY97

Data

#call the NLSY R file to load the data
source("data/NLSY97/nlsy97_race_income.R")

#rename
d97 = new_data; rm(new_data)

#var table
d97_var_table = data_frame(
  var = names(d97),
  label = varlabels
)

Recode

##AGE
#born
d97$born = d97$R0536402
GG_denhist(d97, "born")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#calculate age from month and year
#subtract from 2013, last year of data
d97$age = (parse_date("06 2013", "%m %Y") - sprintf("%d %d", d97$R0536401, d97$R0536402) %>% 
             parse_date(format = "%m %Y")) %>% 
  as.numeric() %>% 
  divide_by(365.24)

GG_denhist(d97, "age")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##SEX
d97$sex = if_else(d97$R0536300 == 1, true = "Male", false = "Female") %>% factor()

##IQ
#AFQT
d97$AFQT = d97$R9829600 %>% 
  #recode 0 and 100 centiles
  winsorise(99999, 1) %>% 
  #convert to fraction
  divide_by(100000) %>% 
  #convert to Z score
  qnorm()

#plot
GG_denhist(d97, "AFQT")

## Warning: Removed 1891 rows containing non-finite values (stat_bin).

## Warning: Removed 1891 rows containing non-finite values (stat_density).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1891 rows containing non-finite values (stat_bin).

## Warning: Removed 1891 rows containing non-finite values (stat_density).

#PPAT
#multiple scores
PIAT_data = d97[d97_var_table$var[which(str_detect(d97_var_table$label, "^CV_PIAT"))]]

#z score
PIAT_data %<>% df_standardize()

#cors
wtd.cors(PIAT_data) %T>% 
  print() %>% 
  MAT_half() %>% 
  describe()

##          R5473700 R7237400 S1552700
## R5473700     1.00     0.76     0.78
## R7237400     0.76     1.00     0.77
## S1552700     0.78     0.77     1.00

#simple mean will do
d97$PIAT = rowMeans(PIAT_data, na.rm = T)

#plot
GG_denhist(d97, "PIAT")

## Warning: Removed 7335 rows containing non-finite values (stat_bin).

## Warning: Removed 7335 rows containing non-finite values (stat_density).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 7335 rows containing non-finite values (stat_bin).

## Warning: Removed 7335 rows containing non-finite values (stat_density).

#SAT and ACT for comparison
d97$SAT_math = d97$Z9033700
d97$SAT_verbal = d97$Z9033900
d97$SAT = rowMeans(d97[c("SAT_verbal", "SAT_math")], na.rm=T)
d97$ACT = d97$Z9034100

#g factor
d97$g = rowMeans(d97[c("AFQT", "PIAT")], na.rm = T)

##RACE
#self-perceived dummy version, 2002
SPRE_data = d97[c("S122490" + 0:5, "R0538600")] %>% 
  set_colnames(c("White", "Black", "Asian", "Pacific Islander", "American Indian", "Other", "Hispanic")) %>% 
  df_colFunc(as.logical)

#standard coding
d97$SPRE = plyr::aaply(SPRE_data, .margins = 1, .expand = F, function(x) {
  #all NA? code missing
  if (all(is.na(x))) return(NA)
  
  #otherwise, impute false
  x[is.na(x)] = F
  
  #is hispanic?
  if (x$Hispanic) return("Hispanic")
  
  #is only 1 non-hipanic?
  if (sum(x %>% unlist()) == 1) {
    return(names(which(x %>% unlist())))
  }
  
  #multiracial
  "Multi-racial"
}) %>% factor() %>% fct_relevel("White")

#table
table2(d97$SPRE)

#other-perceived
d97$OPRE = d97$R2395200 %>% plyr::mapvalues(1:3, c("White", "Black", "Other")) %>% factor() %>% fct_relevel("White")
table2(d97$OPRE)

#skin tone
skin_data = d97[c("T3173000", "T4584700", "T6217800")]

#regress out interviewer effect on ratings
#code interviewer data
interviewer_race_data = d97[c("T2023300", "T3614100", "T5214000")] %>% df_colFunc(function(x) plyr::mapvalues(x, 1:6, c("White", "Black", "American Indian", "Asian", "Other", "Multiracial"), warn_missing = F))

#regression
skin_data_fixed = map2_df(skin_data, interviewer_race_data, function(x, y) {
  #make temp df
  tmp = data.frame(x, y) %>% set_colnames(c("skin_tone", "interviewer"))
  
  #model, resids, standardize
  lm(skin_tone ~ interviewer, data = tmp, na.action = na.exclude) %>% resid() %>% standardize()
})

#use mean; no repeated measures
d97$skin_tone = rowMeans(skin_data, na.rm = T)
d97$skin_tone_fixed = rowMeans(skin_data_fixed, na.rm = T)

Recode income

##INCOME
#average the income data
d97_income_data = d97[d97_var_table$var[which(str_detect(d97_var_table$label, "^TTL"))]]

#add years
d97_income_years = str_match(d97_var_table$label, "PAST YR (\\d{4})") %>% .[, 2] %>% na.omit() %>% as.numeric()
colnames(d97_income_data) = "income_" + d97_income_years

#z score
d97_income_data_z = d97_income_data %>% df_standardize()

#log
d97_income_log_data = df_colFunc(d97_income_data, function(x) log10(x + 1))

#cors
wtd.cors(d97_income_data) %>% 
  MAT_half() %>% 
  describe()

wtd.cors(d97_income_log_data) %>% 
  MAT_half() %>% 
  describe()

#missing data
miss_plot(d97_income_data)

#impute
#load from disk if possible to save time
d97_imp_inc_filename = "data/NLSY97/imp_income.rds"
if (file.exists(d97_imp_inc_filename)) {
  d97_income_data_imputed = read_rds(d97_imp_inc_filename)
} else {
  d97_income_data_imputed = miss_impute(d97_income_data, max_na = 10)
  write_rds(d97_income_data_imputed, d97_imp_inc_filename)
}

#average all data in 4 ways
#results in higher N, but some bias due to selective missing data
d97$income_simple = rowMeans(d97_income_data, na.rm = T)

#log version
# d97$income_simple_log = rowMeans(d97_income_log_data, na.rm = T)
d97$income_simple_log = log10(d97$income_simple + 1)

#average of imputed
d97$income_imp = rowMeans(d97_income_data_imputed)
d97$income_imp_log = log10(d97$income_imp + 1)

#income x IQ by age
d97_age_iq_inc = map_df(seq_along(d97_income_years), function(b) {
  data_frame(
    age = d97_income_years[b] - mean(d97$born),
    r = wtd.cors(d97_income_data_imputed[[b]], d97$g) %>% as.vector()
  )
})
d97_age_iq_inc

#AFQT only
d97_age_iq_inc_afqt = map_df(seq_along(d97_income_years), function(b) {
  data_frame(
    age = d97_income_years[b] - mean(d97$born),
    r = wtd.cors(d97_income_data_imputed[[b]], d97$AFQT) %>% as.vector()
  )
})

#Whites only
d97_age_iq_inc_white = map_df(seq_along(d97_income_years), function(b) {
  # browser()
  #only whites
  d_ = d97 %>% filter(SPRE == "White")
  
  data_frame(
    age = d97_income_years[b] - mean(d_$born),
    r = wtd.cors(d97_income_data_imputed[(d97$SPRE == "White") & !is.na(d97$SPRE), ][[b]], d_$AFQT) %>% as.vector()
  )
})

ggplot(d97_age_iq_inc, aes(age, r)) +
  geom_path() +
  theme_bw() +
  scale_y_continuous("r g x income") +
  ggtitle("Income and IQ by age in NLSY97")

GG_save("figs/97_age_iq_inc.png")

#post-education income only; age >= 25
d97_post_edu_income = d97_income_data_imputed["income_" + c(2007:2011, 2013)]
d97$income_post_edu = d97_post_edu_income %>% df_standardize() %>% rowMeans()
d97$income = d97$income_post_edu

GG_scatter(d97, "g", "income")

Results

Correlations

#cors
hetcor(d97[c("g", "AFQT", "PIAT", "SAT", "ACT", "income_simple", "income_simple_log", "income_imp", "income_imp_log", "skin_tone", "skin_tone_fixed", "age", "sex")], use = "pairwise")

## Warning in hetcor.data.frame(d97[c("g", "AFQT", "PIAT", "SAT", "ACT",
## "income_simple", : the correlation matrix has been adjusted to make it
## positive-definite

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##                         g     AFQT    PIAT     SAT     ACT income_simple
## g                       1  Pearson Pearson Pearson Pearson       Pearson
## AFQT                0.958        1 Pearson Pearson Pearson       Pearson
## PIAT                0.921    0.778       1 Pearson Pearson       Pearson
## SAT                 0.608    0.601   0.609       1 Pearson       Pearson
## ACT                  0.66    0.669   0.576   0.552       1       Pearson
## income_simple       0.228    0.224    0.21   0.134  0.0894             1
## income_simple_log   0.229    0.221   0.217  0.0996  0.0679         0.761
## income_imp          0.198    0.198   0.184   0.137  0.0754         0.914
## income_imp_log      0.217    0.223   0.189   0.112  0.0539         0.846
## skin_tone          -0.367   -0.364   -0.35  -0.263  -0.326        -0.158
## skin_tone_fixed    -0.357   -0.357  -0.341  -0.279  -0.313        -0.153
## age               -0.0189 -0.00685  0.0171  -0.019 0.00158         0.149
## sex               -0.0312  -0.0464  0.0761  0.0925  0.0526         0.238
##                   income_simple_log income_imp income_imp_log skin_tone
## g                           Pearson    Pearson        Pearson   Pearson
## AFQT                        Pearson    Pearson        Pearson   Pearson
## PIAT                        Pearson    Pearson        Pearson   Pearson
## SAT                         Pearson    Pearson        Pearson   Pearson
## ACT                         Pearson    Pearson        Pearson   Pearson
## income_simple               Pearson    Pearson        Pearson   Pearson
## income_simple_log                 1    Pearson        Pearson   Pearson
## income_imp                    0.809          1        Pearson   Pearson
## income_imp_log                0.904      0.913              1   Pearson
## skin_tone                    -0.166     -0.142         -0.154         1
## skin_tone_fixed              -0.153     -0.138         -0.148     0.993
## age                           0.107       0.29          0.269     0.018
## sex                           0.161      0.277          0.248    0.0262
##                   skin_tone_fixed     age        sex
## g                         Pearson Pearson Polyserial
## AFQT                      Pearson Pearson Polyserial
## PIAT                      Pearson Pearson Polyserial
## SAT                       Pearson Pearson Polyserial
## ACT                       Pearson Pearson Polyserial
## income_simple             Pearson Pearson Polyserial
## income_simple_log         Pearson Pearson Polyserial
## income_imp                Pearson Pearson Polyserial
## income_imp_log            Pearson Pearson Polyserial
## skin_tone                 Pearson Pearson Polyserial
## skin_tone_fixed                 1 Pearson Polyserial
## age                       0.00605       1 Polyserial
## sex                        0.0197 -0.0089          1
## 
## Standard Errors/Numbers of Observations:
##                          g   AFQT   PIAT    SAT    ACT income_simple
## g                     7396   7093   1649   2132   1771          7209
## AFQT              0.000256   7093   1346   2057   1722          6925
## PIAT               0.00263 0.0114   1649    487    409          1593
## SAT                 0.0137 0.0141 0.0285   2493    575          2464
## ACT                 0.0134 0.0133 0.0331 0.0291   2005          1988
## income_simple       0.0112 0.0114  0.024 0.0198 0.0223          8685
## income_simple_log   0.0112 0.0114 0.0239   0.02 0.0223       0.00452
## income_imp           0.013 0.0132 0.0287 0.0217 0.0243       0.00207
## income_imp_log      0.0129 0.0131 0.0286 0.0218 0.0244       0.00356
## skin_tone           0.0115 0.0117 0.0241 0.0215 0.0225         0.012
## skin_tone_fixed     0.0127 0.0129 0.0265 0.0232  0.025        0.0131
## age                 0.0116 0.0119 0.0246   0.02 0.0223        0.0105
## sex                 0.0146 0.0149 0.0307 0.0249  0.028        0.0132
##                   income_simple_log income_imp income_imp_log skin_tone
## g                              7209       5420           5420      5678
## AFQT                           6925       5262           5262      5447
## PIAT                           1593       1138           1138      1325
## SAT                            2464       2053           2053      1872
## ACT                            1988       1677           1677      1576
## income_simple                  8685       6340           6340      6636
## income_simple_log              8685       6340           6340      6636
## income_imp                  0.00434       6340           6340      5309
## income_imp_log               0.0023    0.00208           6340      5309
## skin_tone                    0.0119     0.0134         0.0134      6723
## skin_tone_fixed              0.0131     0.0147         0.0146  0.000177
## age                          0.0106     0.0115         0.0117    0.0122
## sex                          0.0133     0.0154         0.0148    0.0153
##                   skin_tone_fixed    age  sex
## g                            4745   7396 7396
## AFQT                         4549   7093 7093
## PIAT                         1115   1649 1649
## SAT                          1580   2493 2493
## ACT                          1307   2005 2005
## income_simple                5561   8685 8685
## income_simple_log            5561   8685 8685
## income_imp                   4474   6340 6340
## income_imp_log               4474   6340 6340
## skin_tone                    5631   6723 6723
## skin_tone_fixed              5631   5631 5631
## age                        0.0133   8984 8984
## sex                        0.0167 0.0132 8984
## 
## P-values for Tests of Bivariate Normality:
##                                       g      AFQT     PIAT       SAT
## g                                                                   
## AFQT                           1.44e-94                             
## PIAT                           2.52e-22  1.16e-06                   
## SAT               4.94065645841247e-324  4.2e-298  1.1e-61          
## ACT                                   0         0 4.91e-88 2.16e-102
## income_simple                  2.13e-37  1.16e-32 6.86e-36  1.09e-83
## income_simple_log             8.45e-262 1.27e-256    3e-57  2.4e-212
## income_imp                     3.93e-81  2.45e-71 1.36e-88  1.22e-68
## income_imp_log                 9.44e-44  1.38e-40 3.33e-43  1.48e-93
## skin_tone                             0         0  2.4e-85 1.12e-194
## skin_tone_fixed               2.69e-201 3.95e-195 5.83e-48 4.82e-142
## age                            9.58e-83  1.14e-52        0  6.66e-97
## sex                             0.00206   0.00625 1.34e-07  1.48e-71
##                         ACT income_simple income_simple_log income_imp
## g                                                                     
## AFQT                                                                  
## PIAT                                                                  
## SAT                                                                   
## ACT                                                                   
## income_simple     2.33e-286                                           
## income_simple_log         0             0                             
## income_imp        6.07e-249             0                 0           
## income_imp_log    3.34e-239     1.28e-302                 0          0
## skin_tone                 0             0                 0          0
## skin_tone_fixed   7.79e-257     9.53e-250                 0  1.44e-275
## age               4.28e-298     6.89e-128                 0  8.04e-112
## sex               1.01e-284      7.23e-48         2.53e-301   3.56e-47
##                   income_imp_log skin_tone skin_tone_fixed      age
## g                                                                  
## AFQT                                                               
## PIAT                                                               
## SAT                                                                
## ACT                                                                
## income_simple                                                      
## income_simple_log                                                  
## income_imp                                                         
## income_imp_log                                                     
## skin_tone                      0                                   
## skin_tone_fixed        1.26e-234         0                         
## age                      2.8e-81         0       1.93e-268         
## sex                     4.55e-19         0       6.41e-223 3.09e-84

#for blacks and hispanics
hetcor(d97 %>% filter(SPRE == "White") %>% .[c("g", "income", "skin_tone", "skin_tone_fixed", "age", "sex")], use = "pairwise")

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##                       g  income skin_tone skin_tone_fixed     age
## g                     1 Pearson   Pearson         Pearson Pearson
## income            0.166       1   Pearson         Pearson Pearson
## skin_tone         -0.11 -0.0753         1         Pearson Pearson
## skin_tone_fixed -0.0862 -0.0791      0.97               1 Pearson
## age             -0.0169    0.19   0.00313       -0.000442       1
## sex             -0.0232   0.272    0.0893           0.065 -0.0154
##                        sex
## g               Polyserial
## income          Polyserial
## skin_tone       Polyserial
## skin_tone_fixed Polyserial
## age             Polyserial
## sex                      1
## 
## Standard Errors/Numbers of Observations:
##                      g income skin_tone skin_tone_fixed    age  sex
## g                 3335   2849      2631            2148   3335 3335
## income          0.0182   3216      2649            2183   3216 3216
## skin_tone       0.0193 0.0193      2969            2438   2969 2969
## skin_tone_fixed 0.0214 0.0213    0.0012            2438   2438 2438
## age             0.0173  0.017    0.0184          0.0203   3808 3808
## sex             0.0217 0.0219    0.0231          0.0254 0.0203 3808
## 
## P-values for Tests of Bivariate Normality:
##                         g    income skin_tone skin_tone_fixed     age
## g                                                                    
## income            4.3e-47                                            
## skin_tone               0         0                                  
## skin_tone_fixed 1.71e-153 1.02e-183         0                        
## age              1.35e-28  1.75e-64         0       1.71e-183        
## sex                0.0829  1.73e-45         0          1e-176 1.1e-23

#for blacks
hetcor(d97 %>% filter(SPRE == "Black") %>% .[c("g", "income", "skin_tone", "skin_tone_fixed", "age", "sex")], use = "pairwise")

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##                       g   income skin_tone skin_tone_fixed     age
## g                     1  Pearson   Pearson         Pearson Pearson
## income            0.364        1   Pearson         Pearson Pearson
## skin_tone        -0.111  -0.0237         1         Pearson Pearson
## skin_tone_fixed -0.0918 -0.00796     0.987               1 Pearson
## age             -0.0743    0.156   -0.0219         -0.0263       1
## sex              -0.098    0.142     0.126           0.134 -0.0316
##                        sex
## g               Polyserial
## income          Polyserial
## skin_tone       Polyserial
## skin_tone_fixed Polyserial
## age             Polyserial
## sex                      1
## 
## Standard Errors/Numbers of Observations:
##                      g income skin_tone skin_tone_fixed    age  sex
## g                 1697   1124      1451            1244   1697 1697
## income          0.0259   1326      1160            1009   1326 1326
## skin_tone       0.0259 0.0294      1718            1481   1718 1718
## skin_tone_fixed 0.0281 0.0315  0.000666            1481   1481 1481
## age             0.0241 0.0268    0.0241           0.026   2044 2044
## sex             0.0303  0.034    0.0298           0.032 0.0277 2044
## 
## P-values for Tests of Bivariate Normality:
##                        g   income skin_tone skin_tone_fixed      age
## g                                                                   
## income          5.45e-26                                            
## skin_tone       6.09e-10 1.66e-12                                   
## skin_tone_fixed    0.496  0.00037  4.65e-67                         
## age              1.3e-19 1.93e-19  5.32e-25        5.35e-12         
## sex                0.144 1.94e-07  1.39e-12           0.236 7.61e-23

#hispanic
hetcor(d97 %>% filter(SPRE == "Hispanic") %>% .[c("g", "income", "skin_tone", "skin_tone_fixed", "age", "sex")], use = "pairwise")

## 
## Two-Step Estimates
## 
## Correlations/Type of Correlation:
##                       g   income skin_tone skin_tone_fixed     age
## g                     1  Pearson   Pearson         Pearson Pearson
## income            0.196        1   Pearson         Pearson Pearson
## skin_tone        -0.134   0.0115         1         Pearson Pearson
## skin_tone_fixed  -0.136 -0.00476     0.982               1 Pearson
## age             -0.0126    0.149    0.0474          0.0459       1
## sex             -0.0225    0.327     0.106          0.0975  0.0247
##                        sex
## g               Polyserial
## income          Polyserial
## skin_tone       Polyserial
## skin_tone_fixed Polyserial
## age             Polyserial
## sex                      1
## 
## Standard Errors/Numbers of Observations:
##                      g income skin_tone skin_tone_fixed    age  sex
## g                 1457   1007      1135             959   1457 1457
## income          0.0303   1274      1089             923   1274 1274
## skin_tone       0.0292 0.0303      1445            1213   1445 1445
## skin_tone_fixed 0.0317 0.0329   0.00103            1213   1213 1213
## age             0.0262 0.0274    0.0263          0.0287   1899 1899
## sex             0.0329 0.0344    0.0326          0.0357 0.0288 1899
## 
## P-values for Tests of Bivariate Normality:
##                        g   income skin_tone skin_tone_fixed      age
## g                                                                   
## income           4.7e-09                                            
## skin_tone       1.38e-23 1.39e-24                                   
## skin_tone_fixed 6.43e-10 1.83e-12   1.8e-74                         
## age             8.76e-19 4.33e-13  7.73e-39        7.59e-19         
## sex              0.00121 4.34e-07  4.76e-31        5.98e-13 1.45e-15

Modeling

#standardized versions
d97b = d97[c("g", "income", "age", "sex", "SPRE", "OPRE", "skin_tone", "skin_tone_fixed")] %>% df_standardize()

## Skipped sex because it is a factor.

## Skipped SPRE because it is a factor.

## Skipped OPRE because it is a factor.

#baseline
(m97_o = ols(income ~ OPRE + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
## income   OPRE    sex    age 
##   2644    619      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    6107    LR chi2    527.53    R2       0.083    
##  sigma0.9603    d.f.           11    R2 adj   0.081    
##  d.f.   6095    Pr(> chi2) 0.0000    g        0.323    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.9731 -0.6262 -0.1453  0.4231  5.4694 
##  
##  
##                    Coef    S.E.   t      Pr(>|t|)
##  Intercept         -0.0601 0.1657  -0.36 0.7167  
##  OPRE=Black        -0.3102 0.0297 -10.43 <0.0001 
##  OPRE=Other         0.0131 0.0437   0.30 0.7635  
##  sex=Male           0.0959 0.2295   0.42 0.6759  
##  age                0.1754 0.1276   1.37 0.1692  
##  age'              -0.0663 0.7690  -0.09 0.9313  
##  age''             -0.1555 2.3287  -0.07 0.9468  
##  age'''             0.6180 3.0195   0.20 0.8378  
##  sex=Male * age    -0.1152 0.1779  -0.65 0.5172  
##  sex=Male * age'    0.7719 1.0684   0.72 0.4700  
##  sex=Male * age''  -1.3901 3.2377  -0.43 0.6677  
##  sex=Male * age'''  0.1792 4.2103   0.04 0.9660  
##

(m97_s = ols(income ~ SPRE + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
## income   SPRE    sex    age 
##   2644      3      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ SPRE + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    6339    LR chi2    660.49    R2       0.099    
##  sigma0.9505    d.f.           16    R2 adj   0.097    
##  d.f.   6322    Pr(> chi2) 0.0000    g        0.353    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.1656 -0.6199 -0.1463  0.4257  5.4932 
##  
##  
##                        Coef    S.E.   t      Pr(>|t|)
##  Intercept             -0.0389 0.1616  -0.24 0.8097  
##  SPRE=American Indian  -0.5503 0.1838  -2.99 0.0028  
##  SPRE=Asian             0.5993 0.1057   5.67 <0.0001 
##  SPRE=Black            -0.3910 0.0311 -12.57 <0.0001 
##  SPRE=Hispanic         -0.2174 0.0315  -6.90 <0.0001 
##  SPRE=Multi-racial     -0.1300 0.0518  -2.51 0.0121  
##  SPRE=Other             0.1745 0.2313   0.75 0.4505  
##  SPRE=Pacific Islander  0.0841 0.2248   0.37 0.7082  
##  sex=Male               0.1412 0.2235   0.63 0.5275  
##  age                    0.1490 0.1245   1.20 0.2313  
##  age'                   0.1515 0.7497   0.20 0.8399  
##  age''                 -0.8870 2.2698  -0.39 0.6960  
##  age'''                 1.6613 2.9393   0.57 0.5720  
##  sex=Male * age        -0.0728 0.1733  -0.42 0.6747  
##  sex=Male * age'        0.5485 1.0397   0.53 0.5979  
##  sex=Male * age''      -0.6997 3.1492  -0.22 0.8242  
##  sex=Male * age'''     -0.7779 4.0896  -0.19 0.8491  
##

(m97_t = ols(income ~ skin_tone + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
##    income skin_tone       sex       age 
##      2644      2261         0         0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ skin_tone + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    5309    LR chi2    516.63    R2       0.093    
##  sigma0.9445    d.f.           10    R2 adj   0.091    
##  d.f.   5298    Pr(> chi2) 0.0000    g        0.337    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.9432 -0.6172 -0.1546  0.4097  5.2349 
##  
##  
##                    Coef    S.E.   t      Pr(>|t|)
##  Intercept         -0.3169 0.1717  -1.85 0.0651  
##  skin_tone         -0.1587 0.0136 -11.67 <0.0001 
##  sex=Male           0.2863 0.2415   1.19 0.2359  
##  age                0.0630 0.1320   0.48 0.6333  
##  age'               0.7508 0.7991   0.94 0.3475  
##  age''             -2.8898 2.4232  -1.19 0.2331  
##  age'''             4.3846 3.1475   1.39 0.1637  
##  sex=Male * age     0.0252 0.1870   0.13 0.8927  
##  sex=Male * age'   -0.2372 1.1238  -0.21 0.8328  
##  sex=Male * age''   1.8661 3.4056   0.55 0.5838  
##  sex=Male * age''' -3.8985 4.4282  -0.88 0.3787  
##

(m97_t2 = ols(income ~ skin_tone_fixed + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
##          income skin_tone_fixed             sex             age 
##            2644            3353               0               0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ skin_tone_fixed + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    4474    LR chi2    461.59    R2       0.098    
##  sigma0.9297    d.f.           10    R2 adj   0.096    
##  d.f.   4463    Pr(> chi2) 0.0000    g        0.341    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.9438 -0.6185 -0.1476  0.4251  4.9118 
##  
##  
##                    Coef    S.E.   t      Pr(>|t|)
##  Intercept         -0.2990 0.1846  -1.62 0.1053  
##  skin_tone_fixed   -0.1477 0.0146 -10.13 <0.0001 
##  sex=Male           0.2820 0.2588   1.09 0.2758  
##  age                0.0841 0.1422   0.59 0.5543  
##  age'               0.6709 0.8609   0.78 0.4358  
##  age''             -2.6480 2.6100  -1.01 0.3104  
##  age'''             4.0041 3.3828   1.18 0.2366  
##  sex=Male * age     0.0128 0.2003   0.06 0.9491  
##  sex=Male * age'   -0.1249 1.2052  -0.10 0.9175  
##  sex=Male * age''   1.2538 3.6538   0.34 0.7315  
##  sex=Male * age''' -2.2842 4.7503  -0.48 0.6306  
##

#individually
(m97_og = ols(income ~ OPRE + g + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
## income   OPRE      g    sex    age 
##   2644    619   1588      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + g + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    5275    LR chi2    803.94    R2       0.141    
##  sigma0.9305    d.f.           12    R2 adj   0.139    
##  d.f.   5262    Pr(> chi2) 0.0000    g        0.424    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.7286 -0.5843 -0.1325  0.4261  5.1156 
##  
##  
##                    Coef    S.E.   t     Pr(>|t|)
##  Intercept         -0.1260 0.1700 -0.74 0.4586  
##  OPRE=Black        -0.1101 0.0328 -3.36 0.0008  
##  OPRE=Other         0.1436 0.0474  3.03 0.0025  
##  g                  0.2729 0.0142 19.18 <0.0001 
##  sex=Male           0.1319 0.2350  0.56 0.5746  
##  age                0.2127 0.1292  1.65 0.0996  
##  age'              -0.3770 0.7916 -0.48 0.6339  
##  age''              0.9963 2.4105  0.41 0.6794  
##  age'''            -1.1824 3.1518 -0.38 0.7076  
##  sex=Male * age    -0.0988 0.1802 -0.55 0.5835  
##  sex=Male * age'    1.0664 1.1014  0.97 0.3330  
##  sex=Male * age''  -2.7195 3.3590 -0.81 0.4182  
##  sex=Male * age'''  2.2870 4.4087  0.52 0.6040  
##

(m97_sg = ols(income ~ SPRE + g + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
## income   SPRE      g    sex    age 
##   2644      3   1588      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ SPRE + g + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    5419    LR chi2    885.80    R2       0.151    
##  sigma0.9265    d.f.           17    R2 adj   0.148    
##  d.f.   5401    Pr(> chi2) 0.0000    g        0.437    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.6934 -0.5908 -0.1312  0.4292  5.0587 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.1022 0.1675 -0.61 0.5417  
##  SPRE=American Indian  -0.4189 0.1794 -2.34 0.0196  
##  SPRE=Asian             0.6633 0.1122  5.91 <0.0001 
##  SPRE=Black            -0.1636 0.0350 -4.68 <0.0001 
##  SPRE=Hispanic         -0.0515 0.0353 -1.46 0.1449  
##  SPRE=Multi-racial     -0.0409 0.0555 -0.74 0.4611  
##  SPRE=Other             0.2438 0.2400  1.02 0.3098  
##  SPRE=Pacific Islander  0.2502 0.2401  1.04 0.2975  
##  g                      0.2575 0.0144 17.86 <0.0001 
##  sex=Male               0.1250 0.2311  0.54 0.5888  
##  age                    0.2057 0.1272  1.62 0.1059  
##  age'                  -0.3196 0.7793 -0.41 0.6817  
##  age''                  0.7657 2.3725  0.32 0.7469  
##  age'''                -0.7655 3.0993 -0.25 0.8049  
##  sex=Male * age        -0.0917 0.1772 -0.52 0.6046  
##  sex=Male * age'        1.0889 1.0829  1.01 0.3146  
##  sex=Male * age''      -2.7702 3.3021 -0.84 0.4016  
##  sex=Male * age'''      2.2499 4.3316  0.52 0.6035  
##

(m97_tg = ols(income ~ skin_tone + g + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
##    income skin_tone         g       sex       age 
##      2644      2261      1588         0         0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ skin_tone + g + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    4563    LR chi2    761.70    R2       0.154    
##  sigma0.9176    d.f.           11    R2 adj   0.152    
##  d.f.   4551    Pr(> chi2) 0.0000    g        0.439    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.7593 -0.5781 -0.1358  0.4239  4.8891 
##  
##  
##                    Coef    S.E.   t     Pr(>|t|)
##  Intercept         -0.2666 0.1775 -1.50 0.1331  
##  skin_tone         -0.0660 0.0151 -4.36 <0.0001 
##  g                  0.2749 0.0150 18.33 <0.0001 
##  sex=Male           0.2705 0.2485  1.09 0.2764  
##  age                0.1311 0.1347  0.97 0.3301  
##  age'               0.1694 0.8285  0.20 0.8380  
##  age''             -0.7888 2.5247 -0.31 0.7547  
##  age'''             1.1965 3.3045  0.36 0.7173  
##  sex=Male * age     0.0162 0.1901  0.09 0.9322  
##  sex=Male * age'    0.3709 1.1650  0.32 0.7502  
##  sex=Male * age''  -0.6341 3.5554 -0.18 0.8585  
##  sex=Male * age'''  0.0303 4.6710  0.01 0.9948  
##

#both
(m97_osg = ols(income ~ OPRE + SPRE + g + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
## income   OPRE   SPRE      g    sex    age 
##   2644    619      3   1588      0      0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + SPRE + g + sex * rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    5274    LR chi2    849.96    R2       0.149    
##  sigma0.9272    d.f.           19    R2 adj   0.146    
##  d.f.   5254    Pr(> chi2) 0.0000    g        0.434    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.6811 -0.5883 -0.1310  0.4240  5.0668 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.1095 0.1697 -0.65 0.5187  
##  OPRE=Black             0.0622 0.0831  0.75 0.4541  
##  OPRE=Other             0.1221 0.0557  2.19 0.0284  
##  SPRE=American Indian  -0.4778 0.1818 -2.63 0.0086  
##  SPRE=Asian             0.5700 0.1208  4.72 <0.0001 
##  SPRE=Black            -0.2177 0.0873 -2.50 0.0126  
##  SPRE=Hispanic         -0.0831 0.0395 -2.11 0.0352  
##  SPRE=Multi-racial     -0.0947 0.0630 -1.50 0.1328  
##  SPRE=Other             0.2351 0.2506  0.94 0.3482  
##  SPRE=Pacific Islander  0.1708 0.2431  0.70 0.4823  
##  g                      0.2571 0.0147 17.54 <0.0001 
##  sex=Male               0.1413 0.2343  0.60 0.5463  
##  age                    0.2027 0.1288  1.57 0.1154  
##  age'                  -0.3171 0.7892 -0.40 0.6878  
##  age''                  0.7999 2.4034  0.33 0.7393  
##  age'''                -0.9047 3.1425 -0.29 0.7735  
##  sex=Male * age        -0.0881 0.1796 -0.49 0.6240  
##  sex=Male * age'        0.9791 1.0979  0.89 0.3725  
##  sex=Male * age''      -2.3994 3.3486 -0.72 0.4737  
##  sex=Male * age'''      1.7553 4.3953  0.40 0.6896  
##

#full
(m97_osgt = ols(income ~ OPRE + SPRE + skin_tone + g + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
##    income      OPRE      SPRE skin_tone         g       sex       age 
##      2644       619         3      2261      1588         0         0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + SPRE + skin_tone + g + sex * rcs(age), 
##      data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    4456    LR chi2    790.12    R2       0.162    
##  sigma0.9144    d.f.           20    R2 adj   0.159    
##  d.f.   4435    Pr(> chi2) 0.0000    g        0.448    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.6993 -0.5734 -0.1322  0.4254  4.9041 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.2772 0.1808 -1.53 0.1252  
##  OPRE=Black             0.1283 0.0951  1.35 0.1773  
##  OPRE=Other             0.1323 0.0596  2.22 0.0265  
##  SPRE=American Indian  -0.4417 0.2135 -2.07 0.0386  
##  SPRE=Asian             0.6842 0.1277  5.36 <0.0001 
##  SPRE=Black            -0.1291 0.0969 -1.33 0.1829  
##  SPRE=Hispanic         -0.0228 0.0426 -0.53 0.5927  
##  SPRE=Multi-racial     -0.1152 0.0715 -1.61 0.1073  
##  SPRE=Other             0.4894 0.2801  1.75 0.0807  
##  SPRE=Pacific Islander  0.2793 0.2579  1.08 0.2790  
##  skin_tone             -0.0645 0.0245 -2.64 0.0084  
##  g                      0.2678 0.0157 17.08 <0.0001 
##  sex=Male               0.2715 0.2504  1.08 0.2783  
##  age                    0.1287 0.1356  0.95 0.3426  
##  age'                   0.1706 0.8355  0.20 0.8382  
##  age''                 -0.7857 2.5468 -0.31 0.7577  
##  age'''                 1.1675 3.3365  0.35 0.7264  
##  sex=Male * age         0.0148 0.1914  0.08 0.9385  
##  sex=Male * age'        0.2973 1.1743  0.25 0.8002  
##  sex=Male * age''      -0.2688 3.5856 -0.07 0.9402  
##  sex=Male * age'''     -0.7773 4.7153 -0.16 0.8691  
##

(m97_osgt2 = ols(income ~ OPRE + SPRE + skin_tone_fixed + g + sex * rcs(age), data = d97b))

## Frequencies of Missing Values Due to Each Variable
##          income            OPRE            SPRE skin_tone_fixed 
##            2644             619               3            3353 
##               g             sex             age 
##            1588               0               0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + SPRE + skin_tone_fixed + g + sex * 
##      rcs(age), data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    3739    LR chi2    729.59    R2       0.177    
##  sigma0.8980    d.f.           20    R2 adj   0.173    
##  d.f.   3718    Pr(> chi2) 0.0000    g        0.461    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.7556 -0.5658 -0.1234  0.4337  4.7877 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.2721 0.1938 -1.40 0.1605  
##  OPRE=Black             0.1170 0.0975  1.20 0.2301  
##  OPRE=Other             0.1493 0.0638  2.34 0.0193  
##  SPRE=American Indian  -0.4578 0.2157 -2.12 0.0338  
##  SPRE=Asian             0.8004 0.1351  5.92 <0.0001 
##  SPRE=Black            -0.1316 0.1003 -1.31 0.1895  
##  SPRE=Hispanic         -0.0373 0.0457 -0.82 0.4141  
##  SPRE=Multi-racial     -0.1628 0.0760 -2.14 0.0322  
##  SPRE=Other             0.5548 0.2884  1.92 0.0545  
##  SPRE=Pacific Islander  0.2694 0.2639  1.02 0.3074  
##  skin_tone_fixed       -0.0448 0.0256 -1.75 0.0799  
##  g                      0.2698 0.0169 16.01 <0.0001 
##  sex=Male               0.2669 0.2680  1.00 0.3195  
##  age                    0.1315 0.1459  0.90 0.3675  
##  age'                   0.2109 0.8997  0.23 0.8147  
##  age''                 -0.9227 2.7446 -0.34 0.7367  
##  age'''                 1.2781 3.5925  0.36 0.7220  
##  sex=Male * age         0.0096 0.2048  0.05 0.9628  
##  sex=Male * age'        0.3942 1.2593  0.31 0.7543  
##  sex=Male * age''      -0.7546 3.8484 -0.20 0.8446  
##  sex=Male * age'''      0.4842 5.0617  0.10 0.9238  
##

#relative importance
#use linear age bc eta² does not support nonlinear
(m97_osgt3 = ols(income ~ OPRE + SPRE + skin_tone + g + sex * age, data = d97b))

## Frequencies of Missing Values Due to Each Variable
##    income      OPRE      SPRE skin_tone         g       sex       age 
##      2644       619         3      2261      1588         0         0 
## 
## Linear Regression Model
##  
##  ols(formula = income ~ OPRE + SPRE + skin_tone + g + sex * age, 
##      data = d97b)
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    4456    LR chi2    787.66    R2       0.162    
##  sigma0.9141    d.f.           14    R2 adj   0.159    
##  d.f.   4441    Pr(> chi2) 0.0000    g        0.447    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.7097 -0.5697 -0.1332  0.4243  4.9126 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.2868 0.0300 -9.56 <0.0001 
##  OPRE=Black             0.1269 0.0950  1.34 0.1817  
##  OPRE=Other             0.1324 0.0596  2.22 0.0263  
##  SPRE=American Indian  -0.4483 0.2133 -2.10 0.0356  
##  SPRE=Asian             0.6809 0.1276  5.34 <0.0001 
##  SPRE=Black            -0.1291 0.0968 -1.33 0.1825  
##  SPRE=Hispanic         -0.0231 0.0426 -0.54 0.5876  
##  SPRE=Multi-racial     -0.1155 0.0715 -1.62 0.1061  
##  SPRE=Other             0.4919 0.2800  1.76 0.0790  
##  SPRE=Pacific Islander  0.2788 0.2578  1.08 0.2795  
##  skin_tone             -0.0639 0.0244 -2.61 0.0090  
##  g                      0.2686 0.0157 17.16 <0.0001 
##  sex=Male               0.4010 0.0276 14.53 <0.0001 
##  age                    0.1062 0.0195  5.46 <0.0001 
##  sex=Male * age         0.1282 0.0272  4.72 <0.0001 
##

m97_osgt3 %>% aov() %>% lsr::etaSquared() %>% sqrt()

##           eta.sq eta.sq.part
## OPRE       0.033       0.036
## SPRE       0.092       0.100
## skin_tone  0.036       0.039
## g          0.236       0.249
## sex        0.197       0.210
## age        0.173       0.186
## sex:age    0.065       0.071

Joint

g x income by age combined

bind_rows("NLSY97" = d97_age_iq_inc,
          "NLSY79" = d79_age_iq_inc,
          .id = "data") %>% 
  ggplot(aes(age, r, color = data)) +
  geom_path() +
  theme_bw() +
  scale_y_continuous("r g x income") +
  ggtitle("Income and IQ by age in NLSY's")

GG_save("figs/age_iq_inc.png")

Meritocracy, not racial discrimination, explains the racial income gaps in the United States

a joint analysis of NLSY79 and NLSY97

Emil O. W. Kirkegaard

What is this

Start

Ad hoc

NLSY79

Data

Recode

Recode income

Correlations

Models

NLSY97

Data

Recode

Recode income

Results

Correlations

Modeling

Joint

g x income by age combined