This is the R notebook for the titled study. It features the analysis code (except parts which were generated by the NLSY explorer) and the results. This serves multiple purposes including: 1) anyone can review the analysis code for errors and researcher degrees of freedom, 2) anyone can reuse the whole or parts of the code for any purpose without seeking approval, 3) anyone can gather additional calculated statistics that were not put in the written paper.
options(digits = 2)
library(pacman)
p_load(kirkegaard, readr, haven, polycor, rms)
A few ad hoc functions.
#psych::describe() returns an object with a custom class, which breaks
#R notebooks; this wrapper resets the class to a plain data.frame
describe <- function(...) {
  stats_tbl <- psych::describe(...)
  class(stats_tbl) <- "data.frame"
  stats_tbl
}
Below follows the analysis code for the NLSY79 analyses.
#run the NLSY-supplied recoding script (with minor changes) and take over
#the `new_data` object under a study-specific name
source("data/NLSY79/nlsy79.R")
d79 <- new_data
rm(new_data)
#lookup table pairing each variable name with its label
d79_var_table <- data_frame(var = names(d79), label = varlabels)
#screener-perceived race: codes 1-3 become Hispanic/Black/White,
#with White as the reference level
d79$OPRE1 <- fct_relevel(
  factor(plyr::mapvalues(d79$R0214700, from = 1:3, to = c("Hispanic", "Black", "White"))),
  "White"
)
d79$OPRE1 %>% table2()
#interviewer-perceived race: codes 1-3 become White/Black/Other
d79$OPRE2 <- fct_relevel(
  factor(plyr::mapvalues(d79$R0172700, from = 1:3, to = c("White", "Black", "Other"))),
  "White"
)
d79$OPRE2 %>% table2()
#cross-tabulation of the two ratings
table(d79$OPRE1, d79$OPRE2)
##
## White Black Other
## White 7260 38 161
## Black 48 3094 17
## Hispanic 1428 31 533
#the screener rating (OPRE1) serves as the other-perceived measure
d79$OPRE <- d79$OPRE1
#self-perceived race in 2002: one logical indicator column per category
SPRE2002 <- d79 %>%
  df_subset_by_pattern("^R709310") %>%
  set_colnames(c("White", "Black", "Asian", "Pacific Islander", "American Indian", "Other", "Hispanic")) %>%
  map_df(as.logical)
#collapse the indicator columns into a single category per respondent
d79$SPRE <- plyr::alply(SPRE2002, .margins = 1, function(row_flags) {
  flags <- unlist(row_flags)
  #no answer on any indicator: code as missing
  if (all(is.na(flags))) return(NA)
  #partially missing indicators count as not selected
  flags[is.na(flags)] <- FALSE
  #Hispanic takes precedence over all other categories
  if (flags[["Hispanic"]]) return("Hispanic")
  #exactly one of the six non-Hispanic categories: use its name
  if (sum(flags[1:6]) == 1) return(names(which(flags)))
  #everything else is labelled multi-racial
  #NOTE(review): rows with no category selected at all also end up here -- confirm this is intended
  "Multi-racial"
}) %>%
  unlist() %>%
  factor() %>%
  fct_relevel("White")
table2(d79$SPRE)
#compare: row-wise proportions of other-perceived race within each
#self-perceived category, passed on to write_clipboard()
table(d79$SPRE, d79$OPRE) %>%
  prop.table(margin = 1) %>%
  unclass() %>%
  write_clipboard()
## White Black Hispanic
## White 0.85 0.01 0.14
## American Indian 0.76 0.07 0.17
## Asian 0.65 0.17 0.17
## Black 0.00 0.99 0.01
## Hispanic 0.01 0.01 0.98
## Multi-racial 0.37 0.43 0.21
## Other 0.08 0.04 0.87
## Pacific Islander 0.40 0.10 0.50
#income vars: columns whose label starts with "TOT INC" or contains "AMT",
#each passed through standardize()
d79_income_vars <- map_df(
  d79[which(str_detect(d79_var_table$label, "^(TOT INC)|AMT"))],
  standardize
)
#years: first run of digits in each of those labels;
#two-digit years get a "19" prefix
income_vars_years <- d79_var_table %>%
  filter(str_detect(label, "^(TOT INC)|AMT")) %>%
  pull(label) %>%
  str_match("\\d+") %>%
  as.vector() %>%
  map_chr(function(yr) {
    if (str_length(yr) == 2) paste0("19", yr) else yr
  }) %>%
  as.numeric()
#IQ: AFQT value divided by 100, then mapped through the normal quantile function
d79$AFQT <- qnorm(d79$R0618200 / 100)
d79$g <- d79$AFQT #synonym
#sex: 1 = Male, 2 = Female
d79$sex <- factor(plyr::mapvalues(d79$R0214800, from = 1:2, to = c("Male", "Female")))
#age in 1979
d79$age_1979 <- d79$R0000600
#birth year
d79$born <- 1979 - d79$age_1979
#age in the latest year covered by the income variables
d79$age <- max(income_vars_years) - d79$born
#correlations among the income variables, summarised
describe(MAT_half(wtd.cors(d79_income_vars)))
#missing data?
miss_plot(d79_income_vars)
#cases with at most 17 missing values are the ones to impute
#what share of cases is that?
mean(miss_by_case(d79_income_vars) <= 17)
## [1] 0.86
#imputation is slow, so the result is cached on disk:
#compute and save it only when no cached file exists, otherwise read the cache
imp_inc_filename <- "data/NLSY79/imp_inc.rds"
if (!file.exists(imp_inc_filename)) {
  d79_income_vars_imp <- miss_impute(d79_income_vars, max_na = 17)
  write_rds(d79_income_vars_imp, imp_inc_filename)
} else {
  d79_income_vars_imp <- read_rds(imp_inc_filename)
}
#average in multiple ways
#simple row mean over whichever income variables are available
d79$income_simple <- rowMeans(d79_income_vars, na.rm = TRUE)
#log version; the column name is spelled out in full because a bare
#`d79$income` would rely on `$` partial matching and would silently switch to
#the `income` column once that column is defined further down
d79$income_simple_log <- log10(d79$income_simple + 1)
## Warning: NaNs produced
#row mean of the imputed income variables and its log version
d79$income_imp <- rowMeans(d79_income_vars_imp)
d79$income_imp_log <- log10(d79$income_imp + 1)
#plot the simple mean
GG_denhist(d79$income_simple)
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## Warning: Removed 32 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 32 rows containing non-finite values (stat_bin).
## Warning: Removed 32 rows containing non-finite values (stat_density).
#plot the log version of the simple mean
GG_denhist(d79$income_simple_log)
## Warning: Removed 37 rows containing non-finite values (stat_bin).
## Warning: Removed 37 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 37 rows containing non-finite values (stat_bin).
## Warning: Removed 37 rows containing non-finite values (stat_density).
#plot the mean of the imputed income variables
GG_denhist(d79$income_imp)
## Warning: Removed 1726 rows containing non-finite values (stat_bin).
## Warning: Removed 1726 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1726 rows containing non-finite values (stat_bin).
## Warning: Removed 1726 rows containing non-finite values (stat_density).
#plot the log version of the imputed mean
GG_denhist(d79$income_imp_log)
## Warning: Removed 1726 rows containing non-finite values (stat_bin).
## Warning: Removed 1726 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1726 rows containing non-finite values (stat_bin).
## Warning: Removed 1726 rows containing non-finite values (stat_density).
#heterogeneous correlations between g, the four income versions, age and sex
hetcor(
  d79[c(
    "g", "income_simple", "income_simple_log", "income_imp",
    "income_imp_log", "age", "sex"
  )],
  use = "pairwise"
)
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g income_simple income_simple_log income_imp
## g 1 Pearson Pearson Pearson
## income_simple 0.405 1 Pearson Pearson
## income_simple_log 0.407 0.878 1 Pearson
## income_imp 0.467 0.985 0.856 1
## income_imp_log 0.482 0.894 0.977 0.898
## age 0.216 0.166 0.127 0.194
## sex -0.0114 0.397 0.356 0.421
## income_imp_log age sex
## g Pearson Pearson Polyserial
## income_simple Pearson Pearson Polyserial
## income_simple_log Pearson Pearson Polyserial
## income_imp Pearson Pearson Polyserial
## income_imp_log 1 Pearson Polyserial
## age 0.147 1 Polyserial
## sex 0.392 -0.0104 1
##
## Standard Errors/Numbers of Observations:
## g income_simple income_simple_log income_imp
## g 11878 11868 11865 10430
## income_simple 0.00767 12654 12649 10960
## income_simple_log 0.00766 0.00203 12649 10960
## income_imp 0.00766 0.000278 0.00255 10960
## income_imp_log 0.00751 0.00192 0.000437 0.00185
## age 0.00875 0.00865 0.00875 0.00919
## sex 0.0115 0.0101 0.00981 0.0107
## income_imp_log age sex
## g 10430 11878 11878
## income_simple 10960 12654 12654
## income_simple_log 10960 12649 12649
## income_imp 10960 10960 10960
## income_imp_log 10960 10960 10960
## age 0.00935 12686 12686
## sex 0.0102 0.0111 12686
##
## P-values for Tests of Bivariate Normality:
## g income_simple income_simple_log income_imp
## g
## income_simple 3.4e-74
## income_simple_log 8.55e-81 0
## income_imp 9.85e-69 5.77e-169 0
## income_imp_log 3.69e-71 0 2.28e-269 0
## age 6.54e-294 0 0 0
## sex 3.14e-11 6.93e-82 1.88e-71 1.64e-76
## income_imp_log age
## g
## income_simple
## income_simple_log
## income_imp
## income_imp_log
## age 7.54999999999904e-313
## sex 4.8e-42 7.21999999999999e-310
#the imputed mean is the income measure used from here on
d79$income <- d79$income_imp
#income x IQ by age: one row per income variable
#the loop runs over every entry of income_vars_years (which lines up with the
#columns of d79_income_vars_imp by position); looping over the unique years
#instead would stop early whenever two variables share a year
d79_age_iq_inc <- map_df(seq_along(income_vars_years), function(i) {
  tibble(
    #year of the variable minus the mean birth year
    age = income_vars_years[i] - mean(d79$born),
    #correlation between that income variable and AFQT
    r = as.vector(wtd.cors(d79_income_vars_imp[[i]], d79$AFQT))
  )
})
d79_age_iq_inc
#line plot of the correlation against age, then save the figure
ggplot(d79_age_iq_inc, mapping = aes(x = age, y = r)) +
  geom_path() +
  scale_y_continuous(name = "r g x income") +
  ggtitle("Income and IQ by age in NLSY79") +
  theme_bw()
GG_save("figs/79_age_iq_inc.png")
#correlations among self-identified Whites
d79 %>%
  filter(SPRE == "White") %>%
  .[c("g", "income", "age", "sex")] %>%
  hetcor(use = "pairwise")
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g income age sex
## g 1 Pearson Pearson Polyserial
## income 0.405 1 Pearson Polyserial
## age 0.183 0.194 1 Polyserial
## sex 0.0205 0.525 -0.0451 1
##
## Standard Errors/Numbers of Observations:
## g income age sex
## g 4265 4254 4265 4265
## income 0.0128 4433 4433 4433
## age 0.0148 0.0145 4450 4450
## sex 0.0192 0.015 0.0187 4450
##
## P-values for Tests of Bivariate Normality:
## g income age
## g
## income 3.47e-37
## age 8.75e-101 1.99e-140
## sex 0.00427 1.58e-41 2.45e-103
#correlations among self-identified Blacks
d79 %>%
  filter(SPRE == "Black") %>%
  .[c("g", "income", "age", "sex")] %>%
  hetcor(use = "pairwise")
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g income age sex
## g 1 Pearson Pearson Polyserial
## income 0.452 1 Pearson Polyserial
## age 0.117 0.173 1 Polyserial
## sex -0.0623 0.284 -0.0152 1
##
## Standard Errors/Numbers of Observations:
## g income age sex
## g 2204 2196 2204 2204
## income 0.017 2278 2278 2278
## age 0.021 0.0203 2287 2287
## sex 0.0266 0.0253 0.0262 2287
##
## P-values for Tests of Bivariate Normality:
## g income age
## g
## income 9.64e-07
## age 4.57e-50 1.71e-59
## sex 0.0225 9.84e-12 3.71e-50
#correlations among self-identified Hispanics
d79 %>%
  filter(SPRE == "Hispanic") %>%
  .[c("g", "income", "age", "sex")] %>%
  hetcor(use = "pairwise")
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g income age sex
## g 1 Pearson Pearson Polyserial
## income 0.495 1 Pearson Polyserial
## age 0.0306 0.182 1 Polyserial
## sex 0.107 0.494 -0.00874 1
##
## Standard Errors/Numbers of Observations:
## g income age sex
## g 464 464 464 464
## income 0.0351 492 492 492
## age 0.0464 0.0436 492 492
## sex 0.0579 0.0469 0.0566 492
##
## P-values for Tests of Bivariate Normality:
## g income age
## g
## income 0.12
## age 2.93e-06 2.07e-06
## sex 0.103 0.0436 1.7e-06
#modelling subset, passed through df_standardize()
d79b <- df_standardize(d79[c("g", "income", "OPRE", "SPRE", "sex", "age")])
## Skipped OPRE because it is a factor.
## Skipped SPRE because it is a factor.
## Skipped sex because it is a factor.
#baseline: other-perceived race, with sex interacting with a spline of age
(m79_o <- ols(income ~ OPRE + sex * rcs(age), data = d79b))
## Frequencies of Missing Values Due to Each Variable
## income OPRE sex age
## 1726 0 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + sex * rcs(age), data = d79b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 10960 LR chi2 2340.40 R2 0.192
## sigma0.8992 d.f. 11 R2 adj 0.191
## d.f. 10948 Pr(> chi2) 0.0000 g 0.496
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.2268 -0.6045 -0.1420 0.4488 5.8321
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.0480 0.1161 -0.41 0.6794
## OPRE=Black -0.4751 0.0203 -23.42 <0.0001
## OPRE=Hispanic -0.2462 0.0238 -10.35 <0.0001
## sex=Male 0.5394 0.1617 3.34 0.0009
## age 0.1853 0.0866 2.14 0.0323
## age' -0.2728 0.3670 -0.74 0.4574
## age'' 0.7213 1.1279 0.64 0.5225
## age''' -0.6669 1.6536 -0.40 0.6867
## sex=Male * age 0.0640 0.1204 0.53 0.5947
## sex=Male * age' 0.0943 0.5140 0.18 0.8544
## sex=Male * age'' 0.6085 1.5840 0.38 0.7009
## sex=Male * age''' -3.0526 2.3315 -1.31 0.1905
##
#baseline with self-perceived race instead
(m79_s <- ols(income ~ SPRE + sex * rcs(age), data = d79b))
## Frequencies of Missing Values Due to Each Variable
## income SPRE sex age
## 1726 4968 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ SPRE + sex * rcs(age), data = d79b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7692 LR chi2 1880.79 R2 0.217
## sigma0.9014 d.f. 16 R2 adj 0.215
## d.f. 7675 Pr(> chi2) 0.0000 g 0.537
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.2966 -0.5930 -0.1405 0.4334 5.6981
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.0702 0.1342 -0.52 0.6010
## SPRE=American Indian -0.4225 0.1337 -3.16 0.0016
## SPRE=Asian 0.1947 0.1885 1.03 0.3016
## SPRE=Black -0.5653 0.0233 -24.31 <0.0001
## SPRE=Hispanic -0.3816 0.0428 -8.91 <0.0001
## SPRE=Multi-racial -0.2267 0.0908 -2.50 0.0125
## SPRE=Other -0.3867 0.0531 -7.29 <0.0001
## SPRE=Pacific Islander 0.0256 0.2856 0.09 0.9287
## sex=Male 0.7249 0.1892 3.83 0.0001
## age 0.1232 0.0998 1.23 0.2173
## age' -0.0057 0.4276 -0.01 0.9893
## age'' -0.1353 1.3240 -0.10 0.9186
## age''' 0.5820 1.9707 0.30 0.7678
## sex=Male * age 0.1794 0.1402 1.28 0.2005
## sex=Male * age' -0.3695 0.6047 -0.61 0.5412
## sex=Male * age'' 2.0449 1.8757 1.09 0.2757
## sex=Male * age''' -5.2242 2.7970 -1.87 0.0618
##
#each race measure individually, now with g added: other-perceived race
(m79_og <- ols(income ~ OPRE + g + sex * rcs(age), data = d79b))
## Frequencies of Missing Values Due to Each Variable
## income OPRE g sex age
## 1726 0 808 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + g + sex * rcs(age), data = d79b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 10430 LR chi2 4426.94 R2 0.346
## sigma0.8112 d.f. 12 R2 adj 0.345
## d.f. 10417 Pr(> chi2) 0.0000 g 0.664
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.81798 -0.50481 -0.08837 0.39704 5.60762
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2848 0.1065 -2.67 0.0075
## OPRE=Black -0.0312 0.0207 -1.50 0.1327
## OPRE=Hispanic 0.0960 0.0232 4.14 <0.0001
## g 0.4442 0.0090 49.09 <0.0001
## sex=Male 0.4996 0.1483 3.37 0.0008
## age 0.0730 0.0793 0.92 0.3571
## age' 0.0358 0.3371 0.11 0.9154
## age'' -0.2496 1.0383 -0.24 0.8101
## age''' 0.6567 1.5294 0.43 0.6677
## sex=Male * age 0.0024 0.1103 0.02 0.9827
## sex=Male * age' 0.2857 0.4729 0.60 0.5458
## sex=Male * age'' -0.0991 1.4620 -0.07 0.9459
## sex=Male * age''' -1.8085 2.1646 -0.84 0.4035
##
#self-perceived race with g added
(m79_sg <- ols(income ~ SPRE + g + sex * rcs(age), data = d79b))
## Frequencies of Missing Values Due to Each Variable
## income SPRE g sex age
## 1726 4968 808 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ SPRE + g + sex * rcs(age), data = d79b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7372 LR chi2 3305.26 R2 0.361
## sigma0.8165 d.f. 17 R2 adj 0.360
## d.f. 7354 Pr(> chi2) 0.0000 g 0.693
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.93823 -0.50917 -0.08716 0.39576 5.51322
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2459 0.1231 -2.00 0.0458
## SPRE=American Indian -0.2248 0.1268 -1.77 0.0762
## SPRE=Asian 0.0956 0.1786 0.53 0.5927
## SPRE=Black -0.1195 0.0241 -4.95 <0.0001
## SPRE=Hispanic 0.0061 0.0410 0.15 0.8812
## SPRE=Multi-racial -0.1018 0.0839 -1.21 0.2253
## SPRE=Other -0.0046 0.0506 -0.09 0.9282
## SPRE=Pacific Islander 0.0504 0.2587 0.19 0.8455
## g 0.4424 0.0109 40.53 <0.0001
## sex=Male 0.5552 0.1739 3.19 0.0014
## age 0.0561 0.0915 0.61 0.5396
## age' 0.1225 0.3926 0.31 0.7551
## age'' -0.5178 1.2177 -0.43 0.6707
## age''' 1.0541 1.8199 0.58 0.5625
## sex=Male * age 0.0290 0.1287 0.23 0.8216
## sex=Male * age' 0.1549 0.5571 0.28 0.7811
## sex=Male * age'' 0.4825 1.7328 0.28 0.7807
## sex=Male * age''' -3.2087 2.5974 -1.24 0.2167
##
#both race measures together with g
(m79_osg <- ols(income ~ OPRE + SPRE + g + sex * rcs(age), data = d79b))
## Frequencies of Missing Values Due to Each Variable
## income OPRE SPRE g sex age
## 1726 0 4968 808 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + SPRE + g + sex * rcs(age), data = d79b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7372 LR chi2 3316.07 R2 0.362
## sigma0.8160 d.f. 19 R2 adj 0.361
## d.f. 7352 Pr(> chi2) 0.0000 g 0.693
##
## Residuals
##
## Min 1Q Median 3Q Max
## -3.01786 -0.50719 -0.08702 0.38881 5.52638
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2753 0.1233 -2.23 0.0256
## OPRE=Black 0.1391 0.0798 1.74 0.0813
## OPRE=Hispanic 0.1069 0.0348 3.07 0.0021
## SPRE=American Indian -0.2362 0.1268 -1.86 0.0625
## SPRE=Asian 0.0695 0.1791 0.39 0.6979
## SPRE=Black -0.2340 0.0803 -2.92 0.0036
## SPRE=Hispanic -0.0771 0.0493 -1.57 0.1176
## SPRE=Multi-racial -0.1643 0.0902 -1.82 0.0686
## SPRE=Other -0.0820 0.0561 -1.46 0.1440
## SPRE=Pacific Islander -0.0003 0.2590 0.00 0.9990
## g 0.4500 0.0112 40.33 <0.0001
## sex=Male 0.5686 0.1738 3.27 0.0011
## age 0.0481 0.0915 0.53 0.5987
## age' 0.1554 0.3925 0.40 0.6921
## age'' -0.6078 1.2173 -0.50 0.6176
## age''' 1.1366 1.8190 0.62 0.5321
## sex=Male * age 0.0370 0.1287 0.29 0.7738
## sex=Male * age' 0.1147 0.5570 0.21 0.8368
## sex=Male * age'' 0.5853 1.7321 0.34 0.7354
## sex=Male * age''' -3.2756 2.5959 -1.26 0.2070
##
#relative importance
#age enters linearly here because eta² does not support the nonlinear term
(m79_osg2 <- ols(income ~ OPRE + SPRE + g + sex * age, data = d79b))
## Frequencies of Missing Values Due to Each Variable
## income OPRE SPRE g sex age
## 1726 0 4968 808 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + SPRE + g + sex * age, data = d79b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 7372 LR chi2 3304.72 R2 0.361
## sigma0.8163 d.f. 13 R2 adj 0.360
## d.f. 7358 Pr(> chi2) 0.0000 g 0.693
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.94757 -0.50962 -0.08683 0.38861 5.51652
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2434 0.0171 -14.24 <0.0001
## OPRE=Black 0.1406 0.0798 1.76 0.0781
## OPRE=Hispanic 0.1086 0.0348 3.12 0.0018
## SPRE=American Indian -0.2429 0.1268 -1.92 0.0553
## SPRE=Asian 0.0710 0.1792 0.40 0.6918
## SPRE=Black -0.2368 0.0802 -2.95 0.0032
## SPRE=Hispanic -0.0785 0.0493 -1.59 0.1111
## SPRE=Multi-racial -0.1704 0.0902 -1.89 0.0590
## SPRE=Other -0.0852 0.0561 -1.52 0.1290
## SPRE=Pacific Islander 0.0042 0.2589 0.02 0.9871
## g 0.4497 0.0112 40.32 <0.0001
## sex=Male 0.6998 0.0193 36.30 <0.0001
## age 0.0677 0.0137 4.93 <0.0001
## sex=Male * age 0.1455 0.0195 7.47 <0.0001
##
#effect sizes for each term of the linear-age model
#the pipeline ends in sqrt(), so the values printed under eta.sq and
#eta.sq.part are square roots of the eta squared statistics
m79_osg2 %>% aov() %>% lsr::etaSquared() %>% sqrt()
## eta.sq eta.sq.part
## OPRE 0.031 0.039
## SPRE 0.036 0.045
## g 0.376 0.425
## sex 0.331 0.383
## age 0.132 0.163
## sex:age 0.070 0.087
#run the NLSY R file to load the data
source("data/NLSY97/nlsy97_race_income.R")
#take over `new_data` under a study-specific name
d97 <- new_data
rm(new_data)
#lookup table pairing each variable name with its label
d97_var_table <- data_frame(var = names(d97), label = varlabels)
##AGE
#birth year
d97$born <- d97$R0536402
GG_denhist(d97, "born")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#age in years: days between the birth month/year and June 2013
#(2013 being the last year of data), divided by 365.24
d97$age <- as.numeric(
  parse_date("06 2013", "%m %Y") -
    parse_date(sprintf("%d %d", d97$R0536401, d97$R0536402), format = "%m %Y")
) / 365.24
GG_denhist(d97, "age")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##SEX
#code 1 is Male, any other non-missing code is Female
d97$sex <- factor(if_else(d97$R0536300 == 1, true = "Male", false = "Female"))
##IQ
#AFQT: winsorise to recode the 0 and 100 centiles, convert to a fraction,
#then to a Z score
d97$AFQT <- qnorm(winsorise(d97$R9829600, 99999, 1) / 100000)
#plot
GG_denhist(d97, "AFQT")
## Warning: Removed 1891 rows containing non-finite values (stat_bin).
## Warning: Removed 1891 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1891 rows containing non-finite values (stat_bin).
## Warning: Removed 1891 rows containing non-finite values (stat_density).
#PIAT
#multiple scores: every variable whose label starts with "CV_PIAT"
PIAT_data <- d97[d97_var_table$var[which(str_detect(d97_var_table$label, "^CV_PIAT"))]]
#z score
PIAT_data <- df_standardize(PIAT_data)
#cors: print the matrix, then summarise it via MAT_half() and describe()
PIAT_data %>%
  wtd.cors() %T>%
  print() %>%
  MAT_half() %>%
  describe()
## R5473700 R7237400 S1552700
## R5473700 1.00 0.76 0.78
## R7237400 0.76 1.00 0.77
## S1552700 0.78 0.77 1.00
#a plain row mean over the available PIAT scores will do
d97$PIAT <- rowMeans(PIAT_data, na.rm = TRUE)
#plot
GG_denhist(d97, "PIAT")
## Warning: Removed 7335 rows containing non-finite values (stat_bin).
## Warning: Removed 7335 rows containing non-finite values (stat_density).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 7335 rows containing non-finite values (stat_bin).
## Warning: Removed 7335 rows containing non-finite values (stat_density).
#SAT and ACT for comparison
d97$SAT_verbal <- d97$Z9033900
d97$SAT_math <- d97$Z9033700
#SAT: mean of whichever of the two section scores are available
d97$SAT <- rowMeans(d97[c("SAT_verbal", "SAT_math")], na.rm = TRUE)
d97$ACT <- d97$Z9034100
#g factor: mean of AFQT and PIAT, using whichever is available
d97$g <- rowMeans(d97[c("AFQT", "PIAT")], na.rm = TRUE)
##RACE
#self-perceived dummy version, 2002: six race indicators plus the Hispanic one
SPRE_data <- d97[c(paste0("S122490", 0:5), "R0538600")] %>%
  set_colnames(c("White", "Black", "Asian", "Pacific Islander", "American Indian", "Other", "Hispanic")) %>%
  df_colFunc(as.logical)
#standard coding: one category per respondent
d97$SPRE <- plyr::aaply(SPRE_data, .margins = 1, .expand = FALSE, function(row_flags) {
  flags <- unlist(row_flags)
  #no answer on any indicator: code as missing
  if (all(is.na(flags))) return(NA)
  #otherwise treat missing indicators as not selected
  flags[is.na(flags)] <- FALSE
  #Hispanic takes precedence
  if (flags[["Hispanic"]]) return("Hispanic")
  #exactly one category selected (Hispanic is FALSE at this point)
  if (sum(flags) == 1) return(names(which(flags)))
  #everything else is labelled multi-racial
  #NOTE(review): rows with no category selected at all also end up here -- confirm this is intended
  "Multi-racial"
}) %>%
  factor() %>%
  fct_relevel("White")
#table
table2(d97$SPRE)
#other-perceived: codes 1-3 become White/Black/Other, White as reference level
d97$OPRE <- fct_relevel(
  factor(plyr::mapvalues(d97$R2395200, from = 1:3, to = c("White", "Black", "Other"))),
  "White"
)
table2(d97$OPRE)
#skin tone ratings
skin_data <- d97[c("T3173000", "T4584700", "T6217800")]
#regress out the interviewer effect on the ratings
#interviewer data: codes 1-6 recoded to labels
interviewer_race_data <- df_colFunc(
  d97[c("T2023300", "T3614100", "T5214000")],
  function(codes) {
    plyr::mapvalues(codes, 1:6, c("White", "Black", "American Indian", "Asian", "Other", "Multiracial"), warn_missing = FALSE)
  }
)
#per rating: regress on the interviewer variable and keep the standardized
#residuals (na.exclude keeps them aligned with the input rows)
skin_data_fixed <- map2_df(skin_data, interviewer_race_data, function(rating, rater) {
  tmp <- set_colnames(data.frame(rating, rater), c("skin_tone", "interviewer"))
  fit <- lm(skin_tone ~ interviewer, data = tmp, na.action = na.exclude)
  standardize(resid(fit))
})
#use mean; no repeated measures
#row means of the raw ratings and of the interviewer-adjusted ratings
d97$skin_tone <- rowMeans(skin_data, na.rm = TRUE)
d97$skin_tone_fixed <- rowMeans(skin_data_fixed, na.rm = TRUE)
##INCOME
#income variables: every variable whose label starts with "TTL"
d97_income_data <- d97[d97_var_table$var[which(str_detect(d97_var_table$label, "^TTL"))]]
#years: the four digits after "PAST YR " in the labels; used as column names
d97_income_years <- as.numeric(
  na.omit(str_match(d97_var_table$label, "PAST YR (\\d{4})")[, 2])
)
colnames(d97_income_data) <- paste0("income_", d97_income_years)
#z score
d97_income_data_z <- df_standardize(d97_income_data)
#log
d97_income_log_data <- df_colFunc(d97_income_data, function(x) log10(x + 1))
#cors, raw and log
describe(MAT_half(wtd.cors(d97_income_data)))
describe(MAT_half(wtd.cors(d97_income_log_data)))
#missing data
miss_plot(d97_income_data)
#impute
#the result is cached on disk to save time: compute and save it only when
#no cached file exists, otherwise read the cache
d97_imp_inc_filename <- "data/NLSY97/imp_income.rds"
if (!file.exists(d97_imp_inc_filename)) {
  d97_income_data_imputed <- miss_impute(d97_income_data, max_na = 10)
  write_rds(d97_income_data_imputed, d97_imp_inc_filename)
} else {
  d97_income_data_imputed <- read_rds(d97_imp_inc_filename)
}
#average all data in 4 ways
#simple mean of the available years: higher N, but some bias due to
#selective missing data
d97$income_simple <- rowMeans(d97_income_data, na.rm = TRUE)
#log version
# d97$income_simple_log = rowMeans(d97_income_log_data, na.rm = T)
d97$income_simple_log <- log10(d97$income_simple + 1)
#average of imputed, and its log version
d97$income_imp <- rowMeans(d97_income_data_imputed)
d97$income_imp_log <- log10(d97$income_imp + 1)
#income x IQ by age: one row per income year, using the g composite
d97_age_iq_inc <- map_df(seq_along(d97_income_years), function(i) {
  data_frame(
    age = d97_income_years[i] - mean(d97$born),
    r = as.vector(wtd.cors(d97_income_data_imputed[[i]], d97$g))
  )
})
d97_age_iq_inc
#AFQT only
d97_age_iq_inc_afqt <- map_df(seq_along(d97_income_years), function(i) {
  data_frame(
    age = d97_income_years[i] - mean(d97$born),
    r = as.vector(wtd.cors(d97_income_data_imputed[[i]], d97$AFQT))
  )
})
#Whites only
d97_age_iq_inc_white <- map_df(seq_along(d97_income_years), function(i) {
  #rows with SPRE equal to "White"; rows with missing SPRE are left out
  white_rows <- which(d97$SPRE == "White")
  data_frame(
    age = d97_income_years[i] - mean(d97$born[white_rows]),
    r = as.vector(
      wtd.cors(d97_income_data_imputed[[i]][white_rows], d97$AFQT[white_rows])
    )
  )
})
#line plot of the correlation against age, then save the figure
ggplot(d97_age_iq_inc, mapping = aes(x = age, y = r)) +
  geom_path() +
  scale_y_continuous(name = "r g x income") +
  ggtitle("Income and IQ by age in NLSY97") +
  theme_bw()
GG_save("figs/97_age_iq_inc.png")
#post-education income only; age >= 25: the 2007-2011 and 2013 columns
d97_post_edu_income <- d97_income_data_imputed[paste0("income_", c(2007:2011, 2013))]
#row mean of the standardized yearly values; this is the main income measure
d97$income_post_edu <- rowMeans(df_standardize(d97_post_edu_income))
d97$income <- d97$income_post_edu
GG_scatter(d97, "g", "income")
#cors
hetcor(
  d97[c(
    "g", "AFQT", "PIAT", "SAT", "ACT",
    "income_simple", "income_simple_log", "income_imp", "income_imp_log",
    "skin_tone", "skin_tone_fixed", "age", "sex"
  )],
  use = "pairwise"
)
## Warning in hetcor.data.frame(d97[c("g", "AFQT", "PIAT", "SAT", "ACT",
## "income_simple", : the correlation matrix has been adjusted to make it
## positive-definite
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g AFQT PIAT SAT ACT income_simple
## g 1 Pearson Pearson Pearson Pearson Pearson
## AFQT 0.958 1 Pearson Pearson Pearson Pearson
## PIAT 0.921 0.778 1 Pearson Pearson Pearson
## SAT 0.608 0.601 0.609 1 Pearson Pearson
## ACT 0.66 0.669 0.576 0.552 1 Pearson
## income_simple 0.228 0.224 0.21 0.134 0.0894 1
## income_simple_log 0.229 0.221 0.217 0.0996 0.0679 0.761
## income_imp 0.198 0.198 0.184 0.137 0.0754 0.914
## income_imp_log 0.217 0.223 0.189 0.112 0.0539 0.846
## skin_tone -0.367 -0.364 -0.35 -0.263 -0.326 -0.158
## skin_tone_fixed -0.357 -0.357 -0.341 -0.279 -0.313 -0.153
## age -0.0189 -0.00685 0.0171 -0.019 0.00158 0.149
## sex -0.0312 -0.0464 0.0761 0.0925 0.0526 0.238
## income_simple_log income_imp income_imp_log skin_tone
## g Pearson Pearson Pearson Pearson
## AFQT Pearson Pearson Pearson Pearson
## PIAT Pearson Pearson Pearson Pearson
## SAT Pearson Pearson Pearson Pearson
## ACT Pearson Pearson Pearson Pearson
## income_simple Pearson Pearson Pearson Pearson
## income_simple_log 1 Pearson Pearson Pearson
## income_imp 0.809 1 Pearson Pearson
## income_imp_log 0.904 0.913 1 Pearson
## skin_tone -0.166 -0.142 -0.154 1
## skin_tone_fixed -0.153 -0.138 -0.148 0.993
## age 0.107 0.29 0.269 0.018
## sex 0.161 0.277 0.248 0.0262
## skin_tone_fixed age sex
## g Pearson Pearson Polyserial
## AFQT Pearson Pearson Polyserial
## PIAT Pearson Pearson Polyserial
## SAT Pearson Pearson Polyserial
## ACT Pearson Pearson Polyserial
## income_simple Pearson Pearson Polyserial
## income_simple_log Pearson Pearson Polyserial
## income_imp Pearson Pearson Polyserial
## income_imp_log Pearson Pearson Polyserial
## skin_tone Pearson Pearson Polyserial
## skin_tone_fixed 1 Pearson Polyserial
## age 0.00605 1 Polyserial
## sex 0.0197 -0.0089 1
##
## Standard Errors/Numbers of Observations:
## g AFQT PIAT SAT ACT income_simple
## g 7396 7093 1649 2132 1771 7209
## AFQT 0.000256 7093 1346 2057 1722 6925
## PIAT 0.00263 0.0114 1649 487 409 1593
## SAT 0.0137 0.0141 0.0285 2493 575 2464
## ACT 0.0134 0.0133 0.0331 0.0291 2005 1988
## income_simple 0.0112 0.0114 0.024 0.0198 0.0223 8685
## income_simple_log 0.0112 0.0114 0.0239 0.02 0.0223 0.00452
## income_imp 0.013 0.0132 0.0287 0.0217 0.0243 0.00207
## income_imp_log 0.0129 0.0131 0.0286 0.0218 0.0244 0.00356
## skin_tone 0.0115 0.0117 0.0241 0.0215 0.0225 0.012
## skin_tone_fixed 0.0127 0.0129 0.0265 0.0232 0.025 0.0131
## age 0.0116 0.0119 0.0246 0.02 0.0223 0.0105
## sex 0.0146 0.0149 0.0307 0.0249 0.028 0.0132
## income_simple_log income_imp income_imp_log skin_tone
## g 7209 5420 5420 5678
## AFQT 6925 5262 5262 5447
## PIAT 1593 1138 1138 1325
## SAT 2464 2053 2053 1872
## ACT 1988 1677 1677 1576
## income_simple 8685 6340 6340 6636
## income_simple_log 8685 6340 6340 6636
## income_imp 0.00434 6340 6340 5309
## income_imp_log 0.0023 0.00208 6340 5309
## skin_tone 0.0119 0.0134 0.0134 6723
## skin_tone_fixed 0.0131 0.0147 0.0146 0.000177
## age 0.0106 0.0115 0.0117 0.0122
## sex 0.0133 0.0154 0.0148 0.0153
## skin_tone_fixed age sex
## g 4745 7396 7396
## AFQT 4549 7093 7093
## PIAT 1115 1649 1649
## SAT 1580 2493 2493
## ACT 1307 2005 2005
## income_simple 5561 8685 8685
## income_simple_log 5561 8685 8685
## income_imp 4474 6340 6340
## income_imp_log 4474 6340 6340
## skin_tone 5631 6723 6723
## skin_tone_fixed 5631 5631 5631
## age 0.0133 8984 8984
## sex 0.0167 0.0132 8984
##
## P-values for Tests of Bivariate Normality:
## g AFQT PIAT SAT
## g
## AFQT 1.44e-94
## PIAT 2.52e-22 1.16e-06
## SAT 4.94065645841247e-324 4.2e-298 1.1e-61
## ACT 0 0 4.91e-88 2.16e-102
## income_simple 2.13e-37 1.16e-32 6.86e-36 1.09e-83
## income_simple_log 8.45e-262 1.27e-256 3e-57 2.4e-212
## income_imp 3.93e-81 2.45e-71 1.36e-88 1.22e-68
## income_imp_log 9.44e-44 1.38e-40 3.33e-43 1.48e-93
## skin_tone 0 0 2.4e-85 1.12e-194
## skin_tone_fixed 2.69e-201 3.95e-195 5.83e-48 4.82e-142
## age 9.58e-83 1.14e-52 0 6.66e-97
## sex 0.00206 0.00625 1.34e-07 1.48e-71
## ACT income_simple income_simple_log income_imp
## g
## AFQT
## PIAT
## SAT
## ACT
## income_simple 2.33e-286
## income_simple_log 0 0
## income_imp 6.07e-249 0 0
## income_imp_log 3.34e-239 1.28e-302 0 0
## skin_tone 0 0 0 0
## skin_tone_fixed 7.79e-257 9.53e-250 0 1.44e-275
## age 4.28e-298 6.89e-128 0 8.04e-112
## sex 1.01e-284 7.23e-48 2.53e-301 3.56e-47
## income_imp_log skin_tone skin_tone_fixed age
## g
## AFQT
## PIAT
## SAT
## ACT
## income_simple
## income_simple_log
## income_imp
## income_imp_log
## skin_tone 0
## skin_tone_fixed 1.26e-234 0
## age 2.8e-81 0 1.93e-268
## sex 4.55e-19 0 6.41e-223 3.09e-84
#for whites
d97 %>%
  filter(SPRE == "White") %>%
  .[c("g", "income", "skin_tone", "skin_tone_fixed", "age", "sex")] %>%
  hetcor(use = "pairwise")
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g income skin_tone skin_tone_fixed age
## g 1 Pearson Pearson Pearson Pearson
## income 0.166 1 Pearson Pearson Pearson
## skin_tone -0.11 -0.0753 1 Pearson Pearson
## skin_tone_fixed -0.0862 -0.0791 0.97 1 Pearson
## age -0.0169 0.19 0.00313 -0.000442 1
## sex -0.0232 0.272 0.0893 0.065 -0.0154
## sex
## g Polyserial
## income Polyserial
## skin_tone Polyserial
## skin_tone_fixed Polyserial
## age Polyserial
## sex 1
##
## Standard Errors/Numbers of Observations:
## g income skin_tone skin_tone_fixed age sex
## g 3335 2849 2631 2148 3335 3335
## income 0.0182 3216 2649 2183 3216 3216
## skin_tone 0.0193 0.0193 2969 2438 2969 2969
## skin_tone_fixed 0.0214 0.0213 0.0012 2438 2438 2438
## age 0.0173 0.017 0.0184 0.0203 3808 3808
## sex 0.0217 0.0219 0.0231 0.0254 0.0203 3808
##
## P-values for Tests of Bivariate Normality:
## g income skin_tone skin_tone_fixed age
## g
## income 4.3e-47
## skin_tone 0 0
## skin_tone_fixed 1.71e-153 1.02e-183 0
## age 1.35e-28 1.75e-64 0 1.71e-183
## sex 0.0829 1.73e-45 0 1e-176 1.1e-23
#correlations among self-identified Blacks
d97 %>%
  filter(SPRE == "Black") %>%
  .[c("g", "income", "skin_tone", "skin_tone_fixed", "age", "sex")] %>%
  hetcor(use = "pairwise")
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g income skin_tone skin_tone_fixed age
## g 1 Pearson Pearson Pearson Pearson
## income 0.364 1 Pearson Pearson Pearson
## skin_tone -0.111 -0.0237 1 Pearson Pearson
## skin_tone_fixed -0.0918 -0.00796 0.987 1 Pearson
## age -0.0743 0.156 -0.0219 -0.0263 1
## sex -0.098 0.142 0.126 0.134 -0.0316
## sex
## g Polyserial
## income Polyserial
## skin_tone Polyserial
## skin_tone_fixed Polyserial
## age Polyserial
## sex 1
##
## Standard Errors/Numbers of Observations:
## g income skin_tone skin_tone_fixed age sex
## g 1697 1124 1451 1244 1697 1697
## income 0.0259 1326 1160 1009 1326 1326
## skin_tone 0.0259 0.0294 1718 1481 1718 1718
## skin_tone_fixed 0.0281 0.0315 0.000666 1481 1481 1481
## age 0.0241 0.0268 0.0241 0.026 2044 2044
## sex 0.0303 0.034 0.0298 0.032 0.0277 2044
##
## P-values for Tests of Bivariate Normality:
## g income skin_tone skin_tone_fixed age
## g
## income 5.45e-26
## skin_tone 6.09e-10 1.66e-12
## skin_tone_fixed 0.496 0.00037 4.65e-67
## age 1.3e-19 1.93e-19 5.32e-25 5.35e-12
## sex 0.144 1.94e-07 1.39e-12 0.236 7.61e-23
#heterogeneous correlations for the SPRE == "Hispanic" subsample only
d97 %>%
  filter(SPRE == "Hispanic") %>%
  .[c("g", "income", "skin_tone", "skin_tone_fixed", "age", "sex")] %>%
  hetcor(use = "pairwise")
##
## Two-Step Estimates
##
## Correlations/Type of Correlation:
## g income skin_tone skin_tone_fixed age
## g 1 Pearson Pearson Pearson Pearson
## income 0.196 1 Pearson Pearson Pearson
## skin_tone -0.134 0.0115 1 Pearson Pearson
## skin_tone_fixed -0.136 -0.00476 0.982 1 Pearson
## age -0.0126 0.149 0.0474 0.0459 1
## sex -0.0225 0.327 0.106 0.0975 0.0247
## sex
## g Polyserial
## income Polyserial
## skin_tone Polyserial
## skin_tone_fixed Polyserial
## age Polyserial
## sex 1
##
## Standard Errors/Numbers of Observations:
## g income skin_tone skin_tone_fixed age sex
## g 1457 1007 1135 959 1457 1457
## income 0.0303 1274 1089 923 1274 1274
## skin_tone 0.0292 0.0303 1445 1213 1445 1445
## skin_tone_fixed 0.0317 0.0329 0.00103 1213 1213 1213
## age 0.0262 0.0274 0.0263 0.0287 1899 1899
## sex 0.0329 0.0344 0.0326 0.0357 0.0288 1899
##
## P-values for Tests of Bivariate Normality:
## g income skin_tone skin_tone_fixed age
## g
## income 4.7e-09
## skin_tone 1.38e-23 1.39e-24
## skin_tone_fixed 6.43e-10 1.83e-12 1.8e-74
## age 8.76e-19 4.33e-13 7.73e-39 7.59e-19
## sex 0.00121 4.34e-07 4.76e-31 5.98e-13 1.45e-15
#standardized copy of the model variables
#(factor columns are left as-is; df_standardize reports each one it skips)
d97b = df_standardize(
  d97[c("g", "income", "age", "sex", "SPRE", "OPRE", "skin_tone", "skin_tone_fixed")]
)
## Skipped sex because it is a factor.
## Skipped SPRE because it is a factor.
## Skipped OPRE because it is a factor.
#baseline models: one focal predictor plus sex x spline-of-age controls
#first focal predictor: OPRE
m97_o = ols(income ~ OPRE + sex * rcs(age), data = d97b)
m97_o
## Frequencies of Missing Values Due to Each Variable
## income OPRE sex age
## 2644 619 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 6107 LR chi2 527.53 R2 0.083
## sigma0.9603 d.f. 11 R2 adj 0.081
## d.f. 6095 Pr(> chi2) 0.0000 g 0.323
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.9731 -0.6262 -0.1453 0.4231 5.4694
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.0601 0.1657 -0.36 0.7167
## OPRE=Black -0.3102 0.0297 -10.43 <0.0001
## OPRE=Other 0.0131 0.0437 0.30 0.7635
## sex=Male 0.0959 0.2295 0.42 0.6759
## age 0.1754 0.1276 1.37 0.1692
## age' -0.0663 0.7690 -0.09 0.9313
## age'' -0.1555 2.3287 -0.07 0.9468
## age''' 0.6180 3.0195 0.20 0.8378
## sex=Male * age -0.1152 0.1779 -0.65 0.5172
## sex=Male * age' 0.7719 1.0684 0.72 0.4700
## sex=Male * age'' -1.3901 3.2377 -0.43 0.6677
## sex=Male * age''' 0.1792 4.2103 0.04 0.9660
##
#baseline with SPRE as the focal predictor
m97_s = ols(income ~ SPRE + sex * rcs(age), data = d97b)
m97_s
## Frequencies of Missing Values Due to Each Variable
## income SPRE sex age
## 2644 3 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ SPRE + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 6339 LR chi2 660.49 R2 0.099
## sigma0.9505 d.f. 16 R2 adj 0.097
## d.f. 6322 Pr(> chi2) 0.0000 g 0.353
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.1656 -0.6199 -0.1463 0.4257 5.4932
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.0389 0.1616 -0.24 0.8097
## SPRE=American Indian -0.5503 0.1838 -2.99 0.0028
## SPRE=Asian 0.5993 0.1057 5.67 <0.0001
## SPRE=Black -0.3910 0.0311 -12.57 <0.0001
## SPRE=Hispanic -0.2174 0.0315 -6.90 <0.0001
## SPRE=Multi-racial -0.1300 0.0518 -2.51 0.0121
## SPRE=Other 0.1745 0.2313 0.75 0.4505
## SPRE=Pacific Islander 0.0841 0.2248 0.37 0.7082
## sex=Male 0.1412 0.2235 0.63 0.5275
## age 0.1490 0.1245 1.20 0.2313
## age' 0.1515 0.7497 0.20 0.8399
## age'' -0.8870 2.2698 -0.39 0.6960
## age''' 1.6613 2.9393 0.57 0.5720
## sex=Male * age -0.0728 0.1733 -0.42 0.6747
## sex=Male * age' 0.5485 1.0397 0.53 0.5979
## sex=Male * age'' -0.6997 3.1492 -0.22 0.8242
## sex=Male * age''' -0.7779 4.0896 -0.19 0.8491
##
#baseline with skin_tone as the focal predictor
m97_t = ols(income ~ skin_tone + sex * rcs(age), data = d97b)
m97_t
## Frequencies of Missing Values Due to Each Variable
## income skin_tone sex age
## 2644 2261 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ skin_tone + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 5309 LR chi2 516.63 R2 0.093
## sigma0.9445 d.f. 10 R2 adj 0.091
## d.f. 5298 Pr(> chi2) 0.0000 g 0.337
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.9432 -0.6172 -0.1546 0.4097 5.2349
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.3169 0.1717 -1.85 0.0651
## skin_tone -0.1587 0.0136 -11.67 <0.0001
## sex=Male 0.2863 0.2415 1.19 0.2359
## age 0.0630 0.1320 0.48 0.6333
## age' 0.7508 0.7991 0.94 0.3475
## age'' -2.8898 2.4232 -1.19 0.2331
## age''' 4.3846 3.1475 1.39 0.1637
## sex=Male * age 0.0252 0.1870 0.13 0.8927
## sex=Male * age' -0.2372 1.1238 -0.21 0.8328
## sex=Male * age'' 1.8661 3.4056 0.55 0.5838
## sex=Male * age''' -3.8985 4.4282 -0.88 0.3787
##
#baseline with skin_tone_fixed as the focal predictor
m97_t2 = ols(income ~ skin_tone_fixed + sex * rcs(age), data = d97b)
m97_t2
## Frequencies of Missing Values Due to Each Variable
## income skin_tone_fixed sex age
## 2644 3353 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ skin_tone_fixed + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 4474 LR chi2 461.59 R2 0.098
## sigma0.9297 d.f. 10 R2 adj 0.096
## d.f. 4463 Pr(> chi2) 0.0000 g 0.341
##
## Residuals
##
## Min 1Q Median 3Q Max
## -1.9438 -0.6185 -0.1476 0.4251 4.9118
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2990 0.1846 -1.62 0.1053
## skin_tone_fixed -0.1477 0.0146 -10.13 <0.0001
## sex=Male 0.2820 0.2588 1.09 0.2758
## age 0.0841 0.1422 0.59 0.5543
## age' 0.6709 0.8609 0.78 0.4358
## age'' -2.6480 2.6100 -1.01 0.3104
## age''' 4.0041 3.3828 1.18 0.2366
## sex=Male * age 0.0128 0.2003 0.06 0.9491
## sex=Male * age' -0.1249 1.2052 -0.10 0.9175
## sex=Male * age'' 1.2538 3.6538 0.34 0.7315
## sex=Male * age''' -2.2842 4.7503 -0.48 0.6306
##
#each focal predictor individually, now with g added
#OPRE + g
m97_og = ols(income ~ OPRE + g + sex * rcs(age), data = d97b)
m97_og
## Frequencies of Missing Values Due to Each Variable
## income OPRE g sex age
## 2644 619 1588 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + g + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 5275 LR chi2 803.94 R2 0.141
## sigma0.9305 d.f. 12 R2 adj 0.139
## d.f. 5262 Pr(> chi2) 0.0000 g 0.424
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.7286 -0.5843 -0.1325 0.4261 5.1156
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.1260 0.1700 -0.74 0.4586
## OPRE=Black -0.1101 0.0328 -3.36 0.0008
## OPRE=Other 0.1436 0.0474 3.03 0.0025
## g 0.2729 0.0142 19.18 <0.0001
## sex=Male 0.1319 0.2350 0.56 0.5746
## age 0.2127 0.1292 1.65 0.0996
## age' -0.3770 0.7916 -0.48 0.6339
## age'' 0.9963 2.4105 0.41 0.6794
## age''' -1.1824 3.1518 -0.38 0.7076
## sex=Male * age -0.0988 0.1802 -0.55 0.5835
## sex=Male * age' 1.0664 1.1014 0.97 0.3330
## sex=Male * age'' -2.7195 3.3590 -0.81 0.4182
## sex=Male * age''' 2.2870 4.4087 0.52 0.6040
##
#SPRE + g
m97_sg = ols(income ~ SPRE + g + sex * rcs(age), data = d97b)
m97_sg
## Frequencies of Missing Values Due to Each Variable
## income SPRE g sex age
## 2644 3 1588 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ SPRE + g + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 5419 LR chi2 885.80 R2 0.151
## sigma0.9265 d.f. 17 R2 adj 0.148
## d.f. 5401 Pr(> chi2) 0.0000 g 0.437
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.6934 -0.5908 -0.1312 0.4292 5.0587
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.1022 0.1675 -0.61 0.5417
## SPRE=American Indian -0.4189 0.1794 -2.34 0.0196
## SPRE=Asian 0.6633 0.1122 5.91 <0.0001
## SPRE=Black -0.1636 0.0350 -4.68 <0.0001
## SPRE=Hispanic -0.0515 0.0353 -1.46 0.1449
## SPRE=Multi-racial -0.0409 0.0555 -0.74 0.4611
## SPRE=Other 0.2438 0.2400 1.02 0.3098
## SPRE=Pacific Islander 0.2502 0.2401 1.04 0.2975
## g 0.2575 0.0144 17.86 <0.0001
## sex=Male 0.1250 0.2311 0.54 0.5888
## age 0.2057 0.1272 1.62 0.1059
## age' -0.3196 0.7793 -0.41 0.6817
## age'' 0.7657 2.3725 0.32 0.7469
## age''' -0.7655 3.0993 -0.25 0.8049
## sex=Male * age -0.0917 0.1772 -0.52 0.6046
## sex=Male * age' 1.0889 1.0829 1.01 0.3146
## sex=Male * age'' -2.7702 3.3021 -0.84 0.4016
## sex=Male * age''' 2.2499 4.3316 0.52 0.6035
##
#skin_tone + g
m97_tg = ols(income ~ skin_tone + g + sex * rcs(age), data = d97b)
m97_tg
## Frequencies of Missing Values Due to Each Variable
## income skin_tone g sex age
## 2644 2261 1588 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ skin_tone + g + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 4563 LR chi2 761.70 R2 0.154
## sigma0.9176 d.f. 11 R2 adj 0.152
## d.f. 4551 Pr(> chi2) 0.0000 g 0.439
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.7593 -0.5781 -0.1358 0.4239 4.8891
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2666 0.1775 -1.50 0.1331
## skin_tone -0.0660 0.0151 -4.36 <0.0001
## g 0.2749 0.0150 18.33 <0.0001
## sex=Male 0.2705 0.2485 1.09 0.2764
## age 0.1311 0.1347 0.97 0.3301
## age' 0.1694 0.8285 0.20 0.8380
## age'' -0.7888 2.5247 -0.31 0.7547
## age''' 1.1965 3.3045 0.36 0.7173
## sex=Male * age 0.0162 0.1901 0.09 0.9322
## sex=Male * age' 0.3709 1.1650 0.32 0.7502
## sex=Male * age'' -0.6341 3.5554 -0.18 0.8585
## sex=Male * age''' 0.0303 4.6710 0.01 0.9948
##
#both race codings (OPRE and SPRE) together with g
m97_osg = ols(income ~ OPRE + SPRE + g + sex * rcs(age), data = d97b)
m97_osg
## Frequencies of Missing Values Due to Each Variable
## income OPRE SPRE g sex age
## 2644 619 3 1588 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + SPRE + g + sex * rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 5274 LR chi2 849.96 R2 0.149
## sigma0.9272 d.f. 19 R2 adj 0.146
## d.f. 5254 Pr(> chi2) 0.0000 g 0.434
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.6811 -0.5883 -0.1310 0.4240 5.0668
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.1095 0.1697 -0.65 0.5187
## OPRE=Black 0.0622 0.0831 0.75 0.4541
## OPRE=Other 0.1221 0.0557 2.19 0.0284
## SPRE=American Indian -0.4778 0.1818 -2.63 0.0086
## SPRE=Asian 0.5700 0.1208 4.72 <0.0001
## SPRE=Black -0.2177 0.0873 -2.50 0.0126
## SPRE=Hispanic -0.0831 0.0395 -2.11 0.0352
## SPRE=Multi-racial -0.0947 0.0630 -1.50 0.1328
## SPRE=Other 0.2351 0.2506 0.94 0.3482
## SPRE=Pacific Islander 0.1708 0.2431 0.70 0.4823
## g 0.2571 0.0147 17.54 <0.0001
## sex=Male 0.1413 0.2343 0.60 0.5463
## age 0.2027 0.1288 1.57 0.1154
## age' -0.3171 0.7892 -0.40 0.6878
## age'' 0.7999 2.4034 0.33 0.7393
## age''' -0.9047 3.1425 -0.29 0.7735
## sex=Male * age -0.0881 0.1796 -0.49 0.6240
## sex=Male * age' 0.9791 1.0979 0.89 0.3725
## sex=Male * age'' -2.3994 3.3486 -0.72 0.4737
## sex=Male * age''' 1.7553 4.3953 0.40 0.6896
##
#full model: OPRE, SPRE, skin_tone and g together
m97_osgt = ols(
  income ~ OPRE + SPRE + skin_tone + g + sex * rcs(age),
  data = d97b
)
m97_osgt
## Frequencies of Missing Values Due to Each Variable
## income OPRE SPRE skin_tone g sex age
## 2644 619 3 2261 1588 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + SPRE + skin_tone + g + sex * rcs(age),
## data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 4456 LR chi2 790.12 R2 0.162
## sigma0.9144 d.f. 20 R2 adj 0.159
## d.f. 4435 Pr(> chi2) 0.0000 g 0.448
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.6993 -0.5734 -0.1322 0.4254 4.9041
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2772 0.1808 -1.53 0.1252
## OPRE=Black 0.1283 0.0951 1.35 0.1773
## OPRE=Other 0.1323 0.0596 2.22 0.0265
## SPRE=American Indian -0.4417 0.2135 -2.07 0.0386
## SPRE=Asian 0.6842 0.1277 5.36 <0.0001
## SPRE=Black -0.1291 0.0969 -1.33 0.1829
## SPRE=Hispanic -0.0228 0.0426 -0.53 0.5927
## SPRE=Multi-racial -0.1152 0.0715 -1.61 0.1073
## SPRE=Other 0.4894 0.2801 1.75 0.0807
## SPRE=Pacific Islander 0.2793 0.2579 1.08 0.2790
## skin_tone -0.0645 0.0245 -2.64 0.0084
## g 0.2678 0.0157 17.08 <0.0001
## sex=Male 0.2715 0.2504 1.08 0.2783
## age 0.1287 0.1356 0.95 0.3426
## age' 0.1706 0.8355 0.20 0.8382
## age'' -0.7857 2.5468 -0.31 0.7577
## age''' 1.1675 3.3365 0.35 0.7264
## sex=Male * age 0.0148 0.1914 0.08 0.9385
## sex=Male * age' 0.2973 1.1743 0.25 0.8002
## sex=Male * age'' -0.2688 3.5856 -0.07 0.9402
## sex=Male * age''' -0.7773 4.7153 -0.16 0.8691
##
#full model again, with skin_tone_fixed in place of skin_tone
m97_osgt2 = ols(
  income ~ OPRE + SPRE + skin_tone_fixed + g + sex * rcs(age),
  data = d97b
)
m97_osgt2
## Frequencies of Missing Values Due to Each Variable
## income OPRE SPRE skin_tone_fixed
## 2644 619 3 3353
## g sex age
## 1588 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + SPRE + skin_tone_fixed + g + sex *
## rcs(age), data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 3739 LR chi2 729.59 R2 0.177
## sigma0.8980 d.f. 20 R2 adj 0.173
## d.f. 3718 Pr(> chi2) 0.0000 g 0.461
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.7556 -0.5658 -0.1234 0.4337 4.7877
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2721 0.1938 -1.40 0.1605
## OPRE=Black 0.1170 0.0975 1.20 0.2301
## OPRE=Other 0.1493 0.0638 2.34 0.0193
## SPRE=American Indian -0.4578 0.2157 -2.12 0.0338
## SPRE=Asian 0.8004 0.1351 5.92 <0.0001
## SPRE=Black -0.1316 0.1003 -1.31 0.1895
## SPRE=Hispanic -0.0373 0.0457 -0.82 0.4141
## SPRE=Multi-racial -0.1628 0.0760 -2.14 0.0322
## SPRE=Other 0.5548 0.2884 1.92 0.0545
## SPRE=Pacific Islander 0.2694 0.2639 1.02 0.3074
## skin_tone_fixed -0.0448 0.0256 -1.75 0.0799
## g 0.2698 0.0169 16.01 <0.0001
## sex=Male 0.2669 0.2680 1.00 0.3195
## age 0.1315 0.1459 0.90 0.3675
## age' 0.2109 0.8997 0.23 0.8147
## age'' -0.9227 2.7446 -0.34 0.7367
## age''' 1.2781 3.5925 0.36 0.7220
## sex=Male * age 0.0096 0.2048 0.05 0.9628
## sex=Male * age' 0.3942 1.2593 0.31 0.7543
## sex=Male * age'' -0.7546 3.8484 -0.20 0.8446
## sex=Male * age''' 0.4842 5.0617 0.10 0.9238
##
#relative importance of the predictors
#age enters linearly here (no rcs) because the eta² calculation used on this
#model does not support the nonlinear terms
m97_osgt3 = ols(
  income ~ OPRE + SPRE + skin_tone + g + sex * age,
  data = d97b
)
m97_osgt3
## Frequencies of Missing Values Due to Each Variable
## income OPRE SPRE skin_tone g sex age
## 2644 619 3 2261 1588 0 0
##
## Linear Regression Model
##
## ols(formula = income ~ OPRE + SPRE + skin_tone + g + sex * age,
## data = d97b)
##
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 4456 LR chi2 787.66 R2 0.162
## sigma0.9141 d.f. 14 R2 adj 0.159
## d.f. 4441 Pr(> chi2) 0.0000 g 0.447
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.7097 -0.5697 -0.1332 0.4243 4.9126
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.2868 0.0300 -9.56 <0.0001
## OPRE=Black 0.1269 0.0950 1.34 0.1817
## OPRE=Other 0.1324 0.0596 2.22 0.0263
## SPRE=American Indian -0.4483 0.2133 -2.10 0.0356
## SPRE=Asian 0.6809 0.1276 5.34 <0.0001
## SPRE=Black -0.1291 0.0968 -1.33 0.1825
## SPRE=Hispanic -0.0231 0.0426 -0.54 0.5876
## SPRE=Multi-racial -0.1155 0.0715 -1.62 0.1061
## SPRE=Other 0.4919 0.2800 1.76 0.0790
## SPRE=Pacific Islander 0.2788 0.2578 1.08 0.2795
## skin_tone -0.0639 0.0244 -2.61 0.0090
## g 0.2686 0.0157 17.16 <0.0001
## sex=Male 0.4010 0.0276 14.53 <0.0001
## age 0.1062 0.0195 5.46 <0.0001
## sex=Male * age 0.1282 0.0272 4.72 <0.0001
##
#effect sizes per term; sqrt() turns the eta² estimates into eta, but the
#printed column names (eta.sq, eta.sq.part) are not renamed
m97_osgt3 %>%
  aov() %>%
  lsr::etaSquared() %>%
  sqrt()
## eta.sq eta.sq.part
## OPRE 0.033 0.036
## SPRE 0.092 0.100
## skin_tone 0.036 0.039
## g 0.236 0.249
## sex 0.197 0.210
## age 0.173 0.186
## sex:age 0.065 0.071
#r (g x income) by age, one colored path per dataset
bind_rows(
  "NLSY97" = d97_age_iq_inc,
  "NLSY79" = d79_age_iq_inc,
  .id = "data"
) %>%
  ggplot(aes(x = age, y = r, color = data)) +
  geom_path() +
  theme_bw() +
  scale_y_continuous(name = "r g x income") +
  ggtitle(label = "Income and IQ by age in NLSY's")
#GG_save presumably writes the plot just drawn to this path
GG_save("figs/age_iq_inc.png")