library(pacman); p_load(ggplot2, openxlsx, tidyverse, fixest)
# You can get the data from https://www.pnas.org/doi/10.1073/pnas.2300926120
# vars1 <- openxlsx::read.xlsx("pnas.2300926120.sd01.xlsx", sheet = 1)
# sd1 <- openxlsx::read.xlsx("pnas.2300926120.sd01.xlsx", sheet = 2)
# sd1 <- as_tibble(sd1)
vars2 <- openxlsx::read.xlsx("pnas.2300926120.sd02.xlsx", sheet = 2)
sd2 <- openxlsx::read.xlsx("pnas.2300926120.sd02.xlsx", sheet = 3)
sd2 <- as_tibble(sd2)
# vars3 <- openxlsx::read.xlsx("pnas.2300926120.sd03.xlsx", sheet = 2)
# sd3 <- openxlsx::read.xlsx("pnas.2300926120.sd03.xlsx", sheet = 3)
# sd3 <- as_tibble(sd3)
# vars4 <- openxlsx::read.xlsx("pnas.2300926120.sd04.xlsx", sheet = 2)
# sd4 <- openxlsx::read.xlsx("pnas.2300926120.sd04.xlsx", sheet = 3)
# sd4 <- as_tibble(sd4)
sibs <- sd2 |> filter(Relationship=="Siblings")
sibs <- pivot_longer(sibs, matches("0|1
"), names_to = c(".value", "sib_id"),
names_pattern = "^(\\w+)(\\d)$")
sibs <- filter(sibs, ! duplicated(pid))
sibs <- mutate(sibs,
fab = byr - byrf, # father's age at birth
birth_order = as.numeric(as.factor(byr)),
n_sibs = n(),
.by = pidf
)
sibs <- filter(sibs, n_sibs <= 6)
# sanity check
table(sibs$birth_order, sibs$n_sibs)
##
## 1 2 3 4 5 6
## 1 4972 1950 639 203 77 31
## 2 0 1872 641 205 77 33
## 3 0 0 598 208 77 32
## 4 0 0 0 184 77 32
## 5 0 0 0 0 62 31
## 6 0 0 0 0 0 27
David Hugh-Jones posted some code to assess birth order effects in Gregory Clark’s British data from his PNAS publication. These birth order effects are on modern (1910-1997) variables. Since the coefficients weren’t laid out in that gist, I’ve decided to show them here.
mod_lhv <- fixest::feols(lhv ~ birth_order + fab + I(fab^2) | pidf + n_sibs, data = sibs,
vcov = cluster ~ pidf)
## NOTE: 3,803 observations removed because of NA values (LHS: 3,515, RHS: 478).
mod_imd <- fixest::feols(imd ~ birth_order + fab + I(fab^2) | pidf + n_sibs, data = sibs,
vcov = cluster ~ pidf)
## NOTE: 3,763 observations removed because of NA values (LHS: 3,475, RHS: 478).
mod_codir <- fixest::feglm(codir ~ birth_order + fab + I(fab^2) | pidf + n_sibs, data = sibs,
family = "binomial", vcov = cluster ~ pidf)
## NOTES: 819 observations removed because of NA values (LHS: 344, RHS: 478).
## 6,719/1 fixed-effects (9,687 observations) removed because of only 0 (or only 1) outcomes.
mod_statmod <- fixest::feols(statmod ~ birth_order + fab + I(fab^2)| pidf + n_sibs, data = sibs,
vcov = cluster ~ pidf)
## NOTE: 3,807 observations removed because of NA values (LHS: 3,519, RHS: 478).
mod_lhv_linear <- fixest::feols(lhv ~ birth_order + fab | pidf + n_sibs, data = sibs, vcov = cluster ~ pidf)
## NOTE: 3,803 observations removed because of NA values (LHS: 3,515, RHS: 478).
mod_imd_linear <- fixest::feols(imd ~ birth_order + fab | pidf + n_sibs, data = sibs, vcov = cluster ~ pidf)
## NOTE: 3,763 observations removed because of NA values (LHS: 3,475, RHS: 478).
mod_codir_linear <- fixest::feglm(codir ~ birth_order + fab | pidf + n_sibs, data = sibs,
family = "binomial", vcov = cluster ~ pidf)
## NOTES: 819 observations removed because of NA values (LHS: 344, RHS: 478).
## 6,719/1 fixed-effects (9,687 observations) removed because of only 0 (or only 1) outcomes.
mod_statmod_linear <- fixest::feols(statmod ~ birth_order + fab | pidf + n_sibs, data = sibs,
vcov = cluster ~ pidf)
## NOTE: 3,807 observations removed because of NA values (LHS: 3,519, RHS: 478).
mod_codir_lm <- fixest::feols(codir ~ birth_order + fab + I(fab^2) | pidf + n_sibs,
data = sibs, vcov = cluster ~ pidf)
## NOTE: 819 observations removed because of NA values (LHS: 344, RHS: 478).
mod_codir_gender <- fixest::feols(codir ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs,
data = sibs, vcov = cluster ~ pidf)
## NOTE: 820 observations removed because of NA values (LHS: 344, RHS: 479).
# New models
mod_lhv_gender <- fixest::feols(lhv ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs,
data = sibs, vcov = cluster ~ pidf)
## NOTE: 3,804 observations removed because of NA values (LHS: 3,515, RHS: 479).
mod_imd_gender <- fixest::feols(imd ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs,
data = sibs, vcov = cluster ~ pidf)
## NOTE: 3,764 observations removed because of NA values (LHS: 3,475, RHS: 479).
mod_statmod_gender <- fixest::feols(statmod ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs,
data = sibs, vcov = cluster ~ pidf)
## NOTE: 3,808 observations removed because of NA values (LHS: 3,519, RHS: 479).
mod_lhv #House value
## OLS estimation, Dep. Var.: lhv
## Observations: 8,225
## Fixed-effects: pidf: 5,991, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.024370 0.018397 -1.324653 0.18534
## fab 0.013070 0.013783 0.948231 0.34305
## I(fab^2) -0.000163 0.000182 -0.895716 0.37044
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.235333 Adj. R2: 0.356609
## Within R2: 0.001453
mod_imd #Index of multiple deprivation
## OLS estimation, Dep. Var.: imd
## Observations: 8,265
## Fixed-effects: pidf: 6,010, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order 0.036055 0.919242 0.039223 0.96871
## fab 0.355287 0.682867 0.520288 0.60288
## I(fab^2) -0.005456 0.009107 -0.599118 0.54912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 12.4 Adj. R2: 0.26711
## Within R2: 1.534e-4
mod_codir #Company directorship
## GLM estimation, family = binomial, Dep. Var.: codir
## Observations: 1,522
## Fixed-effects: pidf: 594, n_sibs: 5
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.126760 0.164319 -0.771425 0.440455
## fab -0.224896 0.128784 -1.746300 0.080759 .
## I(fab^2) 0.003015 0.001680 1.795173 0.072626 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -979.3 Adj. Pseudo R2: -0.515517
## BIC: 6,362.5 Squared Cor.: 0.079425
mod_statmod #PCA of house value, index of multiple deprivation, and company director indicator
## OLS estimation, Dep. Var.: statmod
## Observations: 8,221
## Fixed-effects: pidf: 5,990, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.043421 0.039723 -1.093105 0.27439
## fab -0.004330 0.030258 -0.143093 0.88622
## I(fab^2) 0.000057 0.000408 0.138657 0.88973
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.516437 Adj. R2: 0.399436
## Within R2: 0.002192
So the main models show no significant birth order effects.
mod_lhv_linear #House value
## OLS estimation, Dep. Var.: lhv
## Observations: 8,225
## Fixed-effects: pidf: 5,991, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.020355 0.017389 -1.170572 0.24182
## fab 0.001556 0.003851 0.403932 0.68628
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.235373 Adj. R2: 0.356678
## Within R2: 0.001112
mod_imd_linear #Index of multiple deprivation
## OLS estimation, Dep. Var.: imd
## Observations: 8,265
## Fixed-effects: pidf: 6,010, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order 0.168807 0.886532 0.190412 0.84899
## fab -0.030016 0.212098 -0.141522 0.88746
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 12.4 Adj. R2: 0.267335
## Within R2: 1.537e-5
mod_codir_linear #Company directorship
## GLM estimation, family = binomial, Dep. Var.: codir
## Observations: 1,522
## Fixed-effects: pidf: 594, n_sibs: 5
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.203031 0.160032 -1.268689 0.20455
## fab -0.007802 0.036073 -0.216287 0.82876
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood: -981.9 Adj. Pseudo R2: -0.517114
## BIC: 6,360.5 Squared Cor.: 0.076505
mod_codir_lm #Company directorship
## OLS estimation, Dep. Var.: codir
## Observations: 11,209
## Fixed-effects: pidf: 7,313, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.006788 0.008573 -0.791781 0.42851
## fab -0.010592 0.006780 -1.562183 0.11829
## I(fab^2) 0.000142 0.000089 1.596464 0.11043
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.176035 Adj. R2: 0.266152
## Within R2: 0.0024
mod_statmod_linear #PCA of house value, index of multiple deprivation, and company director indicator
## OLS estimation, Dep. Var.: statmod
## Observations: 8,221
## Fixed-effects: pidf: 5,990, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.044816 0.038195 -1.173325 0.24071
## fab -0.000329 0.008678 -0.037897 0.96977
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.516439 Adj. R2: 0.399701
## Within R2: 0.002183
Neither do the linear models.
mod_lhv_gender
## OLS estimation, Dep. Var.: lhv
## Observations: 8,224
## Fixed-effects: pidf: 5,991, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.015762 0.020163 -0.781736 0.43440
## factor(dfem)1 0.018058 0.047653 0.378951 0.70474
## fab 0.012935 0.013768 0.939483 0.34752
## I(fab^2) -0.000166 0.000182 -0.915648 0.35989
## birth_order:factor(dfem)1 -0.021677 0.019530 -1.109940 0.26707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.235043 Adj. R2: 0.357472
## Within R2: 0.00251
mod_imd_gender
## OLS estimation, Dep. Var.: imd
## Observations: 8,264
## Fixed-effects: pidf: 6,010, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order 0.007429 0.985303 0.007539 0.99398
## factor(dfem)1 -2.819251 2.478566 -1.137452 0.25539
## fab 0.375552 0.679269 0.552877 0.58037
## I(fab^2) -0.005713 0.009037 -0.632181 0.52729
## birth_order:factor(dfem)1 0.259624 1.088432 0.238530 0.81148
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 12.4 Adj. R2: 0.267459
## Within R2: 0.001812
mod_codir_gender
## OLS estimation, Dep. Var.: codir
## Observations: 11,208
## Fixed-effects: pidf: 7,313, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.016950 0.009503 -1.78373 7.4509e-02 .
## factor(dfem)1 -0.152409 0.020494 -7.43663 1.1504e-13 ***
## fab -0.009740 0.006653 -1.46389 1.4327e-01
## I(fab^2) 0.000134 0.000087 1.53617 1.2454e-01
## birth_order:factor(dfem)1 0.024616 0.008376 2.93881 3.3050e-03 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.173628 Adj. R2: 0.285649
## Within R2: 0.029582
mod_statmod_gender
## OLS estimation, Dep. Var.: statmod
## Observations: 8,220
## Fixed-effects: pidf: 5,990, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.02877895 0.042757 -0.673078 0.50092371
## factor(dfem)1 -0.36666791 0.103503 -3.542579 0.00039925 ***
## fab -0.00151205 0.029772 -0.050788 0.95949591
## I(fab^2) 0.00000993 0.000399 0.024850 0.98017516
## birth_order:factor(dfem)1 -0.00876393 0.044020 -0.199088 0.84220054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.509673 Adj. R2: 0.414414
## Within R2: 0.028195
But there was a highly significant interaction for company directorship with gender. This interaction did not hold for modstat, but the main effect did, presumably because of variance from codir. So let’s test that. PCA of two variables is equivalent to standardized averaging of two variables.
sibs$lhvimd <- rowMeans(cbind(scale(sibs$lhv, scale = FALSE), scale(sibs$imd, scale = FALSE)))
mod_statmodshort_gender <- fixest::feols(lhvimd ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs,
data = sibs, vcov = cluster ~ pidf)
## NOTE: 3,804 observations removed because of NA values (LHS: 3,515, RHS: 479).
mod_statmodshort_gender
## OLS estimation, Dep. Var.: lhvimd
## Observations: 8,224
## Fixed-effects: pidf: 5,991, n_sibs: 6
## Standard-errors: Clustered (pidf)
## Estimate Std. Error t value Pr(>|t|)
## birth_order -0.051789 0.501050 -0.103361 0.91768
## factor(dfem)1 -1.280541 1.253917 -1.021233 0.30719
## fab 0.221544 0.343344 0.645254 0.51879
## I(fab^2) -0.003249 0.004563 -0.711894 0.47656
## birth_order:factor(dfem)1 0.075166 0.548681 0.136993 0.89104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 6.20022 Adj. R2: 0.275952
## Within R2: 0.00175
It seems the interaction with codir didn’t transfer to modstat due to power, and the main effect in modstat may have been due to variance from codir.
Code: https://gist.github.com/hughjonesd/30b648df336fc75e1f215e12cbccd762 Archive: https://web.archive.org/web/20230702220058/https://gist.github.com/hughjonesd/30b648df336fc75e1f215e12cbccd762
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] fixest_0.11.1 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0
## [5] dplyr_1.1.0 purrr_1.0.1 readr_2.1.4 tidyr_1.3.0
## [9] tibble_3.1.8 tidyverse_2.0.0 openxlsx_4.2.5.2 ggplot2_3.4.1
## [13] pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] zoo_1.8-11 tidyselect_1.2.0 xfun_0.37
## [4] bslib_0.4.2 lattice_0.20-45 colorspace_2.1-0
## [7] vctrs_0.5.2 generics_0.1.3 htmltools_0.5.4
## [10] yaml_2.3.7 utf8_1.2.3 rlang_1.0.6
## [13] jquerylib_0.1.4 pillar_1.8.1 glue_1.6.2
## [16] withr_2.5.0 lifecycle_1.0.3 munsell_0.5.0
## [19] gtable_0.3.1 zip_2.2.2 evaluate_0.20
## [22] knitr_1.42 tzdb_0.3.0 fastmap_1.1.0
## [25] fansi_1.0.4 Rcpp_1.0.10 scales_1.2.1
## [28] cachem_1.0.6 jsonlite_1.8.4 hms_1.1.2
## [31] digest_0.6.31 stringi_1.7.12 numDeriv_2016.8-1.1
## [34] grid_4.2.2 cli_3.6.0 tools_4.2.2
## [37] sandwich_3.0-2 magrittr_2.0.3 sass_0.4.5
## [40] Formula_1.2-4 pkgconfig_2.0.3 ellipsis_0.3.2
## [43] dreamerr_1.2.3 timechange_0.2.0 rmarkdown_2.20
## [46] rstudioapi_0.14 R6_2.5.1 nlme_3.1-160
## [49] compiler_4.2.2