Setup

library(pacman); p_load(ggplot2, openxlsx, tidyverse, fixest)

# You can get the data from https://www.pnas.org/doi/10.1073/pnas.2300926120

# vars1 <- openxlsx::read.xlsx("pnas.2300926120.sd01.xlsx", sheet = 1)
# sd1 <-  openxlsx::read.xlsx("pnas.2300926120.sd01.xlsx", sheet = 2)
# sd1 <- as_tibble(sd1)

vars2 <- openxlsx::read.xlsx("pnas.2300926120.sd02.xlsx", sheet = 2)
sd2 <- openxlsx::read.xlsx("pnas.2300926120.sd02.xlsx", sheet = 3)
sd2 <- as_tibble(sd2)

# vars3 <- openxlsx::read.xlsx("pnas.2300926120.sd03.xlsx", sheet = 2)
# sd3 <- openxlsx::read.xlsx("pnas.2300926120.sd03.xlsx", sheet = 3)
# sd3 <- as_tibble(sd3)

# vars4 <- openxlsx::read.xlsx("pnas.2300926120.sd04.xlsx", sheet = 2)
# sd4 <- openxlsx::read.xlsx("pnas.2300926120.sd04.xlsx", sheet = 3)
# sd4 <- as_tibble(sd4)

sibs <- sd2 |> filter(Relationship=="Siblings")


sibs <- pivot_longer(sibs, matches("0|1
"), names_to = c(".value", "sib_id"),
                     names_pattern = "^(\\w+)(\\d)$")

sibs <- filter(sibs, ! duplicated(pid))
sibs <- mutate(sibs,
                  fab = byr - byrf, # father's age at birth
                  birth_order = as.numeric(as.factor(byr)),
                  n_sibs = n(),
                  .by = pidf
                )
sibs <- filter(sibs, n_sibs <= 6)

# sanity check               
table(sibs$birth_order, sibs$n_sibs)

##    
##        1    2    3    4    5    6
##   1 4972 1950  639  203   77   31
##   2    0 1872  641  205   77   33
##   3    0    0  598  208   77   32
##   4    0    0    0  184   77   32
##   5    0    0    0    0   62   31
##   6    0    0    0    0    0   27

Rationale

David Hugh-Jones posted some code to assess birth order effects in Gregory Clark’s British data from his PNAS publication. These birth order effects are on modern (1910-1997) variables. Since the coefficients weren’t laid out in that gist, I’ve decided to show them here.

Coefficients

mod_lhv <- fixest::feols(lhv ~ birth_order + fab + I(fab^2) | pidf + n_sibs, data = sibs, 
                         vcov = cluster ~ pidf)

## NOTE: 3,803 observations removed because of NA values (LHS: 3,515, RHS: 478).

mod_imd <- fixest::feols(imd ~ birth_order + fab + I(fab^2) | pidf + n_sibs, data = sibs, 
                         vcov = cluster ~ pidf)

## NOTE: 3,763 observations removed because of NA values (LHS: 3,475, RHS: 478).

mod_codir <- fixest::feglm(codir ~ birth_order + fab + I(fab^2) | pidf + n_sibs, data = sibs, 
                           family = "binomial", vcov = cluster ~ pidf)

## NOTES: 819 observations removed because of NA values (LHS: 344, RHS: 478).
##        6,719/1 fixed-effects (9,687 observations) removed because of only 0 (or only 1) outcomes.

mod_statmod <- fixest::feols(statmod ~ birth_order + fab + I(fab^2)| pidf + n_sibs, data = sibs, 
                             vcov = cluster ~ pidf)

## NOTE: 3,807 observations removed because of NA values (LHS: 3,519, RHS: 478).

mod_lhv_linear <- fixest::feols(lhv ~ birth_order + fab | pidf + n_sibs, data = sibs, vcov = cluster ~ pidf)

## NOTE: 3,803 observations removed because of NA values (LHS: 3,515, RHS: 478).

mod_imd_linear <- fixest::feols(imd ~ birth_order + fab | pidf + n_sibs, data = sibs, vcov = cluster ~ pidf)

## NOTE: 3,763 observations removed because of NA values (LHS: 3,475, RHS: 478).

mod_codir_linear <- fixest::feglm(codir ~ birth_order + fab | pidf + n_sibs, data = sibs, 
                                  family = "binomial", vcov = cluster ~ pidf)

## NOTES: 819 observations removed because of NA values (LHS: 344, RHS: 478).
##        6,719/1 fixed-effects (9,687 observations) removed because of only 0 (or only 1) outcomes.

mod_statmod_linear <- fixest::feols(statmod ~ birth_order + fab | pidf + n_sibs, data = sibs, 
                                    vcov = cluster ~ pidf)

## NOTE: 3,807 observations removed because of NA values (LHS: 3,519, RHS: 478).

mod_codir_lm <- fixest::feols(codir ~ birth_order + fab + I(fab^2) | pidf + n_sibs, 
                              data = sibs, vcov = cluster ~ pidf)

## NOTE: 819 observations removed because of NA values (LHS: 344, RHS: 478).

mod_codir_gender <- fixest::feols(codir ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs, 
                                  data = sibs, vcov = cluster ~ pidf)

## NOTE: 820 observations removed because of NA values (LHS: 344, RHS: 479).

# New models

mod_lhv_gender <- fixest::feols(lhv ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs, 
                                  data = sibs, vcov = cluster ~ pidf)

## NOTE: 3,804 observations removed because of NA values (LHS: 3,515, RHS: 479).

mod_imd_gender <- fixest::feols(imd ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs, 
                                  data = sibs, vcov = cluster ~ pidf)

## NOTE: 3,764 observations removed because of NA values (LHS: 3,475, RHS: 479).

mod_statmod_gender <- fixest::feols(statmod ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs, 
                                  data = sibs, vcov = cluster ~ pidf)

## NOTE: 3,808 observations removed because of NA values (LHS: 3,519, RHS: 479).

mod_lhv     #House value

## OLS estimation, Dep. Var.: lhv
## Observations: 8,225 
## Fixed-effects: pidf: 5,991,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order -0.024370   0.018397 -1.324653  0.18534 
## fab          0.013070   0.013783  0.948231  0.34305 
## I(fab^2)    -0.000163   0.000182 -0.895716  0.37044 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.235333     Adj. R2: 0.356609
##                  Within R2: 0.001453

mod_imd     #Index of multiple deprivation

## OLS estimation, Dep. Var.: imd
## Observations: 8,265 
## Fixed-effects: pidf: 6,010,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order  0.036055   0.919242  0.039223  0.96871 
## fab          0.355287   0.682867  0.520288  0.60288 
## I(fab^2)    -0.005456   0.009107 -0.599118  0.54912 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 12.4     Adj. R2: 0.26711 
##              Within R2: 1.534e-4

mod_codir   #Company directorship

## GLM estimation, family = binomial, Dep. Var.: codir
## Observations: 1,522 
## Fixed-effects: pidf: 594,  n_sibs: 5
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|)    
## birth_order -0.126760   0.164319 -0.771425 0.440455    
## fab         -0.224896   0.128784 -1.746300 0.080759 .  
## I(fab^2)     0.003015   0.001680  1.795173 0.072626 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood:  -979.3   Adj. Pseudo R2: -0.515517
##            BIC: 6,362.5     Squared Cor.:  0.079425

mod_statmod #PCA of house value, index of multiple deprivation, and company director indicator

## OLS estimation, Dep. Var.: statmod
## Observations: 8,221 
## Fixed-effects: pidf: 5,990,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order -0.043421   0.039723 -1.093105  0.27439 
## fab         -0.004330   0.030258 -0.143093  0.88622 
## I(fab^2)     0.000057   0.000408  0.138657  0.88973 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.516437     Adj. R2: 0.399436
##                  Within R2: 0.002192

So the main models show no significant birth order effects.

mod_lhv_linear     #House value

## OLS estimation, Dep. Var.: lhv
## Observations: 8,225 
## Fixed-effects: pidf: 5,991,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order -0.020355   0.017389 -1.170572  0.24182 
## fab          0.001556   0.003851  0.403932  0.68628 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.235373     Adj. R2: 0.356678
##                  Within R2: 0.001112

mod_imd_linear     #Index of multiple deprivation

## OLS estimation, Dep. Var.: imd
## Observations: 8,265 
## Fixed-effects: pidf: 6,010,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order  0.168807   0.886532  0.190412  0.84899 
## fab         -0.030016   0.212098 -0.141522  0.88746 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 12.4     Adj. R2: 0.267335
##              Within R2: 1.537e-5

mod_codir_linear   #Company directorship

## GLM estimation, family = binomial, Dep. Var.: codir
## Observations: 1,522 
## Fixed-effects: pidf: 594,  n_sibs: 5
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order -0.203031   0.160032 -1.268689  0.20455 
## fab         -0.007802   0.036073 -0.216287  0.82876 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Log-Likelihood:  -981.9   Adj. Pseudo R2: -0.517114
##            BIC: 6,360.5     Squared Cor.:  0.076505

mod_codir_lm       #Company directorship

## OLS estimation, Dep. Var.: codir
## Observations: 11,209 
## Fixed-effects: pidf: 7,313,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order -0.006788   0.008573 -0.791781  0.42851 
## fab         -0.010592   0.006780 -1.562183  0.11829 
## I(fab^2)     0.000142   0.000089  1.596464  0.11043 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.176035     Adj. R2: 0.266152
##                  Within R2: 0.0024

mod_statmod_linear #PCA of house value, index of multiple deprivation, and company director indicator

## OLS estimation, Dep. Var.: statmod
## Observations: 8,221 
## Fixed-effects: pidf: 5,990,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##              Estimate Std. Error   t value Pr(>|t|) 
## birth_order -0.044816   0.038195 -1.173325  0.24071 
## fab         -0.000329   0.008678 -0.037897  0.96977 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.516439     Adj. R2: 0.399701
##                  Within R2: 0.002183

Neither do the linear models.

mod_lhv_gender

## OLS estimation, Dep. Var.: lhv
## Observations: 8,224 
## Fixed-effects: pidf: 5,991,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##                            Estimate Std. Error   t value Pr(>|t|) 
## birth_order               -0.015762   0.020163 -0.781736  0.43440 
## factor(dfem)1              0.018058   0.047653  0.378951  0.70474 
## fab                        0.012935   0.013768  0.939483  0.34752 
## I(fab^2)                  -0.000166   0.000182 -0.915648  0.35989 
## birth_order:factor(dfem)1 -0.021677   0.019530 -1.109940  0.26707 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.235043     Adj. R2: 0.357472
##                  Within R2: 0.00251

mod_imd_gender

## OLS estimation, Dep. Var.: imd
## Observations: 8,264 
## Fixed-effects: pidf: 6,010,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##                            Estimate Std. Error   t value Pr(>|t|) 
## birth_order                0.007429   0.985303  0.007539  0.99398 
## factor(dfem)1             -2.819251   2.478566 -1.137452  0.25539 
## fab                        0.375552   0.679269  0.552877  0.58037 
## I(fab^2)                  -0.005713   0.009037 -0.632181  0.52729 
## birth_order:factor(dfem)1  0.259624   1.088432  0.238530  0.81148 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 12.4     Adj. R2: 0.267459
##              Within R2: 0.001812

mod_codir_gender

## OLS estimation, Dep. Var.: codir
## Observations: 11,208 
## Fixed-effects: pidf: 7,313,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##                            Estimate Std. Error  t value   Pr(>|t|)    
## birth_order               -0.016950   0.009503 -1.78373 7.4509e-02 .  
## factor(dfem)1             -0.152409   0.020494 -7.43663 1.1504e-13 ***
## fab                       -0.009740   0.006653 -1.46389 1.4327e-01    
## I(fab^2)                   0.000134   0.000087  1.53617 1.2454e-01    
## birth_order:factor(dfem)1  0.024616   0.008376  2.93881 3.3050e-03 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.173628     Adj. R2: 0.285649
##                  Within R2: 0.029582

mod_statmod_gender

## OLS estimation, Dep. Var.: statmod
## Observations: 8,220 
## Fixed-effects: pidf: 5,990,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##                              Estimate Std. Error   t value   Pr(>|t|)    
## birth_order               -0.02877895   0.042757 -0.673078 0.50092371    
## factor(dfem)1             -0.36666791   0.103503 -3.542579 0.00039925 ***
## fab                       -0.00151205   0.029772 -0.050788 0.95949591    
## I(fab^2)                   0.00000993   0.000399  0.024850 0.98017516    
## birth_order:factor(dfem)1 -0.00876393   0.044020 -0.199088 0.84220054    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.509673     Adj. R2: 0.414414
##                  Within R2: 0.028195

But there was a highly significant interaction for company directorship with gender. This interaction did not hold for modstat, but the main effect did, presumably because of variance from codir. So let’s test that. PCA of two variables is equivalent to standardized averaging of two variables.

sibs$lhvimd <- rowMeans(cbind(scale(sibs$lhv, scale = FALSE), scale(sibs$imd, scale = FALSE)))

mod_statmodshort_gender <- fixest::feols(lhvimd ~ birth_order*factor(dfem) + fab + I(fab^2) | pidf + n_sibs, 
                                  data = sibs, vcov = cluster ~ pidf)

## NOTE: 3,804 observations removed because of NA values (LHS: 3,515, RHS: 479).

mod_statmodshort_gender

## OLS estimation, Dep. Var.: lhvimd
## Observations: 8,224 
## Fixed-effects: pidf: 5,991,  n_sibs: 6
## Standard-errors: Clustered (pidf) 
##                            Estimate Std. Error   t value Pr(>|t|) 
## birth_order               -0.051789   0.501050 -0.103361  0.91768 
## factor(dfem)1             -1.280541   1.253917 -1.021233  0.30719 
## fab                        0.221544   0.343344  0.645254  0.51879 
## I(fab^2)                  -0.003249   0.004563 -0.711894  0.47656 
## birth_order:factor(dfem)1  0.075166   0.548681  0.136993  0.89104 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 6.20022     Adj. R2: 0.275952
##                 Within R2: 0.00175

It seems the interaction with codir didn’t transfer to modstat due to power, and the main effect in modstat may have been due to variance from codir.

Links

Code: https://gist.github.com/hughjonesd/30b648df336fc75e1f215e12cbccd762 Archive: https://web.archive.org/web/20230702220058/https://gist.github.com/hughjonesd/30b648df336fc75e1f215e12cbccd762

sessionInfo()

## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] fixest_0.11.1    lubridate_1.9.2  forcats_1.0.0    stringr_1.5.0   
##  [5] dplyr_1.1.0      purrr_1.0.1      readr_2.1.4      tidyr_1.3.0     
##  [9] tibble_3.1.8     tidyverse_2.0.0  openxlsx_4.2.5.2 ggplot2_3.4.1   
## [13] pacman_0.5.1    
## 
## loaded via a namespace (and not attached):
##  [1] zoo_1.8-11          tidyselect_1.2.0    xfun_0.37          
##  [4] bslib_0.4.2         lattice_0.20-45     colorspace_2.1-0   
##  [7] vctrs_0.5.2         generics_0.1.3      htmltools_0.5.4    
## [10] yaml_2.3.7          utf8_1.2.3          rlang_1.0.6        
## [13] jquerylib_0.1.4     pillar_1.8.1        glue_1.6.2         
## [16] withr_2.5.0         lifecycle_1.0.3     munsell_0.5.0      
## [19] gtable_0.3.1        zip_2.2.2           evaluate_0.20      
## [22] knitr_1.42          tzdb_0.3.0          fastmap_1.1.0      
## [25] fansi_1.0.4         Rcpp_1.0.10         scales_1.2.1       
## [28] cachem_1.0.6        jsonlite_1.8.4      hms_1.1.2          
## [31] digest_0.6.31       stringi_1.7.12      numDeriv_2016.8-1.1
## [34] grid_4.2.2          cli_3.6.0           tools_4.2.2        
## [37] sandwich_3.0-2      magrittr_2.0.3      sass_0.4.5         
## [40] Formula_1.2-4       pkgconfig_2.0.3     ellipsis_0.3.2     
## [43] dreamerr_1.2.3      timechange_0.2.0    rmarkdown_2.20     
## [46] rstudioapi_0.14     R6_2.5.1            nlme_3.1-160       
## [49] compiler_4.2.2

Modern Birth Order Effects in Gregory Clark’s PNAS Data

Setup

Rationale

Coefficients

Links