Packages and settings

Load the required packages and set global options.

# pacman supplies p_load(), which is used below to load all analysis packages
# in a single call.
library(pacman)

# If bisoreg is not available, this pinned version may need to be installed
# manually first:
# devtools::install_version("bisoreg", version = "1.5")
p_load(
  kirkegaard,
  readr,
  rvest,
  dplyr,
  WikipediR,
  bisoreg,
  curry,
  DT
)

# Print numbers with 2 significant digits throughout the document.
options(digits = 2)

# Project-local helper functions.
source("R/functions.R")

Load data

Load the scraped data.

# Main dataset (one row per person; serialized R object)
d <- read_rds("data/main.rds")

# US life expectancy table (columns Year, M, F per the parse message below)
d_le <- read_csv("data/us_le.csv")
## Parsed with column specification:
## cols(
##   Year = col_double(),
##   M = col_double(),
##   F = col_double()
## )

Sex ascertainment

# Plurality vote per method: round each male fraction to 0 or 1.
# Note that R's round() sends a fraction of exactly 0.5 to 0.
d$male_cat <- round(d$male_cat_frac)
d$male_pronoun <- round(d$male_pron_frac)

# Do the category-based and pronoun-based methods agree?
method_agreement <- with(d, table(male_cat, male_pronoun))
method_agreement
##         male_pronoun
## male_cat    0    1
##        0  268    0
##        1    0 1029
# Number of cases that have a vote from both methods
sum(method_agreement)
## [1] 1297
# Final sex as a factor, derived from both fractions by the project helper
d$sex <- sex_determiner(d$male_cat_frac, d$male_pron_frac)
# Numeric version: factor code 1 -> 1, code 2 -> 0
# (presumably M = 1, F = 0 -- confirm against sex_determiner's level order)
d$sex_num <- -(as.numeric(d$sex) - 2)

# Final distribution of the assigned sex
table2(d$sex)
## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M      1113    79.0
## 2 F       295    21.0
## 3 <NA>      0     0

Nationality/Ethnicity

# Indicator columns: Jewish background plus one column per nationality
demographics <- d[c(
  "jewish", "american", "british", "canadian",
  "australian", "new_zealandian", "scandinavian"
)]

# Nationality-only columns (everything except the first, "jewish") and the
# share of the sample in each
nationalities <- demographics[-1]
nationality_share <- colMeans(nationalities)

# Share of the sample per indicator, rounded to 2 decimals
round(colMeans(demographics), 2)
##         jewish       american        british       canadian     australian 
##           0.17           0.69           0.18           0.06           0.03 
## new_zealandian   scandinavian 
##           0.00           0.00
# Double counting? Sum of the nationality shares, then the number of
# nationalities assigned per person
sum(nationality_share)
## [1] 0.97
table2(rowSums(nationalities))
## # A tibble: 5 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 1      1277  90.7  
## 2 0        87   6.18 
## 3 2        42   2.98 
## 4 3         2   0.142
## 5 <NA>      0   0
# Shares expressed relative to the British share (element 2)
nationality_share / nationality_share[2]
##       american        british       canadian     australian new_zealandian 
##          3.820          1.000          0.332          0.148          0.027 
##   scandinavian 
##          0.012
# NOTE(review): the ratios below look like national populations (millions)
# divided by the US figure of 318.9 -- confirm the source. They are relative
# to the US, whereas the shares above are relative to the British share.
64.1 / 318.9 # UK
## [1] 0.2
35.16 / 318.9 # CAN
## [1] 0.11
23.13 / 318.9 # AUS
## [1] 0.073
4.471 / 318.9 # NZ
## [1] 0.014
# Jewish share within each sex (columns sum to 1)
prop.table(table(jewish = d$jewish, sex = d$sex), margin = 2)
##       sex
## jewish    M    F
##      0 0.83 0.85
##      1 0.17 0.15
# American share within the Jewish and non-Jewish groups (rows sum to 1)
prop.table(table(jewish = d$jewish, nation = d$american), margin = 1)
##       nation
## jewish    0    1
##      0 0.35 0.65
##      1 0.11 0.89

Distributions

# Distribution of year of birth: density/histogram plot without the default
# vertical reference line and without y-axis breaks.
GG_denhist(d$born, vline = NULL) +
  scale_y_continuous(breaks = NULL) +
  scale_x_continuous("Year of birth")

# Save the figure (presumably the most recent plot -- see GG_save)
GG_save("figures/birth_dist.png")

# Count of cases for year of birth (reverse of the missing count, i.e.
# presumably the number of non-missing values -- see miss_count).
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
miss_count(d$born, reverse = TRUE)
## [1] 1335
# Averages of year of birth, restricted to persons with a recorded age at
# death (i.e. those who are dead)
d %>%
  dplyr::filter(!is.na(age_at_death)) %$%
  averages(born)
## arithmetic  geometric   harmonic       mode     median    trimmed   midrange 
##       1915       1915       1915       1924       1918       1916       1795
# Distribution of age at death, again without y-axis breaks
GG_denhist(d$age_at_death) +
  scale_y_continuous(breaks = NULL) +
  scale_x_continuous("Age at death")

GG_save("figures/death_dist.png")

# Count of cases for age at death (same call form as for year of birth)
miss_count(d$age_at_death, reverse = TRUE)
## [1] 305
# Share of Americans in the sample
mean(d$american)
## [1] 0.69

Output data for reuse

Or just for your amusement.

# Interactive table of the full dataset. Numeric columns that are not
# integer-typed are displayed rounded to 3 decimals; all columns are shown.
is_fractional <- vapply(
  d,
  function(column) is.numeric(column) && !is.integer(column),
  logical(1)
)
formatRound(DT::datatable(d), names(d)[is_fractional], digits = 3)

# Write the processed data to disk, as both RDS and CSV
write_rds(d, "data/main_reuse.rds")
write_csv(d, "data/main_reuse.csv")

# Record package versions for reproducibility
write_sessioninfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.13            curry_0.1.1        bisoreg_1.5        R2WinBUGS_2.1-21  
##  [5] boot_1.3-25        coda_0.19-3        monreg_0.1.4       bootstrap_2019.6  
##  [9] WikipediR_1.5.0    rvest_0.3.5        xml2_1.3.2         kirkegaard_2018.05
## [13] metafor_2.4-0      Matrix_1.2-18      psych_1.9.12.31    magrittr_1.5      
## [17] assertthat_0.2.1   weights_1.0.1      mice_3.8.0         gdata_2.18.0      
## [21] Hmisc_4.4-0        Formula_1.2-3      survival_3.1-12    lattice_0.20-41   
## [25] forcats_0.5.0      stringr_1.4.0      dplyr_0.8.5        purrr_0.3.4       
## [29] readr_1.3.1        tidyr_1.0.3        tibble_3.0.1       ggplot2_3.3.0     
## [33] tidyverse_1.3.0    pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-147        fs_1.4.1            lubridate_1.7.8    
##  [4] RColorBrewer_1.1-2  httr_1.4.1          tools_3.6.3        
##  [7] backports_1.1.6     utf8_1.1.4          R6_2.4.1           
## [10] rpart_4.1-15        mgcv_1.8-31         DBI_1.1.0          
## [13] colorspace_1.4-1    nnet_7.3-14         withr_2.2.0        
## [16] tidyselect_1.0.0    gridExtra_2.3       mnormt_1.5-7       
## [19] compiler_3.6.3      cli_2.0.2           htmlTable_1.13.3   
## [22] labeling_0.3        scales_1.1.0        checkmate_2.0.0    
## [25] digest_0.6.25       foreign_0.8-76      rmarkdown_2.1      
## [28] base64enc_0.1-3     jpeg_0.1-8.1        pkgconfig_2.0.3    
## [31] htmltools_0.4.0     dbplyr_1.4.3        htmlwidgets_1.5.1  
## [34] rlang_0.4.6         readxl_1.3.1        rstudioapi_0.11    
## [37] farver_2.0.3        generics_0.0.2      jsonlite_1.6.1     
## [40] crosstalk_1.1.0.1   gtools_3.8.2        acepack_1.4.1      
## [43] Rcpp_1.0.4.6        munsell_0.5.0       fansi_0.4.1        
## [46] lifecycle_0.2.0     stringi_1.4.6       yaml_2.2.1         
## [49] grid_3.6.3          parallel_3.6.3      crayon_1.3.4       
## [52] haven_2.2.0         splines_3.6.3       hms_0.5.3          
## [55] knitr_1.28          pillar_1.4.4        reprex_0.3.0       
## [58] glue_1.4.0          evaluate_0.14       latticeExtra_0.6-29
## [61] data.table_1.12.8   modelr_0.1.7        png_0.1-7          
## [64] vctrs_0.2.4         cellranger_1.1.0    gtable_0.3.0       
## [67] xfun_0.13           broom_0.5.6         cluster_2.1.0      
## [70] ellipsis_0.3.0