Packages and settings

Load packages and set settings.

library(pacman)
#may need this
#devtools::install_version("bisoreg",version="1.5")
p_load(kirkegaard, readr, rvest, dplyr, WikipediR, bisoreg, curry, DT)
options(digits = 2)
source("R/functions.R")

Load data

Load the scraped data.

#main data
d = read_rds("data/main.rds")

#US life expectancy
d_le = read_csv("data/us_le.csv")

## Parsed with column specification:
## cols(
##   Year = col_double(),
##   M = col_double(),
##   F = col_double()
## )

Sex ascertainment

#plurality score
d$male_cat = round(d$male_cat_frac)
d$male_pronoun = round(d$male_pron_frac)

#do methods agree?
d %$% table(male_cat, male_pronoun)

##         male_pronoun
## male_cat    0    1
##        0  268    0
##        1    0 1029

#total cases with data for both
d %$% table(male_cat, male_pronoun) %>% sum

## [1] 1297

#assign sex as factor
d$sex = sex_determiner(d$male_cat_frac, d$male_pron_frac)
d$sex_num = (as.numeric(d$sex)-2)*-1

#final distribution
table2(d$sex)

## # A tibble: 3 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 M      1113    79.0
## 2 F       295    21.0
## 3 <NA>      0     0

Nationality/Ethnicity

#load demographics data
demographics = d[c("jewish", "american", "british", "canadian", "australian", "new_zealandian", "scandinavian")]

#nationalities
colMeans(demographics) %>% round(2)

##         jewish       american        british       canadian     australian 
##           0.17           0.69           0.18           0.06           0.03 
## new_zealandian   scandinavian 
##           0.00           0.00

#double counting?
colMeans(demographics[-1]) %>% sum

## [1] 0.97

table2(rowSums(demographics[-1]))

## # A tibble: 5 x 3
##   Group Count Percent
##   <chr> <dbl>   <dbl>
## 1 1      1277  90.7  
## 2 0        87   6.18 
## 3 2        42   2.98 
## 4 3         2   0.142
## 5 <NA>      0   0

#proportions
colMeans(demographics[-1]) / colMeans(demographics[-1])[2] #UK

##       american        british       canadian     australian new_zealandian 
##          3.820          1.000          0.332          0.148          0.027 
##   scandinavian 
##          0.012

64.1  / 318.9  #UK

## [1] 0.2

35.16 / 318.9  #CAN

## [1] 0.11

23.13 / 318.9  #AUS

## [1] 0.073

4.471 / 318.9  #NZ

## [1] 0.014

#jews by sex
table(jewish = d$jewish, sex = d$sex) %>% prop.table(margin = 2)

##       sex
## jewish    M    F
##      0 0.83 0.85
##      1 0.17 0.15

#jews by nation
table(jewish = d$jewish, nation = d$american) %>% prop.table(margin = 1)

##       nation
## jewish    0    1
##      0 0.35 0.65
##      1 0.11 0.89

Distributions

#year of birth dist
GG_denhist(d$born, vline = NULL) + 
  scale_y_continuous(breaks = NULL) +
  scale_x_continuous("Year of birth")

GG_save("figures/birth_dist.png")

#stats
miss_count(d$born, reverse = T)

## [1] 1335

#year of birth for those who are dead
d %>% dplyr::filter(!is.na(age_at_death)) %$% averages(born)

## arithmetic  geometric   harmonic       mode     median    trimmed   midrange 
##       1915       1915       1915       1924       1918       1916       1795

#age at death
GG_denhist(d$age_at_death) + 
  scale_y_continuous(breaks = NULL) +
  scale_x_continuous("Age at death")

GG_save("figures/death_dist.png")

#cases
miss_count(d$age_at_death, reverse = T)

## [1] 305

#americans
mean(d$american)

## [1] 0.69

Temporal trends

#find optimal span
set.seed(1)
loess_fits = list(
  male_pct = loess.wrapper(x = d$born, y = d$sex_num),
  age_death = d %>% dplyr::filter(!is.na(age_at_death)) %$% loess.wrapper(x = born, y = age_at_death),
  alumni = loess.wrapper(x = d$born, y = d$alumni)
)

#male% 
#span = optimal_loess$pars$span
ggplot(d, aes(born, sex_num)) +
  geom_smooth(method = "loess", span = loess_fits$male_pct$pars$span) +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1), name = "Male comedians%") +
  scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
  theme_bw()

GG_save("figures/male_pct_loess.png")

#calculations
d %>% dplyr::filter(is_between(born, 1880, 1910)) %$% wtd_mean(sex_num)

## [1] 0.9

d %>% dplyr::filter(is_between(born, 1980, 1990)) %$% wtd_mean(sex_num)

## [1] 0.61

(comedian_male_pct_1915 = predict(loess_fits$male_pct, newdata = data.frame(x = 1915)))

##    1 
## 0.86

averages(d$sex_num)

## arithmetic  geometric   harmonic       mode     median    trimmed   midrange 
##       0.79       0.00       0.00       1.00       1.00       0.86       0.50

#age at death
ggplot(d, aes(born, age_at_death)) +
  geom_smooth(method = "loess", span = loess_fits$age_death$pars$span) +
  scale_y_continuous(name = "Age at death") +
  scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
  theme_bw()

GG_save("figures/age_death.png")

#averages
averages(d$age_at_death)

## arithmetic  geometric   harmonic       mode     median    trimmed   midrange 
##         69         67         64         78         71         70         62

#specific time prediction
(pred_age_death_1915 = predict(loess_fits$age_death, newdata = data.frame(x = 1915)))

##  1 
## 79

#calculations
(comedian_sex_bias = 4 * (1- comedian_male_pct_1915))

##    1 
## 0.58

(general_sex_bias = 4 * .5)

## [1] 2

(adjusted_comedian_le = pred_age_death_1915 - comedian_sex_bias)

##  1 
## 79

(adjusted_general_le = 66:71 - general_sex_bias)

## [1] 64 65 66 67 68 69

(comedian_le_advantage = adjusted_comedian_le - adjusted_general_le)

## [1] 14.6 13.6 12.6 11.6 10.6  9.6

(expected_le_due_to_g = .12 * 2 * 15)

## [1] 3.6

(1 - (comedian_le_advantage - expected_le_due_to_g)/comedian_le_advantage)

## [1] 0.25 0.27 0.29 0.31 0.34 0.38

#uni
ggplot(d, aes(born, university)) +
  geom_smooth() +
  scale_y_continuous(name = "Attended university %", labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
  theme_bw()

GG_save("figures/university.png")

#phd
ggplot(d, aes(born, phd)) +
  geom_smooth() +
  scale_y_continuous(name = "Has a Ph.D. %", labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
  theme_bw()

GG_save("figures/phd.png")

Alumni rates

#main trend
ggplot(d, aes(born, alumni)) +
  geom_smooth(method = "loess", span = loess_fits$alumni$pars$span) +
  scale_y_continuous(name = "Graduated from college or university", labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
  theme_bw()

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 84 rows containing non-finite values (stat_smooth).

## Warning: Removed 3 rows containing missing values (geom_smooth).

GG_save("figures/alumni.png")

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 84 rows containing non-finite values (stat_smooth).

## Warning: Removed 3 rows containing missing values (geom_smooth).

#correlate predictions
cor.test(
  predict(loess_fits$male_pct, newdata = data.frame(x = 1880:1980)) %>% detrend,
  predict(loess_fits$alumni, newdata = data.frame(x = 1880:1980)) %>% detrend
)

## 
##  Pearson's product-moment correlation
## 
## data:  predict(loess_fits$male_pct, newdata = data.frame(x = 1880:1980)) %>%  and predict(loess_fits$alumni, newdata = data.frame(x = 1880:1980)) %>%     detrend and     detrend
## t = 7, df = 99, p-value = 5e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.42 0.69
## sample estimates:
##  cor 
## 0.57

#make residuals
resids = data_frame(male_pct = predict(loess_fits$male_pct, newdata = data.frame(x = 1880:1980)) %>% detrend,
                    alumni = predict(loess_fits$alumni, newdata = data.frame(x = 1880:1980)) %>% detrend,
                    born = 1880:1980
                    )

## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

#plot
resids %>% 
  #long form
  tidyr::gather(key = variable, value = value, male_pct, alumni) %>% 
  ggplot(aes(born, value, color = variable)) +
  geom_smooth() +
  xlab("Year of birth") + ylab("Standardized residual from linear regression") +
  theme_bw()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#alumni trend by sex
ggplot(d, aes(born, alumni, color = sex)) +
  geom_smooth(method = "loess", span = loess_fits$alumni$pars$span) +
  scale_y_continuous(name = "Graduated from college or university", labels = scales::percent, limits = c(0, 1)) +
  scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
  theme_bw()

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 84 rows containing non-finite values (stat_smooth).

## Warning: Removed 7 rows containing missing values (geom_smooth).

GG_save("figures/alumni_sex.png")

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 84 rows containing non-finite values (stat_smooth).

## Warning: Removed 7 rows containing missing values (geom_smooth).

Output data for reuse

Or just for your amusement.

#print nicely but only the first few columns
d %>% DT::datatable() %>% formatRound(names(d)[map_lgl(d, ~!is.integer(.) & is.numeric(.))], digits=3)

#data to disk
write_rds(d, "data/main_reuse.rds")
write_csv(d, "data/main_reuse.csv")

#versions
write_sessioninfo()

## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.13            curry_0.1.1        bisoreg_1.5        R2WinBUGS_2.1-21  
##  [5] boot_1.3-25        coda_0.19-3        monreg_0.1.4       bootstrap_2019.6  
##  [9] WikipediR_1.5.0    rvest_0.3.5        xml2_1.3.2         kirkegaard_2018.05
## [13] metafor_2.4-0      Matrix_1.2-18      psych_1.9.12.31    magrittr_1.5      
## [17] assertthat_0.2.1   weights_1.0.1      mice_3.8.0         gdata_2.18.0      
## [21] Hmisc_4.4-0        Formula_1.2-3      survival_3.1-12    lattice_0.20-41   
## [25] forcats_0.5.0      stringr_1.4.0      dplyr_0.8.5        purrr_0.3.4       
## [29] readr_1.3.1        tidyr_1.0.3        tibble_3.0.1       ggplot2_3.3.0     
## [33] tidyverse_1.3.0    pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-147        fs_1.4.1            lubridate_1.7.8    
##  [4] RColorBrewer_1.1-2  httr_1.4.1          tools_3.6.3        
##  [7] backports_1.1.6     utf8_1.1.4          R6_2.4.1           
## [10] rpart_4.1-15        mgcv_1.8-31         DBI_1.1.0          
## [13] colorspace_1.4-1    nnet_7.3-14         withr_2.2.0        
## [16] tidyselect_1.0.0    gridExtra_2.3       mnormt_1.5-7       
## [19] compiler_3.6.3      cli_2.0.2           htmlTable_1.13.3   
## [22] labeling_0.3        scales_1.1.0        checkmate_2.0.0    
## [25] digest_0.6.25       foreign_0.8-76      rmarkdown_2.1      
## [28] base64enc_0.1-3     jpeg_0.1-8.1        pkgconfig_2.0.3    
## [31] htmltools_0.4.0     dbplyr_1.4.3        htmlwidgets_1.5.1  
## [34] rlang_0.4.6         readxl_1.3.1        rstudioapi_0.11    
## [37] farver_2.0.3        generics_0.0.2      jsonlite_1.6.1     
## [40] crosstalk_1.1.0.1   gtools_3.8.2        acepack_1.4.1      
## [43] Rcpp_1.0.4.6        munsell_0.5.0       fansi_0.4.1        
## [46] lifecycle_0.2.0     stringi_1.4.6       yaml_2.2.1         
## [49] grid_3.6.3          parallel_3.6.3      crayon_1.3.4       
## [52] haven_2.2.0         splines_3.6.3       hms_0.5.3          
## [55] knitr_1.28          pillar_1.4.4        reprex_0.3.0       
## [58] glue_1.4.0          evaluate_0.14       latticeExtra_0.6-29
## [61] data.table_1.12.8   modelr_0.1.7        png_0.1-7          
## [64] vctrs_0.2.4         cellranger_1.1.0    gtable_0.3.0       
## [67] xfun_0.13           broom_0.5.6         cluster_2.1.0      
## [70] ellipsis_0.3.0

The sex distribution of comedians over time

Emil O. W. Kirkegaard

Packages and settings

Load data

Sex ascertainment

Nationality/Ethnicity

Distributions

Temporal trends

Alumni rates

Output data for reuse