Load packages and set settings.
library(pacman)
#may need this
#devtools::install_version("bisoreg",version="1.5")
p_load(kirkegaard, readr, rvest, dplyr, WikipediR, bisoreg, curry, DT)
options(digits = 2)
source("R/functions.R")
Load the scraped data.
#main data
d = read_rds("data/main.rds")
#US life expectancy
d_le = read_csv("data/us_le.csv")
## Parsed with column specification:
## cols(
## Year = col_double(),
## M = col_double(),
## F = col_double()
## )
#plurality score
d$male_cat = round(d$male_cat_frac)
d$male_pronoun = round(d$male_pron_frac)
#do methods agree?
d %$% table(male_cat, male_pronoun)
## male_pronoun
## male_cat 0 1
## 0 268 0
## 1 0 1029
#total cases with data for both
d %$% table(male_cat, male_pronoun) %>% sum
## [1] 1297
#assign sex as factor
d$sex = sex_determiner(d$male_cat_frac, d$male_pron_frac)
d$sex_num = (as.numeric(d$sex)-2)*-1
#final distribution
table2(d$sex)
## # A tibble: 3 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 M 1113 79.0
## 2 F 295 21.0
## 3 <NA> 0 0
#load demographics data
demographics = d[c("jewish", "american", "british", "canadian", "australian", "new_zealandian", "scandinavian")]
#nationalities
colMeans(demographics) %>% round(2)
## jewish american british canadian australian
## 0.17 0.69 0.18 0.06 0.03
## new_zealandian scandinavian
## 0.00 0.00
#double counting?
colMeans(demographics[-1]) %>% sum
## [1] 0.97
table2(rowSums(demographics[-1]))
## # A tibble: 5 x 3
## Group Count Percent
## <chr> <dbl> <dbl>
## 1 1 1277 90.7
## 2 0 87 6.18
## 3 2 42 2.98
## 4 3 2 0.142
## 5 <NA> 0 0
#proportions
colMeans(demographics[-1]) / colMeans(demographics[-1])[2] #UK
## american british canadian australian new_zealandian
## 3.820 1.000 0.332 0.148 0.027
## scandinavian
## 0.012
64.1 / 318.9 #UK
## [1] 0.2
35.16 / 318.9 #CAN
## [1] 0.11
23.13 / 318.9 #AUS
## [1] 0.073
4.471 / 318.9 #NZ
## [1] 0.014
#jews by sex
table(jewish = d$jewish, sex = d$sex) %>% prop.table(margin = 2)
## sex
## jewish M F
## 0 0.83 0.85
## 1 0.17 0.15
#jews by nation
table(jewish = d$jewish, nation = d$american) %>% prop.table(margin = 1)
## nation
## jewish 0 1
## 0 0.35 0.65
## 1 0.11 0.89
#year of birth dist
GG_denhist(d$born, vline = NULL) +
scale_y_continuous(breaks = NULL) +
scale_x_continuous("Year of birth")
GG_save("figures/birth_dist.png")
#stats
miss_count(d$born, reverse = T)
## [1] 1335
#year of birth for those who are dead
d %>% dplyr::filter(!is.na(age_at_death)) %$% averages(born)
## arithmetic geometric harmonic mode median trimmed midrange
## 1915 1915 1915 1924 1918 1916 1795
#age at death
GG_denhist(d$age_at_death) +
scale_y_continuous(breaks = NULL) +
scale_x_continuous("Age at death")
GG_save("figures/death_dist.png")
#cases
miss_count(d$age_at_death, reverse = T)
## [1] 305
#americans
mean(d$american)
## [1] 0.69
#find optimal span
set.seed(1)
loess_fits = list(
male_pct = loess.wrapper(x = d$born, y = d$sex_num),
age_death = d %>% dplyr::filter(!is.na(age_at_death)) %$% loess.wrapper(x = born, y = age_at_death),
alumni = loess.wrapper(x = d$born, y = d$alumni)
)
#male%
#span = optimal_loess$pars$span
ggplot(d, aes(born, sex_num)) +
geom_smooth(method = "loess", span = loess_fits$male_pct$pars$span) +
scale_y_continuous(labels = scales::percent, limits = c(0, 1), name = "Male comedians%") +
scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
theme_bw()
GG_save("figures/male_pct_loess.png")
#calculations
d %>% dplyr::filter(is_between(born, 1880, 1910)) %$% wtd_mean(sex_num)
## [1] 0.9
d %>% dplyr::filter(is_between(born, 1980, 1990)) %$% wtd_mean(sex_num)
## [1] 0.61
(comedian_male_pct_1915 = predict(loess_fits$male_pct, newdata = data.frame(x = 1915)))
## 1
## 0.86
averages(d$sex_num)
## arithmetic geometric harmonic mode median trimmed midrange
## 0.79 0.00 0.00 1.00 1.00 0.86 0.50
#age at death
ggplot(d, aes(born, age_at_death)) +
geom_smooth(method = "loess", span = loess_fits$age_death$pars$span) +
scale_y_continuous(name = "Age at death") +
scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
theme_bw()
GG_save("figures/age_death.png")
#averages
averages(d$age_at_death)
## arithmetic geometric harmonic mode median trimmed midrange
## 69 67 64 78 71 70 62
#specific time prediction
(pred_age_death_1915 = predict(loess_fits$age_death, newdata = data.frame(x = 1915)))
## 1
## 79
#calculations
(comedian_sex_bias = 4 * (1- comedian_male_pct_1915))
## 1
## 0.58
(general_sex_bias = 4 * .5)
## [1] 2
(adjusted_comedian_le = pred_age_death_1915 - comedian_sex_bias)
## 1
## 79
(adjusted_general_le = 66:71 - general_sex_bias)
## [1] 64 65 66 67 68 69
(comedian_le_advantage = adjusted_comedian_le - adjusted_general_le)
## [1] 14.6 13.6 12.6 11.6 10.6 9.6
(expected_le_due_to_g = .12 * 2 * 15)
## [1] 3.6
(1 - (comedian_le_advantage - expected_le_due_to_g)/comedian_le_advantage)
## [1] 0.25 0.27 0.29 0.31 0.34 0.38
#uni
ggplot(d, aes(born, university)) +
geom_smooth() +
scale_y_continuous(name = "Attended university %", labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
theme_bw()
GG_save("figures/university.png")
#phd
ggplot(d, aes(born, phd)) +
geom_smooth() +
scale_y_continuous(name = "Has a Ph.D. %", labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
theme_bw()
GG_save("figures/phd.png")
#main trend
ggplot(d, aes(born, alumni)) +
geom_smooth(method = "loess", span = loess_fits$alumni$pars$span) +
scale_y_continuous(name = "Graduated from college or university", labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_smooth).
GG_save("figures/alumni.png")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_smooth).
#correlate predictions
cor.test(
predict(loess_fits$male_pct, newdata = data.frame(x = 1880:1980)) %>% detrend,
predict(loess_fits$alumni, newdata = data.frame(x = 1880:1980)) %>% detrend
)
##
## Pearson's product-moment correlation
##
## data: predict(loess_fits$male_pct, newdata = data.frame(x = 1880:1980)) %>% and predict(loess_fits$alumni, newdata = data.frame(x = 1880:1980)) %>% detrend and detrend
## t = 7, df = 99, p-value = 5e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.42 0.69
## sample estimates:
## cor
## 0.57
#make residuals
resids = data_frame(male_pct = predict(loess_fits$male_pct, newdata = data.frame(x = 1880:1980)) %>% detrend,
alumni = predict(loess_fits$alumni, newdata = data.frame(x = 1880:1980)) %>% detrend,
born = 1880:1980
)
## Warning: `data_frame()` is deprecated as of tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#plot
resids %>%
#long form
tidyr::gather(key = variable, value = value, male_pct, alumni) %>%
ggplot(aes(born, value, color = variable)) +
geom_smooth() +
xlab("Year of birth") + ylab("Standardized residual from linear regression") +
theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#alumni trend by sex
ggplot(d, aes(born, alumni, color = sex)) +
geom_smooth(method = "loess", span = loess_fits$alumni$pars$span) +
scale_y_continuous(name = "Graduated from college or university", labels = scales::percent, limits = c(0, 1)) +
scale_x_continuous(name = "Year of birth", limits = c(1880, NA), breaks = seq(0, 2000, 10)) +
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_smooth).
GG_save("figures/alumni_sex.png")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 84 rows containing non-finite values (stat_smooth).
## Warning: Removed 7 rows containing missing values (geom_smooth).
Or just for your amusement.
#print nicely but only the first few columns
d %>% DT::datatable() %>% formatRound(names(d)[map_lgl(d, ~!is.integer(.) & is.numeric(.))], digits=3)
#data to disk
write_rds(d, "data/main_reuse.rds")
write_csv(d, "data/main_reuse.csv")
#versions
write_sessioninfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] DT_0.13 curry_0.1.1 bisoreg_1.5 R2WinBUGS_2.1-21
## [5] boot_1.3-25 coda_0.19-3 monreg_0.1.4 bootstrap_2019.6
## [9] WikipediR_1.5.0 rvest_0.3.5 xml2_1.3.2 kirkegaard_2018.05
## [13] metafor_2.4-0 Matrix_1.2-18 psych_1.9.12.31 magrittr_1.5
## [17] assertthat_0.2.1 weights_1.0.1 mice_3.8.0 gdata_2.18.0
## [21] Hmisc_4.4-0 Formula_1.2-3 survival_3.1-12 lattice_0.20-41
## [25] forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.4
## [29] readr_1.3.1 tidyr_1.0.3 tibble_3.0.1 ggplot2_3.3.0
## [33] tidyverse_1.3.0 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] nlme_3.1-147 fs_1.4.1 lubridate_1.7.8
## [4] RColorBrewer_1.1-2 httr_1.4.1 tools_3.6.3
## [7] backports_1.1.6 utf8_1.1.4 R6_2.4.1
## [10] rpart_4.1-15 mgcv_1.8-31 DBI_1.1.0
## [13] colorspace_1.4-1 nnet_7.3-14 withr_2.2.0
## [16] tidyselect_1.0.0 gridExtra_2.3 mnormt_1.5-7
## [19] compiler_3.6.3 cli_2.0.2 htmlTable_1.13.3
## [22] labeling_0.3 scales_1.1.0 checkmate_2.0.0
## [25] digest_0.6.25 foreign_0.8-76 rmarkdown_2.1
## [28] base64enc_0.1-3 jpeg_0.1-8.1 pkgconfig_2.0.3
## [31] htmltools_0.4.0 dbplyr_1.4.3 htmlwidgets_1.5.1
## [34] rlang_0.4.6 readxl_1.3.1 rstudioapi_0.11
## [37] farver_2.0.3 generics_0.0.2 jsonlite_1.6.1
## [40] crosstalk_1.1.0.1 gtools_3.8.2 acepack_1.4.1
## [43] Rcpp_1.0.4.6 munsell_0.5.0 fansi_0.4.1
## [46] lifecycle_0.2.0 stringi_1.4.6 yaml_2.2.1
## [49] grid_3.6.3 parallel_3.6.3 crayon_1.3.4
## [52] haven_2.2.0 splines_3.6.3 hms_0.5.3
## [55] knitr_1.28 pillar_1.4.4 reprex_0.3.0
## [58] glue_1.4.0 evaluate_0.14 latticeExtra_0.6-29
## [61] data.table_1.12.8 modelr_0.1.7 png_0.1-7
## [64] vctrs_0.2.4 cellranger_1.1.0 gtable_0.3.0
## [67] xfun_0.13 broom_0.5.6 cluster_2.1.0
## [70] ellipsis_0.3.0