fertile <- pooled |>
mutate(age_band = cut(AGEP,
breaks = c(14, 19, 24, 29, 34, 39, 44, 50),
labels = c("15-19","20-24","25-29","30-34","35-39","40-44","45-50"),
right = TRUE))
big_occs <- fertile |>
group_by(harm) |>
summarise(n_births = sum(birth), .groups = "drop") |>
filter(n_births >= 50) |>
pull(harm)
model_data <- fertile |> filter(harm %in% big_occs)
cat(sprintf("Total fertile women 15-50: %d\n", nrow(fertile)))
## Total fertile women 15-50: 5476050
cat(sprintf("Occupations with >=50 pooled births: %d\n", length(big_occs)))
## Occupations with >=50 pooled births: 311
cat(sprintf("Sample size in those occupations: %d (births = %d)\n",
nrow(model_data), sum(model_data$birth)))
## Sample size in those occupations: 5476050 (births = 294492)
Direct age-standardization on (age_band × wave) cells. For each occupation \(O\),
\[\widehat{RR}_O = \frac{\sum_{c \in S_O} \tilde w_c \, \hat p_{cO}} {\sum_{c \in S_O} \tilde w_c \, \hat p_{c\cdot}}\]
with \(c\) indexing (age_band × wave) cells, \(\tilde w_c\) = the cell’s share of the overall fertile-sample weighted population, renormalized within each occupation’s observed cells.
joint_wts <- model_data |>
group_by(wave, age_band) |>
summarise(w_cell = sum(PWGTP), .groups = "drop") |>
mutate(w_cell = w_cell / sum(w_cell))
cell_overall <- model_data |>
group_by(wave, age_band) |>
summarise(births_w = sum(birth * PWGTP),
pop_w = sum(PWGTP),
.groups = "drop") |>
mutate(p_cell = births_w / pop_w)
cell_pooled <- model_data |>
group_by(harm, wave, age_band) |>
summarise(births_w = sum(birth * PWGTP),
pop_w = sum(PWGTP),
births_un = sum(birth),
.groups = "drop") |>
mutate(p_cellO = births_w / pop_w)
dsrr_pool <- cell_pooled |>
left_join(joint_wts, by = c("wave","age_band")) |>
left_join(cell_overall |> select(wave, age_band, p_cell), by = c("wave","age_band")) |>
group_by(harm) |>
mutate(w_norm = w_cell / sum(w_cell)) |>
summarise(dsr_occ = sum(w_norm * p_cellO),
dsr_ref = sum(w_norm * p_cell),
var_log_dsr = sum((w_norm * p_cellO)^2 / pmax(births_un, 1)) /
(sum(w_norm * p_cellO))^2,
births_un_total = sum(births_un),
n_cells = dplyr::n(),
.groups = "drop") |>
mutate(rr = dsr_occ / dsr_ref, log_rr = log(rr), se_log = sqrt(var_log_dsr),
rr_lo = exp(log_rr - 1.96 * se_log),
rr_hi = exp(log_rr + 1.96 * se_log)) |>
left_join(occp_labels, by = "harm") |>
arrange(desc(rr))
write_csv(
dsrr_pool |> transmute(OCCP = harm, occupation, rr = round(rr, 4),
rr_lo = round(rr_lo, 4), rr_hi = round(rr_hi, 4),
se_log = round(se_log, 4),
n_births = births_un_total, n_cells),
"data/fertility_rr_by_occupation_age_std.csv")
wave_age <- cell_pooled |>
group_by(wave, age_band) |>
summarise(pop_w = sum(pop_w), births_w = sum(births_w), .groups = "drop") |>
group_by(wave) |>
mutate(w_band = pop_w / sum(pop_w), p_b = births_w / pop_w) |>
ungroup()
dsrr_wave <- cell_pooled |>
left_join(wave_age |> select(wave, age_band, w_band, p_b),
by = c("wave","age_band")) |>
group_by(harm, wave) |>
mutate(w_norm = w_band / sum(w_band)) |>
summarise(dsr_occ = sum(w_norm * p_cellO),
dsr_ref = sum(w_norm * p_b),
var_log_dsr = sum((w_norm * p_cellO)^2 / pmax(births_un, 1)) /
(sum(w_norm * p_cellO))^2,
births_un_total = sum(births_un),
.groups = "drop") |>
mutate(rr = dsr_occ / dsr_ref, log_rr = log(rr), se_log = sqrt(var_log_dsr),
rr_lo = exp(log_rr - 1.96 * se_log),
rr_hi = exp(log_rr + 1.96 * se_log)) |>
left_join(occp_labels, by = "harm")
dsrr_wave_keep <- dsrr_wave |> filter(births_un_total >= 50)
plot_top_bot <- function(df, title_str, file_out, w = 12, h = 12, color = "steelblue") {
top25 <- df |> filter(births_un_total >= 50) |> arrange(desc(rr)) |> head(25)
bot25 <- df |> filter(births_un_total >= 50) |> arrange(rr) |> head(25)
tb <- bind_rows(top25 |> mutate(grp = "Top 25"),
bot25 |> mutate(grp = "Bottom 25")) |>
mutate(occupation = str_trunc(occupation, 55),
occupation = fct_reorder(occupation, rr),
grp = factor(grp, levels = c("Top 25","Bottom 25")))
p <- ggplot(tb, aes(rr, occupation)) +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
geom_errorbar(aes(xmin = rr_lo, xmax = rr_hi),
orientation = "y", width = 0.25, color = "grey40") +
geom_point(size = 2, color = color) +
scale_x_continuous(trans = "log2",
breaks = c(0.4,0.5,0.67,0.8,1,1.25,1.5,2,3)) +
facet_wrap(~ grp, scales = "free_y", ncol = 1) +
labs(title = title_str, x = "RR vs overall (log scale)", y = NULL,
caption = "Bars = 95% CIs. Occupations with >=50 births in window.") +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 8),
strip.text = element_text(face = "bold", hjust = 0))
ggsave(file_out, p, width = w, height = h, dpi = 150)
p
}
plot_top_bot(dsrr_pool,
"Age-standardized birth-rate ratio by occupation (pooled 2013-2022)",
"figs/fertility_rr_pooled.png")
plot_top_bot(dsrr_wave_keep |> filter(wave == "2013-2017"),
"Age-standardized RR — ACS 5yr PUMS 2013-2017",
"figs/fertility_rr_wave1_top_bot.png")
plot_top_bot(dsrr_wave_keep |> filter(wave == "2018-2022"),
"Age-standardized RR — ACS 5yr PUMS 2018-2022",
"figs/fertility_rr_wave2_top_bot.png")
wide <- dsrr_wave_keep |>
select(harm, occupation, wave, rr, log_rr, se_log, births_un_total) |>
pivot_wider(id_cols = c(harm, occupation),
names_from = wave,
values_from = c(rr, log_rr, se_log, births_un_total))
names(wide) <- gsub("-", "_", names(wide))
ov <- wide |> filter(!is.na(rr_2013_2017), !is.na(rr_2018_2022)) |>
mutate(d_log = log_rr_2018_2022 - log_rr_2013_2017,
min_births = pmin(births_un_total_2013_2017, births_un_total_2018_2022))
sp_r <- cor(ov$rr_2013_2017, ov$rr_2018_2022, method = "spearman")
log_r <- cor(ov$log_rr_2013_2017, ov$log_rr_2018_2022)
rel1 <- 1 - mean(ov$se_log_2013_2017^2) / var(ov$log_rr_2013_2017)
rel2 <- 1 - mean(ov$se_log_2018_2022^2) / var(ov$log_rr_2018_2022)
r_dis <- log_r / sqrt(rel1 * rel2)
movers <- ov |> filter(min_births >= 100) |> slice_max(abs(d_log), n = 12)
p_stab <- ggplot(ov, aes(rr_2013_2017, rr_2018_2022)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
geom_smooth(method = "lm", se = TRUE, color = "tomato", linewidth = 0.7) +
geom_point(aes(size = min_births), color = "steelblue", alpha = 0.5) +
geom_text_repel(data = movers, aes(label = str_trunc(occupation, 30)),
size = 2.7, min.segment.length = 0, max.overlaps = 30) +
scale_x_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
scale_size_continuous(range = c(1, 5), guide = "none") +
coord_fixed() +
labs(title = "Wave 1 vs Wave 2 — age-standardized fertility RR",
x = "RR (ACS 2013-2017)", y = "RR (ACS 2018-2022)",
caption = sprintf("Spearman %.2f | log-Pearson %.2f | disattenuated %.2f | n=%d overlap",
sp_r, log_r, r_dis, nrow(ov))) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_wave1_vs_wave2.png", p_stab,
width = 8.5, height = 8, dpi = 150)
sc_df <- dsrr_pool |>
inner_join(pct_female |> select(harm, pct_female), by = "harm")
cor_p <- cor(sc_df$log_rr, sc_df$pct_female)
extremes <- bind_rows(
sc_df |> arrange(desc(rr)) |> head(8),
sc_df |> arrange(rr) |> head(8),
sc_df |> arrange(pct_female) |> head(4),
sc_df |> arrange(desc(pct_female)) |> head(4)
) |> distinct(harm, .keep_all = TRUE)
p_pf <- ggplot(sc_df, aes(pct_female, rr)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey50") +
geom_point(aes(size = births_un_total), color = "steelblue", alpha = 0.55) +
geom_smooth(method = "lm", color = "tomato", linewidth = 0.7) +
geom_text_repel(data = extremes, aes(label = str_trunc(occupation, 30)),
size = 2.7, min.segment.length = 0, max.overlaps = 30) +
scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.5,2)) +
scale_size_continuous(range = c(1.2, 5.5), guide = "none") +
labs(title = "Age-standardized fertility RR vs % female in occupation",
x = "% female in occupation",
y = "RR (log scale)",
caption = sprintf("Pearson on log-RR = %.2f | n = %d", cor_p, nrow(sc_df))) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_vs_pct_female.png", p_pf,
width = 9, height = 6.5, dpi = 150)
plot_df <- dsrr_pool |> inner_join(job_chars, by = "harm")
trait_levels <- c("Realistic","Investigative","Artistic","Social","Enterprising",
"Conventional","things_people","data_ideas","job_zone")
trait_labels <- c("Realistic","Investigative","Artistic","Social","Enterprising",
"Conventional","Things ↔ People (Prediger)",
"Data ↔ Ideas (Prediger)","Complexity (Job Zone)")
long_df <- plot_df |>
pivot_longer(all_of(trait_levels), names_to = "var", values_to = "value") |>
mutate(trait = factor(var, levels = trait_levels, labels = trait_labels))
spline_test <- function(d) {
d <- d |> drop_na(value)
ml <- lm(log_rr ~ value, data = d)
ms <- lm(log_rr ~ ns(value, df = 3), data = d)
ge <- summary(ms)$r.squared - summary(ml)$r.squared
pv <- anova(ml, ms)$`Pr(>F)`[2]
tibble(p = pv, gain = ge, use_spline = (pv < 0.05) & (ge > 0.02))
}
tests <- long_df |> group_by(var) |> group_modify(~ spline_test(.x))
spline_vars <- tests |> filter(use_spline) |> pull(var)
cors <- long_df |> group_by(var, trait) |>
summarise(cor_log = cor(log_rr, value, use = "complete.obs"),
cor_sp = cor(rr, value, method = "spearman", use = "complete.obs"),
.groups = "drop") |>
mutate(label = sprintf("r = %.2f ρ = %.2f%s", cor_log, cor_sp,
ifelse(var %in% spline_vars, " (spline)", " (linear)")))
fit_one <- function(d, v) {
fit <- if (v %in% spline_vars) lm(log_rr ~ ns(value, df = 3), data = d)
else lm(log_rr ~ value, data = d)
newv <- seq(min(d$value, na.rm = TRUE), max(d$value, na.rm = TRUE), length.out = 80)
tibble(value = newv,
rr_pred = exp(predict(fit, newdata = tibble(value = newv))))
}
curves <- long_df |> group_by(var, trait) |> group_modify(~ fit_one(.x, .y$var))
p_jc <- ggplot(long_df, aes(value, rr)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey60") +
geom_point(aes(size = births_un_total), color = "steelblue", alpha = 0.45) +
geom_line(data = curves, aes(value, rr_pred), color = "tomato", linewidth = 0.8) +
geom_text(data = cors, aes(x = -Inf, y = Inf, label = label),
hjust = -0.05, vjust = 1.4, size = 3, inherit.aes = FALSE) +
facet_wrap(~ trait, scales = "free_x", ncol = 3) +
scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.5,2)) +
scale_size_continuous(range = c(0.8, 4), guide = "none") +
labs(title = "Fertility RR vs O*NET job characteristics",
x = "Trait score", y = "RR (log scale)",
caption = "Spline fit only where marginal p<0.05 AND R² gain >2%; linear elsewhere.") +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"),
strip.text = element_text(face = "bold"))
ggsave("figs/fertility_rr_vs_job_chars.png", p_jc,
width = 11, height = 10, dpi = 150)
n_w <- pooled |> filter(!is.na(harm)) |>
group_by(harm) |> summarise(n_women_w = sum(PWGTP), .groups = "drop")
big50 <- dsrr_pool |> inner_join(n_w, by = "harm") |>
arrange(desc(n_women_w)) |> head(50) |>
mutate(occ_label = str_trunc(occupation, 55),
occ_label = fct_reorder(occ_label, rr))
share <- sum(big50$n_women_w) / sum(n_w$n_women_w)
p_50 <- ggplot(big50, aes(rr, occ_label)) +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
geom_errorbar(aes(xmin = rr_lo, xmax = rr_hi),
orientation = "y", width = 0.25, color = "grey40") +
geom_point(aes(size = n_women_w / 1e6), color = "steelblue") +
scale_x_continuous(trans = "log2", breaks = c(0.7,0.8,0.9,1,1.1,1.25,1.5)) +
scale_size_continuous(name = "Women (M)", range = c(1.5, 6)) +
labs(title = "Fertility RR for the 50 largest occupations by female workforce",
subtitle = sprintf("These 50 occupations employ %.0f%% of US women 15-50 in the labor force",
100 * share),
x = "RR (log scale)", y = NULL) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_top50_largest.png", p_50,
width = 11, height = 11, dpi = 150)
hier_rr <- dsrr_pool |>
left_join(occp_prefix, by = "harm") |>
left_join(n_w, by = "harm") |>
mutate(major_group = ifelse(is.na(prefix), "Unmapped", prefix_names[prefix])) |>
filter(major_group != "Unmapped")
group_summary_w <- hier_rr |>
group_by(major_group) |>
summarise(n_occ = dplyr::n(),
mean_log_rr = weighted.mean(log_rr, n_women_w),
sd_log = sqrt(sum(n_women_w * (log_rr - mean_log_rr)^2) / sum(n_women_w)),
n_eff = sum(n_women_w)^2 / sum(n_women_w^2),
se_mean = sd_log / sqrt(pmax(n_eff, 1)),
mean_rr = exp(mean_log_rr),
mean_lo = exp(mean_log_rr - 1.96 * se_mean),
mean_hi = exp(mean_log_rr + 1.96 * se_mean),
.groups = "drop") |>
arrange(mean_log_rr) |>
mutate(major_group_o = factor(major_group, levels = major_group))
hier_rr <- hier_rr |>
mutate(major_group_o = factor(major_group, levels = levels(group_summary_w$major_group_o)))
p_mg <- ggplot() +
geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
geom_jitter(data = hier_rr,
aes(rr, major_group_o, size = n_women_w / 1e6),
height = 0.18, alpha = 0.4, color = "steelblue") +
geom_errorbar(data = group_summary_w,
aes(y = major_group_o, xmin = mean_lo, xmax = mean_hi),
orientation = "y", width = 0.4, color = "black", linewidth = 0.7) +
geom_point(data = group_summary_w,
aes(mean_rr, major_group_o), color = "black", size = 2.5, shape = 18) +
geom_text(data = group_summary_w,
aes(y = major_group_o, x = 2.0, label = sprintf("n=%d", n_occ)),
hjust = 0, size = 3, color = "grey30") +
scale_x_continuous(trans = "log2",
breaks = c(0.5,0.7,0.85,1,1.2,1.5,2),
limits = c(0.4, 2.3)) +
scale_size_continuous(name = "Women (millions)", range = c(0.5, 4)) +
labs(title = "Fertility RR by major occupational group (women-weighted)",
subtitle = "Blue dots = occupations; black diamond = group mean log-RR",
x = "RR (log scale)", y = NULL) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_major_groups.png", p_mg,
width = 13, height = 8.5, dpi = 150)
verdant_raw <- read_csv("data/verdant_occupations.csv", show_col_types = FALSE)
occp_politics <- read_csv("data/occp_politics.csv", show_col_types = FALSE,
col_types = cols(harm = "c"))
sc_pol <- dsrr_pool |> inner_join(occp_politics |> select(harm, pct_dem, total_donors),
by = "harm")
cor_pol <- cor(sc_pol$log_rr, sc_pol$pct_dem)
ext_pol <- bind_rows(
sc_pol |> arrange(desc(rr)) |> head(6),
sc_pol |> arrange(rr) |> head(6),
sc_pol |> arrange(pct_dem) |> head(4),
sc_pol |> arrange(desc(pct_dem)) |> head(4)
) |> distinct(harm, .keep_all = TRUE)
p_pol <- ggplot(sc_pol, aes(pct_dem, rr)) +
geom_hline(yintercept = 1, linetype = "dashed", color = "grey50") +
geom_vline(xintercept = 0.5, linetype = "dashed", color = "grey80") +
geom_point(aes(size = births_un_total), color = "steelblue", alpha = 0.55) +
geom_smooth(method = "lm", color = "tomato", linewidth = 0.7) +
geom_text_repel(data = ext_pol, aes(label = str_trunc(occupation, 32)),
size = 2.7, min.segment.length = 0, max.overlaps = 30) +
scale_x_continuous(breaks = seq(0, 1, 0.2),
labels = scales::percent_format(accuracy = 1)) +
scale_y_continuous(trans = "log2", breaks = c(0.5, 0.75, 1, 1.5, 2)) +
scale_size_continuous(range = c(1.2, 5.5), guide = "none") +
labs(title = "Age-standardized fertility RR vs occupation political lean",
subtitle = "% Democratic from Verdant Labs FEC donor data (~2008-2016)",
x = "% Democratic donors in occupation",
y = "RR (log scale)",
caption = sprintf("Pearson on log-RR = %.2f | n = %d occupations", cor_pol, nrow(sc_pol))) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_vs_pct_dem.png", p_pol, width = 9, height = 6.5, dpi = 150)
v_keep <- verdant_raw |>
mutate(total_donors = reps + dems) |>
filter(total_donors >= 100)
tb_v <- bind_rows(
v_keep |> arrange(desc(pct_dem)) |> head(25) |> mutate(grp = "Most Democratic"),
v_keep |> arrange(pct_dem) |> head(25) |> mutate(grp = "Most Republican")
) |> mutate(occupation = str_trunc(occupation, 45),
occupation = fct_reorder(occupation, pct_dem),
grp = factor(grp, levels = c("Most Democratic","Most Republican")))
p_v <- ggplot(tb_v, aes(pct_dem, occupation, fill = pct_dem)) +
geom_vline(xintercept = 0.5, linetype = "dashed", color = "grey50") +
geom_col() +
geom_text(aes(label = sprintf("%.0f%%", 100*pct_dem)),
color = "white", size = 2.7, hjust = 1.2) +
geom_text(aes(label = sprintf("n=%d", total_donors)),
color = "grey20", size = 2.5, hjust = -0.1, x = 1) +
facet_wrap(~ grp, scales = "free_y", ncol = 1) +
scale_fill_gradient2(low = "#b2182b", mid = "white", high = "#2166ac",
midpoint = 0.5, limits = c(0, 1), guide = "none") +
scale_x_continuous(labels = scales::percent_format(accuracy = 1),
breaks = seq(0, 1, 0.2),
limits = c(0, 1.18), expand = c(0, 0)) +
labs(title = "Most Democratic and most Republican occupations (Verdant Labs)",
x = "% Democratic donors", y = NULL) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"),
axis.text.y = element_text(size = 9),
strip.text = element_text(face = "bold", hjust = 0),
panel.grid.major.y = element_blank())
ggsave("figs/verdant_top_bottom_25.png", p_v, width = 11, height = 11, dpi = 150)
For each man in the PUMS, find his cohabiting opposite-sex partner (via RELSHIPP / RELP codes for spouse / unmarried partner of the reference person), and if she is a woman 15-50 with the FER question answered, attribute her FER response to him. This misses non-resident fathers (~30-40% of US births occur to non-cohabiting parents) — those men appear as zero-fertility by construction.
needed_male_22 <- c("SERIALNO","SPORDER","AGEP","SEX","FER","OCCP","PWGTP","RELSHIPP")
needed_male_17 <- c("SERIALNO","SPORDER","AGEP","SEX","FER","OCCP","PWGTP","RELP")
p22 <- read_parquet("data/pums_2022_5yr_person.parquet", col_select = all_of(needed_male_22))
p17 <- read_parquet("data/pums_2017_5yr_person.parquet", col_select = all_of(needed_male_17)) |>
rename(RELSHIPP = RELP)
ref_22 <- p22 |> filter(RELSHIPP == 20L)
prt_22 <- p22 |> filter(RELSHIPP %in% c(21L, 22L))
ref_17 <- p17 |> filter(RELSHIPP == 0L)
prt_17 <- p17 |> filter(RELSHIPP %in% c(1L, 13L))
extract_couple <- function(ref, prt, wave_label) {
ref |> inner_join(prt, by = "SERIALNO", suffix = c("_R","_P")) |>
filter(SEX_R != SEX_P) |>
mutate(male_age = if_else(SEX_R == 1L, AGEP_R, AGEP_P),
male_occp = if_else(SEX_R == 1L, OCCP_R, OCCP_P),
male_wgt = if_else(SEX_R == 1L, PWGTP_R, PWGTP_P),
female_age = if_else(SEX_R == 2L, AGEP_R, AGEP_P),
female_FER = if_else(SEX_R == 2L, FER_R, FER_P),
wave = wave_label)
}
couples <- bind_rows(
extract_couple(ref_17, prt_17, "2013-2017"),
extract_couple(ref_22, prt_22, "2018-2022")
) |>
filter(female_age >= 15, female_age <= 50,
female_FER %in% c("1", "2"),
!is.na(male_occp), male_occp != "bbbb",
male_age >= 15, male_age <= 65) |>
select(wave, male_age, male_occp, male_wgt, female_age, female_FER)
# Harmonize male OCCP to harm code via crosswalk
xw <- read_csv("data/occp_2010_2018_crosswalk.csv", show_col_types = FALSE,
col_types = cols(.default = "c"))
xw_2010 <- xw |> distinct(occ2010, group_id) |> filter(!is.na(occ2010))
xw_2018 <- xw |> distinct(occ2018, group_id) |> filter(!is.na(occ2018))
male_pool <- bind_rows(
couples |> filter(wave == "2013-2017") |>
left_join(xw_2010, by = c("male_occp" = "occ2010")) |>
mutate(harm = coalesce(group_id, male_occp)),
couples |> filter(wave == "2018-2022") |>
left_join(xw_2018, by = c("male_occp" = "occ2018")) |>
mutate(harm = coalesce(group_id, male_occp))
) |>
mutate(birth = as.integer(female_FER == "1"),
age_band = cut(male_age,
breaks = c(14, 19, 24, 29, 34, 39, 44, 49, 54, 65),
labels = c("15-19","20-24","25-29","30-34","35-39",
"40-44","45-49","50-54","55-65"),
right = TRUE))
cat(sprintf("Cohabiting couples linked: %d (births = %d)\n",
nrow(male_pool), sum(male_pool$birth)))
## Cohabiting couples linked: 3127014 (births = 246069)
big_male <- male_pool |>
group_by(harm) |> summarise(n_b = sum(birth), .groups = "drop") |>
filter(n_b >= 50) |> pull(harm)
male_data <- male_pool |> filter(harm %in% big_male)
m_joint <- male_data |>
group_by(wave, age_band) |>
summarise(w_cell = sum(male_wgt), .groups = "drop") |>
mutate(w_cell = w_cell / sum(w_cell))
m_overall <- male_data |>
group_by(wave, age_band) |>
summarise(births_w = sum(birth * male_wgt), pop_w = sum(male_wgt),
.groups = "drop") |>
mutate(p_cell = births_w / pop_w)
m_cell <- male_data |>
group_by(harm, wave, age_band) |>
summarise(births_w = sum(birth * male_wgt),
pop_w = sum(male_wgt),
births_un = sum(birth), .groups = "drop") |>
mutate(p_cellO = births_w / pop_w)
dsrr_male <- m_cell |>
left_join(m_joint, by = c("wave","age_band")) |>
left_join(m_overall |> select(wave, age_band, p_cell), by = c("wave","age_band")) |>
group_by(harm) |>
mutate(w_norm = w_cell / sum(w_cell)) |>
summarise(dsr_occ = sum(w_norm * p_cellO),
dsr_ref = sum(w_norm * p_cell),
var_log_dsr = sum((w_norm * p_cellO)^2 / pmax(births_un, 1)) /
(sum(w_norm * p_cellO))^2,
births_un_total = sum(births_un), .groups = "drop") |>
mutate(rr = dsr_occ / dsr_ref, log_rr = log(rr), se_log = sqrt(var_log_dsr),
rr_lo = exp(log_rr - 1.96 * se_log),
rr_hi = exp(log_rr + 1.96 * se_log)) |>
left_join(dsrr_pool |> select(harm, occupation), by = "harm") |>
arrange(desc(rr))
plot_top_bot(dsrr_male, "Age-standardized birth-rate ratio by MALE occupation (pooled 2013-2022)",
"figs/fertility_rr_male_pooled.png", color = "darkgreen")
both <- dsrr_male |>
select(harm, rr_m = rr, log_rr_m = log_rr, se_log_m = se_log,
births_m = births_un_total) |>
inner_join(dsrr_pool |> select(harm, occupation, rr_f = rr, log_rr_f = log_rr,
se_log_f = se_log, births_f = births_un_total),
by = "harm") |>
mutate(min_births = pmin(births_m, births_f),
d_log = log_rr_m - log_rr_f)
log_r_mf <- cor(both$log_rr_m, both$log_rr_f)
sp_r_mf <- cor(both$rr_m, both$rr_f, method = "spearman")
rel_m_mf <- 1 - mean(both$se_log_m^2) / var(both$log_rr_m)
rel_f_mf <- 1 - mean(both$se_log_f^2) / var(both$log_rr_f)
r_dis_mf <- log_r_mf / sqrt(rel_m_mf * rel_f_mf)
labelers <- bind_rows(
both |> filter(min_births >= 100) |> slice_max(abs(d_log), n = 10),
both |> arrange(desc(rr_f)) |> head(5),
both |> arrange(rr_f) |> head(5)
) |> distinct(harm, .keep_all = TRUE)
p_mf <- ggplot(both, aes(rr_f, rr_m)) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
geom_smooth(method = "lm", se = TRUE, color = "tomato", linewidth = 0.7) +
geom_point(aes(size = min_births), color = "steelblue", alpha = 0.5) +
geom_text_repel(data = labelers, aes(label = str_trunc(occupation, 32)),
size = 2.7, min.segment.length = 0, max.overlaps = 30) +
scale_x_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
scale_size_continuous(range = c(1, 5), guide = "none") +
coord_fixed() +
labs(title = "Male vs Female age-standardized fertility RR",
subtitle = "Male = birth in cohabiting partner's last 12 months, by his occupation",
x = "Female RR (by her occupation)",
y = "Male RR (by his occupation, cohabiting couples only)",
caption = sprintf("Spearman %.2f | log-Pearson %.2f | disattenuated %.2f | n=%d overlap",
sp_r_mf, log_r_mf, r_dis_mf, nrow(both))) +
theme_bw(base_size = 11) +
theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_male_vs_female.png", p_mf, width = 9, height = 8, dpi = 150)
For each of female and male:
(1 | major_group), weights = 1/se² — partial pooling within Census major groups# Female regression frame on 0-1 scale + politics
reg_f <- dsrr_pool |>
inner_join(job_chars, by = "harm") |>
inner_join(pct_female |> select(harm, pct_female), by = "harm") |>
inner_join(occp_politics |> select(harm, pct_dem, total_donors), by = "harm") |>
filter(!is.na(Realistic), !is.na(job_zone), !is.na(pct_female)) |>
mutate(across(c(Realistic, Investigative, Artistic, Social, Enterprising, Conventional),
~ (.x - 1) / 6),
job_zone = (job_zone - 1) / 4,
w = 1 / se_log^2,
pct_dem_se = sqrt(pct_dem * (1 - pct_dem) / total_donors)) |>
left_join(occp_prefix, by = "harm") |>
mutate(major_group = ifelse(is.na(prefix), "Unmapped", prefix_names[prefix]))
# Male regression frame
reg_m <- dsrr_male |>
filter(births_un_total >= 50) |>
inner_join(job_chars, by = "harm") |>
inner_join(pct_female |> select(harm, pct_female), by = "harm") |>
inner_join(occp_politics |> select(harm, pct_dem, total_donors), by = "harm") |>
filter(!is.na(Realistic), !is.na(job_zone), !is.na(pct_female)) |>
mutate(across(c(Realistic, Investigative, Artistic, Social, Enterprising, Conventional),
~ (.x - 1) / 6),
job_zone = (job_zone - 1) / 4,
w = 1 / se_log^2,
pct_dem_se = sqrt(pct_dem * (1 - pct_dem) / total_donors)) |>
left_join(occp_prefix, by = "harm") |>
mutate(major_group = ifelse(is.na(prefix), "Unmapped", prefix_names[prefix]))
f_pol <- log_rr ~ Realistic + Investigative + Artistic + Social +
Enterprising + Conventional + pct_female + job_zone + pct_dem
fgls_fit <- function(formula, data, se_vec, max_iter = 30, tol = 1e-6) {
tau2 <- 0
for (i in seq_len(max_iter)) {
wts <- 1 / (se_vec^2 + tau2)
data$wts_ <- wts
fit <- lm(formula, data = data, weights = wts_)
res <- residuals(fit)
df <- df.residual(fit)
Q <- sum(wts * res^2)
s_w <- sum(wts); s_w2 <- sum(wts^2)
new_tau2 <- max(0, (Q - df) / (s_w - s_w2/s_w))
if (abs(new_tau2 - tau2) < tol) break
tau2 <- new_tau2
}
list(fit = fit, tau2 = tau2)
}
simex_run <- function(reg, y_se, x_se, formula,
n_sim = 1000, lambdas = c(0, 0.5, 1.0, 1.5, 2.0),
seed = 1, n_boot = 100) {
set.seed(seed)
predictors <- attr(terms(formula), "term.labels")
n <- nrow(reg)
inner <- function(rb, yb, xb) {
avg <- matrix(NA_real_, length(lambdas), length(predictors) + 1)
r2s <- numeric(length(lambdas))
for (li in seq_along(lambdas)) {
lam <- lambdas[li]
coefs_acc <- 0; r2_acc <- 0
for (s in seq_len(n_sim)) {
d2 <- rb
if (lam > 0) {
d2$log_rr <- rb$log_rr + rnorm(n, 0, sqrt(lam) * yb)
d2$pct_dem <- pmin(pmax(rb$pct_dem + rnorm(n, 0, sqrt(lam) * xb), 0), 1)
}
fit <- lm(formula, data = d2)
coefs_acc <- coefs_acc + coef(fit)
r2_acc <- r2_acc + summary(fit)$r.squared
}
avg[li, ] <- coefs_acc / n_sim
r2s[li] <- r2_acc / n_sim
}
extr <- sapply(seq_len(ncol(avg)), \(j) {
predict(lm(avg[, j] ~ lambdas), newdata = data.frame(lambdas = -1))
})
r2_extr <- predict(lm(r2s ~ lambdas), newdata = data.frame(lambdas = -1))
list(beta = extr, r2 = r2_extr)
}
pt <- inner(reg, y_se, x_se)
boot <- matrix(NA_real_, n_boot, length(pt$beta))
for (b in seq_len(n_boot)) {
idx <- sample.int(n, n, replace = TRUE)
boot[b, ] <- inner(reg[idx, ], y_se[idx], x_se[idx])$beta
}
se <- apply(boot, 2, sd)
names(pt$beta) <- names(se) <- c("(Intercept)", predictors)
list(beta = pt$beta, se = se, r2 = pt$r2)
}
fmt_cell <- function(b, se, p) {
stars <- ifelse(p < 0.005, "***",
ifelse(p < 0.01, "**",
ifelse(p < 0.05, "*", "")))
sprintf("%.3f (%.3f)%s", b, se, stars)
}
row_one <- function(model, predictor) {
co <- if (inherits(model, "lmerMod")) {
s <- summary(model); m <- s$coefficients
cbind(m, "Pr(>|t|)" = 2 * pnorm(-abs(m[, "t value"])))
} else summary(model)$coef
if (!(predictor %in% rownames(co))) return("--")
fmt_cell(co[predictor, "Estimate"], co[predictor, "Std. Error"],
co[predictor, "Pr(>|t|)"])
}
row_simex <- function(sim, predictor) {
if (!(predictor %in% names(sim$beta))) return("--")
b <- sim$beta[predictor]; se <- sim$se[predictor]
p <- 2 * pnorm(-abs(b / se))
fmt_cell(b, se, p)
}
preds <- c("Realistic","Investigative","Artistic","Social","Enterprising",
"Conventional","pct_female","job_zone","pct_dem")
pretty <- c("Realistic","Investigative","Artistic","Social","Enterprising",
"Conventional","% female","Complexity (Job Zone)","% Democratic")
m_ols_f <- lm(f_pol, data = reg_f)
m_wls_f <- lm(f_pol, data = reg_f, weights = w)
m_re_f <- fgls_fit(f_pol, reg_f, reg_f$se_log)
m_lmer_f <- lmer(update(f_pol, ~ . + (1 | major_group)), data = reg_f, weights = w,
control = lmerControl(check.conv.singular = .makeCC("ignore", tol = 1e-4)))
sim_f <- simex_run(reg_f, reg_f$se_log, reg_f$pct_dem_se, f_pol)
m_ols_m <- lm(f_pol, data = reg_m)
m_wls_m <- lm(f_pol, data = reg_m, weights = w)
m_re_m <- fgls_fit(f_pol, reg_m, reg_m$se_log)
m_lmer_m <- lmer(update(f_pol, ~ . + (1 | major_group)), data = reg_m, weights = w,
control = lmerControl(check.conv.singular = .makeCC("ignore", tol = 1e-4)))
sim_m <- simex_run(reg_m, reg_m$se_log, reg_m$pct_dem_se, f_pol)
build_tab5 <- function(m_ols, m_re, m_wls, m_lmer, sim, n_obs, tau2, title, sub, outfile) {
body <- tibble(
Predictor = pretty,
OLS = sapply(preds, \(p) row_one(m_ols, p)),
FGLS = sapply(preds, \(p) row_one(m_re, p)),
WLS = sapply(preds, \(p) row_one(m_wls, p)),
HLM = sapply(preds, \(p) row_one(m_lmer, p)),
SIMEX = sapply(preds, \(p) row_simex(sim, p))
)
R2 <- function(m) sprintf("%.3f", summary(m)$r.squared)
Adj <- function(m) sprintf("%.3f", summary(m)$adj.r.squared)
r2_row <- tibble(Predictor = "R²",
OLS = R2(m_ols), FGLS = R2(m_re), WLS = R2(m_wls),
HLM = "—", SIMEX = sprintf("%.3f", sim$r2))
adj_row <- tibble(Predictor = "Adj-R²",
OLS = Adj(m_ols), FGLS = Adj(m_re), WLS = Adj(m_wls),
HLM = "—", SIMEX = "—")
tab_df <- bind_rows(body, r2_row, adj_row)
footer <- sprintf(
"n = %d. FGLS τ² = %.4f. HLM: lmer(... + (1|major_group), weights=1/se²); ICC = 0 (no residual group-level variance). SIMEX: n_sim=1000, linear extrap, n_boot=100; joint noise on log_rr AND pct_dem extrapolated to λ=-1.",
n_obs, tau2)
g <- tab_df |> gt() |>
cols_label(OLS = "OLS", FGLS = "FGLS", WLS = "WLS", HLM = "HLM", SIMEX = "SIMEX") |>
tab_header(title = title, subtitle = sub) |>
tab_style(style = list(cell_borders(sides = "top", weight = px(1.5))),
locations = cells_body(rows = which(tab_df$Predictor == "R²"))) |>
tab_style(style = list(cell_text(weight = "bold")),
locations = cells_body(rows = which(tab_df$Predictor %in% c("R²","Adj-R²")))) |>
tab_source_note(footer) |>
tab_source_note("Significance: * p<0.05 ** p<0.01 *** p<0.005") |>
tab_options(table.font.size = 10, heading.title.font.size = 13,
source_notes.font.size = 9)
gtsave(g, outfile)
}
build_tab5(m_ols_f, m_re_f$fit, m_wls_f, m_lmer_f, sim_f,
nobs(m_ols_f), m_re_f$tau2,
"Female fertility log-RR — five estimators",
"Predictors rescaled to 0-1 | + politics model",
"figs/fertility_regression_compact_5model.png")
build_tab5(m_ols_m, m_re_m$fit, m_wls_m, m_lmer_m, sim_m,
nobs(m_ols_m), m_re_m$tau2,
"Male fertility log-RR — five estimators",
"Birth attributed via cohabiting partner's FER answer | predictors 0-1 | + politics model",
"figs/fertility_regression_compact_5model_male.png")
sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_DK.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_DK.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Brussels
## tzcode source: system (glibc)
##
## attached base packages:
## [1] splines stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] lme4_1.1-37 Matrix_1.7-4 ggrepel_0.9.6 gt_1.3.0
## [5] arrow_23.0.1.1 lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0
## [9] dplyr_1.1.4 purrr_1.2.0 readr_2.1.5 tidyr_1.3.1
## [13] tibble_3.3.0 ggplot2_4.0.1.9000 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 farver_2.1.2 S7_0.2.1 fastmap_1.2.0
## [5] webshot2_0.1.2 promises_1.3.3 digest_0.6.39 timechange_0.3.0
## [9] lifecycle_1.0.4 sf_1.0-21 processx_3.8.6 magrittr_2.0.4
## [13] compiler_4.5.3 rlang_1.1.6.9000 sass_0.4.10 tools_4.5.3
## [17] yaml_2.3.10 knitr_1.50 labeling_0.4.3 bit_4.6.0
## [21] classInt_0.4-11 xml2_1.4.0 RColorBrewer_1.1-3 tidycensus_1.7.5
## [25] KernSmooth_2.23-26 websocket_1.4.4 withr_3.0.2 grid_4.5.3
## [29] e1071_1.7-16 scales_1.4.0 MASS_7.3-65 cli_3.6.5
## [33] rmarkdown_2.30 crayon_1.5.3 ragg_1.5.0 reformulas_0.4.1
## [37] generics_0.1.4 httr_1.4.7 tzdb_0.5.0 minqa_1.2.8
## [41] DBI_1.2.3 cachem_1.1.0 chromote_0.5.1 proxy_0.4-27
## [45] rvest_1.0.5 assertthat_0.2.1 parallel_4.5.3 tigris_2.2.1
## [49] vctrs_0.6.5 boot_1.3-32 jsonlite_2.0.0 hms_1.1.3
## [53] bit64_4.6.0-1 archive_1.1.12 systemfonts_1.3.1 jquerylib_0.1.4
## [57] units_1.0-0 glue_1.8.0 nloptr_2.2.1 ps_1.9.1
## [61] stringi_1.8.7 gtable_0.3.6 later_1.4.4 pillar_1.11.1
## [65] rappdirs_0.3.3 htmltools_0.5.8.1 R6_2.6.1 textshaping_1.0.4
## [69] Rdpack_2.6.4 vroom_1.6.6 evaluate_1.0.5 lattice_0.22-7
## [73] rbibutils_2.3 bslib_0.9.0 class_7.3-23 uuid_1.2-1
## [77] Rcpp_1.1.0 nlme_3.1-168 mgcv_1.9-3 xfun_0.57
## [81] fs_1.6.6 pkgconfig_2.0.3