Sample

fertile <- pooled |>
  mutate(age_band = cut(AGEP,
                        breaks = c(14, 19, 24, 29, 34, 39, 44, 50),
                        labels = c("15-19","20-24","25-29","30-34","35-39","40-44","45-50"),
                        right = TRUE))

big_occs <- fertile |>
  group_by(harm) |>
  summarise(n_births = sum(birth), .groups = "drop") |>
  filter(n_births >= 50) |>
  pull(harm)

model_data <- fertile |> filter(harm %in% big_occs)

cat(sprintf("Total fertile women 15-50: %d\n", nrow(fertile)))
## Total fertile women 15-50: 5476050
cat(sprintf("Occupations with >=50 pooled births: %d\n", length(big_occs)))
## Occupations with >=50 pooled births: 311
cat(sprintf("Sample size in those occupations: %d (births = %d)\n",
            nrow(model_data), sum(model_data$birth)))
## Sample size in those occupations: 5476050 (births = 294492)

Pooled DSRR

Direct age-standardization on (age_band × wave) cells. For each occupation \(O\),

\[\widehat{RR}_O = \frac{\sum_{c \in S_O} \tilde w_c \, \hat p_{cO}} {\sum_{c \in S_O} \tilde w_c \, \hat p_{c\cdot}}\]

with \(c\) indexing (age_band × wave) cells, \(\tilde w_c\) = the cell’s share of the overall fertile-sample weighted population, renormalized within each occupation’s observed cells.

joint_wts <- model_data |>
  group_by(wave, age_band) |>
  summarise(w_cell = sum(PWGTP), .groups = "drop") |>
  mutate(w_cell = w_cell / sum(w_cell))

cell_overall <- model_data |>
  group_by(wave, age_band) |>
  summarise(births_w = sum(birth * PWGTP),
            pop_w    = sum(PWGTP),
            .groups  = "drop") |>
  mutate(p_cell = births_w / pop_w)

cell_pooled <- model_data |>
  group_by(harm, wave, age_band) |>
  summarise(births_w  = sum(birth * PWGTP),
            pop_w     = sum(PWGTP),
            births_un = sum(birth),
            .groups   = "drop") |>
  mutate(p_cellO = births_w / pop_w)

dsrr_pool <- cell_pooled |>
  left_join(joint_wts,                                by = c("wave","age_band")) |>
  left_join(cell_overall |> select(wave, age_band, p_cell), by = c("wave","age_band")) |>
  group_by(harm) |>
  mutate(w_norm = w_cell / sum(w_cell)) |>
  summarise(dsr_occ = sum(w_norm * p_cellO),
            dsr_ref = sum(w_norm * p_cell),
            var_log_dsr = sum((w_norm * p_cellO)^2 / pmax(births_un, 1)) /
                          (sum(w_norm * p_cellO))^2,
            births_un_total = sum(births_un),
            n_cells = dplyr::n(),
            .groups = "drop") |>
  mutate(rr = dsr_occ / dsr_ref, log_rr = log(rr), se_log = sqrt(var_log_dsr),
         rr_lo = exp(log_rr - 1.96 * se_log),
         rr_hi = exp(log_rr + 1.96 * se_log)) |>
  left_join(occp_labels, by = "harm") |>
  arrange(desc(rr))

write_csv(
  dsrr_pool |> transmute(OCCP = harm, occupation, rr = round(rr, 4),
                         rr_lo = round(rr_lo, 4), rr_hi = round(rr_hi, 4),
                         se_log = round(se_log, 4),
                         n_births = births_un_total, n_cells),
  "data/fertility_rr_by_occupation_age_std.csv")

Per-wave DSRR

wave_age <- cell_pooled |>
  group_by(wave, age_band) |>
  summarise(pop_w = sum(pop_w), births_w = sum(births_w), .groups = "drop") |>
  group_by(wave) |>
  mutate(w_band = pop_w / sum(pop_w), p_b = births_w / pop_w) |>
  ungroup()

dsrr_wave <- cell_pooled |>
  left_join(wave_age |> select(wave, age_band, w_band, p_b),
            by = c("wave","age_band")) |>
  group_by(harm, wave) |>
  mutate(w_norm = w_band / sum(w_band)) |>
  summarise(dsr_occ = sum(w_norm * p_cellO),
            dsr_ref = sum(w_norm * p_b),
            var_log_dsr = sum((w_norm * p_cellO)^2 / pmax(births_un, 1)) /
                          (sum(w_norm * p_cellO))^2,
            births_un_total = sum(births_un),
            .groups = "drop") |>
  mutate(rr = dsr_occ / dsr_ref, log_rr = log(rr), se_log = sqrt(var_log_dsr),
         rr_lo = exp(log_rr - 1.96 * se_log),
         rr_hi = exp(log_rr + 1.96 * se_log)) |>
  left_join(occp_labels, by = "harm")

dsrr_wave_keep <- dsrr_wave |> filter(births_un_total >= 50)

Headline plot — pooled top/bottom 25

plot_top_bot <- function(df, title_str, file_out, w = 12, h = 12, color = "steelblue") {
  top25 <- df |> filter(births_un_total >= 50) |> arrange(desc(rr)) |> head(25)
  bot25 <- df |> filter(births_un_total >= 50) |> arrange(rr)       |> head(25)
  tb <- bind_rows(top25 |> mutate(grp = "Top 25"),
                  bot25 |> mutate(grp = "Bottom 25")) |>
    mutate(occupation = str_trunc(occupation, 55),
           occupation = fct_reorder(occupation, rr),
           grp = factor(grp, levels = c("Top 25","Bottom 25")))
  p <- ggplot(tb, aes(rr, occupation)) +
    geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
    geom_errorbar(aes(xmin = rr_lo, xmax = rr_hi),
                  orientation = "y", width = 0.25, color = "grey40") +
    geom_point(size = 2, color = color) +
    scale_x_continuous(trans = "log2",
                       breaks = c(0.4,0.5,0.67,0.8,1,1.25,1.5,2,3)) +
    facet_wrap(~ grp, scales = "free_y", ncol = 1) +
    labs(title = title_str, x = "RR vs overall (log scale)", y = NULL,
         caption = "Bars = 95% CIs.  Occupations with >=50 births in window.") +
    theme_bw(base_size = 11) +
    theme(plot.title = element_text(face = "bold"),
          axis.text.y = element_text(size = 8),
          strip.text = element_text(face = "bold", hjust = 0))
  ggsave(file_out, p, width = w, height = h, dpi = 150)
  p
}

plot_top_bot(dsrr_pool,
             "Age-standardized birth-rate ratio by occupation (pooled 2013-2022)",
             "figs/fertility_rr_pooled.png")

Per-wave headline plots

plot_top_bot(dsrr_wave_keep |> filter(wave == "2013-2017"),
             "Age-standardized RR — ACS 5yr PUMS 2013-2017",
             "figs/fertility_rr_wave1_top_bot.png")

plot_top_bot(dsrr_wave_keep |> filter(wave == "2018-2022"),
             "Age-standardized RR — ACS 5yr PUMS 2018-2022",
             "figs/fertility_rr_wave2_top_bot.png")

Wave-stability scatter

wide <- dsrr_wave_keep |>
  select(harm, occupation, wave, rr, log_rr, se_log, births_un_total) |>
  pivot_wider(id_cols = c(harm, occupation),
              names_from = wave,
              values_from = c(rr, log_rr, se_log, births_un_total))
names(wide) <- gsub("-", "_", names(wide))
ov <- wide |> filter(!is.na(rr_2013_2017), !is.na(rr_2018_2022)) |>
  mutate(d_log = log_rr_2018_2022 - log_rr_2013_2017,
         min_births = pmin(births_un_total_2013_2017, births_un_total_2018_2022))

sp_r  <- cor(ov$rr_2013_2017, ov$rr_2018_2022, method = "spearman")
log_r <- cor(ov$log_rr_2013_2017, ov$log_rr_2018_2022)
rel1 <- 1 - mean(ov$se_log_2013_2017^2) / var(ov$log_rr_2013_2017)
rel2 <- 1 - mean(ov$se_log_2018_2022^2) / var(ov$log_rr_2018_2022)
r_dis <- log_r / sqrt(rel1 * rel2)

movers <- ov |> filter(min_births >= 100) |> slice_max(abs(d_log), n = 12)

p_stab <- ggplot(ov, aes(rr_2013_2017, rr_2018_2022)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
  geom_smooth(method = "lm", se = TRUE, color = "tomato", linewidth = 0.7) +
  geom_point(aes(size = min_births), color = "steelblue", alpha = 0.5) +
  geom_text_repel(data = movers, aes(label = str_trunc(occupation, 30)),
                  size = 2.7, min.segment.length = 0, max.overlaps = 30) +
  scale_x_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
  scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
  scale_size_continuous(range = c(1, 5), guide = "none") +
  coord_fixed() +
  labs(title = "Wave 1 vs Wave 2 — age-standardized fertility RR",
       x = "RR (ACS 2013-2017)", y = "RR (ACS 2018-2022)",
       caption = sprintf("Spearman %.2f | log-Pearson %.2f | disattenuated %.2f | n=%d overlap",
                         sp_r, log_r, r_dis, nrow(ov))) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_wave1_vs_wave2.png", p_stab,
       width = 8.5, height = 8, dpi = 150)

RR vs % female

sc_df <- dsrr_pool |>
  inner_join(pct_female |> select(harm, pct_female), by = "harm")
cor_p <- cor(sc_df$log_rr, sc_df$pct_female)
extremes <- bind_rows(
  sc_df |> arrange(desc(rr)) |> head(8),
  sc_df |> arrange(rr)       |> head(8),
  sc_df |> arrange(pct_female) |> head(4),
  sc_df |> arrange(desc(pct_female)) |> head(4)
) |> distinct(harm, .keep_all = TRUE)

p_pf <- ggplot(sc_df, aes(pct_female, rr)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "grey50") +
  geom_point(aes(size = births_un_total), color = "steelblue", alpha = 0.55) +
  geom_smooth(method = "lm", color = "tomato", linewidth = 0.7) +
  geom_text_repel(data = extremes, aes(label = str_trunc(occupation, 30)),
                  size = 2.7, min.segment.length = 0, max.overlaps = 30) +
  scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.5,2)) +
  scale_size_continuous(range = c(1.2, 5.5), guide = "none") +
  labs(title = "Age-standardized fertility RR vs % female in occupation",
       x = "% female in occupation",
       y = "RR (log scale)",
       caption = sprintf("Pearson on log-RR = %.2f | n = %d", cor_p, nrow(sc_df))) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_vs_pct_female.png", p_pf,
       width = 9, height = 6.5, dpi = 150)

RR vs RIASEC + Prediger 2-D + Complexity (small multiples)

plot_df <- dsrr_pool |> inner_join(job_chars, by = "harm")

trait_levels <- c("Realistic","Investigative","Artistic","Social","Enterprising",
                  "Conventional","things_people","data_ideas","job_zone")
trait_labels <- c("Realistic","Investigative","Artistic","Social","Enterprising",
                  "Conventional","Things ↔ People (Prediger)",
                  "Data ↔ Ideas (Prediger)","Complexity (Job Zone)")

long_df <- plot_df |>
  pivot_longer(all_of(trait_levels), names_to = "var", values_to = "value") |>
  mutate(trait = factor(var, levels = trait_levels, labels = trait_labels))

spline_test <- function(d) {
  d <- d |> drop_na(value)
  ml <- lm(log_rr ~ value, data = d)
  ms <- lm(log_rr ~ ns(value, df = 3), data = d)
  ge <- summary(ms)$r.squared - summary(ml)$r.squared
  pv <- anova(ml, ms)$`Pr(>F)`[2]
  tibble(p = pv, gain = ge, use_spline = (pv < 0.05) & (ge > 0.02))
}
tests <- long_df |> group_by(var) |> group_modify(~ spline_test(.x))
spline_vars <- tests |> filter(use_spline) |> pull(var)

cors <- long_df |> group_by(var, trait) |>
  summarise(cor_log = cor(log_rr, value, use = "complete.obs"),
            cor_sp  = cor(rr, value, method = "spearman", use = "complete.obs"),
            .groups = "drop") |>
  mutate(label = sprintf("r = %.2f  ρ = %.2f%s", cor_log, cor_sp,
                         ifelse(var %in% spline_vars, "  (spline)", "  (linear)")))

fit_one <- function(d, v) {
  fit <- if (v %in% spline_vars) lm(log_rr ~ ns(value, df = 3), data = d)
         else                    lm(log_rr ~ value, data = d)
  newv <- seq(min(d$value, na.rm = TRUE), max(d$value, na.rm = TRUE), length.out = 80)
  tibble(value = newv,
         rr_pred = exp(predict(fit, newdata = tibble(value = newv))))
}
curves <- long_df |> group_by(var, trait) |> group_modify(~ fit_one(.x, .y$var))

p_jc <- ggplot(long_df, aes(value, rr)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "grey60") +
  geom_point(aes(size = births_un_total), color = "steelblue", alpha = 0.45) +
  geom_line(data = curves, aes(value, rr_pred), color = "tomato", linewidth = 0.8) +
  geom_text(data = cors, aes(x = -Inf, y = Inf, label = label),
            hjust = -0.05, vjust = 1.4, size = 3, inherit.aes = FALSE) +
  facet_wrap(~ trait, scales = "free_x", ncol = 3) +
  scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.5,2)) +
  scale_size_continuous(range = c(0.8, 4), guide = "none") +
  labs(title = "Fertility RR vs O*NET job characteristics",
       x = "Trait score", y = "RR (log scale)",
       caption = "Spline fit only where marginal p<0.05 AND R² gain >2%; linear elsewhere.") +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"),
        strip.text = element_text(face = "bold"))
ggsave("figs/fertility_rr_vs_job_chars.png", p_jc,
       width = 11, height = 10, dpi = 150)

Top 50 largest occupations

n_w <- pooled |> filter(!is.na(harm)) |>
  group_by(harm) |> summarise(n_women_w = sum(PWGTP), .groups = "drop")

big50 <- dsrr_pool |> inner_join(n_w, by = "harm") |>
  arrange(desc(n_women_w)) |> head(50) |>
  mutate(occ_label = str_trunc(occupation, 55),
         occ_label = fct_reorder(occ_label, rr))
share <- sum(big50$n_women_w) / sum(n_w$n_women_w)

p_50 <- ggplot(big50, aes(rr, occ_label)) +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
  geom_errorbar(aes(xmin = rr_lo, xmax = rr_hi),
                orientation = "y", width = 0.25, color = "grey40") +
  geom_point(aes(size = n_women_w / 1e6), color = "steelblue") +
  scale_x_continuous(trans = "log2", breaks = c(0.7,0.8,0.9,1,1.1,1.25,1.5)) +
  scale_size_continuous(name = "Women (M)", range = c(1.5, 6)) +
  labs(title = "Fertility RR for the 50 largest occupations by female workforce",
       subtitle = sprintf("These 50 occupations employ %.0f%% of US women 15-50 in the labor force",
                          100 * share),
       x = "RR (log scale)", y = NULL) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_top50_largest.png", p_50,
       width = 11, height = 11, dpi = 150)

Major occupational groups (women-weighted)

hier_rr <- dsrr_pool |>
  left_join(occp_prefix, by = "harm") |>
  left_join(n_w, by = "harm") |>
  mutate(major_group = ifelse(is.na(prefix), "Unmapped", prefix_names[prefix])) |>
  filter(major_group != "Unmapped")

group_summary_w <- hier_rr |>
  group_by(major_group) |>
  summarise(n_occ = dplyr::n(),
            mean_log_rr = weighted.mean(log_rr, n_women_w),
            sd_log = sqrt(sum(n_women_w * (log_rr - mean_log_rr)^2) / sum(n_women_w)),
            n_eff = sum(n_women_w)^2 / sum(n_women_w^2),
            se_mean = sd_log / sqrt(pmax(n_eff, 1)),
            mean_rr = exp(mean_log_rr),
            mean_lo = exp(mean_log_rr - 1.96 * se_mean),
            mean_hi = exp(mean_log_rr + 1.96 * se_mean),
            .groups = "drop") |>
  arrange(mean_log_rr) |>
  mutate(major_group_o = factor(major_group, levels = major_group))

hier_rr <- hier_rr |>
  mutate(major_group_o = factor(major_group, levels = levels(group_summary_w$major_group_o)))

p_mg <- ggplot() +
  geom_vline(xintercept = 1, linetype = "dashed", color = "grey50") +
  geom_jitter(data = hier_rr,
              aes(rr, major_group_o, size = n_women_w / 1e6),
              height = 0.18, alpha = 0.4, color = "steelblue") +
  geom_errorbar(data = group_summary_w,
                aes(y = major_group_o, xmin = mean_lo, xmax = mean_hi),
                orientation = "y", width = 0.4, color = "black", linewidth = 0.7) +
  geom_point(data = group_summary_w,
             aes(mean_rr, major_group_o), color = "black", size = 2.5, shape = 18) +
  geom_text(data = group_summary_w,
            aes(y = major_group_o, x = 2.0, label = sprintf("n=%d", n_occ)),
            hjust = 0, size = 3, color = "grey30") +
  scale_x_continuous(trans = "log2",
                     breaks = c(0.5,0.7,0.85,1,1.2,1.5,2),
                     limits = c(0.4, 2.3)) +
  scale_size_continuous(name = "Women (millions)", range = c(0.5, 4)) +
  labs(title = "Fertility RR by major occupational group (women-weighted)",
       subtitle = "Blue dots = occupations; black diamond = group mean log-RR",
       x = "RR (log scale)", y = NULL) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_major_groups.png", p_mg,
       width = 13, height = 8.5, dpi = 150)

Political lean (Verdant Labs)

verdant_raw <- read_csv("data/verdant_occupations.csv", show_col_types = FALSE)
occp_politics <- read_csv("data/occp_politics.csv", show_col_types = FALSE,
                          col_types = cols(harm = "c"))

sc_pol <- dsrr_pool |> inner_join(occp_politics |> select(harm, pct_dem, total_donors),
                                   by = "harm")
cor_pol <- cor(sc_pol$log_rr, sc_pol$pct_dem)

ext_pol <- bind_rows(
  sc_pol |> arrange(desc(rr)) |> head(6),
  sc_pol |> arrange(rr) |> head(6),
  sc_pol |> arrange(pct_dem) |> head(4),
  sc_pol |> arrange(desc(pct_dem)) |> head(4)
) |> distinct(harm, .keep_all = TRUE)

p_pol <- ggplot(sc_pol, aes(pct_dem, rr)) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "grey50") +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "grey80") +
  geom_point(aes(size = births_un_total), color = "steelblue", alpha = 0.55) +
  geom_smooth(method = "lm", color = "tomato", linewidth = 0.7) +
  geom_text_repel(data = ext_pol, aes(label = str_trunc(occupation, 32)),
                  size = 2.7, min.segment.length = 0, max.overlaps = 30) +
  scale_x_continuous(breaks = seq(0, 1, 0.2),
                     labels = scales::percent_format(accuracy = 1)) +
  scale_y_continuous(trans = "log2", breaks = c(0.5, 0.75, 1, 1.5, 2)) +
  scale_size_continuous(range = c(1.2, 5.5), guide = "none") +
  labs(title = "Age-standardized fertility RR vs occupation political lean",
       subtitle = "% Democratic from Verdant Labs FEC donor data (~2008-2016)",
       x = "% Democratic donors in occupation",
       y = "RR (log scale)",
       caption = sprintf("Pearson on log-RR = %.2f | n = %d occupations", cor_pol, nrow(sc_pol))) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_vs_pct_dem.png", p_pol, width = 9, height = 6.5, dpi = 150)

Top/bottom 25 Verdant occupations (independent of fertility join)

v_keep <- verdant_raw |>
  mutate(total_donors = reps + dems) |>
  filter(total_donors >= 100)

tb_v <- bind_rows(
  v_keep |> arrange(desc(pct_dem)) |> head(25) |> mutate(grp = "Most Democratic"),
  v_keep |> arrange(pct_dem)       |> head(25) |> mutate(grp = "Most Republican")
) |> mutate(occupation = str_trunc(occupation, 45),
            occupation = fct_reorder(occupation, pct_dem),
            grp = factor(grp, levels = c("Most Democratic","Most Republican")))

p_v <- ggplot(tb_v, aes(pct_dem, occupation, fill = pct_dem)) +
  geom_vline(xintercept = 0.5, linetype = "dashed", color = "grey50") +
  geom_col() +
  geom_text(aes(label = sprintf("%.0f%%", 100*pct_dem)),
            color = "white", size = 2.7, hjust = 1.2) +
  geom_text(aes(label = sprintf("n=%d", total_donors)),
            color = "grey20", size = 2.5, hjust = -0.1, x = 1) +
  facet_wrap(~ grp, scales = "free_y", ncol = 1) +
  scale_fill_gradient2(low = "#b2182b", mid = "white", high = "#2166ac",
                       midpoint = 0.5, limits = c(0, 1), guide = "none") +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1),
                     breaks = seq(0, 1, 0.2),
                     limits = c(0, 1.18), expand = c(0, 0)) +
  labs(title = "Most Democratic and most Republican occupations (Verdant Labs)",
       x = "% Democratic donors", y = NULL) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"),
        axis.text.y = element_text(size = 9),
        strip.text = element_text(face = "bold", hjust = 0),
        panel.grid.major.y = element_blank())
ggsave("figs/verdant_top_bottom_25.png", p_v, width = 11, height = 11, dpi = 150)

Male sample — cohabiting-couple spousal linkage

For each man in the PUMS, find his cohabiting opposite-sex partner (via RELSHIPP / RELP codes for spouse / unmarried partner of the reference person), and if she is a woman 15-50 with the FER question answered, attribute her FER response to him. This misses non-resident fathers (~30-40% of US births occur to non-cohabiting parents) — those men appear as zero-fertility by construction.

needed_male_22 <- c("SERIALNO","SPORDER","AGEP","SEX","FER","OCCP","PWGTP","RELSHIPP")
needed_male_17 <- c("SERIALNO","SPORDER","AGEP","SEX","FER","OCCP","PWGTP","RELP")

p22 <- read_parquet("data/pums_2022_5yr_person.parquet", col_select = all_of(needed_male_22))
p17 <- read_parquet("data/pums_2017_5yr_person.parquet", col_select = all_of(needed_male_17)) |>
  rename(RELSHIPP = RELP)

ref_22 <- p22 |> filter(RELSHIPP == 20L)
prt_22 <- p22 |> filter(RELSHIPP %in% c(21L, 22L))
ref_17 <- p17 |> filter(RELSHIPP == 0L)
prt_17 <- p17 |> filter(RELSHIPP %in% c(1L, 13L))

extract_couple <- function(ref, prt, wave_label) {
  ref |> inner_join(prt, by = "SERIALNO", suffix = c("_R","_P")) |>
    filter(SEX_R != SEX_P) |>
    mutate(male_age   = if_else(SEX_R == 1L, AGEP_R, AGEP_P),
           male_occp  = if_else(SEX_R == 1L, OCCP_R, OCCP_P),
           male_wgt   = if_else(SEX_R == 1L, PWGTP_R, PWGTP_P),
           female_age = if_else(SEX_R == 2L, AGEP_R, AGEP_P),
           female_FER = if_else(SEX_R == 2L, FER_R, FER_P),
           wave = wave_label)
}

couples <- bind_rows(
  extract_couple(ref_17, prt_17, "2013-2017"),
  extract_couple(ref_22, prt_22, "2018-2022")
) |>
  filter(female_age >= 15, female_age <= 50,
         female_FER %in% c("1", "2"),
         !is.na(male_occp), male_occp != "bbbb",
         male_age >= 15, male_age <= 65) |>
  select(wave, male_age, male_occp, male_wgt, female_age, female_FER)

# Harmonize male OCCP to harm code via crosswalk
xw <- read_csv("data/occp_2010_2018_crosswalk.csv", show_col_types = FALSE,
               col_types = cols(.default = "c"))
xw_2010 <- xw |> distinct(occ2010, group_id) |> filter(!is.na(occ2010))
xw_2018 <- xw |> distinct(occ2018, group_id) |> filter(!is.na(occ2018))

male_pool <- bind_rows(
  couples |> filter(wave == "2013-2017") |>
    left_join(xw_2010, by = c("male_occp" = "occ2010")) |>
    mutate(harm = coalesce(group_id, male_occp)),
  couples |> filter(wave == "2018-2022") |>
    left_join(xw_2018, by = c("male_occp" = "occ2018")) |>
    mutate(harm = coalesce(group_id, male_occp))
) |>
  mutate(birth = as.integer(female_FER == "1"),
         age_band = cut(male_age,
                        breaks = c(14, 19, 24, 29, 34, 39, 44, 49, 54, 65),
                        labels = c("15-19","20-24","25-29","30-34","35-39",
                                   "40-44","45-49","50-54","55-65"),
                        right = TRUE))

cat(sprintf("Cohabiting couples linked: %d (births = %d)\n",
            nrow(male_pool), sum(male_pool$birth)))
## Cohabiting couples linked: 3127014 (births = 246069)

Male DSRR

big_male <- male_pool |>
  group_by(harm) |> summarise(n_b = sum(birth), .groups = "drop") |>
  filter(n_b >= 50) |> pull(harm)
male_data <- male_pool |> filter(harm %in% big_male)

m_joint <- male_data |>
  group_by(wave, age_band) |>
  summarise(w_cell = sum(male_wgt), .groups = "drop") |>
  mutate(w_cell = w_cell / sum(w_cell))
m_overall <- male_data |>
  group_by(wave, age_band) |>
  summarise(births_w = sum(birth * male_wgt), pop_w = sum(male_wgt),
            .groups = "drop") |>
  mutate(p_cell = births_w / pop_w)
m_cell <- male_data |>
  group_by(harm, wave, age_band) |>
  summarise(births_w  = sum(birth * male_wgt),
            pop_w     = sum(male_wgt),
            births_un = sum(birth), .groups = "drop") |>
  mutate(p_cellO = births_w / pop_w)

dsrr_male <- m_cell |>
  left_join(m_joint, by = c("wave","age_band")) |>
  left_join(m_overall |> select(wave, age_band, p_cell), by = c("wave","age_band")) |>
  group_by(harm) |>
  mutate(w_norm = w_cell / sum(w_cell)) |>
  summarise(dsr_occ = sum(w_norm * p_cellO),
            dsr_ref = sum(w_norm * p_cell),
            var_log_dsr = sum((w_norm * p_cellO)^2 / pmax(births_un, 1)) /
                          (sum(w_norm * p_cellO))^2,
            births_un_total = sum(births_un), .groups = "drop") |>
  mutate(rr = dsr_occ / dsr_ref, log_rr = log(rr), se_log = sqrt(var_log_dsr),
         rr_lo = exp(log_rr - 1.96 * se_log),
         rr_hi = exp(log_rr + 1.96 * se_log)) |>
  left_join(dsrr_pool |> select(harm, occupation), by = "harm") |>
  arrange(desc(rr))

Male top/bottom 25

plot_top_bot(dsrr_male, "Age-standardized birth-rate ratio by MALE occupation (pooled 2013-2022)",
             "figs/fertility_rr_male_pooled.png", color = "darkgreen")

Male vs Female RR scatter

both <- dsrr_male |>
  select(harm, rr_m = rr, log_rr_m = log_rr, se_log_m = se_log,
         births_m = births_un_total) |>
  inner_join(dsrr_pool |> select(harm, occupation, rr_f = rr, log_rr_f = log_rr,
                                  se_log_f = se_log, births_f = births_un_total),
             by = "harm") |>
  mutate(min_births = pmin(births_m, births_f),
         d_log = log_rr_m - log_rr_f)

log_r_mf <- cor(both$log_rr_m, both$log_rr_f)
sp_r_mf  <- cor(both$rr_m, both$rr_f, method = "spearman")
rel_m_mf <- 1 - mean(both$se_log_m^2) / var(both$log_rr_m)
rel_f_mf <- 1 - mean(both$se_log_f^2) / var(both$log_rr_f)
r_dis_mf <- log_r_mf / sqrt(rel_m_mf * rel_f_mf)

labelers <- bind_rows(
  both |> filter(min_births >= 100) |> slice_max(abs(d_log), n = 10),
  both |> arrange(desc(rr_f)) |> head(5),
  both |> arrange(rr_f) |> head(5)
) |> distinct(harm, .keep_all = TRUE)

p_mf <- ggplot(both, aes(rr_f, rr_m)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey50") +
  geom_smooth(method = "lm", se = TRUE, color = "tomato", linewidth = 0.7) +
  geom_point(aes(size = min_births), color = "steelblue", alpha = 0.5) +
  geom_text_repel(data = labelers, aes(label = str_trunc(occupation, 32)),
                  size = 2.7, min.segment.length = 0, max.overlaps = 30) +
  scale_x_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
  scale_y_continuous(trans = "log2", breaks = c(0.5,0.75,1,1.25,1.5,2)) +
  scale_size_continuous(range = c(1, 5), guide = "none") +
  coord_fixed() +
  labs(title = "Male vs Female age-standardized fertility RR",
       subtitle = "Male = birth in cohabiting partner's last 12 months, by his occupation",
       x = "Female RR (by her occupation)",
       y = "Male RR (by his occupation, cohabiting couples only)",
       caption = sprintf("Spearman %.2f | log-Pearson %.2f | disattenuated %.2f | n=%d overlap",
                         sp_r_mf, log_r_mf, r_dis_mf, nrow(both))) +
  theme_bw(base_size = 11) +
  theme(plot.title = element_text(face = "bold"))
ggsave("figs/fertility_rr_male_vs_female.png", p_mf, width = 9, height = 8, dpi = 150)

Compact 5-estimator regression tables

For each of female and male:

  • OLS: equal-weight regression
  • FGLS: weights = 1/(se² + τ²) (random-effects meta-analysis)
  • WLS: weights = 1/se² (precision / workforce-weighted)
  • HLM: lmer with (1 | major_group), weights = 1/se² — partial pooling within Census major groups
  • SIMEX: extrapolate noise on log_rr AND pct_dem to λ=-1
# Female regression frame on 0-1 scale + politics
reg_f <- dsrr_pool |>
  inner_join(job_chars, by = "harm") |>
  inner_join(pct_female |> select(harm, pct_female), by = "harm") |>
  inner_join(occp_politics |> select(harm, pct_dem, total_donors), by = "harm") |>
  filter(!is.na(Realistic), !is.na(job_zone), !is.na(pct_female)) |>
  mutate(across(c(Realistic, Investigative, Artistic, Social, Enterprising, Conventional),
                ~ (.x - 1) / 6),
         job_zone = (job_zone - 1) / 4,
         w = 1 / se_log^2,
         pct_dem_se = sqrt(pct_dem * (1 - pct_dem) / total_donors)) |>
  left_join(occp_prefix, by = "harm") |>
  mutate(major_group = ifelse(is.na(prefix), "Unmapped", prefix_names[prefix]))

# Male regression frame
reg_m <- dsrr_male |>
  filter(births_un_total >= 50) |>
  inner_join(job_chars, by = "harm") |>
  inner_join(pct_female |> select(harm, pct_female), by = "harm") |>
  inner_join(occp_politics |> select(harm, pct_dem, total_donors), by = "harm") |>
  filter(!is.na(Realistic), !is.na(job_zone), !is.na(pct_female)) |>
  mutate(across(c(Realistic, Investigative, Artistic, Social, Enterprising, Conventional),
                ~ (.x - 1) / 6),
         job_zone = (job_zone - 1) / 4,
         w = 1 / se_log^2,
         pct_dem_se = sqrt(pct_dem * (1 - pct_dem) / total_donors)) |>
  left_join(occp_prefix, by = "harm") |>
  mutate(major_group = ifelse(is.na(prefix), "Unmapped", prefix_names[prefix]))
f_pol <- log_rr ~ Realistic + Investigative + Artistic + Social +
                  Enterprising + Conventional + pct_female + job_zone + pct_dem

fgls_fit <- function(formula, data, se_vec, max_iter = 30, tol = 1e-6) {
  tau2 <- 0
  for (i in seq_len(max_iter)) {
    wts <- 1 / (se_vec^2 + tau2)
    data$wts_ <- wts
    fit <- lm(formula, data = data, weights = wts_)
    res <- residuals(fit)
    df  <- df.residual(fit)
    Q   <- sum(wts * res^2)
    s_w  <- sum(wts); s_w2 <- sum(wts^2)
    new_tau2 <- max(0, (Q - df) / (s_w - s_w2/s_w))
    if (abs(new_tau2 - tau2) < tol) break
    tau2 <- new_tau2
  }
  list(fit = fit, tau2 = tau2)
}

simex_run <- function(reg, y_se, x_se, formula,
                      n_sim = 1000, lambdas = c(0, 0.5, 1.0, 1.5, 2.0),
                      seed = 1, n_boot = 100) {
  set.seed(seed)
  predictors <- attr(terms(formula), "term.labels")
  n <- nrow(reg)
  inner <- function(rb, yb, xb) {
    avg <- matrix(NA_real_, length(lambdas), length(predictors) + 1)
    r2s <- numeric(length(lambdas))
    for (li in seq_along(lambdas)) {
      lam <- lambdas[li]
      coefs_acc <- 0; r2_acc <- 0
      for (s in seq_len(n_sim)) {
        d2 <- rb
        if (lam > 0) {
          d2$log_rr  <- rb$log_rr  + rnorm(n, 0, sqrt(lam) * yb)
          d2$pct_dem <- pmin(pmax(rb$pct_dem + rnorm(n, 0, sqrt(lam) * xb), 0), 1)
        }
        fit <- lm(formula, data = d2)
        coefs_acc <- coefs_acc + coef(fit)
        r2_acc <- r2_acc + summary(fit)$r.squared
      }
      avg[li, ] <- coefs_acc / n_sim
      r2s[li] <- r2_acc / n_sim
    }
    extr <- sapply(seq_len(ncol(avg)), \(j) {
      predict(lm(avg[, j] ~ lambdas), newdata = data.frame(lambdas = -1))
    })
    r2_extr <- predict(lm(r2s ~ lambdas), newdata = data.frame(lambdas = -1))
    list(beta = extr, r2 = r2_extr)
  }
  pt <- inner(reg, y_se, x_se)
  boot <- matrix(NA_real_, n_boot, length(pt$beta))
  for (b in seq_len(n_boot)) {
    idx <- sample.int(n, n, replace = TRUE)
    boot[b, ] <- inner(reg[idx, ], y_se[idx], x_se[idx])$beta
  }
  se <- apply(boot, 2, sd)
  names(pt$beta) <- names(se) <- c("(Intercept)", predictors)
  list(beta = pt$beta, se = se, r2 = pt$r2)
}

fmt_cell <- function(b, se, p) {
  stars <- ifelse(p < 0.005, "***",
           ifelse(p < 0.01,  "**",
           ifelse(p < 0.05,  "*",  "")))
  sprintf("%.3f (%.3f)%s", b, se, stars)
}
row_one <- function(model, predictor) {
  co <- if (inherits(model, "lmerMod")) {
    s <- summary(model); m <- s$coefficients
    cbind(m, "Pr(>|t|)" = 2 * pnorm(-abs(m[, "t value"])))
  } else summary(model)$coef
  if (!(predictor %in% rownames(co))) return("--")
  fmt_cell(co[predictor, "Estimate"], co[predictor, "Std. Error"],
           co[predictor, "Pr(>|t|)"])
}
row_simex <- function(sim, predictor) {
  if (!(predictor %in% names(sim$beta))) return("--")
  b <- sim$beta[predictor]; se <- sim$se[predictor]
  p <- 2 * pnorm(-abs(b / se))
  fmt_cell(b, se, p)
}
preds  <- c("Realistic","Investigative","Artistic","Social","Enterprising",
            "Conventional","pct_female","job_zone","pct_dem")
pretty <- c("Realistic","Investigative","Artistic","Social","Enterprising",
            "Conventional","% female","Complexity (Job Zone)","% Democratic")
m_ols_f  <- lm(f_pol, data = reg_f)
m_wls_f  <- lm(f_pol, data = reg_f, weights = w)
m_re_f   <- fgls_fit(f_pol, reg_f, reg_f$se_log)
m_lmer_f <- lmer(update(f_pol, ~ . + (1 | major_group)), data = reg_f, weights = w,
                 control = lmerControl(check.conv.singular = .makeCC("ignore", tol = 1e-4)))
sim_f    <- simex_run(reg_f, reg_f$se_log, reg_f$pct_dem_se, f_pol)
m_ols_m  <- lm(f_pol, data = reg_m)
m_wls_m  <- lm(f_pol, data = reg_m, weights = w)
m_re_m   <- fgls_fit(f_pol, reg_m, reg_m$se_log)
m_lmer_m <- lmer(update(f_pol, ~ . + (1 | major_group)), data = reg_m, weights = w,
                 control = lmerControl(check.conv.singular = .makeCC("ignore", tol = 1e-4)))
sim_m    <- simex_run(reg_m, reg_m$se_log, reg_m$pct_dem_se, f_pol)
build_tab5 <- function(m_ols, m_re, m_wls, m_lmer, sim, n_obs, tau2, title, sub, outfile) {
  body <- tibble(
    Predictor = pretty,
    OLS    = sapply(preds, \(p) row_one(m_ols,  p)),
    FGLS   = sapply(preds, \(p) row_one(m_re,   p)),
    WLS    = sapply(preds, \(p) row_one(m_wls,  p)),
    HLM    = sapply(preds, \(p) row_one(m_lmer, p)),
    SIMEX  = sapply(preds, \(p) row_simex(sim, p))
  )
  R2 <- function(m) sprintf("%.3f", summary(m)$r.squared)
  Adj <- function(m) sprintf("%.3f", summary(m)$adj.r.squared)
  r2_row  <- tibble(Predictor = "R²",
                    OLS = R2(m_ols), FGLS = R2(m_re), WLS = R2(m_wls),
                    HLM = "—", SIMEX = sprintf("%.3f", sim$r2))
  adj_row <- tibble(Predictor = "Adj-R²",
                    OLS = Adj(m_ols), FGLS = Adj(m_re), WLS = Adj(m_wls),
                    HLM = "—", SIMEX = "—")
  tab_df <- bind_rows(body, r2_row, adj_row)

  footer <- sprintf(
    "n = %d. FGLS τ² = %.4f. HLM: lmer(... + (1|major_group), weights=1/se²); ICC = 0 (no residual group-level variance). SIMEX: n_sim=1000, linear extrap, n_boot=100; joint noise on log_rr AND pct_dem extrapolated to λ=-1.",
    n_obs, tau2)
  g <- tab_df |> gt() |>
    cols_label(OLS = "OLS", FGLS = "FGLS", WLS = "WLS", HLM = "HLM", SIMEX = "SIMEX") |>
    tab_header(title = title, subtitle = sub) |>
    tab_style(style = list(cell_borders(sides = "top", weight = px(1.5))),
              locations = cells_body(rows = which(tab_df$Predictor == "R²"))) |>
    tab_style(style = list(cell_text(weight = "bold")),
              locations = cells_body(rows = which(tab_df$Predictor %in% c("R²","Adj-R²")))) |>
    tab_source_note(footer) |>
    tab_source_note("Significance: * p<0.05   ** p<0.01   *** p<0.005") |>
    tab_options(table.font.size = 10, heading.title.font.size = 13,
                source_notes.font.size = 9)
  gtsave(g, outfile)
}

build_tab5(m_ols_f, m_re_f$fit, m_wls_f, m_lmer_f, sim_f,
           nobs(m_ols_f), m_re_f$tau2,
           "Female fertility log-RR — five estimators",
           "Predictors rescaled to 0-1 | + politics model",
           "figs/fertility_regression_compact_5model.png")
build_tab5(m_ols_m, m_re_m$fit, m_wls_m, m_lmer_m, sim_m,
           nobs(m_ols_m), m_re_m$tau2,
           "Male fertility log-RR — five estimators",
           "Birth attributed via cohabiting partner's FER answer | predictors 0-1 | + politics model",
           "figs/fertility_regression_compact_5model_male.png")

Session info

sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] splines   stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] lme4_1.1-37        Matrix_1.7-4       ggrepel_0.9.6      gt_1.3.0          
##  [5] arrow_23.0.1.1     lubridate_1.9.4    forcats_1.0.1      stringr_1.6.0     
##  [9] dplyr_1.1.4        purrr_1.2.0        readr_2.1.5        tidyr_1.3.1       
## [13] tibble_3.3.0       ggplot2_4.0.1.9000 tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1   farver_2.1.2       S7_0.2.1           fastmap_1.2.0     
##  [5] webshot2_0.1.2     promises_1.3.3     digest_0.6.39      timechange_0.3.0  
##  [9] lifecycle_1.0.4    sf_1.0-21          processx_3.8.6     magrittr_2.0.4    
## [13] compiler_4.5.3     rlang_1.1.6.9000   sass_0.4.10        tools_4.5.3       
## [17] yaml_2.3.10        knitr_1.50         labeling_0.4.3     bit_4.6.0         
## [21] classInt_0.4-11    xml2_1.4.0         RColorBrewer_1.1-3 tidycensus_1.7.5  
## [25] KernSmooth_2.23-26 websocket_1.4.4    withr_3.0.2        grid_4.5.3        
## [29] e1071_1.7-16       scales_1.4.0       MASS_7.3-65        cli_3.6.5         
## [33] rmarkdown_2.30     crayon_1.5.3       ragg_1.5.0         reformulas_0.4.1  
## [37] generics_0.1.4     httr_1.4.7         tzdb_0.5.0         minqa_1.2.8       
## [41] DBI_1.2.3          cachem_1.1.0       chromote_0.5.1     proxy_0.4-27      
## [45] rvest_1.0.5        assertthat_0.2.1   parallel_4.5.3     tigris_2.2.1      
## [49] vctrs_0.6.5        boot_1.3-32        jsonlite_2.0.0     hms_1.1.3         
## [53] bit64_4.6.0-1      archive_1.1.12     systemfonts_1.3.1  jquerylib_0.1.4   
## [57] units_1.0-0        glue_1.8.0         nloptr_2.2.1       ps_1.9.1          
## [61] stringi_1.8.7      gtable_0.3.6       later_1.4.4        pillar_1.11.1     
## [65] rappdirs_0.3.3     htmltools_0.5.8.1  R6_2.6.1           textshaping_1.0.4 
## [69] Rdpack_2.6.4       vroom_1.6.6        evaluate_1.0.5     lattice_0.22-7    
## [73] rbibutils_2.3      bslib_0.9.0        class_7.3-23       uuid_1.2-1        
## [77] Rcpp_1.1.0         nlme_3.1-168       mgcv_1.9-3         xfun_0.57         
## [81] fs_1.6.6           pkgconfig_2.0.3