# IPUMS API key ----
# The key must NOT be hard-coded in this script: anything written here is
# readable by everyone who can see the file. Keep your own key outside the
# code instead, e.g. run `ipumsr::set_ipums_api_key("<your key>", save = TRUE)`
# once, or add a line `IPUMS_API_KEY=<your key>` to ~/.Renviron and restart R.
# NOTE(review): any key that was ever committed in this file should be treated
# as compromised and regenerated in the IPUMS account.
if (!nzchar(Sys.getenv("IPUMS_API_KEY"))) {
  stop(
    "IPUMS_API_KEY is not set. Add your own IPUMS key to ~/.Renviron ",
    "(or call ipumsr::set_ipums_api_key()) before running this script.",
    call. = FALSE
  )
}

# Extract definition ----
start_year <- 2010
end_year <- 2023
years <- start_year:end_year

# One sample ID per year, e.g. "cps2010_03b".
samples_use <- paste0("cps", years, "_03b")

# Columns requested from IPUMS: year, state, employment status, weekly
# earnings, union status, and the earnings weight.
vars <- c("YEAR", "STATEFIP", "EMPSTAT", "EARNWEEK", "UNION", "EARNWT")

# Build the IPUMS extract request from those samples and variables. Nothing is
# submitted here; this only creates the request object.
ext <- define_extract_micro(
  collection = "cps",
  description = paste0(
    "CPS March Basic Monthly ", start_year, "-", end_year,
    " union + earnweek"
  ),
  samples = samples_use,
  variables = vars
)
# Local cache ----
# The extract is requested from IPUMS only when no cached copy exists yet: the
# first successful run saves the data as an .rds file, and every later run
# loads that file instead of submitting a new extract.
dir.create("data_raw", showWarnings = FALSE)
cache_file <- sprintf("data_raw/cps_%d_%d.rds", start_year, end_year)

if (!file.exists(cache_file)) {
  # Read inside-out: submit the request, wait for it to finish, download the
  # files, read the DDI codebook, then read the microdata it describes.
  cps <- read_ipums_micro(
    read_ipums_ddi(
      download_extract(
        wait_for_extract(
          submit_extract(ext)
        )
      )
    )
  )
  saveRDS(cps, cache_file)
} else {
  cps <- readRDS(cache_file)
}
# Worker-level analysis sample ----
# Keeps only the columns the analysis needs, derives a union indicator and log
# weekly earnings, and restricts to employed people with valid weekly
# earnings, union information, and a positive earnings weight.
analysis_df <- cps %>%
  transmute(
    # Identifiers
    year = as.integer(YEAR),
    state = as.integer(STATEFIP),
    empstat = as.integer(EMPSTAT),
    # Weekly earnings and the CPS earnings weight, as plain numerics
    earnweek = as.numeric(EARNWEEK),
    w = as.numeric(EARNWT),
    # 1 = union member or covered by a union contract (UNION 2 or 3),
    # 0 = no union coverage (UNION 1), NA otherwise (NIU / missing)
    union_any = case_when(
      UNION %in% c(2, 3) ~ 1L,
      UNION == 1 ~ 0L,
      TRUE ~ NA_integer_
    ),
    # Log earnings only for real, positive values. IPUMS CPS codes
    # "not in universe" EARNWEEK as 9999.99, so that sentinel must not be
    # logged as if it were a wage; `< 9999` avoids an exact float comparison.
    # NOTE(review): confirm the NIU code against the extract's DDI codebook.
    log_earnweek = if_else(
      !is.na(earnweek) & earnweek > 0 & earnweek < 9999,
      log(earnweek),
      NA_real_
    )
  ) %>%
  filter(
    empstat %in% c(10, 12), # employed only
    !is.na(union_any),      # non-missing union status
    !is.na(log_earnweek),   # valid, positive weekly earnings
    !is.na(w),
    w > 0                   # positive weight
  )
# State-year panel ----
# Collapses the worker-level sample to one row per state and year.
# NOTE(review): wq() is defined outside this section; presumably a weighted
# quantile helper called as wq(values, weights, probability) -- confirm.
state_year <- summarise(
  group_by(analysis_df, state, year),
  # Weighted share of workers who are union members or covered
  union_share = weighted.mean(x = union_any, w = w, na.rm = TRUE),
  # 10th and 90th percentiles of log weekly earnings, via wq()
  p10 = wq(log_earnweek, w, 0.10),
  p90 = wq(log_earnweek, w, 0.90),
  # Inequality measure: 90-10 gap in log weekly earnings
  gap_90_10 = p90 - p10,
  # Total weight in the state-year (used as regression weights)
  w_sum = sum(w, na.rm = TRUE),
  # Return an ungrouped result
  .groups = "drop"
)
# Inequality vs. unionization ----
# Regresses the state-year 90-10 log earnings gap on the union share with
# state fixed effects (time-invariant differences across states) and year
# fixed effects (shocks common to all states in a year).
m_ineq <- feols(
  fml = gap_90_10 ~ union_share | state + year,
  data = state_year,
  cluster = ~state,  # state-clustered standard errors
  weights = ~w_sum   # state-years with more total weight count more
)
summary(m_ineq)
## OLS estimation, Dep. Var.: gap_90_10
## Observations: 714
## Weights: w_sum
## Fixed-effects: state: 51, year: 14
## Standard-errors: Clustered (state)
## Estimate Std. Error t value Pr(>|t|)
## union_share -0.457403 0.397098 -1.15186 0.25485
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.143874 Adj. R2: 0.354692
## Within R2: 0.004886
\[ \text{gap}_{90,10} = p_{90}\!\big(\log(\text{wage})\big) - p_{10}\!\big(\log(\text{wage})\big) = \log\!\left(\frac{w_{90}}{w_{10}}\right) \]
This is the log 90/10 wage ratio (a standard inequality measure).
union_share is a proportion from 0 to 1 (0% to 100% union/covered)
Estimate = -0.457 means:
If a state’s unionization went up by 1.00 (i.e., +100 percentage points from no union to union), the 90–10 log wage gap would be 0.457 lower.
More realistically, a +0.10 change (10 percentage points) is: −0.457×0.10=−0.0457
Because the gap is a log ratio, a change of -0.0457 implies the 90/10 ratio changes by: exp(−0.0457)≈0.955
So a 10-pp increase in union share is associated with about a 4.5% lower 90/10 wage ratio (less inequality), holding state and year fixed effects constant.
p = 0.255, so: Not significant at 10%, 5%, or 1%. You cannot reject the null that the effect is 0.
Adj R^2 ~ 0.355 is mostly from the fixed effects explaining level differences across states + common year shocks.
Within R^2 ~ 0.0049 means union_share explains very little of the within-state over-time variation in inequality after FE.
Unfortunately, in this dataset/specification, I don’t have strong evidence that changes in a state’s union share are associated with changes in the state’s 90/10 log wage gap.
However, the point estimate implies a roughly 4–5% lower 90/10 wage ratio for a 10-percentage-point increase in union share. So the point estimate is economically plausible; it’s just not precise enough to be confident.
This used state FE + year FE. That means it’s only using within-state over-time changes in unionization to explain within-state over-time changes in inequality. If unionization doesn’t move much within states from year to year (or it moves but inequality is driven by other big forces), the model won’t have much signal.
State-year is a coarse unit. Union effects can be strong in certain industries, certain occupations, or certain parts of the wage distribution without showing up cleanly in a single state-wide 90/10 measure.
The 90/10 gap may not be the channel unions affect most. Unions often compress wages more around the middle or lower-middle. There could be stronger movement in other measures: 50/10 (bottom compression), 90/50 (top pulling away), variance of log wages, Gini, Theil, etc.
Timing / lag effects: unionization changes might affect the wage structure gradually. A simple fix is to try union_share lagged 1–3 years.