Wage Inequality Eco Impact for TWF and Check on SS Analysis

Author

crg

CRG Check

This is my check for the impact of equal pay analysis conducted by SS. Original methodology from IWPR here: https://iwpr.org/wp-content/uploads/2020/09/C455.pdfhttps://iwpr.org/wp-content/uploads/2020/09/C455.pdf

SS Code

# library (ipumsr)
# library(writexl)
# library(dplyr)
# library(car)
# 
# 
# 
# #read in CPS data -- this data extract only selected cases for TX (Fips = 48)
# ddi <- read_ipums_ddi("cps_00002.xml")
# data <- read_ipums_micro(ddi, data_file = "cps_00002.dat", verbose = FALSE)
# 
# # make variable name lowercase
# names(data) <- tolower(names(data))
# 
# #cleaning and filtering data
# data$sex_c <- Recode(as.numeric(data$sex), recodes = "1='Men'; 2='Women'; else = NA", as.factor=T)
# data$age_18up <- ifelse(data$age >17,1,0)
# data$uhrsworkly <- ifelse(data$uhrsworkly == 999, NA,data$uhrsworkly) # 999 = NIU
# data$incwage <- ifelse(data$incwage == 99999999,NA,data$incwage)
# data$educ_c <- Recode(as.numeric(data$educ), recodes = "2:71='1. Below HS'; 73='2. HS Diploma or Equivalent'; 81:92 = '3. Some College or Associates'; 111 = '4. Bachelors Degree'; 123:125 = '5. Higher Degree';  else = NA", as.factor=T)
# data$metro_c <- Recode(as.numeric(data$metro), recodes = "1='1. Not in a Metro Area'; 2:4='2. In a Metro Area'; else = NA", as.factor=T)
# data$region_c <- Recode(as.numeric(data$region), recodes = "11:12='Northeast'; 21:22='Midwest'; 31:33='South';41:42='West'; else = NA", as.factor=T)
# data$ln_ann_earn <- log(data$incwage) #natural log of annual earnings
# data$age2 <- data$age^2 #age squared
# 
# #only filter for columns we want
# data_filter <- data[,c("year","serial","cpsid","asecflag","asecwth","region","metro","pernum", "cpsidp","asecwt","age","wkswork1","uhrsworkly","incwage","sex_c","age_18up","educ_c","metro_c","region_c","ln_ann_earn","age2")]
# 
# 
# #remove those below 18 years of age and without "positive earnings and positive hours of work"
# data_filter <- data_filter %>%
#   filter(age_18up ==1, # only those 18+
#          !is.na(incwage), incwage > 0, # remove missing/invalid income data
#          !is.na(uhrsworkly), uhrsworkly > 0,
#          !is.na(wkswork1), wkswork1 > 0,
#          !is.na(educ_c), !is.na(metro_c), !is.na(region_c)) %>%
#   mutate(hourswrk_LY = uhrsworkly * wkswork1, # calculate hours worked last year
#          hourly_wage = incwage /hourswrk_LY)  # calculate hourly wage)
# 
# 
# #Men only: 
# data_men <- data_filter %>% 
#   filter (sex_c == "Men")
#   
# #90th percentile for wage for men is $150,000 in 2024, $136,000 in 2021
# names(data_men)

Hardcoding for 90th percentile – I think SS used 2017 report threshold and adjusted forward using CPIU

# 
# temp_90thp <- data_men %>% group_by(year) %>% 
# summarise(value_90th_percentile = quantile(incwage, 0.9, na.rm = TRUE))
# 
# #2024 data for men, only those earning below 90th percentile
# data_men24 <- data_men %>%
#   filter (year==2024,incwage<150001)
# 
# #2021 data for men, only those earning below 90th percentile
# data_men21 <- data_men %>%
#   filter (year==2021,incwage<136001)
# 
# 
# #women only: 
# data_women24 <- data_filter %>% 
#   filter (sex_c == "Women", year == 2024)
# data_women21 <- data_filter %>% 
#   filter (sex_c == "Women", year == 2021)

# library(survey)
# des24 <- svydesign(id=~1, weights=~asecwt, data=data_men24)
# 
# ols24 <- svyglm(ln_ann_earn ~ age + age2 + educ_c +metro_c+hourswrk_LY, design = des24)
# 
# summary(ols24)

# des21 <- svydesign(id=~1, weights=~asecwt, data=data_men21)
# 
# ols21 <- svyglm(ln_ann_earn ~ age + age2 + educ_c +metro_c+hourswrk_LY, design = des21)
# 
# summary(ols21)

# 
# #predict values based on men's coefficients
# 
# #add as column to 2024 and 2021 data <-- note that these will give the natural log of annual income. 
# data_women24$Predict_log_annincome <- predict(ols24, newdata=data_women24)
# data_women21$Predict_log_annincome <- predict(ols21, newdata=data_women21)
# 
# #reverse the natural log with exp function
# data_women24$Predict_annincome <- exp(data_women24$Predict_log_annincome)
# data_women21$Predict_annincome <- exp(data_women21$Predict_log_annincome)
# 
# #calculate difference
# data_women24$Predict_difference <- data_women24$Predict_annincome - data_women24$incwage
# data_women21$Predict_difference <- data_women21$Predict_annincome - data_women21$incwage
# 
# 
# #"Women’s earnings are predicted using the coefficients from the men’s earnings equation (this method assumes that women retain their own human capital but are rewarded at the same rates as men would be) and calculated only for the actual hours that women worked during the year. Those with reduced predicted earnings are assigned their actual earnings during the year. The average earnings increase calculated for working women includes those with no predicted earnings increases under equal pay."
# data_women24$Predict_difference <- data_women24$Predict_annincome - data_women24$incwage
# data_women21$Predict_difference <- data_women21$Predict_annincome - data_women21$incwage
# 
# 
# #only include positive differences and sum values
# data_women24$Predict_difference_Positive <- ifelse(data_women24$Predict_difference>0,data_women24$Predict_difference,0)
# data_women21$Predict_difference_Positive <- ifelse(data_women21$Predict_difference>0,data_women21$Predict_difference,0)
# 
# sum(data_women21$Predict_difference_Positive)

CRG start

# #by state
# total21<- data_women21 %>%
#   group_by(year) %>%
#   summarise(sum = sum(Predict_difference_Positive*asecwt))
# 
# total24<- data_women24 %>%
#   group_by(year) %>%
#   summarise(sum = sum(Predict_difference_Positive*asecwt))
# 
# 
# print(total21)
# print(total24)
# 
# 
# total24 <- total24 / 1e9
# total21 <- total21 / 1e9
# 
# 
# summary(data_women24$asecwt)
# 
# 
# sum(data_women24$Predict_difference_Positive * asecwt) / 1e9

CRG Re-Do: 2019-2024

I pulled ASEC data for years 2019-2024 to get more data and see year to year change.

Friendly reminder to myself bc everytime I go back to CPS after not using for a while, I forget about how annoying CPS is by default adding the monthly data. Always remember to “uncheck” the basic monthly samples. See picture

library(ipumsr)
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(survey)

Loading required package: grid
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loading required package: survival

Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart

library(writexl)

Warning: package 'writexl' was built under R version 4.4.2

library(purrr)

#CPS ASEC data (2019-2024)
ddi <- read_ipums_ddi("cps_00072.xml")
data <- read_ipums_micro(ddi, data_file = "cps_00072.dat", verbose = FALSE)

data <- rename_with(data, tolower)

data <- data %>%
  mutate(
    sex_c = case_when(
      sex == 1 ~ "Men",
      sex == 2 ~ "Women",
      TRUE ~ NA_character_
    ) %>% factor(),

    age_18up = if_else(age > 17, 1, 0),
    uhrsworkly = na_if(uhrsworkly, 999),
    incwage = na_if(incwage, 99999999) %>% as.numeric(),

    #ed categories
    educ_c = case_when(
      educ >= 2 & educ <= 71 ~ "1. Below HS",
      educ == 73 ~ "2. HS Diploma or Equivalent",
      educ >= 81 & educ <= 92 ~ "3. Some College or Associates",
      educ == 111 ~ "4. Bachelors Degree",
      educ >= 123 & educ <= 125 ~ "5. Higher Degree",
      TRUE ~ NA_character_
    ) |> factor(levels = c("1. Below HS", "2. HS Diploma or Equivalent", 
                            "3. Some College or Associates", "4. Bachelors Degree", "5. Higher Degree")),

    #metro
    metro_c = case_when(
      metro == 1 ~ "1. Not in Metro",
      metro %in% c(2, 3, 4) ~ "2. In Metro",
      TRUE ~ NA_character_
    ) %>% factor(levels = c("1. Not in Metro", "2. In Metro")),

    ln_ann_earn = log(incwage),
    age2 = age^2
  ) |>
  filter(
    age_18up == 1, 
    !is.na(incwage) & incwage > 0, 
    !is.na(uhrsworkly) & uhrsworkly > 0,
    !is.na(wkswork1) & wkswork1 > 0,
    !is.na(educ_c), !is.na(metro_c)
  ) |>
  mutate(
    hourswrk_LY = uhrsworkly * wkswork1, 
    hourly_wage = incwage / hourswrk_LY
  )

#compute 90th percentile earnings cutoff for men per year (not hardcoded..reasoning is that dynamic allows us to capture time changes..in other words...The 2017 report dynamically determined the 90th percentile using the CPS dataset at the time of analysis.Hardcoding $136,000 for 2021 and $150,000 for 2024 ignores actual wage distribution changes over time)

men_90th_percentile <- data |>
  filter(sex_c == "Men") |>
  group_by(year) |>
  summarise(value_90th_percentile = quantile(incwage, 0.9, na.rm = TRUE), .groups = "drop")

#merge cutoff into dataset
data <- left_join(data, men_90th_percentile, by = "year")

#subset men below 90th percentile
data_men <- data |> filter(sex_c == "Men", incwage < value_90th_percentile)

#alternative to SS FOR LOOP: Run OLS models per year using `map()`
models <- data_men |>
  group_split(year) |>
  set_names(unique(data$year)) |>
  map(~{
    if (nrow(.x) > 0) {
      des <- svydesign(id = ~1, weights = ~as.numeric(.x$asecwt), data = .x)
      return(svyglm(ln_ann_earn ~ age + age2 + educ_c + metro_c + hourswrk_LY, design = des))
    } else {
      return(NULL)
    }
  })

#apply men's wage model to predict women's earnings
data_women <- data %>% filter(sex_c == "Women")

#ensure predictions return numeric values to avoid svystat issue
data_women <- data_women |>
  group_split(year) |>
  set_names(unique(data_women$year)) |>
  map_dfr(~{
    y <- as.character(.x$year[1])
    model <- models[[y]]
    
    if (!is.null(model)) {
      .x <- .x |>
        filter(complete.cases(age, age2, educ_c, metro_c, hourswrk_LY)) %>%
        mutate(
          Predict_log_annincome = as.numeric(predict(model, newdata = .)),
          Predict_annincome = exp(Predict_log_annincome),
          Predict_difference = as.numeric(Predict_annincome) - as.numeric(incwage),
          Predict_difference_Positive = if_else(as.numeric(Predict_difference) > 0, as.numeric(Predict_difference), 0)
        )
    }
    return(.x)
  })

total_wage_gap <- data_women |>
  group_by(year) |>
  summarise(total_wage_increase = sum(as.numeric(Predict_difference_Positive) * as.numeric(asecwt), na.rm = TRUE) / 1e9)

print(total_wage_gap)

# A tibble: 6 × 2
   year total_wage_increase
  <dbl>               <dbl>
1  2019                47.0
2  2020                53.7
3  2021                45.7
4  2022                56.6
5  2023                61.6
6  2024                60.1

# write_xlsx(total_wage_gap, "Texas_Equal_Pay_Analysis_2019_2024.xlsx")

Verify 2021 Results Using Hardcoded Threshold to Cross Check

If I hardcoded, would I get what SS/IWPR got?

# percentile_2021 <- 136000
# 
# # Subset men below the 90th percentile threshold for 2021
# data_men_2021_hc <- data_men |>
#   filter(year == 2021, incwage < percentile_2021)
# 
# #OLS model for 2021 using men’s earnings (with hardcoded threshold)
# des_2021_hc <- svydesign(id = ~1, weights = ~asecwt, data = data_men_2021_hc)
# 
# ols_2021_hc <- svyglm(ln_ann_earn ~ age + age2 + educ_c + metro_c + hourswrk_LY, design = des_2021_hc)
# 
# 
# data_women_2021_hc <- data_women |>
#   filter(year == 2021)
# 
# data_women_2021_hc$educ_c <- factor(data_women_2021_hc$educ_c, levels = levels(data_men_2021_hc$educ_c))
# data_women_2021_hc$metro_c <- factor(data_women_2021_hc$metro_c, levels = levels(data_men_2021_hc$metro_c))
# 
# #women's earnings using men’s OLS model with hardcoded threshold
# data_women_2021_hc$Predict_log_annincome <- as.numeric(predict(ols_2021_hc, newdata = data_women_2021_hc))
# 
# data_women_2021_hc <- data_women_2021_hc |>
#   mutate(
#     Predict_annincome = exp(Predict_log_annincome),
#     Predict_difference = Predict_annincome - incwage,
#     Predict_difference_Positive = pmax(Predict_difference, 0))
# 
# total_wage_gap_2021_hc <- data_women_2021_hc |>
#   summarise(total_wage_increase = sum(Predict_difference_Positive * asecwt, na.rm = TRUE) / 1e9)  # Convert to billions
# 
# print(total_wage_gap_2021_hc)

Verifying 2024 Using Hardcoded

# # Define hardcoded 90th percentile wage cutoff for 2024
# percentile_2024 <- 150000
# 
# # Subset men below the 90th percentile threshold for 2024
# data_men_2024_hc <- data_men |>
#   filter(year == 2024, incwage < percentile_2024)
# 
# #run OLS model for 2024 using men’s earnings (with hardcoded threshold)
# des_2024_hc <- svydesign(id = ~1, weights = ~asecwt, data = data_men_2024_hc)
# 
# ols_2024_hc <- svyglm(ln_ann_earn ~ age + age2 + educ_c + metro_c + hourswrk_LY, design = des_2024_hc)
# 
# data_women_2024_hc <- data_women |>
#   filter(year == 2024)
# 
# data_women_2024_hc$educ_c <- factor(data_women_2024_hc$educ_c, levels = levels(data_men_2024_hc$educ_c))
# data_women_2024_hc$metro_c <- factor(data_women_2024_hc$metro_c, levels = levels(data_men_2024_hc$metro_c))
# 
# data_women_2024_hc$Predict_log_annincome <- as.numeric(predict(ols_2024_hc, newdata = data_women_2024_hc))
# 
# #everse log transformation and compute wage differences
# data_women_2024_hc <- data_women_2024_hc %>%
#   mutate(
#     Predict_annincome = exp(Predict_log_annincome),
#     Predict_difference = Predict_annincome - incwage,
#     Predict_difference_Positive = pmax(Predict_difference, 0)  )
# 
# total_wage_gap_2024_hc <- data_women_2024_hc %>%
#   summarise(total_wage_increase = sum(Predict_difference_Positive * asecwt, na.rm = TRUE) / 1e9)  
# 
# print(total_wage_gap_2024_hc)

function (object, contr, how.many, ...) 
{
    object <- as.factor(object)
    if (!nlevels(object)) 
        stop("object not interpretable as a factor")
    if (!missing(contr) && is.name(Xcontr <- substitute(contr))) 
        contr <- switch(as.character(Xcontr), poly = "contr.poly", 
            helmert = "contr.helmert", sum = "contr.sum", treatment = "contr.treatment", 
            SAS = "contr.SAS", contr)
    if (missing(contr)) {
        oc <- getOption("contrasts")
        contr <- if (length(oc) < 2L) 
            if (is.ordered(object)) 
                contr.poly
            else contr.treatment
        else oc[1 + is.ordered(object)]
    }
    if (missing(how.many) && missing(...)) 
        contrasts(object) <- contr
    else {
        if (is.character(contr)) 
            contr <- get(contr, mode = "function")
        if (is.function(contr)) 
            contr <- contr(levels(object), ...)
        contrasts(object, how.many) <- contr
    }
    object
}
<bytecode: 0x000002bfa950dc80>
<environment: namespace:stats>

Poverty Impact

Note to self: Modified original dataset to include family size

Note* I went from using incwage to hhincome*

library(dplyr)
library(tidyverse)
library(survey)
library(writexl)

# Assign 2024 Census poverty thresholds
poverty_thresholds <- c(
  "1" = 16320, "2" = 21006, "3" = 24537, "4" = 32355,
  "5" = 39019, "6" = 44879, "7" = 51638, "8" = 57753, "9" = 69473
)

# Update poverty threshold calculation using HHINCOME
data_women1 <- data_women |>
  mutate(
    pov_threshold = case_when(
      famsize == 1 ~ poverty_thresholds["1"],
      famsize == 2 ~ poverty_thresholds["2"],
      famsize == 3 ~ poverty_thresholds["3"],
      famsize == 4 ~ poverty_thresholds["4"],
      famsize == 5 ~ poverty_thresholds["5"],
      famsize == 6 ~ poverty_thresholds["6"],
      famsize == 7 ~ poverty_thresholds["7"],
      famsize == 8 ~ poverty_thresholds["8"],
      famsize >= 9 ~ poverty_thresholds["9"],
      TRUE ~ NA_real_
    ),
    below_poverty_before = if_else(hhincome < pov_threshold, 1, 0)  # Use HHINCOME instead of INCWAGE
  )

# Keep all working women (removing full-time/full-year filter)
data_women1 <- data_women1 |> filter(year == 2024)


# Adjust household income instead of individual wages
data_women1 <- data_women1 |>
  mutate(
    hhincome_adjusted = hhincome + Predict_difference_Positive,  
    below_poverty_after = if_else(hhincome_adjusted < pov_threshold, 1, 0)  
  )

# Summarize poverty rates before and after wage adjustment
poverty_summary <- data_women1 |>
  summarise(
    total_women = sum(asecwt, na.rm = TRUE),  # Corrected weight variable
    poverty_before = weighted.mean(below_poverty_before, asecwt, na.rm = TRUE) * 100,  
    poverty_after = weighted.mean(below_poverty_after, asecwt, na.rm = TRUE) * 100,  
    reduction = poverty_before - poverty_after  
  )

print(poverty_summary)

# A tibble: 1 × 4
  total_women poverty_before poverty_after reduction
        <dbl>          <dbl>         <dbl>     <dbl>
1    6762147.           5.05          2.60      2.46

# write_xlsx(poverty_summary, "Texas_Poverty_Impact_Equal_Pay.xlsx")


# summary(data_women1$hhincome)
# hist(data_women1$hhincome, breaks = 50, main = "Household Income Distribution", xlab = "Household Income")


# #poverty rate for working women in 2023
# data_women_2023 <- data_women |> filter(year == 2023)
# 
# 
# data_women_2023 <- data_women_2023 |>
#   mutate(
#     pov_threshold = case_when(
#       famsize == 1 ~ poverty_thresholds["1"],
#       famsize == 2 ~ poverty_thresholds["2"],
#       famsize == 3 ~ poverty_thresholds["3"],
#       famsize == 4 ~ poverty_thresholds["4"],
#       famsize == 5 ~ poverty_thresholds["5"],
#       famsize == 6 ~ poverty_thresholds["6"],
#       famsize == 7 ~ poverty_thresholds["7"],
#       famsize == 8 ~ poverty_thresholds["8"],
#       famsize >= 9 ~ poverty_thresholds["9"],
#       TRUE ~ NA_real_
#     ),
#     below_poverty_before = if_else(hhincome < pov_threshold, 1, 0))
# 
# poverty_summary_2023 <- data_women_2023 |>
#   summarise(
#     total_women = sum(asecwt, na.rm = TRUE),
#     poverty_before = weighted.mean(hhincome < pov_threshold, asecwt, na.rm = TRUE) * 100
#   )
# 
# print(poverty_summary_2023)