# library (ipumsr)
# library(writexl)
# library(dplyr)
# library(car)
#
#
#
# #read in CPS data -- this data extract only selected cases for TX (Fips = 48)
# ddi <- read_ipums_ddi("cps_00002.xml")
# data <- read_ipums_micro(ddi, data_file = "cps_00002.dat", verbose = FALSE)
#
# # make variable name lowercase
# names(data) <- tolower(names(data))
#
# #cleaning and filtering data
# data$sex_c <- Recode(as.numeric(data$sex), recodes = "1='Men'; 2='Women'; else = NA", as.factor=T)
# data$age_18up <- ifelse(data$age >17,1,0)
# data$uhrsworkly <- ifelse(data$uhrsworkly == 999, NA,data$uhrsworkly) # 999 = NIU
# data$incwage <- ifelse(data$incwage == 99999999,NA,data$incwage)
# data$educ_c <- Recode(as.numeric(data$educ), recodes = "2:71='1. Below HS'; 73='2. HS Diploma or Equivalent'; 81:92 = '3. Some College or Associates'; 111 = '4. Bachelors Degree'; 123:125 = '5. Higher Degree'; else = NA", as.factor=T)
# data$metro_c <- Recode(as.numeric(data$metro), recodes = "1='1. Not in a Metro Area'; 2:4='2. In a Metro Area'; else = NA", as.factor=T)
# data$region_c <- Recode(as.numeric(data$region), recodes = "11:12='Northeast'; 21:22='Midwest'; 31:33='South';41:42='West'; else = NA", as.factor=T)
# data$ln_ann_earn <- log(data$incwage) #natural log of annual earnings
# data$age2 <- data$age^2 #age squared
#
# #only filter for columns we want
# data_filter <- data[,c("year","serial","cpsid","asecflag","asecwth","region","metro","pernum", "cpsidp","asecwt","age","wkswork1","uhrsworkly","incwage","sex_c","age_18up","educ_c","metro_c","region_c","ln_ann_earn","age2")]
#
#
# #remove those below 18 years of age and without "positive earnings and positive hours of work"
# data_filter <- data_filter %>%
# filter(age_18up ==1, # only those 18+
# !is.na(incwage), incwage > 0, # remove missing/invalid income data
# !is.na(uhrsworkly), uhrsworkly > 0,
# !is.na(wkswork1), wkswork1 > 0,
# !is.na(educ_c), !is.na(metro_c), !is.na(region_c)) %>%
# mutate(hourswrk_LY = uhrsworkly * wkswork1, # calculate hours worked last year
# hourly_wage = incwage /hourswrk_LY) # calculate hourly wage)
#
#
# #Men only:
# data_men <- data_filter %>%
# filter (sex_c == "Men")
#
# #90th percentile for wage for men is $150,000 in 2024, $136,000 in 2021
# names(data_men)
Wage Inequality Eco Impact for TWF and Check on SS Analysis
CRG Check
This is my check for the impact of equal pay analysis conducted by SS. Original methodology from IWPR here: https://iwpr.org/wp-content/uploads/2020/09/C455.pdfhttps://iwpr.org/wp-content/uploads/2020/09/C455.pdf
SS Code
Hardcoding for 90th percentile – I think SS used 2017 report threshold and adjusted forward using CPIU
#
# temp_90thp <- data_men %>% group_by(year) %>%
# summarise(value_90th_percentile = quantile(incwage, 0.9, na.rm = TRUE))
#
# #2024 data for men, only those earning below 90th percentile
# data_men24 <- data_men %>%
# filter (year==2024,incwage<150001)
#
# #2021 data for men, only those earning below 90th percentile
# data_men21 <- data_men %>%
# filter (year==2021,incwage<136001)
#
#
# #women only:
# data_women24 <- data_filter %>%
# filter (sex_c == "Women", year == 2024)
# data_women21 <- data_filter %>%
# filter (sex_c == "Women", year == 2021)
# library(survey)
# des24 <- svydesign(id=~1, weights=~asecwt, data=data_men24)
#
# ols24 <- svyglm(ln_ann_earn ~ age + age2 + educ_c +metro_c+hourswrk_LY, design = des24)
#
# summary(ols24)
# des21 <- svydesign(id=~1, weights=~asecwt, data=data_men21)
#
# ols21 <- svyglm(ln_ann_earn ~ age + age2 + educ_c +metro_c+hourswrk_LY, design = des21)
#
# summary(ols21)
#
# #predict values based on men's coefficients
#
# #add as column to 2024 and 2021 data <-- note that these will give the natural log of annual income.
# data_women24$Predict_log_annincome <- predict(ols24, newdata=data_women24)
# data_women21$Predict_log_annincome <- predict(ols21, newdata=data_women21)
#
# #reverse the natural log with exp function
# data_women24$Predict_annincome <- exp(data_women24$Predict_log_annincome)
# data_women21$Predict_annincome <- exp(data_women21$Predict_log_annincome)
#
# #calculate difference
# data_women24$Predict_difference <- data_women24$Predict_annincome - data_women24$incwage
# data_women21$Predict_difference <- data_women21$Predict_annincome - data_women21$incwage
#
#
# #"Women’s earnings are predicted using the coefficients from the men’s earnings equation (this method assumes that women retain their own human capital but are rewarded at the same rates as men would be) and calculated only for the actual hours that women worked during the year. Those with reduced predicted earnings are assigned their actual earnings during the year. The average earnings increase calculated for working women includes those with no predicted earnings increases under equal pay."
# data_women24$Predict_difference <- data_women24$Predict_annincome - data_women24$incwage
# data_women21$Predict_difference <- data_women21$Predict_annincome - data_women21$incwage
#
#
# #only include positive differences and sum values
# data_women24$Predict_difference_Positive <- ifelse(data_women24$Predict_difference>0,data_women24$Predict_difference,0)
# data_women21$Predict_difference_Positive <- ifelse(data_women21$Predict_difference>0,data_women21$Predict_difference,0)
#
# sum(data_women21$Predict_difference_Positive)
CRG start
# #by state
# total21<- data_women21 %>%
# group_by(year) %>%
# summarise(sum = sum(Predict_difference_Positive*asecwt))
#
# total24<- data_women24 %>%
# group_by(year) %>%
# summarise(sum = sum(Predict_difference_Positive*asecwt))
#
#
# print(total21)
# print(total24)
#
#
# total24 <- total24 / 1e9
# total21 <- total21 / 1e9
#
#
# summary(data_women24$asecwt)
#
#
# sum(data_women24$Predict_difference_Positive * asecwt) / 1e9
CRG Re-Do: 2019-2024
I pulled ASEC data for years 2019-2024 to get more data and see year to year change.
Friendly reminder to myself bc everytime I go back to CPS after not using for a while, I forget about how annoying CPS is by default adding the monthly data. Always remember to “uncheck” the basic monthly samples. See picture
library(ipumsr)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.1 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(survey)
Loading required package: grid
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Loading required package: survival
Attaching package: 'survey'
The following object is masked from 'package:graphics':
dotchart
library(writexl)
Warning: package 'writexl' was built under R version 4.4.2
library(purrr)
#CPS ASEC data (2019-2024)
<- read_ipums_ddi("cps_00072.xml")
ddi <- read_ipums_micro(ddi, data_file = "cps_00072.dat", verbose = FALSE)
data
<- rename_with(data, tolower)
data
<- data %>%
data mutate(
sex_c = case_when(
== 1 ~ "Men",
sex == 2 ~ "Women",
sex TRUE ~ NA_character_
%>% factor(),
)
age_18up = if_else(age > 17, 1, 0),
uhrsworkly = na_if(uhrsworkly, 999),
incwage = na_if(incwage, 99999999) %>% as.numeric(),
#ed categories
educ_c = case_when(
>= 2 & educ <= 71 ~ "1. Below HS",
educ == 73 ~ "2. HS Diploma or Equivalent",
educ >= 81 & educ <= 92 ~ "3. Some College or Associates",
educ == 111 ~ "4. Bachelors Degree",
educ >= 123 & educ <= 125 ~ "5. Higher Degree",
educ TRUE ~ NA_character_
|> factor(levels = c("1. Below HS", "2. HS Diploma or Equivalent",
) "3. Some College or Associates", "4. Bachelors Degree", "5. Higher Degree")),
#metro
metro_c = case_when(
== 1 ~ "1. Not in Metro",
metro %in% c(2, 3, 4) ~ "2. In Metro",
metro TRUE ~ NA_character_
%>% factor(levels = c("1. Not in Metro", "2. In Metro")),
)
ln_ann_earn = log(incwage),
age2 = age^2
|>
) filter(
== 1,
age_18up !is.na(incwage) & incwage > 0,
!is.na(uhrsworkly) & uhrsworkly > 0,
!is.na(wkswork1) & wkswork1 > 0,
!is.na(educ_c), !is.na(metro_c)
|>
) mutate(
hourswrk_LY = uhrsworkly * wkswork1,
hourly_wage = incwage / hourswrk_LY
)
#compute 90th percentile earnings cutoff for men per year (not hardcoded..reasoning is that dynamic allows us to capture time changes..in other words...The 2017 report dynamically determined the 90th percentile using the CPS dataset at the time of analysis.Hardcoding $136,000 for 2021 and $150,000 for 2024 ignores actual wage distribution changes over time)
<- data |>
men_90th_percentile filter(sex_c == "Men") |>
group_by(year) |>
summarise(value_90th_percentile = quantile(incwage, 0.9, na.rm = TRUE), .groups = "drop")
#merge cutoff into dataset
<- left_join(data, men_90th_percentile, by = "year")
data
#subset men below 90th percentile
<- data |> filter(sex_c == "Men", incwage < value_90th_percentile)
data_men
#alternative to SS FOR LOOP: Run OLS models per year using `map()`
<- data_men |>
models group_split(year) |>
set_names(unique(data$year)) |>
map(~{
if (nrow(.x) > 0) {
<- svydesign(id = ~1, weights = ~as.numeric(.x$asecwt), data = .x)
des return(svyglm(ln_ann_earn ~ age + age2 + educ_c + metro_c + hourswrk_LY, design = des))
else {
} return(NULL)
}
})
#apply men's wage model to predict women's earnings
<- data %>% filter(sex_c == "Women")
data_women
#ensure predictions return numeric values to avoid svystat issue
<- data_women |>
data_women group_split(year) |>
set_names(unique(data_women$year)) |>
map_dfr(~{
<- as.character(.x$year[1])
y <- models[[y]]
model
if (!is.null(model)) {
<- .x |>
.x filter(complete.cases(age, age2, educ_c, metro_c, hourswrk_LY)) %>%
mutate(
Predict_log_annincome = as.numeric(predict(model, newdata = .)),
Predict_annincome = exp(Predict_log_annincome),
Predict_difference = as.numeric(Predict_annincome) - as.numeric(incwage),
Predict_difference_Positive = if_else(as.numeric(Predict_difference) > 0, as.numeric(Predict_difference), 0)
)
}return(.x)
})
<- data_women |>
total_wage_gap group_by(year) |>
summarise(total_wage_increase = sum(as.numeric(Predict_difference_Positive) * as.numeric(asecwt), na.rm = TRUE) / 1e9)
print(total_wage_gap)
# A tibble: 6 × 2
year total_wage_increase
<dbl> <dbl>
1 2019 47.0
2 2020 53.7
3 2021 45.7
4 2022 56.6
5 2023 61.6
6 2024 60.1
# write_xlsx(total_wage_gap, "Texas_Equal_Pay_Analysis_2019_2024.xlsx")
Verify 2021 Results Using Hardcoded Threshold to Cross Check
If I hardcoded, would I get what SS/IWPR got?
# percentile_2021 <- 136000
#
# # Subset men below the 90th percentile threshold for 2021
# data_men_2021_hc <- data_men |>
# filter(year == 2021, incwage < percentile_2021)
#
# #OLS model for 2021 using men’s earnings (with hardcoded threshold)
# des_2021_hc <- svydesign(id = ~1, weights = ~asecwt, data = data_men_2021_hc)
#
# ols_2021_hc <- svyglm(ln_ann_earn ~ age + age2 + educ_c + metro_c + hourswrk_LY, design = des_2021_hc)
#
#
# data_women_2021_hc <- data_women |>
# filter(year == 2021)
#
# data_women_2021_hc$educ_c <- factor(data_women_2021_hc$educ_c, levels = levels(data_men_2021_hc$educ_c))
# data_women_2021_hc$metro_c <- factor(data_women_2021_hc$metro_c, levels = levels(data_men_2021_hc$metro_c))
#
# #women's earnings using men’s OLS model with hardcoded threshold
# data_women_2021_hc$Predict_log_annincome <- as.numeric(predict(ols_2021_hc, newdata = data_women_2021_hc))
#
# data_women_2021_hc <- data_women_2021_hc |>
# mutate(
# Predict_annincome = exp(Predict_log_annincome),
# Predict_difference = Predict_annincome - incwage,
# Predict_difference_Positive = pmax(Predict_difference, 0))
#
# total_wage_gap_2021_hc <- data_women_2021_hc |>
# summarise(total_wage_increase = sum(Predict_difference_Positive * asecwt, na.rm = TRUE) / 1e9) # Convert to billions
#
# print(total_wage_gap_2021_hc)
Verifying 2024 Using Hardcoded
# # Define hardcoded 90th percentile wage cutoff for 2024
# percentile_2024 <- 150000
#
# # Subset men below the 90th percentile threshold for 2024
# data_men_2024_hc <- data_men |>
# filter(year == 2024, incwage < percentile_2024)
#
# #run OLS model for 2024 using men’s earnings (with hardcoded threshold)
# des_2024_hc <- svydesign(id = ~1, weights = ~asecwt, data = data_men_2024_hc)
#
# ols_2024_hc <- svyglm(ln_ann_earn ~ age + age2 + educ_c + metro_c + hourswrk_LY, design = des_2024_hc)
#
# data_women_2024_hc <- data_women |>
# filter(year == 2024)
#
# data_women_2024_hc$educ_c <- factor(data_women_2024_hc$educ_c, levels = levels(data_men_2024_hc$educ_c))
# data_women_2024_hc$metro_c <- factor(data_women_2024_hc$metro_c, levels = levels(data_men_2024_hc$metro_c))
#
# data_women_2024_hc$Predict_log_annincome <- as.numeric(predict(ols_2024_hc, newdata = data_women_2024_hc))
#
# #everse log transformation and compute wage differences
# data_women_2024_hc <- data_women_2024_hc %>%
# mutate(
# Predict_annincome = exp(Predict_log_annincome),
# Predict_difference = Predict_annincome - incwage,
# Predict_difference_Positive = pmax(Predict_difference, 0) )
#
# total_wage_gap_2024_hc <- data_women_2024_hc %>%
# summarise(total_wage_increase = sum(Predict_difference_Positive * asecwt, na.rm = TRUE) / 1e9)
#
# print(total_wage_gap_2024_hc)
C
function (object, contr, how.many, ...)
{
object <- as.factor(object)
if (!nlevels(object))
stop("object not interpretable as a factor")
if (!missing(contr) && is.name(Xcontr <- substitute(contr)))
contr <- switch(as.character(Xcontr), poly = "contr.poly",
helmert = "contr.helmert", sum = "contr.sum", treatment = "contr.treatment",
SAS = "contr.SAS", contr)
if (missing(contr)) {
oc <- getOption("contrasts")
contr <- if (length(oc) < 2L)
if (is.ordered(object))
contr.poly
else contr.treatment
else oc[1 + is.ordered(object)]
}
if (missing(how.many) && missing(...))
contrasts(object) <- contr
else {
if (is.character(contr))
contr <- get(contr, mode = "function")
if (is.function(contr))
contr <- contr(levels(object), ...)
contrasts(object, how.many) <- contr
}
object
}
<bytecode: 0x000002bfa950dc80>
<environment: namespace:stats>
Poverty Impact
Note to self: Modified original dataset to include family size
Note* I went from using incwage to hhincome*
library(dplyr)
library(tidyverse)
library(survey)
library(writexl)
# Assign 2024 Census poverty thresholds
<- c(
poverty_thresholds "1" = 16320, "2" = 21006, "3" = 24537, "4" = 32355,
"5" = 39019, "6" = 44879, "7" = 51638, "8" = 57753, "9" = 69473
)
# Update poverty threshold calculation using HHINCOME
<- data_women |>
data_women1 mutate(
pov_threshold = case_when(
== 1 ~ poverty_thresholds["1"],
famsize == 2 ~ poverty_thresholds["2"],
famsize == 3 ~ poverty_thresholds["3"],
famsize == 4 ~ poverty_thresholds["4"],
famsize == 5 ~ poverty_thresholds["5"],
famsize == 6 ~ poverty_thresholds["6"],
famsize == 7 ~ poverty_thresholds["7"],
famsize == 8 ~ poverty_thresholds["8"],
famsize >= 9 ~ poverty_thresholds["9"],
famsize TRUE ~ NA_real_
),below_poverty_before = if_else(hhincome < pov_threshold, 1, 0) # Use HHINCOME instead of INCWAGE
)
# Keep all working women (removing full-time/full-year filter)
<- data_women1 |> filter(year == 2024)
data_women1
# Adjust household income instead of individual wages
<- data_women1 |>
data_women1 mutate(
hhincome_adjusted = hhincome + Predict_difference_Positive,
below_poverty_after = if_else(hhincome_adjusted < pov_threshold, 1, 0)
)
# Summarize poverty rates before and after wage adjustment
<- data_women1 |>
poverty_summary summarise(
total_women = sum(asecwt, na.rm = TRUE), # Corrected weight variable
poverty_before = weighted.mean(below_poverty_before, asecwt, na.rm = TRUE) * 100,
poverty_after = weighted.mean(below_poverty_after, asecwt, na.rm = TRUE) * 100,
reduction = poverty_before - poverty_after
)
print(poverty_summary)
# A tibble: 1 × 4
total_women poverty_before poverty_after reduction
<dbl> <dbl> <dbl> <dbl>
1 6762147. 5.05 2.60 2.46
# write_xlsx(poverty_summary, "Texas_Poverty_Impact_Equal_Pay.xlsx")
# summary(data_women1$hhincome)
# hist(data_women1$hhincome, breaks = 50, main = "Household Income Distribution", xlab = "Household Income")
# #poverty rate for working women in 2023
# data_women_2023 <- data_women |> filter(year == 2023)
#
#
# data_women_2023 <- data_women_2023 |>
# mutate(
# pov_threshold = case_when(
# famsize == 1 ~ poverty_thresholds["1"],
# famsize == 2 ~ poverty_thresholds["2"],
# famsize == 3 ~ poverty_thresholds["3"],
# famsize == 4 ~ poverty_thresholds["4"],
# famsize == 5 ~ poverty_thresholds["5"],
# famsize == 6 ~ poverty_thresholds["6"],
# famsize == 7 ~ poverty_thresholds["7"],
# famsize == 8 ~ poverty_thresholds["8"],
# famsize >= 9 ~ poverty_thresholds["9"],
# TRUE ~ NA_real_
# ),
# below_poverty_before = if_else(hhincome < pov_threshold, 1, 0))
#
# poverty_summary_2023 <- data_women_2023 |>
# summarise(
# total_women = sum(asecwt, na.rm = TRUE),
# poverty_before = weighted.mean(hhincome < pov_threshold, asecwt, na.rm = TRUE) * 100
# )
#
# print(poverty_summary_2023)