library(ipumsr)
library(tidyverse)
library(survey)
library(writexl)
library(purrr)Poverty Reduction Estimate for Women in Texas, 2024
Estimating Poverty Reduction for Women in Texas Given Gender Pay Equality
Source: 2024 Current Population Survey Annual Social and Economic supplement (CPS-ASEC) via IPUMS CPS
Sources, Methods, and Data Notes
This analysis calculates the estimated reduction of poverty for women in Texas if there was no gender pay inequality for the year 2024. The code is based on Coda Rayo-Garza’s original code.
For the analysis, I use 2024 CPS-ASEC Cross-sectional data from IPUMS CPS. I did not use any of the basic monthly samples. I filtered the data to be for Texas (FIPS=48) before downloading the data extract.
Data Prep
# Load CPS-ASEC data extract (2024)
ddi <- read_ipums_ddi("cps_00021.xml")
data <- read_ipums_micro(ddi, data_file = "cps_00021.dat", verbose = FALSE)
data <- rename_with(data, tolower)Cleaning Data
# CRG's code (from gender wage gap analysis)
data <- data %>%
mutate(
sex_c = case_when(
sex == 1 ~ "Men",
sex == 2 ~ "Women",
TRUE ~ NA_character_
) %>% factor(),
age_18up = if_else(age > 17, 1, 0),
uhrsworkly = na_if(uhrsworkly, 999),
incwage = na_if(incwage, 99999999) %>% as.numeric(),
#ed categories
educ_c = case_when(
educ >= 2 & educ <= 71 ~ "1. Below HS",
educ == 73 ~ "2. HS Diploma or Equivalent",
educ >= 81 & educ <= 92 ~ "3. Some College or Associates",
educ == 111 ~ "4. Bachelors Degree",
educ >= 123 & educ <= 125 ~ "5. Higher Degree",
TRUE ~ NA_character_
) |> factor(levels = c("1. Below HS", "2. HS Diploma or Equivalent",
"3. Some College or Associates", "4. Bachelors Degree", "5. Higher Degree")),
#metro
metro_c = case_when(
metro == 1 ~ "1. Not in Metro",
metro %in% c(2, 3, 4) ~ "2. In Metro",
TRUE ~ NA_character_
) %>% factor(levels = c("1. Not in Metro", "2. In Metro")),
# KW add: make sure poverty is numeric
poverty = as.numeric(poverty),
ln_ann_earn = log(incwage),
age2 = age^2
) |>
filter(
age_18up == 1,
!is.na(incwage) & incwage > 0,
!is.na(uhrsworkly) & uhrsworkly > 0,
!is.na(wkswork1) & wkswork1 > 0,
!is.na(educ_c),
!is.na(metro_c),
!is.na(hhincome) & hhincome > 0, # added household income to use for poverty analysis
!is.na(cutoff), # added cutoff for poverty analysis
) |>
mutate(
hourswrk_LY = uhrsworkly * wkswork1,
hourly_wage = incwage / hourswrk_LY,
in_poverty = if_else(poverty == 10, 1, 0) # added poverty vairable for poverty analysis
)Calculating the actual poverty rate for working women
Here I am just doing some debugging because I can’t figure out why my poverty rates are so low!! So I have double checked the actual poverty rate for working women three different ways.
# Method 1: using svymean()
working_women <- data %>%
filter(
year == 2024,
sex_c == "Women",
)
#define survey design
design <- svydesign(ids = ~1, weights = ~asecwt, data = working_women)
#calculate weighted poverty rate
poverty_rate_women <- svymean(~in_poverty, design, na.rm = TRUE)
print(poverty_rate_women) mean SE
in_poverty 0.057148 0.0056
# I end up getting the same rate as I did below, which is an actual poverty rate of 5.7% for working women age 18+. This seems low.# Method 2: using svytotal()
weighted_poverty_count <- svytotal(~in_poverty, design, na.rm = TRUE)
# Total weighted count of all working women
weighted_total_women <- svytotal(~as.numeric(sex_c == "Women"), design, na.rm = TRUE)
# Calculate weighted poverty rate manually
poverty_rate_women_manual <- as.numeric(weighted_poverty_count) / as.numeric(weighted_total_women)
print(poverty_rate_women_manual)[1] 0.05714815
# still get 5.7%# Method 3: using survey::svyratio()
working_women_3 <- working_women %>%
mutate(one = 1)
# redefine sd with 'one' column
design <- svydesign(ids = ~1, weights = ~asecwt, data = working_women_3)
#calculate weighted poverty rate
poverty_rate_women_ratio <- svyratio(~in_poverty, ~one, design, na.rm = TRUE)
print(poverty_rate_women_ratio)Ratio estimator: svyratio.survey.design2(~in_poverty, ~one, design, na.rm = TRUE)
Ratios=
one
in_poverty 0.05714815
SEs=
one
in_poverty 0.005632614
# still get 5.7%# Method 4: Post-Stratification with srvyr Package
library(srvyr)Warning: package 'srvyr' was built under R version 4.2.3
Attaching package: 'srvyr'
The following object is masked from 'package:stats':
filter
# convert design into srvyr object
design_srvyr <- as_survey_design(design)
# calculate poverty rate
poverty_rate_women_srvyr <- design_srvyr %>%
summarize(poverty_rate = survey_mean(in_poverty, na.rm = TRUE))
print(poverty_rate_women_srvyr)# A tibble: 1 × 2
poverty_rate poverty_rate_se
<dbl> <dbl>
1 0.0571 0.00563
# still get 5.7%Analysis - Wage Gap
# CRG's code (from gender wage gap analysis)
#compute 90th percentile earnings cutoff for men per year (not hardcoded..reasoning is that dynamic allows us to capture time changes..in other words...The 2017 report dynamically determined the 90th percentile using the CPS dataset at the time of analysis.Hardcoding ignores actual wage distribution changes over time)
men_90th_percentile <- data |>
filter(sex_c == "Men") |>
group_by(year) |>
summarise(value_90th_percentile = quantile(incwage, 0.9, na.rm = TRUE), .groups = "drop")
#merge cutoff into dataset
data <- left_join(data, men_90th_percentile, by = "year")
#subset men below 90th percentile
data_men <- data |> filter(sex_c == "Men", incwage < value_90th_percentile)
#alternative to SS FOR LOOP: Run OLS models per year using `map()`
models <- data_men |>
group_split(year) |>
set_names(unique(data$year)) |>
map(~{
if (nrow(.x) > 0) {
des <- svydesign(id = ~1, weights = ~as.numeric(.x$asecwt), data = .x)
return(svyglm(ln_ann_earn ~ age + age2 + educ_c + metro_c + hourswrk_LY, design = des))
} else {
return(NULL)
}
})
#apply men's wage model to predict women's earnings
data_women <- data %>% filter(sex_c == "Women")
#ensure predictions return numeric values to avoid svystat issue
data_women <- data_women |>
group_split(year) |>
set_names(unique(data_women$year)) |>
map_dfr(~{
y <- as.character(.x$year[1])
model <- models[[y]]
if (!is.null(model)) {
.x <- .x |>
filter(complete.cases(age, age2, educ_c, metro_c, hourswrk_LY)) %>%
mutate(
Predict_log_annincome = as.numeric(predict(model, newdata = .)),
Predict_annincome = exp(Predict_log_annincome),
Predict_difference = as.numeric(Predict_annincome) - as.numeric(incwage),
Predict_difference_Positive = if_else(as.numeric(Predict_difference) > 0, as.numeric(Predict_difference), 0)
)
}
return(.x)
})
total_wage_gap <- data_women |>
group_by(year) |>
summarise(total_wage_increase = sum(as.numeric(Predict_difference_Positive) * as.numeric(asecwt), na.rm = TRUE) / 1e9)
print(total_wage_gap)# A tibble: 10 × 2
year total_wage_increase
<dbl> <dbl>
1 2015 42.5
2 2016 43.2
3 2017 45.4
4 2018 46.2
5 2019 47.0
6 2020 53.7
7 2021 45.7
8 2022 56.6
9 2023 61.6
10 2024 60.1
Analysis - Poverty Reduction
The below is my original code.
I use the variable CUTOFF. The CUTOFF variable is located on the person record, although it treats respondents who live in families collectively. CUTOFF is the official poverty threshold used by the Census Bureau to evaluate the poverty status (POVERTY) of each family in the sample. For instance, for a family consisting of 2 individuals, both of whom are under 65, with no children, the poverty threshold in 1989 was 7,495 dollars. If a sampled family of this composition reported income below 7,495 dollars in that year, then they would be coded as “below poverty” in the POVERTY variable. CUTOFF uses Census-defined family units, which do not necessarily correspond to the IPUMS-derived family units as specified in FAMUNIT. See FTYPE, FAMKIND, and FAMREL for more on Census family units.
# calculate new predicted household income
data_women <- data_women %>%
mutate(
Predict_household_income = if_else(
Predict_difference_Positive > 0,
hhincome + Predict_difference_Positive,
hhincome
),
predict_pov_status = if_else(Predict_household_income < cutoff, 10, 20) # 10 = Below Poverty, 20 = Above Poverty
)
# calculate actual poverty rate
actual_pov_rate <- data_women %>%
group_by(year) %>%
summarise(
tot_women_w = sum(asecwt, na.rm = TRUE),
women_pov_w = sum(asecwt * (poverty == 10), na.rm = TRUE),
actual_pov_rate = women_pov_w / tot_women_w,
.groups = "drop"
)
# calculate new predicted poverty rate
predicted_pov_rate <- data_women %>%
group_by(year) %>%
summarise(
tot_women_w = sum(asecwt, na.rm = TRUE),
predicted_pov_w = sum(asecwt * (predict_pov_status == 10), na.rm = TRUE),
predicted_pov_rate = predicted_pov_w / tot_women_w,
.groups = "drop"
)
# merge
difference_pov_rate <- actual_pov_rate %>%
left_join(predicted_pov_rate, by = "year") %>%
mutate(
difference = actual_pov_rate - predicted_pov_rate,
percent_change_poverty_number = ((predicted_pov_w - women_pov_w) / women_pov_w) * 100
)
print(difference_pov_rate)# A tibble: 10 × 9
year tot_women_w.x women_pov_w actual_pov_rate tot_women_w.y predicted_pov_w
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2015 5708381. 497475. 0.0871 5708381. 166139.
2 2016 5742270. 414481. 0.0722 5742270. 150830.
3 2017 5814314. 405280. 0.0697 5814314. 153749.
4 2018 5986619. 402983. 0.0673 5986619. 129519.
5 2019 6251022. 423918 0.0678 6251022. 160713.
6 2020 6466837. 314778. 0.0487 6466837. 101584.
7 2021 6184068. 400106. 0.0647 6184068. 172653.
8 2022 6286258. 406543. 0.0647 6286258. 150564.
9 2023 6731254. 406290. 0.0604 6731254. 123780.
10 2024 6762147. 386444. 0.0571 6762147. 160064.
# ℹ 3 more variables: predicted_pov_rate <dbl>, difference <dbl>,
# percent_change_poverty_number <dbl>
write_xlsx(difference_pov_rate, "difference_poverty_rate.xlsx")Notes:
The total weighted number for working women looks right at 6,762,147. However, the poverty rate I am getting is low at 5.7%. For reference, the 2016 IWPR study reported 10.5% of working women were in poverty, which decreased to 4.8% assuming gender pay equality. The 2021 IWPR study reported 10.3% of working women were in poverty, which decreased to 5.8% assuming gender pay equality.