Equal Pay OLS Regression

Author

Sarah Serpas

Replicating IWPR Methods for Texas Only

Link to RPubs: https://rpubs.com/sarahserpas/equalpay_ols_TXonly

This 2017 Document details the OLS Approach
This 2021 document has more updated numbers
- Texas impact would be ~ $47.87 B

Data Gathering and Cleaning

Data: Current Population Survey Annual Social and Economic supplement (CPS-ASEC)
- Men: aged 18 or older with positive earnings and positive hours of work during the previous year
  - Limited to those earning at or below 90th percentile of men’s annual earnings
Method: OLS Model
Controls:
- Age (AGE)
- Age ^2 (proxy for work experience)
- Education
- Annual Hours of Work
- Metropolitan Residence
- Region of the Country
Dependent variable: Natural log of annual earnings
CPS Variables Used
- AGE -> 85 = top code
- SEX –> 1 = Male, 2 = Female, 9 = Not in Universe
- EDUC (Grouped to below HS, HS grad/equiv, Associates or Some College, Bachelors, Higher Degre - not specified how report did this)
  - 001 = NIU,
  - 071 and below = 12th grade, no diploma and below
  - 073 = High School Diploma or equivalent
  - 081 = Some college but no degree
  - 091 = Associates Degree, occupational/vocational program
  - 092 = Associates Degree, Academic program
  - 111 = Bachelor’s Degree
  - 123 = Master’s Degree
  - 124 = Professional School Degree
  - 125 = Doctorate Degree
- METRO = Metropolitan Status (Grouped to in-metro vs out of metro – not specified how they coded this in technical appendix)
  - 0 = not identified
  - 1 = not in metro area
  - 2 - in metro area, in principal city
  - 3 - in metro area, not in central/principal city
  - 4 - in metro area, central/principal city status not identified
- UHRSWORKLY –> 999 = NIU
- INCWAGE –> 99999999 = N.I.U. (Not in Universe)

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library (ipumsr)
library(writexl)
library(dplyr)
library(car)

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

#read in CPS data -- this data extract only selected cases for TX (Fips = 48)
ddi <- read_ipums_ddi("Equal Earnings OLS/cps_00002.xml")
data <- read_ipums_micro(ddi, data_file = ("Equal Earnings OLS/cps_00002.dat"), verbose = FALSE)

# make variable name lowercase
names(data) <- tolower(names(data))

#cleaning and filtering data
data$sex_c <- Recode(as.numeric(data$sex), recodes = "1='Men'; 2='Women'; else = NA", as.factor=T)
data$age_18up <- ifelse(data$age >17,1,0)
data$uhrsworkly <- ifelse(data$uhrsworkly == 999, NA,data$uhrsworkly) # 999 = NIU
data$incwage <- ifelse(data$incwage == 99999999,NA,data$incwage)
data$educ_c <- Recode(as.numeric(data$educ), recodes = "2:71='1. Below HS'; 73='2. HS Diploma or Equivalent'; 81:92 = '3. Some College or Associates'; 111 = '4. Bachelors Degree'; 123:125 = '5. Higher Degree';  else = NA", as.factor=T)
data$metro_c <- Recode(as.numeric(data$metro), recodes = "1='1. Not in a Metro Area'; 2:4='2. In a Metro Area'; else = NA", as.factor=T)
data$region_c <- Recode(as.numeric(data$region), recodes = "11:12='Northeast'; 21:22='Midwest'; 31:33='South';41:42='West'; else = NA", as.factor=T)
data$ln_ann_earn <- log(data$incwage) #natural log of annual earnings
data$age2 <- data$age^2 #age squared

#only filter for columns we want
data_filter <- data[,c("year","serial","cpsid","asecflag","asecwth","region","metro","pernum", "cpsidp","asecwt","age","wkswork1","uhrsworkly","incwage","sex_c","age_18up","educ_c","metro_c","region_c","ln_ann_earn","age2")]


#remove those below 18 years of age and without "positive earnings and positive hours of work"
data_filter <- data_filter %>%
  filter(age_18up ==1, # only those 18+
         !is.na(incwage), incwage > 0, # remove missing/invalid income data
         !is.na(uhrsworkly), uhrsworkly > 0,
         !is.na(wkswork1), wkswork1 > 0,
         !is.na(educ_c), !is.na(metro_c), !is.na(region_c)) %>%
  mutate(hourswrk_LY = uhrsworkly * wkswork1, # calculate hours worked last year
         hourly_wage = incwage /hourswrk_LY)  # calculate hourly wage)


#Men only: 
data_men <- data_filter %>% 
  filter (sex_c == "Men")
  
#90th percentile for wage for men is $150,000 in 2024, $136,000 in 2021
names(data_men)

 [1] "year"        "serial"      "cpsid"       "asecflag"    "asecwth"    
 [6] "region"      "metro"       "pernum"      "cpsidp"      "asecwt"     
[11] "age"         "wkswork1"    "uhrsworkly"  "incwage"     "sex_c"      
[16] "age_18up"    "educ_c"      "metro_c"     "region_c"    "ln_ann_earn"
[21] "age2"        "hourswrk_LY" "hourly_wage"

temp_90thp <- data_men %>% group_by(year) %>% 
summarise(value_90th_percentile = quantile(incwage, 0.9, na.rm = TRUE))

#2024 data for men, only those earning below 90th percentile
data_men24 <- data_men %>%
  filter (year==2024,incwage<150001)

#2021 data for men, only those earning below 90th percentile
data_men21 <- data_men %>%
  filter (year==2021,incwage<136001)


#women only: 
data_women24 <- data_filter %>% 
  filter (sex_c == "Women", year == 2024)
data_women21 <- data_filter %>% 
  filter (sex_c == "Women", year == 2021)

OLS Regression

library(survey)

Loading required package: grid

Loading required package: Matrix


Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loading required package: survival


Attaching package: 'survey'

The following object is masked from 'package:graphics':

    dotchart

des24 <- svydesign(id=~1, weights=~asecwt, data=data_men24)

ols24 <- svyglm(ln_ann_earn ~ age + age2 + educ_c +metro_c+hourswrk_LY, design = des24)

summary(ols24)


Call:
svyglm(formula = ln_ann_earn ~ age + age2 + educ_c + metro_c + 
    hourswrk_LY, design = des24)

Survey design:
svydesign(id = ~1, weights = ~asecwt, data = data_men24)

Coefficients:
                                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)                          7.057e+00  1.935e-01  36.473  < 2e-16 ***
age                                  6.871e-02  8.283e-03   8.295  < 2e-16 ***
age2                                -6.668e-04  9.454e-05  -7.053 2.33e-12 ***
educ_c2. HS Diploma or Equivalent    2.811e-01  5.850e-02   4.805 1.65e-06 ***
educ_c3. Some College or Associates  3.781e-01  6.091e-02   6.208 6.41e-10 ***
educ_c4. Bachelors Degree            6.403e-01  6.863e-02   9.330  < 2e-16 ***
educ_c5. Higher Degree               7.718e-01  6.795e-02  11.358  < 2e-16 ***
metro_c2. In a Metro Area            1.631e-01  7.482e-02   2.179   0.0294 *  
hourswrk_LY                          7.416e-04  3.849e-05  19.267  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.4565149)

Number of Fisher Scoring iterations: 2

des21 <- svydesign(id=~1, weights=~asecwt, data=data_men21)

ols21 <- svyglm(ln_ann_earn ~ age + age2 + educ_c +metro_c+hourswrk_LY, design = des21)

summary(ols21)


Call:
svyglm(formula = ln_ann_earn ~ age + age2 + educ_c + metro_c + 
    hourswrk_LY, design = des21)

Survey design:
svydesign(id = ~1, weights = ~asecwt, data = data_men21)

Coefficients:
                                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)                          7.353e+00  1.855e-01  39.643  < 2e-16 ***
age                                  5.273e-02  7.561e-03   6.974 4.09e-12 ***
age2                                -4.898e-04  8.500e-05  -5.762 9.51e-09 ***
educ_c2. HS Diploma or Equivalent    3.000e-01  5.503e-02   5.451 5.59e-08 ***
educ_c3. Some College or Associates  2.997e-01  5.576e-02   5.374 8.56e-08 ***
educ_c4. Bachelors Degree            6.799e-01  5.427e-02  12.528  < 2e-16 ***
educ_c5. Higher Degree               7.012e-01  7.125e-02   9.842  < 2e-16 ***
metro_c2. In a Metro Area            1.789e-02  8.249e-02   0.217    0.828    
hourswrk_LY                          7.700e-04  3.745e-05  20.563  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 0.4626209)

Number of Fisher Scoring iterations: 2

Estimate Women’s Earnings with Men’s Coefficients

#predict values based on men's coefficients

#add as column to 2024 and 2021 data <-- note that these will give the natural log of annual income. 
data_women24$Predict_log_annincome <- predict(ols24, newdata=data_women24)
data_women21$Predict_log_annincome <- predict(ols21, newdata=data_women21)

#reverse the natural log with exp function
data_women24$Predict_annincome <- exp(data_women24$Predict_log_annincome)
data_women21$Predict_annincome <- exp(data_women21$Predict_log_annincome)

#calculate difference
data_women24$Predict_difference <- data_women24$Predict_annincome - data_women24$incwage
data_women21$Predict_difference <- data_women21$Predict_annincome - data_women21$incwage


#"Women’s earnings are predicted using the coefficients from the men’s earnings equation (this method assumes that women retain their own human capital but are rewarded at the same rates as men would be) and calculated only for the actual hours that women worked during the year. Those with reduced predicted earnings are assigned their actual earnings during the year. The average earnings increase calculated for working women includes those with no predicted earnings increases under equal pay."
data_women24$Predict_difference <- data_women24$Predict_annincome - data_women24$incwage
data_women21$Predict_difference <- data_women21$Predict_annincome - data_women21$incwage


#only include positive differences and sum values
data_women24$Predict_difference_Positive <- ifelse(data_women24$Predict_difference>0,data_women24$Predict_difference,0)
data_women21$Predict_difference_Positive <- ifelse(data_women21$Predict_difference>0,data_women21$Predict_difference,0)

sum(data_women21$Predict_difference_Positive)

[1] 16880023

sum(data_women24$Predict_difference_Positive)

[1] 20995938

names(data_women21)

 [1] "year"                        "serial"                     
 [3] "cpsid"                       "asecflag"                   
 [5] "asecwth"                     "region"                     
 [7] "metro"                       "pernum"                     
 [9] "cpsidp"                      "asecwt"                     
[11] "age"                         "wkswork1"                   
[13] "uhrsworkly"                  "incwage"                    
[15] "sex_c"                       "age_18up"                   
[17] "educ_c"                      "metro_c"                    
[19] "region_c"                    "ln_ann_earn"                
[21] "age2"                        "hourswrk_LY"                
[23] "hourly_wage"                 "Predict_log_annincome"      
[25] "Predict_annincome"           "Predict_difference"         
[27] "Predict_difference_Positive"

#by state
total21<- data_women21 %>%
  group_by(year) %>%
  summarise(sum = sum(Predict_difference_Positive*asecwt))

total24<- data_women24 %>%
  group_by(year) %>%
  summarise(sum = sum(Predict_difference_Positive*asecwt))

The echo: false option disables the printing of code (only output is displayed).