Summary

In this report, I calculate and plot unionization trends for doctors and nurses using data from the Current Population Survey (CPS) from January 2003 to August 2024, the latest data available as of October 2024.

The variable of interest in this report is UNION, which indicates whether, for the current job, the respondent was: 1) a member of a labor union or employee association similar to a union; 2) not a union member but covered by a union or employee association contract; or 3) neither a union member nor covered by a union contract.

For this analysis, I use the EARNWT weight, as recommended by CPS documentation.

I took the following steps:

  1. I downloaded basic monthly cross-sectional data from January 2003 to August 2024. The specific variables of interest for this analysis are the following: YEAR, MONTH, EMPSTAT, OCC2010, UNION, and EARNWT.
  2. With all samples downloaded, I created a subset of the data for only respondents that indicated they were actively working, or had a job but had not been at work for the last week relative to the survey date.
  3. Next, I created flags for nurses and doctors.
  4. Many observations did not have values for UNION. Thus, I subset the data to exclude these observations, then grouped by date and unionization status. Using the EARNWT weight, I calculated the monthly count and monthly log count for each month-year-unionization observation.
  5. I then calculated the pre-COVID linear time trend by running a simple linear regression of the log count on date, separately for nurses and doctors and for each unionization status, using data between January 2013 and January 2020.
  6. Finally, I plot the log counts by unionization status for each occupation and extrapolate the pre-pandemic trends from 2020 onwards.
# Attach readable labels to the UNION codes and build a monthly date column.
# Any UNION value other than 1, 2 or 3 (including missing) becomes NA.
data <- data %>%
  mutate(
    UNION = case_when(
      UNION == 1 ~ "No union coverage",
      UNION == 2 ~ "Member of labor union",
      UNION == 3 ~ "Covered by union but not a member",
      TRUE ~ NA_character_
    ),
    # first day of the survey month, built from YEAR and MONTH
    date = as.Date(paste(YEAR, MONTH, "1", sep = "-"), "%Y-%m-%d")
  )


# Keep employed respondents, flag nurses and doctors from their OCC2010 code,
# and retain only the records carrying one of the two flags.
hc.df <- data %>%
  # EMPSTAT 10 = "At work", 12 = "Has job, not at work last week"
  filter(EMPSTAT %in% c(10, 12)) %>%
  mutate(
    nurse = as.numeric(OCC2010 %in% c(3500, 3600, 3130)), # nurse dummy (0/1)
    doctor = as.numeric(OCC2010 == 3060)                   # doctor dummy (0/1)
  ) %>%
  filter(nurse == 1 | doctor == 1)


# weighted worker counts by month and union status, computed separately for nurses and doctors
# i use EARNWT, which is "a person-level weight that should be used in any analysis including one of the following variables: EARNWEEK, HOURWAGE, PAIDHOUR, UNION, UHRSWORKORG, WKSWORKORG, ELIGORG, and OTPAY. For any other analysis using ASEC data, researchers should use ASECWT or for analyses of non-ASEC data, WTFINL."

# union <- hc.df %>% 
#   filter(is.na(UNION) == FALSE) %>% 
#   group_by(date, nurse, UNION) %>% 
#   summarise(count = sum(EARNWT, na.rm = T)) %>%
#   mutate(log_count = log(count)) %>%
#   arrange(date,UNION)

# Weighted monthly nurse counts by union status.
# Rows without a UNION label are dropped; EARNWT is summed within each
# date x UNION cell and the log of that total is stored next to it.
# `.groups = "drop"` returns an ungrouped tibble (and silences the
# "summarise() has grouped output" message); nothing downstream relies on
# the leftover grouping by date.
nurses <- hc.df %>%
  filter(!is.na(UNION), nurse == 1) %>%
  group_by(date, UNION) %>%
  summarise(count = sum(EARNWT, na.rm = TRUE), .groups = "drop") %>%
  mutate(log_count = log(count)) %>%
  arrange(date, UNION)
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
# Weighted monthly doctor counts by union status.
# Doctors are selected through their own flag (doctor == 1) rather than as
# "not a nurse", so the filter stays correct if hc.df ever holds more than
# these two occupations.
# Rows without a UNION label are dropped; EARNWT is summed within each
# date x UNION cell and the log of that total is stored next to it.
# `.groups = "drop"` returns an ungrouped tibble (and silences the
# "summarise() has grouped output" message).
doctors <- hc.df %>%
  filter(!is.na(UNION), doctor == 1) %>%
  group_by(date, UNION) %>%
  summarise(count = sum(EARNWT, na.rm = TRUE), .groups = "drop") %>%
  mutate(log_count = log(count)) %>%
  arrange(date, UNION)
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
# Doctors: linear trend in log counts, one fit per union status ----
# Each model is estimated on months from 2013-01-01 through 2020-01-01
# (inclusive) and then evaluated at every month in the series, so the fitted
# line can be drawn beyond the estimation window. `as.numeric(date)` is the
# same numeric value lm() uses for a Date regressor.

# covered by a union contract but not a member
d.notmem <- filter(doctors, UNION == "Covered by union but not a member")

dreg.notmem <- lm(
  log_count ~ date,
  data = filter(
    d.notmem,
    date >= as.Date("2013-01-01"),
    date <= as.Date("2020-01-01")
  )
)

d.notmem$exp_values <- coef(dreg.notmem)[["(Intercept)"]] +
  coef(dreg.notmem)[["date"]] * as.numeric(d.notmem$date)

# no union coverage
d.nocov <- filter(doctors, UNION == "No union coverage")

dreg.nocov <- lm(
  log_count ~ date,
  data = filter(
    d.nocov,
    date >= as.Date("2013-01-01"),
    date <= as.Date("2020-01-01")
  )
)

d.nocov$exp_values <- coef(dreg.nocov)[["(Intercept)"]] +
  coef(dreg.nocov)[["date"]] * as.numeric(d.nocov$date)

# union members
d.member <- filter(doctors, UNION == "Member of labor union")

dreg.member <- lm(
  log_count ~ date,
  data = filter(
    d.member,
    date >= as.Date("2013-01-01"),
    date <= as.Date("2020-01-01")
  )
)

d.member$exp_values <- coef(dreg.member)[["(Intercept)"]] +
  coef(dreg.member)[["date"]] * as.numeric(d.member$date)
  

# plotting ----
# Solid lines: observed monthly log counts of doctors by union status.
# Dotted lines: the fitted linear trend for each status, drawn from 2013
# onwards so it extends past the estimation window.
# Dashed grey line: January 2020.

ggplot(doctors, aes(x = date, y = log_count, color = UNION)) +
  geom_line(
    aes(y = exp_values, x = date),
    data = d.notmem %>% filter(date >= as.Date("2013-01-01")),
    color = "darkblue", linetype = "dotted", linewidth = 0.8
  ) +
  geom_line(
    aes(y = exp_values, x = date),
    data = d.member %>% filter(date >= as.Date("2013-01-01")),
    color = "red", linetype = "dotted", linewidth = 0.8
  ) +
  geom_line(
    aes(y = exp_values, x = date),
    data = d.nocov %>% filter(date >= as.Date("2013-01-01")),
    color = "darkgreen", linetype = "dotted", linewidth = 0.8
  ) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines
  geom_line(linewidth = 0.5) +
  labs(
    title = "Doctor Unionization (Jan. 2003 to Aug. 2024)",
    x = "",
    y = "Log Count"
  ) +
  # colours keyed by label so the mapping does not depend on level order;
  # blue/red/green match the dotted trend lines above
  scale_color_manual(
    name = "",
    values = c(
      "Covered by union but not a member" = "#31597d",
      "Member of labor union" = "#9b494f",
      "No union coverage" = "#4b8b3b"
    )
  ) +
  # must be a Date: the bare expression 2020-01-01 is arithmetic (= 2018),
  # which would place the line 2018 days after the epoch instead of at 2020
  geom_vline(
    xintercept = as.Date("2020-01-01"),
    linetype = "dashed", color = "grey", linewidth = 0.5
  ) +
  # complete theme first, then tweaks; a complete theme added afterwards
  # would overwrite the legend setting
  theme_stata() +
  theme(legend.position = "bottom")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Nurses: linear trend in log counts, one fit per union status ----
# Each model is estimated on months from 2013-01-01 through 2020-01-01
# (inclusive) and then evaluated at every month in the series, so the fitted
# line can be drawn beyond the estimation window. `as.numeric(date)` is the
# same numeric value lm() uses for a Date regressor.

# covered by a union contract but not a member
n.notmem <- filter(nurses, UNION == "Covered by union but not a member")

nreg.notmem <- lm(
  log_count ~ date,
  data = filter(
    n.notmem,
    date >= as.Date("2013-01-01"),
    date <= as.Date("2020-01-01")
  )
)

n.notmem$exp_values <- coef(nreg.notmem)[["(Intercept)"]] +
  coef(nreg.notmem)[["date"]] * as.numeric(n.notmem$date)

# no union coverage
n.nocov <- filter(nurses, UNION == "No union coverage")

nreg.nocov <- lm(
  log_count ~ date,
  data = filter(
    n.nocov,
    date >= as.Date("2013-01-01"),
    date <= as.Date("2020-01-01")
  )
)

n.nocov$exp_values <- coef(nreg.nocov)[["(Intercept)"]] +
  coef(nreg.nocov)[["date"]] * as.numeric(n.nocov$date)

# union members
n.member <- filter(nurses, UNION == "Member of labor union")

nreg.member <- lm(
  log_count ~ date,
  data = filter(
    n.member,
    date >= as.Date("2013-01-01"),
    date <= as.Date("2020-01-01")
  )
)

n.member$exp_values <- coef(nreg.member)[["(Intercept)"]] +
  coef(nreg.member)[["date"]] * as.numeric(n.member$date)
  

# plotting ----
# Solid lines: observed monthly log counts of nurses by union status.
# Dotted lines: the fitted linear trend for each status, drawn from 2013
# onwards so it extends past the estimation window.
# Dashed grey line: January 2020.

ggplot(nurses, aes(x = date, y = log_count, color = UNION)) +
  geom_line(
    aes(y = exp_values, x = date),
    data = n.notmem %>% filter(date >= as.Date("2013-01-01")),
    color = "darkblue", linetype = "dotted", linewidth = 0.8
  ) +
  geom_line(
    aes(y = exp_values, x = date),
    data = n.member %>% filter(date >= as.Date("2013-01-01")),
    color = "red", linetype = "dotted", linewidth = 0.8
  ) +
  geom_line(
    aes(y = exp_values, x = date),
    data = n.nocov %>% filter(date >= as.Date("2013-01-01")),
    color = "darkgreen", linetype = "dotted", linewidth = 0.8
  ) +
  # `linewidth` replaces the deprecated `size` aesthetic for lines
  geom_line(linewidth = 0.5) +
  labs(
    title = "Nurse Unionization (Jan. 2003 to Aug. 2024)",
    x = "",
    y = "Log Count"
  ) +
  # colours keyed by label so the mapping does not depend on level order;
  # blue/red/green match the dotted trend lines above
  scale_color_manual(
    name = "",
    values = c(
      "Covered by union but not a member" = "#31597d",
      "Member of labor union" = "#9b494f",
      "No union coverage" = "#4b8b3b"
    )
  ) +
  geom_vline(
    xintercept = as.Date("2020-01-01"),
    linetype = "dashed", color = "grey", linewidth = 0.5
  ) +
  # complete theme first, then tweaks; a complete theme added afterwards
  # would overwrite the legend setting
  theme_stata() +
  theme(legend.position = "bottom")