library(tidyverse)
library(lubridate)
library(knitr)
library(kableExtra)
library(scales)

1. Load Cleaned Data

df <- read.csv("graduate_full_clean.csv", stringsAsFactors = FALSE)

# Re-apply factor encodings after CSV round-trip
df <- df %>%
  mutate(
    SAMPLE_SEX_1997  = factor(SAMPLE_SEX_1997,
      levels = c("Male", "Female")),
    SAMPLE_RACE_1997 = factor(SAMPLE_RACE_1997,
      levels = c("Black", "Hispanic", "Mixed Race (Non-Hispanic)", "Non-Black / Non-Hispanic")),
    Region = factor(Region,
      levels = c("Northeast", "North Central", "South", "West")),
    marital_status = factor(marital_status,
      levels = c("Never Married", "Married", "Separated", "Divorced", "Widowed", "Unknown")),
    Occupation_Group2 = factor(Occupation_Group2),
    Industry_Group    = factor(Industry_Group),
    DOB            = as.Date(DOB),
    Interview_Date = as.Date(Interview_Date),
    StartDate      = as.Date(StartDate),
    StopDate       = as.Date(StopDate),
    # Treatment indicators
    Post2018   = Year >= 2018,
    CA_Proxy   = Region == "West",
    Treated    = CA_Proxy & Post2018
  )

dim(df)
## [1] 171601     33

2. Sample Overview

# Individuals by sex and race
df %>%
  distinct(PUBID_1997, SAMPLE_SEX_1997, SAMPLE_RACE_1997) %>%
  count(SAMPLE_SEX_1997, SAMPLE_RACE_1997) %>%
  pivot_wider(names_from = SAMPLE_SEX_1997, values_from = n, values_fill = 0) %>%
  kable(caption = "Unique Individuals by Sex and Race") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Unique Individuals by Sex and Race
SAMPLE_RACE_1997 Male Female
Black 1144 1138
Hispanic 964 904
Mixed Race (Non-Hispanic) 40 43
Non-Black / Non-Hispanic 2372 2205
df %>%
  count(Year) %>%
  ggplot(aes(x = Year, y = n)) +
  geom_col(fill = "#2c7bb6") +
  geom_vline(xintercept = 2017.5, linetype = "dashed", color = "firebrick", linewidth = 0.8) +
  annotate("text", x = 2018.2, y = max(table(df$Year)) * 0.95,
           label = "CA Ban\nJan 2018", color = "firebrick", hjust = 0, size = 3.2) +
  scale_x_continuous(breaks = seq(1997, 2021, 2)) +
  labs(title = "Job Spells by Year", x = "Year", y = "Number of Spells") +
  theme_minimal()


3. Wage Distribution

df %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  ggplot(aes(x = HRLY_WAGE_clean)) +
  geom_histogram(bins = 60, fill = "#2c7bb6", color = "white") +
  scale_x_continuous(labels = dollar_format()) +
  labs(title = "Overall Hourly Wage Distribution",
       x = "Hourly Wage", y = "Count") +
  theme_minimal()

df %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  ggplot(aes(x = HRLY_WAGE_clean, fill = SAMPLE_SEX_1997)) +
  geom_density(alpha = 0.5) +
  scale_x_continuous(labels = dollar_format()) +
  scale_fill_manual(values = c("#2c7bb6", "#d7191c")) +
  labs(title = "Wage Distribution by Sex",
       x = "Hourly Wage", y = "Density", fill = NULL) +
  theme_minimal()

df %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  ggplot(aes(x = HRLY_WAGE_clean, fill = SAMPLE_RACE_1997)) +
  geom_density(alpha = 0.4) +
  scale_x_continuous(labels = dollar_format()) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Wage Distribution by Race",
       x = "Hourly Wage", y = "Density", fill = NULL) +
  theme_minimal()


4. Wage Gap Analysis

gender_gap <- df %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  group_by(SAMPLE_SEX_1997) %>%
  summarise(
    Mean_Wage   = round(mean(HRLY_WAGE_clean), 2),
    Median_Wage = round(median(HRLY_WAGE_clean), 2),
    N           = n()
  )

gender_gap %>%
  kable(caption = "Wage Summary by Sex") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wage Summary by Sex
SAMPLE_SEX_1997 Mean_Wage Median_Wage N
Male 14.27 10.50 85831
Female 12.85 9.75 84055
male_med   <- gender_gap$Median_Wage[gender_gap$SAMPLE_SEX_1997 == "Male"]
female_med <- gender_gap$Median_Wage[gender_gap$SAMPLE_SEX_1997 == "Female"]
cat("Median gender wage gap:", round((male_med - female_med) / male_med * 100, 1), "cents on the dollar\n")
## Median gender wage gap: 7.1 cents on the dollar
df %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  group_by(SAMPLE_RACE_1997) %>%
  summarise(
    Mean_Wage   = round(mean(HRLY_WAGE_clean), 2),
    Median_Wage = round(median(HRLY_WAGE_clean), 2),
    N           = n()
  ) %>%
  arrange(desc(Median_Wage)) %>%
  kable(caption = "Wage Summary by Race") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wage Summary by Race
SAMPLE_RACE_1997 Mean_Wage Median_Wage N
Hispanic 13.61 10.4 34184
Mixed Race (Non-Hispanic) 14.69 10.0 1518
Non-Black / Non-Hispanic 14.22 10.0 91615
Black 12.08 9.5 42569
df %>%
  filter(!is.na(HRLY_WAGE_clean)) %>%
  group_by(SAMPLE_SEX_1997, SAMPLE_RACE_1997) %>%
  summarise(Median_Wage = round(median(HRLY_WAGE_clean), 2), .groups = "drop") %>%
  ggplot(aes(x = reorder(SAMPLE_RACE_1997, Median_Wage),
             y = Median_Wage,
             fill = SAMPLE_SEX_1997)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#2c7bb6", "#d7191c")) +
  scale_y_continuous(labels = dollar_format()) +
  coord_flip() +
  labs(title = "Median Hourly Wage by Race and Sex",
       x = NULL, y = "Median Wage", fill = NULL) +
  theme_minimal()


5. California (West) vs. Rest of Country

df %>%
  filter(!is.na(HRLY_WAGE_clean), !is.na(Region)) %>%
  group_by(Region) %>%
  summarise(
    Mean_Wage   = round(mean(HRLY_WAGE_clean), 2),
    Median_Wage = round(median(HRLY_WAGE_clean), 2),
    N           = n()
  ) %>%
  kable(caption = "Wage by Region") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wage by Region
Region Mean_Wage Median_Wage N
Northeast 14.16 10.25 27364
North Central 13.00 10.00 39054
South 12.88 10.00 64843
West 14.81 11.00 37829
df %>%
  filter(!is.na(HRLY_WAGE_clean), !is.na(CA_Proxy)) %>%
  mutate(Group = if_else(CA_Proxy, "West (CA Proxy)", "Rest of U.S.")) %>%
  group_by(Year, Group) %>%
  summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
  ggplot(aes(x = Year, y = Median_Wage, color = Group, group = Group)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  geom_vline(xintercept = 2017.5, linetype = "dashed", color = "firebrick") +
  annotate("text", x = 2018.2, y = 9,
           label = "CA Ban\nJan 2018", color = "firebrick", hjust = 0, size = 3.2) +
  scale_x_continuous(breaks = seq(1997, 2021, 2)) +
  scale_y_continuous(labels = dollar_format()) +
  scale_color_manual(values = c("#2c7bb6", "#fdae61")) +
  labs(title = "Median Wage Trend: West vs. Rest of U.S.",
       x = "Year", y = "Median Hourly Wage", color = NULL) +
  theme_minimal()

df %>%
  filter(!is.na(HRLY_WAGE_clean), !is.na(Region)) %>%
  mutate(Group = if_else(CA_Proxy, "West (CA Proxy)", "Rest of U.S.")) %>%
  group_by(Year, Group, SAMPLE_SEX_1997) %>%
  summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
  pivot_wider(names_from = SAMPLE_SEX_1997, values_from = Median_Wage) %>%
  mutate(Gender_Gap = Male - Female) %>%
  ggplot(aes(x = Year, y = Gender_Gap, color = Group, group = Group)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  geom_vline(xintercept = 2017.5, linetype = "dashed", color = "firebrick") +
  annotate("text", x = 2018.2, y = max(0),
           label = "CA Ban\nJan 2018", color = "firebrick", hjust = 0, size = 3.2) +
  scale_x_continuous(breaks = seq(1997, 2021, 2)) +
  scale_y_continuous(labels = dollar_format()) +
  scale_color_manual(values = c("#2c7bb6", "#fdae61")) +
  labs(title = "Gender Wage Gap Over Time: West vs. Rest of U.S.",
       subtitle = "Gap = Median Male Wage − Median Female Wage",
       x = "Year", y = "Wage Gap ($/hr)", color = NULL) +
  theme_minimal()


6. Occupation & Industry Breakdown

df %>%
  filter(!is.na(HRLY_WAGE_clean), Occupation_Group2 != "") %>%
  group_by(Occupation_Group2) %>%
  summarise(Median_Wage = median(HRLY_WAGE_clean), N = n(), .groups = "drop") %>%
  filter(N > 200) %>%
  ggplot(aes(x = reorder(Occupation_Group2, Median_Wage), y = Median_Wage)) +
  geom_col(fill = "#2c7bb6") +
  coord_flip() +
  scale_y_continuous(labels = dollar_format()) +
  labs(title = "Median Wage by Occupation Group",
       x = NULL, y = "Median Hourly Wage") +
  theme_minimal()


7. Education & Tenure Effects

df %>%
  filter(!is.na(HRLY_WAGE_clean), !is.na(HGC)) %>%
  group_by(HGC, SAMPLE_SEX_1997) %>%
  summarise(Median_Wage = median(HRLY_WAGE_clean), N = n(), .groups = "drop") %>%
  filter(N > 50) %>%
  ggplot(aes(x = HGC, y = Median_Wage, color = SAMPLE_SEX_1997, group = SAMPLE_SEX_1997)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.5) +
  scale_y_continuous(labels = scales::dollar_format()) +
  scale_x_continuous(breaks = seq(6, 20, 1), limits = c(6, 20)) +
  scale_color_manual(values = c("#2c7bb6", "#d7191c")) +
  labs(title = "Median Wage by Highest Grade Completed and Sex",
       subtitle = "Do returns to education differ by gender?",
       x = "Highest Grade Completed", y = "Median Hourly Wage", color = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 9))

df %>%
  filter(!is.na(HRLY_WAGE_clean), Occupation_Group2 != "") %>%
  group_by(Occupation_Group2, SAMPLE_SEX_1997) %>%
  summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
  pivot_wider(names_from = SAMPLE_SEX_1997, values_from = Median_Wage) %>%
  mutate(Gap = Male - Female) %>%
  filter(!is.na(Gap)) %>%
  ggplot(aes(x = reorder(Occupation_Group2, Gap), y = Gap)) +
  geom_col(fill = "#d7191c") +
  coord_flip() +
  scale_y_continuous(labels = scales::dollar_format()) +
  labs(title = "Gender Wage Gap by Occupation Group",
       subtitle = "Gap = Median Male − Median Female Wage",
       x = NULL, y = "Wage Gap ($/hr)") +
  theme_minimal()

df %>%
  filter(!is.na(HRLY_WAGE_clean), !is.na(TENURE)) %>%
  mutate(Tenure_Band = cut(TENURE,
    breaks = c(0, 12, 36, 60, 120, Inf),
    labels = c("< 1 yr", "1–3 yrs", "3–5 yrs", "5–10 yrs", "10+ yrs"),
    right = FALSE)) %>%
  group_by(Tenure_Band, SAMPLE_SEX_1997) %>%
  summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
  ggplot(aes(x = Tenure_Band, y = Median_Wage, fill = SAMPLE_SEX_1997)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#2c7bb6", "#d7191c")) +
  scale_y_continuous(labels = dollar_format()) +
  labs(title = "Median Wage by Tenure Band and Sex",
       x = "Tenure", y = "Median Hourly Wage", fill = NULL) +
  theme_minimal()


8. Pre/Post 2018 Summary

df %>%
  filter(!is.na(HRLY_WAGE_clean), !is.na(Region)) %>%
  mutate(
    Period = if_else(Post2018, "Post-2018", "Pre-2018"),
    Group  = if_else(CA_Proxy, "West", "Rest of U.S.")
  ) %>%
  group_by(Group, Period, SAMPLE_SEX_1997) %>%
  summarise(Median_Wage = round(median(HRLY_WAGE_clean), 2), N = n(), .groups = "drop") %>%
  pivot_wider(names_from = Period, values_from = c(Median_Wage, N)) %>%
  mutate(Change = `Median_Wage_Post-2018` - `Median_Wage_Pre-2018`) %>%
  kable(caption = "Median Wage Pre vs. Post 2018 by Group and Sex") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Median Wage Pre vs. Post 2018 by Group and Sex
Group SAMPLE_SEX_1997 Median_Wage_Post-2018 Median_Wage_Pre-2018 N_Post-2018 N_Pre-2018 Change
Rest of U.S. Male 20.00 10 6234 59888 10.00
Rest of U.S. Female 18.00 9 6344 58795 9.00
West Male 25.00 11 1808 17495 14.00
West Female 20.05 10 1715 16811 10.05

9. Key Takeaways

cat("
KEY FINDINGS FOR MODELING STAGE
================================

1. GENDER GAP: A persistent median wage gap exists across all years.
   Examine the gender-gap trend chart to assess whether West narrowed
   faster post-2018 relative to the Rest of U.S.

2. RACIAL GAP: Non-Black/Non-Hispanic workers earn more at the median;
   Black and Hispanic workers show lower median wages across most occupations.

3. WEST vs. REST: Check the pre/post table above for DiD signal —
   if the West gender gap shrank more post-2018 than the Rest of U.S.,
   that supports the ban's effectiveness.

4. OCCUPATION SEGREGATION: Large variation in gender wage gaps across
   occupation groups — controls will be critical in the model.

5. EDUCATION & TENURE: Both show positive returns; must be included
   as controls to isolate discrimination from productivity differences.

SCOPE LIMITATION: Region == 'West' includes CA, WA, OR, NV, AZ, etc.
   The treatment effect is diluted by non-CA Western states.
   Flag this clearly in the final presentation.
")
## 
## KEY FINDINGS FOR MODELING STAGE
## ================================
## 
## 1. GENDER GAP: A persistent median wage gap exists across all years.
##    Examine the gender-gap trend chart to assess whether West narrowed
##    faster post-2018 relative to the Rest of U.S.
## 
## 2. RACIAL GAP: Non-Black/Non-Hispanic workers earn more at the median;
##    Black and Hispanic workers show lower median wages across most occupations.
## 
## 3. WEST vs. REST: Check the pre/post table above for DiD signal —
##    if the West gender gap shrank more post-2018 than the Rest of U.S.,
##    that supports the ban's effectiveness.
## 
## 4. OCCUPATION SEGREGATION: Large variation in gender wage gaps across
##    occupation groups — controls will be critical in the model.
## 
## 5. EDUCATION & TENURE: Both show positive returns; must be included
##    as controls to isolate discrimination from productivity differences.
## 
## SCOPE LIMITATION: Region == 'West' includes CA, WA, OR, NV, AZ, etc.
##    The treatment effect is diluted by non-CA Western states.
##    Flag this clearly in the final presentation.