library(tidyverse)
library(lubridate)
library(knitr)
library(kableExtra)
library(scales)
1. Load Cleaned Data
df <- read.csv("graduate_full_clean.csv", stringsAsFactors = FALSE)
# Re-apply factor encodings after CSV round-trip
df <- df %>%
mutate(
SAMPLE_SEX_1997 = factor(SAMPLE_SEX_1997,
levels = c("Male", "Female")),
SAMPLE_RACE_1997 = factor(SAMPLE_RACE_1997,
levels = c("Black", "Hispanic", "Mixed Race (Non-Hispanic)", "Non-Black / Non-Hispanic")),
Region = factor(Region,
levels = c("Northeast", "North Central", "South", "West")),
marital_status = factor(marital_status,
levels = c("Never Married", "Married", "Separated", "Divorced", "Widowed", "Unknown")),
Occupation_Group2 = factor(Occupation_Group2),
Industry_Group = factor(Industry_Group),
DOB = as.Date(DOB),
Interview_Date = as.Date(Interview_Date),
StartDate = as.Date(StartDate),
StopDate = as.Date(StopDate),
# Treatment indicators
Post2018 = Year >= 2018,
CA_Proxy = Region == "West",
Treated = CA_Proxy & Post2018
)
dim(df)
## [1] 171601 33
2. Sample Overview
# Individuals by sex and race
df %>%
distinct(PUBID_1997, SAMPLE_SEX_1997, SAMPLE_RACE_1997) %>%
count(SAMPLE_SEX_1997, SAMPLE_RACE_1997) %>%
pivot_wider(names_from = SAMPLE_SEX_1997, values_from = n, values_fill = 0) %>%
kable(caption = "Unique Individuals by Sex and Race") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Unique Individuals by Sex and Race
|
SAMPLE_RACE_1997
|
Male
|
Female
|
|
Black
|
1144
|
1138
|
|
Hispanic
|
964
|
904
|
|
Mixed Race (Non-Hispanic)
|
40
|
43
|
|
Non-Black / Non-Hispanic
|
2372
|
2205
|
df %>%
count(Year) %>%
ggplot(aes(x = Year, y = n)) +
geom_col(fill = "#2c7bb6") +
geom_vline(xintercept = 2017.5, linetype = "dashed", color = "firebrick", linewidth = 0.8) +
annotate("text", x = 2018.2, y = max(table(df$Year)) * 0.95,
label = "CA Ban\nJan 2018", color = "firebrick", hjust = 0, size = 3.2) +
scale_x_continuous(breaks = seq(1997, 2021, 2)) +
labs(title = "Job Spells by Year", x = "Year", y = "Number of Spells") +
theme_minimal()

3. Wage Distribution
df %>%
filter(!is.na(HRLY_WAGE_clean)) %>%
ggplot(aes(x = HRLY_WAGE_clean)) +
geom_histogram(bins = 60, fill = "#2c7bb6", color = "white") +
scale_x_continuous(labels = dollar_format()) +
labs(title = "Overall Hourly Wage Distribution",
x = "Hourly Wage", y = "Count") +
theme_minimal()

df %>%
filter(!is.na(HRLY_WAGE_clean)) %>%
ggplot(aes(x = HRLY_WAGE_clean, fill = SAMPLE_SEX_1997)) +
geom_density(alpha = 0.5) +
scale_x_continuous(labels = dollar_format()) +
scale_fill_manual(values = c("#2c7bb6", "#d7191c")) +
labs(title = "Wage Distribution by Sex",
x = "Hourly Wage", y = "Density", fill = NULL) +
theme_minimal()

df %>%
filter(!is.na(HRLY_WAGE_clean)) %>%
ggplot(aes(x = HRLY_WAGE_clean, fill = SAMPLE_RACE_1997)) +
geom_density(alpha = 0.4) +
scale_x_continuous(labels = dollar_format()) +
scale_fill_brewer(palette = "Set1") +
labs(title = "Wage Distribution by Race",
x = "Hourly Wage", y = "Density", fill = NULL) +
theme_minimal()

4. Wage Gap Analysis
gender_gap <- df %>%
filter(!is.na(HRLY_WAGE_clean)) %>%
group_by(SAMPLE_SEX_1997) %>%
summarise(
Mean_Wage = round(mean(HRLY_WAGE_clean), 2),
Median_Wage = round(median(HRLY_WAGE_clean), 2),
N = n()
)
gender_gap %>%
kable(caption = "Wage Summary by Sex") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wage Summary by Sex
|
SAMPLE_SEX_1997
|
Mean_Wage
|
Median_Wage
|
N
|
|
Male
|
14.27
|
10.50
|
85831
|
|
Female
|
12.85
|
9.75
|
84055
|
male_med <- gender_gap$Median_Wage[gender_gap$SAMPLE_SEX_1997 == "Male"]
female_med <- gender_gap$Median_Wage[gender_gap$SAMPLE_SEX_1997 == "Female"]
cat("Median gender wage gap:", round((male_med - female_med) / male_med * 100, 1), "cents on the dollar\n")
## Median gender wage gap: 7.1 cents on the dollar
df %>%
filter(!is.na(HRLY_WAGE_clean)) %>%
group_by(SAMPLE_RACE_1997) %>%
summarise(
Mean_Wage = round(mean(HRLY_WAGE_clean), 2),
Median_Wage = round(median(HRLY_WAGE_clean), 2),
N = n()
) %>%
arrange(desc(Median_Wage)) %>%
kable(caption = "Wage Summary by Race") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wage Summary by Race
|
SAMPLE_RACE_1997
|
Mean_Wage
|
Median_Wage
|
N
|
|
Hispanic
|
13.61
|
10.4
|
34184
|
|
Mixed Race (Non-Hispanic)
|
14.69
|
10.0
|
1518
|
|
Non-Black / Non-Hispanic
|
14.22
|
10.0
|
91615
|
|
Black
|
12.08
|
9.5
|
42569
|
df %>%
filter(!is.na(HRLY_WAGE_clean)) %>%
group_by(SAMPLE_SEX_1997, SAMPLE_RACE_1997) %>%
summarise(Median_Wage = round(median(HRLY_WAGE_clean), 2), .groups = "drop") %>%
ggplot(aes(x = reorder(SAMPLE_RACE_1997, Median_Wage),
y = Median_Wage,
fill = SAMPLE_SEX_1997)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("#2c7bb6", "#d7191c")) +
scale_y_continuous(labels = dollar_format()) +
coord_flip() +
labs(title = "Median Hourly Wage by Race and Sex",
x = NULL, y = "Median Wage", fill = NULL) +
theme_minimal()

5. California (West) vs. Rest of Country
df %>%
filter(!is.na(HRLY_WAGE_clean), !is.na(Region)) %>%
group_by(Region) %>%
summarise(
Mean_Wage = round(mean(HRLY_WAGE_clean), 2),
Median_Wage = round(median(HRLY_WAGE_clean), 2),
N = n()
) %>%
kable(caption = "Wage by Region") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Wage by Region
|
Region
|
Mean_Wage
|
Median_Wage
|
N
|
|
Northeast
|
14.16
|
10.25
|
27364
|
|
North Central
|
13.00
|
10.00
|
39054
|
|
South
|
12.88
|
10.00
|
64843
|
|
West
|
14.81
|
11.00
|
37829
|
df %>%
filter(!is.na(HRLY_WAGE_clean), !is.na(CA_Proxy)) %>%
mutate(Group = if_else(CA_Proxy, "West (CA Proxy)", "Rest of U.S.")) %>%
group_by(Year, Group) %>%
summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
ggplot(aes(x = Year, y = Median_Wage, color = Group, group = Group)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
geom_vline(xintercept = 2017.5, linetype = "dashed", color = "firebrick") +
annotate("text", x = 2018.2, y = 9,
label = "CA Ban\nJan 2018", color = "firebrick", hjust = 0, size = 3.2) +
scale_x_continuous(breaks = seq(1997, 2021, 2)) +
scale_y_continuous(labels = dollar_format()) +
scale_color_manual(values = c("#2c7bb6", "#fdae61")) +
labs(title = "Median Wage Trend: West vs. Rest of U.S.",
x = "Year", y = "Median Hourly Wage", color = NULL) +
theme_minimal()

df %>%
filter(!is.na(HRLY_WAGE_clean), !is.na(Region)) %>%
mutate(Group = if_else(CA_Proxy, "West (CA Proxy)", "Rest of U.S.")) %>%
group_by(Year, Group, SAMPLE_SEX_1997) %>%
summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
pivot_wider(names_from = SAMPLE_SEX_1997, values_from = Median_Wage) %>%
mutate(Gender_Gap = Male - Female) %>%
ggplot(aes(x = Year, y = Gender_Gap, color = Group, group = Group)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
geom_vline(xintercept = 2017.5, linetype = "dashed", color = "firebrick") +
annotate("text", x = 2018.2, y = max(0),
label = "CA Ban\nJan 2018", color = "firebrick", hjust = 0, size = 3.2) +
scale_x_continuous(breaks = seq(1997, 2021, 2)) +
scale_y_continuous(labels = dollar_format()) +
scale_color_manual(values = c("#2c7bb6", "#fdae61")) +
labs(title = "Gender Wage Gap Over Time: West vs. Rest of U.S.",
subtitle = "Gap = Median Male Wage − Median Female Wage",
x = "Year", y = "Wage Gap ($/hr)", color = NULL) +
theme_minimal()

6. Occupation & Industry Breakdown
df %>%
filter(!is.na(HRLY_WAGE_clean), Occupation_Group2 != "") %>%
group_by(Occupation_Group2) %>%
summarise(Median_Wage = median(HRLY_WAGE_clean), N = n(), .groups = "drop") %>%
filter(N > 200) %>%
ggplot(aes(x = reorder(Occupation_Group2, Median_Wage), y = Median_Wage)) +
geom_col(fill = "#2c7bb6") +
coord_flip() +
scale_y_continuous(labels = dollar_format()) +
labs(title = "Median Wage by Occupation Group",
x = NULL, y = "Median Hourly Wage") +
theme_minimal()

7. Education & Tenure Effects
df %>%
filter(!is.na(HRLY_WAGE_clean), !is.na(HGC)) %>%
group_by(HGC, SAMPLE_SEX_1997) %>%
summarise(Median_Wage = median(HRLY_WAGE_clean), N = n(), .groups = "drop") %>%
filter(N > 50) %>%
ggplot(aes(x = HGC, y = Median_Wage, color = SAMPLE_SEX_1997, group = SAMPLE_SEX_1997)) +
geom_line(linewidth = 1) +
geom_point(size = 2.5) +
scale_y_continuous(labels = scales::dollar_format()) +
scale_x_continuous(breaks = seq(6, 20, 1), limits = c(6, 20)) +
scale_color_manual(values = c("#2c7bb6", "#d7191c")) +
labs(title = "Median Wage by Highest Grade Completed and Sex",
subtitle = "Do returns to education differ by gender?",
x = "Highest Grade Completed", y = "Median Hourly Wage", color = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(size = 9))

df %>%
filter(!is.na(HRLY_WAGE_clean), Occupation_Group2 != "") %>%
group_by(Occupation_Group2, SAMPLE_SEX_1997) %>%
summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
pivot_wider(names_from = SAMPLE_SEX_1997, values_from = Median_Wage) %>%
mutate(Gap = Male - Female) %>%
filter(!is.na(Gap)) %>%
ggplot(aes(x = reorder(Occupation_Group2, Gap), y = Gap)) +
geom_col(fill = "#d7191c") +
coord_flip() +
scale_y_continuous(labels = scales::dollar_format()) +
labs(title = "Gender Wage Gap by Occupation Group",
subtitle = "Gap = Median Male − Median Female Wage",
x = NULL, y = "Wage Gap ($/hr)") +
theme_minimal()

df %>%
filter(!is.na(HRLY_WAGE_clean), !is.na(TENURE)) %>%
mutate(Tenure_Band = cut(TENURE,
breaks = c(0, 12, 36, 60, 120, Inf),
labels = c("< 1 yr", "1–3 yrs", "3–5 yrs", "5–10 yrs", "10+ yrs"),
right = FALSE)) %>%
group_by(Tenure_Band, SAMPLE_SEX_1997) %>%
summarise(Median_Wage = median(HRLY_WAGE_clean), .groups = "drop") %>%
ggplot(aes(x = Tenure_Band, y = Median_Wage, fill = SAMPLE_SEX_1997)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("#2c7bb6", "#d7191c")) +
scale_y_continuous(labels = dollar_format()) +
labs(title = "Median Wage by Tenure Band and Sex",
x = "Tenure", y = "Median Hourly Wage", fill = NULL) +
theme_minimal()

8. Pre/Post 2018 Summary
df %>%
filter(!is.na(HRLY_WAGE_clean), !is.na(Region)) %>%
mutate(
Period = if_else(Post2018, "Post-2018", "Pre-2018"),
Group = if_else(CA_Proxy, "West", "Rest of U.S.")
) %>%
group_by(Group, Period, SAMPLE_SEX_1997) %>%
summarise(Median_Wage = round(median(HRLY_WAGE_clean), 2), N = n(), .groups = "drop") %>%
pivot_wider(names_from = Period, values_from = c(Median_Wage, N)) %>%
mutate(Change = `Median_Wage_Post-2018` - `Median_Wage_Pre-2018`) %>%
kable(caption = "Median Wage Pre vs. Post 2018 by Group and Sex") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Median Wage Pre vs. Post 2018 by Group and Sex
|
Group
|
SAMPLE_SEX_1997
|
Median_Wage_Post-2018
|
Median_Wage_Pre-2018
|
N_Post-2018
|
N_Pre-2018
|
Change
|
|
Rest of U.S.
|
Male
|
20.00
|
10
|
6234
|
59888
|
10.00
|
|
Rest of U.S.
|
Female
|
18.00
|
9
|
6344
|
58795
|
9.00
|
|
West
|
Male
|
25.00
|
11
|
1808
|
17495
|
14.00
|
|
West
|
Female
|
20.05
|
10
|
1715
|
16811
|
10.05
|
9. Key Takeaways
cat("
KEY FINDINGS FOR MODELING STAGE
================================
1. GENDER GAP: A persistent median wage gap exists across all years.
Examine the gender-gap trend chart to assess whether West narrowed
faster post-2018 relative to the Rest of U.S.
2. RACIAL GAP: Non-Black/Non-Hispanic workers earn more at the median;
Black and Hispanic workers show lower median wages across most occupations.
3. WEST vs. REST: Check the pre/post table above for DiD signal —
if the West gender gap shrank more post-2018 than the Rest of U.S.,
that supports the ban's effectiveness.
4. OCCUPATION SEGREGATION: Large variation in gender wage gaps across
occupation groups — controls will be critical in the model.
5. EDUCATION & TENURE: Both show positive returns; must be included
as controls to isolate discrimination from productivity differences.
SCOPE LIMITATION: Region == 'West' includes CA, WA, OR, NV, AZ, etc.
The treatment effect is diluted by non-CA Western states.
Flag this clearly in the final presentation.
")
##
## KEY FINDINGS FOR MODELING STAGE
## ================================
##
## 1. GENDER GAP: A persistent median wage gap exists across all years.
## Examine the gender-gap trend chart to assess whether West narrowed
## faster post-2018 relative to the Rest of U.S.
##
## 2. RACIAL GAP: Non-Black/Non-Hispanic workers earn more at the median;
## Black and Hispanic workers show lower median wages across most occupations.
##
## 3. WEST vs. REST: Check the pre/post table above for DiD signal —
## if the West gender gap shrank more post-2018 than the Rest of U.S.,
## that supports the ban's effectiveness.
##
## 4. OCCUPATION SEGREGATION: Large variation in gender wage gaps across
## occupation groups — controls will be critical in the model.
##
## 5. EDUCATION & TENURE: Both show positive returns; must be included
## as controls to isolate discrimination from productivity differences.
##
## SCOPE LIMITATION: Region == 'West' includes CA, WA, OR, NV, AZ, etc.
## The treatment effect is diluted by non-CA Western states.
## Flag this clearly in the final presentation.