data <- read_dta("clean_data/pruned_transcal_panel_data.dta") %>%
filter(Year == 2019, dept == "Police", position == "Officer-Firefighters") %>%
select(Employee_Name, department = Agency, salary = base_plus_overtime)
department_stats <- data %>%
group_by(department) %>%
summarise(
N_d = n(),
avg_salary = mean(salary)
) %>%
filter(N_d > 4) # Must have at least 4 employees by pruning method (employee was there in 2018 and 2020)
department_stats
D = nrow(department_stats)
sum_avg_wages = sum(department_stats$avg_salary)
data_shrunken <-
data %>%
group_by(department) %>%
mutate(
avg_salary = mean(salary),
N_d = n()
) %>%
filter(N_d > 4) %>%
ungroup() %>%
mutate(
A = avg_salary - sum_avg_wages,
A_sq = A^2,
B = salary - avg_salary,
B_sq = B^2
)
data_shrunken
sum_a_sq = data_shrunken %>% select(department, A_sq) %>% distinct() %>% pull(A_sq) %>% sum() # Sum A_sq across departments
avg_B_sq = data_shrunken %>% group_by(department) %>% summarise(avg_B_sq = mean(B_sq)) # Within department, avg_B_sq
avg_B = data_shrunken %>% group_by(department) %>% summarise(avg_B = mean(B)) # Within department, avg_B
data_shrunken_final <- data_shrunken %>%
left_join(avg_B_sq, by = "department") %>%
left_join(avg_B, by = "department") %>%
select(department, avg_salary, N_d, avg_B_sq, avg_B) %>%
distinct() %>%
mutate(
shrunken_mean =
((((1 /(D-3)) * sum_a_sq) - avg_B_sq) / ((1/(D-3)) * sum_a_sq) * avg_salary) +
(((avg_B) / ((1/(D-3)) * sum_a_sq)) * (1/D) * sum_avg_wages),
avg_minus_shrunken = avg_salary - shrunken_mean
) %>%
select(department, avg_salary, shrunken_mean, avg_minus_shrunken, everything())
data_shrunken_final
# Recalculate Kevin's Way
w_d_bar_0 = (1/D) * sum_avg_wages
w_d_min_w_d_0_sq =
department_stats %>%
mutate(
w_d_minus_w_d_0_sq = (avg_salary - w_d_bar_0)^2,
)
sigma_0 = (1/D) * sum(w_d_min_w_d_0_sq$w_d_minus_w_d_0_sq)
data_shrunken <-
data %>%
group_by(department) %>%
mutate(
w_d_bar = mean(salary),
N_d = n()
) %>%
filter(N_d > 4) %>%
mutate(
sigma_d = (1/(N_d ^2)) * sum((salary - w_d_bar)^2),
shrunken_mean = (sigma_d / sigma_0) * w_d_bar_0 + (1 - (sigma_d / sigma_0)) * w_d_bar,
avg_minus_shrunken = w_d_bar - shrunken_mean
) %>%
select(department, w_d_bar, shrunken_mean, avg_minus_shrunken, everything())
data_shrunken
# Overlay two histograms from data_shrunken of w_d_bar and shrunken_mean
data_shrunken %>%
distinct(department, w_d_bar, shrunken_mean, N_d) %>%
ungroup() %>%
pivot_longer(cols = c(w_d_bar, shrunken_mean), names_to = "variable", values_to = "value") %>%
ggplot() +
geom_histogram(aes(value, fill = variable), alpha = 0.5, position = "identity", bins = 30) +
labs(
x = "Average Salary",
y = "Count",
title = "Average Salary vs. Shrunken Mean",
subtitle = "California Agencies with >4 Employees by Pruning Method, 2022 $"
)

data_shrunken %>%
distinct(department, w_d_bar, shrunken_mean, N_d) %>%
ungroup() %>%
summarise(
avg_w_d_bar = mean(w_d_bar),
avg_shrunken_mean = mean(shrunken_mean)
)