data <- read_dta("clean_data/pruned_transcal_panel_data.dta") %>% 
  filter(Year == 2019, dept == "Police", position == "Officer-Firefighters") %>% 
  select(Employee_Name, department = Agency, salary =  base_plus_overtime)
department_stats <- data %>%
  group_by(department) %>%
  summarise(
    N_d = n(),
    avg_salary = mean(salary)
  ) %>% 
  filter(N_d > 4) # Must have at least 4 employees by pruning method (employee was there in 2018 and 2020)

department_stats
D = nrow(department_stats)
sum_avg_wages = sum(department_stats$avg_salary)

data_shrunken <-
  data %>% 
  group_by(department) %>%
  mutate(
    avg_salary = mean(salary),
    N_d = n()
  ) %>% 
  filter(N_d > 4) %>%
  ungroup() %>% 
  mutate(
    A = avg_salary - sum_avg_wages,
    A_sq = A^2,
    B = salary - avg_salary,
    B_sq = B^2
  ) 

data_shrunken
sum_a_sq = data_shrunken %>% select(department, A_sq) %>% distinct() %>% pull(A_sq) %>% sum() # Sum A_sq across departments
avg_B_sq = data_shrunken %>% group_by(department) %>% summarise(avg_B_sq = mean(B_sq)) # Within department, avg_B_sq
avg_B = data_shrunken %>% group_by(department) %>% summarise(avg_B = mean(B)) # Within department, avg_B

data_shrunken_final <- data_shrunken %>% 
  left_join(avg_B_sq, by = "department") %>% 
  left_join(avg_B, by = "department") %>% 
  select(department, avg_salary, N_d, avg_B_sq, avg_B) %>% 
  distinct() %>% 
  mutate(
    shrunken_mean =
      ((((1 /(D-3)) * sum_a_sq) - avg_B_sq) / ((1/(D-3)) * sum_a_sq) * avg_salary) +
      (((avg_B) / ((1/(D-3)) * sum_a_sq)) * (1/D) * sum_avg_wages),
    avg_minus_shrunken = avg_salary - shrunken_mean
  )  %>% 
  select(department, avg_salary, shrunken_mean, avg_minus_shrunken, everything())
data_shrunken_final
# Recalculate Kevin's Way
w_d_bar_0 = (1/D) * sum_avg_wages

w_d_min_w_d_0_sq = 
  department_stats %>% 
  mutate(
    w_d_minus_w_d_0_sq = (avg_salary - w_d_bar_0)^2,
  )

sigma_0 = (1/D) * sum(w_d_min_w_d_0_sq$w_d_minus_w_d_0_sq)

data_shrunken <-
  data %>% 
  group_by(department) %>%
  mutate(
    w_d_bar = mean(salary),
    N_d = n()
  ) %>% 
  filter(N_d > 4) %>% 
  mutate(
    sigma_d = (1/(N_d ^2)) * sum((salary - w_d_bar)^2),
    shrunken_mean = (sigma_d / sigma_0) * w_d_bar_0 + (1 - (sigma_d / sigma_0)) * w_d_bar,
    avg_minus_shrunken = w_d_bar - shrunken_mean
  )  %>% 
  select(department, w_d_bar, shrunken_mean, avg_minus_shrunken, everything())

data_shrunken
# Overlay two histograms from data_shrunken of w_d_bar and shrunken_mean  
data_shrunken %>% 
  distinct(department, w_d_bar, shrunken_mean, N_d) %>%  
  ungroup() %>% 
  pivot_longer(cols = c(w_d_bar, shrunken_mean), names_to = "variable", values_to = "value") %>% 
  ggplot() +
  geom_histogram(aes(value, fill = variable), alpha = 0.5, position = "identity", bins = 30) +
  labs(
    x = "Average Salary",
    y = "Count",
    title = "Average Salary vs. Shrunken Mean",
    subtitle = "California Agencies with >4 Employees by Pruning Method, 2022 $"
  )

data_shrunken %>% 
  distinct(department, w_d_bar, shrunken_mean, N_d) %>% 
  ungroup() %>% 
  summarise(
    avg_w_d_bar = mean(w_d_bar),
    avg_shrunken_mean = mean(shrunken_mean)
  )