The following report will look at all Cook County employees excluding Forest Preserves, indicating amount of base salary paid to an employee during the County fiscal quarter.The data ranges fiscal years 2016 through 2018 – the fiscal year is from December to November. The data set is sourced from data.gov (https://catalog.data.gov/dataset/employee-payroll)
This data set did not need any cleaning and was ready to use directly from the download from data.gov.
One item to note from the data is that the pay details are reported on a quarterly basis, so there are 4 entries for each employee for each financial year, given they stay employed with the Cook County for the full financial year. Since salary is generally examined on an annual basis, I had to do some aggregating of the data to have the salary data for each financial year for each employee (I summed the quarterly salary for each employee identifier for each year), resulting in the full financial year salary for each employee for each financial year. This eliminates the confusion of comparison when it was presented as a quarterly base, since salary is commonly viewed on a annual base.
library(tidyverse)
library(ggplot2)
library(plotly)
library(sampling)
employee_payroll_df = read.csv("Employee_Payroll.csv")
employee_payroll = as_tibble(employee_payroll_df)
Employee_population_Bureau = table(select(employee_payroll, Bureau))
sum_salary = employee_payroll %>%
group_by(Bureau, Fiscal.Year) %>%
summarize(total_salary = sum(Base.Pay, na.rm = TRUE),
.groups = "keep")
sum_salary = sum_salary %>%
mutate(total_salary = as.numeric(total_salary))
employee_counts = employee_payroll %>%
group_by(Bureau, Fiscal.Year) %>%
summarize(Unique_Employees = n_distinct(Employee.Identifier), .groups = "keep") %>%
arrange(Bureau, Fiscal.Year)
employee_counts = employee_counts %>%
mutate(Unique_Employees = as.numeric(Unique_Employees))
average_salary = sum_salary %>%
inner_join(employee_counts, by = c("Bureau", "Fiscal.Year")) %>%
mutate(Average_Salary = total_salary / Unique_Employees)
top_5_salary_bureau = average_salary %>%
group_by(Fiscal.Year) %>%
arrange(desc(Average_Salary)) %>%
slice_head(n = 5) %>%
ungroup()
top_5_population_bureau = average_salary %>%
group_by(Fiscal.Year) %>%
arrange(desc(Unique_Employees)) %>%
slice_head(n = 5) %>%
ungroup()
#Bar Chart of Top 5
p = ggplot(top_5_salary_bureau,
aes(
x = reorder(Bureau, Average_Salary),
y = Average_Salary,
text = paste("Bureau:", Bureau, "<br>Average Salary: $", format(Average_Salary, big.mark = ","))
)
) +
geom_bar(stat = "identity", fill = "#0072B2") +
facet_wrap(~ Fiscal.Year, scales = "free_x") +
labs(
title = "Top 5 Bureaus by Average Salary, Per Fiscal Year",
x = "Bureau",
y = "Average Salary (USD)"
) +
theme_minimal() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
plotly_graph = ggplotly(p, tooltip = "text")
print(plotly_graph)
p = ggplot(top_5_population_bureau,
aes(
x = reorder(Bureau, Average_Salary),
y = Average_Salary,
text = paste("Bureau:", Bureau, "<br>Average Salary: $", format(Average_Salary, big.mark = ","))
)
) +
geom_bar(stat = "identity", fill = "#0072B2") +
facet_wrap(~ Fiscal.Year, scales = "free_x") +
labs(
title = "Top 5 Bureaus by Population Average Salary, Per Fiscal Year",
x = "Bureau",
y = "Average Salary (USD)"
) +
theme_minimal() +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())
plotly_graph = ggplotly(p, tooltip = "text")
print(plotly_graph)
#Bar graph Analysis In the above bar graphs, the ranking of the bureaus was done in two ways, the first was based on the average salary and the second was based on bureau population.
For the first bar graph, I calculated the average annual salary for each bureau and then ranked them in descending order and extracted the top five bureaus (based on average annual salary) for each fiscal year. I see that the annual salary for the financial year 2018 was significantly lower than the 2 other years that were included. This could be due to incomplete data and unreported earnings. Furthermore, the second bar graph looks at bureaus based on the population count, so the count of unique employees for each bureau for each year was calculated and ordered in descending order and I extracted the top five bureaus based on the population of employees. I see that when comparing the two graphs, I see that the population graph has lower average earnings than the graph that is based solely on the average salary.
# Box plot processing
individual_salary = employee_payroll %>%
group_by(Employee.Identifier, Fiscal.Year, Bureau) %>%
summarize(total_salary = sum(Base.Pay, na.rm = TRUE),
.groups = "keep")
subset_2016_salary_data = filter(individual_salary, Fiscal.Year == 2016)
five_num_2016 = fivenum(subset_2016_salary_data$total_salary)
IQR_2016 = five_num_2016[4]-five_num_2016[2]
lower_bound_2016 = five_num_2016[2]-(1.5*IQR_2016)
lower_2016_outliers = subset_2016_salary_data$total_salary[which(subset_2016_salary_data$total_salary < lower_bound_2016)]
#length(lower_2016_outliers)
upper_bound_2016 = five_num_2016[4]+(1.5*IQR_2016)
upper_2016_outliers = subset_2016_salary_data$total_salary[which(subset_2016_salary_data$total_salary > upper_bound_2016)]
#length(upper_2016_outliers)
subset_2017_salary_data = filter(individual_salary, Fiscal.Year == 2017)
five_num_2017 = fivenum(subset_2017_salary_data$total_salary)
IQR_2017 = five_num_2017[4]-five_num_2017[2]
lower_bound_2017 = five_num_2017[2]-(1.5*IQR_2017)
lower_2017_outliers = subset_2017_salary_data$total_salary[which(subset_2017_salary_data$total_salary < lower_bound_2017)]
#length(lower_2017_outliers)
upper_bound_2017 = five_num_2017[4]+(1.5*IQR_2017)
upper_2017_outliers = subset_2017_salary_data$total_salary[which(subset_2017_salary_data$total_salary > upper_bound_2017)]
#length(upper_2017_outliers)
subset_2018_salary_data = filter(individual_salary, Fiscal.Year == 2018)
five_num_2018 = fivenum(subset_2018_salary_data$total_salary)
IQR_2018 = five_num_2018[4]-five_num_2018[2]
lower_bound_2018 = five_num_2018[2]-(1.5*IQR_2018)
lower_2018_outliers = subset_2018_salary_data$total_salary[which(subset_2018_salary_data$total_salary < lower_bound_2018)]
#length(lower_2018_outliers)
upper_bound_2018 = five_num_2018[4]+(1.5*IQR_2018)
upper_2018_outliers = subset_2018_salary_data$total_salary[which(subset_2018_salary_data$total_salary > upper_bound_2018)]
#length(upper_2018_outliers)
#Boxplots
grouped_box_plot = plot_ly(
data = individual_salary,
y = ~as.factor(Fiscal.Year),
x = ~total_salary,
type = 'box',
boxpoints = "outliers",
color = ~as.factor(Fiscal.Year),
hovertemplate = "Year: %{x}<br>Salary: $%{y:.2f}<extra></extra>"
) %>%
layout(
title = "Distribution of Employee Salaries by Fiscal Year (With Outliers)",
xaxis = list(title = "Fiscal Year"),
yaxis = list(title = "Total Salary (USD)"),
boxmode = 'group',
showlegend = FALSE
)
print(grouped_box_plot)
salary_2016 = filter(subset_2016_salary_data, total_salary< upper_bound_2016)
salary_2017 = filter(subset_2017_salary_data, total_salary< upper_bound_2017)
salary_2018 = filter(subset_2018_salary_data, total_salary< upper_bound_2018)
combined_salary <- bind_rows(salary_2016, salary_2017, salary_2018)
grouped_box_plot = plot_ly(
data = combined_salary,
y = ~as.factor(Fiscal.Year),
x = ~total_salary,
type = 'box',
boxpoints = "outliers",
color = ~as.factor(Fiscal.Year),
hovertemplate = "Year: %{x}<br>Salary: $%{y:.2f}<extra></extra>"
) %>%
layout(
title = "Distribution of Employee Salaries by Fiscal Year (Without Outliers)",
xaxis = list(title = "Fiscal Year"),
yaxis = list(title = "Total Salary (USD)"),
boxmode = 'group',
showlegend = FALSE
)
print(grouped_box_plot)
years = sort(unique(individual_salary$Fiscal.Year))
max_count = individual_salary %>%
group_by(Fiscal.Year) %>%
summarise(counts = hist(total_salary, plot = FALSE)$counts) %>%
summarise(max_count = max(unlist(counts))) %>%
pull(max_count)
plots_list = lapply(years, function(y) {
df = individual_salary %>% filter(Fiscal.Year == y)
plot_ly(
data = df,
x = ~total_salary,
type = 'histogram',
name = paste("Fiscal Year", y),
hovertemplate = "Salary: %{x}<br>Count: %{y}<extra></extra>"
) %>%
layout(
title = "Annual Salary for Population (With Outliers)",
xaxis = list(title = "Total Salary (USD)"),
yaxis = list(title = "Count of Employees", range = c(0, 1200))
)
})
plotly_histograms = subplot(
plots_list,
nrows = 3,
shareY = TRUE,
titleX = TRUE,
titleY = TRUE,
margin = 0.05
)
plotly_histograms
max_count = individual_salary %>%
group_by(Fiscal.Year) %>%
summarise(counts = hist(combined_salary$total_salary, plot = FALSE)$counts) %>%
summarise(max_count = max(unlist(counts))) %>%
pull(max_count)
plots_list = lapply(years, function(y) {
df = individual_salary %>% filter(Fiscal.Year == y)
plot_ly(
data = df,
x = ~combined_salary$total_salary,
type = 'histogram',
name = paste("Fiscal Year", y),
hovertemplate = "Salary: %{x}<br>Count: %{y}<extra></extra>"
) %>%
layout(
title = "Annual Salary for Population (Without Outliers)",
xaxis = list(title = "Total Salary (USD)"),
yaxis = list(title = "Count of Employees", range = c(0, 1200))
)
})
plotly_histograms = subplot(
plots_list,
nrows = 3,
shareY = TRUE,
titleX = TRUE,
titleY = TRUE,
margin = 0.05
)
plotly_histograms