library(tidyverse)
library(openxlsx)
library(janitor)
library(usmap)
library(scales)
Source: U.S. Bureau of Labor Statistics
data <- read.xlsx("salary_by_state_May2024.xlsx") |>
clean_names() |>
select(state = area_title, job_title = occ_title, job_group = o_group, tot_emp, annual_mean_salary = a_mean) |>
mutate(tot_emp = as.numeric(tot_emp),
annual_mean_salary = as.numeric(annual_mean_salary),
annnual_agg_salary = tot_emp * annual_mean_salary)
major_category_salary <- data |>
filter(job_group == "major") |>
group_by(state, job_title) |>
summarise(total_employees = sum(tot_emp, na.rm = TRUE),
annual_agg_salary = sum(annnual_agg_salary, na.rm = TRUE),
annual_mean_salary = annual_agg_salary / total_employees) |>
select(-annual_agg_salary, -total_employees)
data_jobs_salary <- data |>
filter(str_detect(job_title, "(?i)data")) |>
filter(!str_detect(job_title, "Key")) |>
group_by(state, job_title) |>
summarise(total_employees = sum(tot_emp, na.rm = TRUE),
annual_agg_salary = sum(annnual_agg_salary, na.rm = TRUE),
annual_mean_salary = annual_agg_salary / total_employees) |>
select(-annual_agg_salary, -total_employees)
df <- rbind(major_category_salary, data_jobs_salary) |> arrange(state, job_title)
df |>
head(5)
## # A tibble: 5 × 3
## # Groups: state [1]
## state job_title annual_mean_salary
## <chr> <chr> <dbl>
## 1 Alabama Architecture and Engineering Occupations 105480
## 2 Alabama Arts, Design, Entertainment, Sports, and Media Occ… 57030
## 3 Alabama Building and Grounds Cleaning and Maintenance Occu… 33000
## 4 Alabama Business and Financial Operations Occupations 82220
## 5 Alabama Community and Social Service Occupations 52680
data_roles <- c(
"Data Scientists",
"Database Administrators",
"Database Architects"
)
# Create comparison groups
compare_df <- df |>
mutate(
role_group = if_else(job_title %in% data_roles,
"Data Practitioners",
"All Jobs"),
annual_mean_salary = as.numeric(annual_mean_salary)
) |>
group_by(state, role_group) |>
summarise(
avg_salary = mean(annual_mean_salary, na.rm = TRUE),
.groups = "drop"
) |>
pivot_wider(
names_from = role_group,
values_from = avg_salary
) |>
drop_na() |>
mutate(
salary_gap = `Data Practitioners` - `All Jobs`,
state = reorder(state, salary_gap)
)
# Reshape for legend-friendly plotting
plot_df <- compare_df |>
pivot_longer(
cols = c(`All Jobs`, `Data Practitioners`),
names_to = "job_group",
values_to = "avg_salary"
)
# Plot
ggplot(plot_df, aes(x = avg_salary, y = state)) +
geom_segment(
data = compare_df,
aes(
x = `All Jobs`,
xend = `Data Practitioners`,
y = state,
yend = state
),
inherit.aes = FALSE,
linewidth = 1,
alpha = 0.6
) +
# points with shapes & colors mapped for legend
geom_point(
aes(shape = job_group, color = job_group),
size = 3
) +
scale_shape_manual(
values = c(
"All Jobs" = 16,
"Data Practitioners" = 8
)
) +
scale_color_manual(
values = c(
"All Jobs" = "black",
"Data Practitioners" = "red"
)
) +
scale_x_continuous(labels = dollar) +
labs(
title = "Average Annual Salaries by State",
subtitle = "Data Practitioner Salaries Consistently Outpace Other Jobs",
x = NULL,
y = NULL,
shape = "Job Category",
color = "Job Category",
) +
theme_minimal(base_size = 11) +
theme(
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "top"
)
roles_keep <- c("Data Scientists", "Database Administrators", "Database Architects")
dp <- df |>
select(state, job_title, annual_mean_salary) |>
filter(job_title %in% roles_keep) |>
mutate(
annual_mean_salary = as.numeric(annual_mean_salary),
role = recode(job_title,
"Data Scientists" = "Data Scientist",
"Database Administrators" = "Database Administrator",
"Database Architects" = "Database Architect")
) |>
select(state, role, annual_mean_salary)
p3 <- dp |>
mutate(role = factor(role, levels = c("Data Scientist", "Database Architect", "Database Administrator"))) |>
ggplot(aes(x = role, y = annual_mean_salary)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(aes(text = state), width = 0.15, alpha = 0.7, size = 2) +
scale_y_continuous(labels = dollar) +
labs(
title = "State-to-State Salary Variation by Data Practitioner Role",
subtitle = "Database Architects typically receive higher pay and Database Administrators the lowest,
while Data Scientist salaries have the most variation",
x = NULL,
y = NULL
) +
theme_minimal(base_size = 11)
print(p3)
dp_map <- dp |> filter(state != "Puerto Rico", role != "Database Administrator")
p1 <- plot_usmap(
data = dp_map,
values = "annual_mean_salary",
regions = "states"
) +
facet_wrap(~ role, ncol = 2) +
scale_fill_continuous(
label = dollar,
name = "Avg annual salary"
) +
labs(
title = "Salary Heatmap for Highest-Paid Data Practitionor Jobs",
subtitle = "Database Architect jobs have higher pay in most states, but Data Scientist salaries are equally
competitive in several states, such as California, New York, and Washington",
) +
theme(
legend.position = "right",
strip.text = element_text(face = "bold")
)
print(p1)