Packages Used

library(tidyverse)
library(openxlsx)
library(janitor)
library(usmap)
library(scales)

Load/Tidy Data

Show code for loading data

Source: U.S. Bureau of Labor Statistics

data <- read.xlsx("salary_by_state_May2024.xlsx") |> 
  clean_names() |> 
  select(state = area_title, job_title = occ_title, job_group = o_group, tot_emp, annual_mean_salary = a_mean) |> 
  mutate(tot_emp = as.numeric(tot_emp),
         annual_mean_salary = as.numeric(annual_mean_salary),
         annnual_agg_salary = tot_emp * annual_mean_salary)
major_category_salary <- data |> 
  filter(job_group == "major") |> 
  group_by(state, job_title) |> 
  summarise(total_employees = sum(tot_emp, na.rm = TRUE),
            annual_agg_salary = sum(annnual_agg_salary, na.rm = TRUE),
            annual_mean_salary = annual_agg_salary / total_employees) |> 
  select(-annual_agg_salary, -total_employees)


data_jobs_salary <- data |> 
  filter(str_detect(job_title, "(?i)data")) |>
  filter(!str_detect(job_title, "Key")) |>
  group_by(state, job_title) |> 
  summarise(total_employees = sum(tot_emp, na.rm = TRUE),
            annual_agg_salary = sum(annnual_agg_salary, na.rm = TRUE),
            annual_mean_salary = annual_agg_salary / total_employees) |> 
  select(-annual_agg_salary, -total_employees)


df <- rbind(major_category_salary, data_jobs_salary) |> arrange(state, job_title)

df |> 
  head(5)
## # A tibble: 5 × 3
## # Groups:   state [1]
##   state   job_title                                           annual_mean_salary
##   <chr>   <chr>                                                            <dbl>
## 1 Alabama Architecture and Engineering Occupations                        105480
## 2 Alabama Arts, Design, Entertainment, Sports, and Media Occ…              57030
## 3 Alabama Building and Grounds Cleaning and Maintenance Occu…              33000
## 4 Alabama Business and Financial Operations Occupations                    82220
## 5 Alabama Community and Social Service Occupations                         52680

Visualization 1

data_roles <- c(
  "Data Scientists",
  "Database Administrators",
  "Database Architects"
)

# Create comparison groups
compare_df <- df |> 
  mutate(
    role_group = if_else(job_title %in% data_roles,
                         "Data Practitioners",
                         "All Jobs"),
    annual_mean_salary = as.numeric(annual_mean_salary)
  ) |> 
  group_by(state, role_group) |> 
  summarise(
    avg_salary = mean(annual_mean_salary, na.rm = TRUE),
    .groups = "drop"
  )  |> 
  pivot_wider(
    names_from = role_group,
    values_from = avg_salary
  )  |> 
  drop_na() |> 
  mutate(
    salary_gap = `Data Practitioners` - `All Jobs`,
    state = reorder(state, salary_gap)
  )

# Reshape for legend-friendly plotting
plot_df <- compare_df  |> 
  pivot_longer(
    cols = c(`All Jobs`, `Data Practitioners`),
    names_to = "job_group",
    values_to = "avg_salary"
  )

# Plot
ggplot(plot_df, aes(x = avg_salary, y = state)) +

  geom_segment(
    data = compare_df,
    aes(
      x = `All Jobs`,
      xend = `Data Practitioners`,
      y = state,
      yend = state
    ),
    inherit.aes = FALSE,
    linewidth = 1,
    alpha = 0.6
  ) +

  # points with shapes & colors mapped for legend
  geom_point(
    aes(shape = job_group, color = job_group),
    size = 3
  ) +

  scale_shape_manual(
    values = c(
      "All Jobs" = 16, 
      "Data Practitioners" = 8 
    )
  ) +
  scale_color_manual(
    values = c(
      "All Jobs" = "black",
      "Data Practitioners" = "red"
    )
  ) +

  scale_x_continuous(labels = dollar) +
  labs(
    title = "Average Annual Salaries by State",
    subtitle = "Data Practitioner Salaries Consistently Outpace Other Jobs",
    x = NULL,
    y = NULL,
    shape = "Job Category",
    color = "Job Category",
) +

  theme_minimal(base_size = 11) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "top"
  )

Visualization 2

roles_keep <- c("Data Scientists", "Database Administrators", "Database Architects")

dp <- df |> 
  select(state, job_title, annual_mean_salary) |> 
  filter(job_title %in% roles_keep) |> 
  mutate(
    annual_mean_salary = as.numeric(annual_mean_salary),
    role = recode(job_title,
                  "Data Scientists" = "Data Scientist",
                  "Database Administrators" = "Database Administrator",
                  "Database Architects" = "Database Architect")
  ) |> 
  select(state, role, annual_mean_salary)

p3 <- dp |> 
  mutate(role = factor(role, levels = c("Data Scientist", "Database Architect", "Database Administrator"))) |> 
  ggplot(aes(x = role, y = annual_mean_salary)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(text = state), width = 0.15, alpha = 0.7, size = 2) +
  scale_y_continuous(labels = dollar) +
  labs(
    title = "State-to-State Salary Variation by Data Practitioner Role",
    subtitle = "Database Architects typically receive higher pay and Database Administrators the lowest,
while Data Scientist salaries have the most variation",
    x = NULL,
    y = NULL
  ) +
  theme_minimal(base_size = 11)

print(p3)

Visualization 3

dp_map <- dp |> filter(state != "Puerto Rico", role != "Database Administrator")

p1 <- plot_usmap(
  data = dp_map,
  values = "annual_mean_salary",
  regions = "states"
) +
  facet_wrap(~ role, ncol = 2) +
  scale_fill_continuous(
    label = dollar,
    name  = "Avg annual salary"
  ) +
  labs(
    title = "Salary Heatmap for Highest-Paid Data Practitionor Jobs",
    subtitle = "Database Architect jobs have higher pay in most states, but Data Scientist salaries are equally
competitive in several states, such as California, New York, and Washington",
) +
  theme(
    legend.position = "right",
    strip.text = element_text(face = "bold")
  )

print(p1)