DATA608 Assignment 4

Prompt

I have introduced the term “Data Practitioner” as a generic job descriptor because we have so many different job role titles for individuals whose work activities overlap, including Data Scientist, Data Engineer, Data Analyst, Business Analyst, Data Architect, etc.

For this story we will answer the question, “How much do we get paid?” Your analysis and data visualizations must address the variation in average salary based on role descriptor and state.

R Packages

library(tidyverse)
library(httr)
library(jsonlite)
library(dplyr)
library(ggplot2)
library(reshape2)
library(usmap)
library(tinytex)

Data Import

# Location of the wage CSV hosted on GitHub.
url <- paste0(
  "https://raw.githubusercontent.com/Stevee-G/Data608/refs/heads/main/",
  "Assignment4/2025_IPST_Wages.csv"
)

# Read the CSV straight from the URL into a tibble.
df <- read_csv(file = url)

# Quick structural check: row/column counts, column types, first values.
glimpse(df)

## Rows: 4,692
## Columns: 9
## $ AREA_TITLE  <chr> "Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Ariz…
## $ NAICS_TITLE <chr> "Information", "Information", "Information", "Information"…
## $ OCC_TITLE   <chr> "All Occupations", "Computer and Mathematical Occupations"…
## $ A_MEAN      <dbl> 77490, 109620, 111890, 87980, 99360, 91730, 104240, 102440…
## $ A_PCT10     <dbl> 31000, 46570, 64620, 38800, 50490, 35680, 48740, 84450, 11…
## $ A_PCT25     <dbl> 43890, 71430, 83860, 57650, 84460, 48510, 62760, 92790, 12…
## $ A_MEDIAN    <dbl> 63020, 104670, 121640, 88100, 102880, 76640, 97380, 104280…
## $ A_PCT75     <dbl> 95630, 145210, 129260, 103730, 105710, 123280, 133370, 104…
## $ A_PCT90     <dbl> 152460, 173930, 154660, 132520, 134120, 168010, 170760, 12…

Data Wrangling

# Per-state salary summary. Rows labelled "All Occupations" are removed
# first; every wage column is then averaged (plain, unweighted mean)
# over the remaining rows of each state.
state <- df %>%
  filter(OCC_TITLE != "All Occupations") %>%
  # Naming the grouping expression creates the `state` column directly.
  group_by(state = AREA_TITLE) %>%
  summarize(
    mean = mean(A_MEAN),
    low = mean(A_PCT10),
    mid = mean(A_MEDIAN),
    high = mean(A_PCT90)
  )
glimpse(state)

## Rows: 54
## Columns: 5
## $ state <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Color…
## $ mean  <dbl> 113884.38, 96585.00, 112939.59, 95482.59, 156705.04, 135602.50, …
## $ low   <dbl> 65352.50, 60505.00, 68908.16, 50492.96, 93832.63, 83745.91, 7327…
## $ mid   <dbl> 108798.96, 95339.17, 111492.04, 89648.52, 149069.27, 130490.11, …
## $ high  <dbl> 166117.1, 131878.3, 160947.8, 147460.4, 227532.7, 193890.3, 1770…

# Long form of the state summary: one row per state and measure
# (low / mid / high). The overall `mean` column is dropped beforehand.
state_long <- state %>%
  select(-mean) %>%
  melt(id.vars = "state") %>%
  arrange(state)
glimpse(state_long)

## Rows: 162
## Columns: 3
## $ state    <chr> "Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska"…
## $ variable <fct> low, mid, high, low, mid, high, low, mid, high, low, mid, hig…
## $ value    <dbl> 65352.50, 108798.96, 166117.08, 60505.00, 95339.17, 131878.33…

# Per-occupation salary summary: plain mean of the 10th percentile,
# median and 90th percentile wage columns for every occupation title.
# NOTE(review): unlike the state and industry summaries, the
# "All Occupations" rows are not filtered out here -- confirm they are
# meant to stay in as a baseline.
profession <- summarize(
  group_by(df, OCC_TITLE),
  lowest = mean(A_PCT10),
  mid = mean(A_MEDIAN),
  highest = mean(A_PCT90)
)
glimpse(profession)

## Rows: 8
## Columns: 4
## $ OCC_TITLE <chr> "All Occupations", "Computer and Information Research Scient…
## $ lowest    <dbl> 39014.19, 109656.53, 59614.00, 78074.94, 77159.84, 102739.17…
## $ mid       <dbl> 71381.14, 159471.53, 104755.26, 119377.93, 118928.93, 144561…
## $ highest   <dbl> 148521.7, 232543.5, 168649.3, 177123.1, 161767.9, 192624.0, …

# Long form of the occupation summary: one row per occupation title and
# measure (lowest / mid / highest), sorted by occupation title.
prof_long <- profession %>%
  melt(id.vars = "OCC_TITLE") %>%
  arrange(OCC_TITLE)
glimpse(prof_long)

## Rows: 24
## Columns: 3
## $ OCC_TITLE <chr> "All Occupations", "All Occupations", "All Occupations", "Co…
## $ variable  <fct> lowest, mid, highest, lowest, mid, highest, lowest, mid, hig…
## $ value     <dbl> 39014.19, 71381.14, 148521.69, 109656.53, 159471.53, 232543.…

# Per-industry salary summary. "All Occupations" rows are excluded, then
# the 10th percentile, median and 90th percentile wage columns are
# averaged (plain mean) within each NAICS industry title.
field <- summarize(
  group_by(
    filter(df, OCC_TITLE != "All Occupations"),
    NAICS_TITLE
  ),
  low = mean(A_PCT10),
  mid = mean(A_MEDIAN),
  high = mean(A_PCT90)
)
glimpse(field)

## Rows: 32
## Columns: 4
## $ NAICS_TITLE <chr> "Accounting, Tax Preparation, Bookkeeping, and Payroll Ser…
## $ low         <dbl> 71430.41, 59201.64, 70773.23, 69768.00, 79632.37, 76727.97…
## $ mid         <dbl> 109287.81, 96217.61, 112788.18, 119665.20, 123663.81, 1214…
## $ high        <dbl> 168965.3, 147463.3, 173936.6, 193857.0, 176904.2, 178317.2…

# Long form of the industry summary: one row per industry title and
# measure (low / mid / high), sorted by industry title.
field_long <- field %>%
  melt(id.vars = "NAICS_TITLE") %>%
  arrange(NAICS_TITLE)
glimpse(field_long)

## Rows: 96
## Columns: 3
## $ NAICS_TITLE <chr> "Accounting, Tax Preparation, Bookkeeping, and Payroll Ser…
## $ variable    <fct> low, mid, high, low, mid, high, low, mid, high, low, mid, …
## $ value       <dbl> 71430.41, 109287.81, 168965.34, 59201.64, 96217.61, 147463…

Data Visualizations

# US map filled by the `mean` column of `state` (the unweighted average
# salary per state), shaded from whitesmoke (low) to dark blue (high).
plot_usmap(data = state, values = "mean", color = "black") +
  # The gradient scale is named explicitly so that `low` / `high` stay
  # valid even when the `ggplot2.continuous.fill` option selects a
  # different default continuous fill scale.
  scale_fill_gradient(
    low = "whitesmoke",
    high = "darkblue",
    name = "Average Salary",
    # Full argument name; the former `label` relied on partial matching.
    labels = scales::comma
  ) +
  labs(title = "Heatmap: Data Practitioner Salary Across the States") +
  # Legend to the right of the map, title centred.
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5)
  )

# Dumbbell chart of the low / mid / high salary summaries per state.
# States on the y-axis are ordered by their largest plotted value.
state_long %>%
  ggplot(aes(x = value, y = reorder(state, value, FUN = max))) +
  geom_line() +
  geom_point(aes(color = variable), size = 2) +
  scale_color_brewer(palette = "Set1", direction = -1) +
  labs(title = "Dumbbell Plot: Salary Range by State", x = "", y = "") +
  theme_minimal() +
  # Applied after theme_minimal() so these settings are not overwritten.
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)
  )

# Dumbbell chart of the lowest / mid / highest salary summaries per
# occupation title, ordered on the y-axis by the largest plotted value.
prof_long %>%
  ggplot(aes(x = value, y = reorder(OCC_TITLE, value, FUN = max))) +
  geom_line() +
  geom_point(aes(color = variable), size = 2) +
  scale_color_brewer(palette = "Set1", direction = -1) +
  labs(
    title = "Dumbbell Plot: Salary Range by Profession",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so these settings are not overwritten.
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)
  )

# Dumbbell chart of the low / mid / high salary summaries per industry
# (NAICS title), ordered on the y-axis by the largest plotted value.
field_long %>%
  ggplot(aes(x = value, y = reorder(NAICS_TITLE, value, FUN = max))) +
  geom_line() +
  geom_point(aes(color = variable), size = 2) +
  scale_color_brewer(palette = "Set1", direction = -1) +
  labs(
    title = "Dumbbell Plot: Salary Range by Industry",
    x = "",
    y = ""
  ) +
  theme_minimal() +
  # Applied after theme_minimal() so these settings are not overwritten.
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5)
  )

Sources

Salary Data:

https://www.bls.gov/oes/oes_research_2025.htm