I have introduced the term “Data Practitioner” as a generic job descriptor because we have so many different job role titles for individuals whose work activities overlap, including Data Scientist, Data Engineer, Data Analyst, Business Analyst, Data Architect, etc.
For this story, we will answer the question, “How much do we get paid?” Your analysis and data visualizations must address the variation in average salary based on role descriptor and state.
library(tidyverse)
library(httr)
library(jsonlite)
library(dplyr)
library(ggplot2)
library(reshape2)
library(usmap)
library(tinytex)
# Source CSV with the wage figures (raw file served from GitHub).
url <- "https://raw.githubusercontent.com/Stevee-G/Data608/refs/heads/main/Assignment4/2025_IPST_Wages.csv"

# Download and parse the CSV, then print a compact column-by-column preview.
df <- url %>% read_csv()
df %>% glimpse()
## Rows: 4,692
## Columns: 9
## $ AREA_TITLE <chr> "Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Ariz…
## $ NAICS_TITLE <chr> "Information", "Information", "Information", "Information"…
## $ OCC_TITLE <chr> "All Occupations", "Computer and Mathematical Occupations"…
## $ A_MEAN <dbl> 77490, 109620, 111890, 87980, 99360, 91730, 104240, 102440…
## $ A_PCT10 <dbl> 31000, 46570, 64620, 38800, 50490, 35680, 48740, 84450, 11…
## $ A_PCT25 <dbl> 43890, 71430, 83860, 57650, 84460, 48510, 62760, 92790, 12…
## $ A_MEDIAN <dbl> 63020, 104670, 121640, 88100, 102880, 76640, 97380, 104280…
## $ A_PCT75 <dbl> 95630, 145210, 129260, 103730, 105710, 123280, 133370, 104…
## $ A_PCT90 <dbl> 152460, 173930, 154660, 132520, 134120, 168010, 170760, 12…
# Per-state salary summary.
# Rows labelled "All Occupations" are excluded so only the specific
# occupation rows contribute. Each statistic is a plain (unweighted) mean of
# the corresponding wage column over the remaining rows of that state.
# na.rm = TRUE keeps a single missing wage value from turning the whole
# state's figure into NA.
state <- df %>%
  filter(OCC_TITLE != "All Occupations") %>%
  # Group directly on AREA_TITLE, exposing it under the name `state`
  # (the column name plot_usmap() is given below).
  group_by(state = AREA_TITLE) %>%
  summarize(
    mean = mean(A_MEAN, na.rm = TRUE),
    low = mean(A_PCT10, na.rm = TRUE),
    mid = mean(A_MEDIAN, na.rm = TRUE),
    high = mean(A_PCT90, na.rm = TRUE)
  )
glimpse(state)
## Rows: 54
## Columns: 5
## $ state <chr> "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Color…
## $ mean <dbl> 113884.38, 96585.00, 112939.59, 95482.59, 156705.04, 135602.50, …
## $ low <dbl> 65352.50, 60505.00, 68908.16, 50492.96, 93832.63, 83745.91, 7327…
## $ mid <dbl> 108798.96, 95339.17, 111492.04, 89648.52, 149069.27, 130490.11, …
## $ high <dbl> 166117.1, 131878.3, 160947.8, 147460.4, 227532.7, 193890.3, 1770…
# Long-format copy of the state summary: one row per state and statistic
# (low / mid / high), with the statistic name in `variable` and the amount
# in `value`. The overall `mean` column is dropped first.
# tidyr::pivot_longer() replaces reshape2::melt() (reshape2 is retired);
# fct_inorder keeps `variable` a factor with levels in column order
# (low, mid, high), which is what melt() produced and what the legend
# ordering depends on.
state_long <- state %>%
  select(-mean) %>%
  pivot_longer(
    cols = -state,
    names_to = "variable",
    values_to = "value",
    names_transform = list(variable = forcats::fct_inorder)
  ) %>%
  arrange(state)
glimpse(state_long)
## Rows: 162
## Columns: 3
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alaska", "Alaska", "Alaska"…
## $ variable <fct> low, mid, high, low, mid, high, low, mid, high, low, mid, hig…
## $ value <dbl> 65352.50, 108798.96, 166117.08, 60505.00, 95339.17, 131878.33…
# Per-occupation salary summary: unweighted mean of the 10th percentile,
# median and 90th percentile wage columns for every OCC_TITLE.
# Unlike the state and industry summaries, "All Occupations" is NOT filtered
# out here, so it appears as its own row in the result.
# NOTE(review): presumably kept on purpose as a comparison baseline - confirm.
# na.rm = TRUE keeps a single missing wage value from turning a whole
# occupation's figure into NA.
profession <- df %>%
  group_by(OCC_TITLE) %>%
  summarize(
    lowest = mean(A_PCT10, na.rm = TRUE),
    mid = mean(A_MEDIAN, na.rm = TRUE),
    highest = mean(A_PCT90, na.rm = TRUE)
  )
glimpse(profession)
## Rows: 8
## Columns: 4
## $ OCC_TITLE <chr> "All Occupations", "Computer and Information Research Scient…
## $ lowest <dbl> 39014.19, 109656.53, 59614.00, 78074.94, 77159.84, 102739.17…
## $ mid <dbl> 71381.14, 159471.53, 104755.26, 119377.93, 118928.93, 144561…
## $ highest <dbl> 148521.7, 232543.5, 168649.3, 177123.1, 161767.9, 192624.0, …
# Long-format copy of the occupation summary: one row per occupation and
# statistic (lowest / mid / highest), with the statistic name in `variable`
# and the amount in `value`.
# tidyr::pivot_longer() replaces reshape2::melt() (reshape2 is retired);
# fct_inorder keeps `variable` a factor with levels in column order, which
# is what melt() produced and what the legend ordering depends on.
prof_long <- profession %>%
  pivot_longer(
    cols = -OCC_TITLE,
    names_to = "variable",
    values_to = "value",
    names_transform = list(variable = forcats::fct_inorder)
  ) %>%
  arrange(OCC_TITLE)
glimpse(prof_long)
## Rows: 24
## Columns: 3
## $ OCC_TITLE <chr> "All Occupations", "All Occupations", "All Occupations", "Co…
## $ variable <fct> lowest, mid, highest, lowest, mid, highest, lowest, mid, hig…
## $ value <dbl> 39014.19, 71381.14, 148521.69, 109656.53, 159471.53, 232543.…
# Per-industry salary summary.
# Rows labelled "All Occupations" are excluded; each statistic is the
# unweighted mean of the wage column over the remaining rows that share a
# NAICS_TITLE.
# na.rm = TRUE keeps a single missing wage value from turning a whole
# industry's figure into NA.
field <- df %>%
  filter(OCC_TITLE != "All Occupations") %>%
  group_by(NAICS_TITLE) %>%
  summarize(
    low = mean(A_PCT10, na.rm = TRUE),
    mid = mean(A_MEDIAN, na.rm = TRUE),
    high = mean(A_PCT90, na.rm = TRUE)
  )
glimpse(field)
## Rows: 32
## Columns: 4
## $ NAICS_TITLE <chr> "Accounting, Tax Preparation, Bookkeeping, and Payroll Ser…
## $ low <dbl> 71430.41, 59201.64, 70773.23, 69768.00, 79632.37, 76727.97…
## $ mid <dbl> 109287.81, 96217.61, 112788.18, 119665.20, 123663.81, 1214…
## $ high <dbl> 168965.3, 147463.3, 173936.6, 193857.0, 176904.2, 178317.2…
# Long-format copy of the industry summary: one row per industry and
# statistic (low / mid / high), with the statistic name in `variable` and
# the amount in `value`.
# tidyr::pivot_longer() replaces reshape2::melt() (reshape2 is retired);
# fct_inorder keeps `variable` a factor with levels in column order, which
# is what melt() produced and what the legend ordering depends on.
field_long <- field %>%
  pivot_longer(
    cols = -NAICS_TITLE,
    names_to = "variable",
    values_to = "value",
    names_transform = list(variable = forcats::fct_inorder)
  ) %>%
  arrange(NAICS_TITLE)
glimpse(field_long)
## Rows: 96
## Columns: 3
## $ NAICS_TITLE <chr> "Accounting, Tax Preparation, Bookkeeping, and Payroll Ser…
## $ variable <fct> low, mid, high, low, mid, high, low, mid, high, low, mid, …
## $ value <dbl> 71430.41, 109287.81, 168965.34, 59201.64, 96217.61, 147463…
# US map shaded by the per-state `mean` column of `state`
# (light = lower, dark blue = higher), with black state borders.
plot_usmap(data = state, values = "mean", color = "black") +
  # scale_fill_gradient() is called directly: the low/high arguments belong
  # to the gradient scale, and scale_fill_continuous() only forwards to it
  # while the session's default continuous fill type is "gradient".
  # `labels` is spelled out in full instead of relying on partial matching
  # of `label`.
  scale_fill_gradient(
    low = "whitesmoke",
    high = "darkblue",
    name = "Average Salary",
    labels = scales::comma
  ) +
  labs(title = "Heatmap: Data Practitioner Salary Across the States") +
  # Legend on the right, title centred.
  theme(
    legend.position = "right",
    plot.title = element_text(hjust = 0.5)
  )
# Draw a dumbbell-style chart of salary ranges: one horizontal line per
# category connecting its points, with the points coloured by statistic.
#
# Arguments:
#   data     - long-format data frame holding the category column plus the
#              columns `variable` (statistic name) and `value` (amount).
#   category - name of the category column, as a string; shown on the y axis.
#   title    - plot title.
#
# Returns the ggplot object (printed automatically when called at top level).
plot_salary_range <- function(data, category, title) {
  ggplot(
    data,
    # Categories are ordered on the y axis by their largest value.
    aes(x = value, y = reorder(.data[[category]], value, FUN = max))
  ) +
    # Explicit grouping: one connecting line per category, independent of
    # ggplot's default grouping rules.
    geom_line(aes(group = .data[[category]])) +
    geom_point(aes(color = variable), size = 2) +
    # Comma-formatted tick labels, matching the map legend, instead of
    # bare or scientific-notation numbers.
    scale_x_continuous(labels = scales::comma) +
    scale_color_brewer(palette = "Set1", direction = -1) +
    labs(title = title, x = "", y = "") +
    theme_minimal() +
    theme(
      legend.position = "bottom",
      plot.title = element_text(hjust = 0.5)
    )
}

# Salary range by state.
plot_salary_range(
  state_long,
  "state",
  "Dumbbell Plot: Salary Range by State"
)

# Salary range by occupation.
plot_salary_range(
  prof_long,
  "OCC_TITLE",
  "Dumbbell Plot: Salary Range by Profession"
)

# Salary range by industry.
plot_salary_range(
  field_long,
  "NAICS_TITLE",
  "Dumbbell Plot: Salary Range by Industry"
)