# Story 4: DATA PRACTITIONER SALARY STORY "How Much Do We Get Paid?"
# Packages
pkgs <- c("ggplot2", "dplyr", "tidyr", "scales", "forcats", "ggtext", "ggridges")
for (p in pkgs) {
if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}
library(ggplot2)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
##
## col_factor
library(forcats)
library(ggtext)
library(ggridges)
## Warning: package 'ggridges' was built under R version 4.5.3
# DESIGN
BG <- "#0D1117"
PANEL <- "#161B22"
GRID_COL <- "#21262D"
BORDER <- "#30363D"
TXT_HI <- "#FFFFFF"
TXT_MED <- "#8B949E"
TXT_LOW <- "#484F58"
C_DS <- "#58A6FF" # Data Scientist — electric blue
C_DE <- "#3FB950" # Data Engineer — emerald
C_DA <- "#F78166" # Data Analyst — coral
C_BA <- "#D29922" # Business Analyst— amber
C_DAR <- "#BC8CFF" # Data Architect — violet
GOLD <- "#FFD700"
ROLE_COLORS <- c(
"Data Architect" = C_DAR,
"Data Scientist" = C_DS,
"Data Engineer" = C_DE,
"Business Analyst" = C_BA,
"Data Analyst" = C_DA
)
ROLE_LEVELS <- c("Data Analyst", "Business Analyst",
"Data Engineer", "Data Scientist", "Data Architect")
base_theme <- theme_minimal() +
theme(
plot.background = element_rect(fill = BG, color = NA),
panel.background = element_rect(fill = PANEL, color = NA),
panel.grid.major = element_line(color = GRID_COL, linewidth = 0.4),
panel.grid.minor = element_blank(),
axis.ticks = element_blank(),
axis.text = element_text(color = TXT_MED, family = "mono", size = 9),
axis.title = element_text(color = TXT_MED, family = "mono", size = 9),
plot.title = element_text(color = TXT_HI, family = "sans",
face = "bold", size = 17,
margin = margin(b = 4)),
plot.subtitle = element_text(color = TXT_MED, family = "mono",
size = 13, margin = margin(b = 14)),
plot.caption = element_text(color = TXT_LOW, family = "mono",
size = 7, margin = margin(t = 10)),
legend.background = element_rect(fill = PANEL, color = NA),
legend.key = element_rect(fill = PANEL, color = NA),
legend.text = element_text(color = TXT_MED, family = "mono", size = 8),
legend.title = element_text(color = TXT_HI, family = "mono",
size = 8, face = "bold"),
plot.margin = margin(20, 24, 14, 24)
)
# DATA loading
# Slide 1 & 4: National role-level salary data
# BLS OEWS 2024 · Glassdoor 2024 · ZipRecruiter 2024-25
roles_nat <- read_csv("roles_nat.csv") %>%
mutate(role = factor(role, levels = ROLE_LEVELS))
## Rows: 5 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): role
## dbl (4): avg, median, p25, p75
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Slide 2 & 5: Data Scientist salary by state (BLS OEWS May 2024)
ds_state <- read_csv("ds_state.csv")
## Rows: 48 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (1): salary
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Slide 3: Multi-role × state matrix (30 representative states)
# Sources: BLS + ZipRecruiter + Glassdoor blended estimates
state_role <- read_csv("state_role.csv")
## Rows: 30 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): abbr, name
## dbl (5): Data Scientist, Data Engineer, Data Analyst, Business Analyst, Data...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
state_role_long <- state_role %>%
pivot_longer(cols = all_of(ROLE_LEVELS),
names_to = "role",
values_to = "salary") %>%
mutate(role = factor(role, levels = ROLE_LEVELS))
# SLIDE 1 : Radial bar
radial_data <- roles_nat %>%
arrange(desc(avg)) %>%
mutate(role = factor(role, levels = role))
INNER <- 55000 # inner radius — bars grow outward from here
OUTER_MAX <- 115000 # tallest bar height above INNER
LABEL_R <- OUTER_MAX + 22000
p1 <- ggplot(radial_data, aes(x = role, y = avg - INNER, fill = role)) +
geom_col(width = 0.75, alpha = 0.90) +
# Salary label
geom_text(
aes(y = (avg - INNER) * 0.38,
label = paste0("$", round(avg / 1000), "K")),
color = TXT_HI, family = "mono", fontface = "bold",
size = 5.0, hjust = 0.5, vjust = 0.5,
show.legend = FALSE
) +
# Role title
geom_text(
aes(y = LABEL_R, label = role, color = role),
family = "mono", fontface = "bold",
size = 4.6, hjust = 0.5, vjust = 0.5,
show.legend = FALSE
) +
coord_polar(theta = "x", start = 0, clip = "off") +
scale_color_manual(values = ROLE_COLORS, guide = "none") +
scale_fill_manual(values = ROLE_COLORS, guide = "none") +
scale_y_continuous(limits = c(-30000, LABEL_R + 24000), expand = c(0, 0)) +
labs(
title = "Five Job Titles. One Family. Completely Different Paychecks.",
subtitle = "Two people working at the same company, the same data, taking home different paychecks",
x = NULL,
y = NULL,
caption = NULL
) +
base_theme +
theme(
axis.text = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0.5),
plot.margin = margin(60, 60, 60, 60)
)
ggsave("slide1_radial.png", p1,
width = 12, height = 12, dpi = 200, bg = BG)
p1

# SLIDE 2 : Ridgeline + Range bar
set.seed(42)
X_LO <- 45000
X_HI <- 285000
X_BREAKS <- seq(60000, 260000, by = 40000)
sim_salaries <- roles_nat %>%
rowwise() %>%
reframe({
mu <- log(median)
sigma <- (log(p75) - log(p25)) / (2 * 0.6745)
sims <- rlnorm(2500, meanlog = mu, sdlog = sigma)
sims <- pmax(pmin(sims, 310000), 40000)
data.frame(role = role, salary = sims)
}) %>%
mutate(role = factor(role, levels = ROLE_LEVELS))
role_annotations <- roles_nat %>%
select(role, median, avg, p25, p75) %>%
mutate(role = factor(role, levels = ROLE_LEVELS))
p2 <- ggplot(sim_salaries,
aes(x = salary, y = role, fill = role, group = role)) +
# LAYER 1: ridgeline density
geom_density_ridges(
aes(height = after_stat(density)),
stat = "density",
color = NA,
alpha = 0.18,
scale = 0.85,
rel_min_height = 0.01,
bandwidth = 6500
) +
# LAYER 2: IQR band
geom_segment(
data = role_annotations,
aes(x = p25, xend = p75, y = role, yend = role, color = role),
inherit.aes = FALSE,
linewidth = 9, alpha = 0.28, lineend = "round"
) +
# LAYER 3: median diamond
geom_point(
data = role_annotations,
aes(x = median, y = role),
inherit.aes = FALSE,
shape = 18, size = 7, color = GOLD, alpha = 0.95
) +
# LAYER 4: average dot
geom_point(
data = role_annotations,
aes(x = avg, y = role, color = role),
inherit.aes = FALSE,
size = 5.5, alpha = 0.95
) +
# LAYER 5: avg label above
geom_text(
data = role_annotations,
aes(x = avg, y = role, color = role,
label = paste0("avg $", round(avg / 1000), "K")),
inherit.aes = FALSE,
family = "mono", fontface = "bold", size = 2.9,
nudge_y = 0.22, show.legend = FALSE
) +
# LAYER 6: median label below
geom_text(
data = role_annotations,
aes(x = median, y = role,
label = paste0("med $", round(median / 1000), "K")),
inherit.aes = FALSE,
color = GOLD, family = "mono", fontface = "bold", size = 2.7,
nudge_y = -0.22
) +
# LAYER 7: p25 / p75 labels
geom_text(
data = role_annotations,
aes(x = p25, y = role, label = paste0("$", round(p25 / 1000), "K")),
inherit.aes = FALSE,
color = TXT_LOW, family = "mono", size = 2.1, nudge_y = -0.22
) +
geom_text(
data = role_annotations,
aes(x = p75, y = role, label = paste0("$", round(p75 / 1000), "K")),
inherit.aes = FALSE,
color = TXT_LOW, family = "mono", size = 2.1, nudge_y = -0.22
) +
scale_fill_manual(values = ROLE_COLORS, guide = "none") +
scale_color_manual(values = ROLE_COLORS, guide = "none") +
scale_x_continuous(
labels = label_dollar(scale = 1e-3, suffix = "K"),
limits = c(X_LO, X_HI),
breaks = X_BREAKS
) +
labs(
title = "The Average Is Just the Beginning",
subtitle = "For every role, median sits below average. Some roles have a very wide range",
y = NULL,
caption = "◆ = median ● = average band = 25th–75th percentile "
) +
base_theme +
theme(
panel.grid.major.y = element_blank(),
axis.text.y = element_text(size = 11, face = "bold", color = TXT_HI)
)
## Warning in geom_density_ridges(aes(height = after_stat(density)), stat =
## "density", : Ignoring unknown parameters: `bandwidth`
ggsave("slide2_combined.png", p2,
width = 13, height = 7.5, dpi = 200, bg = BG)
## Warning: Removed 247 rows containing non-finite outside the scale range
## (`stat_density()`).
p2
## Warning: Removed 247 rows containing non-finite outside the scale range
## (`stat_density()`).

# SLIDE 3: HEATMAP
state_order <- state_role %>%
arrange(desc(`Data Scientist`)) %>%
pull(abbr)
heat_data <- state_role_long %>%
mutate(
abbr = factor(abbr, levels = state_order),
# Reverse role order so Data Architect is at top of y-axis
role = factor(role, levels = rev(ROLE_LEVELS))
)
heat_pal <- c(
"#0d1b3e", "#1a3a6b", "#1d5b99",
"#2b80c5", "#58A6FF", "#3FB950", "#FFD700"
)
p3 <- ggplot(heat_data, aes(x = abbr, y = role, fill = salary)) +
geom_tile(color = BG, linewidth = 0.6) +
geom_text(aes(
label = paste0("$", round(salary / 1000), "K"),
color = ifelse(salary > 128000, BG, TXT_HI)
),
family = "mono", fontface = "bold", size = 2.2) +
scale_color_identity() +
scale_fill_gradientn(
colors = heat_pal,
values = scales::rescale(c(54000, 70000, 85000,
100000, 115000, 135000, 172000)),
labels = label_dollar(scale = 1e-3, suffix = "K"),
name = "Avg Annual Salary",
guide = guide_colorbar(
barwidth = 12,
barheight = 0.6,
title.position = "top",
ticks = FALSE,
direction = "horizontal",
title.theme = element_text(color = TXT_HI, family = "mono",
face = "bold", size = 8)
)
) +
scale_x_discrete(position = "bottom") +
scale_y_discrete(
labels = c(
"Data Architect" = "Data Architect",
"Data Scientist" = "Data Scientist",
"Data Engineer" = "Data Engineer",
"Business Analyst" = "Business Analyst",
"Data Analyst" = "Data Analyst"
)
) +
labs(
title = "Two Levers Move the Needle: Role and Location",
subtitle = paste0(
"The highest paying cell in this matrix is $116K above the lowest.\n",
"Your role and your state together are the two biggest decisions you'll ever make about your salary."
),
x = NULL,
y = NULL,
caption = NULL
) +
base_theme +
theme(
panel.grid = element_blank(),
axis.text.x = element_text(color = TXT_MED, size = 7.5, face = "bold",
angle = 45, hjust = 1),
axis.text.y = element_text(color = TXT_HI, size = 9, face = "bold"),
legend.position = "top",
legend.justification = "left",
legend.margin = margin(b = 6)
)
ggsave("slide3_heatmap.png", p3,
width = 16, height = 6, dpi = 200, bg = BG)
p3

# SLIDE 4 : Connected slope chart
top_states <- c("WA", "CA", "DC", "MA", "NJ")
bottom_states <- c("LA", "MS", "OK", "IN", "MO")
slope_data <- state_role_long %>%
filter(abbr %in% c(top_states, bottom_states)) %>%
mutate(
group = ifelse(abbr %in% top_states, "Top 5 States", "Bottom 5 States"),
group = factor(group, levels = c("Top 5 States", "Bottom 5 States")),
state_group = paste0(abbr, "_", group) # unique grouping key for geom_line
)
slope_summary <- slope_data %>%
group_by(role, group) %>%
summarise(mean_salary = mean(salary), .groups = "drop")
group_colors <- c("Top 5 States" = C_DE, "Bottom 5 States" = C_DA)
label_data <- slope_summary %>%
filter(role == "Data Architect") %>%
mutate(
label = paste0(group, "\n$", round(mean_salary / 1000), "K avg")
)
p4 <- ggplot(slope_data,
aes(x = role, y = salary,
group = state_group, color = group)) +
geom_line(alpha = 0.2, linewidth = 0.8) +
geom_point(alpha = 0.45, size = 2.8) +
geom_line(data = slope_summary,
aes(x = role, y = mean_salary,
group = group, color = group),
linewidth = 2.5, alpha = 0.95,
inherit.aes = FALSE) +
geom_point(data = slope_summary,
aes(x = role, y = mean_salary, color = group),
size = 6, alpha = 0.98,
inherit.aes = FALSE) +
geom_text(data = label_data,
aes(x = role, y = mean_salary, label = label, color = group),
hjust = -0.12,
family = "mono",
fontface = "bold",
size = 2.9,
show.legend = FALSE,
inherit.aes = FALSE) +
scale_color_manual(values = group_colors, name = NULL) +
scale_y_continuous(
labels = label_dollar(scale = 1e-3, suffix = "K"),
limits = c(48000, 215000),
breaks = seq(60000, 200000, by = 40000)
) +
scale_x_discrete(expand = expansion(add = c(0.3, 1.7))) +
labs(
title = "The State Gap Rivals the Role Gap : Know Both Levers",
subtitle = paste0("A Data Analyst in Washington earns more than a Data Scientist in Mississippi.\n",
"Location can outweigh a full role upgrade"),
x = NULL,
y = "Annual Salary (USD)",
caption = "Thick line = group average · Thin lines = individual states"
) +
base_theme +
theme(
axis.text.x = element_text(color = TXT_HI, size = 10.5, face = "bold"),
panel.grid.major.x = element_blank(),
legend.position = "none"
)
ggsave("slide4_slope.png", p4,
width = 13, height = 7, dpi = 200, bg = BG)
p4

# SLIDE 5 : Lollipop chart
NATIONAL_MEDIAN <- 112590 # BLS OEWS May 2024
lollipop_data <- ds_state %>%
mutate(
pct = (salary / NATIONAL_MEDIAN - 1) * 100,
label = tools::toTitleCase(state),
label = case_when(
state == "north carolina" ~ "N. Carolina",
state == "south carolina" ~ "S. Carolina",
state == "north dakota" ~ "N. Dakota",
state == "south dakota" ~ "S. Dakota",
state == "west virginia" ~ "W. Virginia",
state == "new hampshire" ~ "N. Hampshire",
state == "new mexico" ~ "N. Mexico",
state == "rhode island" ~ "R. Island",
TRUE ~ label
),
tier = case_when(
pct >= 15 ~ "Well Above",
pct >= 0 ~ "Above",
pct >= -15 ~ "Below",
TRUE ~ "Well Below"
),
tier = factor(tier, levels = c("Well Above","Above","Below","Well Below"))
) %>%
arrange(desc(pct)) %>%
mutate(label = factor(label, levels = label)) # left → right: highest to lowest
tier_colors <- c(
"Well Above" = C_DE,
"Above" = C_DS,
"Below" = C_DA,
"Well Below" = "#c0392b"
)
# Label every state with its % — rotated so they don't collide
p5 <- ggplot(lollipop_data,
aes(x = label, y = pct, color = tier)) +
# Zero reference line
geom_hline(yintercept = 0, color = GOLD,
linewidth = 0.9, linetype = "dashed") +
# Stems
geom_segment(aes(xend = label, y = 0, yend = pct),
linewidth = 0.7, alpha = 0.55) +
# Dots
geom_point(size = 3.5, alpha = 0.93) +
# % labels — above positive bars, below negative bars
geom_text(
aes(
y = pct + ifelse(pct >= 0, 2.2, -2.2),
label = paste0(ifelse(pct >= 0, "+", ""), round(pct, 0), "%"),
vjust = ifelse(pct >= 0, 0, 1)
),
family = "mono",
fontface = "bold",
size = 2.3,
show.legend = FALSE
) +
scale_color_manual(
values = tier_colors,
name = "vs. National Median",
guide = guide_legend(override.aes = list(size = 4),
direction = "horizontal",
title.position = "top")
) +
scale_y_continuous(
labels = function(x) paste0(ifelse(x >= 0, "+", ""), x, "%"),
limits = c(-50, 58),
breaks = seq(-40, 50, by = 10)
) +
# Key insight annotation — top left, large and legible
annotate("text",
x = 1.5, y = 54,
label = paste0(
"Washington leads at +41% · Mississippi lowest at -38% · ",
"A $90K gap separates the best and worst-paying states for Data Scientists"
),
hjust = 0, vjust = 1,
color = TXT_HI, family = "mono", fontface = "bold", size = 3.4,
lineheight = 1.3) +
labs(
title = "Some States Pay a 40% Bonus Just for Showing Up There",
subtitle = NULL,
x = NULL,
y = "% Difference from National Median",
caption = NULL
) +
base_theme +
theme(
axis.text.x = element_text(size = 7.0, angle = 45, hjust = 1,
color = TXT_MED),
panel.grid.major.x = element_blank(),
panel.grid.major.y = element_line(color = GRID_COL, linewidth = 0.4),
legend.position = "top",
legend.justification = "left"
)
ggsave("slide5_lollipop.png", p5,
width = 16, height = 7, dpi = 200, bg = BG)
p5

# SLIDE 6: Connected dot plot
# Classify states into three tiers for faceting
high_states <- c("WA", "CA", "DC", "MA", "NY")
mid_states <- c("TX", "CO", "FL", "GA", "OH")
low_states <- c("MS", "LA", "OK", "IN", "MO")
s6_states <- c(high_states, mid_states, low_states)
s6_data <- state_role_long %>%
filter(abbr %in% s6_states) %>%
mutate(
tier = case_when(
abbr %in% high_states ~ "High-Pay States",
abbr %in% mid_states ~ "Mid-Pay States",
TRUE ~ "Low-Pay States"
),
tier = factor(tier, levels = c("High-Pay States", "Mid-Pay States", "Low-Pay States")),
# Use full state name for labels
state_label = case_when(
abbr == "DC" ~ "DC",
TRUE ~ name
)
)
# National average per role — drawn as a reference line across all facets
nat_avg <- roles_nat %>%
select(role, avg) %>%
mutate(role = factor(role, levels = ROLE_LEVELS))
p6 <- ggplot(s6_data,
aes(x = role, y = salary, group = abbr)) +
# Faint connecting line per state
geom_line(aes(color = tier), alpha = 0.30, linewidth = 0.7) +
# Dots — role-coloured so both dimensions are encoded
geom_point(aes(color = role), size = 4.5, alpha = 0.90) +
# State abbreviation label on the rightmost role (Data Architect)
geom_text(
data = s6_data %>% filter(role == "Data Architect"),
aes(label = abbr, color = role),
hjust = -0.3, family = "mono", fontface = "bold",
size = 2.6, show.legend = FALSE
) +
# National average reference line — coloured by role so each line is identifiable
geom_hline(
data = nat_avg,
aes(yintercept = avg, color = role),
linewidth = 0.65,
linetype = "dashed",
alpha = 0.70,
inherit.aes = FALSE
) +
scale_color_manual(
values = c(
ROLE_COLORS,
"High-Pay States" = C_DE,
"Mid-Pay States" = C_DS,
"Low-Pay States" = C_DA
),
guide = "none"
) +
scale_y_continuous(
labels = label_dollar(scale = 1e-3, suffix = "K"),
breaks = seq(60000, 180000, by = 30000)
) +
scale_x_discrete(
labels = c("Analyst", "Biz
Analyst", "Engineer", "Scientist", "Architect"),
expand = expansion(add = c(0.3, 1.0))
) +
facet_wrap(~ tier, ncol = 3) +
labs(
title = "When plotted together",
subtitle = paste0("The gap between a Data Analyst in a low pay state and a Data Architect in a high pay state is over $116,000 a year.\n",
"Role and location aren't independent choices, they compound."),
x = NULL,
y = "Annual Salary (USD)",
caption = "Dot colour = role · Gold dashed line = national role average · Each line = one state"
) +
base_theme +
theme(
strip.background = element_rect(fill = GRID_COL, color = NA),
strip.text = element_text(color = TXT_HI, family = "mono",
face = "bold", size = 10),
axis.text.x = element_text(color = TXT_HI, size = 8.5, face = "bold"),
panel.grid.major.x = element_blank(),
panel.spacing = unit(1.5, "lines")
)
## Warning in geom_hline(data = nat_avg, aes(yintercept = avg, color = role), :
## Ignoring unknown parameters: `inherit.aes`
ggsave("slide6_role_x_state.png", p6,
width = 14, height = 7, dpi = 200, bg = BG)
p6
