Data608_Story4.knit

# Story 4: DATA PRACTITIONER SALARY STORY "How Much Do We Get Paid?"
# Packages

pkgs <- c("ggplot2", "dplyr", "tidyr", "scales", "forcats", "ggtext", "ggridges")
for (p in pkgs) {
  if (!requireNamespace(p, quietly = TRUE)) install.packages(p)
}

library(ggplot2)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

library(forcats)
library(ggtext)
library(ggridges)

## Warning: package 'ggridges' was built under R version 4.5.3

# DESIGN

BG       <- "#0D1117"
PANEL    <- "#161B22"
GRID_COL <- "#21262D"
BORDER   <- "#30363D"
TXT_HI   <- "#FFFFFF"
TXT_MED  <- "#8B949E"
TXT_LOW  <- "#484F58"

C_DS   <- "#58A6FF"   # Data Scientist  — electric blue
C_DE   <- "#3FB950"   # Data Engineer   — emerald
C_DA   <- "#F78166"   # Data Analyst    — coral
C_BA   <- "#D29922"   # Business Analyst— amber
C_DAR  <- "#BC8CFF"   # Data Architect  — violet
GOLD   <- "#FFD700"

ROLE_COLORS <- c(
  "Data Architect"    = C_DAR,
  "Data Scientist"    = C_DS,
  "Data Engineer"     = C_DE,
  "Business Analyst"  = C_BA,
  "Data Analyst"      = C_DA
)

ROLE_LEVELS <- c("Data Analyst", "Business Analyst",
                 "Data Engineer", "Data Scientist", "Data Architect")

base_theme <- theme_minimal() +
  theme(
    plot.background    = element_rect(fill = BG,    color = NA),
    panel.background   = element_rect(fill = PANEL, color = NA),
    panel.grid.major   = element_line(color = GRID_COL, linewidth = 0.4),
    panel.grid.minor   = element_blank(),
    axis.ticks         = element_blank(),
    axis.text          = element_text(color = TXT_MED, family = "mono", size = 9),
    axis.title         = element_text(color = TXT_MED, family = "mono", size = 9),
    plot.title         = element_text(color = TXT_HI,  family = "sans",
                                      face = "bold", size = 17,
                                      margin = margin(b = 4)),
    plot.subtitle      = element_text(color = TXT_MED, family = "mono",
                                      size = 13, margin = margin(b = 14)),
    plot.caption       = element_text(color = TXT_LOW, family = "mono",
                                      size = 7,   margin = margin(t = 10)),
    legend.background  = element_rect(fill = PANEL, color = NA),
    legend.key         = element_rect(fill = PANEL, color = NA),
    legend.text        = element_text(color = TXT_MED, family = "mono", size = 8),
    legend.title       = element_text(color = TXT_HI,  family = "mono",
                                      size = 8, face = "bold"),
    plot.margin        = margin(20, 24, 14, 24)
  )

# DATA loading

# Slide 1 & 4: National role-level salary data
# BLS OEWS 2024 · Glassdoor 2024 · ZipRecruiter 2024-25
roles_nat <- read_csv("roles_nat.csv") %>%
  mutate(role = factor(role, levels = ROLE_LEVELS))

## Rows: 5 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): role
## dbl (4): avg, median, p25, p75
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Slide 2 & 5: Data Scientist salary by state (BLS OEWS May 2024)
ds_state <- read_csv("ds_state.csv")

## Rows: 48 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (1): salary
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Slide 3: Multi-role × state matrix (30 representative states)
# Sources: BLS + ZipRecruiter + Glassdoor blended estimates
state_role <- read_csv("state_role.csv")

## Rows: 30 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): abbr, name
## dbl (5): Data Scientist, Data Engineer, Data Analyst, Business Analyst, Data...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

state_role_long <- state_role %>%
  pivot_longer(cols = all_of(ROLE_LEVELS),
               names_to = "role",
               values_to = "salary") %>%
  mutate(role = factor(role, levels = ROLE_LEVELS))

# SLIDE 1 : Radial bar


radial_data <- roles_nat %>%
  arrange(desc(avg)) %>%
  mutate(role = factor(role, levels = role))

INNER <- 55000   # inner radius — bars grow outward from here
OUTER_MAX <- 115000  # tallest bar height above INNER

LABEL_R <- OUTER_MAX + 22000

p1 <- ggplot(radial_data, aes(x = role, y = avg - INNER, fill = role)) +
  geom_col(width = 0.75, alpha = 0.90) +
  # Salary label
  geom_text(
    aes(y = (avg - INNER) * 0.38,
        label = paste0("$", round(avg / 1000), "K")),
    color = TXT_HI, family = "mono", fontface = "bold",
    size = 5.0, hjust = 0.5, vjust = 0.5,
    show.legend = FALSE
  ) +
  # Role title
  geom_text(
    aes(y = LABEL_R, label = role, color = role),
    family = "mono", fontface = "bold",
    size = 4.6, hjust = 0.5, vjust = 0.5,
    show.legend = FALSE
  ) +
  coord_polar(theta = "x", start = 0, clip = "off") +
  scale_color_manual(values = ROLE_COLORS, guide = "none") +
  scale_fill_manual(values = ROLE_COLORS, guide = "none") +
  scale_y_continuous(limits = c(-30000, LABEL_R + 24000), expand = c(0, 0)) +
  labs(
    title    = "Five Job Titles. One Family. Completely Different Paychecks.",
    subtitle = "Two people working at the same company, the same data, taking home different paychecks",
    x        = NULL,
    y        = NULL,
    caption  = NULL
  ) +
  base_theme +
  theme(
    axis.text     = element_blank(),
    panel.grid    = element_blank(),
    plot.title    = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption  = element_text(hjust = 0.5),
    plot.margin   = margin(60, 60, 60, 60)
  )

ggsave("slide1_radial.png", p1,
       width = 12, height = 12, dpi = 200, bg = BG)
p1

# SLIDE 2 : Ridgeline + Range bar


set.seed(42)

X_LO    <- 45000
X_HI    <- 285000
X_BREAKS <- seq(60000, 260000, by = 40000)

sim_salaries <- roles_nat %>%
  rowwise() %>%
  reframe({
    mu    <- log(median)
    sigma <- (log(p75) - log(p25)) / (2 * 0.6745)
    sims  <- rlnorm(2500, meanlog = mu, sdlog = sigma)
    sims  <- pmax(pmin(sims, 310000), 40000)
    data.frame(role = role, salary = sims)
  }) %>%
  mutate(role = factor(role, levels = ROLE_LEVELS))

role_annotations <- roles_nat %>%
  select(role, median, avg, p25, p75) %>%
  mutate(role = factor(role, levels = ROLE_LEVELS))

p2 <- ggplot(sim_salaries,
             aes(x = salary, y = role, fill = role, group = role)) +
  # LAYER 1: ridgeline density
  geom_density_ridges(
    aes(height = after_stat(density)),
    stat           = "density",
    color          = NA,
    alpha          = 0.18,
    scale          = 0.85,
    rel_min_height = 0.01,
    bandwidth      = 6500
  ) +
  # LAYER 2: IQR band
  geom_segment(
    data        = role_annotations,
    aes(x = p25, xend = p75, y = role, yend = role, color = role),
    inherit.aes = FALSE,
    linewidth   = 9, alpha = 0.28, lineend = "round"
  ) +
  # LAYER 3: median diamond
  geom_point(
    data        = role_annotations,
    aes(x = median, y = role),
    inherit.aes = FALSE,
    shape = 18, size = 7, color = GOLD, alpha = 0.95
  ) +
  # LAYER 4: average dot
  geom_point(
    data        = role_annotations,
    aes(x = avg, y = role, color = role),
    inherit.aes = FALSE,
    size = 5.5, alpha = 0.95
  ) +
  # LAYER 5: avg label above
  geom_text(
    data        = role_annotations,
    aes(x = avg, y = role, color = role,
        label = paste0("avg $", round(avg / 1000), "K")),
    inherit.aes = FALSE,
    family = "mono", fontface = "bold", size = 2.9,
    nudge_y = 0.22, show.legend = FALSE
  ) +
  # LAYER 6: median label below
  geom_text(
    data        = role_annotations,
    aes(x = median, y = role,
        label = paste0("med $", round(median / 1000), "K")),
    inherit.aes = FALSE,
    color = GOLD, family = "mono", fontface = "bold", size = 2.7,
    nudge_y = -0.22
  ) +
  # LAYER 7: p25 / p75 labels 
  geom_text(
    data        = role_annotations,
    aes(x = p25, y = role, label = paste0("$", round(p25 / 1000), "K")),
    inherit.aes = FALSE,
    color = TXT_LOW, family = "mono", size = 2.1, nudge_y = -0.22
  ) +
  geom_text(
    data        = role_annotations,
    aes(x = p75, y = role, label = paste0("$", round(p75 / 1000), "K")),
    inherit.aes = FALSE,
    color = TXT_LOW, family = "mono", size = 2.1, nudge_y = -0.22
  ) +
  scale_fill_manual(values = ROLE_COLORS,  guide = "none") +
  scale_color_manual(values = ROLE_COLORS, guide = "none") +
  scale_x_continuous(
    labels = label_dollar(scale = 1e-3, suffix = "K"),
    limits = c(X_LO, X_HI),
    breaks = X_BREAKS
  ) +
  labs(
    title    = "The Average Is Just the Beginning",
    subtitle = "For every role, median sits below average. Some roles have a very wide range",
    y       = NULL,
    caption = "◆ = median  ● = average  band = 25th–75th percentile "
  ) +
  base_theme +
  theme(
    panel.grid.major.y = element_blank(),
    axis.text.y        = element_text(size = 11, face = "bold", color = TXT_HI)
  )

## Warning in geom_density_ridges(aes(height = after_stat(density)), stat =
## "density", : Ignoring unknown parameters: `bandwidth`

ggsave("slide2_combined.png", p2,
       width = 13, height = 7.5, dpi = 200, bg = BG)

## Warning: Removed 247 rows containing non-finite outside the scale range
## (`stat_density()`).

p2

## Warning: Removed 247 rows containing non-finite outside the scale range
## (`stat_density()`).

# SLIDE 3: HEATMAP 


state_order <- state_role %>%
  arrange(desc(`Data Scientist`)) %>%
  pull(abbr)

heat_data <- state_role_long %>%
  mutate(
    abbr = factor(abbr, levels = state_order),
    # Reverse role order so Data Architect is at top of y-axis
    role = factor(role, levels = rev(ROLE_LEVELS))
  )

heat_pal <- c(
  "#0d1b3e", "#1a3a6b", "#1d5b99",
  "#2b80c5", "#58A6FF", "#3FB950", "#FFD700"
)

p3 <- ggplot(heat_data, aes(x = abbr, y = role, fill = salary)) +
  geom_tile(color = BG, linewidth = 0.6) +
  geom_text(aes(
    label = paste0("$", round(salary / 1000), "K"),
    color = ifelse(salary > 128000, BG, TXT_HI)
  ),
  family = "mono", fontface = "bold", size = 2.2) +
  scale_color_identity() +
  scale_fill_gradientn(
    colors   = heat_pal,
    values   = scales::rescale(c(54000, 70000, 85000,
                                 100000, 115000, 135000, 172000)),
    labels   = label_dollar(scale = 1e-3, suffix = "K"),
    name     = "Avg Annual Salary",
    guide    = guide_colorbar(
      barwidth       = 12,
      barheight      = 0.6,
      title.position = "top",
      ticks          = FALSE,
      direction      = "horizontal",
      title.theme    = element_text(color = TXT_HI, family = "mono",
                                    face = "bold", size = 8)
    )
  ) +
  scale_x_discrete(position = "bottom") +
  scale_y_discrete(
    labels = c(
      "Data Architect"    = "Data Architect",
      "Data Scientist"    = "Data Scientist",
      "Data Engineer"     = "Data Engineer",
      "Business Analyst"  = "Business Analyst",
      "Data Analyst"      = "Data Analyst"
    )
  ) +
  labs(
    title    = "Two Levers Move the Needle: Role and Location",
    subtitle = paste0(
  "The highest paying cell in this matrix is $116K above the lowest.\n",
  "Your role and your state together are the two biggest decisions you'll ever make about your salary."
),
    x       = NULL,
    y       = NULL,
    caption = NULL
  ) +
  base_theme +
  theme(
    panel.grid      = element_blank(),
    axis.text.x     = element_text(color = TXT_MED, size = 7.5, face = "bold",
                                   angle = 45, hjust = 1),
    axis.text.y     = element_text(color = TXT_HI,  size = 9,   face = "bold"),
    legend.position = "top",
    legend.justification = "left",
    legend.margin   = margin(b = 6)
  )

ggsave("slide3_heatmap.png", p3,
       width = 16, height = 6, dpi = 200, bg = BG)

p3

# SLIDE 4 : Connected slope chart


top_states    <- c("WA", "CA", "DC", "MA", "NJ")
bottom_states <- c("LA", "MS", "OK", "IN", "MO")

slope_data <- state_role_long %>%
  filter(abbr %in% c(top_states, bottom_states)) %>%
  mutate(
    group = ifelse(abbr %in% top_states, "Top 5 States", "Bottom 5 States"),
    group = factor(group, levels = c("Top 5 States", "Bottom 5 States")),
    state_group = paste0(abbr, "_", group)   # unique grouping key for geom_line
  )

slope_summary <- slope_data %>%
  group_by(role, group) %>%
  summarise(mean_salary = mean(salary), .groups = "drop")

group_colors <- c("Top 5 States" = C_DE, "Bottom 5 States" = C_DA)

label_data <- slope_summary %>%
  filter(role == "Data Architect") %>%
  mutate(
    label = paste0(group, "\n$", round(mean_salary / 1000), "K avg")
  )

p4 <- ggplot(slope_data,
             aes(x = role, y = salary,
                 group = state_group, color = group)) +
  geom_line(alpha = 0.2, linewidth = 0.8) +
  geom_point(alpha = 0.45, size = 2.8) +
  geom_line(data = slope_summary,
            aes(x = role, y = mean_salary,
                group = group, color = group),
            linewidth = 2.5, alpha = 0.95,
            inherit.aes = FALSE) +
  geom_point(data = slope_summary,
             aes(x = role, y = mean_salary, color = group),
             size = 6, alpha = 0.98,
             inherit.aes = FALSE) +
  geom_text(data = label_data,
            aes(x = role, y = mean_salary, label = label, color = group),
            hjust       = -0.12,
            family      = "mono",
            fontface    = "bold",
            size        = 2.9,
            show.legend = FALSE,
            inherit.aes = FALSE) +
  scale_color_manual(values = group_colors, name = NULL) +
  scale_y_continuous(
    labels = label_dollar(scale = 1e-3, suffix = "K"),
    limits = c(48000, 215000),
    breaks = seq(60000, 200000, by = 40000)
  ) +
  scale_x_discrete(expand = expansion(add = c(0.3, 1.7))) +
  labs(
    title    = "The State Gap Rivals the Role Gap : Know Both Levers",
    subtitle = paste0("A Data Analyst in Washington earns more than a Data Scientist in Mississippi.\n",
                      "Location can outweigh a full role upgrade"),
    x       = NULL,
    y       = "Annual Salary (USD)",
    caption = "Thick line = group average  ·  Thin lines = individual states"
  ) +
  base_theme +
  theme(
    axis.text.x        = element_text(color = TXT_HI, size = 10.5, face = "bold"),
    panel.grid.major.x = element_blank(),
    legend.position    = "none"
  )

ggsave("slide4_slope.png", p4,
       width = 13, height = 7, dpi = 200, bg = BG)

p4

# SLIDE 5 : Lollipop chart


NATIONAL_MEDIAN <- 112590   # BLS OEWS May 2024

lollipop_data <- ds_state %>%
  mutate(
    pct   = (salary / NATIONAL_MEDIAN - 1) * 100,
    label = tools::toTitleCase(state),
    label = case_when(
      state == "north carolina" ~ "N. Carolina",
      state == "south carolina" ~ "S. Carolina",
      state == "north dakota"   ~ "N. Dakota",
      state == "south dakota"   ~ "S. Dakota",
      state == "west virginia"  ~ "W. Virginia",
      state == "new hampshire"  ~ "N. Hampshire",
      state == "new mexico"     ~ "N. Mexico",
      state == "rhode island"   ~ "R. Island",
      TRUE                      ~ label
    ),
    tier = case_when(
      pct >= 15  ~ "Well Above",
      pct >= 0   ~ "Above",
      pct >= -15 ~ "Below",
      TRUE       ~ "Well Below"
    ),
    tier = factor(tier, levels = c("Well Above","Above","Below","Well Below"))
  ) %>%
  arrange(desc(pct)) %>%
  mutate(label = factor(label, levels = label))   # left → right: highest to lowest

tier_colors <- c(
  "Well Above" = C_DE,
  "Above"      = C_DS,
  "Below"      = C_DA,
  "Well Below" = "#c0392b"
)

# Label every state with its % — rotated so they don't collide
p5 <- ggplot(lollipop_data,
             aes(x = label, y = pct, color = tier)) +
  # Zero reference line
  geom_hline(yintercept = 0, color = GOLD,
             linewidth = 0.9, linetype = "dashed") +
  # Stems
  geom_segment(aes(xend = label, y = 0, yend = pct),
               linewidth = 0.7, alpha = 0.55) +
  # Dots
  geom_point(size = 3.5, alpha = 0.93) +
  # % labels — above positive bars, below negative bars
  geom_text(
    aes(
      y     = pct + ifelse(pct >= 0, 2.2, -2.2),
      label = paste0(ifelse(pct >= 0, "+", ""), round(pct, 0), "%"),
      vjust = ifelse(pct >= 0, 0, 1)
    ),
    family      = "mono",
    fontface    = "bold",
    size        = 2.3,
    show.legend = FALSE
  ) +
  scale_color_manual(
    values = tier_colors,
    name   = "vs. National Median",
    guide  = guide_legend(override.aes = list(size = 4),
                          direction = "horizontal",
                          title.position = "top")
  ) +
  scale_y_continuous(
    labels = function(x) paste0(ifelse(x >= 0, "+", ""), x, "%"),
    limits = c(-50, 58),
    breaks = seq(-40, 50, by = 10)
  ) +
  # Key insight annotation — top left, large and legible
  annotate("text",
           x = 1.5, y = 54,
           label = paste0(
             "Washington leads at +41%  ·  Mississippi lowest at -38%  ·  ",
             "A $90K gap separates the best and worst-paying states for Data Scientists"
           ),
           hjust = 0, vjust = 1,
           color = TXT_HI, family = "mono", fontface = "bold", size = 3.4,
           lineheight = 1.3) +
  labs(
    title    = "Some States Pay a 40% Bonus Just for Showing Up There",
    subtitle = NULL,
    x        = NULL,
    y        = "% Difference from National Median",
    caption  = NULL
  ) +
  base_theme +
  theme(
    axis.text.x      = element_text(size = 7.0, angle = 45, hjust = 1,
                                    color = TXT_MED),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = GRID_COL, linewidth = 0.4),
    legend.position    = "top",
    legend.justification = "left"
  )

ggsave("slide5_lollipop.png", p5,
       width = 16, height = 7, dpi = 200, bg = BG)
p5

# SLIDE 6: Connected dot plot 


# Classify states into three tiers for faceting
high_states <- c("WA", "CA", "DC", "MA", "NY")
mid_states  <- c("TX", "CO", "FL", "GA", "OH")
low_states  <- c("MS", "LA", "OK", "IN", "MO")

s6_states <- c(high_states, mid_states, low_states)

s6_data <- state_role_long %>%
  filter(abbr %in% s6_states) %>%
  mutate(
    tier = case_when(
      abbr %in% high_states ~ "High-Pay States",
      abbr %in% mid_states  ~ "Mid-Pay States",
      TRUE                  ~ "Low-Pay States"
    ),
    tier = factor(tier, levels = c("High-Pay States", "Mid-Pay States", "Low-Pay States")),
    # Use full state name for labels
    state_label = case_when(
      abbr == "DC" ~ "DC",
      TRUE ~ name
    )
  )

# National average per role — drawn as a reference line across all facets
nat_avg <- roles_nat %>%
  select(role, avg) %>%
  mutate(role = factor(role, levels = ROLE_LEVELS))

p6 <- ggplot(s6_data,
             aes(x = role, y = salary, group = abbr)) +
  # Faint connecting line per state
  geom_line(aes(color = tier), alpha = 0.30, linewidth = 0.7) +
  # Dots — role-coloured so both dimensions are encoded
  geom_point(aes(color = role), size = 4.5, alpha = 0.90) +
  # State abbreviation label on the rightmost role (Data Architect)
  geom_text(
    data = s6_data %>% filter(role == "Data Architect"),
    aes(label = abbr, color = role),
    hjust = -0.3, family = "mono", fontface = "bold",
    size = 2.6, show.legend = FALSE
  ) +
  # National average reference line — coloured by role so each line is identifiable
  geom_hline(
    data        = nat_avg,
    aes(yintercept = avg, color = role),
    linewidth   = 0.65,
    linetype    = "dashed",
    alpha       = 0.70,
    inherit.aes = FALSE
  ) +
  scale_color_manual(
    values = c(
      ROLE_COLORS,
      "High-Pay States" = C_DE,
      "Mid-Pay States"  = C_DS,
      "Low-Pay States"  = C_DA
    ),
    guide = "none"
  ) +
  scale_y_continuous(
    labels = label_dollar(scale = 1e-3, suffix = "K"),
    breaks = seq(60000, 180000, by = 30000)
  ) +
  scale_x_discrete(
    labels = c("Analyst", "Biz
Analyst", "Engineer", "Scientist", "Architect"),
    expand = expansion(add = c(0.3, 1.0))
  ) +
  facet_wrap(~ tier, ncol = 3) +
  labs(
    title    = "When plotted together",
    subtitle = paste0("The gap between a Data Analyst in a low pay state and a Data Architect in a high pay state is over $116,000 a year.\n",
                      "Role and location aren't independent choices, they compound."),
    x       = NULL,
    y       = "Annual Salary (USD)",
    caption = "Dot colour = role  ·  Gold dashed line = national role average  ·  Each line = one state"
  ) +
  base_theme +
  theme(
    strip.background = element_rect(fill = GRID_COL, color = NA),
    strip.text       = element_text(color = TXT_HI, family = "mono",
                                    face = "bold", size = 10),
    axis.text.x      = element_text(color = TXT_HI, size = 8.5, face = "bold"),
    panel.grid.major.x = element_blank(),
    panel.spacing    = unit(1.5, "lines")
  )

## Warning in geom_hline(data = nat_avg, aes(yintercept = avg, color = role), :
## Ignoring unknown parameters: `inherit.aes`

ggsave("slide6_role_x_state.png", p6,
       width = 14, height = 7, dpi = 200, bg = BG)
p6