Author

Henock Montcho

Published

May 22, 2026

Code
# Load libraries
library(tidytext)
library(readxl)
library(tidyverse)
library(sf)
library(tigris)
library(viridis)
library(usmap) 
library(ggplot2)
library(forcats)
library(scales)
library(dplyr)

# NOTE(review): setwd() with a machine-specific absolute path makes the script
# non-portable. It is kept unchanged here because code outside this view may
# rely on the working directory.
setwd("C:/Users/month/Downloads")

# Load the Data Scientist salary table (columns State and Salary).
Data_Scientist <- read_excel("C:/Users/month/Downloads/Data_Scientist.xlsx")

# Strip "$", "," and whitespace before converting to numeric. The POSIX class
# [:space:] is used because R's default regex engine does not interpret "\\s"
# inside a bracket expression (there it matches a literal backslash or "s").
Data_Scientist$Salary <- as.numeric(
  gsub("[$,[:space:]]", "", Data_Scientist$Salary)
)
head(Data_Scientist)
# A tibble: 6 × 2
  State      Salary
  <chr>       <dbl>
1 Alabama    111249
2 Alaska     132182
3 Arizona    114378
4 Arkansas   101493
5 California 121131
6 Colorado   129062
Code
# Load the Data Engineer salary table (columns State and Salary).
Data_Engineer <- read_excel("C:/Users/month/Downloads/Data_Engineer.xlsx")

# Strip "$", "," and whitespace before converting to numeric. The POSIX class
# [:space:] is used because R's default regex engine does not interpret "\\s"
# inside a bracket expression (there it matches a literal backslash or "s").
Data_Engineer$Salary <- as.numeric(
  gsub("[$,[:space:]]", "", Data_Engineer$Salary)
)
head(Data_Engineer)
# A tibble: 6 × 2
  State      Salary
  <chr>       <dbl>
1 Alabama    117573
2 Alaska     139697
3 Arizona    120881
4 Arkansas   107263
5 California 128018
6 Colorado   136399
Code
# Load the Data Analyst salary table (columns State and Salary).
Data_Analyst <- read_excel("C:/Users/month/Downloads/Data_Analyst.xlsx")

# Strip "$", "," and whitespace before converting to numeric. The POSIX class
# [:space:] is used because R's default regex engine does not interpret "\\s"
# inside a bracket expression (there it matches a literal backslash or "s").
Data_Analyst$Salary <- as.numeric(
  gsub("[$,[:space:]]", "", Data_Analyst$Salary)
)
head(Data_Analyst)
# A tibble: 6 × 2
  State      Salary
  <chr>       <dbl>
1 Alabama     74904
2 Alaska      88999
3 Arizona     77011
4 Arkansas    68335
5 California  81558
6 Colorado    86898
Code
# Load the Business Analyst salary table (columns State and Salary).
Business_Analyst <- read_excel("C:/Users/month/Downloads/Business_Analyst.xlsx")

# Strip "$", "," and whitespace before converting to numeric. The POSIX class
# [:space:] is used because R's default regex engine does not interpret "\\s"
# inside a bracket expression (there it matches a literal backslash or "s").
Business_Analyst$Salary <- as.numeric(
  gsub("[$,[:space:]]", "", Business_Analyst$Salary)
)
head(Business_Analyst)
# A tibble: 6 × 2
  State      Salary
  <chr>       <dbl>
1 Alabama     89426
2 Alaska     106253
3 Arizona     91942
4 Arkansas    81584
5 California  97370
6 Colorado   103744
Code
# Combine the four role tables into one wide table keyed by State.
# Each Salary column is renamed to its role *before* joining, so the result
# does not depend on the suffixes dplyr generates for colliding column names
# (Salary.x, Salary.y, Salary.x.x, Salary.y.y).
Data_Practitioner <- Data_Scientist |>
  rename(Data_Scientist = Salary) |>
  left_join(rename(Data_Engineer, Data_Engineer = Salary), by = "State") |>
  left_join(rename(Data_Analyst, Data_Analyst = Salary), by = "State") |>
  left_join(rename(Business_Analyst, Business_Analyst = Salary), by = "State")

# Convert to long: one row per State/role pair.

Data_Practitioner_Salaries <- Data_Practitioner |>
  pivot_longer(
    cols = c(Data_Scientist, Data_Engineer, Data_Analyst, Business_Analyst),
    names_to = "role",
    values_to = "salary") |>
  mutate(
    # Trimmed lower-case-named copy of State; the original State column is kept.
    state = str_trim(as.character(State)),
    salary = as.numeric(salary)
  )

head(Data_Practitioner_Salaries)
# A tibble: 6 × 4
  State   role             salary state  
  <chr>   <chr>             <dbl> <chr>  
1 Alabama Data_Scientist   111249 Alabama
2 Alabama Data_Engineer    117573 Alabama
3 Alabama Data_Analyst      74904 Alabama
4 Alabama Business_Analyst  89426 Alabama
5 Alaska  Data_Scientist   132182 Alaska 
6 Alaska  Data_Engineer    139697 Alaska 
Code
# Box plots of the salary distribution, one panel per role

# Fill colours handed to the manual fill scale (one per role).
palette <- c("#9BB8AD", "#A39FA1", "#DEB3A0", "#FEC6AF")

bp_role <- Data_Practitioner_Salaries |>
  ggplot(aes(x = " ", y = salary, group = role)) +
  geom_boxplot(aes(fill = role)) +
  facet_grid(. ~ role) +
  scale_y_continuous(labels = label_comma()) +
  scale_fill_manual(values = palette) +
  theme_minimal() +
  # The facet strips already name each role, so the legend is hidden.
  theme(
    legend.position = "none",
    text = element_text(size = 12),
    axis.title = element_text(size = 12)
  )
bp_role

Comment:

Code
# One two-column table (role salary, State) per role, taken from the wide table.
Data_Scientist1 <- Data_Practitioner |> select(Data_Scientist, State)
Data_Engineer1 <- Data_Practitioner |> select(Data_Engineer, State)
Data_Analyst1 <- Data_Practitioner |> select(Data_Analyst, State)
Business_Analyst1 <- Data_Practitioner |> select(Business_Analyst, State)


# Bar chart Data Scientist: horizontal bars, states ordered by salary.

# Columns are referenced by bare name inside aes(); `df$col` bypasses ggplot2's
# data masking and triggers a "use of `$` is discouraged" warning.
bc_DS <- ggplot(
  Data_Scientist1,
  aes(
    x = fct_reorder(State, Data_Scientist),
    y = Data_Scientist,
    fill = State
  )
) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 10)
  ) +
  labs(
    title = "US Average Data Scientist Salaries by State / Territory",
    x = "State or Territory",
    y = "Annual Average Salary"
  )

bc_DS

Code
# Bar Chart Data Engineer: horizontal bars, states ordered by salary.

# Columns are referenced by bare name inside aes(); `df$col` bypasses ggplot2's
# data masking and triggers a "use of `$` is discouraged" warning.
bc_DE <- ggplot(
  Data_Engineer1,
  aes(
    x = fct_reorder(State, Data_Engineer),
    y = Data_Engineer,
    fill = State
  )
) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 10)
  ) +
  labs(
    title = "US Average Data Engineer Salaries by State / Territory",
    x = "State or Territory",
    y = "Annual Average Salary"
  )

bc_DE

Code
# Bar Chart Data Analyst: horizontal bars, states ordered by salary.
# Columns are referenced by bare name inside aes(); `df$col` bypasses ggplot2's
# data masking and triggers a "use of `$` is discouraged" warning.
bc_DA <- ggplot(
  Data_Analyst1,
  aes(
    x = fct_reorder(State, Data_Analyst),
    y = Data_Analyst,
    fill = State
  )
) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 10)
  ) +
  labs(
    title = "US Average Data Analyst Salaries by State / Territory",
    x = "State or Territory",
    y = "Annual Average Salary"
  )

bc_DA

Code
# Bar Chart Business Analyst: horizontal bars, states ordered by salary.
# Columns are referenced by bare name inside aes(); `df$col` bypasses ggplot2's
# data masking and triggers a "use of `$` is discouraged" warning.
bc_BA <- ggplot(
  Business_Analyst1,
  aes(
    x = fct_reorder(State, Business_Analyst),
    y = Business_Analyst,
    fill = State
  )
) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 10)
  ) +
  labs(
    title = "US Average Business Analyst Salaries by State / Territory",
    x = "State or Territory",
    y = "Annual Average Salary"
  )

bc_BA

Code
### Bar chart of each state's mean salary across the four data roles.

# Row-wise mean of the four role columns; missing salaries are skipped.
Data_Practitioner[["Avg_Salary"]] <- rowMeans(
  Data_Practitioner[
    ,
    c("Data_Scientist", "Data_Engineer", "Data_Analyst", "Business_Analyst")
  ],
  na.rm = TRUE
)

# Horizontal bars, states ordered by their cross-role average.
bc_state <- Data_Practitioner |>
  ggplot(aes(x = fct_reorder(State, Avg_Salary), y = Avg_Salary, fill = State)) +
  geom_col() +
  coord_flip() +
  scale_y_continuous(labels = label_comma()) +
  labs(
    title = "US Average Data Practitioner Salaries by State / Territory",
    x = "State or Territory",
    y = "Average Salary Across Roles"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 10)
  )

bc_state

Code
# Faceted bar chart: the 20 states with the highest Data Scientist salary,
# shown for all four roles (one panel per role).

# Names of the 20 states with the highest Data Scientist salary.
top20_states <- Data_Practitioner |>
  arrange(desc(Data_Scientist)) |>
  slice(1:20) |>
  pull(State)

# Keep only those states and reshape to one row per State/Role pair.
dp_top20 <- Data_Practitioner |>
  filter(State %in% top20_states) |>
  pivot_longer(
    cols = c(Data_Scientist, Data_Engineer, Data_Analyst, Business_Analyst),
    names_to = "Role",
    values_to = "Salary"
  )

dp_top20 |>
  ggplot(aes(x = fct_reorder(State, Salary), y = Salary, fill = State)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ Role, scales = "free_y") +
  scale_y_continuous(labels = scales::label_comma()) +
  labs(
    title = "Top 20 Highest-Paying States for Data Practitioner Roles",
    x = "State",
    y = "Average Salary"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Code
# Long format: one row per State/Role pair with its Salary.
dp_long <- Data_Practitioner |>
  pivot_longer(
    cols = c(Data_Scientist, Data_Engineer, Data_Analyst, Business_Analyst),
    names_to = "Role",
    values_to = "Salary"
  )

# Heatmap: roles on the x axis, states on the y axis, tile colour = salary.

dp_long |>
  ggplot(aes(x = Role, y = State, fill = Salary)) +
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "plasma") +
  labs(
    title = "Heatmap of Data Practitioner Salaries by State and Role",
    x = "Role",
    y = "State",
    fill = "Salary"
  ) +
  theme_minimal()

Interpretation:
The heatmap confirms a stable national hierarchy of compensation across data roles, suggesting that role complexity and technical depth drive salary more than geography. Some states show uniformly higher salaries across all roles, indicated by lighter colors:

Other states show uniformly lower salaries, indicated by darker purples:

Comment: The heatmap shows consistent salary stratification by role across all states—Data Scientists and Data Engineers earn the most, while Business Analysts and Data Analysts earn less, with noticeable geographic clusters where all roles command higher pay.

The heatmap quickly communicates relative differences across both roles and states.

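Across nearly every state, the roles fall in the same salary order: Data Engineers and Data Scientists at the top, with Business Analysts and Data Analysts below them.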

Code
# Names of the 20 states with the highest Data Scientist salary.
top20_states <- Data_Scientist1 |>
  arrange(desc(Data_Scientist)) |>
  slice(1:20) |>
  pull(State)

# Label every state as "Top 20" or "Other States" so the bars can be coloured
# by group.
Data_Scientist1 <- Data_Scientist1 |>
  mutate(Group = ifelse(State %in% top20_states, "Top 20", "Other States"))

# Same ordered bar chart as before, with fill driven by the top-20 flag.
bc_DS <- Data_Scientist1 |>
  ggplot(
    aes(
      x = fct_reorder(State, Data_Scientist),
      y = Data_Scientist,
      fill = Group
    )
  ) +
  geom_col() +
  coord_flip() +
  scale_fill_manual(
    values = c(
      "Top 20" = "#1f78b4",       # blue
      "Other States" = "#b2df8a"  # green
    )
  ) +
  scale_y_continuous(labels = label_comma()) +
  labs(
    title = "US Average Data Scientist Salaries by State / Territory",
    x = "State or Territory",
    y = "Annual Average Salary"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    text = element_text(size = 8),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 10)
  )

bc_DS