Data 608 Story 4

How much do Data Science roles get paid in the US?

Data Sources

The data used to answer the question is taken from Glassdoor via Kaggle. The data was scrapped from Glassdoor and posted to the site. The data collected represent 2021 Job Listings with salary and location information for Data Science related careers. Please note the dataset did not contain data for all 50 US states.

Source: https://www.kaggle.com/datasets/nikhilbhathi/data-scientist-salary-us-glassdoor?resource=download

Source CSV https://raw.githubusercontent.com/johnnydrodriguez/data608_story4/main/glassdoor_2021.csv

Prepping the Environment

library(tidyverse)
library(ggplot2)
library(dplyr)
library(stringr)
library(RColorBrewer)
library(cluster)
library(maps)

The Glass Door Jobs Listing Dataset

# Define the URL of the CSV file
file_url <- "https://raw.githubusercontent.com/johnnydrodriguez/data608_story4/main/glassdoor_2021.csv"

# Read the CSV file into a data frame named jobs_df
jobs_df <- read.csv(url(file_url))
glimpse(jobs_df, n = 1)

## Rows: 742
## Columns: 42
## $ index              <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1…
## $ Job.Title          <chr> "Data Scientist", "Healthcare Data Scientist", "Dat…
## $ Salary.Estimate    <chr> "$53K-$91K (Glassdoor est.)", "$63K-$112K (Glassdoo…
## $ Job.Description    <chr> "Data Scientist\nLocation: Albuquerque, NM\nEducati…
## $ Rating             <dbl> 3.8, 3.4, 4.8, 3.8, 2.9, 3.4, 4.1, 3.8, 3.3, 4.6, 3…
## $ Company.Name       <chr> "Tecolote Research\n3.8", "University of Maryland M…
## $ Location           <chr> "Albuquerque, NM", "Linthicum, MD", "Clearwater, FL…
## $ Headquarters       <chr> "Goleta, CA", "Baltimore, MD", "Clearwater, FL", "R…
## $ Size               <chr> "501 - 1000 ", "10000+ ", "501 - 1000 ", "1001 - 50…
## $ Founded            <int> 1973, 1984, 2010, 1965, 1998, 2000, 2008, 2005, 201…
## $ Type.of.ownership  <chr> "Company - Private", "Other Organization", "Company…
## $ Industry           <chr> "Aerospace & Defense", "Health Care Services & Hosp…
## $ Sector             <chr> "Aerospace & Defense", "Health Care", "Business Ser…
## $ Revenue            <chr> "$50 to $100 million (USD)", "$2 to $5 billion (USD…
## $ Competitors        <chr> "-1", "-1", "-1", "Oak Ridge National Laboratory, N…
## $ Hourly             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Employer.provided  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Lower.Salary       <int> 53, 63, 80, 56, 86, 71, 54, 86, 38, 120, 126, 64, 1…
## $ Upper.Salary       <int> 91, 112, 90, 97, 143, 119, 93, 142, 84, 160, 201, 1…
## $ Avg.Salary.K.      <dbl> 72.0, 87.5, 85.0, 76.5, 114.5, 95.0, 73.5, 114.0, 6…
## $ company_txt        <chr> "Tecolote Research", "University of Maryland Medica…
## $ Job.Location       <chr> "NM", "MD", "FL", "WA", "NY", "TX", "MD", "CA", "NY…
## $ Age                <int> 48, 37, 11, 56, 23, 21, 13, 16, 7, 12, 10, 53, 59, …
## $ Python             <int> 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, …
## $ spark              <int> 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, …
## $ aws                <int> 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ excel              <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, …
## $ sql                <int> 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, …
## $ sas                <int> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ keras              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ pytorch            <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ scikit             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ tensor             <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, …
## $ hadoop             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, …
## $ tableau            <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ bi                 <int> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
## $ flink              <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ mongo              <int> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ google_an          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ job_title_sim      <chr> "data scientist", "data scientist", "data scientist…
## $ seniority_by_title <chr> "na", "na", "na", "na", "na", "na", "na", "na", "na…
## $ Degree             <chr> "M", "M", "M", "na", "na", "na", "na", "M", "P", "n…

Data Clean Up

# Eliminate duplicate job lisiting in datase
jobs_unique <- jobs_df %>%
  distinct(Job.Description, Company.Name, .keep_all = TRUE) %>%
  mutate(Job.Description = gsub("’", "'", Job.Description))

# Cleanup Eucation column
education <- c("Masters", "Bachelors", "Master's", "Bachelor's", "Phd", "PhD", "Ph.D")
jobs_unique <- jobs_unique %>%
  mutate(education = str_extract(Job.Description, paste(education, collapse = "|")))

# Rename columns
jobs_unique <- jobs_unique %>%
  rename(
    Company_Rating = Rating,
    Company_Name = Company.Name,
    HQ_Address = Headquarters,
    type_of_ownership = Type.of.ownership,
    Company_Age = Age,
    Job_Title = Job.Title,
    Salary_Estimate = Salary.Estimate,
    Job_Description = Job.Description,
    Lower_Salary_K = Lower.Salary,
    Upper_Salary_K = Upper.Salary,
    Avg_Salary_K = Avg.Salary.K.,
    Job_Location = Job.Location
  )

Distribution of Salary Across Data Science Jobs

Generally speaking, Data Science jobs pay well and at or above the median US household income of $70,784 in 2021.

# Filter out rows where 'Avg_Salary_K' is NA
jobs_unique_filtered <- subset(jobs_unique, !is.na(Avg_Salary_K))

# Calculate the density
density_estimate <- density(jobs_unique_filtered$Avg_Salary_K, na.rm = TRUE)

# Calculate the cumulative density
cumulative_density <- cumsum(density_estimate$y) / sum(density_estimate$y)

# Find the indices for the central 50%
lower_index <- which.min(abs(cumulative_density - 0.25))
upper_index <- which.min(abs(cumulative_density - 0.75))

# Get the x values corresponding to the central 50%
lower_bound_50 <- density_estimate$x[lower_index]
upper_bound_50 <- density_estimate$x[upper_index]

# Create the density plot
ggplot(jobs_unique_filtered, aes(x = Avg_Salary_K)) +
  geom_density(aes(y = ..density..), alpha = 0.5, fill = "gray") +  # KDE in gray
  geom_area(data = data.frame(x = density_estimate$x, y = density_estimate$y),
            aes(x = ifelse(x >= lower_bound_50 & x <= upper_bound_50, x, NA), y = ifelse(x >= lower_bound_50 & x <= upper_bound_50, y, 0)),
            fill = "darkblue", alpha = 0.4) +  # Highlight central 50% in blue
  ggtitle("General Data Science Salary Distribution") +  # Title
  xlab("Average Salary ($K)") +  # Updated X-axis label
  ylab("") +  # Empty Y-axis label
  theme_minimal() +
  theme(
    axis.text.y = element_blank()  # Remove Y-axis text
  ) +
  scale_x_continuous(breaks = c(pretty(jobs_unique_filtered$Avg_Salary_K, n = 5)),
                     labels = pretty(jobs_unique_filtered$Avg_Salary_K, n = 5)) +  # Reduced number of x-axis ticks
  annotate("text", x = lower_bound_50, y = max(density_estimate$y) / 2, label = paste("Lower 50% =", round(lower_bound_50, 2)), hjust = 0.5, size = 2.5) +  # Lower 50% label
  annotate("text", x = upper_bound_50, y = max(density_estimate$y) / 2, label = paste("Upper 50% =", round(upper_bound_50, 2)), hjust = 0.5, size = 2.5) +  # Upper 50% label
  annotate("text", x = Inf, y = Inf, label = "50% of data science jobs pay between $74K and $122K", hjust = 1, vjust = 2, size = 4)  # Kicker moved to the top

Salary Range for Data Science Jobs by Title

The salary range varies greatly however. Entry level analyst or data analytics roles have salaries on the lower end while directors and machine learning engineers earn the most. Noteable is that some roles have offer top salaries outlier ranges closer to $250K.

# Filter out rows where 'Avg_Salary_K' or 'job_title_sim' is NA and not "na"
jobs_unique_filtered <- jobs_unique %>% 
  filter(!is.na(Avg_Salary_K) & !is.na(job_title_sim) & job_title_sim != "na")

# Rename 'data analitics' to 'data analytics'
jobs_unique_filtered$job_title_sim <- gsub("data analitics", "data analytics", jobs_unique_filtered$job_title_sim)

# Rename 'Data scientist project manager' to 'data project manager'
jobs_unique_filtered$job_title_sim <- gsub("Data scientist project manager", "data project manager", jobs_unique_filtered$job_title_sim)

# Calculate mean Avg_Salary_K for each job_title_sim
mean_salary_by_title <- jobs_unique_filtered %>% 
  group_by(job_title_sim) %>% 
  summarise(mean_salary = mean(Avg_Salary_K, na.rm = TRUE)) %>% 
  arrange(desc(mean_salary))  

# Reorder the levels of job_title_sim based on mean_salary
jobs_unique_filtered$job_title_sim <- factor(jobs_unique_filtered$job_title_sim, levels = mean_salary_by_title$job_title_sim)

# Calculate the overall mean salary across all job titles
overall_mean_salary <- mean(jobs_unique_filtered$Avg_Salary_K, na.rm = TRUE)

# Create a box plot sorted by mean Avg_Salary_K
ggplot(jobs_unique_filtered, aes(x = job_title_sim, y = Avg_Salary_K)) +
  geom_boxplot() +
  geom_hline(yintercept = overall_mean_salary, color = "darkorange", linetype = "dashed", size = 1) +
  xlab("Job Titles") +  
  ylab("Avg Salary($K)") + 
  ggtitle("Glassdoor 2021 Salary Distribution for Data Science Jobs") + 
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  annotate("text", x = Inf, y = overall_mean_salary, label = paste("Mean =", round(overall_mean_salary, .5)), vjust = 2, hjust = 6.75, color = "darkorange") +
  annotate("text", x = 1, y = 260, label = "While the average sits above $100K, some roles have a wide range of outliers exceeding $250K", hjust = 0, vjust = 0, size = 3.5)

Data Science Job Salaries Clustered By Similar State Salary Ranges

Not all salaries are equal. California pays the highest salaries while Louisiana pays the least. While regional differences account for the state salary clusters, some states exceed their regional counterparts.

# Set a seed for reproducibility
set.seed(12345)


# Handle missing values
jobs_unique <- jobs_unique %>% drop_na(Avg_Salary_K, Lower_Salary_K, Upper_Salary_K)

# Aggregate the average, lower, and upper salaries by 'Job_Location'
agg_data <- jobs_unique %>% 
  group_by(Job_Location) %>% 
  summarise(
    Avg_Salary_K = mean(Avg_Salary_K, na.rm = FALSE),
    Lower_Salary_K = mean(Lower_Salary_K, na.rm = FALSE),
    Upper_Salary_K = mean(Upper_Salary_K, na.rm = FALSE)
  )

# Perform K-means clustering
kmeans_result <- kmeans(agg_data[, "Avg_Salary_K", drop = FALSE], centers = 3)

# Add cluster labels to the data
agg_data$cluster <- kmeans_result$cluster

# Function to find nearest neighbors within a cluster
find_nearest_neighbors <- function(cluster_data) {
  remaining <- cluster_data$Job_Location
  start <- sample(remaining, 1)
  ordered <- c(start)
  remaining <- setdiff(remaining, start)
  
  while (length(remaining) > 0) {
    last_salary <- cluster_data$Avg_Salary_K[cluster_data$Job_Location == start]
    nearest_neighbor <- remaining[which.min(abs(cluster_data$Avg_Salary_K[cluster_data$Job_Location %in% remaining] - last_salary))]
    
    ordered <- c(ordered, nearest_neighbor)
    start <- nearest_neighbor
    remaining <- setdiff(remaining, start)
  }
  
  return(cluster_data %>% arrange(match(Job_Location, ordered)))
}

# Reorder the states within each cluster
cluster_list <- lapply(split(agg_data, agg_data$cluster), find_nearest_neighbors)

# Sort clusters by their average salary
cluster_means <- sapply(cluster_list, function(x) mean(x$Avg_Salary_K))
sorted_clusters <- cluster_list[order(cluster_means)]

# Concatenate sorted clusters
agg_data_ordered <- do.call(rbind, sorted_clusters)

# Update Job_Location to be a factor with the ordering found
agg_data_ordered$Job_Location <- factor(agg_data_ordered$Job_Location, levels = agg_data_ordered$Job_Location)

# Calculate the mean average, lower, and upper salaries
mean_avg_salary <- mean(agg_data$Avg_Salary_K, na.rm = FALSE)
mean_lower_salary <- mean(agg_data$Lower_Salary_K, na.rm = FALSE)
mean_upper_salary <- mean(agg_data$Upper_Salary_K, na.rm = FALSE)


# Generate the scatter plot
ggplot(agg_data_ordered, aes(x = Job_Location, y = Avg_Salary_K, color = as.factor(cluster))) +
  geom_point(size = 2) +
  geom_text(aes(label = Job_Location), vjust = -1, hjust = 0.5, size = 2.5) +
  geom_hline(yintercept = mean_avg_salary, linetype = "dashed", color = "black", size = 0.5) +
  geom_hline(yintercept = mean_lower_salary, linetype = "dashed", color = "black", size = 0.5) +
  geom_hline(yintercept = mean_upper_salary, linetype = "dashed", color = "black", size = 0.5) +
  annotate("text", x = Inf, y = mean_avg_salary, label = "Mean", hjust = 16.25, vjust = -0.2) +
  annotate("text", x = Inf, y = mean_lower_salary, label = "Lower", hjust = 15, vjust = -0.2) +
  annotate("text", x = Inf, y = mean_upper_salary, label = "Upper", hjust = 15, vjust = -0.2) +
  scale_color_manual(values = c("orange", "darkgreen", "blue")) +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  ggtitle("Data Science Salary Clustered by State") +  # Updated title
  ylab("Average Salary ($K)") +  # Updated Y-axis label
  annotate("text", x = Inf, y = Inf, label = "Regional and state differences greatly affect salary ranges.", hjust = 1.5, vjust = 2)  # Added kicker

Data Science Salaries Accross the US

In general, California, Illinois, and the Northeast pay above the mean. Southern States, largely with the exception of Texas, pay below significantly below the mean.

state_lookup <- data.frame(
  abbreviation = c("al", "az", "ca", "co", "ct", "dc", "de", "fl", "ga", 
                   "ia", "id", "il", "in", "ks", "ky", "la", "ma", "md",  
                   "mi", "mn", "mo", "nc", "ne", "nj", "nm", "ny", "oh", 
                   "or", "pa", "ri", "sc", "tn", "tx", "ut", "va", "wa", "wi"),
  full_name = c("alabama", "arizona", "california", "colorado", "connecticut", 
                "district of columbia", "delaware", "florida", "georgia", 
                "iowa", "idaho", "illinois", "indiana", "kansas", "kentucky", 
                "louisiana", "massachusetts", "maryland", "michigan", "minnesota", 
                "missouri", "north carolina", "nebraska", "new jersey", "new mexico", 
                "new york", "ohio", "oregon", "pennsylvania", "rhode island", 
                "south carolina", "tennessee", "texas", "utah", "virginia", 
                "washington", "wisconsin")
)

# Aggregate the data by state and average salary  
agg_data <- jobs_unique %>%
  group_by(Job_Location) %>%
  summarise(Avg_Salary_K = mean(Avg_Salary_K, na.rm = TRUE))

# Convert state abbreviations to lowercase to match ggplot2's built-in data
agg_data$Job_Location <- tolower(agg_data$Job_Location)  

# Join data
states_map <- map_data("state")
agg_data <- agg_data %>%
  left_join(state_lookup, by = c("Job_Location" = "abbreviation")) %>%
  left_join(states_map, by = c("full_name" = "region"))

#Plot
ggplot(agg_data, aes(x = long, y = lat, group = group, fill = Avg_Salary_K)) +
  geom_polygon(color = "black", size = 0.1) + 
  scale_fill_gradientn(colors = rev(colorRampPalette(c("#5C4033", "beige"))(5))) +
  coord_fixed(1.3) +
  labs(
    title = "Data Science Salary by State",  # Updated title
    subtitle = "California, Illinois and the Northeast US pay data scientists above the $101K average.",  # Added kicker
    fill = "Average Salary (thousands USD)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")