DATA 608 Story 4

library(ggplot2)

# Define the roles
roles <- data.frame(
  Role = c(
    "Deep Learning (DL)",
    "Machine Learning (ML)",
    "Artificial Intelligence (AI)",
    "Data Architecture",
    "Data Science",
    "Data Engineering",
    "Data Management",
    "Business Intelligence (BI)",
    "Data Analytics",
    "Big Data"
  )
)

# Create the infographic
ggplot(roles, aes(y = Role, x = 1)) +
  geom_text(aes(label = Role), size = 5, hjust = 0.5, vjust = 0.5) +
  theme_void() +
  labs(title = "Roles Under Data Practitioner") +
  theme(
    plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
    axis.text.y = element_blank(), # Remove y-axis text
    axis.ticks.y = element_blank() # Remove y-axis ticks
  )

Salary Distribution of Data Practitioners

The histogram illustrates the salary distribution of data practitioners. The data shows a clear peak in salaries around the $100,000–$120,000 range, indicating this is the most common earnings bracket in the field. Salaries taper off toward both the lower end (below $50,000) and the higher end (above $200,000), reflecting a more limited number of practitioners in these extreme ranges. This distribution highlights a concentration of data practitioners within mid-level salary bands, consistent with a growing and maturing data profession.

# Load required libraries
library(ggplot2)
library(dplyr)
library(scales)

# Load the dataset
data <- read.csv("us_dataScience_salaries_2024.csv")

# Remove outliers using the IQR method
filtered_data <- data %>%
  mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
  filter(
    salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
    salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
  )

# Create a new column for fill color based on salary range
filtered_data <- filtered_data %>%
  mutate(
    fill_color = ifelse(salary_in_usd >= 100000 & salary_in_usd <= 200000, "#a1d99b", "#e5f5e0")
  )

# Create the histogram
ggplot(filtered_data, aes(x = salary_in_usd, fill = fill_color)) +
  geom_histogram(binwidth = 10000, color = "black", alpha = 0.7) +
  scale_fill_manual(values = c("#e5f5e0" = "#e5f5e0", "#a1d99b" = "#a1d99b"), guide = "none") +
  labs(
    title = "Salary Distribution with Highlighted Range",
    x = "Salary (USD)",
    y = "Frequency"
  ) +
  scale_x_continuous(
    labels = scales::label_number(scale = 0.001, suffix = "K"),
    breaks = c(0, 100000, 200000, 300000)
  ) +
  geom_vline(xintercept = c(100000, 200000), color = "#a1d99b", linetype = "dashed") +
  annotate(
    "segment",
    x = 100000, xend = 200000, y = max(table(cut(filtered_data$salary_in_usd, breaks = seq(0, max(filtered_data$salary_in_usd), by = 10000)))), 
    yend = max(table(cut(filtered_data$salary_in_usd, breaks = seq(0, max(filtered_data$salary_in_usd), by = 10000)))),
    arrow = arrow(type = "closed", ends = "both", length = unit(0.2, "cm")), color = "#a1d99b"
  ) +
  annotate(
    "text",
    x = 150000, 
    y = max(table(cut(filtered_data$salary_in_usd, 
        breaks = seq(0, max(filtered_data$salary_in_usd), by = 10000)))) + 20, 
    label = expression(bold("Most salaries range:\n    $100K and $200K")), 
    color = "red", 
    size = 4,
    hjust = 0.5  # Center justify the text
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

## Salary Distribution of Data Practitioners by Job Category

The chart highlights the distribution of salaries across various data practitioner job categories. Roles like Machine Learning, Artificial Intelligence (AI), and Deep Learning dominate the higher salary ranges (above $200,000), reflecting their demand and specialized nature. Meanwhile, categories such as Business Intelligence (BI), Data Analytics, and Data Management are concentrated in lower salary ranges (below $100,000), suggesting their focus on entry to mid-level roles. The majority of data practitioners earn within the $100,000–$120,000 range, indicating this is the central salary band for the industry.

# Prepare data for clustering with outlier removal
clustering_data <- data %>%
  mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
  group_by(job_category) %>%
  summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(
    avg_salary > quantile(avg_salary, 0.25) - 1.5 * IQR(avg_salary) &
    avg_salary < quantile(avg_salary, 0.75) + 1.5 * IQR(avg_salary)
  )

# Create a custom salary range column for legend mapping
clustering_data <- clustering_data %>%
  mutate(salary_range = case_when(
    avg_salary >= 195000 ~ ">= 195K",
    avg_salary < 131000 ~ "< 131K",
    TRUE ~ "131K–195K"
  ))

# Reorder salary_range to match the plot order
clustering_data <- clustering_data %>%
  mutate(
    salary_range = factor(salary_range, levels = c(">= 195K", "131K–195K", "< 131K"))
  )

# Update color mapping for salary ranges
salary_colors <- c(">= 195K" = "#a1d99b", "131K–195K" = "#e5f5e0", "< 131K" = "#fc9272")

# Plot with reversed bar order and legend at the bottom
ggplot(clustering_data, aes(x = reorder(job_category, -avg_salary), y = avg_salary, fill = salary_range)) +
  geom_bar(stat = "identity", show.legend = TRUE) +
  scale_fill_manual(values = salary_colors, name = "Salary Range") +  # Custom colors and legend
  coord_flip() +
  labs(
    title = "Job Titles Clustered by Salary Range",
    x = "Job Title",
    y = "Average Salary (USD)"
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    legend.position = "bottom",    # Move legend to the bottom
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    legend.background = element_rect(fill = "white", color = NA),
    plot.title = element_text(hjust = 0, size = 16)
  )

Salaries by Experience Level and Company Size

The boxplot illustrates salary variations across experience levels (EN, MI, SE, EX) and company sizes (L, M, S). Key observations include:

Impact of Experience:
- Salaries increase consistently with experience level across all company sizes, with EX (Executive) roles commanding the highest median salaries.
Company Size Influence:
- Large companies (L) consistently offer higher salaries across all experience levels, with a significant gap compared to medium (M) and small (S) companies.
- Medium companies (M) show competitive salaries at senior levels (SE, EX) but offer lower compensation at entry-level (EN).
Small Companies (S):
- Small companies have the narrowest salary range and generally lower pay, with only EX-level roles showing a significant bump in compensation.
Notable Gaps:
- The widest salary range is observed in L-sized companies for SE and EX levels, reflecting more opportunities for higher pay in these organizations.
- EN-level roles show less variance across company sizes, suggesting entry-level roles are more standardized in pay.

This analysis highlights that experience level is the dominant factor in salary progression, while company size further amplifies opportunities for higher earnings. Large companies provide the best overall compensation, especially for experienced professionals.

library(ggplot2)
library(dplyr)

# Ensure salary_in_usd is numeric and prepare the data
boxplot_data <- data %>%
  mutate(
    salary_in_usd = as.numeric(gsub(",", "", salary_in_usd)),  # Remove commas and convert to numeric
    company_size = case_when(
      company_size == "L" ~ "Large (L)",
      company_size == "M" ~ "Medium (M)",
      company_size == "S" ~ "Small (S)"
    ),
    company_size = factor(company_size, levels = c("Large (L)", "Medium (M)", "Small (S)")),
    experience_level = factor(experience_level, levels = c("EN", "MI", "SE", "EX"))
  )

# Define the color scheme
company_size_colors <- c("Large (L)" = "#fc9272",  # Red
                         "Medium (M)" = "#a1d99b",  # Green
                         "Small (S)" = "#9ecae1")   # Blue

# Create the boxplot grouped by experience level
ggplot(boxplot_data, aes(x = experience_level, y = salary_in_usd, fill = company_size)) +
  geom_boxplot(
    position = position_dodge(width = 0.8),  # Ensure grouped boxplots
    outlier.shape = NA                      # Remove outliers for clarity
  ) +
  facet_wrap(~ company_size, nrow = 1) +  # Create facets for company sizes
  scale_fill_manual(values = company_size_colors, name = "Company Size") +  # Apply colors
  labs(
    title = "Salary Distribution by Company Size and Experience Level",
    x = "Experience Level",
    y = "Salary (USD)"
  ) +
  scale_y_continuous(
    limits = c(0, 400000),  # Set y-axis range to max 400K
    breaks = seq(0, 400000, by = 100000),  # Define y-axis ticks
    labels = scales::label_number(scale = 0.001, suffix = "K")  # Format Y-axis labels
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.x = element_blank(),  # Remove vertical gridlines
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(color = "lightgray", linetype = "dashed", size = 0.2), # Faded gridlines
    panel.grid.minor.y = element_line(color = "lightgray", linetype = "dotted", size = 0.1), # Thinner minor gridlines
    legend.position = "right",             # Position legend on the right
    legend.title = element_text(size = 10, face = "bold"),
    legend.text = element_text(size = 9),
    strip.text = element_text(size = 10, face = "bold"),  # Style facet labels
    axis.title.x = element_text(size = 12),
    axis.title.y = element_text(size = 12),
    axis.text.x = element_text(size = 10),
    axis.text.y = element_text(size = 10),
    plot.title = element_text(hjust = 0, size = 16)
  )

## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Regional Insights on Data Practitioner Salaries Across the US

The map visualization provides a regional analysis of median salaries across the United States for data practitioners. Here are key insights:

Regional Trends:
- The Northeast and West regions show the highest median salaries, particularly in states like New York and California, which likely reflects the concentration of high-paying industries such as tech and finance.
- The Midwest and South regions generally have lower median salaries, though some states like Illinois (Midwest) and Texas (South) stand out with relatively higher figures.
Top Earning States:
- States such as California, New York, and Massachusetts are consistently at the top of the salary range, aligning with their role as hubs for data science and technology-related industries.
Salary Disparities:
- There is a noticeable salary disparity between regions. This could be attributed to factors like the cost of living, the presence of tech hubs, and the availability of high-skill job opportunities in specific areas.
Opportunities for Growth:
- States with moderate salary ranges, such as Texas and Colorado, represent emerging tech hubs, possibly indicating growing opportunities for data practitioners in these regions.

This analysis demonstrates how geography significantly impacts earning potential for data practitioners, with high-paying opportunities concentrated in specific areas. This insight can guide job seekers to target regions that align with their career and financial goals.

# Load required libraries
library(dplyr)
library(ggplot2)
library(maps)
library(cowplot)
library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

# Load the data
state_data <- read_csv('state_datapractitioner_wages.csv')

## Rows: 444 Columns: 10

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): state, state_abrev, ownership_type, occ_title, role, employment_to...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Replace non-numeric values (e.g., "*", "**") with NA in mean_salary, employment_total, and employment_per_1000
state_data <- state_data %>%
  mutate(
    average_salary = ifelse(grepl("[^0-9,]", average_salary) | average_salary == "*", NA, average_salary),
    employment_total = ifelse(grepl("[^0-9,]", employment_total) | employment_total == "*", NA, employment_total),
    employment_per_1000 = ifelse(grepl("[^0-9.]", employment_per_1000) | employment_per_1000 == "*", NA, employment_per_1000)
  )

# Remove commas and convert columns to numeric
state_data <- state_data %>%
  mutate(
    average_salary = as.numeric(gsub(",", "", average_salary)),
    employment_total = as.numeric(gsub(",", "", employment_total)),
    employment_per_1000 = as.numeric(employment_per_1000)
  )

# Define neighboring states for imputation
neighbors <- list(
  "MT" = c("ID", "WY", "SD", "ND"),
  "MS" = c("TN", "AR", "LA", "AL"),
  "OK" = c("TX", "KS", "MO", "AR"),
  "KS" = c("NE", "MO", "OK", "CO"),
  "WY" = c("MT", "SD", "NE", "CO", "UT", "ID")
  # Add remaining states as needed
)

# Function to impute missing values based on neighboring states
impute_missing_value <- function(state_abrev, variable, state_data, neighbors_list) {
  neighbor_states <- neighbors_list[[state_abrev]]
  if (is.null(neighbor_states) || length(neighbor_states) == 0) {
    return(NA)
  }
  neighbor_values <- state_data %>%
    filter(state_abrev %in% neighbor_states) %>%
    pull(!!sym(variable))
  if (all(is.na(neighbor_values))) {
    return(NA)
  } else {
    return(mean(neighbor_values, na.rm = TRUE))
  }
}

# Apply imputation for missing values
state_data <- state_data %>%
  mutate(
    average_salary = ifelse(
      is.na(average_salary),
      sapply(state_abrev, impute_missing_value, "average_salary", state_data, neighbors),
      average_salary
    ),
    employment_total = ifelse(
      is.na(employment_total),
      sapply(state_abrev, impute_missing_value, "employment_total", state_data, neighbors),
      employment_total
    ),
    employment_per_1000 = ifelse(
      is.na(employment_per_1000),
      sapply(state_abrev, impute_missing_value, "employment_per_1000", state_data, neighbors),
      employment_per_1000
    )
  )

# Define region mapping for each state abbreviation
region_mapping <- data.frame(
  state_abrev = c("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA",   # Northeast
                  "IL", "IN", "MI", "OH", "WI",                           # Midwest
                  "IA", "KS", "MN", "MO", "NE", "ND", "SD",               # Midwest
                  "DE", "FL", "GA", "MD", "NC", "SC", "VA", "WV",         # South Atlantic
                  "AL", "KY", "MS", "TN",                                 # East South Central
                  "AR", "LA", "OK", "TX",                                 # West South Central
                  "AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY",         # Mountain
                  "AK", "CA", "HI", "OR", "WA"),                          # Pacific
  region = c("Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast",
             "Midwest", "Midwest", "Midwest", "Midwest", "Midwest",
             "Midwest", "Midwest", "Midwest", "Midwest", "Midwest", "Midwest", "Midwest",
             "South", "South", "South", "South", "South", "South", "South", "South",
             "South", "South", "South", "South",
             "South", "South", "South", "South",
             "West", "West", "West", "West", "West", "West", "West", "West",
             "West", "West", "West", "West", "West")
)

# Get US state map data
states_map <- map_data("state") %>%
  rename(state_name = region) %>%
  mutate(state_name = tolower(state_name))

# Map state abbreviations to state names
state_abbreviation_mapping <- data.frame(
  state_name = tolower(state.name),
  state_abrev = state.abb
)

# Prepare data for plotting
filtered_data <- state_data %>%
  group_by(state_abrev) %>%
  summarise(state_mean_salary = median(average_salary, na.rm = TRUE), .groups = "drop") %>%
  left_join(region_mapping, by = "state_abrev") %>%
  left_join(state_abbreviation_mapping, by = "state_abrev")

map_data_with_salary <- states_map %>%
  left_join(filtered_data, by = c("state_name"))

# Define custom color gradient
salary_colors <- c("#f7fcf5", "#e5f5e0", "#c7e9c0", "#a1d99b", "#74c476", "#41ab5d", "#238b45", "#006d2c", "#00441b")

# Function to plot each region
plot_region <- function(region_name) {
  region_data <- map_data_with_salary %>% filter(region == region_name)
  
  ggplot() +
    geom_polygon(data = region_data, aes(x = long, y = lat, group = group, fill = state_mean_salary), color = "white") +
    scale_fill_gradientn(colors = salary_colors, name = "Average Salary ($)", na.value = "gray90") +
    labs(title = region_name) +
    theme_minimal() +
    theme(
      legend.position = "none",
      axis.title = element_blank(),
      axis.text = element_blank(),
      axis.ticks = element_blank(),
      panel.grid = element_blank(),
      plot.title = element_text(hjust = 0.5, size = 14)
    ) +
    coord_fixed(1.3)
}

# Generate plots
plot_northeast <- plot_region("Northeast")
plot_midwest <- plot_region("Midwest")
plot_south <- plot_region("South")
plot_west <- plot_region("West")

# Extract legend
legend_plot <- ggplot() +
  geom_polygon(data = map_data_with_salary %>% filter(region == "Northeast"), aes(x = long, y = lat, group = group, fill = state_mean_salary), color = "white") +
  scale_fill_gradientn(colors = salary_colors, name = "Median Salary ($)", na.value = "gray90") +
  theme_minimal() +
  theme(
    legend.position = "right",
    axis.title = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  )
legend <- get_legend(legend_plot)

## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.

# Arrange final plot
final_plot <- plot_grid(
  plot_grid(plot_northeast, plot_west, ncol = 2, align = "h"),
  plot_grid(plot_midwest, plot_south, ncol = 2, align = "h"),
  ncol = 1
)

final_output <- plot_grid(final_plot, legend, ncol = 2, rel_widths = c(3, 0.6))

# Display the plot
print(final_output)

States Grouped by Data Practitioner Salary Ranges

This chart categorizes U.S. states into three clusters based on average salaries for data practitioners. States with the highest salaries (Cluster 1, marked in red) are dominated by hubs like California, which reflects the demand for tech talent in major urban centers. Cluster 2 (green) represents states with competitive but mid-tier salaries, offering opportunities for professionals seeking balanced costs of living and wages. Cluster 3 (blue) highlights states with lower average salaries, often in regions where the cost of living aligns with compensation. This distribution emphasizes geographic disparities in tech salaries, reflecting local market dynamics and industry presence.

# Load necessary libraries
library(dplyr)
library(ggplot2)

# Ensure the correct salary column is used
clustering_data <- state_data %>%
  group_by(state_abrev) %>%
  summarize(avg_salary = mean(average_salary, na.rm = TRUE)) %>%
  ungroup() 

# Perform clustering (k-means) on filtered data
set.seed(123)
kmeans_result <- kmeans(clustering_data$avg_salary, centers = 3)
clustering_data$cluster <- as.factor(kmeans_result$cluster)

# Ensure the correct order for clusters (red, lavender, green)
clustering_data <- clustering_data %>%
  mutate(cluster = factor(cluster, levels = c("1", "2", "3")))

# Define custom colors for clusters
cluster_colors <- c("1" = "#de2d26", "2" = "#756bb1", "3" = "#31a354")

# Visualize clusters with dots and adjusted y-axis ticks
ggplot(clustering_data, aes(x = reorder(state_abrev, avg_salary), y = avg_salary, color = cluster)) +
  geom_point(size = 2) + # Use dots for the visualization
  scale_color_manual(values = cluster_colors, name = "Cluster") + # Assign custom colors
  scale_y_continuous(
    breaks = seq(
      floor(min(clustering_data$avg_salary) / 10000) * 10000, 
      max(clustering_data$avg_salary), 
      by = 10000
    ),
    labels = scales::label_number(scale = 0.001, suffix = "K") # Format labels
  ) + 
  labs(
    title = "States Clustered by Salary Range",
    x = "State Abbreviation",
    y = "Average Salary (USD)",
    color = "Cluster"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(), # Remove major gridlines
    panel.grid.minor = element_blank(), # Remove minor gridlines
    axis.text.y = element_text(size = 6), # Make y-axis tick labels bold
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.title = element_text(size = 14),
    plot.title = element_text(hjust = 0, size = 16)
  ) +
  coord_flip() # Flip the coordinates for horizontal layout

Heading: “Unveiling Salary Trends Across Data Roles”

Artificial Intelligence (AI), Machine Learning (ML), and Deep Learning (DL) consistently lead in salary trends, reflecting high demand for these advanced technical skills. While roles such as Data Science (DS) and Data Engineering (DE) demonstrate steady growth, Data Management (DM) experiences significant fluctuations. The overarching trend indicates a general rise in salaries from 2020 to 2023, with a slight dip in 2024, underscoring the competitive and evolving nature of the data industry.

library(dplyr)
library(ggplot2)

# Remove outliers using IQR method
filtered_data <- data %>%
  mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
  group_by(work_year) %>%
  filter(
    salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
    salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
  ) %>%
  ungroup()

# Calculate average salary for each job category per year
category_trend <- filtered_data %>%
  group_by(work_year, job_category) %>%
  summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")

# Calculate overall average salary trend
overall_trend <- filtered_data %>%
  group_by(work_year) %>%
  summarize(overall_avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")

# Faceted line plot with conditional theming
ggplot(category_trend, aes(x = work_year, y = avg_salary / 10000, color = job_category)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  geom_line(
    data = overall_trend,
    aes(x = work_year, y = overall_avg_salary / 10000),
    color = "blue",
    linetype = "dashed",
    size = 1.2,
    inherit.aes = FALSE
  ) +
  facet_wrap(~job_category, scales = "free_y", ncol = 3) +
  scale_y_continuous(
    breaks = scales::pretty_breaks(),  # Dynamically generate pretty breaks
    labels = scales::label_number(scale = 1)  # Format y-axis without extra "K"
  ) +
  labs(
    title = "Average Salary Trends by Job Category Over Years",
    x = "Year",
    y = "Average Salary x 10,000",
    color = "Job Category"
  ) +
  theme_minimal() +
  theme(
    strip.text = element_text(size = 10),
    plot.title = element_text(hjust = 0, size = 16),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title.y = element_blank(),  # Remove y-axis title
    panel.grid = element_blank(),   # Remove all gridlines
    legend.position = "none",       # Remove legend
    axis.text.y = element_blank()   # Remove y-axis labels on specific columns
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot(filtered_data, aes(x = job_category, y = salary_in_usd, fill = job_category)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  scale_y_continuous(labels = scales::label_number(scale = 0.001, suffix = "K")) +
  labs(
    title = "Salary Distribution of Data Practitioners by Job Category",
    x = "Job Category",
    y = "Salary (USD)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none", # Removes the legend
    panel.grid = element_blank() # Removes gridlines
  )

# Remove outliers using the IQR method
filtered_data <- data %>%
  mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
  filter(
    salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
    salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
  )

# Calculate median salary for each job category and reorder job_category in descending order
sorted_data <- filtered_data %>%
  group_by(job_category) %>%
  summarize(median_salary = median(salary_in_usd, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(median_salary)) %>%
  mutate(job_category = factor(job_category, levels = job_category))

# Join sorted data back to the original dataset
filtered_data <- filtered_data %>%
  mutate(job_category = factor(job_category, levels = levels(sorted_data$job_category)))

ggplot(filtered_data, aes(x = job_category, y = salary_in_usd, fill = job_category)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  scale_y_continuous(labels = scales::label_number(scale = 0.001, suffix = "K")) +
  labs(
    title = "Salary Distribution of Data Practitioners by Job Category",
    x = NULL,  # Removes x-axis label
    y = "Salary (USD)"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),      # Removes x-axis tick labels
    axis.ticks.x = element_blank(),     # Removes x-axis ticks
    legend.position = "right",          # Keeps the legend
    panel.grid = element_blank(),       # Removes gridlines
    axis.title.x = element_blank()      # Removes x-axis title
  )

library(dplyr)
library(ggplot2)

# Remove outliers using IQR method
filtered_data <- data %>%
  mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
  group_by(work_year) %>%
  filter(
    salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
    salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
  ) %>%
  ungroup()

# Calculate average salary for each job category
average_salary <- filtered_data %>%
  group_by(job_category) %>%
  summarize(overall_avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")

# Reorder job_category based on the overall average salary
filtered_data <- filtered_data %>%
  mutate(job_category = factor(job_category, levels = average_salary %>% arrange(desc(overall_avg_salary)) %>% pull(job_category)))

# Calculate average salary for each job category per year
category_trend <- filtered_data %>%
  group_by(work_year, job_category) %>%
  summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")

# Calculate overall average salary trend
overall_trend <- filtered_data %>%
  group_by(work_year) %>%
  summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop") %>%
  mutate(job_category = "Overall Average Salary")

# Combine category-specific and overall trends
combined_data <- bind_rows(category_trend, overall_trend)

# Faceted line plot with overall trend
ggplot(combined_data, aes(x = work_year, y = avg_salary / 10000, group = job_category)) +
  geom_line(data = category_trend, aes(color = job_category), size = 1) +
  geom_point(data = category_trend, aes(color = job_category), size = 2) +
  geom_line(data = overall_trend, aes(y = avg_salary / 10000), color = "blue", linetype = "dashed", size = 1) +
  scale_color_manual(
    values = setNames(scales::hue_pal()(length(unique(category_trend$job_category))), unique(category_trend$job_category))
  ) +
  scale_y_continuous(
    breaks = scales::pretty_breaks(),  # Dynamically generate breaks
    labels = scales::label_number(accuracy = 1)  # Remove decimal places
  ) +
  facet_wrap(~job_category, scales = "free_y", ncol = 3) +
  labs(
    title = "Average Salary Trends by Job Category Over Years",
    x = "Year",
    y = "Average Salary x 10,000"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0, size = 16),
    strip.text = element_text(size = 10),
    legend.position = "none",  # Remove legend
    panel.grid.major = element_blank(),  # Faded major gridlines
    panel.grid.minor = element_line(color = "gray95", size = 0.01)   # Faded minor gridlines
  )

Salaries by Experience Level and Company Size

Statement:

The boxplot illustrates salary variations across experience levels (EN, MI, SE, EX) and company sizes (L, M, S). Key observations include:

Impact of Experience:
- Salaries increase consistently with experience level across all company sizes, with EX (Executive) roles commanding the highest median salaries.
Company Size Influence:
- Large companies (L) consistently offer higher salaries across all experience levels, with a significant gap compared to medium (M) and small (S) companies.
- Medium companies (M) show competitive salaries at senior levels (SE, EX) but offer lower compensation at entry-level (EN).
Small Companies (S):
- Small companies have the narrowest salary range and generally lower pay, with only EX-level roles showing a significant bump in compensation.
Notable Gaps:
- The widest salary range is observed in L-sized companies for SE and EX levels, reflecting more opportunities for higher pay in these organizations.
- EN-level roles show less variance across company sizes, suggesting entry-level roles are more standardized in pay.

library(dplyr)
library(ggplot2)

# Remove outliers using the IQR method
filtered_data <- data %>%
  mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
  group_by(experience_level, company_size) %>%
  filter(
    salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
    salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
  ) %>%
  ungroup()

# Set a fixed order for experience_level
filtered_data <- filtered_data %>%
  mutate(experience_level = factor(experience_level, levels = c("EN", "MI", "SE", "EX")))

# Create the boxplot
ggplot(filtered_data, aes(x = experience_level, y = salary_in_usd, fill = company_size)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ company_size, scales = "free_x") +
  labs(
    title = "",
    x = "Experience Level",
    y = "Salary (USD)",
    fill = "Company Size"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(dplyr)
library(ggplot2)
library(scales)

# Remove outliers using the IQR method
filtered_data <- data %>%
  mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
  group_by(experience_level, company_size) %>%
  filter(
    salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
    salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
  ) %>%
  ungroup()

# Set a fixed order for experience_level
filtered_data <- filtered_data %>%
  mutate(experience_level = factor(experience_level, levels = c("EN", "MI", "SE", "EX")))

# Create the boxplot
ggplot(filtered_data, aes(x = experience_level, y = salary_in_usd, fill = company_size)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) + # Suppress outlier points
  labs(
    title = "", # Salary by Experience Level and Company Size
    x = "Experience Level",
    y = "Salary (USD)",
    fill = "Company Size"
  ) +
  scale_y_continuous(labels = label_number(scale = 0.001, suffix = "K")) + # Replace 000 with K
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, vjust = 0.5), # Keep Experience Level horizontal
    panel.grid = element_blank() # Remove gridlines
  ) +
  facet_grid(~ company_size, scales = "free", space = "free_x") # Arrange facets horizontally

DATA 608 Story 4

Fomba Kassoh

2024-11-09

Salary Distribution of Data Practitioners

Salaries by Experience Level and Company Size

Regional Insights on Data Practitioner Salaries Across the US

States Grouped by Data Practitioner Salary Ranges

Heading: “Unveiling Salary Trends Across Data Roles”

Salaries by Experience Level and Company Size

Statement: