library(ggplot2)
# Define the roles
roles <- data.frame(
Role = c(
"Deep Learning (DL)",
"Machine Learning (ML)",
"Artificial Intelligence (AI)",
"Data Architecture",
"Data Science",
"Data Engineering",
"Data Management",
"Business Intelligence (BI)",
"Data Analytics",
"Big Data"
)
)
# Create the infographic
ggplot(roles, aes(y = Role, x = 1)) +
geom_text(aes(label = Role), size = 5, hjust = 0.5, vjust = 0.5) +
theme_void() +
labs(title = "Roles Under Data Practitioner") +
theme(
plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
axis.text.y = element_blank(), # Remove y-axis text
axis.ticks.y = element_blank() # Remove y-axis ticks
)
The histogram illustrates the salary distribution of data practitioners. The data shows a clear peak in salaries around the $100,000–$120,000 range, indicating this is the most common earnings bracket in the field. Salaries taper off toward both the lower end (below $50,000) and the higher end (above $200,000), reflecting a more limited number of practitioners in these extreme ranges. This distribution highlights a concentration of data practitioners within mid-level salary bands, consistent with a growing and maturing data profession.
# Load required libraries
library(ggplot2)
library(dplyr)
library(scales)
# Load the dataset
data <- read.csv("us_dataScience_salaries_2024.csv")
# Remove outliers using the IQR method
filtered_data <- data %>%
mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
filter(
salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
)
# Create a new column for fill color based on salary range
filtered_data <- filtered_data %>%
mutate(
fill_color = ifelse(salary_in_usd >= 100000 & salary_in_usd <= 200000, "#a1d99b", "#e5f5e0")
)
# Create the histogram
ggplot(filtered_data, aes(x = salary_in_usd, fill = fill_color)) +
geom_histogram(binwidth = 10000, color = "black", alpha = 0.7) +
scale_fill_manual(values = c("#e5f5e0" = "#e5f5e0", "#a1d99b" = "#a1d99b"), guide = "none") +
labs(
title = "Salary Distribution with Highlighted Range",
x = "Salary (USD)",
y = "Frequency"
) +
scale_x_continuous(
labels = scales::label_number(scale = 0.001, suffix = "K"),
breaks = c(0, 100000, 200000, 300000)
) +
geom_vline(xintercept = c(100000, 200000), color = "#a1d99b", linetype = "dashed") +
annotate(
"segment",
x = 100000, xend = 200000, y = max(table(cut(filtered_data$salary_in_usd, breaks = seq(0, max(filtered_data$salary_in_usd), by = 10000)))),
yend = max(table(cut(filtered_data$salary_in_usd, breaks = seq(0, max(filtered_data$salary_in_usd), by = 10000)))),
arrow = arrow(type = "closed", ends = "both", length = unit(0.2, "cm")), color = "#a1d99b"
) +
annotate(
"text",
x = 150000,
y = max(table(cut(filtered_data$salary_in_usd,
breaks = seq(0, max(filtered_data$salary_in_usd), by = 10000)))) + 20,
label = expression(bold("Most salaries range:\n $100K and $200K")),
color = "red",
size = 4,
hjust = 0.5 # Center justify the text
) +
theme_minimal() +
theme(
panel.grid.major = element_blank(),
panel.grid.minor = element_blank()
)
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'
## Salary Distribution of Data Practitioners by Job Category
The chart highlights the distribution of salaries across various data practitioner job categories. Roles like Machine Learning, Artificial Intelligence (AI), and Deep Learning dominate the higher salary ranges (above $200,000), reflecting their demand and specialized nature. Meanwhile, categories such as Business Intelligence (BI), Data Analytics, and Data Management are concentrated in lower salary ranges (below $100,000), suggesting their focus on entry to mid-level roles. The majority of data practitioners earn within the $100,000–$120,000 range, indicating this is the central salary band for the industry.
# Prepare data for clustering with outlier removal
clustering_data <- data %>%
mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
group_by(job_category) %>%
summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE)) %>%
ungroup() %>%
filter(
avg_salary > quantile(avg_salary, 0.25) - 1.5 * IQR(avg_salary) &
avg_salary < quantile(avg_salary, 0.75) + 1.5 * IQR(avg_salary)
)
# Create a custom salary range column for legend mapping
clustering_data <- clustering_data %>%
mutate(salary_range = case_when(
avg_salary >= 195000 ~ ">= 195K",
avg_salary < 131000 ~ "< 131K",
TRUE ~ "131K–195K"
))
# Reorder salary_range to match the plot order
clustering_data <- clustering_data %>%
mutate(
salary_range = factor(salary_range, levels = c(">= 195K", "131K–195K", "< 131K"))
)
# Update color mapping for salary ranges
salary_colors <- c(">= 195K" = "#a1d99b", "131K–195K" = "#e5f5e0", "< 131K" = "#fc9272")
# Plot with reversed bar order and legend at the bottom
ggplot(clustering_data, aes(x = reorder(job_category, -avg_salary), y = avg_salary, fill = salary_range)) +
geom_bar(stat = "identity", show.legend = TRUE) +
scale_fill_manual(values = salary_colors, name = "Salary Range") + # Custom colors and legend
coord_flip() +
labs(
title = "Job Titles Clustered by Salary Range",
x = "Job Title",
y = "Average Salary (USD)"
) +
theme_minimal() +
theme(
panel.grid = element_blank(),
legend.position = "bottom", # Move legend to the bottom
legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
legend.background = element_rect(fill = "white", color = NA),
plot.title = element_text(hjust = 0, size = 16)
)
The boxplot illustrates salary variations across experience levels (EN, MI, SE, EX) and company sizes (L, M, S). Key observations include:
This analysis highlights that experience level is the dominant factor in salary progression, while company size further amplifies opportunities for higher earnings. Large companies provide the best overall compensation, especially for experienced professionals.
library(ggplot2)
library(dplyr)
# Ensure salary_in_usd is numeric and prepare the data
boxplot_data <- data %>%
mutate(
salary_in_usd = as.numeric(gsub(",", "", salary_in_usd)), # Remove commas and convert to numeric
company_size = case_when(
company_size == "L" ~ "Large (L)",
company_size == "M" ~ "Medium (M)",
company_size == "S" ~ "Small (S)"
),
company_size = factor(company_size, levels = c("Large (L)", "Medium (M)", "Small (S)")),
experience_level = factor(experience_level, levels = c("EN", "MI", "SE", "EX"))
)
# Define the color scheme
company_size_colors <- c("Large (L)" = "#fc9272", # Red
"Medium (M)" = "#a1d99b", # Green
"Small (S)" = "#9ecae1") # Blue
# Create the boxplot grouped by experience level
ggplot(boxplot_data, aes(x = experience_level, y = salary_in_usd, fill = company_size)) +
geom_boxplot(
position = position_dodge(width = 0.8), # Ensure grouped boxplots
outlier.shape = NA # Remove outliers for clarity
) +
facet_wrap(~ company_size, nrow = 1) + # Create facets for company sizes
scale_fill_manual(values = company_size_colors, name = "Company Size") + # Apply colors
labs(
title = "Salary Distribution by Company Size and Experience Level",
x = "Experience Level",
y = "Salary (USD)"
) +
scale_y_continuous(
limits = c(0, 400000), # Set y-axis range to max 400K
breaks = seq(0, 400000, by = 100000), # Define y-axis ticks
labels = scales::label_number(scale = 0.001, suffix = "K") # Format Y-axis labels
) +
theme_minimal() +
theme(
panel.grid.major.x = element_blank(), # Remove vertical gridlines
panel.grid.minor.x = element_blank(),
panel.grid.major.y = element_line(color = "lightgray", linetype = "dashed", size = 0.2), # Faded gridlines
panel.grid.minor.y = element_line(color = "lightgray", linetype = "dotted", size = 0.1), # Thinner minor gridlines
legend.position = "right", # Position legend on the right
legend.title = element_text(size = 10, face = "bold"),
legend.text = element_text(size = 9),
strip.text = element_text(size = 10, face = "bold"), # Style facet labels
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
axis.text.x = element_text(size = 10),
axis.text.y = element_text(size = 10),
plot.title = element_text(hjust = 0, size = 16)
)
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 57 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
The map visualization provides a regional analysis of median salaries across the United States for data practitioners. Here are key insights:
This analysis demonstrates how geography significantly impacts earning potential for data practitioners, with high-paying opportunities concentrated in specific areas. This insight can guide job seekers to target regions that align with their career and financial goals.
# Load required libraries
library(dplyr)
library(ggplot2)
library(maps)
library(cowplot)
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
# Load the data
state_data <- read_csv('state_datapractitioner_wages.csv')
## Rows: 444 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): state, state_abrev, ownership_type, occ_title, role, employment_to...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Replace non-numeric values (e.g., "*", "**") with NA in mean_salary, employment_total, and employment_per_1000
state_data <- state_data %>%
mutate(
average_salary = ifelse(grepl("[^0-9,]", average_salary) | average_salary == "*", NA, average_salary),
employment_total = ifelse(grepl("[^0-9,]", employment_total) | employment_total == "*", NA, employment_total),
employment_per_1000 = ifelse(grepl("[^0-9.]", employment_per_1000) | employment_per_1000 == "*", NA, employment_per_1000)
)
# Remove commas and convert columns to numeric
state_data <- state_data %>%
mutate(
average_salary = as.numeric(gsub(",", "", average_salary)),
employment_total = as.numeric(gsub(",", "", employment_total)),
employment_per_1000 = as.numeric(employment_per_1000)
)
# Define neighboring states for imputation
neighbors <- list(
"MT" = c("ID", "WY", "SD", "ND"),
"MS" = c("TN", "AR", "LA", "AL"),
"OK" = c("TX", "KS", "MO", "AR"),
"KS" = c("NE", "MO", "OK", "CO"),
"WY" = c("MT", "SD", "NE", "CO", "UT", "ID")
# Add remaining states as needed
)
# Function to impute missing values based on neighboring states
impute_missing_value <- function(state_abrev, variable, state_data, neighbors_list) {
neighbor_states <- neighbors_list[[state_abrev]]
if (is.null(neighbor_states) || length(neighbor_states) == 0) {
return(NA)
}
neighbor_values <- state_data %>%
filter(state_abrev %in% neighbor_states) %>%
pull(!!sym(variable))
if (all(is.na(neighbor_values))) {
return(NA)
} else {
return(mean(neighbor_values, na.rm = TRUE))
}
}
# Apply imputation for missing values
state_data <- state_data %>%
mutate(
average_salary = ifelse(
is.na(average_salary),
sapply(state_abrev, impute_missing_value, "average_salary", state_data, neighbors),
average_salary
),
employment_total = ifelse(
is.na(employment_total),
sapply(state_abrev, impute_missing_value, "employment_total", state_data, neighbors),
employment_total
),
employment_per_1000 = ifelse(
is.na(employment_per_1000),
sapply(state_abrev, impute_missing_value, "employment_per_1000", state_data, neighbors),
employment_per_1000
)
)
# Define region mapping for each state abbreviation
region_mapping <- data.frame(
state_abrev = c("CT", "ME", "MA", "NH", "RI", "VT", "NJ", "NY", "PA", # Northeast
"IL", "IN", "MI", "OH", "WI", # Midwest
"IA", "KS", "MN", "MO", "NE", "ND", "SD", # Midwest
"DE", "FL", "GA", "MD", "NC", "SC", "VA", "WV", # South Atlantic
"AL", "KY", "MS", "TN", # East South Central
"AR", "LA", "OK", "TX", # West South Central
"AZ", "CO", "ID", "MT", "NV", "NM", "UT", "WY", # Mountain
"AK", "CA", "HI", "OR", "WA"), # Pacific
region = c("Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast", "Northeast",
"Midwest", "Midwest", "Midwest", "Midwest", "Midwest",
"Midwest", "Midwest", "Midwest", "Midwest", "Midwest", "Midwest", "Midwest",
"South", "South", "South", "South", "South", "South", "South", "South",
"South", "South", "South", "South",
"South", "South", "South", "South",
"West", "West", "West", "West", "West", "West", "West", "West",
"West", "West", "West", "West", "West")
)
# Get US state map data
states_map <- map_data("state") %>%
rename(state_name = region) %>%
mutate(state_name = tolower(state_name))
# Map state abbreviations to state names
state_abbreviation_mapping <- data.frame(
state_name = tolower(state.name),
state_abrev = state.abb
)
# Prepare data for plotting
filtered_data <- state_data %>%
group_by(state_abrev) %>%
summarise(state_mean_salary = median(average_salary, na.rm = TRUE), .groups = "drop") %>%
left_join(region_mapping, by = "state_abrev") %>%
left_join(state_abbreviation_mapping, by = "state_abrev")
map_data_with_salary <- states_map %>%
left_join(filtered_data, by = c("state_name"))
# Define custom color gradient
salary_colors <- c("#f7fcf5", "#e5f5e0", "#c7e9c0", "#a1d99b", "#74c476", "#41ab5d", "#238b45", "#006d2c", "#00441b")
# Function to plot each region
plot_region <- function(region_name) {
region_data <- map_data_with_salary %>% filter(region == region_name)
ggplot() +
geom_polygon(data = region_data, aes(x = long, y = lat, group = group, fill = state_mean_salary), color = "white") +
scale_fill_gradientn(colors = salary_colors, name = "Average Salary ($)", na.value = "gray90") +
labs(title = region_name) +
theme_minimal() +
theme(
legend.position = "none",
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5, size = 14)
) +
coord_fixed(1.3)
}
# Generate plots
plot_northeast <- plot_region("Northeast")
plot_midwest <- plot_region("Midwest")
plot_south <- plot_region("South")
plot_west <- plot_region("West")
# Extract legend
legend_plot <- ggplot() +
geom_polygon(data = map_data_with_salary %>% filter(region == "Northeast"), aes(x = long, y = lat, group = group, fill = state_mean_salary), color = "white") +
scale_fill_gradientn(colors = salary_colors, name = "Median Salary ($)", na.value = "gray90") +
theme_minimal() +
theme(
legend.position = "right",
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank()
)
legend <- get_legend(legend_plot)
## Warning in get_plot_component(plot, "guide-box"): Multiple components found;
## returning the first one. To return all, use `return_all = TRUE`.
# Arrange final plot
final_plot <- plot_grid(
plot_grid(plot_northeast, plot_west, ncol = 2, align = "h"),
plot_grid(plot_midwest, plot_south, ncol = 2, align = "h"),
ncol = 1
)
final_output <- plot_grid(final_plot, legend, ncol = 2, rel_widths = c(3, 0.6))
# Display the plot
print(final_output)
This chart categorizes U.S. states into three clusters based on average salaries for data practitioners. States with the highest salaries (Cluster 1, marked in red) are dominated by hubs like California, which reflects the demand for tech talent in major urban centers. Cluster 2 (green) represents states with competitive but mid-tier salaries, offering opportunities for professionals seeking balanced costs of living and wages. Cluster 3 (blue) highlights states with lower average salaries, often in regions where the cost of living aligns with compensation. This distribution emphasizes geographic disparities in tech salaries, reflecting local market dynamics and industry presence.
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Ensure the correct salary column is used
clustering_data <- state_data %>%
group_by(state_abrev) %>%
summarize(avg_salary = mean(average_salary, na.rm = TRUE)) %>%
ungroup()
# Perform clustering (k-means) on filtered data
set.seed(123)
kmeans_result <- kmeans(clustering_data$avg_salary, centers = 3)
clustering_data$cluster <- as.factor(kmeans_result$cluster)
# Ensure the correct order for clusters (red, lavender, green)
clustering_data <- clustering_data %>%
mutate(cluster = factor(cluster, levels = c("1", "2", "3")))
# Define custom colors for clusters
cluster_colors <- c("1" = "#de2d26", "2" = "#756bb1", "3" = "#31a354")
# Visualize clusters with dots and adjusted y-axis ticks
ggplot(clustering_data, aes(x = reorder(state_abrev, avg_salary), y = avg_salary, color = cluster)) +
geom_point(size = 2) + # Use dots for the visualization
scale_color_manual(values = cluster_colors, name = "Cluster") + # Assign custom colors
scale_y_continuous(
breaks = seq(
floor(min(clustering_data$avg_salary) / 10000) * 10000,
max(clustering_data$avg_salary),
by = 10000
),
labels = scales::label_number(scale = 0.001, suffix = "K") # Format labels
) +
labs(
title = "States Clustered by Salary Range",
x = "State Abbreviation",
y = "Average Salary (USD)",
color = "Cluster"
) +
theme_minimal() +
theme(
panel.grid.major = element_blank(), # Remove major gridlines
panel.grid.minor = element_blank(), # Remove minor gridlines
axis.text.y = element_text(size = 6), # Make y-axis tick labels bold
axis.text.x = element_text(size = 12, face = "bold"),
axis.title = element_text(size = 14),
plot.title = element_text(hjust = 0, size = 16)
) +
coord_flip() # Flip the coordinates for horizontal layout
Artificial Intelligence (AI), Machine Learning (ML), and Deep Learning (DL) consistently lead in salary trends, reflecting high demand for these advanced technical skills. While roles such as Data Science (DS) and Data Engineering (DE) demonstrate steady growth, Data Management (DM) experiences significant fluctuations. The overarching trend indicates a general rise in salaries from 2020 to 2023, with a slight dip in 2024, underscoring the competitive and evolving nature of the data industry.
library(dplyr)
library(ggplot2)
# Remove outliers using IQR method
filtered_data <- data %>%
mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
group_by(work_year) %>%
filter(
salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
) %>%
ungroup()
# Calculate average salary for each job category per year
category_trend <- filtered_data %>%
group_by(work_year, job_category) %>%
summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")
# Calculate overall average salary trend
overall_trend <- filtered_data %>%
group_by(work_year) %>%
summarize(overall_avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")
# Faceted line plot with conditional theming
ggplot(category_trend, aes(x = work_year, y = avg_salary / 10000, color = job_category)) +
geom_line(size = 1) +
geom_point(size = 2) +
geom_line(
data = overall_trend,
aes(x = work_year, y = overall_avg_salary / 10000),
color = "blue",
linetype = "dashed",
size = 1.2,
inherit.aes = FALSE
) +
facet_wrap(~job_category, scales = "free_y", ncol = 3) +
scale_y_continuous(
breaks = scales::pretty_breaks(), # Dynamically generate pretty breaks
labels = scales::label_number(scale = 1) # Format y-axis without extra "K"
) +
labs(
title = "Average Salary Trends by Job Category Over Years",
x = "Year",
y = "Average Salary x 10,000",
color = "Job Category"
) +
theme_minimal() +
theme(
strip.text = element_text(size = 10),
plot.title = element_text(hjust = 0, size = 16),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.y = element_blank(), # Remove y-axis title
panel.grid = element_blank(), # Remove all gridlines
legend.position = "none", # Remove legend
axis.text.y = element_blank() # Remove y-axis labels on specific columns
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(filtered_data, aes(x = job_category, y = salary_in_usd, fill = job_category)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
scale_y_continuous(labels = scales::label_number(scale = 0.001, suffix = "K")) +
labs(
title = "Salary Distribution of Data Practitioners by Job Category",
x = "Job Category",
y = "Salary (USD)"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none", # Removes the legend
panel.grid = element_blank() # Removes gridlines
)
# Remove outliers using the IQR method
filtered_data <- data %>%
mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
filter(
salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
)
# Calculate median salary for each job category and reorder job_category in descending order
sorted_data <- filtered_data %>%
group_by(job_category) %>%
summarize(median_salary = median(salary_in_usd, na.rm = TRUE), .groups = "drop") %>%
arrange(desc(median_salary)) %>%
mutate(job_category = factor(job_category, levels = job_category))
# Join sorted data back to the original dataset
filtered_data <- filtered_data %>%
mutate(job_category = factor(job_category, levels = levels(sorted_data$job_category)))
ggplot(filtered_data, aes(x = job_category, y = salary_in_usd, fill = job_category)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
scale_y_continuous(labels = scales::label_number(scale = 0.001, suffix = "K")) +
labs(
title = "Salary Distribution of Data Practitioners by Job Category",
x = NULL, # Removes x-axis label
y = "Salary (USD)"
) +
theme_minimal() +
theme(
axis.text.x = element_blank(), # Removes x-axis tick labels
axis.ticks.x = element_blank(), # Removes x-axis ticks
legend.position = "right", # Keeps the legend
panel.grid = element_blank(), # Removes gridlines
axis.title.x = element_blank() # Removes x-axis title
)
library(dplyr)
library(ggplot2)
# Remove outliers using IQR method
filtered_data <- data %>%
mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
group_by(work_year) %>%
filter(
salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
) %>%
ungroup()
# Calculate average salary for each job category
average_salary <- filtered_data %>%
group_by(job_category) %>%
summarize(overall_avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")
# Reorder job_category based on the overall average salary
filtered_data <- filtered_data %>%
mutate(job_category = factor(job_category, levels = average_salary %>% arrange(desc(overall_avg_salary)) %>% pull(job_category)))
# Calculate average salary for each job category per year
category_trend <- filtered_data %>%
group_by(work_year, job_category) %>%
summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop")
# Calculate overall average salary trend
overall_trend <- filtered_data %>%
group_by(work_year) %>%
summarize(avg_salary = mean(salary_in_usd, na.rm = TRUE), .groups = "drop") %>%
mutate(job_category = "Overall Average Salary")
# Combine category-specific and overall trends
combined_data <- bind_rows(category_trend, overall_trend)
# Faceted line plot with overall trend
ggplot(combined_data, aes(x = work_year, y = avg_salary / 10000, group = job_category)) +
geom_line(data = category_trend, aes(color = job_category), size = 1) +
geom_point(data = category_trend, aes(color = job_category), size = 2) +
geom_line(data = overall_trend, aes(y = avg_salary / 10000), color = "blue", linetype = "dashed", size = 1) +
scale_color_manual(
values = setNames(scales::hue_pal()(length(unique(category_trend$job_category))), unique(category_trend$job_category))
) +
scale_y_continuous(
breaks = scales::pretty_breaks(), # Dynamically generate breaks
labels = scales::label_number(accuracy = 1) # Remove decimal places
) +
facet_wrap(~job_category, scales = "free_y", ncol = 3) +
labs(
title = "Average Salary Trends by Job Category Over Years",
x = "Year",
y = "Average Salary x 10,000"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0, size = 16),
strip.text = element_text(size = 10),
legend.position = "none", # Remove legend
panel.grid.major = element_blank(), # Faded major gridlines
panel.grid.minor = element_line(color = "gray95", size = 0.01) # Faded minor gridlines
)
The boxplot illustrates salary variations across experience levels (EN, MI, SE, EX) and company sizes (L, M, S). Key observations include:
This analysis highlights that experience level is the dominant factor in salary progression, while company size further amplifies opportunities for higher earnings. Large companies provide the best overall compensation, especially for experienced professionals.
library(dplyr)
library(ggplot2)
# Remove outliers using the IQR method
filtered_data <- data %>%
mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
group_by(experience_level, company_size) %>%
filter(
salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
) %>%
ungroup()
# Set a fixed order for experience_level
filtered_data <- filtered_data %>%
mutate(experience_level = factor(experience_level, levels = c("EN", "MI", "SE", "EX")))
# Create the boxplot
ggplot(filtered_data, aes(x = experience_level, y = salary_in_usd, fill = company_size)) +
geom_boxplot(alpha = 0.7) +
facet_wrap(~ company_size, scales = "free_x") +
labs(
title = "",
x = "Experience Level",
y = "Salary (USD)",
fill = "Company Size"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(dplyr)
library(ggplot2)
library(scales)
# Remove outliers using the IQR method
filtered_data <- data %>%
mutate(salary_in_usd = as.numeric(gsub(",", "", salary_in_usd))) %>%
group_by(experience_level, company_size) %>%
filter(
salary_in_usd > quantile(salary_in_usd, 0.25) - 1.5 * IQR(salary_in_usd) &
salary_in_usd < quantile(salary_in_usd, 0.75) + 1.5 * IQR(salary_in_usd)
) %>%
ungroup()
# Set a fixed order for experience_level
filtered_data <- filtered_data %>%
mutate(experience_level = factor(experience_level, levels = c("EN", "MI", "SE", "EX")))
# Create the boxplot
ggplot(filtered_data, aes(x = experience_level, y = salary_in_usd, fill = company_size)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) + # Suppress outlier points
labs(
title = "", # Salary by Experience Level and Company Size
x = "Experience Level",
y = "Salary (USD)",
fill = "Company Size"
) +
scale_y_continuous(labels = label_number(scale = 0.001, suffix = "K")) + # Replace 000 with K
theme_minimal() +
theme(
axis.text.x = element_text(angle = 0, vjust = 0.5), # Keep Experience Level horizontal
panel.grid = element_blank() # Remove gridlines
) +
facet_grid(~ company_size, scales = "free", space = "free_x") # Arrange facets horizontally