#load libraries
library (tidyr)
library(tidyverse)
library(dplyr)
#visual plots
library(ggplot2)
#to use several colorblind-accessible palettes
library(RColorBrewer)
library(viridis)
# For comma formatting for Salary
library(scales)
# Data loading ----
# Read the salary-by-state table published in the course repository.
salaries_state <- read.csv(
  "https://raw.githubusercontent.com/datanerddhanya/DATA608/refs/heads/main/SalariesByState.csv"
)
# Bring the built-in state lookup vectors (state.name, state.abb) into scope.
data("state")
# Strip the dollar sign and thousands separator from every pay column so the
# values parse as numbers, then append the two-letter state abbreviation.
salaries_state <- salaries_state %>%
  mutate(
    across(
      c(Annual.Salary, Monthly.Pay, Weekly.Pay, Hourly.Wage),
      function(pay) as.numeric(gsub("\\$|,", "", pay))
    ),
    State.Abbr = state.abb[match(State, state.name)]
  )
# Sort rows by ascending annual salary, then freeze that order into the factor
# levels so plots list states and jobs in order of first appearance.
salaries_state <- arrange(salaries_state, Annual.Salary)
salaries_state <- mutate(
  salaries_state,
  State = factor(State, levels = unique(State)), # states in sorted-row order
  Job = factor(Job, levels = unique(Job))        # jobs in sorted-row order
)
# Median annual salary per job title (missing salaries are ignored)
job_medians <- summarise(
  group_by(salaries_state, Job),
  median_salary = median(Annual.Salary, na.rm = TRUE)
)
# Top two states by annual salary within each job; ties at the cutoff are kept.
# slice_max() replaces the superseded top_n(), whose positional argument order
# (x, n, wt) made the original call easy to misread. Rows come back sorted by
# descending salary within each job.
top_states <- salaries_state %>%
  group_by(Job) %>%
  slice_max(Annual.Salary, n = 2, with_ties = TRUE) %>%
  ungroup()
# Mean annual salary per job. na.rm = TRUE matches the median calculation in
# job_medians, so one unparseable salary does not turn a group mean into NA.
mean_values <- salaries_state %>%
  group_by(Job) %>%
  summarise(mean.salary = mean(Annual.Salary, na.rm = TRUE))
# Mean annual salary per state (averaged over the jobs listed for that state)
mean_values_state <- salaries_state %>%
  group_by(State) %>%
  summarise(mean.salary = mean(Annual.Salary, na.rm = TRUE))
I am presenting this data through a heatmap and box/violin plots to show salary by role descriptor and state within a single chart.
# Heatmap: one tile per (role, state) pair, shaded by annual salary.
# States on the y-axis are ordered by salary via reorder(..., decreasing = TRUE).
salary_heatmap <- ggplot(
  data = salaries_state,
  mapping = aes(
    x = Job,
    y = reorder(State, Annual.Salary, decreasing = TRUE)
  )
) +
  geom_tile(mapping = aes(fill = Annual.Salary), color = "white") +
  scale_fill_viridis(name = "Average Annual Salary in USD") +
  labs(
    title = "Heatmap of Average Annual Salary in USD by Role descriptor and State",
    x = "Role Descriptor",
    y = "U.S State"
  ) +
  theme_minimal() +
  # Drop the grid lines and pad the axis labels away from the tiles
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(size = 8, margin = margin(t = 10, r = 0, b = 0, l = 0)),
    axis.text.y = element_text(size = 8, margin = margin(t = 0, r = 10, b = 0, l = 0))
  )
# Display the plot
salary_heatmap
# Save as an image so it can be embedded in the PDF
ggsave("salary_heatmap.svg", plot = salary_heatmap, width = 12, height = 20)
# Re-level Job so its factor levels run from lowest to highest median salary
salaries_state$Job <- reorder(
  salaries_state$Job,
  salaries_state$Annual.Salary,
  FUN = median
)
# Violin plot: distribution of annual salary for each role, with the individual
# rows of salaries_state overlaid as horizontally jittered points.
# No fill scale is added here: nothing in this plot maps the `fill` aesthetic,
# so a fill scale (and its "State" legend title) would have no effect.
ggplot(salaries_state, aes(x = Job, y = Annual.Salary)) +
  geom_violin(trim = TRUE, drop = FALSE) +
  geom_jitter(height = 0, width = 0.06) + # jitter sideways only; salaries stay exact
  labs(
    title = "Average Annual Salary Distribution by Role Descriptor and State",
    x = "Role Descriptor",
    y = "Average Annual Salary ($)"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # rotate x labels for readability
# Violin plot with State mapped to both outline colour and fill.
# Rows in top_states are labelled with their state name in black text.
salary_violin_by_state <- ggplot(
  data = salaries_state,
  mapping = aes(x = Job, y = Annual.Salary, color = State, fill = State)
) +
  # Semi-transparent violins
  geom_violin(trim = TRUE, drop = FALSE, alpha = 0.5) +
  # Individual observations, jittered sideways only
  geom_jitter(height = 0, width = 0.06, size = 1.5) +
  # State-name labels for the rows held in top_states
  geom_text(
    data = top_states,
    mapping = aes(label = State),
    vjust = -1.5,
    color = "black",
    size = 3.5
  ) +
  labs(
    title = "Average Annual Salary(in USD) Distribution by Role Descriptor and State",
    subtitle = "Top 2 states mentioned for each role",
    x = "Role Descriptor",
    y = "Average Annual Salary in USD"
  ) +
  # Tilt the x-axis labels for readability
  theme(axis.text.x = element_text(angle = 25, hjust = 1))
# Display the plot, then write it to disk
salary_violin_by_state
ggsave("salary_boxplot.png", plot = salary_violin_by_state, width = 12, height = 15)
# Stacked horizontal bar chart: one bar per state, one segment per role.
# States (y) and roles (fill) are both ordered with reorder() on Annual.Salary.
ggplot(
  salaries_state,
  aes(
    x = Annual.Salary,
    y = reorder(State, Annual.Salary),
    fill = reorder(Job, Annual.Salary)
  )
) +
  # geom_col() draws bars from the values as given (identity stat). The former
  # `height = 3` argument is not a geom_bar() parameter, so it has been dropped.
  geom_col() +
  geom_text(
    aes(label = scales::comma(Annual.Salary)),
    position = position_stack(vjust = 0.5), # centre each label in its segment
    size = 2                                # small text so labels fit the segments
  ) +
  labs(
    title = "Average Annual Salary(in USD) Distribution by Role Descriptor and State",
    x = "Total Annual Salary(in USD)",
    y = "U.S State"
  ) +
  theme_minimal() +
  scale_fill_viridis_d(option = "viridis", name = "Role Descriptor") + # colour-blind friendly palette
  scale_x_continuous(labels = comma) + # thousands separators on the x-axis
  theme(
    axis.text.y = element_text(size = 8, margin = margin(r = 1)) # shrink y labels and add spacing
  )
# Save the chart as an image
ggsave("salary_stackedbarchart.svg", width = 12, height = 15)
Based on the data analysis (using ZipRecruiter website data) and the visualizations prepared, the following conclusions can be drawn:
• The states with the highest salaries across all roles are Washington, Alaska, Massachusetts and New York. (Heatmap)
• The states with the lowest salaries across the 4 roles are Florida, West Virginia, Louisiana and Arkansas. (Heatmap)
• With salary sorted by state and role, I see decreasing variation, spread and density from left to right. (Stacked bar chart)
• It is interesting to observe that Alaska offers the highest salary for a Data Architect and New York offers the highest salary for a Data Scientist, yet neither offers the highest salary for the other data roles. (Stacked bar chart)
• Data Analyst ($77,604) is the lowest-paying role on average. (mean values)
• Data Architect ($134,540) is the highest-paying role on average. (mean values)
• Florida is the lowest-paying state across the 4 roles ($83,909). (mean values state)
• Washington is the highest-paying state across the 4 roles ($126,029). (mean values state)
• Because average salary differs by about 10% between roles and also varies appreciably by state, both role and state contribute to the variation in salary.