R Final Deliverable

Introduction

This data set contains salary and employment data for the accounting industry as of May 2024. It was obtained from the U.S. Bureau of Labor Statistics.

Dataset

The data set includes annual and hourly pay data for each U.S. state, along with percentiles for each pay level. I added a separation of the states into regions, as well as the total employment.

# Load the libraries used to manipulate and plot the data.
library(dplyr)
library(ggplot2)

Findings

The following tabs present relationships and summary information drawn from the May 2024 accounting industry data.

# No shared data manipulation is done here; each tab below cleans the columns it uses.

Tab 1

This tab includes the average accounting salaries across the U.S.

library(scales)
library(ggplot2)
library(dplyr)
library(tidyr)
library(stringr)


##### BAR PLOT #####

# Read the file
mydata <- read.csv("Accounting Salary Data.csv", stringsAsFactors = FALSE)

# Annual mean wage: strip "$" and "," so the values can be converted to
# numeric. Anything still non-numeric after stripping becomes NA (with a
# warning from as.numeric()).
mydata$AnnualMeanWage <- as.numeric(
  gsub("[\\$,]", "", mydata$`Annual.mean.wage..2.`)
)

# Area name: remove the code in parentheses so only the name remains
mydata$AreaName <- gsub("\\s*\\(.*\\)", "", mydata$Area.Name)

# Upper limit for the wage axis: 10% above the largest wage, leaving room for
# the value labels drawn past the end of each bar. na.rm = TRUE keeps a single
# unparseable wage from turning the whole limit into NA.
wage_axis_max <- max(mydata$AnnualMeanWage, na.rm = TRUE) * 1.1

# Plot horizontal bars ordered by salary
ggplot(
  mydata,
  aes(x = reorder(AreaName, AnnualMeanWage), y = AnnualMeanWage)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  # Dollar label just past the end of each bar
  geom_text(aes(label = dollar(AnnualMeanWage)),
            hjust = -0.1, size = 3.5) +
  scale_y_continuous(labels = dollar_format()) +
  labs(
    x = "State",
    y = "Average Annual Salary",
    title = "Average Accounting Salaries Across the US"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  expand_limits(y = wage_axis_max)

Tab 2

This tab shows the hourly wage percentiles for the 10 states with the highest mean hourly wage.

# Remove codes in parentheses from Area.Name to get State names
mydata$State <- gsub("\\s*\\(.*\\)", "", mydata$Area.Name)

# Change 'District of Columbia' to 'D.C.'
mydata$State <- gsub("District of Columbia", "D.C.", mydata$State)

# Clean numeric columns (remove $ and commas, convert to numeric)
cols_to_clean <- c(
  "Hourly.mean.wage",
  "Hourly.10th.percentile.wage",
  "Hourly.25th.percentile.wage",
  "Hourly.median.wage",
  "Hourly.75th.percentile.wage",
  "Hourly.90th.percentile.wage"
)

mydata[cols_to_clean] <- lapply(mydata[cols_to_clean], function(x) {
  as.numeric(gsub("[\\$,]", "", x))
})

# Top 10 rows by hourly mean wage
top10 <- mydata %>%
  arrange(desc(Hourly.mean.wage)) %>%
  slice(1:10)

# Reshape to long format: one row per State and percentile column
top10_long <- top10 %>%
  select(State,
         Hourly.10th.percentile.wage,
         Hourly.25th.percentile.wage,
         Hourly.median.wage,
         Hourly.75th.percentile.wage,
         Hourly.90th.percentile.wage) %>%
  pivot_longer(-State, names_to = "Percentile", values_to = "Wage")

# Legend labels keyed by the raw column names, listed in ascending percentile
# order. Mapping whole column names (rather than trimming them with a chain of
# regexes) guarantees the median column is labelled "Median".
percentile_labels <- c(
  "Hourly.10th.percentile.wage" = "10th",
  "Hourly.25th.percentile.wage" = "25th",
  "Hourly.median.wage" = "Median",
  "Hourly.75th.percentile.wage" = "75th",
  "Hourly.90th.percentile.wage" = "90th"
)

# A factor with explicit levels makes the legend follow percentile order
# instead of alphabetical order (which would put "Median" last).
top10_long$Percentile <- factor(
  top10_long$Percentile,
  levels = names(percentile_labels),
  labels = unname(percentile_labels)
)

# Plot
# States are ordered by their largest percentile wage. The segments are
# stacked, so the full bar height is the sum of the five percentile wages.
ggplot(
  top10_long,
  aes(x = reorder(State, Wage, max), y = Wage, fill = Percentile)
) +
  geom_col() +
  # Add $ labels inside each stacked segment
  geom_text(aes(label = dollar(Wage)),
            position = position_stack(vjust = 0.5),
            size = 3, color = "black") +
  labs(
    x = "State",
    y = "Hourly Wage ($)",
    title = "Hourly Wage Percentiles in Top 10 Highest-Wage States",
    fill = "Percentile"
  ) +
  theme_minimal() +
  # Center the title
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  ) +
  scale_y_continuous(labels = dollar_format())  # Format axis with $ signs

Tab 3

This tab shows the total employment for each state in the U.S.

# Make the Employment column numeric (strip thousands separators)
mydata$`Employment..1.` <- as.numeric(gsub(",", "", mydata$`Employment..1.`))

# Calculate total employment across all rows
total_employment <- sum(mydata$`Employment..1.`, na.rm = TRUE)

# Create a one-row data frame for the total
total_row <- data.frame(
  `AreaName` = "United States (Total)",
  `Employment..1.` = total_employment
)

# Add the total row to a copy of the dataset; every column other than
# AreaName and Employment..1. is NA in the added row.
mydata_with_total <- bind_rows(mydata, total_row)

# Write the dataset that actually contains the total row, so the file matches
# its name and the confirmation message below.
write.csv(mydata_with_total, "Accounting Salary Data Cleaned_with_total.csv")

# Print confirmation
cat("A new file 'Accounting Salary Data Cleaned_with_total.csv' has been created with the total employment row added.\n")

## A new file 'Accounting Salary Data Cleaned_with_total.csv' has been created with the total employment row added.

# Summarize total employment per state. This uses mydata (without the total
# row) so the national total does not dominate the colour scale.
state_employment <- mydata %>%
  group_by(`AreaName`) %>%
  summarise(total_employment = sum(`Employment..1.`, na.rm = TRUE)) %>%
  arrange(desc(total_employment))


# Create the heat map: one tile per area, ordered and shaded by employment
ggplot(state_employment, aes(x = reorder(`AreaName`, total_employment),
                             y = 1, fill = total_employment)) +
  geom_tile(color = "white") +
  # Employment count printed on each tile
  geom_text(aes(label = scales::comma(total_employment)),
            color = "white",
            fontface = "bold",
            size = 3) +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  coord_flip() +
  theme_minimal() +
  labs(title = "Heat Map of Employment Across Each State",
       x = "State",
       y = "",
       fill = "Employment") +
  theme(plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
        axis.text.y = element_text(size = 10),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

Tab 4

This tab shows the relationship between the total employment and the average salary by state.

# Make the Employment column numeric
mydata$Employment..1. <- as.numeric(gsub(",", "", mydata$Employment..1.))

# Make the annual mean wage numeric as well. Plotting the raw column would put
# text values such as "$85,000" on a discrete axis instead of a wage scale.
mydata$AnnualMeanWage <- as.numeric(
  gsub("[\\$,]", "", mydata$`Annual.mean.wage..2.`)
)

# Bubble chart of Annual Mean Wage vs Employment
# Point size repeats the employment value; color identifies the state.
ggplot(mydata, aes(x = `Employment..1.`, y = AnnualMeanWage,
                   size = `Employment..1.`, color = `State`)) +
  geom_point(alpha = 0.6) +
  scale_size(range = c(2, 12)) +
  # Readable axis labels, matching the formatting used in the other tabs
  scale_x_continuous(labels = scales::comma) +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_minimal() +
  labs(title = "Relationship Between Employment and Average Wage by State",
       x = "Employment",
       y = "Annual Mean Wage",
       size = "Employment",
       color = "State") +
  theme(plot.title = element_text(hjust = 0.1, size = 16, face = "bold"))

Tab 5

This is a chart showing the average hourly wage by region in the U.S.

# Read the data
# NOTE(review): assumes this file has State and Hourly.mean.wage columns --
# confirm against the CSV.
regiondata <- read.csv("Accounting Salary Data Cleaned.csv")

# Create a mapping of states to Census regions
northeast <- c("Connecticut", "Maine", "Massachusetts", "New Hampshire",
               "Rhode Island", "Vermont", "New Jersey", "New York",
               "Pennsylvania")

midwest <- c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
             "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota",
             "Wisconsin")

# "D.C." is listed next to "District of Columbia" because the state names are
# abbreviated that way elsewhere in this report; either spelling maps to South.
south <- c("Alabama", "Arkansas", "Delaware", "District of Columbia", "D.C.",
           "Florida", "Georgia", "Kentucky", "Louisiana", "Maryland",
           "Mississippi", "North Carolina", "Oklahoma", "South Carolina",
           "Tennessee", "Texas", "Virginia", "West Virginia")

west <- c("Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
          "Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington",
          "Wyoming")

# Add the Region column
regiondata <- regiondata %>%
  mutate(Region = case_when(
    `State` %in% northeast ~ "Northeast",
    `State` %in% midwest ~ "Midwest",
    `State` %in% south ~ "South",
    `State` %in% west ~ "West",
    TRUE ~ "Other"  # For territories or unmatched areas
  ))

# Save the updated dataset (optional)
write.csv(regiondata, "Accounting Salary Data Cleaned_with_Regions.csv")

# Make the Average Hourly Wage column numeric
regiondata <- regiondata %>%
  mutate(`Hourly.mean.wage` = as.numeric(gsub("[$,]", "", `Hourly.mean.wage`)))

# Summarize average hourly wage by region (unweighted mean of the rows in
# each region)
region_wages <- regiondata %>%
  group_by(Region) %>%
  summarise(avg_hourly_wage = mean(`Hourly.mean.wage`, na.rm = TRUE))

# Create label text (Region + average wage). sprintf() always prints two
# decimals, so a value like 45.1 is shown as "$45.10" rather than "$45.1".
region_wages <- region_wages %>%
  mutate(label = sprintf("%s\n$%.2f", Region, avg_hourly_wage))

# Create pie chart: a single stacked bar wrapped into polar coordinates
ggplot(region_wages, aes(x = "", y = avg_hourly_wage, fill = Region)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  theme_void() +
  # Place each label in the middle of its slice
  geom_text(aes(label = label),
            position = position_stack(vjust = 0.5),
            size = 4) +
  labs(title = "Average Hourly Wage by U.S. Region", fill = "Region") +
  theme(plot.title = element_text(hjust = -0.8, size = 16, face = "bold"))