setup

QCEW 2023

clean

pull data

# Source: Long Island FIPS codes
nassau_data <- qcewGetAreaData("2023", "a", "36059")
suffolk_data <- qcewGetAreaData("2023", "a", "36103")

# Preview
# head(nassau_data)
# head(suffolk_data)
# unique(nassau_data$own_code)

industry_titles_url <- "https://data.bls.gov/cew/doc/titles/industry/industry_titles.csv"
industry_titles <- read.csv(industry_titles_url, stringsAsFactors = FALSE)

head(industry_titles)
##   industry_code                    industry_title
## 1            10          10 Total, all industries
## 2           101               101 Goods-producing
## 3          1011 1011 Natural resources and mining
## 4          1012                 1012 Construction
## 5          1013                1013 Manufacturing
## 6           102             102 Service-providing
# Join on industry_code
suffolk_labeled <- dplyr::left_join(suffolk_data, industry_titles, by = "industry_code")
nassau_labeled <- dplyr::left_join(nassau_data, industry_titles, by = "industry_code")

head(suffolk_labeled[, c("industry_code", "industry_title", "annual_avg_emplvl")])
##   industry_code                            industry_title annual_avg_emplvl
## 1            10                  10 Total, all industries            664266
## 2            10                  10 Total, all industries             11038
## 3           102                     102 Service-providing             11038
## 4          1021 1021 Trade, transportation, and utilities              4172
## 5          1024   1024 Professional and business services                73
## 6          1025        1025 Education and health services              1937
head(nassau_labeled[, c("industry_code", "industry_title", "annual_avg_emplvl")])
##   industry_code                            industry_title annual_avg_emplvl
## 1            10                  10 Total, all industries            624941
## 2            10                  10 Total, all industries              5381
## 3           102                     102 Service-providing              5381
## 4          1021 1021 Trade, transportation, and utilities              4393
## 5          1024   1024 Professional and business services                16
## 6          1026              1026 Leisure and hospitality                17

join

li_data <- bind_rows(nassau_labeled, suffolk_labeled)

parse NAICS

li_data <- li_data %>%
    mutate(
    agglvl_category = case_when(
      agglvl_code == 70 ~ "County, Total Covered",
      agglvl_code == 71 ~ "County, Total -- by ownership sector",
      agglvl_code == 72 ~ "County, by Domain -- by ownership sector",
      agglvl_code == 73 ~ "County, by Supersector -- by ownership sector",
      agglvl_code == 74 ~ "County, NAICS Sector -- by ownership sector",
      agglvl_code == 75 ~ "County, NAICS 3-digit -- by ownership sector",
      agglvl_code == 76 ~ "County, NAICS 4-digit -- by ownership sector",
      agglvl_code == 77 ~ "County, NAICS 5-digit -- by ownership sector",
      agglvl_code == 78 ~ "County, NAICS 6-digit -- by ownership sector",
      TRUE ~ "Other or Unknown"
    )
  ) %>% 
  mutate(
    naics_2d = case_when(
      agglvl_code == 74 ~ industry_code,
      agglvl_code >= 75 ~ str_sub(industry_code, 1, 2),
      TRUE ~ NA_character_
    ),
    naics_3d = case_when(
      agglvl_code == 75 ~ industry_code,
      agglvl_code >= 76 ~ str_sub(industry_code, 1, 3),
      TRUE ~ NA_character_
    ),
    naics_4d = case_when(
      agglvl_code == 76 ~ industry_code,
      agglvl_code >= 77 ~ str_sub(industry_code, 1, 4),
      TRUE ~ NA_character_
    ),
    naics_5d = case_when(
      agglvl_code == 77 ~ industry_code,
      agglvl_code == 78 ~ str_sub(industry_code, 1, 5),
      TRUE ~ NA_character_
    ),
    naics_6d = case_when(
      agglvl_code == 78 ~ industry_code,
      TRUE ~ NA_character_
    )
  )

summarize and visualize

by total jobs

by 3 and 4 digit NAICS

# Filter and categorize based on agglvl_code
li_data_3d4d <- li_data %>%
  filter(agglvl_code %in% c(75, 76))  # Keep only 3-digit and 4-digit codes

# Summarize data for the chart
li_sum_4d <- li_data_3d4d %>%
  filter(is.na(naics_4d)==FALSE) %>% 
  group_by(naics_3d, naics_4d, industry_title) %>%
  summarise(
    total_jobs = sum(annual_avg_emplvl, na.rm = TRUE),
    total_wages_Ms = sum(total_annual_wages, na.rm = TRUE)/1000000,
    avg_salary = mean(avg_annual_pay),
    .groups = "drop"
  )
# Prepare the top 10 3-digit codes based on total jobs
top_3d_codes <- li_sum_4d %>%
  group_by(naics_3d) %>%
  summarise(total_jobs_3d = sum(total_jobs, na.rm = TRUE), .groups = "drop") %>%
  slice_max(total_jobs_3d, n = 10)

# Filter only those 4-digit industries within the top 3-digit groups
jobs_plot_data <- li_sum_4d %>%
  filter(naics_3d %in% top_3d_codes$naics_3d) %>%
  left_join(top_3d_codes, by = "naics_3d") %>%
  mutate(
    naics_3d = fct_reorder(naics_3d, total_jobs_3d),  # correctly order 3-digit groups
    hover_label = paste0(
      "<b>Industry:</b> ", industry_title, "<br>",
      "<b>Total Jobs:</b> ", formatC(total_jobs, big.mark = ","), "<br>",
      "<b>Total Wages (Ms):</b> $", formatC(total_wages_Ms, big.mark = ","), "<br>",
      "<b>Average Wage:</b> $", format(round(avg_salary), big.mark = ",", scientific = FALSE)
    ) 
  )

# get 3d titles
naics_3d_titles <- li_data_3d4d %>%
  filter(is.na(naics_4d)) %>% 
  filter(naics_3d %in% top_3d_codes$naics_3d) %>%
  distinct(naics_3d, industry_title) %>%  
  rename("title_3d" = industry_title) %>% 
  select(naics_3d, title_3d)

jobs_plot_data <- jobs_plot_data %>%
  left_join(naics_3d_titles, by = "naics_3d") %>% 
  mutate(naics_3d = factor(naics_3d, levels = top_3d_codes$naics_3d))



#plot
p <- ggplot(jobs_plot_data, aes(
  x = naics_3d,  # Order by jobs but keep codes as values
  y = total_jobs,
  fill = industry_title,               # 4-digit colors
  text = hover_label
)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(
    labels = setNames(jobs_plot_data$title_3d, jobs_plot_data$naics_3d)
  ) +
  labs(
    title = "Jobs in LI Indutry by 3 and 4 digit NAICS Codes",
    x = "Industry (3-Digit NAICS)",
    y = "Total Jobs",
    fill = "4-Digit Industry"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",  # Remove legend
  ) 

ggplotly(p, tooltip = "text", width = 1000, height = 1000)

by total wages

by 3 and 4 digit NAICS

# Filter and categorize based on agglvl_code
li_data_3d4d <- li_data %>%
  filter(agglvl_code %in% c(75, 76))  # Keep only 3-digit and 4-digit codes

# Summarize data for the chart
li_sum_4d <- li_data_3d4d %>%
  filter(is.na(naics_4d)==FALSE) %>% 
  group_by(naics_3d, naics_4d, industry_title) %>%
  summarise(
    total_jobs = sum(annual_avg_emplvl, na.rm = TRUE),
    total_wages_Ms = sum(total_annual_wages, na.rm = TRUE)/1000000,
    avg_salary = mean(avg_annual_pay),
    .groups = "drop"
  )
# Prepare the top 10 3-digit codes based on total jobs
top_3d_codes <- li_sum_4d %>%
  group_by(naics_3d) %>%
  summarise(total_wages_Ms_3d = sum(total_wages_Ms, na.rm = TRUE), .groups = "drop") %>%
  slice_max(total_wages_Ms_3d, n = 10)

# Filter only those 4-digit industries within the top 3-digit groups
wages_plot_data <- li_sum_4d %>%
  filter(naics_3d %in% top_3d_codes$naics_3d) %>%
  left_join(top_3d_codes, by = "naics_3d") %>%
  mutate(
    naics_3d = fct_reorder(naics_3d, total_wages_Ms_3d),  # correctly order 3-digit groups
    hover_label = paste0(
      "<b>Industry:</b> ", industry_title, "<br>",
      "<b>Total Wages (Ms):</b> $ ", formatC(total_wages_Ms, big.mark = "."), "<br>",
      "<b>Average Wage:</b> $", format(round(avg_salary), big.mark = ",", scientific = FALSE), "<br>",
      "<b> Total Jobs:</b> ", formatC(total_jobs, big.mark = ",") 
    ) 
  )

# get 3d titles
naics_3d_titles <- li_data_3d4d %>%
  filter(is.na(naics_4d)) %>% 
  filter(naics_3d %in% top_3d_codes$naics_3d) %>%
  distinct(naics_3d, industry_title) %>%  
  rename("title_3d" = industry_title) %>% 
  select(naics_3d, title_3d)

wages_plot_data <- wages_plot_data %>%
  left_join(naics_3d_titles, by = "naics_3d") %>% 
  mutate(naics_3d = factor(naics_3d, levels = top_3d_codes$naics_3d))



#plot
p <- ggplot(wages_plot_data, aes(
  x = naics_3d,  # Order by jobs but keep codes as values
  y = total_wages_Ms,
  fill = industry_title,               # 4-digit colors
  text = hover_label
)) +
  geom_bar(stat = "identity") +
  scale_x_discrete(
    labels = setNames(wages_plot_data$title_3d, wages_plot_data$naics_3d)
  ) +
  labs(
    title = "Top 5 LI Industries by Highest Aggregate Wages (3 and 4 digit NAICS Codes)",
    x = "Industry (3-Digit NAICS)",
    y = "Total Wages (Millions of $)",
    fill = "4-Digit Industry"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none",  # Remove legend
  ) 

ggplotly(p, tooltip = "text", width = 1000, height = 1000)

QCEW time series

BLS API pull

Notes: - NAICS codes For periods from 2011 to 2016, QCEW data were coded using the 2012 version of NAICS. For periods from 2017 to 2021, QCEW data were coded using the 2017 version of NAICS. QCEW data from 2022-forward are and will be coded using the 2022 version of NAICS. reference - csv urls are available back to 2015

# Define years to pull (2015 through 2023)
years_to_pull <- 2015:2023

# Function to safely get and process data for a single year
get_li_year_data <- function(year) {
  message("Processing year: ", year)
  
  tryCatch({
    # Get county data
    nassau <- qcewGetAreaData(as.character(year), "a", "36059")
    suffolk <- qcewGetAreaData(as.character(year), "a", "36103")
    
    # Get industry titles (only need to do this once, but included here for robustness)
    industry_titles <- read.csv("https://data.bls.gov/cew/doc/titles/industry/industry_titles.csv", 
                              stringsAsFactors = FALSE)
    
    # Process and combine
    bind_rows(
      left_join(nassau, industry_titles, by = "industry_code") %>% mutate(county = "Nassau"),
      left_join(suffolk, industry_titles, by = "industry_code") %>% mutate(county = "Suffolk")
    ) %>% 
      mutate(year = year) %>% 
      return()
  }, error = function(e) {
    warning(paste("Failed for year", year, ":", e$message))
    return(NULL)
  })
}

# Get all years (with progress bar)
library(purrr)
all_years_data <- map_dfr(years_to_pull, ~{
  get_li_year_data(.x)
}, .id = "year_id")

# Add NAICS parsing (your existing code)
final_data <- all_years_data %>%
  mutate(
    agglvl_category = case_when(
      agglvl_code == 70 ~ "County, Total Covered",
      agglvl_code == 71 ~ "County, Total -- by ownership sector",
      agglvl_code == 72 ~ "County, by Domain -- by ownership sector",
      agglvl_code == 73 ~ "County, by Supersector -- by ownership sector",
      agglvl_code == 74 ~ "County, NAICS Sector -- by ownership sector",
      agglvl_code == 75 ~ "County, NAICS 3-digit -- by ownership sector",
      agglvl_code == 76 ~ "County, NAICS 4-digit -- by ownership sector",
      agglvl_code == 77 ~ "County, NAICS 5-digit -- by ownership sector",
      agglvl_code == 78 ~ "County, NAICS 6-digit -- by ownership sector",
      TRUE ~ "Other or Unknown"
    ),
    naics_2d = case_when(
      agglvl_code == 74 ~ industry_code,
      agglvl_code >= 75 ~ str_sub(industry_code, 1, 2),
      TRUE ~ NA_character_
    ),
    naics_3d = case_when(
      agglvl_code == 75 ~ industry_code,
      agglvl_code >= 76 ~ str_sub(industry_code, 1, 3),
      TRUE ~ NA_character_
    ),
    naics_4d = case_when(
      agglvl_code == 76 ~ industry_code,
      agglvl_code >= 77 ~ str_sub(industry_code, 1, 4),
      TRUE ~ NA_character_
    ),
    naics_5d = case_when(
      agglvl_code == 77 ~ industry_code,
      agglvl_code == 78 ~ str_sub(industry_code, 1, 5),
      TRUE ~ NA_character_
    ),
    naics_6d = case_when(
      agglvl_code == 78 ~ industry_code,
      TRUE ~ NA_character_
    )
  )

# Save the combined data
saveRDS(final_data, "long_island_qcew_2015_2023.rds")

aggregate data by year and industry

# Aggregate to 3-digit NAICS level
stacked_data <- final_data %>%
  filter(
    agglvl_code == 75,        # 3-digit NAICS level
    # own_code == 5,            # All ownership sectors (or use 5 for private only)
    !is.na(naics_3d)          # Exclude NA values
  ) %>%
  group_by(year, naics_2d, naics_3d, industry_title) %>%
  summarize(
    total_employment = sum(annual_avg_emplvl, na.rm = TRUE),
    .groups = "drop"
  )

stacked_data_2d <- final_data %>%
  filter(
    agglvl_code == 74,        # 2-digit NAICS level
    # own_code == 5,            # All ownership sectors (or use 5 for private only)
    !is.na(naics_2d)          # Exclude NA values
  ) %>%
  group_by(year, naics_2d, industry_title) %>%
  summarize(
    total_employment = sum(annual_avg_emplvl, na.rm = TRUE),
    avg_salary = mean(avg_annual_pay, na.rm = TRUE),
    .groups = "drop"
  ) 

growth of key industries

# 1. Define consistent industry names and colors
industry_colors <- c(
  "Manufacturing" = "#ED4E25",
  "Utilities" = "#F67A2B",
  "Construction" = "#F0BC4A",
  "Healthcare" = "#926EA4",  # Simplified from "Health care and Social Assistance"
  "Information" = "#6FBACD",
  "Professional Services" = "#133D66"  # Added for NAICS 54
)

# 2. Filter and calculate growth
growth_data <- final_data %>%
  filter(
    agglvl_code == 74,
    naics_2d %in% c("31-33", "22", "23", "51", "54", "62")  # All relevant codes
  ) %>%
  mutate(
    industry_name = case_when(
      naics_2d %in% c("31", "32", "33") ~ "Manufacturing",
      naics_2d == "22" ~ "Utilities",
      naics_2d == "23" ~ "Construction",
      naics_2d == "51" ~ "Information",
      naics_2d == "54" ~ "Professional Services",
      naics_2d == "62" ~ "Healthcare",
      TRUE ~ "Other"
    )
  ) %>%
  filter(industry_name != "Other") %>%  # Remove unmapped industries
  group_by(year, industry_name) %>%
  summarize(total_employment = sum(annual_avg_emplvl, na.rm = TRUE)) %>%
  group_by(industry_name) %>%
  mutate(
    growth_pct = (total_employment / lag(total_employment) - 1) * 100
  ) %>%
  filter(!is.na(growth_pct))
ggplot(growth_data, aes(
  x = year, 
  y = growth_pct, 
  color = factor(industry_name, levels = names(industry_colors))  # Force order
)) +
  geom_line(linewidth = 1.5) +
  geom_point(size = 3) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "#625B61") +  # Neutral gray
  
  # Apply RPA colors
  scale_color_manual(
    values = industry_colors) +
  # Formatting
  scale_y_continuous(
    labels = scales::percent_format(scale = 1),
    limits = c(-25, 25)  # Adjust based on your data range
  ) +
  labs(
    title = "Long Island Target Industries: Employment Growth (YoY %)",
    subtitle = "Private Sector | 2016-2023 | RPA Industry Focus",
    x = "Year",
    y = "Year-over-Year Growth (%)",
    color = NULL
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "bottom",
    legend.text = element_text(size = 9),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(face = "bold", color = "#133D66")  # RPA navy for title
  ) 

ACS - Demo Breakdown

PUMS

pull data

filter

geography

# Define PUMAs for Nassau and Suffolk counties
nassau_pumas <- c("03201", "03202", "03203", "03204", "03205", "03206", "03207", "03208", "03209", "03210", "03211", "03212")
suffolk_pumas <- c("03301", "03302", "03303", "03304", "03305", "03306", "03307", "03308", "03309", "03310", "03311", "03312", "03313")

# Filter for Nassau County
nassau_pums <- pums_data %>%
  mutate(PUMA = str_pad(as.character(PUMA), 5, pad = "0")) %>%
  filter(PUMA %in% nassau_pumas)

# Filter for Suffolk County
suffolk_pums <- pums_data %>%
  mutate(PUMA = str_pad(as.character(PUMA), 5, pad = "0")) %>%
  filter(PUMA %in% suffolk_pumas)

# Combine Nassau and Suffolk PUMS into one dataset
li_pums <- bind_rows(nassau_pums, suffolk_pums)

industry

PUMS 2023 Data Dictionary

NOTE: This is filtering for healthcare industry right now as a stand-in, but can be adapted to any NAICS code.

# NOTE:
# Employment status recode (ESR)
# b .N/A (less than 16 years old)
# 1 .Civilian employed, at work
# 2 .Civilian employed, with a job but not at work
# 3 .Unemployed
# 4 .Armed forces, at work
# 5 .Armed forces, with a job but not at work
# 6 .Not in labor force

# Filter to employed persons only
pums_filtered <- li_pums %>%
  filter(ESR %in% c(1, 2))


pums_filtered %>% 
  summarise(na_count = sum(is.na(NAICSP)))
## # A tibble: 1 × 1
##   na_count
##      <int>
## 1        0
# Example: filter to health care & social assistance (NAICS 62)
industry_workers <- pums_filtered %>%
  filter(str_starts(NAICSP, "62"))

Demos

age

age_table <- industry_workers %>%
  mutate(age_group = case_when(
    AGEP >= 18 & AGEP <= 24 ~ "18–24",
    AGEP >= 25 & AGEP <= 34 ~ "25–34",
    AGEP >= 35 & AGEP <= 44 ~ "35–44",
    AGEP >= 45 & AGEP <= 54 ~ "45–54",
    AGEP >= 55 & AGEP <= 64 ~ "55–64",
    AGEP >= 65 ~ "65+",
    TRUE ~ "Under 18"
  )) %>%
  count(age_group, wt = PWGTP) %>%
  arrange(desc(n))

age_table
## # A tibble: 7 × 2
##   age_group     n
##   <chr>     <dbl>
## 1 25–34     55867
## 2 55–64     51422
## 3 35–44     48823
## 4 45–54     44178
## 5 65+       23201
## 6 18–24     19370
## 7 Under 18    375
# Create the bar chart using ggplot2
ggplot(age_table, aes(x = age_group, y = n, fill = age_group)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  scale_fill_manual(values = rpa_colors) + 
  labs(
    title = "Age Breakdown of Industry Workers",
    x = "Age Group",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels for readability
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

education

# Recode SCHL values to human-readable labels
edu_table <- industry_workers %>%
  mutate(education = case_when(
    SCHL == "01" ~ "No schooling completed",
    SCHL == "02" ~ "Nursery school, preschool",
    SCHL == "03" ~ "Kindergarten",
    SCHL == "04" ~ "Grade 1",
    SCHL == "05" ~ "Grade 2",
    SCHL == "06" ~ "Grade 3",
    SCHL == "07" ~ "Grade 4",
    SCHL == "08" ~ "Grade 5",
    SCHL == "09" ~ "Grade 6",
    SCHL == "10" ~ "Grade 7",
    SCHL == "11" ~ "Grade 8",
    SCHL == "12" ~ "Grade 9",
    SCHL == "13" ~ "Grade 10",
    SCHL == "14" ~ "Grade 11",
    SCHL == "15" ~ "12th grade - no diploma",
    SCHL == "16" ~ "High school diploma",
    SCHL == "17" ~ "GED or alternative credential",
    SCHL == "18" ~ "Some college (<1 year)",
    SCHL == "19" ~ "Some college (1+ years)",
    SCHL == "20" ~ "Associate's degree",
    SCHL == "21" ~ "Bachelor's degree",
    SCHL == "22" ~ "Master's degree",
    SCHL == "23" ~ "Professional degree",
    SCHL == "24" ~ "Doctorate degree",
    TRUE ~ "Other / N/A"
  )) %>%
  count(education, wt = PWGTP) %>%
  arrange(desc(n)) %>%
  slice_max(n, n = 10)

# Optional: View table
edu_table
## # A tibble: 10 × 2
##    education                         n
##    <chr>                         <dbl>
##  1 Bachelor's degree             65581
##  2 Master's degree               38944
##  3 High school diploma           34939
##  4 Associate's degree            25679
##  5 Some college (1+ years)       21463
##  6 Professional degree           21095
##  7 Some college (<1 year)        12151
##  8 Doctorate degree               8714
##  9 GED or alternative credential  4668
## 10 No schooling completed         3188
# Plot
ggplot(edu_table, aes(x = reorder(education, -n), y = n, fill = education)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  scale_fill_manual(values = ext_rpa_colors) + 
  labs(
    title = "Education Breakdown of Industry Workers (Top 10)",
    x = "Educational Attainment",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )

race

industry_workers %>%
  count(RAC1P, wt = PWGTP) %>%
  arrange(desc(n))
## # A tibble: 8 × 2
##   RAC1P      n
##   <chr>  <dbl>
## 1 1     120382
## 2 2      41671
## 3 6      29817
## 4 9      25563
## 5 8      24821
## 6 3        791
## 7 5        143
## 8 7         48
race_table <- industry_workers %>%
  mutate(race_ethnicity = case_when(
    HISP != "01" ~ "Hispanic/Latino",
    RAC1P == "1" ~ "White alone",
    RAC1P == "2" ~ "Black or African American alone",
    RAC1P == "6" ~ "Asian alone",
    TRUE ~ "Other"
  )) %>%
  count(race_ethnicity, wt = PWGTP)

race_table
## # A tibble: 5 × 2
##   race_ethnicity                       n
##   <chr>                            <dbl>
## 1 Asian alone                      29589
## 2 Black or African American alone  40286
## 3 Hispanic/Latino                  46494
## 4 Other                            11494
## 5 White alone                     115373
# Create the bar chart using ggplot2
ggplot(race_table, aes(x = race_ethnicity, y = n, fill = race_ethnicity)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  scale_fill_manual(values = rpa_colors) + 
  labs(
    title = "Race Breakdown of Industry Workers",
    x = "Race Group",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels for readability
    plot.title = element_text(hjust = 0.5)  # Center the title
  )

personal income

inc_table <- industry_workers %>%
  mutate(income_group = case_when(
    PINCP < 25000 ~ "<$25k",
    PINCP < 50000 ~ "$25k–$49k",
    PINCP < 75000 ~ "$50k–$74k",
    PINCP < 100000 ~ "$75k–$99k",
    PINCP >= 100000 ~ "$100k+",
    TRUE ~ "No income"
  )) %>%
  count(income_group, wt = PWGTP)

inc_table
## # A tibble: 5 × 2
##   income_group     n
##   <chr>        <dbl>
## 1 $100k+       71425
## 2 $25k–$49k    57684
## 3 $50k–$74k    46456
## 4 $75k–$99k    32618
## 5 <$25k        35053
# Create the bar chart using ggplot2
ggplot(inc_table, aes(x = income_group, y = n, fill = income_group)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  scale_fill_manual(values = rpa_colors) + 
  labs(
    title = "Income Breakdown of Industry Workers",
    x = "Income Group",
    y = "Count"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),  # Rotate x-axis labels for readability
    plot.title = element_text(hjust = 0.5)  # Center the title
  )