# Source: Long Island FIPS codes
nassau_data <- qcewGetAreaData("2023", "a", "36059")
suffolk_data <- qcewGetAreaData("2023", "a", "36103")
# Preview
# head(nassau_data)
# head(suffolk_data)
# unique(nassau_data$own_code)
industry_titles_url <- "https://data.bls.gov/cew/doc/titles/industry/industry_titles.csv"
industry_titles <- read.csv(industry_titles_url, stringsAsFactors = FALSE)
head(industry_titles)
## industry_code industry_title
## 1 10 10 Total, all industries
## 2 101 101 Goods-producing
## 3 1011 1011 Natural resources and mining
## 4 1012 1012 Construction
## 5 1013 1013 Manufacturing
## 6 102 102 Service-providing
# Join on industry_code
suffolk_labeled <- dplyr::left_join(suffolk_data, industry_titles, by = "industry_code")
nassau_labeled <- dplyr::left_join(nassau_data, industry_titles, by = "industry_code")
head(suffolk_labeled[, c("industry_code", "industry_title", "annual_avg_emplvl")])
## industry_code industry_title annual_avg_emplvl
## 1 10 10 Total, all industries 664266
## 2 10 10 Total, all industries 11038
## 3 102 102 Service-providing 11038
## 4 1021 1021 Trade, transportation, and utilities 4172
## 5 1024 1024 Professional and business services 73
## 6 1025 1025 Education and health services 1937
head(nassau_labeled[, c("industry_code", "industry_title", "annual_avg_emplvl")])
## industry_code industry_title annual_avg_emplvl
## 1 10 10 Total, all industries 624941
## 2 10 10 Total, all industries 5381
## 3 102 102 Service-providing 5381
## 4 1021 1021 Trade, transportation, and utilities 4393
## 5 1024 1024 Professional and business services 16
## 6 1026 1026 Leisure and hospitality 17
li_data <- bind_rows(nassau_labeled, suffolk_labeled)
li_data <- li_data %>%
mutate(
agglvl_category = case_when(
agglvl_code == 70 ~ "County, Total Covered",
agglvl_code == 71 ~ "County, Total -- by ownership sector",
agglvl_code == 72 ~ "County, by Domain -- by ownership sector",
agglvl_code == 73 ~ "County, by Supersector -- by ownership sector",
agglvl_code == 74 ~ "County, NAICS Sector -- by ownership sector",
agglvl_code == 75 ~ "County, NAICS 3-digit -- by ownership sector",
agglvl_code == 76 ~ "County, NAICS 4-digit -- by ownership sector",
agglvl_code == 77 ~ "County, NAICS 5-digit -- by ownership sector",
agglvl_code == 78 ~ "County, NAICS 6-digit -- by ownership sector",
TRUE ~ "Other or Unknown"
)
) %>%
mutate(
naics_2d = case_when(
agglvl_code == 74 ~ industry_code,
agglvl_code >= 75 ~ str_sub(industry_code, 1, 2),
TRUE ~ NA_character_
),
naics_3d = case_when(
agglvl_code == 75 ~ industry_code,
agglvl_code >= 76 ~ str_sub(industry_code, 1, 3),
TRUE ~ NA_character_
),
naics_4d = case_when(
agglvl_code == 76 ~ industry_code,
agglvl_code >= 77 ~ str_sub(industry_code, 1, 4),
TRUE ~ NA_character_
),
naics_5d = case_when(
agglvl_code == 77 ~ industry_code,
agglvl_code == 78 ~ str_sub(industry_code, 1, 5),
TRUE ~ NA_character_
),
naics_6d = case_when(
agglvl_code == 78 ~ industry_code,
TRUE ~ NA_character_
)
)
# Filter and categorize based on agglvl_code
li_data_3d4d <- li_data %>%
filter(agglvl_code %in% c(75, 76)) # Keep only 3-digit and 4-digit codes
# Summarize data for the chart
li_sum_4d <- li_data_3d4d %>%
filter(is.na(naics_4d)==FALSE) %>%
group_by(naics_3d, naics_4d, industry_title) %>%
summarise(
total_jobs = sum(annual_avg_emplvl, na.rm = TRUE),
total_wages_Ms = sum(total_annual_wages, na.rm = TRUE)/1000000,
avg_salary = mean(avg_annual_pay),
.groups = "drop"
)
# Prepare the top 10 3-digit codes based on total jobs
top_3d_codes <- li_sum_4d %>%
group_by(naics_3d) %>%
summarise(total_jobs_3d = sum(total_jobs, na.rm = TRUE), .groups = "drop") %>%
slice_max(total_jobs_3d, n = 10)
# Filter only those 4-digit industries within the top 3-digit groups
jobs_plot_data <- li_sum_4d %>%
filter(naics_3d %in% top_3d_codes$naics_3d) %>%
left_join(top_3d_codes, by = "naics_3d") %>%
mutate(
naics_3d = fct_reorder(naics_3d, total_jobs_3d), # correctly order 3-digit groups
hover_label = paste0(
"<b>Industry:</b> ", industry_title, "<br>",
"<b>Total Jobs:</b> ", formatC(total_jobs, big.mark = ","), "<br>",
"<b>Total Wages (Ms):</b> $", formatC(total_wages_Ms, big.mark = ","), "<br>",
"<b>Average Wage:</b> $", format(round(avg_salary), big.mark = ",", scientific = FALSE)
)
)
# get 3d titles
naics_3d_titles <- li_data_3d4d %>%
filter(is.na(naics_4d)) %>%
filter(naics_3d %in% top_3d_codes$naics_3d) %>%
distinct(naics_3d, industry_title) %>%
rename("title_3d" = industry_title) %>%
select(naics_3d, title_3d)
jobs_plot_data <- jobs_plot_data %>%
left_join(naics_3d_titles, by = "naics_3d") %>%
mutate(naics_3d = factor(naics_3d, levels = top_3d_codes$naics_3d))
#plot
p <- ggplot(jobs_plot_data, aes(
x = naics_3d, # Order by jobs but keep codes as values
y = total_jobs,
fill = industry_title, # 4-digit colors
text = hover_label
)) +
geom_bar(stat = "identity") +
scale_x_discrete(
labels = setNames(jobs_plot_data$title_3d, jobs_plot_data$naics_3d)
) +
labs(
title = "Jobs in LI Indutry by 3 and 4 digit NAICS Codes",
x = "Industry (3-Digit NAICS)",
y = "Total Jobs",
fill = "4-Digit Industry"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none", # Remove legend
)
ggplotly(p, tooltip = "text", width = 1000, height = 1000)
# Filter and categorize based on agglvl_code
li_data_3d4d <- li_data %>%
filter(agglvl_code %in% c(75, 76)) # Keep only 3-digit and 4-digit codes
# Summarize data for the chart
li_sum_4d <- li_data_3d4d %>%
filter(is.na(naics_4d)==FALSE) %>%
group_by(naics_3d, naics_4d, industry_title) %>%
summarise(
total_jobs = sum(annual_avg_emplvl, na.rm = TRUE),
total_wages_Ms = sum(total_annual_wages, na.rm = TRUE)/1000000,
avg_salary = mean(avg_annual_pay),
.groups = "drop"
)
# Prepare the top 10 3-digit codes based on total jobs
top_3d_codes <- li_sum_4d %>%
group_by(naics_3d) %>%
summarise(total_wages_Ms_3d = sum(total_wages_Ms, na.rm = TRUE), .groups = "drop") %>%
slice_max(total_wages_Ms_3d, n = 10)
# Filter only those 4-digit industries within the top 3-digit groups
wages_plot_data <- li_sum_4d %>%
filter(naics_3d %in% top_3d_codes$naics_3d) %>%
left_join(top_3d_codes, by = "naics_3d") %>%
mutate(
naics_3d = fct_reorder(naics_3d, total_wages_Ms_3d), # correctly order 3-digit groups
hover_label = paste0(
"<b>Industry:</b> ", industry_title, "<br>",
"<b>Total Wages (Ms):</b> $ ", formatC(total_wages_Ms, big.mark = "."), "<br>",
"<b>Average Wage:</b> $", format(round(avg_salary), big.mark = ",", scientific = FALSE), "<br>",
"<b> Total Jobs:</b> ", formatC(total_jobs, big.mark = ",")
)
)
# get 3d titles
naics_3d_titles <- li_data_3d4d %>%
filter(is.na(naics_4d)) %>%
filter(naics_3d %in% top_3d_codes$naics_3d) %>%
distinct(naics_3d, industry_title) %>%
rename("title_3d" = industry_title) %>%
select(naics_3d, title_3d)
wages_plot_data <- wages_plot_data %>%
left_join(naics_3d_titles, by = "naics_3d") %>%
mutate(naics_3d = factor(naics_3d, levels = top_3d_codes$naics_3d))
#plot
p <- ggplot(wages_plot_data, aes(
x = naics_3d, # Order by jobs but keep codes as values
y = total_wages_Ms,
fill = industry_title, # 4-digit colors
text = hover_label
)) +
geom_bar(stat = "identity") +
scale_x_discrete(
labels = setNames(wages_plot_data$title_3d, wages_plot_data$naics_3d)
) +
labs(
title = "Top 5 LI Industries by Highest Aggregate Wages (3 and 4 digit NAICS Codes)",
x = "Industry (3-Digit NAICS)",
y = "Total Wages (Millions of $)",
fill = "4-Digit Industry"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none", # Remove legend
)
ggplotly(p, tooltip = "text", width = 1000, height = 1000)
Notes: - NAICS codes For periods from 2011 to 2016, QCEW data were coded using the 2012 version of NAICS. For periods from 2017 to 2021, QCEW data were coded using the 2017 version of NAICS. QCEW data from 2022-forward are and will be coded using the 2022 version of NAICS. reference - csv urls are available back to 2015
# Define years to pull (2015 through 2023)
years_to_pull <- 2015:2023
# Function to safely get and process data for a single year
get_li_year_data <- function(year) {
message("Processing year: ", year)
tryCatch({
# Get county data
nassau <- qcewGetAreaData(as.character(year), "a", "36059")
suffolk <- qcewGetAreaData(as.character(year), "a", "36103")
# Get industry titles (only need to do this once, but included here for robustness)
industry_titles <- read.csv("https://data.bls.gov/cew/doc/titles/industry/industry_titles.csv",
stringsAsFactors = FALSE)
# Process and combine
bind_rows(
left_join(nassau, industry_titles, by = "industry_code") %>% mutate(county = "Nassau"),
left_join(suffolk, industry_titles, by = "industry_code") %>% mutate(county = "Suffolk")
) %>%
mutate(year = year) %>%
return()
}, error = function(e) {
warning(paste("Failed for year", year, ":", e$message))
return(NULL)
})
}
# Get all years (with progress bar)
library(purrr)
all_years_data <- map_dfr(years_to_pull, ~{
get_li_year_data(.x)
}, .id = "year_id")
# Add NAICS parsing (your existing code)
final_data <- all_years_data %>%
mutate(
agglvl_category = case_when(
agglvl_code == 70 ~ "County, Total Covered",
agglvl_code == 71 ~ "County, Total -- by ownership sector",
agglvl_code == 72 ~ "County, by Domain -- by ownership sector",
agglvl_code == 73 ~ "County, by Supersector -- by ownership sector",
agglvl_code == 74 ~ "County, NAICS Sector -- by ownership sector",
agglvl_code == 75 ~ "County, NAICS 3-digit -- by ownership sector",
agglvl_code == 76 ~ "County, NAICS 4-digit -- by ownership sector",
agglvl_code == 77 ~ "County, NAICS 5-digit -- by ownership sector",
agglvl_code == 78 ~ "County, NAICS 6-digit -- by ownership sector",
TRUE ~ "Other or Unknown"
),
naics_2d = case_when(
agglvl_code == 74 ~ industry_code,
agglvl_code >= 75 ~ str_sub(industry_code, 1, 2),
TRUE ~ NA_character_
),
naics_3d = case_when(
agglvl_code == 75 ~ industry_code,
agglvl_code >= 76 ~ str_sub(industry_code, 1, 3),
TRUE ~ NA_character_
),
naics_4d = case_when(
agglvl_code == 76 ~ industry_code,
agglvl_code >= 77 ~ str_sub(industry_code, 1, 4),
TRUE ~ NA_character_
),
naics_5d = case_when(
agglvl_code == 77 ~ industry_code,
agglvl_code == 78 ~ str_sub(industry_code, 1, 5),
TRUE ~ NA_character_
),
naics_6d = case_when(
agglvl_code == 78 ~ industry_code,
TRUE ~ NA_character_
)
)
# Save the combined data
saveRDS(final_data, "long_island_qcew_2015_2023.rds")
# Aggregate to 3-digit NAICS level
stacked_data <- final_data %>%
filter(
agglvl_code == 75, # 3-digit NAICS level
# own_code == 5, # All ownership sectors (or use 5 for private only)
!is.na(naics_3d) # Exclude NA values
) %>%
group_by(year, naics_2d, naics_3d, industry_title) %>%
summarize(
total_employment = sum(annual_avg_emplvl, na.rm = TRUE),
.groups = "drop"
)
stacked_data_2d <- final_data %>%
filter(
agglvl_code == 74, # 2-digit NAICS level
# own_code == 5, # All ownership sectors (or use 5 for private only)
!is.na(naics_2d) # Exclude NA values
) %>%
group_by(year, naics_2d, industry_title) %>%
summarize(
total_employment = sum(annual_avg_emplvl, na.rm = TRUE),
avg_salary = mean(avg_annual_pay, na.rm = TRUE),
.groups = "drop"
)
# 1. Define consistent industry names and colors
industry_colors <- c(
"Manufacturing" = "#ED4E25",
"Utilities" = "#F67A2B",
"Construction" = "#F0BC4A",
"Healthcare" = "#926EA4", # Simplified from "Health care and Social Assistance"
"Information" = "#6FBACD",
"Professional Services" = "#133D66" # Added for NAICS 54
)
# 2. Filter and calculate growth
growth_data <- final_data %>%
filter(
agglvl_code == 74,
naics_2d %in% c("31-33", "22", "23", "51", "54", "62") # All relevant codes
) %>%
mutate(
industry_name = case_when(
naics_2d %in% c("31", "32", "33") ~ "Manufacturing",
naics_2d == "22" ~ "Utilities",
naics_2d == "23" ~ "Construction",
naics_2d == "51" ~ "Information",
naics_2d == "54" ~ "Professional Services",
naics_2d == "62" ~ "Healthcare",
TRUE ~ "Other"
)
) %>%
filter(industry_name != "Other") %>% # Remove unmapped industries
group_by(year, industry_name) %>%
summarize(total_employment = sum(annual_avg_emplvl, na.rm = TRUE)) %>%
group_by(industry_name) %>%
mutate(
growth_pct = (total_employment / lag(total_employment) - 1) * 100
) %>%
filter(!is.na(growth_pct))
ggplot(growth_data, aes(
x = year,
y = growth_pct,
color = factor(industry_name, levels = names(industry_colors)) # Force order
)) +
geom_line(linewidth = 1.5) +
geom_point(size = 3) +
geom_hline(yintercept = 0, linetype = "dashed", color = "#625B61") + # Neutral gray
# Apply RPA colors
scale_color_manual(
values = industry_colors) +
# Formatting
scale_y_continuous(
labels = scales::percent_format(scale = 1),
limits = c(-25, 25) # Adjust based on your data range
) +
labs(
title = "Long Island Target Industries: Employment Growth (YoY %)",
subtitle = "Private Sector | 2016-2023 | RPA Industry Focus",
x = "Year",
y = "Year-over-Year Growth (%)",
color = NULL
) +
theme_minimal(base_size = 12) +
theme(
legend.position = "bottom",
legend.text = element_text(size = 9),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(face = "bold", color = "#133D66") # RPA navy for title
)
# Define PUMAs for Nassau and Suffolk counties
nassau_pumas <- c("03201", "03202", "03203", "03204", "03205", "03206", "03207", "03208", "03209", "03210", "03211", "03212")
suffolk_pumas <- c("03301", "03302", "03303", "03304", "03305", "03306", "03307", "03308", "03309", "03310", "03311", "03312", "03313")
# Filter for Nassau County
nassau_pums <- pums_data %>%
mutate(PUMA = str_pad(as.character(PUMA), 5, pad = "0")) %>%
filter(PUMA %in% nassau_pumas)
# Filter for Suffolk County
suffolk_pums <- pums_data %>%
mutate(PUMA = str_pad(as.character(PUMA), 5, pad = "0")) %>%
filter(PUMA %in% suffolk_pumas)
# Combine Nassau and Suffolk PUMS into one dataset
li_pums <- bind_rows(nassau_pums, suffolk_pums)
NOTE: This is filtering for healthcare industry right now as a stand-in, but can be adapted to any NAICS code.
# NOTE:
# Employment status recode (ESR)
# b .N/A (less than 16 years old)
# 1 .Civilian employed, at work
# 2 .Civilian employed, with a job but not at work
# 3 .Unemployed
# 4 .Armed forces, at work
# 5 .Armed forces, with a job but not at work
# 6 .Not in labor force
# Filter to employed persons only
pums_filtered <- li_pums %>%
filter(ESR %in% c(1, 2))
pums_filtered %>%
summarise(na_count = sum(is.na(NAICSP)))
## # A tibble: 1 × 1
## na_count
## <int>
## 1 0
# Example: filter to health care & social assistance (NAICS 62)
industry_workers <- pums_filtered %>%
filter(str_starts(NAICSP, "62"))
age_table <- industry_workers %>%
mutate(age_group = case_when(
AGEP >= 18 & AGEP <= 24 ~ "18–24",
AGEP >= 25 & AGEP <= 34 ~ "25–34",
AGEP >= 35 & AGEP <= 44 ~ "35–44",
AGEP >= 45 & AGEP <= 54 ~ "45–54",
AGEP >= 55 & AGEP <= 64 ~ "55–64",
AGEP >= 65 ~ "65+",
TRUE ~ "Under 18"
)) %>%
count(age_group, wt = PWGTP) %>%
arrange(desc(n))
age_table
## # A tibble: 7 × 2
## age_group n
## <chr> <dbl>
## 1 25–34 55867
## 2 55–64 51422
## 3 35–44 48823
## 4 45–54 44178
## 5 65+ 23201
## 6 18–24 19370
## 7 Under 18 375
# Create the bar chart using ggplot2
ggplot(age_table, aes(x = age_group, y = n, fill = age_group)) +
geom_bar(stat = "identity", show.legend = FALSE) +
scale_fill_manual(values = rpa_colors) +
labs(
title = "Age Breakdown of Industry Workers",
x = "Age Group",
y = "Count"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
plot.title = element_text(hjust = 0.5) # Center the title
)
# Recode SCHL values to human-readable labels
edu_table <- industry_workers %>%
mutate(education = case_when(
SCHL == "01" ~ "No schooling completed",
SCHL == "02" ~ "Nursery school, preschool",
SCHL == "03" ~ "Kindergarten",
SCHL == "04" ~ "Grade 1",
SCHL == "05" ~ "Grade 2",
SCHL == "06" ~ "Grade 3",
SCHL == "07" ~ "Grade 4",
SCHL == "08" ~ "Grade 5",
SCHL == "09" ~ "Grade 6",
SCHL == "10" ~ "Grade 7",
SCHL == "11" ~ "Grade 8",
SCHL == "12" ~ "Grade 9",
SCHL == "13" ~ "Grade 10",
SCHL == "14" ~ "Grade 11",
SCHL == "15" ~ "12th grade - no diploma",
SCHL == "16" ~ "High school diploma",
SCHL == "17" ~ "GED or alternative credential",
SCHL == "18" ~ "Some college (<1 year)",
SCHL == "19" ~ "Some college (1+ years)",
SCHL == "20" ~ "Associate's degree",
SCHL == "21" ~ "Bachelor's degree",
SCHL == "22" ~ "Master's degree",
SCHL == "23" ~ "Professional degree",
SCHL == "24" ~ "Doctorate degree",
TRUE ~ "Other / N/A"
)) %>%
count(education, wt = PWGTP) %>%
arrange(desc(n)) %>%
slice_max(n, n = 10)
# Optional: View table
edu_table
## # A tibble: 10 × 2
## education n
## <chr> <dbl>
## 1 Bachelor's degree 65581
## 2 Master's degree 38944
## 3 High school diploma 34939
## 4 Associate's degree 25679
## 5 Some college (1+ years) 21463
## 6 Professional degree 21095
## 7 Some college (<1 year) 12151
## 8 Doctorate degree 8714
## 9 GED or alternative credential 4668
## 10 No schooling completed 3188
# Plot
ggplot(edu_table, aes(x = reorder(education, -n), y = n, fill = education)) +
geom_bar(stat = "identity", show.legend = FALSE) +
scale_fill_manual(values = ext_rpa_colors) +
labs(
title = "Education Breakdown of Industry Workers (Top 10)",
x = "Educational Attainment",
y = "Count"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5)
)
industry_workers %>%
count(RAC1P, wt = PWGTP) %>%
arrange(desc(n))
## # A tibble: 8 × 2
## RAC1P n
## <chr> <dbl>
## 1 1 120382
## 2 2 41671
## 3 6 29817
## 4 9 25563
## 5 8 24821
## 6 3 791
## 7 5 143
## 8 7 48
race_table <- industry_workers %>%
mutate(race_ethnicity = case_when(
HISP != "01" ~ "Hispanic/Latino",
RAC1P == "1" ~ "White alone",
RAC1P == "2" ~ "Black or African American alone",
RAC1P == "6" ~ "Asian alone",
TRUE ~ "Other"
)) %>%
count(race_ethnicity, wt = PWGTP)
race_table
## # A tibble: 5 × 2
## race_ethnicity n
## <chr> <dbl>
## 1 Asian alone 29589
## 2 Black or African American alone 40286
## 3 Hispanic/Latino 46494
## 4 Other 11494
## 5 White alone 115373
# Create the bar chart using ggplot2
ggplot(race_table, aes(x = race_ethnicity, y = n, fill = race_ethnicity)) +
geom_bar(stat = "identity", show.legend = FALSE) +
scale_fill_manual(values = rpa_colors) +
labs(
title = "Race Breakdown of Industry Workers",
x = "Race Group",
y = "Count"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
plot.title = element_text(hjust = 0.5) # Center the title
)
inc_table <- industry_workers %>%
mutate(income_group = case_when(
PINCP < 25000 ~ "<$25k",
PINCP < 50000 ~ "$25k–$49k",
PINCP < 75000 ~ "$50k–$74k",
PINCP < 100000 ~ "$75k–$99k",
PINCP >= 100000 ~ "$100k+",
TRUE ~ "No income"
)) %>%
count(income_group, wt = PWGTP)
inc_table
## # A tibble: 5 × 2
## income_group n
## <chr> <dbl>
## 1 $100k+ 71425
## 2 $25k–$49k 57684
## 3 $50k–$74k 46456
## 4 $75k–$99k 32618
## 5 <$25k 35053
# Create the bar chart using ggplot2
ggplot(inc_table, aes(x = income_group, y = n, fill = income_group)) +
geom_bar(stat = "identity", show.legend = FALSE) +
scale_fill_manual(values = rpa_colors) +
labs(
title = "Income Breakdown of Industry Workers",
x = "Income Group",
y = "Count"
) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1), # Rotate x-axis labels for readability
plot.title = element_text(hjust = 0.5) # Center the title
)