library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(umap)
## Warning: package 'umap' was built under R version 4.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Load dataset
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
1: Standardize Employment Data
employed_grouped <- employed %>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
summarise(n = sum(employ_n), .groups = "drop")
employed_tidy <- employed_grouped %>%
filter(race_gender != "TOTAL") %>%
left_join(
employed_grouped %>%
filter(race_gender == "TOTAL") %>%
select(occupation, total = n),
by = "occupation"
) %>%
mutate(pct = n / total) %>%
filter(total > 1000) %>%
select(-n)
employed_standard <- employed_tidy %>%
group_by(race_gender) %>%
mutate(pct = scale(pct) %>% as.numeric()) %>%
ungroup() %>%
mutate(total = log(total) %>% scale() %>% as.numeric())
4: Elbow Method for Optimal K
kclusts <- tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~kmeans(occupation_demo_tbl %>% select(-occupation), centers = .x, nstart = 20)),
glanced = map(kclust, glance)
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_point() +
geom_line()

# Final chosen cluster
final_cluster <- kmeans(occupation_demo_tbl %>% select(-occupation), centers = 5, nstart = 20)
augment(final_cluster, occupation_demo_tbl) %>%
ggplot(aes(total, asian, color = .cluster)) +
geom_point()

5: Reduce Dimensions using UMAP
umap_results <- occupation_demo_tbl %>%
select(-occupation) %>%
umap()
umap_results_tbl <- as_tibble(umap_results$layout) %>%
bind_cols(occupation_demo_tbl %>% select(occupation))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tbl %>%
ggplot(aes(V1, V2)) +
geom_point()

6: Add Cluster Results and Visualize
kmeans_umap_tbl <- augment(final_cluster, occupation_demo_tbl) %>%
select(occupation, .cluster) %>%
left_join(umap_results_tbl, by = "occupation") %>%
left_join(
employed_tidy %>%
select(-total) %>%
pivot_wider(names_from = race_gender, values_from = pct) %>%
clean_names(),
by = "occupation"
)
g <- kmeans_umap_tbl %>%
mutate(text_label = str_glue(
"Occupation: {occupation}
Cluster: {.cluster}
Asian: {percent(asian, 1)}
Women: {percent(women, 1)}"
)) %>%
ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
geom_point()
ggplotly(g, tooltip = "text")