library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
## Warning: package 'broom' was built under R version 4.4.3
library(umap)
## Warning: package 'umap' was built under R version 4.4.3
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load dataset
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1: Standardize Employment Data

employed_grouped <- employed %>%
  filter(!is.na(employ_n)) %>%
  group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
  summarise(n = sum(employ_n), .groups = "drop")

employed_tidy <- employed_grouped %>%
  filter(race_gender != "TOTAL") %>%
  left_join(
    employed_grouped %>%
      filter(race_gender == "TOTAL") %>%
      select(occupation, total = n),
    by = "occupation"
  ) %>%
  mutate(pct = n / total) %>%
  filter(total > 1000) %>%
  select(-n)

employed_standard <- employed_tidy %>%
  group_by(race_gender) %>%
  mutate(pct = scale(pct) %>% as.numeric()) %>%
  ungroup() %>%
  mutate(total = log(total) %>% scale() %>% as.numeric())

2: Pivot Data to Wide Format

occupation_demo_tbl <- employed_tidy %>%
  pivot_wider(names_from = race_gender, values_from = pct) %>%
  clean_names()

3: Perform K-Means Clustering (Initial)

occupation_cluster <- kmeans(
  occupation_demo_tbl %>% select(-occupation),
  centers = 3,
  nstart = 20
)

summary(occupation_cluster)
##              Length Class  Mode   
## cluster      232    -none- numeric
## centers       18    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric
tidy(occupation_cluster)
## # A tibble: 3 × 9
##     total  asian black_or_african_ame…¹   men white women  size withinss cluster
##     <dbl>  <dbl>                  <dbl> <dbl> <dbl> <dbl> <int>    <dbl> <fct>  
## 1  1.80e6 0.0424                  0.104 0.693 0.816 0.305   198  8.13e14 1      
## 2  5.41e7 0.0654                  0.124 0.533 0.774 0.467     8  4.86e15 2      
## 3  1.47e7 0.0684                  0.120 0.554 0.779 0.446    26  5.46e14 3      
## # ℹ abbreviated name: ¹​black_or_african_american
glance(occupation_cluster)
## # A tibble: 1 × 4
##     totss tot.withinss betweenss  iter
##     <dbl>        <dbl>     <dbl> <int>
## 1 3.00e16      6.22e15   2.37e16     3
augment(occupation_cluster, occupation_demo_tbl) %>%
  ggplot(aes(total, asian, color = .cluster)) +
  geom_point()

4: Elbow Method for Optimal K

kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~kmeans(occupation_demo_tbl %>% select(-occupation), centers = .x, nstart = 20)),
    glanced = map(kclust, glance)
  )

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_point() +
  geom_line()

# Final chosen cluster
final_cluster <- kmeans(occupation_demo_tbl %>% select(-occupation), centers = 5, nstart = 20)

augment(final_cluster, occupation_demo_tbl) %>%
  ggplot(aes(total, asian, color = .cluster)) +
  geom_point()

5: Reduce Dimensions using UMAP

umap_results <- occupation_demo_tbl %>%
  select(-occupation) %>%
  umap()

umap_results_tbl <- as_tibble(umap_results$layout) %>%
  bind_cols(occupation_demo_tbl %>% select(occupation))
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tbl %>%
  ggplot(aes(V1, V2)) +
  geom_point()

6: Add Cluster Results and Visualize

kmeans_umap_tbl <- augment(final_cluster, occupation_demo_tbl) %>%
  select(occupation, .cluster) %>%
  left_join(umap_results_tbl, by = "occupation") %>%
  left_join(
    employed_tidy %>%
      select(-total) %>%
      pivot_wider(names_from = race_gender, values_from = pct) %>%
      clean_names(),
    by = "occupation"
  )
g <- kmeans_umap_tbl %>%
  mutate(text_label = str_glue(
    "Occupation: {occupation}
     Cluster: {.cluster}
     Asian: {percent(asian, 1)}
     Women: {percent(women, 1)}"
  )) %>%
  ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
  geom_point()

ggplotly(g, tooltip = "text")