Explore Data

library(tidyverse)

employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
employed_tidy <- employed %>%
    filter(!is.na(employ_n)) %>%
    group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
    summarise(n = mean(employ_n)) %>%
    ungroup()
employed_tidy %>%
    filter(race_gender == "TOTAL")
## # A tibble: 239 × 3
##    occupation                                                 race_gender      n
##    <chr>                                                      <chr>        <dbl>
##  1 Agriculture and related Construction and extraction occup… TOTAL       1.22e4
##  2 Agriculture and related Farming, fishing, and forestry oc… TOTAL       9.56e5
##  3 Agriculture and related Installation, maintenance, and re… TOTAL       3.23e4
##  4 Agriculture and related Manage-ment, business, and financ… TOTAL       1.01e6
##  5 Agriculture and related Management, business, and financi… TOTAL       1.04e6
##  6 Agriculture and related Office and administrative support… TOTAL       8.58e4
##  7 Agriculture and related Production occupations             TOTAL       3.52e4
##  8 Agriculture and related Professional and related occupati… TOTAL       4.92e4
##  9 Agriculture and related Protective service occupations     TOTAL       1.47e4
## 10 Agriculture and related Sales and related occupations      TOTAL       1.57e4
## # ℹ 229 more rows
employment_demo <- employed_tidy %>%
    filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
    pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
    janitor::clean_names() %>%
    left_join(employed_tidy %>%
                  filter(race_gender == "TOTAL") %>%
                  select(-race_gender) %>%
                  rename(total = n)) %>%
    filter(total > 1e4) %>%
    mutate(across(c(asian, black_or_african_american, women), ~ . / total), 
           total = log(total), 
           across(is.numeric, ~as.numeric(scale(.)))) %>%
    mutate(occupation = snakecase::to_snake_case(occupation))

employment_demo
## # A tibble: 211 × 5
##    occupation                       asian black_or_african_ame…¹    women  total
##    <chr>                            <dbl>                  <dbl>    <dbl>  <dbl>
##  1 agriculture_and_related_constr… -0.625                -0.501  -1.33    -1.98 
##  2 agriculture_and_related_farmin… -1.05                 -1.38   -0.515    0.630
##  3 agriculture_and_related_instal… -0.999                -1.44   -1.41    -1.39 
##  4 agriculture_and_related_manage… -1.17                 -1.87   -0.293    0.662
##  5 agriculture_and_related_manage… -1.17                 -1.85   -0.302    0.682
##  6 agriculture_and_related_office… -0.753                -1.73    2.28    -0.811
##  7 agriculture_and_related_produc… -0.444                -0.0947 -0.631   -1.34 
##  8 agriculture_and_related_profes… -0.421                -1.33    0.00826 -1.14 
##  9 agriculture_and_related_protec… -1.48                 -0.758  -0.846   -1.87 
## 10 agriculture_and_related_sales_… -1.48                 -1.62    0.437   -1.83 
## # ℹ 201 more rows
## # ℹ abbreviated name: ¹​black_or_african_american

Implement k-means clustering

employment_clust <- kmeans(select(employment_demo, -occupation), centers = 3)

summary(employment_clust)
##              Length Class  Mode   
## cluster      211    -none- numeric
## centers       12    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric
library(broom)

tidy(employment_clust)
## # A tibble: 3 × 7
##    asian black_or_african_american   women  total  size withinss cluster
##    <dbl>                     <dbl>   <dbl>  <dbl> <int>    <dbl> <fct>  
## 1 -0.839                    -0.711 -0.990  -0.572    64     97.4 1      
## 2  0.733                    -0.228  0.704   0.746    88    228.  2      
## 3 -0.184                     1.11   0.0237 -0.491    59    120.  3
augment(employment_clust, employment_demo) %>%
    ggplot(aes(total, black_or_african_american, color = .cluster)) +
    geom_point(alpha = 0.8)

Choosing k

kclusts <-
    tibble(k = 1:9) %>%
    mutate(
        kclust = map(k, ~ kmeans(select(employment_demo, -occupation), .x)),
        tidied = map(kclust, tidy),
        glanced = map(kclust, glance),
        augmented = map(kclust, augment, employment_demo)
    )

kclusts %>%
    unnest(glanced) %>%
    ggplot(aes(k, tot.withinss)) +
    geom_line(alpha = 0.8) +
    geom_point(size = 2)

library(plotly)

employment_clust <- kmeans(select(employment_demo, -occupation), 
                           centers = 5)

p <- augment(employment_clust, employment_demo) %>%
    ggplot(aes(total, women, color = .cluster, name = occupation)) +
    geom_point(alpha = 0.8)

ggplotly(p)