Explore Data
library(tidyverse)
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
employed_tidy <- employed %>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
summarise(n = mean(employ_n)) %>%
ungroup()
employed_tidy %>%
filter(race_gender == "TOTAL")
## # A tibble: 239 × 3
## occupation race_gender n
## <chr> <chr> <dbl>
## 1 Agriculture and related Construction and extraction occup… TOTAL 1.22e4
## 2 Agriculture and related Farming, fishing, and forestry oc… TOTAL 9.56e5
## 3 Agriculture and related Installation, maintenance, and re… TOTAL 3.23e4
## 4 Agriculture and related Manage-ment, business, and financ… TOTAL 1.01e6
## 5 Agriculture and related Management, business, and financi… TOTAL 1.04e6
## 6 Agriculture and related Office and administrative support… TOTAL 8.58e4
## 7 Agriculture and related Production occupations TOTAL 3.52e4
## 8 Agriculture and related Professional and related occupati… TOTAL 4.92e4
## 9 Agriculture and related Protective service occupations TOTAL 1.47e4
## 10 Agriculture and related Sales and related occupations TOTAL 1.57e4
## # ℹ 229 more rows
employment_demo <- employed_tidy %>%
filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
janitor::clean_names() %>%
left_join(employed_tidy %>%
filter(race_gender == "TOTAL") %>%
select(-race_gender) %>%
rename(total = n)) %>%
filter(total > 1e4) %>%
mutate(across(c(asian, black_or_african_american, women), ~ . / total),
total = log(total),
across(is.numeric, ~as.numeric(scale(.)))) %>%
mutate(occupation = snakecase::to_snake_case(occupation))
employment_demo
## # A tibble: 211 × 5
## occupation asian black_or_african_ame…¹ women total
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 agriculture_and_related_constr… -0.625 -0.501 -1.33 -1.98
## 2 agriculture_and_related_farmin… -1.05 -1.38 -0.515 0.630
## 3 agriculture_and_related_instal… -0.999 -1.44 -1.41 -1.39
## 4 agriculture_and_related_manage… -1.17 -1.87 -0.293 0.662
## 5 agriculture_and_related_manage… -1.17 -1.85 -0.302 0.682
## 6 agriculture_and_related_office… -0.753 -1.73 2.28 -0.811
## 7 agriculture_and_related_produc… -0.444 -0.0947 -0.631 -1.34
## 8 agriculture_and_related_profes… -0.421 -1.33 0.00826 -1.14
## 9 agriculture_and_related_protec… -1.48 -0.758 -0.846 -1.87
## 10 agriculture_and_related_sales_… -1.48 -1.62 0.437 -1.83
## # ℹ 201 more rows
## # ℹ abbreviated name: ¹​black_or_african_american
Implement k-means clustering
employment_clust <- kmeans(select(employment_demo, -occupation), centers = 3)
summary(employment_clust)
## Length Class Mode
## cluster 211 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(employment_clust)
## # A tibble: 3 × 7
## asian black_or_african_american women total size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 -0.839 -0.711 -0.990 -0.572 64 97.4 1
## 2 0.733 -0.228 0.704 0.746 88 228. 2
## 3 -0.184 1.11 0.0237 -0.491 59 120. 3
augment(employment_clust, employment_demo) %>%
ggplot(aes(total, black_or_african_american, color = .cluster)) +
geom_point(alpha = 0.8)

Choosing k
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(employment_demo, -occupation), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, employment_demo)
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.8) +
geom_point(size = 2)

library(plotly)
employment_clust <- kmeans(select(employment_demo, -occupation),
centers = 5)
p <- augment(employment_clust, employment_demo) %>%
ggplot(aes(total, women, color = .cluster, name = occupation)) +
geom_point(alpha = 0.8)
ggplotly(p)