Set up
library(tidyverse)
library(broom) # Tidy model results
library(umap) # Dimension reduction
library(plotly) # Interactive visualization
# Load the TidyTuesday (2021-02-23) employment table directly from GitHub.
employed <- read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv"
)
1 Convert data to standardized form
# Total employment per occupation and demographic group. An "occupation" here
# is the industry name pasted together with the minor occupation name; rows
# with a missing employ_n are dropped before summing.
employed_grouped <- employed %>%
  drop_na(employ_n) %>%
  mutate(occupation = paste(industry, minor_occupation)) %>%
  group_by(occupation, race_gender) %>%
  summarise(n = sum(employ_n)) %>%
  ungroup()
# Share of each demographic group within every occupation.
# The "TOTAL" rows are split off and joined back as a `total` column so that
# each remaining race/gender row can be expressed as a fraction of it.
employed_tidy <- employed_grouped %>%
  # Keep only the per-group rows; TOTAL becomes a column instead
  filter(race_gender != "TOTAL") %>%
  # Attach the occupation-wide total. The key is spelled out so the join does
  # not depend on (or silently change with) whichever column names overlap.
  left_join(
    employed_grouped %>%
      filter(race_gender == "TOTAL") %>%
      select(occupation, total = n),
    by = "occupation"
  ) %>%
  # Fraction of the occupation total held by this group
  mutate(pct = n / total) %>%
  # Drop occupations whose total is 1000 or less (this also removes zero
  # totals, for which pct would be undefined)
  filter(total > 1000) %>%
  select(-n)
employed_tidy
## # A tibble: 1,160 × 4
## occupation race_gender total pct
## <chr> <chr> <dbl> <dbl>
## 1 Agriculture and related Construction and extractio… Asian 7.3 e4 0.0274
## 2 Agriculture and related Construction and extractio… Black or A… 7.3 e4 0.0822
## 3 Agriculture and related Construction and extractio… Men 7.3 e4 0.973
## 4 Agriculture and related Construction and extractio… White 7.3 e4 0.863
## 5 Agriculture and related Construction and extractio… Women 7.3 e4 0.0274
## 6 Agriculture and related Farming, fishing, and fore… Asian 5.74e6 0.0139
## 7 Agriculture and related Farming, fishing, and fore… Black or A… 5.74e6 0.0342
## 8 Agriculture and related Farming, fishing, and fore… Men 5.74e6 0.789
## 9 Agriculture and related Farming, fishing, and fore… White 5.74e6 0.911
## 10 Agriculture and related Farming, fishing, and fore… Women 5.74e6 0.211
## # ℹ 1,150 more rows
# Put both features on a comparable scale:
#   * pct   - z-scored separately within each race_gender group
#   * total - log-transformed, then z-scored over all rows
employed_standard <- employed_tidy %>%
  group_by(race_gender) %>%
  mutate(pct = as.numeric(scale(pct))) %>%
  ungroup() %>%
  mutate(total = as.numeric(scale(log(total))))
employed_standard
## # A tibble: 1,160 × 4
## occupation race_gender total pct
## <chr> <chr> <dbl> <dbl>
## 1 Agriculture and related Construction and extractio… Asian -1.30 -0.539
## 2 Agriculture and related Construction and extractio… Black or A… -1.30 -0.405
## 3 Agriculture and related Construction and extractio… Men -1.30 1.31
## 4 Agriculture and related Construction and extractio… White -1.30 0.725
## 5 Agriculture and related Construction and extractio… Women -1.30 -1.30
## 6 Agriculture and related Farming, fishing, and fore… Asian 0.819 -0.928
## 7 Agriculture and related Farming, fishing, and fore… Black or A… 0.819 -1.21
## 8 Agriculture and related Farming, fishing, and fore… Men 0.819 0.510
## 9 Agriculture and related Farming, fishing, and fore… White 0.819 1.38
## 10 Agriculture and related Farming, fishing, and fore… Women 0.819 -0.503
## # ℹ 1,150 more rows
4 Select optimal number of clusters
# Fit k-means for k = 1..9 on every column except `occupation` and keep the
# model summary returned by glance() for each fit.
kclusts <- tibble(k = 1:9) %>%
  mutate(
    kclust = map(
      k,
      function(n_centers) {
        kmeans(
          select(occupation_demo_table, -occupation),
          centers = n_centers,
          nstart = 20
        )
      }
    ),
    glanced = map(kclust, glance)
  )

# Elbow plot: total within-cluster sum of squares against k.
ggplot(unnest(kclusts, cols = glanced), aes(x = k, y = tot.withinss)) +
  geom_point() +
  geom_line()

# Final k-means model with 5 clusters, fitted on every column except
# `occupation`.
# k-means starts from randomly chosen centres, so the fit and the numeric
# labels in `.cluster` can differ between runs. Fixing the seed keeps the
# cluster assignments that later steps rely on reproducible.
set.seed(2021)
final_cluster <- kmeans(
  occupation_demo_table %>% select(-occupation),
  centers = 5,
  nstart = 20
)

# Scatter of the `total` and `asian` columns, coloured by assigned cluster.
augment(final_cluster, occupation_demo_table) %>%
  ggplot(aes(total, asian, color = .cluster)) +
  geom_point()

5 Reduce dimension using UMAP
# Run UMAP on every column except `occupation`; no configuration is passed,
# so the package defaults apply.
umap_results <- umap(select(occupation_demo_table, -occupation))
# Embedding coordinates as a tibble, with the occupation label re-attached.
# The deprecated as.tibble() alias is replaced by as_tibble(), and the layout
# columns are named V1, V2, ... explicitly, because relying on automatic
# names for unnamed matrix columns is deprecated as well.
umap_layout <- umap_results$layout
colnames(umap_layout) <- paste0("V", seq_len(ncol(umap_layout)))
umap_results_tbl <- umap_layout %>%
  as_tibble() %>%
  # bind_cols() matches rows purely by position
  # NOTE(review): assumes the layout rows are in the same order as
  # occupation_demo_table - confirm against the umap documentation
  bind_cols(occupation_demo_table %>% select(occupation))
umap_results_tbl
## # A tibble: 232 × 3
## V1 V2 occupation
## <dbl> <dbl> <chr>
## 1 -6.17 -8.15 Agriculture and related Construction and extraction occupations
## 2 -1.06 8.75 Agriculture and related Farming, fishing, and forestry occupati…
## 3 -0.822 -9.25 Agriculture and related Installation, maintenance, and repair o…
## 4 6.97 -1.42 Agriculture and related Manage-ment, business, and financial op…
## 5 -0.394 8.72 Agriculture and related Management, business, and financial ope…
## 6 3.74 -7.26 Agriculture and related Office and administrative support occup…
## 7 -0.672 -9.45 Agriculture and related Production occupations
## 8 1.37 -8.61 Agriculture and related Professional and related occupations
## 9 -5.89 -8.28 Agriculture and related Protective service occupations
## 10 -5.79 -8.31 Agriculture and related Sales and related occupations
## # ℹ 222 more rows
# Static scatter of the two UMAP coordinates; the occupation name is carried
# along in the extra `text` aesthetic.
ggplot(umap_results_tbl, aes(x = V1, y = V2, text = occupation)) +
  geom_point()

6 Visualize clusters by adding k-means results
# One row per occupation combining the cluster label, the UMAP coordinates
# and the unstandardised group shares from employed_tidy.
# Both joins name their key explicitly so they cannot silently pick up other
# columns that happen to share a name.
kmeans_umap_tbl <- final_cluster %>%
  augment(occupation_demo_table) %>%
  select(occupation, .cluster) %>%
  # Add umap results
  left_join(umap_results_tbl, by = "occupation") %>%
  # Add employment info: one pct column per race_gender level, with the
  # column names normalised by janitor::clean_names()
  left_join(
    employed_tidy %>%
      select(-total) %>%
      pivot_wider(names_from = race_gender, values_from = pct) %>%
      janitor::clean_names(),
    by = "occupation"
  )
kmeans_umap_tbl
## # A tibble: 232 × 9
## occupation .cluster V1 V2 asian black_or_african_ame…¹ men white
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agriculture… 2 -6.17 -8.15 0.0274 0.0822 0.973 0.863
## 2 Agriculture… 3 -1.06 8.75 0.0139 0.0342 0.789 0.911
## 3 Agriculture… 2 -0.822 -9.25 0.0155 0.0309 0.985 0.918
## 4 Agriculture… 2 6.97 -1.42 0.00992 0.00794 0.739 0.967
## 5 Agriculture… 3 -0.394 8.72 0.00997 0.00882 0.741 0.962
## 6 Agriculture… 2 3.74 -7.26 0.0233 0.0155 0.159 0.938
## 7 Agriculture… 2 -0.672 -9.45 0.0332 0.104 0.815 0.820
## 8 Agriculture… 2 1.37 -8.61 0.0339 0.0373 0.675 0.902
## 9 Agriculture… 2 -5.89 -8.28 0 0.0682 0.864 0.875
## 10 Agriculture… 2 -5.79 -8.31 0 0.0213 0.585 0.968
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american
## # ℹ 1 more variable: women <dbl>
# Scatter of the UMAP coordinates coloured by k-means cluster. Each point
# carries a multi-line label (occupation, cluster, Asian and women shares)
# in the `text` aesthetic.
g <- ggplot(
  data = mutate(
    kmeans_umap_tbl,
    text_label = str_glue("Occupation: {occupation}
Cluster: {.cluster}
Asian: {asian %>% scales::percent()}
Women: {women %>% scales::percent()}")
  ),
  mapping = aes(x = V1, y = V2, color = .cluster, text = text_label)
) +
  geom_point()

# Interactive version; the tooltip is restricted to the `text` aesthetic.
ggplotly(g, tooltip = "text")