R Markdown

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   industry = col_character(),
##   major_occupation = col_character(),
##   minor_occupation = col_character(),
##   race_gender = col_character(),
##   industry_total = col_double(),
##   employ_n = col_double(),
##   year = col_double()
## )
employed_tidy <- employed %>%
  filter(year==2020)%>%
  filter(!is.na(employ_n)) %>%
  group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
  summarise(n = mean(employ_n)) %>%
  ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the `.groups` argument.
employment_demo <- employed_tidy %>%
  filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
  pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
  janitor::clean_names() %>%
  left_join(employed_tidy %>%
    filter(race_gender == "TOTAL") %>%
    select(-race_gender) %>%
    rename(total = n)) %>%
  filter(total > 1e3) %>%
  mutate(across(c(asian, black_or_african_american, women), ~ . / (total)),
    total = log(total),
    across(where(is.numeric), ~ as.numeric(scale(.)))
  ) %>%
  mutate(occupation = snakecase::to_snake_case(occupation))
## Joining, by = "occupation"
employment_demo
## # A tibble: 200 x 5
##    occupation                            asian black_or_african_a~  women  total
##    <chr>                                 <dbl>               <dbl>  <dbl>  <dbl>
##  1 agriculture_and_related_constructi~ -1.01                 1.27  -1.40  -1.60 
##  2 agriculture_and_related_farming_fi~ -0.743               -0.997 -0.431  0.743
##  3 agriculture_and_related_installati~ -1.01                -0.284 -1.15  -0.985
##  4 agriculture_and_related_management~ -0.861               -1.49  -0.304  0.759
##  5 agriculture_and_related_office_and~ -0.577               -1.43   2.03  -0.433
##  6 agriculture_and_related_production~  0.484               -0.900 -0.708 -0.861
##  7 agriculture_and_related_profession~  0.0710              -0.351  0.181 -0.694
##  8 agriculture_and_related_protective~ -1.01                -1.58  -0.762 -1.47 
##  9 agriculture_and_related_sales_and_~ -1.01                -0.392  0.685 -1.51 
## 10 agriculture_and_related_service_oc~ -1.01                -0.972  0.463 -0.444
## # ... with 190 more rows
employment_clust <- kmeans(select(employment_demo, -occupation), centers = 3)
summary(employment_clust)
##              Length Class  Mode   
## cluster      200    -none- numeric
## centers       12    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric
library(broom)
tidy(employment_clust)
## # A tibble: 3 x 7
##     asian black_or_african_american   women   total  size withinss cluster
##     <dbl>                     <dbl>   <dbl>   <dbl> <int>    <dbl> <fct>  
## 1  0.0370                    0.326   0.693   0.488     93    253.  1      
## 2  2.28                     -0.0701  0.0542  0.0977    20     89.6 2      
## 3 -0.563                    -0.333  -0.753  -0.545     87    160.  3
augment(employment_clust, employment_demo) %>%
  ggplot(aes(total, black_or_african_american, color = .cluster)) +
  geom_point()

### Chossing K

kclusts <-
  tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~ kmeans(select(employment_demo, -occupation), .x)),
    glanced = map(kclust, glance),
  )

kclusts %>%
  unnest(cols = c(glanced)) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = 0.5, size = 1.2, color = "midnightblue") +
  geom_point(size = 2, color = "midnightblue")

final_clust <- kmeans(select(employment_demo, -occupation), centers = 5)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
p <- augment(final_clust, employment_demo) %>%
  ggplot(aes(total, women, color = .cluster, name = occupation)) +
  geom_point()

ggplotly(p, height = 500)