Set up

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.3.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(broom) # tidy model results
library(umap) # dimension reduction

## Warning: package 'umap' was built under R version 4.3.3

library(plotly) # interactive visualization

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")

## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1 Convert data to standarized form

employed_grouped <- employed %>%
    filter(!is.na(employ_n)) %>%
    group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
    summarise(n = sum(employ_n)) %>%
    ungroup()

## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.

employed_tidy <- employed_grouped %>%
  
    # Remove total category
    filter(race_gender != "TOTAL") %>%

    # Add total column
    left_join(employed_grouped %>%
                  filter(race_gender == "TOTAL") %>%
                  select(occupation, total = n)) %>%
      
    # Get pct in total
    mutate(pct = n / total) %>%
    
    # Standardize
    group_by(race_gender) %>%
    mutate(pct = pct %>% scale() %>% as.numeric()) %>%
    ungroup() %>%
    
    # Remove outliers
    filter(total > 1000) %>%
    mutate(total = total %>% log() %>% scale() %>% as.numeric()) %>%
    select(-n)

## Joining with `by = join_by(occupation)`

employed_tidy

## # A tibble: 1,160 × 4
##    occupation                                          race_gender  total    pct
##    <chr>                                               <chr>        <dbl>  <dbl>
##  1 Agriculture and related Construction and extractio… Asian       -1.30  -0.514
##  2 Agriculture and related Construction and extractio… Black or A… -1.30  -0.368
##  3 Agriculture and related Construction and extractio… Men         -1.30   1.27 
##  4 Agriculture and related Construction and extractio… White       -1.30   0.585
##  5 Agriculture and related Construction and extractio… Women       -1.30  -1.27 
##  6 Agriculture and related Farming, fishing, and fore… Asian        0.819 -0.900
##  7 Agriculture and related Farming, fishing, and fore… Black or A…  0.819 -1.16 
##  8 Agriculture and related Farming, fishing, and fore… Men          0.819  0.481
##  9 Agriculture and related Farming, fishing, and fore… White        0.819  1.10 
## 10 Agriculture and related Farming, fishing, and fore… Women        0.819 -0.475
## # ℹ 1,150 more rows

2 Spread to object-characteristics format

occupation_demo_tbl <- employed_tidy %>%
    pivot_wider(names_from = race_gender, values_from = pct) %>%
    janitor::clean_names()

occupation_demo_tbl

## # A tibble: 232 × 7
##    occupation          total  asian black_or_african_ame…¹     men white   women
##    <chr>               <dbl>  <dbl>                  <dbl>   <dbl> <dbl>   <dbl>
##  1 Agriculture and … -1.30   -0.514               -0.368    1.27   0.585 -1.27  
##  2 Agriculture and …  0.819  -0.900               -1.16     0.481  1.10  -0.475 
##  3 Agriculture and … -0.827  -0.856               -1.21     1.32   1.18  -1.34  
##  4 Agriculture and … -0.0262 -1.01                -1.59     0.267  1.72  -0.259 
##  5 Agriculture and …  0.773  -1.01                -1.58     0.274  1.66  -0.267 
##  6 Agriculture and … -0.353  -0.631               -1.47    -2.23   1.40   2.24  
##  7 Agriculture and … -0.786  -0.348               -0.00337  0.594  0.118 -0.587 
##  8 Agriculture and … -0.623  -0.327               -1.11    -0.0108 1.01   0.0339
##  9 Agriculture and … -1.21   -1.30                -0.599    0.802  0.716 -0.796 
## 10 Agriculture and … -1.18   -1.30                -1.37    -0.396  1.73   0.451 
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american

3 Perform k-means clustering

occupation_cluster <- kmeans(occupation_demo_tbl %>%
select(-occupation), centers = 3, nstart = 20)

summary(occupation_cluster)

##              Length Class  Mode   
## cluster      232    -none- numeric
## centers       18    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric

tidy(occupation_cluster)

## # A tibble: 3 × 9
##    total   asian black_or_african_american     men  white   women  size withinss
##    <dbl>   <dbl>                     <dbl>   <dbl>  <dbl>   <dbl> <int>    <dbl>
## 1 -0.228 -0.0215                    1.19    0.0644 -0.758 -0.0670    54     158.
## 2  0.644  0.675                    -0.0493 -0.883  -0.149  0.891     91     326.
## 3 -0.532 -0.633                    -0.608   0.820   0.656 -0.827     87     217.
## # ℹ 1 more variable: cluster <fct>

augment(occupation_cluster, occupation_demo_tbl) %>%

    ggplot(aes(total, asian, color = (.cluster))) +
    geom_point()

4 Select optimal number of clusters

kclusts <- tibble(k = 1:9) %>%
    mutate(kclusts = map(.x = k, .f = ~ kmeans(occupation_demo_tbl %>%
select(-occupation), centers = .x, nstart = 20)),
          glanced = map(.x = kclusts, .f = glance))

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(tot.withinss, k)) +
  geom_point() +
  geom_line()

final_cluster <- kmeans(occupation_demo_tbl %>% select(-occupation), centers = 
5, nstart = 20)

augment(final_cluster, occupation_demo_tbl) %>%

    ggplot(aes(total, asian, color = .cluster)) +
    geom_point()

5 Reduce dimension using UMAP

umap_results <- occupation_demo_tbl %>%
    select(-occupation) %>%
    umap()

umap_results_tbl <- umap_results$layout %>%
    as.tibble() %>%
    bind_cols(occupation_demo_tbl %>% select(occupation))

## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
##   Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

umap_results_tbl

## # A tibble: 232 × 3
##        V1    V2 occupation                                                      
##     <dbl> <dbl> <chr>                                                           
##  1 -0.965 -4.06 Agriculture and related Construction and extraction occupations 
##  2  1.24  -1.88 Agriculture and related Farming, fishing, and forestry occupati…
##  3 -0.940 -4.86 Agriculture and related Installation, maintenance, and repair o…
##  4  0.977 -2.32 Agriculture and related Manage-ment, business, and financial op…
##  5  1.21  -2.00 Agriculture and related Management, business, and financial ope…
##  6  1.62   3.51 Agriculture and related Office and administrative support occup…
##  7 -0.912 -2.59 Agriculture and related Production occupations                  
##  8  0.554 -2.73 Agriculture and related Professional and related occupations    
##  9 -0.595 -3.83 Agriculture and related Protective service occupations          
## 10  0.590 -3.15 Agriculture and related Sales and related occupations           
## # ℹ 222 more rows

umap_results_tbl %>%
  ggplot(aes(V1, V2)) +
  geom_point()

6 Visualize clusters by adding k-means results

kmeans_umap_tbl <- final_cluster %>%
  augment(occupation_demo_tbl) %>%
  select(occupation, .cluster) %>%
  
  # Add umap results
  left_join(umap_results_tbl) %>%
  
  # Add employment info
  left_join(employed_grouped %>%
                pivot_wider(names_from =  race_gender, values_from = n) %>%
                janitor::clean_names())

## Joining with `by = join_by(occupation)`
## Joining with `by = join_by(occupation)`

kmeans_umap_tbl

## # A tibble: 232 × 10
##    occupation   .cluster     V1    V2 asian black_or_african_ame…¹    men  total
##    <chr>        <fct>     <dbl> <dbl> <dbl>                  <dbl>  <dbl>  <dbl>
##  1 Agriculture… 5        -0.965 -4.06  2000                   6000 7.1 e4 7.3 e4
##  2 Agriculture… 5         1.24  -1.88 80000                 196000 4.53e6 5.74e6
##  3 Agriculture… 5        -0.940 -4.86  3000                   6000 1.91e5 1.94e5
##  4 Agriculture… 4         0.977 -2.32 10000                   8000 7.45e5 1.01e6
##  5 Agriculture… 5         1.21  -2.00 52000                  46000 3.86e6 5.22e6
##  6 Agriculture… 2         1.62   3.51 12000                   8000 8.2 e4 5.15e5
##  7 Agriculture… 5        -0.912 -2.59  7000                  22000 1.72e5 2.11e5
##  8 Agriculture… 4         0.554 -2.73 10000                  11000 1.99e5 2.95e5
##  9 Agriculture… 4        -0.595 -3.83     0                   6000 7.6 e4 8.80e4
## 10 Agriculture… 4         0.590 -3.15     0                   2000 5.50e4 9.40e4
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american
## # ℹ 2 more variables: white <dbl>, women <dbl>

g <- kmeans_umap_tbl %>%

    # Create text label
    mutate(text_label = str_glue("Occupation: {occupation}
                                 Cluster:     {.cluster}
                                 Asian:       {asian} 
                                 Women:       {women}")) %>%
  
  # Plot
  ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
  geom_point()

g %>% ggplotly(tooltip = "text")

Code Along 9: Cluster Model

John Tomlinson

2024-04-09

Set up

1 Convert data to standarized form

2 Spread to object-characteristics format

3 Perform k-means clustering

4 Select optimal number of clusters

5 Reduce dimension using UMAP

6 Visualize clusters by adding k-means results