Set Up

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(broom) # tidy model results
library(umap) # dimension reduction

## Warning: package 'umap' was built under R version 4.4.3

library(plotly) # interactive visualization

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")

## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1 Convert data to standardized form

employed_grouped <- employed %>%
    filter(!is.na(employ_n)) %>%
    group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
    summarise(n = sum(employ_n)) %>%
    ungroup()

## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.

employed_tidy <- employed_grouped %>%
    
    # Remove total category
    filter(race_gender !="Total") %>%
    
    # Add total column
    left_join(employed_grouped %>%
                  filter(race_gender == "TOTAL") %>%
                  select(occupation, total = n)) %>%
    
    # Get pct in total
    mutate(pct = n / total) %>%
    
    # Remove Outliers
    filter(total > 1000)  %>%
    select(-n)

## Joining with `by = join_by(occupation)`

employed_tidy

## # A tibble: 1,392 × 4
##    occupation                                          race_gender  total    pct
##    <chr>                                               <chr>        <dbl>  <dbl>
##  1 Agriculture and related Construction and extractio… Asian       7.3 e4 0.0274
##  2 Agriculture and related Construction and extractio… Black or A… 7.3 e4 0.0822
##  3 Agriculture and related Construction and extractio… Men         7.3 e4 0.973 
##  4 Agriculture and related Construction and extractio… TOTAL       7.3 e4 1     
##  5 Agriculture and related Construction and extractio… White       7.3 e4 0.863 
##  6 Agriculture and related Construction and extractio… Women       7.3 e4 0.0274
##  7 Agriculture and related Farming, fishing, and fore… Asian       5.74e6 0.0139
##  8 Agriculture and related Farming, fishing, and fore… Black or A… 5.74e6 0.0342
##  9 Agriculture and related Farming, fishing, and fore… Men         5.74e6 0.789 
## 10 Agriculture and related Farming, fishing, and fore… TOTAL       5.74e6 1     
## # ℹ 1,382 more rows

employed_standard <- employed_tidy %>%
    
    # Standardize
    group_by(race_gender) %>%
    mutate(pct = pct %>% scale() %>% as.numeric()) %>%
    ungroup()%>%
    mutate(total = total %>% log() %>% scale() %>% as.numeric())

employed_standard

## # A tibble: 1,392 × 4
##    occupation                                         race_gender  total     pct
##    <chr>                                              <chr>        <dbl>   <dbl>
##  1 Agriculture and related Construction and extracti… Asian       -1.30   -0.539
##  2 Agriculture and related Construction and extracti… Black or A… -1.30   -0.405
##  3 Agriculture and related Construction and extracti… Men         -1.30    1.31 
##  4 Agriculture and related Construction and extracti… TOTAL       -1.30  NaN    
##  5 Agriculture and related Construction and extracti… White       -1.30    0.725
##  6 Agriculture and related Construction and extracti… Women       -1.30   -1.30 
##  7 Agriculture and related Farming, fishing, and for… Asian        0.819  -0.928
##  8 Agriculture and related Farming, fishing, and for… Black or A…  0.819  -1.21 
##  9 Agriculture and related Farming, fishing, and for… Men          0.819   0.510
## 10 Agriculture and related Farming, fishing, and for… TOTAL        0.819 NaN    
## # ℹ 1,382 more rows

2 Spread to object-characteristics format

occupation_demo_tbl <- employed_tidy %>%
    pivot_wider(names_from = race_gender,values_from = pct) %>%
    janitor::clean_names()

occupation_demo_tbl

## # A tibble: 232 × 8
##    occupation    total   asian black_or_african_ame…¹   men total_2 white  women
##    <chr>         <dbl>   <dbl>                  <dbl> <dbl>   <dbl> <dbl>  <dbl>
##  1 Agriculture… 7.3 e4 0.0274                 0.0822  0.973       1 0.863 0.0274
##  2 Agriculture… 5.74e6 0.0139                 0.0342  0.789       1 0.911 0.211 
##  3 Agriculture… 1.94e5 0.0155                 0.0309  0.985       1 0.918 0.0103
##  4 Agriculture… 1.01e6 0.00992                0.00794 0.739       1 0.967 0.261 
##  5 Agriculture… 5.22e6 0.00997                0.00882 0.741       1 0.962 0.259 
##  6 Agriculture… 5.15e5 0.0233                 0.0155  0.159       1 0.938 0.841 
##  7 Agriculture… 2.11e5 0.0332                 0.104   0.815       1 0.820 0.185 
##  8 Agriculture… 2.95e5 0.0339                 0.0373  0.675       1 0.902 0.329 
##  9 Agriculture… 8.80e4 0                      0.0682  0.864       1 0.875 0.136 
## 10 Agriculture… 9.40e4 0                      0.0213  0.585       1 0.968 0.426 
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american

3 Perform k-means clustering

occupation_cluster <- kmeans(occupation_demo_tbl %>%
select(-occupation, -total_2), centers = 5, nstart = 20)

summary(occupation_cluster)

##              Length Class  Mode   
## cluster      232    -none- numeric
## centers       30    -none- numeric
## totss          1    -none- numeric
## withinss       5    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           5    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric

tidy(occupation_cluster)

## # A tibble: 5 × 9
##     total  asian black_or_african_ame…¹   men white women  size withinss cluster
##     <dbl>  <dbl>                  <dbl> <dbl> <dbl> <dbl> <int>    <dbl> <fct>  
## 1  8.01e5 0.0381                  0.102 0.724 0.820 0.273   150  1.05e14 1      
## 2  1.62e7 0.0668                  0.126 0.510 0.774 0.490    20  3.42e14 2      
## 3  5.43e6 0.0578                  0.108 0.607 0.801 0.393    54  2.15e14 3      
## 4  4.52e7 0.0645                  0.125 0.570 0.772 0.430     7  4.32e14 4      
## 5  1.16e8 0.0720                  0.117 0.268 0.785 0.732     1  0       5      
## # ℹ abbreviated name: ¹black_or_african_american

augment(occupation_cluster, occupation_demo_tbl) %>%
    
    ggplot(aes(total, asian, color = .cluster)) +
    geom_point()

4 Select Optimal Number of CLusters

kclusts <- tibble(k = 1:9) %>%
    mutate(kclust = map(.x = k, .f = ~ kmeans(occupation_demo_tbl %>%
select(-occupation, -total_2), centers = .x, nstart = 20)),
            glanced = map(.x = kclust, .f = glance))

kclusts %>%
    unnest(glanced) %>%
    ggplot(aes(k, tot.withinss)) +
    geom_point() +
    geom_line()

final_cluster <- kmeans(occupation_demo_tbl %>% select(-occupation, -total_2), centers = 5, nstart = 20)

5 Reduce dimension using UMAP

umap_results <- occupation_demo_tbl %>%
    select(-occupation, -total_2) %>%
    umap()

umap_results_tbl <- umap_results$layout %>%
    as.tibble() %>%
    bind_cols(occupation_demo_tbl %>% select(occupation))

## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
##   Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

umap_results_tbl

## # A tibble: 232 × 3
##         V1     V2 occupation                                                    
##      <dbl>  <dbl> <chr>                                                         
##  1 -6.72   -7.34  Agriculture and related Construction and extraction occupatio…
##  2 -0.641   7.01  Agriculture and related Farming, fishing, and forestry occupa…
##  3 -2.28   -5.44  Agriculture and related Installation, maintenance, and repair…
##  4  7.66   -0.915 Agriculture and related Manage-ment, business, and financial …
##  5  0.0481  7.16  Agriculture and related Management, business, and financial o…
##  6  2.52   -4.56  Agriculture and related Office and administrative support occ…
##  7 -2.12   -5.29  Agriculture and related Production occupations                
##  8 -0.184  -5.30  Agriculture and related Professional and related occupations  
##  9 -6.56   -7.48  Agriculture and related Protective service occupations        
## 10 -6.20   -7.73  Agriculture and related Sales and related occupations         
## # ℹ 222 more rows

umap_results_tbl %>%
    ggplot(aes(V1, V2)) +
    geom_point()

6 Visualize Clusters by adding k-means results

kmeans_umap_tbl <- final_cluster %>%
    augment(occupation_demo_tbl) %>%
    select(occupation, .cluster) %>%
    
    # Add umap results
    left_join(umap_results_tbl) %>%
    
    # Add employment info
    left_join(employed_tidy %>%
                  select(-total) %>%
                  pivot_wider(names_from = race_gender, values_from = pct) %>%
                  janitor::clean_names())

## Joining with `by = join_by(occupation)`
## Joining with `by = join_by(occupation)`

kmeans_umap_tbl

## # A tibble: 232 × 10
##    occupation .cluster      V1     V2   asian black_or_african_ame…¹   men total
##    <chr>      <fct>      <dbl>  <dbl>   <dbl>                  <dbl> <dbl> <dbl>
##  1 Agricultu… 1        -6.72   -7.34  0.0274                 0.0822  0.973     1
##  2 Agricultu… 4        -0.641   7.01  0.0139                 0.0342  0.789     1
##  3 Agricultu… 1        -2.28   -5.44  0.0155                 0.0309  0.985     1
##  4 Agricultu… 1         7.66   -0.915 0.00992                0.00794 0.739     1
##  5 Agricultu… 4         0.0481  7.16  0.00997                0.00882 0.741     1
##  6 Agricultu… 1         2.52   -4.56  0.0233                 0.0155  0.159     1
##  7 Agricultu… 1        -2.12   -5.29  0.0332                 0.104   0.815     1
##  8 Agricultu… 1        -0.184  -5.30  0.0339                 0.0373  0.675     1
##  9 Agricultu… 1        -6.56   -7.48  0                      0.0682  0.864     1
## 10 Agricultu… 1        -6.20   -7.73  0                      0.0213  0.585     1
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american
## # ℹ 2 more variables: white <dbl>, women <dbl>

g <- kmeans_umap_tbl %>%
    
    # Create text label
    mutate(text_label = str_glue("Occupation: {occupation}
                                 Cluster: {.cluster}
                                 Asian: {asian %>% scales::percent(1)}
                                 Women: {women %>% scales::percent(1)}")) %>%
    
    # Plot
    ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
    geom_point()

g %>% ggplotly(tooltip = "text")

Code Along 9: Cluster Model

Paige Biester

2025-04-9

Set Up

1 Convert data to standardized form

2 Spread to object-characteristics format

3 Perform k-means clustering

4 Select Optimal Number of CLusters

5 Reduce dimension using UMAP

6 Visualize Clusters by adding k-means results