Set Up

library(tidyverse)

## Warning: package 'readr' was built under R version 4.2.3

## Warning: package 'dplyr' was built under R version 4.2.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(broom)
library(umap)
library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")

## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

employed_grouped <- employed %>%
  filter(!is.na(employ_n)) %>%
  group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
  summarise(n = sum(employ_n)) %>%
  ungroup()

## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.

employed_tidy <- employed_grouped %>%
  
  # Remove total category
  filter(race_gender != "TOTAL") %>%
  
  #Add total column
  left_join(employed_grouped %>%
              filter(race_gender == "TOTAL") %>%
              select(occupation, total = n)) %>%
  
  # Get pct in total
  mutate(pct = n / total) %>%
  
  # Remove Outliers
  filter(total > 1000) %>%
  select(-n)

## Joining with `by = join_by(occupation)`

employed_tidy

## # A tibble: 1,160 × 4
##    occupation                                          race_gender  total    pct
##    <chr>                                               <chr>        <dbl>  <dbl>
##  1 Agriculture and related Construction and extractio… Asian       7.3 e4 0.0274
##  2 Agriculture and related Construction and extractio… Black or A… 7.3 e4 0.0822
##  3 Agriculture and related Construction and extractio… Men         7.3 e4 0.973 
##  4 Agriculture and related Construction and extractio… White       7.3 e4 0.863 
##  5 Agriculture and related Construction and extractio… Women       7.3 e4 0.0274
##  6 Agriculture and related Farming, fishing, and fore… Asian       5.74e6 0.0139
##  7 Agriculture and related Farming, fishing, and fore… Black or A… 5.74e6 0.0342
##  8 Agriculture and related Farming, fishing, and fore… Men         5.74e6 0.789 
##  9 Agriculture and related Farming, fishing, and fore… White       5.74e6 0.911 
## 10 Agriculture and related Farming, fishing, and fore… Women       5.74e6 0.211 
## # ℹ 1,150 more rows

employed_standard <- employed_tidy %>%
  
  # Standardize
  group_by(race_gender) %>%
  mutate(pct = pct %>% scale() %>% as.numeric()) %>%
  ungroup()  %>%
  mutate(total = total %>% log() %>% scale() %>% as.numeric()) 
  
employed_standard

## # A tibble: 1,160 × 4
##    occupation                                          race_gender  total    pct
##    <chr>                                               <chr>        <dbl>  <dbl>
##  1 Agriculture and related Construction and extractio… Asian       -1.30  -0.539
##  2 Agriculture and related Construction and extractio… Black or A… -1.30  -0.405
##  3 Agriculture and related Construction and extractio… Men         -1.30   1.31 
##  4 Agriculture and related Construction and extractio… White       -1.30   0.725
##  5 Agriculture and related Construction and extractio… Women       -1.30  -1.30 
##  6 Agriculture and related Farming, fishing, and fore… Asian        0.819 -0.928
##  7 Agriculture and related Farming, fishing, and fore… Black or A…  0.819 -1.21 
##  8 Agriculture and related Farming, fishing, and fore… Men          0.819  0.510
##  9 Agriculture and related Farming, fishing, and fore… White        0.819  1.38 
## 10 Agriculture and related Farming, fishing, and fore… Women        0.819 -0.503
## # ℹ 1,150 more rows

occupation_demo_tbl <- employed_tidy %>%
  pivot_wider(names_from = race_gender, values_from = pct) %>%
  janitor::clean_names() 

occupation_demo_tbl

## # A tibble: 232 × 7
##    occupation            total   asian black_or_african_ame…¹   men white  women
##    <chr>                 <dbl>   <dbl>                  <dbl> <dbl> <dbl>  <dbl>
##  1 Agriculture and rel… 7.3 e4 0.0274                 0.0822  0.973 0.863 0.0274
##  2 Agriculture and rel… 5.74e6 0.0139                 0.0342  0.789 0.911 0.211 
##  3 Agriculture and rel… 1.94e5 0.0155                 0.0309  0.985 0.918 0.0103
##  4 Agriculture and rel… 1.01e6 0.00992                0.00794 0.739 0.967 0.261 
##  5 Agriculture and rel… 5.22e6 0.00997                0.00882 0.741 0.962 0.259 
##  6 Agriculture and rel… 5.15e5 0.0233                 0.0155  0.159 0.938 0.841 
##  7 Agriculture and rel… 2.11e5 0.0332                 0.104   0.815 0.820 0.185 
##  8 Agriculture and rel… 2.95e5 0.0339                 0.0373  0.675 0.902 0.329 
##  9 Agriculture and rel… 8.80e4 0                      0.0682  0.864 0.875 0.136 
## 10 Agriculture and rel… 9.40e4 0                      0.0213  0.585 0.968 0.426 
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american

occupation_cluster <- kmeans(occupation_demo_tbl %>%
select(-occupation), centers = 3, nstart = 20) 

occupation_cluster

## K-means clustering with 3 clusters of sizes 198, 26, 8
## 
## Cluster means:
##      total      asian black_or_african_american       men     white     women
## 1  1796409 0.04237561                 0.1037127 0.6925912 0.8156003 0.3050666
## 2 14712692 0.06835789                 0.1202226 0.5537719 0.7792893 0.4462197
## 3 54063250 0.06543159                 0.1242421 0.5326546 0.7738826 0.4673395
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1
##  [38] 1 1 1 2 2 1 3 1 1 3 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 2 1 1 1 1 1 3 1 1 1 1 1 2 1 3 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [149] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 3 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1
## [186] 1 1 1 1 2 1 1 1 3 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 3 1 2 1 1
## [223] 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 8.128861e+14 5.459256e+14 4.860479e+15
##  (between_SS / total_SS =  79.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

summary(occupation_cluster)

##              Length Class  Mode   
## cluster      232    -none- numeric
## centers       18    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric

tidy(occupation_cluster)

## # A tibble: 3 × 9
##     total  asian black_or_african_ame…¹   men white women  size withinss cluster
##     <dbl>  <dbl>                  <dbl> <dbl> <dbl> <dbl> <int>    <dbl> <fct>  
## 1  1.80e6 0.0424                  0.104 0.693 0.816 0.305   198  8.13e14 1      
## 2  1.47e7 0.0684                  0.120 0.554 0.779 0.446    26  5.46e14 2      
## 3  5.41e7 0.0654                  0.124 0.533 0.774 0.467     8  4.86e15 3      
## # ℹ abbreviated name: ¹black_or_african_american

glance(occupation_cluster)

## # A tibble: 1 × 4
##     totss tot.withinss betweenss  iter
##     <dbl>        <dbl>     <dbl> <int>
## 1 3.00e16      6.22e15   2.37e16     2

augment(occupation_cluster, occupation_demo_tbl) %>%
  
  ggplot(aes(total, asian, color = .cluster)) +
  geom_point()

kclusts <- tibble(k = 1:9) %>%
  mutate(kclust = map(.x = k, .f = ~ kmeans(occupation_demo_tbl %>%
select(-occupation), centers = 3, nstart = 20)),
glanced = map(.x = kclust, .f = glance)) 

kclusts %>% 
  unnest(glanced) %>%
  ggplot(aes(tot.withinss, k)) +
  geom_point() +
  geom_line()

final_cluster <- kmeans(occupation_demo_tbl %>%
select(-occupation), centers = 5, nstart = 20)

augment(final_cluster, occupation_demo_tbl) %>%
  
  ggplot(aes(total, asian, color = .cluster)) +
  geom_point()

Reduce dimensions using UMAP

umap_results <- occupation_demo_tbl %>%
  select(-occupation) %>%
  umap() 

umap_results_tbl <- umap_results$layout %>%
  as.tibble() %>%
  bind_cols(occupation_demo_tbl %>% select(occupation))

## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
##   Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

umap_results_tbl

## # A tibble: 232 × 3
##         V1    V2 occupation                                                     
##      <dbl> <dbl> <chr>                                                          
##  1 -8.26   -7.63 Agriculture and related Construction and extraction occupations
##  2 -0.0687  7.24 Agriculture and related Farming, fishing, and forestry occupat…
##  3 -2.22   -6.06 Agriculture and related Installation, maintenance, and repair …
##  4  7.65   -2.24 Agriculture and related Manage-ment, business, and financial o…
##  5  0.771   7.18 Agriculture and related Management, business, and financial op…
##  6  3.18   -4.57 Agriculture and related Office and administrative support occu…
##  7 -1.95   -6.01 Agriculture and related Production occupations                 
##  8 -0.219  -5.54 Agriculture and related Professional and related occupations   
##  9 -8.04   -7.51 Agriculture and related Protective service occupations         
## 10 -7.67   -7.32 Agriculture and related Sales and related occupations          
## # ℹ 222 more rows

umap_results_tbl %>%
  ggplot(aes(V1, V2)) +
  geom_point()

Visualize clusters byt adding k-means results

kmeans_umap_tbl <- final_cluster %>%
  augment(occupation_demo_tbl) %>%
  select(occupation, .cluster) %>%
  
  # add umap results 
  left_join(umap_results_tbl) %>%
  
  # Add employment info
  left_join(employed_tidy %>%
              select(-total) %>%
        pivot_wider(names_from = race_gender, values_from = pct) %>%
          janitor::clean_names())

## Joining with `by = join_by(occupation)`
## Joining with `by = join_by(occupation)`

kmeans_umap_tbl

## # A tibble: 232 × 9
##    occupation  .cluster      V1    V2   asian black_or_african_ame…¹   men white
##    <chr>       <fct>      <dbl> <dbl>   <dbl>                  <dbl> <dbl> <dbl>
##  1 Agricultur… 3        -8.26   -7.63 0.0274                 0.0822  0.973 0.863
##  2 Agricultur… 2        -0.0687  7.24 0.0139                 0.0342  0.789 0.911
##  3 Agricultur… 3        -2.22   -6.06 0.0155                 0.0309  0.985 0.918
##  4 Agricultur… 3         7.65   -2.24 0.00992                0.00794 0.739 0.967
##  5 Agricultur… 2         0.771   7.18 0.00997                0.00882 0.741 0.962
##  6 Agricultur… 3         3.18   -4.57 0.0233                 0.0155  0.159 0.938
##  7 Agricultur… 3        -1.95   -6.01 0.0332                 0.104   0.815 0.820
##  8 Agricultur… 3        -0.219  -5.54 0.0339                 0.0373  0.675 0.902
##  9 Agricultur… 3        -8.04   -7.51 0                      0.0682  0.864 0.875
## 10 Agricultur… 3        -7.67   -7.32 0                      0.0213  0.585 0.968
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american
## # ℹ 1 more variable: women <dbl>

g <- kmeans_umap_tbl %>%
  
  #create text label
  mutate(text_label = str_glue("Occupation: {occupation}
                               Cluster:      {.cluster}
                               Asian:        {asian}
                               Women:        {women}")) %>%
  
  #plot 
  ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
  geom_point()

g %>% ggplotly(tooltip = "text")

Code Along 9

Reed Wilson

2024-04-08

Set Up

Reduce dimensions using UMAP

Visualize clusters byt adding k-means results