Set up
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom) # tidy model results
library(umap) # dimension reduction
## Warning: package 'umap' was built under R version 4.3.3
library(plotly) # interactive visualization
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
4 Select optimal number of clusters
kclusts <- tibble(k = 1:9) %>%
mutate(kclusts = map(.x = k, .f = ~ kmeans(occupation_demo_tbl %>%
select(-occupation), centers = .x, nstart = 20)),
glanced = map(.x = kclusts, .f = glance))
kclusts %>%
unnest(glanced) %>%
ggplot(aes(tot.withinss, k)) +
geom_point() +
geom_line()

final_cluster <- kmeans(occupation_demo_tbl %>% select(-occupation), centers =
5, nstart = 20)
augment(final_cluster, occupation_demo_tbl) %>%
ggplot(aes(total, asian, color = .cluster)) +
geom_point()

5 Reduce dimension using UMAP
umap_results <- occupation_demo_tbl %>%
select(-occupation) %>%
umap()
umap_results_tbl <- umap_results$layout %>%
as.tibble() %>%
bind_cols(occupation_demo_tbl %>% select(occupation))
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
## Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tbl
## # A tibble: 232 × 3
## V1 V2 occupation
## <dbl> <dbl> <chr>
## 1 -0.965 -4.06 Agriculture and related Construction and extraction occupations
## 2 1.24 -1.88 Agriculture and related Farming, fishing, and forestry occupati…
## 3 -0.940 -4.86 Agriculture and related Installation, maintenance, and repair o…
## 4 0.977 -2.32 Agriculture and related Manage-ment, business, and financial op…
## 5 1.21 -2.00 Agriculture and related Management, business, and financial ope…
## 6 1.62 3.51 Agriculture and related Office and administrative support occup…
## 7 -0.912 -2.59 Agriculture and related Production occupations
## 8 0.554 -2.73 Agriculture and related Professional and related occupations
## 9 -0.595 -3.83 Agriculture and related Protective service occupations
## 10 0.590 -3.15 Agriculture and related Sales and related occupations
## # ℹ 222 more rows
umap_results_tbl %>%
ggplot(aes(V1, V2)) +
geom_point()

6 Visualize clusters by adding k-means results
kmeans_umap_tbl <- final_cluster %>%
augment(occupation_demo_tbl) %>%
select(occupation, .cluster) %>%
# Add umap results
left_join(umap_results_tbl) %>%
# Add employment info
left_join(employed_grouped %>%
pivot_wider(names_from = race_gender, values_from = n) %>%
janitor::clean_names())
## Joining with `by = join_by(occupation)`
## Joining with `by = join_by(occupation)`
kmeans_umap_tbl
## # A tibble: 232 × 10
## occupation .cluster V1 V2 asian black_or_african_ame…¹ men total
## <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agriculture… 5 -0.965 -4.06 2000 6000 7.1 e4 7.3 e4
## 2 Agriculture… 5 1.24 -1.88 80000 196000 4.53e6 5.74e6
## 3 Agriculture… 5 -0.940 -4.86 3000 6000 1.91e5 1.94e5
## 4 Agriculture… 4 0.977 -2.32 10000 8000 7.45e5 1.01e6
## 5 Agriculture… 5 1.21 -2.00 52000 46000 3.86e6 5.22e6
## 6 Agriculture… 2 1.62 3.51 12000 8000 8.2 e4 5.15e5
## 7 Agriculture… 5 -0.912 -2.59 7000 22000 1.72e5 2.11e5
## 8 Agriculture… 4 0.554 -2.73 10000 11000 1.99e5 2.95e5
## 9 Agriculture… 4 -0.595 -3.83 0 6000 7.6 e4 8.80e4
## 10 Agriculture… 4 0.590 -3.15 0 2000 5.50e4 9.40e4
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american
## # ℹ 2 more variables: white <dbl>, women <dbl>
g <- kmeans_umap_tbl %>%
# Create text label
mutate(text_label = str_glue("Occupation: {occupation}
Cluster: {.cluster}
Asian: {asian}
Women: {women}")) %>%
# Plot
ggplot(aes(V1, V2, color = .cluster, text = text_label)) +
geom_point()
g %>% ggplotly(tooltip = "text")