Set Up
library(tidyverse)
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(broom)
library(umap)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
employed_grouped <- employed %>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
summarise(n = sum(employ_n)) %>%
ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.
employed_tidy <- employed_grouped %>%
# Remove total category
filter(race_gender != "TOTAL") %>%
#Add total column
left_join(employed_grouped %>%
filter(race_gender == "TOTAL") %>%
select(occupation, total = n)) %>%
# Get pct in total
mutate(pct = n / total) %>%
# Remove Outliers
filter(total > 1000) %>%
select(-n)
## Joining with `by = join_by(occupation)`
employed_tidy
## # A tibble: 1,160 × 4
## occupation race_gender total pct
## <chr> <chr> <dbl> <dbl>
## 1 Agriculture and related Construction and extractio… Asian 7.3 e4 0.0274
## 2 Agriculture and related Construction and extractio… Black or A… 7.3 e4 0.0822
## 3 Agriculture and related Construction and extractio… Men 7.3 e4 0.973
## 4 Agriculture and related Construction and extractio… White 7.3 e4 0.863
## 5 Agriculture and related Construction and extractio… Women 7.3 e4 0.0274
## 6 Agriculture and related Farming, fishing, and fore… Asian 5.74e6 0.0139
## 7 Agriculture and related Farming, fishing, and fore… Black or A… 5.74e6 0.0342
## 8 Agriculture and related Farming, fishing, and fore… Men 5.74e6 0.789
## 9 Agriculture and related Farming, fishing, and fore… White 5.74e6 0.911
## 10 Agriculture and related Farming, fishing, and fore… Women 5.74e6 0.211
## # ℹ 1,150 more rows
employed_standard <- employed_tidy %>%
# Standardize
group_by(race_gender) %>%
mutate(pct = pct %>% scale() %>% as.numeric()) %>%
ungroup() %>%
mutate(total = total %>% log() %>% scale() %>% as.numeric())
employed_standard
## # A tibble: 1,160 × 4
## occupation race_gender total pct
## <chr> <chr> <dbl> <dbl>
## 1 Agriculture and related Construction and extractio… Asian -1.30 -0.539
## 2 Agriculture and related Construction and extractio… Black or A… -1.30 -0.405
## 3 Agriculture and related Construction and extractio… Men -1.30 1.31
## 4 Agriculture and related Construction and extractio… White -1.30 0.725
## 5 Agriculture and related Construction and extractio… Women -1.30 -1.30
## 6 Agriculture and related Farming, fishing, and fore… Asian 0.819 -0.928
## 7 Agriculture and related Farming, fishing, and fore… Black or A… 0.819 -1.21
## 8 Agriculture and related Farming, fishing, and fore… Men 0.819 0.510
## 9 Agriculture and related Farming, fishing, and fore… White 0.819 1.38
## 10 Agriculture and related Farming, fishing, and fore… Women 0.819 -0.503
## # ℹ 1,150 more rows
occupation_demo_tbl <- employed_tidy %>%
pivot_wider(names_from = race_gender, values_from = pct) %>%
janitor::clean_names()
occupation_demo_tbl
## # A tibble: 232 × 7
## occupation total asian black_or_african_ame…¹ men white women
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Agriculture and rel… 7.3 e4 0.0274 0.0822 0.973 0.863 0.0274
## 2 Agriculture and rel… 5.74e6 0.0139 0.0342 0.789 0.911 0.211
## 3 Agriculture and rel… 1.94e5 0.0155 0.0309 0.985 0.918 0.0103
## 4 Agriculture and rel… 1.01e6 0.00992 0.00794 0.739 0.967 0.261
## 5 Agriculture and rel… 5.22e6 0.00997 0.00882 0.741 0.962 0.259
## 6 Agriculture and rel… 5.15e5 0.0233 0.0155 0.159 0.938 0.841
## 7 Agriculture and rel… 2.11e5 0.0332 0.104 0.815 0.820 0.185
## 8 Agriculture and rel… 2.95e5 0.0339 0.0373 0.675 0.902 0.329
## 9 Agriculture and rel… 8.80e4 0 0.0682 0.864 0.875 0.136
## 10 Agriculture and rel… 9.40e4 0 0.0213 0.585 0.968 0.426
## # ℹ 222 more rows
## # ℹ abbreviated name: ¹black_or_african_american
occupation_cluster <- kmeans(occupation_demo_tbl %>%
select(-occupation), centers = 3, nstart = 20)
occupation_cluster
## K-means clustering with 3 clusters of sizes 198, 26, 8
##
## Cluster means:
## total asian black_or_african_american men white women
## 1 1796409 0.04237561 0.1037127 0.6925912 0.8156003 0.3050666
## 2 14712692 0.06835789 0.1202226 0.5537719 0.7792893 0.4462197
## 3 54063250 0.06543159 0.1242421 0.5326546 0.7738826 0.4673395
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 1 1 1 1 1
## [38] 1 1 1 2 2 1 3 1 1 3 1 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 2 1 1 1 1 1 3 1 1 1 1 1 2 1 3 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1
## [149] 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 3 1 1 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1
## [186] 1 1 1 1 2 1 1 1 3 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 1 1 2 1 1 1 3 1 2 1 1
## [223] 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 8.128861e+14 5.459256e+14 4.860479e+15
## (between_SS / total_SS = 79.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
summary(occupation_cluster)
## Length Class Mode
## cluster 232 -none- numeric
## centers 18 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
tidy(occupation_cluster)
## # A tibble: 3 × 9
## total asian black_or_african_ame…¹ men white women size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 1.80e6 0.0424 0.104 0.693 0.816 0.305 198 8.13e14 1
## 2 1.47e7 0.0684 0.120 0.554 0.779 0.446 26 5.46e14 2
## 3 5.41e7 0.0654 0.124 0.533 0.774 0.467 8 4.86e15 3
## # ℹ abbreviated name: ¹black_or_african_american
glance(occupation_cluster)
## # A tibble: 1 × 4
## totss tot.withinss betweenss iter
## <dbl> <dbl> <dbl> <int>
## 1 3.00e16 6.22e15 2.37e16 2
augment(occupation_cluster, occupation_demo_tbl) %>%
ggplot(aes(total, asian, color = .cluster)) +
geom_point()

kclusts <- tibble(k = 1:9) %>%
mutate(kclust = map(.x = k, .f = ~ kmeans(occupation_demo_tbl %>%
select(-occupation), centers = 3, nstart = 20)),
glanced = map(.x = kclust, .f = glance))
kclusts %>%
unnest(glanced) %>%
ggplot(aes(tot.withinss, k)) +
geom_point() +
geom_line()

final_cluster <- kmeans(occupation_demo_tbl %>%
select(-occupation), centers = 5, nstart = 20)
augment(final_cluster, occupation_demo_tbl) %>%
ggplot(aes(total, asian, color = .cluster)) +
geom_point()

Reduce dimensions using UMAP
umap_results <- occupation_demo_tbl %>%
select(-occupation) %>%
umap()
umap_results_tbl <- umap_results$layout %>%
as.tibble() %>%
bind_cols(occupation_demo_tbl %>% select(occupation))
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## ℹ The deprecated feature was likely used in the tibble package.
## Please report the issue at <https://github.com/tidyverse/tibble/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
umap_results_tbl
## # A tibble: 232 × 3
## V1 V2 occupation
## <dbl> <dbl> <chr>
## 1 -8.26 -7.63 Agriculture and related Construction and extraction occupations
## 2 -0.0687 7.24 Agriculture and related Farming, fishing, and forestry occupat…
## 3 -2.22 -6.06 Agriculture and related Installation, maintenance, and repair …
## 4 7.65 -2.24 Agriculture and related Manage-ment, business, and financial o…
## 5 0.771 7.18 Agriculture and related Management, business, and financial op…
## 6 3.18 -4.57 Agriculture and related Office and administrative support occu…
## 7 -1.95 -6.01 Agriculture and related Production occupations
## 8 -0.219 -5.54 Agriculture and related Professional and related occupations
## 9 -8.04 -7.51 Agriculture and related Protective service occupations
## 10 -7.67 -7.32 Agriculture and related Sales and related occupations
## # ℹ 222 more rows
umap_results_tbl %>%
ggplot(aes(V1, V2)) +
geom_point()
