library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
employed_tidy <- employed %>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
summarise(n = mean(employ_n)) %>%
ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.
employed_tidy %>%
filter(race_gender == "TOTAL")
## # A tibble: 239 × 3
## occupation race_gender n
## <chr> <chr> <dbl>
## 1 Agriculture and related Construction and extraction occup… TOTAL 1.22e4
## 2 Agriculture and related Farming, fishing, and forestry oc… TOTAL 9.56e5
## 3 Agriculture and related Installation, maintenance, and re… TOTAL 3.23e4
## 4 Agriculture and related Manage-ment, business, and financ… TOTAL 1.01e6
## 5 Agriculture and related Management, business, and financi… TOTAL 1.04e6
## 6 Agriculture and related Office and administrative support… TOTAL 8.58e4
## 7 Agriculture and related Production occupations TOTAL 3.52e4
## 8 Agriculture and related Professional and related occupati… TOTAL 4.92e4
## 9 Agriculture and related Protective service occupations TOTAL 1.47e4
## 10 Agriculture and related Sales and related occupations TOTAL 1.57e4
## # ℹ 229 more rows
employment_demo <- employed_tidy %>%
filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
janitor::clean_names() %>%
left_join(employed_tidy %>%
filter(race_gender == "TOTAL") %>%
select(-race_gender) %>%
rename(total = n)) %>%
filter(total > 1e3) %>%
mutate(across(c(asian, black_or_african_american, women), ~ . / (total)),
total = log(total),
across(where(is.numeric), ~ as.numeric(scale(.)))) %>%
mutate(occupation = snakecase::to_snake_case(occupation))
## Joining with `by = join_by(occupation)`
employment_demo
## # A tibble: 230 × 5
## occupation asian black_or_african_ame…¹ women total
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 agriculture_and_related_constr… -0.553 -0.410 -1.31 -1.48
## 2 agriculture_and_related_farmin… -0.943 -1.22 -0.509 0.706
## 3 agriculture_and_related_instal… -0.898 -1.28 -1.38 -0.992
## 4 agriculture_and_related_manage… -1.06 -1.66 -0.291 0.733
## 5 agriculture_and_related_manage… -1.06 -1.65 -0.300 0.750
## 6 agriculture_and_related_office… -0.671 -1.54 2.23 -0.503
## 7 agriculture_and_related_produc… -0.385 -0.0372 -0.622 -0.950
## 8 agriculture_and_related_profes… -0.364 -1.17 0.00410 -0.782
## 9 agriculture_and_related_protec… -1.35 -0.647 -0.833 -1.39
## 10 agriculture_and_related_sales_… -1.35 -1.44 0.425 -1.36
## # ℹ 220 more rows
## # ℹ abbreviated name: ¹black_or_african_american
employment_clust <- kmeans(select(employment_demo, -occupation), centers = 3)
summary(employment_clust)
## Length Class Mode
## cluster 230 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(employment_clust)
## # A tibble: 3 × 7
## asian black_or_african_american women total size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 -0.0673 1.21 0.160 -0.233 62 149. 1
## 2 -0.777 -0.652 -0.869 -0.602 80 149. 2
## 3 0.754 -0.261 0.677 0.711 88 209. 3
augment(employment_clust, employment_demo) %>%
ggplot(aes(total, black_or_african_american, color = .cluster)) +
geom_point(alpha = 0.8)
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(employment_demo, -occupation), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, employment_demo)
)
kclusts %>%
unnest(cols = c(glanced)) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.5, size = 1.2, color = "midnightblue") +
geom_point(size = 2, color = "midnightblue")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
final_clust <- kmeans(select(employment_demo, -occupation), centers = 5)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
employment_clust <- kmeans(select(employment_demo, -occupation), centers = 5)
p <- augment(final_clust, employment_demo) %>%
ggplot(aes(total, women, color = .cluster, name = occupation)) +
geom_point(alpha = 0.8)
ggplotly(p, height = 500)
What is the type of clustering used in the analysis? The clustering type we use in the analysis is k-means clustering. K-means clustering is a clustering algorithm that splits a set of observations into predetermined numbers of clusters. In this code along, k-means clustering is used to group similar occupations based on the scale of employment numbers in different demographic groups.
How is the optimal value for k found? We used the total sum of squares information in order to find the optimal value for k. Using tibble(k = 1:9) the code tries a range of k values from 1 to 9.