library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
employed_tidy <- employed %>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
summarise(n = mean(employ_n)) %>%
ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.
employed_tidy %>%
filter(race_gender == "TOTAL")
## # A tibble: 239 × 3
## occupation race_gender n
## <chr> <chr> <dbl>
## 1 Agriculture and related Construction and extraction occup… TOTAL 1.22e4
## 2 Agriculture and related Farming, fishing, and forestry oc… TOTAL 9.56e5
## 3 Agriculture and related Installation, maintenance, and re… TOTAL 3.23e4
## 4 Agriculture and related Manage-ment, business, and financ… TOTAL 1.01e6
## 5 Agriculture and related Management, business, and financi… TOTAL 1.04e6
## 6 Agriculture and related Office and administrative support… TOTAL 8.58e4
## 7 Agriculture and related Production occupations TOTAL 3.52e4
## 8 Agriculture and related Professional and related occupati… TOTAL 4.92e4
## 9 Agriculture and related Protective service occupations TOTAL 1.47e4
## 10 Agriculture and related Sales and related occupations TOTAL 1.57e4
## # ℹ 229 more rows
employment_demo <- employed_tidy %>%
filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
janitor::clean_names() %>%
left_join(employed_tidy %>%
filter(race_gender == "TOTAL")) %>%
select(-race_gender) %>%
rename(total = n) %>%
filter(total > 1e4) %>%
mutate(across(c(asian, black_or_african_american, women), ~ . / total),
total = log(total),
across(is.numeric, ~as.numeric(scale(.)))) %>%
mutate(occupation = snakecase::to_snake_case(occupation))
## Joining with `by = join_by(occupation)`
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(is.numeric, ~as.numeric(scale(.)))`.
## Caused by warning:
## ! Use of bare predicate functions was deprecated in tidyselect 1.1.0.
## ℹ Please use wrap predicates in `where()` instead.
## # Was:
## data %>% select(is.numeric)
##
## # Now:
## data %>% select(where(is.numeric))
employment_demo
## # A tibble: 211 × 5
## occupation asian black_or_african_ame…¹ women total
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 agriculture_and_related_constr… -0.625 -0.501 -1.33 -1.98
## 2 agriculture_and_related_farmin… -1.05 -1.38 -0.515 0.630
## 3 agriculture_and_related_instal… -0.999 -1.44 -1.41 -1.39
## 4 agriculture_and_related_manage… -1.17 -1.87 -0.293 0.662
## 5 agriculture_and_related_manage… -1.17 -1.85 -0.302 0.682
## 6 agriculture_and_related_office… -0.753 -1.73 2.28 -0.811
## 7 agriculture_and_related_produc… -0.444 -0.0947 -0.631 -1.34
## 8 agriculture_and_related_profes… -0.421 -1.33 0.00826 -1.14
## 9 agriculture_and_related_protec… -1.48 -0.758 -0.846 -1.87
## 10 agriculture_and_related_sales_… -1.48 -1.62 0.437 -1.83
## # ℹ 201 more rows
## # ℹ abbreviated name: ¹black_or_african_american
employment_clust <- kmeans(select(employment_demo, - occupation), centers = 3)
summary(employment_clust)
## Length Class Mode
## cluster 211 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(employment_clust)
## # A tibble: 3 × 7
## asian black_or_african_american women total size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 -0.699 -0.392 -0.869 -0.657 84 159. 1
## 2 -0.0407 0.738 0.716 0.184 80 202. 2
## 3 1.32 -0.556 0.333 0.860 47 102. 3
augment(employment_clust, employment_demo) %>%
ggplot(aes(total, black_or_african_american, color = .cluster)) +
geom_point()
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(employment_demo, - occupation), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, employment_demo)
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.8) +
geom_point(size = 2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
employment_clust <- kmeans(select(employment_demo, - occupation), centers = 5)
p <- augment(employment_clust, employment_demo) %>%
ggplot(aes(total, women, color = .cluster, name = occupation)) +
geom_point(alpha = 0.8)
ggplotly(p)
The dataset contains employment statistics across various industries and occupations, detailed by race and gender. Numeric fields record total employment numbers and years, while character fields describe industry and job classifications. It is pre-processed to average employment figures by demographic groups, particularly focusing on women, and Black or African American, and Asian categories.
The key variables in the analysis include ‘industry’, ‘minor_occupation’, ‘race_gender’, and ‘employ_n’, representing occupational categories, demographic groups, and average employment numbers, respectively. These variables are transformed for clustering based on race and gender demographics within occupations.
The original data provides raw employment figures by demographic groups across occupations. The transformed data is normalized, aggregated, and scaled for modeling to highlight relative differences and trends, facilitating more meaningful clustering and pattern detection in employment demographics across various industries.
The analysis employs k-means clustering, a method that partitions the data into k distinct clusters based on feature similarity, aiming to minimize within-cluster variance while maximizing between-cluster differences. This technique is used to identify underlying patterns in employment demographics.
The optimal value for k is determined using the elbow method, which involves running the k-means clustering algorithm for a range of k values and plotting the total within-cluster sum of squares (tot.withinss) against k. The “elbow point,” where the rate of decrease sharply changes, suggests the optimal number of clusters to use for the analysis.