library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
employed_tidy <- employed %>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
summarise(n = mean(employ_n)) %>%
ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.
employment_demo <- employed_tidy %>%
filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
janitor::clean_names() %>%
left_join(employed_tidy %>%
filter(race_gender == "TOTAL") %>%
select(-race_gender) %>%
rename(total = n)) %>%
filter(total > 1e3) %>%
mutate(across(c(asian, black_or_african_american, women), ~ . / (total)),
total = log(total),
across(where(is.numeric), ~ as.numeric(scale(.)))
) %>%
mutate(occupation = snakecase::to_snake_case(occupation))
## Joining with `by = join_by(occupation)`
employment_demo
## # A tibble: 230 × 5
## occupation asian black_or_african_ame…¹ women total
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 agriculture_and_related_constr… -0.553 -0.410 -1.31 -1.48
## 2 agriculture_and_related_farmin… -0.943 -1.22 -0.509 0.706
## 3 agriculture_and_related_instal… -0.898 -1.28 -1.38 -0.992
## 4 agriculture_and_related_manage… -1.06 -1.66 -0.291 0.733
## 5 agriculture_and_related_manage… -1.06 -1.65 -0.300 0.750
## 6 agriculture_and_related_office… -0.671 -1.54 2.23 -0.503
## 7 agriculture_and_related_produc… -0.385 -0.0372 -0.622 -0.950
## 8 agriculture_and_related_profes… -0.364 -1.17 0.00410 -0.782
## 9 agriculture_and_related_protec… -1.35 -0.647 -0.833 -1.39
## 10 agriculture_and_related_sales_… -1.35 -1.44 0.425 -1.36
## # ℹ 220 more rows
## # ℹ abbreviated name: ¹​black_or_african_american
employment_clust <- kmeans(select(employment_demo, -occupation), centers = 3)
summary(employment_clust)
## Length Class Mode
## cluster 230 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(employment_clust)
## # A tibble: 3 × 7
## asian black_or_african_american women total size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 0.0241 0.719 0.680 0.426 88 197. 1
## 2 -0.699 -0.396 -0.796 -0.620 97 202. 2
## 3 1.46 -0.551 0.385 0.503 45 125. 3
augment(employment_clust, employment_demo) %>%
ggplot(aes(total, black_or_african_american, color = .cluster)) +
geom_point()
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(employment_demo, -occupation), .x)),
glanced = map(kclust, glance)
)
kclusts %>%
unnest(cols = c(glanced)) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.5, size = 1.2, color = "midnightblue") +
geom_point(size = 2, color = "midnightblue")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
final_clust <- kmeans(select(employment_demo, -occupation), centers = 5)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- augment(final_clust, employment_demo) %>%
ggplot(aes(total, women, color = .cluster, name = occupation)) +
geom_point()
ggplotly(p, height = 500)
#1 - The modeling goal was to use k means clustering to explore employment by race and gender. - The data consists of what industry people are in, their occupation, race and gender, year, etc. - The main variables used in the analysis consist of race, gender, and occupation.
#2 - The original data had more variables and data points, while the transformed data has selected the main datait wants to work with. For example it has selected certain gender and race instead of using all of them. We needed to center and scale the variables since they are on such different scales.
#3 - k-means Clustering - To find optimal K value you can look at the total within-cluster squares and see if there is a drop off.
#4 - We were able to see what occupations are more alike in terms of what proportion of a certain demographic works in them.