library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## industry = col_character(),
## major_occupation = col_character(),
## minor_occupation = col_character(),
## race_gender = col_character(),
## industry_total = col_double(),
## employ_n = col_double(),
## year = col_double()
## )
employed_tidy <- employed %>%
filter(year==2020)%>%
filter(!is.na(employ_n)) %>%
group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
summarise(n = mean(employ_n)) %>%
ungroup()
## `summarise()` has grouped output by 'occupation'. You can override using the `.groups` argument.
employment_demo <- employed_tidy %>%
filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
janitor::clean_names() %>%
left_join(employed_tidy %>%
filter(race_gender == "TOTAL") %>%
select(-race_gender) %>%
rename(total = n)) %>%
filter(total > 1e3) %>%
mutate(across(c(asian, black_or_african_american, women), ~ . / (total)),
total = log(total),
across(where(is.numeric), ~ as.numeric(scale(.)))
) %>%
mutate(occupation = snakecase::to_snake_case(occupation))
## Joining, by = "occupation"
employment_demo
## # A tibble: 200 x 5
## occupation asian black_or_african_a~ women total
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 agriculture_and_related_constructi~ -1.01 1.27 -1.40 -1.60
## 2 agriculture_and_related_farming_fi~ -0.743 -0.997 -0.431 0.743
## 3 agriculture_and_related_installati~ -1.01 -0.284 -1.15 -0.985
## 4 agriculture_and_related_management~ -0.861 -1.49 -0.304 0.759
## 5 agriculture_and_related_office_and~ -0.577 -1.43 2.03 -0.433
## 6 agriculture_and_related_production~ 0.484 -0.900 -0.708 -0.861
## 7 agriculture_and_related_profession~ 0.0710 -0.351 0.181 -0.694
## 8 agriculture_and_related_protective~ -1.01 -1.58 -0.762 -1.47
## 9 agriculture_and_related_sales_and_~ -1.01 -0.392 0.685 -1.51
## 10 agriculture_and_related_service_oc~ -1.01 -0.972 0.463 -0.444
## # ... with 190 more rows
employment_clust <- kmeans(select(employment_demo, -occupation), centers = 3)
summary(employment_clust)
## Length Class Mode
## cluster 200 -none- numeric
## centers 12 -none- numeric
## totss 1 -none- numeric
## withinss 3 -none- numeric
## tot.withinss 1 -none- numeric
## betweenss 1 -none- numeric
## size 3 -none- numeric
## iter 1 -none- numeric
## ifault 1 -none- numeric
library(broom)
tidy(employment_clust)
## # A tibble: 3 x 7
## asian black_or_african_american women total size withinss cluster
## <dbl> <dbl> <dbl> <dbl> <int> <dbl> <fct>
## 1 0.0370 0.326 0.693 0.488 93 253. 1
## 2 2.28 -0.0701 0.0542 0.0977 20 89.6 2
## 3 -0.563 -0.333 -0.753 -0.545 87 160. 3
augment(employment_clust, employment_demo) %>%
ggplot(aes(total, black_or_african_american, color = .cluster)) +
geom_point()
### Chossing K
kclusts <-
tibble(k = 1:9) %>%
mutate(
kclust = map(k, ~ kmeans(select(employment_demo, -occupation), .x)),
glanced = map(kclust, glance),
)
kclusts %>%
unnest(cols = c(glanced)) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.5, size = 1.2, color = "midnightblue") +
geom_point(size = 2, color = "midnightblue")
final_clust <- kmeans(select(employment_demo, -occupation), centers = 5)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p <- augment(final_clust, employment_demo) %>%
ggplot(aes(total, women, color = .cluster, name = occupation)) +
geom_point()
ggplotly(p, height = 500)