Code Along 9

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

employed <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-02-23/employed.csv")

## Rows: 8184 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): industry, major_occupation, minor_occupation, race_gender
## dbl (3): industry_total, employ_n, year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

employed_tidy <- employed %>%
  filter(!is.na(employ_n)) %>%
  group_by(occupation = paste(industry, minor_occupation), race_gender) %>%
  summarise(n = mean(employ_n)) %>%
  ungroup()

## `summarise()` has grouped output by 'occupation'. You can override using the
## `.groups` argument.

employment_demo <- employed_tidy %>%
  filter(race_gender %in% c("Women", "Black or African American", "Asian")) %>%
  pivot_wider(names_from = race_gender, values_from = n, values_fill = 0) %>%
  janitor::clean_names() %>%
  left_join(employed_tidy %>%
    filter(race_gender == "TOTAL") %>%
    select(-race_gender) %>%
    rename(total = n)) %>%
  filter(total > 1e3) %>%
  mutate(across(c(asian, black_or_african_american, women), ~ . / (total)),
    total = log(total),
    across(where(is.numeric), ~ as.numeric(scale(.)))
  ) %>%
  mutate(occupation = snakecase::to_snake_case(occupation))

## Joining with `by = join_by(occupation)`

employment_demo

## # A tibble: 230 × 5
##    occupation                       asian black_or_african_ame…¹    women  total
##    <chr>                            <dbl>                  <dbl>    <dbl>  <dbl>
##  1 agriculture_and_related_constr… -0.553                -0.410  -1.31    -1.48 
##  2 agriculture_and_related_farmin… -0.943                -1.22   -0.509    0.706
##  3 agriculture_and_related_instal… -0.898                -1.28   -1.38    -0.992
##  4 agriculture_and_related_manage… -1.06                 -1.66   -0.291    0.733
##  5 agriculture_and_related_manage… -1.06                 -1.65   -0.300    0.750
##  6 agriculture_and_related_office… -0.671                -1.54    2.23    -0.503
##  7 agriculture_and_related_produc… -0.385                -0.0372 -0.622   -0.950
##  8 agriculture_and_related_profes… -0.364                -1.17    0.00410 -0.782
##  9 agriculture_and_related_protec… -1.35                 -0.647  -0.833   -1.39 
## 10 agriculture_and_related_sales_… -1.35                 -1.44    0.425   -1.36 
## # ℹ 220 more rows
## # ℹ abbreviated name: ¹black_or_african_american

K means clustering

employment_clust <- kmeans(select(employment_demo, -occupation), centers = 3)
summary(employment_clust)

##              Length Class  Mode   
## cluster      230    -none- numeric
## centers       12    -none- numeric
## totss          1    -none- numeric
## withinss       3    -none- numeric
## tot.withinss   1    -none- numeric
## betweenss      1    -none- numeric
## size           3    -none- numeric
## iter           1    -none- numeric
## ifault         1    -none- numeric

library(broom)
tidy(employment_clust)

## # A tibble: 3 × 7
##     asian black_or_african_american  women  total  size withinss cluster
##     <dbl>                     <dbl>  <dbl>  <dbl> <int>    <dbl> <fct>  
## 1  0.0241                     0.719  0.680  0.426    88     197. 1      
## 2 -0.699                     -0.396 -0.796 -0.620    97     202. 2      
## 3  1.46                      -0.551  0.385  0.503    45     125. 3

augment(employment_clust, employment_demo) %>%
  ggplot(aes(total, black_or_african_american, color = .cluster)) +
  geom_point()

Choosing K

kclusts <-
  tibble(k = 1:9) %>%
  mutate(
    kclust = map(k, ~ kmeans(select(employment_demo, -occupation), .x)),
    glanced = map(kclust, glance)
  )

kclusts %>%
  unnest(cols = c(glanced)) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = 0.5, size = 1.2, color = "midnightblue") +
  geom_point(size = 2, color = "midnightblue")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

final_clust <- kmeans(select(employment_demo, -occupation), centers = 5)

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

p <- augment(final_clust, employment_demo) %>%
  ggplot(aes(total, women, color = .cluster, name = occupation)) +
  geom_point()

ggplotly(p, height = 500)

#1 - The modeling goal was to use k means clustering to explore employment by race and gender. - The data consists of what industry people are in, their occupation, race and gender, year, etc. - The main variables used in the analysis consist of race, gender, and occupation.

#2 - The original data had more variables and data points, while the transformed data has selected the main datait wants to work with. For example it has selected certain gender and race instead of using all of them. We needed to center and scale the variables since they are on such different scales.

#3 - k-means Clustering - To find optimal K value you can look at the total within-cluster squares and see if there is a drop off.

#4 - We were able to see what occupations are more alike in terms of what proportion of a certain demographic works in them.

Code Along 9

Colin Tracy

2023-11-07

K means clustering

Choosing K