Import your data

# Monthly births in Canada, 1991-2022 (TidyTuesday 2024-01-09), read from GitHub.
canada_births_1991_2022 <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-01-09/canada_births_1991_2022.csv"
)
## Rows: 384 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): year, month, births
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# One row per NHL player with birth date and birthplace columns, read from GitHub.
nhl_player_births <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-01-09/nhl_player_births.csv"
)
## Rows: 8474 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): first_name, last_name, birth_city, birth_country, birth_state_prov...
## dbl  (3): player_id, birth_year, birth_month
## date (1): birth_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# NHL roster table (TidyTuesday 2024-01-09), read from GitHub.
nhl_rosters <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-01-09/nhl_rosters.csv"
)
## Rows: 54883 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): team_code, position_type, headshot, first_name, last_name, positi...
## dbl   (7): season, player_id, sweater_number, height_in_inches, weight_in_po...
## date  (1): birth_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Lookup table of team codes and full team names, read from GitHub.
nhl_teams <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-01-09/nhl_teams.csv"
)
## Rows: 59 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): team_code, full_name
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chapter 15

Create a factor

# Tally Canadian-born players for each value of birth_state_province
nhl_player_births %>%
  filter(birth_country == "CAN") %>%
  group_by(birth_state_province) %>%
  tally()
## # A tibble: 12 × 2
##    birth_state_province          n
##    <chr>                     <int>
##  1 Alberta                     645
##  2 British Columbia            408
##  3 Manitoba                    408
##  4 New Brunswick                53
##  5 Newfoundland and Labrador    31
##  6 Northwest Territories         4
##  7 Nova Scotia                  79
##  8 Ontario                    2407
##  9 Prince Edward Island         35
## 10 Quebec                      865
## 11 Saskatchewan                530
## 12 Yukon Territory               3
# Levels for birth_state_province.
# factor() silently converts any value that is not listed in `levels` to NA,
# so every province AND territory that occurs in the Canadian rows must be
# listed here (including Northwest Territories and Yukon Territory).
prov_levels <- c(
  "Ontario", "Quebec", "Alberta", "British Columbia",
  "Saskatchewan", "Manitoba", "Nova Scotia", "New Brunswick",
  "Newfoundland and Labrador", "Prince Edward Island",
  "Northwest Territories", "Yukon Territory"
)

# Canadian-born players with birth_state_province stored as a factor whose
# level order follows prov_levels.
data_rev <- nhl_player_births %>%
  filter(birth_country == "CAN") %>%
  mutate(
    birth_state_province = factor(birth_state_province, levels = prov_levels)
  )

Modify factor order

Make two bar charts here: one before ordering, another after.

# Number of Canadian-born players per birth province/territory,
# stored in a column named `players`
data_summary <- nhl_player_births %>%
  filter(birth_country == "CAN") %>%
  count(birth_state_province, name = "players")

data_summary
## # A tibble: 12 × 2
##    birth_state_province      players
##    <chr>                       <int>
##  1 Alberta                       645
##  2 British Columbia              408
##  3 Manitoba                      408
##  4 New Brunswick                  53
##  5 Newfoundland and Labrador      31
##  6 Northwest Territories           4
##  7 Nova Scotia                    79
##  8 Ontario                      2407
##  9 Prince Edward Island           35
## 10 Quebec                        865
## 11 Saskatchewan                  530
## 12 Yukon Territory                 3
# First plot: bar chart with provinces in their default order.
# geom_col() is used because `players` already holds the per-province count,
# so bar length should map directly to that column.
ggplot(data_summary, aes(players, birth_state_province)) +
  geom_col()

# Second plot: same bar chart, with provinces reordered by number of players
# via fct_reorder() so the bars are sorted by length.
ggplot(data_summary, aes(players, fct_reorder(birth_state_province, players))) +
  geom_col()

Modify factor levels

Show examples of three functions:

  • fct_recode
# Rename province levels to abbreviations (new name = old level).
# Levels that are not mentioned keep their original labels.
nhl_player_births %>%
  filter(birth_country == "CAN") %>%
  mutate(
    birth_state_province = fct_recode(
      birth_state_province,
      "AB"  = "Alberta",
      "BC"  = "British Columbia",
      "MB"  = "Manitoba",
      "NB"  = "New Brunswick",
      "NL"  = "Newfoundland and Labrador",
      "NS"  = "Nova Scotia",
      "ON"  = "Ontario",
      "PEI" = "Prince Edward Island",
      "QC"  = "Quebec",
      "SK"  = "Saskatchewan"
    )
  ) %>%
  count(birth_state_province)
## # A tibble: 12 × 2
##    birth_state_province      n
##    <fct>                 <int>
##  1 AB                      645
##  2 BC                      408
##  3 MB                      408
##  4 NB                       53
##  5 NL                       31
##  6 Northwest Territories     4
##  7 NS                       79
##  8 ON                     2407
##  9 PEI                      35
## 10 QC                      865
## 11 SK                      530
## 12 Yukon Territory           3
  • fct_collapse
# Merge the ten provinces into two regional groups.
# Levels not named in either group are left unchanged.
nhl_player_births %>%
  filter(birth_country == "CAN") %>%
  mutate(
    birth_state_province = fct_collapse(
      birth_state_province,
      West = c(
        "Alberta", "British Columbia", "Manitoba", "Saskatchewan"
      ),
      East = c(
        "New Brunswick", "Newfoundland and Labrador", "Nova Scotia",
        "Ontario", "Prince Edward Island", "Quebec"
      )
    )
  ) %>%
  count(birth_state_province)
## # A tibble: 4 × 2
##   birth_state_province      n
##   <fct>                 <int>
## 1 West                   1991
## 2 East                   3470
## 3 Northwest Territories     4
## 4 Yukon Territory           3
  • fct_lump
# Keep the three most frequent provinces and lump all remaining levels
# together into a single "Other" level
nhl_player_births %>%
  filter(birth_country == "CAN") %>%
  mutate(
    birth_state_province = birth_state_province %>%
      fct_lump(n = 3)
  ) %>%
  count(birth_state_province)
## # A tibble: 4 × 2
##   birth_state_province     n
##   <fct>                <int>
## 1 Alberta                645
## 2 Ontario               2407
## 3 Quebec                 865
## 4 Other                 1551