Import your data

# Download the TidyTuesday (2024-01-09) NHL roster data straight from GitHub.
nhl_rosters <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-01-09/nhl_rosters.csv"
)
## Rows: 54883 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): team_code, position_type, headshot, first_name, last_name, positi...
## dbl   (7): season, player_id, sweater_number, height_in_inches, weight_in_po...
## date  (1): birth_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Fix the RNG seed so the same 500 players are drawn on every run.
set.seed(121)

# Keep only the columns used below, then take a random sample of 500 rows.
nhl_filtered <- nhl_rosters %>%
  select(
    team_code,
    last_name,
    position_type,
    weight_in_pounds,
    height_in_centimeters,
    birth_country
  ) %>%
  sample_n(500)

# Preview the sampled data.
nhl_filtered
## # A tibble: 500 × 6
##    team_code last_name   position_type weight_in_pounds height_in_centimeters
##    <chr>     <chr>       <chr>                    <dbl>                 <dbl>
##  1 EDM       Chychrun    defensemen                 215                   193
##  2 MTL       Hodge       goalies                    150                   168
##  3 DAL       Harvey      forwards                   210                   183
##  4 TBL       Leach       defensemen                 220                   196
##  5 MIN       Clutterbuck forwards                   212                   183
##  6 DET       Giacomin    goalies                    180                   180
##  7 PHI       Simmonds    forwards                   184                   188
##  8 DAL       Nieuwendyk  forwards                   209                   188
##  9 EDM       Semenko     forwards                   216                   188
## 10 MNS       Rombough    forwards                   215                   191
## # ℹ 490 more rows
## # ℹ 1 more variable: birth_country <chr>

Chapter 15

Create a factor

Modify factor order

NHL Player Weight by country: unordered, then ordered

# One row per birth country: mean weight (missing values ignored) and the
# number of sampled players from that country.
nhl_weights <- nhl_filtered %>%
  group_by(birth_country) %>%
  summarise(
    weight_in_pounds = mean(weight_in_pounds, na.rm = TRUE),
    n = n()
  )

# Unordered: birth_country is plain text here, so the bars follow the default
# ordering of the country codes rather than the weights.
nhl_weights %>%
  ggplot(aes(x = birth_country, y = weight_in_pounds)) +
  geom_col() +
  labs(
    title = "Avg Weight by Birth Country",
    x = "Birth Country",
    y = "Avg Weight (lbs)"
  ) +
  # Tilt the country codes so neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Ordered: turn birth_country into a factor whose levels are sorted by the
# average weight, so the bars rise from lightest to heaviest.
nhl_weights %>%
  mutate(birth_country = fct_reorder(birth_country, weight_in_pounds)) %>%
  ggplot(aes(x = birth_country, y = weight_in_pounds)) +
  geom_col() +
  labs(
    title = "Avg Weight by Birth Country",
    x = "Birth Country",
    y = "Avg Weight (lbs)"
  ) +
  # Tilt the country codes so neighbouring labels do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

NHL Player Height by country unordered

# One row per birth country: mean height in centimetres (missing values
# ignored) and the number of sampled players from that country.
nhl_heights <- nhl_filtered %>%
  group_by(birth_country) %>%
  summarise(
    height_in_centimeters = mean(height_in_centimeters, na.rm = TRUE),
    n = n()
  )

# Average height per birth country, bars in the default (unordered) sequence.
# Labelled and styled like the weight charts so the plots read consistently:
# without labs() the axes would show the raw column names, and without the
# rotated tick text the country codes can crowd each other.
ggplot(nhl_heights, aes(x = birth_country, y = height_in_centimeters)) +
  geom_col() +
  labs(
    x = "Birth Country",
    y = "Avg Height (cm)",
    title = "Avg Height by Birth Country"
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar chart of how many sampled players come from each birth country
# (geom_bar() counts the rows per x value).
nhl_filtered %>%
  ggplot(aes(x = birth_country)) +
  geom_bar()

Modify factor levels

fct_recode:

# List each birth country that occurs in the sample once.
distinct(nhl_filtered, birth_country)
## # A tibble: 14 × 1
##    birth_country
##    <chr>        
##  1 CAN          
##  2 USA          
##  3 RUS          
##  4 GBR          
##  5 NLD          
##  6 UKR          
##  7 SVK          
##  8 FIN          
##  9 CHE          
## 10 CZE          
## 11 SWE          
## 12 DEU          
## 13 FRA          
## 14 LVA
# Rename the "CAN" level to "cold place" in a new factor column, then show
# only the Canadian rows to compare the original and recoded values.
nhl_filtered %>%
  mutate(
    birth_country_rev = fct_recode(birth_country, "cold place" = "CAN")
  ) %>%
  select(birth_country, birth_country_rev) %>%
  filter(birth_country == "CAN")
## # A tibble: 322 × 2
##    birth_country birth_country_rev
##    <chr>         <fct>            
##  1 CAN           cold place       
##  2 CAN           cold place       
##  3 CAN           cold place       
##  4 CAN           cold place       
##  5 CAN           cold place       
##  6 CAN           cold place       
##  7 CAN           cold place       
##  8 CAN           cold place       
##  9 CAN           cold place       
## 10 CAN           cold place       
## # ℹ 312 more rows
  • fct_collapse:
# Merge the "FIN" and "SWE" levels into one level in a new factor column,
# then show only those rows to compare the original and collapsed values.
nhl_filtered %>%
  mutate(
    birth_country_col = fct_collapse(
      birth_country,
      "Scandinavian country" = c("FIN", "SWE")
    )
  ) %>%
  select(birth_country, birth_country_col) %>%
  filter(birth_country %in% c("FIN", "SWE"))
## # A tibble: 37 × 2
##    birth_country birth_country_col   
##    <chr>         <fct>               
##  1 FIN           Scandinavian country
##  2 SWE           Scandinavian country
##  3 FIN           Scandinavian country
##  4 SWE           Scandinavian country
##  5 FIN           Scandinavian country
##  6 SWE           Scandinavian country
##  7 SWE           Scandinavian country
##  8 FIN           Scandinavian country
##  9 FIN           Scandinavian country
## 10 FIN           Scandinavian country
## # ℹ 27 more rows
  • fct_lump:
# Number of sampled players per birth country.
count(nhl_filtered, birth_country)
## # A tibble: 14 × 2
##    birth_country     n
##    <chr>         <int>
##  1 CAN             322
##  2 CHE               1
##  3 CZE              13
##  4 DEU               4
##  5 FIN              13
##  6 FRA               1
##  7 GBR               5
##  8 LVA               1
##  9 NLD               1
## 10 RUS              21
## 11 SVK               8
## 12 SWE              24
## 13 UKR               2
## 14 USA              84
# Lump the rare countries together with fct_lump()'s default settings and
# list the levels that remain (here only "CAN" and "Other" survive).
nhl_filtered %>%
  mutate(country_lump = fct_lump(birth_country)) %>%
  distinct(country_lump)
## # A tibble: 2 × 1
##   country_lump
##   <fct>       
## 1 CAN         
## 2 Other