Import your data

# Load the "nhl_rosters" worksheet from the course workbook into a tibble.
rosters <- read_excel(
  path = "../00_data/myData.xlsx",
  sheet = "nhl_rosters"
)

# Print the tibble to check the import.
rosters
## # A tibble: 54,883 × 18
##    team_code   season position_type player_id headshot      first_name last_name
##    <chr>        <dbl> <chr>             <dbl> <chr>         <chr>      <chr>    
##  1 ATL       19992000 forwards        8467867 https://asse… Bryan      Adams    
##  2 ATL       19992000 forwards        8445176 https://asse… Donald     Audette  
##  3 ATL       19992000 forwards        8460014 https://asse… Eric       Bertrand 
##  4 ATL       19992000 forwards        8460510 https://asse… Jason      Botterill
##  5 ATL       19992000 forwards        8459596 https://asse… Andrew     Brunette 
##  6 ATL       19992000 forwards        8445733 https://asse… Kelly      Buchberg…
##  7 ATL       19992000 forwards        8460573 https://asse… Hnat       Domenich…
##  8 ATL       19992000 forwards        8459450 https://asse… Shean      Donovan  
##  9 ATL       19992000 forwards        8446675 https://asse… Nelson     Emerson  
## 10 ATL       19992000 forwards        8446823 https://asse… Ray        Ferraro  
## # ℹ 54,873 more rows
## # ℹ 11 more variables: sweater_number <chr>, position_code <chr>,
## #   shoots_catches <chr>, height_in_inches <dbl>, weight_in_pounds <dbl>,
## #   height_in_centimeters <dbl>, weight_in_kilograms <dbl>, birth_date <dttm>,
## #   birth_city <chr>, birth_country <chr>, birth_state_province <chr>

Chapter 15

Create a factor

Modify factor order

Make two bar charts here: one before ordering the factor and another after.

    # Transform data: average player height (in inches) for each team code.
height_by_team <- rosters %>%
  group_by(team_code) %>%
  # Missing heights are dropped so they do not turn a team's mean into NA.
  summarise(avg_player_height_team = mean(height_in_inches, na.rm = TRUE))

# Print the per-team summary.
height_by_team
## # A tibble: 58 × 2
##    team_code avg_player_height_team
##    <chr>                      <dbl>
##  1 AFM                         72.0
##  2 ANA                         73.2
##  3 ARI                         73.2
##  4 ATL                         72.9
##  5 BOS                         71.8
##  6 BRK                         70.6
##  7 BUF                         72.7
##  8 CAR                         73.0
##  9 CBJ                         73.2
## 10 CGS                         71.4
## # ℹ 48 more rows
  # Plot 1 (before ordering): team_code is mapped as-is, so the bars follow
  # the default alphabetical order of the discrete axis.
  height_by_team %>%
    ggplot(aes(x = avg_player_height_team, y = team_code)) +
    geom_col() +
    labs(x = "Average player height (inches)", y = "Team")

  # Plot 2 (after ordering): fct_reorder() sorts the team levels by their
  # average height, so the bars run from shortest to tallest team.
  height_by_team %>%
    ggplot(aes(
      x = avg_player_height_team,
      y = fct_reorder(team_code, avg_player_height_team)
    )) +
    geom_col() +
    labs(x = "Average player height (inches)", y = "Team")

Modify factor levels

Show examples of three functions:

  • fct_recode
  • fct_collapse
  • fct_lump
# Recompute the per-team average player height (in inches) for this section.
height_by_team <- rosters %>%
  group_by(team_code) %>%
  # Missing heights are dropped so they do not turn a team's mean into NA.
  summarise(avg_player_height_team = mean(height_in_inches, na.rm = TRUE))

# Print the per-team summary.
height_by_team
## # A tibble: 58 × 2
##    team_code avg_player_height_team
##    <chr>                      <dbl>
##  1 AFM                         72.0
##  2 ANA                         73.2
##  3 ARI                         73.2
##  4 ATL                         72.9
##  5 BOS                         71.8
##  6 BRK                         70.6
##  7 BUF                         72.7
##  8 CAR                         73.0
##  9 CBJ                         73.2
## 10 CGS                         71.4
## # ℹ 48 more rows
 # fct_recode(): rename individual levels by hand, written as
 # "new label" = "old level". Levels not listed keep their original code.
 rosters %>%
   select(team_code) %>%
   mutate(
     team_code = fct_recode(
       team_code,
       "Boston, Bruins" = "BOS",
       "Buffalo" = "BUF",
       "Toronto" = "TOR",
       "Montreal" = "MTL",
       "Chicago" = "CHI"
     )
   ) %>%
   # Count roster rows per (recoded) team to show the new labels.
   count(team_code)
## # A tibble: 58 × 2
##    team_code          n
##    <fct>          <int>
##  1 AFM              222
##  2 ANA             1093
##  3 ARI              359
##  4 ATL              410
##  5 Boston, Bruins  2988
##  6 BRK               26
##  7 Buffalo         1800
##  8 CAR              852
##  9 CBJ              853
## 10 CGS              184
## # ℹ 48 more rows
# fct_collapse(): merge several existing levels into one new level each.
# Team codes not named in either group are left unchanged.
rosters %>%
  select(team_code) %>%
  mutate(
    team_code = fct_collapse(
      team_code,
      good = c("BOS", "BUF", "EDM", "DET"),
      bad = c("CAR", "ANA")
    )
  ) %>%
  # Count roster rows per collapsed level.
  count(team_code)
## # A tibble: 54 × 2
##    team_code     n
##    <fct>     <int>
##  1 AFM         222
##  2 bad        1945
##  3 ARI         359
##  4 ATL         410
##  5 good       9247
##  6 BRK          26
##  7 CBJ         853
##  8 CGS         184
##  9 CGY        1530
## 10 CHI        2819
## # ℹ 44 more rows
 # fct_lump(): pool every team that accounts for less than 3.5% of the roster
 # rows into a single "Other" level, then list levels from most to least
 # frequent.
 rosters %>%
   mutate(teams_lump = fct_lump(team_code, prop = 0.035)) %>%
   # Spell out TRUE: the shorthand T is an ordinary variable and can be
   # reassigned.
   count(teams_lump, sort = TRUE)
## # A tibble: 10 × 2
##    teams_lump     n
##    <fct>      <int>
##  1 Other      31459
##  2 MTL         3009
##  3 BOS         2988
##  4 TOR         2944
##  5 NYR         2943
##  6 DET         2883
##  7 CHI         2819
##  8 PIT         1979
##  9 STL         1935
## 10 PHI         1924