# Read the Excel workbook (the path is relative to the working directory)
# and print a preview of the imported tibble.
myData <- read_excel(path = "../00_data/myData.xlsx")
myData
## # A tibble: 49,384 × 25
##    player       season  rank date                game_…¹ age   team  at    opp  
##    <chr>         <dbl> <dbl> <dttm>                <dbl> <chr> <chr> <chr> <chr>
##  1 Alex Ovechk…   2006     1 2005-10-05 00:00:00       1 20-0… WSH   NA    CBJ  
##  2 Alex Ovechk…   2006     2 2005-10-07 00:00:00       2 20-0… WSH   NA    ATL  
##  3 Alex Ovechk…   2006     3 2005-10-08 00:00:00       3 20-0… WSH   @     ATL  
##  4 Alex Ovechk…   2006     4 2005-10-10 00:00:00       4 20-0… WSH   NA    NYR  
##  5 Alex Ovechk…   2006     5 2005-10-12 00:00:00       5 20-0… WSH   @     CAR  
##  6 Alex Ovechk…   2006     6 2005-10-13 00:00:00       6 20-0… WSH   NA    NYI  
##  7 Alex Ovechk…   2006     7 2005-10-16 00:00:00       7 20-0… WSH   NA    TBL  
##  8 Alex Ovechk…   2006     8 2005-10-20 00:00:00       8 20-0… WSH   @     FLA  
##  9 Alex Ovechk…   2006     9 2005-10-22 00:00:00       9 20-0… WSH   NA    CAR  
## 10 Alex Ovechk…   2006    10 2005-10-26 00:00:00      10 20-0… WSH   @     BUF  
## # … with 49,374 more rows, 16 more variables: location <chr>, outcome <chr>,
## #   goals <dbl>, assists <dbl>, points <dbl>, plus_minus <dbl>,
## #   penalty_min <dbl>, goals_even <dbl>, goals_powerplay <dbl>,
## #   goals_short <dbl>, goals_gamewinner <dbl>, assists_even <chr>,
## #   assists_powerplay <chr>, assists_short <chr>, shots <dbl>,
## #   shot_percent <chr>, and abbreviated variable name ¹​game_num
# Per-column overview of myData: types, missing values and distributions.
skimr::skim(data = myData)
Data summary
Name myData
Number of rows 49384
Number of columns 25
_______________________
Column type frequency:
character 11
numeric 13
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
player 0 1 9 16 0 42 0
age 0 1 6 6 0 8827 0
team 0 1 3 3 0 37 0
at 0 1 1 2 0 2 0
opp 0 1 3 3 0 41 0
location 0 1 4 4 0 2 0
outcome 0 1 1 4 0 5 0
assists_even 0 1 1 2 0 8 0
assists_powerplay 0 1 1 2 0 5 0
assists_short 0 1 1 2 0 5 0
shot_percent 0 1 1 18 0 41 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
season 0 1 2005.25 10.71 1980 1997 2008 2014 2020 ▂▃▅▇▇
rank 0 1 37.54 22.37 1 18 36 56 84 ▇▇▇▆▃
game_num 0 1 37.54 22.37 1 18 36 56 84 ▇▇▇▆▃
goals 0 1 0.41 0.66 0 0 0 1 5 ▇▁▁▁▁
assists 0 1 0.55 0.78 0 0 0 1 7 ▇▁▁▁▁
points 0 1 0.97 1.04 0 0 1 2 8 ▇▂▁▁▁
plus_minus 0 1 0.08 1.32 -6 -1 0 1 8 ▁▃▇▁▁
penalty_min 0 1 0.65 1.69 0 0 0 0 35 ▇▁▁▁▁
goals_even 0 1 0.27 0.53 0 0 0 0 5 ▇▁▁▁▁
goals_powerplay 0 1 0.13 0.36 0 0 0 0 4 ▇▁▁▁▁
goals_short 0 1 0.01 0.12 0 0 0 0 2 ▇▁▁▁▁
goals_gamewinner 0 1 0.06 0.24 0 0 0 0 1 ▇▁▁▁▁
shots 0 1 3.02 1.99 0 2 3 4 15 ▇▃▁▁▁

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
date 0 1 1979-10-10 2020-02-26 2007-12-16 6523
# Keep only player, goals and outcome, then draw 10 random rows.
# The fixed seed makes the draw reproducible.
set.seed(123)
data <- sample_n(select(myData, player, goals, outcome), 10)
# Tally how many sampled rows carry each outcome code.
count(data, outcome)
## # A tibble: 3 × 2
##   outcome     n
##   <chr>   <int>
## 1 L           5
## 2 L-SO        1
## 3 W           4
# Recode the terse outcome codes into readable labels. fct_recode() takes
# "new label" = "old level" pairs and returns a factor.
# Fix: the "L-SO" label previously read "loss shoutout" (typo for "shootout").
data %>%
  mutate(
    outcome_rev = fct_recode(
      outcome,
      "loss" = "L",
      "win" = "W",
      "loss shootout" = "L-SO"
    )
  )
## # A tibble: 10 × 4
##    player           goals outcome outcome_rev  
##    <chr>            <dbl> <chr>   <fct>        
##  1 Brendan Shanahan     0 L       loss         
##  2 Luc Robitaille       0 W       win          
##  3 Luc Robitaille       1 L       loss         
##  4 Patrick Kane         1 L       loss         
##  5 Brad Marchand        1 L-SO    loss shootout
##  6 Teemu Selanne        1 W       win          
##  7 Tyler Seguin         0 L       loss         
##  8 Patrick Marleau      1 W       win          
##  9 Dino Ciccarelli      0 W       win          
## 10 Mario Lemieux        2 L       loss
# Merge both loss codes ("L" and "L-SO") into a single "loss" level and
# relabel "W" as "win".
data %>%
  mutate(
    outcome_col = fct_collapse(
      outcome,
      loss = c("L", "L-SO"),
      win = "W"
    )
  )
## # A tibble: 10 × 4
##    player           goals outcome outcome_col
##    <chr>            <dbl> <chr>   <fct>      
##  1 Brendan Shanahan     0 L       loss       
##  2 Luc Robitaille       0 W       win        
##  3 Luc Robitaille       1 L       loss       
##  4 Patrick Kane         1 L       loss       
##  5 Brad Marchand        1 L-SO    loss       
##  6 Teemu Selanne        1 W       win        
##  7 Tyler Seguin         0 L       loss       
##  8 Patrick Marleau      1 W       win        
##  9 Dino Ciccarelli      0 W       win        
## 10 Mario Lemieux        2 L       loss
# Show the outcome frequencies again to see which level is most common.
count(data, outcome)
## # A tibble: 3 × 2
##   outcome     n
##   <chr>   <int>
## 1 L           5
## 2 L-SO        1
## 3 W           4
# Keep only the single most frequent outcome level and pool every other level
# into "Other". fct_lump_n() is the explicit variant of the general-purpose
# fct_lump(), which forcats retains mainly for historical reasons; for n = 1
# the result is the same.
data %>%
  mutate(outcome_lump = fct_lump_n(outcome, n = 1))
## # A tibble: 10 × 4
##    player           goals outcome outcome_lump
##    <chr>            <dbl> <chr>   <fct>       
##  1 Brendan Shanahan     0 L       L           
##  2 Luc Robitaille       0 W       Other       
##  3 Luc Robitaille       1 L       L           
##  4 Patrick Kane         1 L       L           
##  5 Brad Marchand        1 L-SO    Other       
##  6 Teemu Selanne        1 W       Other       
##  7 Tyler Seguin         0 L       L           
##  8 Patrick Marleau      1 W       Other       
##  9 Dino Ciccarelli      0 W       Other       
## 10 Mario Lemieux        2 L       L