# Load the game-log workbook into a tibble, then print it to preview the data.
# NOTE(review): the printed `at` column shows NA while skim reports zero missing
# values for it, so "NA" looks like it is stored as literal text (which would also
# explain the <chr> assists_* columns). Consider `na = "NA"` -- confirm against
# the workbook before changing.
myData <- read_excel(path = "../00_data/myData.xlsx")
myData
## # A tibble: 49,384 × 25
## player season rank date game_…¹ age team at opp
## <chr> <dbl> <dbl> <dttm> <dbl> <chr> <chr> <chr> <chr>
## 1 Alex Ovechk… 2006 1 2005-10-05 00:00:00 1 20-0… WSH NA CBJ
## 2 Alex Ovechk… 2006 2 2005-10-07 00:00:00 2 20-0… WSH NA ATL
## 3 Alex Ovechk… 2006 3 2005-10-08 00:00:00 3 20-0… WSH @ ATL
## 4 Alex Ovechk… 2006 4 2005-10-10 00:00:00 4 20-0… WSH NA NYR
## 5 Alex Ovechk… 2006 5 2005-10-12 00:00:00 5 20-0… WSH @ CAR
## 6 Alex Ovechk… 2006 6 2005-10-13 00:00:00 6 20-0… WSH NA NYI
## 7 Alex Ovechk… 2006 7 2005-10-16 00:00:00 7 20-0… WSH NA TBL
## 8 Alex Ovechk… 2006 8 2005-10-20 00:00:00 8 20-0… WSH @ FLA
## 9 Alex Ovechk… 2006 9 2005-10-22 00:00:00 9 20-0… WSH NA CAR
## 10 Alex Ovechk… 2006 10 2005-10-26 00:00:00 10 20-0… WSH @ BUF
## # … with 49,374 more rows, 16 more variables: location <chr>, outcome <chr>,
## # goals <dbl>, assists <dbl>, points <dbl>, plus_minus <dbl>,
## # penalty_min <dbl>, goals_even <dbl>, goals_powerplay <dbl>,
## # goals_short <dbl>, goals_gamewinner <dbl>, assists_even <chr>,
## # assists_powerplay <chr>, assists_short <chr>, shots <dbl>,
## # shot_percent <chr>, and abbreviated variable name ¹game_num
# Summarise every column of myData by type (missingness, uniqueness, distribution).
# Called directly rather than piped so the summary reports the name "myData".
skimr::skim(myData)
Data summary

|                          |        |
|:-------------------------|:-------|
| Name                     | myData |
| Number of rows           | 49384  |
| Number of columns        | 25     |
| _______________________  |        |
| Column type frequency:   |        |
| character                | 11     |
| numeric                  | 13     |
| POSIXct                  | 1      |
| ________________________ |        |
| Group variables          | None   |
Variable type: character

| skim_variable     | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|:------------------|----------:|--------------:|----:|----:|------:|---------:|-----------:|
| player            |         0 |             1 |   9 |  16 |     0 |       42 |          0 |
| age               |         0 |             1 |   6 |   6 |     0 |     8827 |          0 |
| team              |         0 |             1 |   3 |   3 |     0 |       37 |          0 |
| at                |         0 |             1 |   1 |   2 |     0 |        2 |          0 |
| opp               |         0 |             1 |   3 |   3 |     0 |       41 |          0 |
| location          |         0 |             1 |   4 |   4 |     0 |        2 |          0 |
| outcome           |         0 |             1 |   1 |   4 |     0 |        5 |          0 |
| assists_even      |         0 |             1 |   1 |   2 |     0 |        8 |          0 |
| assists_powerplay |         0 |             1 |   1 |   2 |     0 |        5 |          0 |
| assists_short     |         0 |             1 |   1 |   2 |     0 |        5 |          0 |
| shot_percent      |         0 |             1 |   1 |  18 |     0 |       41 |          0 |
Variable type: numeric

| skim_variable    | n_missing | complete_rate |    mean |    sd |   p0 |  p25 |  p50 |  p75 | p100 | hist  |
|:-----------------|----------:|--------------:|--------:|------:|-----:|-----:|-----:|-----:|-----:|:------|
| season           |         0 |             1 | 2005.25 | 10.71 | 1980 | 1997 | 2008 | 2014 | 2020 | ▂▃▅▇▇ |
| rank             |         0 |             1 |   37.54 | 22.37 |    1 |   18 |   36 |   56 |   84 | ▇▇▇▆▃ |
| game_num         |         0 |             1 |   37.54 | 22.37 |    1 |   18 |   36 |   56 |   84 | ▇▇▇▆▃ |
| goals            |         0 |             1 |    0.41 |  0.66 |    0 |    0 |    0 |    1 |    5 | ▇▁▁▁▁ |
| assists          |         0 |             1 |    0.55 |  0.78 |    0 |    0 |    0 |    1 |    7 | ▇▁▁▁▁ |
| points           |         0 |             1 |    0.97 |  1.04 |    0 |    0 |    1 |    2 |    8 | ▇▂▁▁▁ |
| plus_minus       |         0 |             1 |    0.08 |  1.32 |   -6 |   -1 |    0 |    1 |    8 | ▁▃▇▁▁ |
| penalty_min      |         0 |             1 |    0.65 |  1.69 |    0 |    0 |    0 |    0 |   35 | ▇▁▁▁▁ |
| goals_even       |         0 |             1 |    0.27 |  0.53 |    0 |    0 |    0 |    0 |    5 | ▇▁▁▁▁ |
| goals_powerplay  |         0 |             1 |    0.13 |  0.36 |    0 |    0 |    0 |    0 |    4 | ▇▁▁▁▁ |
| goals_short      |         0 |             1 |    0.01 |  0.12 |    0 |    0 |    0 |    0 |    2 | ▇▁▁▁▁ |
| goals_gamewinner |         0 |             1 |    0.06 |  0.24 |    0 |    0 |    0 |    0 |    1 | ▇▁▁▁▁ |
| shots            |         0 |             1 |    3.02 |  1.99 |    0 |    2 |    3 |    4 |   15 | ▇▃▁▁▁ |
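Variable type: POSIXct

| skim_variable | n_missing | complete_rate | min        | max        | median     | n_unique |
|:--------------|----------:|--------------:|:-----------|:-----------|:-----------|---------:|
| date          |         0 |             1 | 1979-10-10 | 2020-02-26 | 2007-12-16 |     6523 |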
# Fix the RNG seed so the random 10-row sample below is reproducible.
set.seed(123)

# Keep only the three columns used in the factor examples, then draw 10 rows.
data <- sample_n(
  select(myData, player, goals, outcome),
  size = 10
)

# Tally how many sampled games fall under each outcome code.
count(data, outcome)
## # A tibble: 3 × 2
## outcome n
## <chr> <int>
## 1 L 5
## 2 L-SO 1
## 3 W 4
# Relabel the outcome codes with readable names (new name on the left, old code
# on the right). The result is printed, not stored back into `data`.
data %>%
  mutate(
    outcome_rev = fct_recode(
      outcome,
      "loss" = "L",
      "win" = "W",
      # "L-SO" is presumably a shootout loss; the label was misspelled "shoutout".
      "loss shootout" = "L-SO"
    )
  )
## # A tibble: 10 × 4
## player goals outcome outcome_rev
## <chr> <dbl> <chr> <fct>
## 1 Brendan Shanahan 0 L loss
## 2 Luc Robitaille 0 W win
## 3 Luc Robitaille 1 L loss
## 4 Patrick Kane 1 L loss
## 5 Brad Marchand 1 L-SO loss shootout
## 6 Teemu Selanne 1 W win
## 7 Tyler Seguin 0 L loss
## 8 Patrick Marleau 1 W win
## 9 Dino Ciccarelli 0 W win
## 10 Mario Lemieux 2 L loss
# Merge both loss codes ("L" and "L-SO") into a single "loss" level and rename
# "W" to "win". The result is printed, not stored back into `data`.
data %>%
  mutate(
    outcome_col = fct_collapse(
      outcome,
      loss = c("L", "L-SO"),
      win = "W"
    )
  )
## # A tibble: 10 × 4
## player goals outcome outcome_col
## <chr> <dbl> <chr> <fct>
## 1 Brendan Shanahan 0 L loss
## 2 Luc Robitaille 0 W win
## 3 Luc Robitaille 1 L loss
## 4 Patrick Kane 1 L loss
## 5 Brad Marchand 1 L-SO loss
## 6 Teemu Selanne 1 W win
## 7 Tyler Seguin 0 L loss
## 8 Patrick Marleau 1 W win
## 9 Dino Ciccarelli 0 W win
## 10 Mario Lemieux 2 L loss
# Re-check the outcome frequencies before lumping the rare levels together.
count(data, outcome)
## # A tibble: 3 × 2
## outcome n
## <chr> <int>
## 1 L 5
## 2 L-SO 1
## 3 W 4
# Keep only the single most frequent outcome level and pool all others into "Other".
# fct_lump_n() is the explicit variant forcats recommends over the legacy catch-all
# fct_lump(); for `n = 1` it yields the same factor.
data %>%
  mutate(outcome_lump = fct_lump_n(outcome, n = 1))
## # A tibble: 10 × 4
## player goals outcome outcome_lump
## <chr> <dbl> <chr> <fct>
## 1 Brendan Shanahan 0 L L
## 2 Luc Robitaille 0 W Other
## 3 Luc Robitaille 1 L L
## 4 Patrick Kane 1 L L
## 5 Brad Marchand 1 L-SO Other
## 6 Teemu Selanne 1 W Other
## 7 Tyler Seguin 0 L L
## 8 Patrick Marleau 1 W Other
## 9 Dino Ciccarelli 0 W Other
## 10 Mario Lemieux 2 L L