# excel file
data <- read_excel("../00_data/NHLDATA (1).xlsx")
data
## # A tibble: 8,299 × 9
## player_id first_name last_name birth_date birth_city birth_country
## <dbl> <chr> <chr> <dttm> <chr> <chr>
## 1 8445319 Earl Campbell 1900-07-23 00:00:00 Buckingham CAN
## 2 8445255 Billy Burch 1900-11-20 00:00:00 Yonkers USA
## 3 8445644 Charlie Cotch 1900-02-21 00:00:00 Sarnia CAN
## 4 8447741 Duke McCurry 1900-06-13 00:00:00 Toronto CAN
## 5 8449392 Tex White 1900-06-26 00:00:00 Hillsborough CAN
## 6 8450153 Roy Worters 1900-10-19 00:00:00 Toronto CAN
## 7 8450043 Joe Miller 1900-10-06 00:00:00 Morrisburg CAN
## 8 8445818 Stan Crossett 1900-04-18 00:00:00 Tillsonburg CAN
## 9 8446613 Fred Gordon 1900-05-06 00:00:00 Fleming CAN
## 10 8446662 Harold Halderson 1900-01-06 00:00:00 Winnipeg CAN
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## # birth_month <dbl>
set.seed(1234) # for reproducible outcome
data_small <- data %>%
# Select three columns
select(birth_year, birth_month, birth_country) %>%
# Randomly select five rows
sample_n(5)
data_small
## # A tibble: 5 × 3
## birth_year birth_month birth_country
## <dbl> <dbl> <chr>
## 1 1995 7 CAN
## 2 1999 5 USA
## 3 1993 1 CAN
## 4 2000 3 FIN
## 5 1994 9 CAN
data_long <- data %>%
pivot_longer(cols = c('birth_month', 'birth_year'),
names_to = "month",
values_to = "year")
set.seed(1234) # for reproducible outcome
data_small <- data %>%
# Select three columns
select(birth_year, birth_month, birth_country) %>%
# Randomly select five rows
sample_n(5)
data_small
## # A tibble: 5 × 3
## birth_year birth_month birth_country
## <dbl> <dbl> <chr>
## 1 1995 7 CAN
## 2 1999 5 USA
## 3 1993 1 CAN
## 4 2000 3 FIN
## 5 1994 9 CAN
data_long %>%
pivot_wider(names_from = month, values_from = year)
## # A tibble: 8,299 × 9
## player_id first_name last_name birth_date birth_city birth_country
## <dbl> <chr> <chr> <dttm> <chr> <chr>
## 1 8445319 Earl Campbell 1900-07-23 00:00:00 Buckingham CAN
## 2 8445255 Billy Burch 1900-11-20 00:00:00 Yonkers USA
## 3 8445644 Charlie Cotch 1900-02-21 00:00:00 Sarnia CAN
## 4 8447741 Duke McCurry 1900-06-13 00:00:00 Toronto CAN
## 5 8449392 Tex White 1900-06-26 00:00:00 Hillsborough CAN
## 6 8450153 Roy Worters 1900-10-19 00:00:00 Toronto CAN
## 7 8450043 Joe Miller 1900-10-06 00:00:00 Morrisburg CAN
## 8 8445818 Stan Crossett 1900-04-18 00:00:00 Tillsonburg CAN
## 9 8446613 Fred Gordon 1900-05-06 00:00:00 Fleming CAN
## 10 8446662 Harold Halderson 1900-01-06 00:00:00 Winnipeg CAN
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_month <dbl>,
## # birth_year <dbl>
set.seed(1234) # for reproducible outcome
data_small <- data %>%
# Select three columns
select(birth_year, birth_month, birth_country) %>%
# Randomly select five rows
sample_n(5)
data_small
## # A tibble: 5 × 3
## birth_year birth_month birth_country
## <dbl> <dbl> <chr>
## 1 1995 7 CAN
## 2 1999 5 USA
## 3 1993 1 CAN
## 4 2000 3 FIN
## 5 1994 9 CAN
data_sep <- data %>%
separate(col = birth_country, into = c("birth_year", "birth_month"))
set.seed(1234) # for reproducible outcome
data_small <- data %>%
# Select three columns
select(birth_year, birth_month, birth_country) %>%
# Randomly select five rows
sample_n(5)
data_small
## # A tibble: 5 × 3
## birth_year birth_month birth_country
## <dbl> <dbl> <chr>
## 1 1995 7 CAN
## 2 1999 5 USA
## 3 1993 1 CAN
## 4 2000 3 FIN
## 5 1994 9 CAN
data %>%
separate(col = birth_date, into = c("month", "year"))
## # A tibble: 8,299 × 10
## player_id first_name last_name month year birth_city birth_country
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 8445319 Earl Campbell 1900 07 Buckingham CAN
## 2 8445255 Billy Burch 1900 11 Yonkers USA
## 3 8445644 Charlie Cotch 1900 02 Sarnia CAN
## 4 8447741 Duke McCurry 1900 06 Toronto CAN
## 5 8449392 Tex White 1900 06 Hillsborough CAN
## 6 8450153 Roy Worters 1900 10 Toronto CAN
## 7 8450043 Joe Miller 1900 10 Morrisburg CAN
## 8 8445818 Stan Crossett 1900 04 Tillsonburg CAN
## 9 8446613 Fred Gordon 1900 05 Fleming CAN
## 10 8446662 Harold Halderson 1900 01 Winnipeg CAN
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## # birth_month <dbl>
set.seed(1234) # for reproducible outcome
data_small <- data %>%
# Select three columns
select(birth_year, birth_month, birth_country) %>%
# Randomly select five rows
sample_n(5)
data_small
## # A tibble: 5 × 3
## birth_year birth_month birth_country
## <dbl> <dbl> <chr>
## 1 1995 7 CAN
## 2 1999 5 USA
## 3 1993 1 CAN
## 4 2000 3 FIN
## 5 1994 9 CAN
I have some missing data, some NHL players don’t have their date of birth registered, therefore, gives warnings.