Import your data

# excel file
data <- read_excel("../00_data/NHLDATA (1).xlsx")
data
## # A tibble: 8,299 × 9
##    player_id first_name last_name birth_date          birth_city   birth_country
##        <dbl> <chr>      <chr>     <dttm>              <chr>        <chr>        
##  1   8445319 Earl       Campbell  1900-07-23 00:00:00 Buckingham   CAN          
##  2   8445255 Billy      Burch     1900-11-20 00:00:00 Yonkers      USA          
##  3   8445644 Charlie    Cotch     1900-02-21 00:00:00 Sarnia       CAN          
##  4   8447741 Duke       McCurry   1900-06-13 00:00:00 Toronto      CAN          
##  5   8449392 Tex        White     1900-06-26 00:00:00 Hillsborough CAN          
##  6   8450153 Roy        Worters   1900-10-19 00:00:00 Toronto      CAN          
##  7   8450043 Joe        Miller    1900-10-06 00:00:00 Morrisburg   CAN          
##  8   8445818 Stan       Crossett  1900-04-18 00:00:00 Tillsonburg  CAN          
##  9   8446613 Fred       Gordon    1900-05-06 00:00:00 Fleming      CAN          
## 10   8446662 Harold     Halderson 1900-01-06 00:00:00 Winnipeg     CAN          
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## #   birth_month <dbl>
set.seed(1234) # for reproducible outcome
data_small <- data %>%
    
    # Select three columns
    select(birth_year, birth_month, birth_country) %>%
    
    # Randomly select five rows
    sample_n(5)

data_small
## # A tibble: 5 × 3
##   birth_year birth_month birth_country
##        <dbl>       <dbl> <chr>        
## 1       1995           7 CAN          
## 2       1999           5 USA          
## 3       1993           1 CAN          
## 4       2000           3 FIN          
## 5       1994           9 CAN

Pivoting

long to wide form

data_long <- data %>% 
    
    pivot_longer(cols = c('birth_month', 'birth_year'),
                 names_to = "month",
                 values_to = "year")
set.seed(1234) # for reproducible outcome
data_small <- data %>%
    
    # Select three columns
    select(birth_year, birth_month, birth_country) %>%
    
    # Randomly select five rows
    sample_n(5)

data_small
## # A tibble: 5 × 3
##   birth_year birth_month birth_country
##        <dbl>       <dbl> <chr>        
## 1       1995           7 CAN          
## 2       1999           5 USA          
## 3       1993           1 CAN          
## 4       2000           3 FIN          
## 5       1994           9 CAN

wide to long form

data_long %>%
    
    pivot_wider(names_from = month, values_from = year)
## # A tibble: 8,299 × 9
##    player_id first_name last_name birth_date          birth_city   birth_country
##        <dbl> <chr>      <chr>     <dttm>              <chr>        <chr>        
##  1   8445319 Earl       Campbell  1900-07-23 00:00:00 Buckingham   CAN          
##  2   8445255 Billy      Burch     1900-11-20 00:00:00 Yonkers      USA          
##  3   8445644 Charlie    Cotch     1900-02-21 00:00:00 Sarnia       CAN          
##  4   8447741 Duke       McCurry   1900-06-13 00:00:00 Toronto      CAN          
##  5   8449392 Tex        White     1900-06-26 00:00:00 Hillsborough CAN          
##  6   8450153 Roy        Worters   1900-10-19 00:00:00 Toronto      CAN          
##  7   8450043 Joe        Miller    1900-10-06 00:00:00 Morrisburg   CAN          
##  8   8445818 Stan       Crossett  1900-04-18 00:00:00 Tillsonburg  CAN          
##  9   8446613 Fred       Gordon    1900-05-06 00:00:00 Fleming      CAN          
## 10   8446662 Harold     Halderson 1900-01-06 00:00:00 Winnipeg     CAN          
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_month <dbl>,
## #   birth_year <dbl>
set.seed(1234) # for reproducible outcome
data_small <- data %>%
    
    # Select three columns
    select(birth_year, birth_month, birth_country) %>%
    
    # Randomly select five rows
    sample_n(5)

data_small
## # A tibble: 5 × 3
##   birth_year birth_month birth_country
##        <dbl>       <dbl> <chr>        
## 1       1995           7 CAN          
## 2       1999           5 USA          
## 3       1993           1 CAN          
## 4       2000           3 FIN          
## 5       1994           9 CAN

Separating and Uniting

Separate a column

data_sep <- data %>%
    
    separate(col = birth_country, into = c("birth_year", "birth_month"))
set.seed(1234) # for reproducible outcome
data_small <- data %>%
    
    # Select three columns
    select(birth_year, birth_month, birth_country) %>%
    
    # Randomly select five rows
    sample_n(5)

data_small
## # A tibble: 5 × 3
##   birth_year birth_month birth_country
##        <dbl>       <dbl> <chr>        
## 1       1995           7 CAN          
## 2       1999           5 USA          
## 3       1993           1 CAN          
## 4       2000           3 FIN          
## 5       1994           9 CAN

Unite two columns

data %>%
    
    separate(col = birth_date, into = c("month", "year"))
## # A tibble: 8,299 × 10
##    player_id first_name last_name month year  birth_city   birth_country
##        <dbl> <chr>      <chr>     <chr> <chr> <chr>        <chr>        
##  1   8445319 Earl       Campbell  1900  07    Buckingham   CAN          
##  2   8445255 Billy      Burch     1900  11    Yonkers      USA          
##  3   8445644 Charlie    Cotch     1900  02    Sarnia       CAN          
##  4   8447741 Duke       McCurry   1900  06    Toronto      CAN          
##  5   8449392 Tex        White     1900  06    Hillsborough CAN          
##  6   8450153 Roy        Worters   1900  10    Toronto      CAN          
##  7   8450043 Joe        Miller    1900  10    Morrisburg   CAN          
##  8   8445818 Stan       Crossett  1900  04    Tillsonburg  CAN          
##  9   8446613 Fred       Gordon    1900  05    Fleming      CAN          
## 10   8446662 Harold     Halderson 1900  01    Winnipeg     CAN          
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## #   birth_month <dbl>
set.seed(1234) # for reproducible outcome
data_small <- data %>%
    
    # Select three columns
    select(birth_year, birth_month, birth_country) %>%
    
    # Randomly select five rows
    sample_n(5)

data_small
## # A tibble: 5 × 3
##   birth_year birth_month birth_country
##        <dbl>       <dbl> <chr>        
## 1       1995           7 CAN          
## 2       1999           5 USA          
## 3       1993           1 CAN          
## 4       2000           3 FIN          
## 5       1994           9 CAN

Missing Values

I have some missing data, some NHL players don’t have their date of birth registered, therefore, gives warnings.