Import data

# excel file
data <- read_excel("../00_data/NHLDATA (1).xlsx")
data
## # A tibble: 8,299 × 9
##    player_id first_name last_name birth_date          birth_city   birth_country
##        <dbl> <chr>      <chr>     <dttm>              <chr>        <chr>        
##  1   8445319 Earl       Campbell  1900-07-23 00:00:00 Buckingham   CAN          
##  2   8445255 Billy      Burch     1900-11-20 00:00:00 Yonkers      USA          
##  3   8445644 Charlie    Cotch     1900-02-21 00:00:00 Sarnia       CAN          
##  4   8447741 Duke       McCurry   1900-06-13 00:00:00 Toronto      CAN          
##  5   8449392 Tex        White     1900-06-26 00:00:00 Hillsborough CAN          
##  6   8450153 Roy        Worters   1900-10-19 00:00:00 Toronto      CAN          
##  7   8450043 Joe        Miller    1900-10-06 00:00:00 Morrisburg   CAN          
##  8   8445818 Stan       Crossett  1900-04-18 00:00:00 Tillsonburg  CAN          
##  9   8446613 Fred       Gordon    1900-05-06 00:00:00 Fleming      CAN          
## 10   8446662 Harold     Halderson 1900-01-06 00:00:00 Winnipeg     CAN          
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## #   birth_month <dbl>

Apply the following dplyr verbs to your data

Filter rows

filter(data, birth_country == "CAN")
## # A tibble: 5,305 × 9
##    player_id first_name last_name birth_date          birth_city   birth_country
##        <dbl> <chr>      <chr>     <dttm>              <chr>        <chr>        
##  1   8445319 Earl       Campbell  1900-07-23 00:00:00 Buckingham   CAN          
##  2   8445644 Charlie    Cotch     1900-02-21 00:00:00 Sarnia       CAN          
##  3   8447741 Duke       McCurry   1900-06-13 00:00:00 Toronto      CAN          
##  4   8449392 Tex        White     1900-06-26 00:00:00 Hillsborough CAN          
##  5   8450153 Roy        Worters   1900-10-19 00:00:00 Toronto      CAN          
##  6   8450043 Joe        Miller    1900-10-06 00:00:00 Morrisburg   CAN          
##  7   8445818 Stan       Crossett  1900-04-18 00:00:00 Tillsonburg  CAN          
##  8   8446613 Fred       Gordon    1900-05-06 00:00:00 Fleming      CAN          
##  9   8446662 Harold     Halderson 1900-01-06 00:00:00 Winnipeg     CAN          
## 10   8446857 Henry      Hicks     1900-12-10 00:00:00 Sillery      CAN          
## # ℹ 5,295 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## #   birth_month <dbl>

Arrange rows

arrange(data, (birth_date))
## # A tibble: 8,299 × 9
##    player_id first_name last_name   birth_date          birth_city birth_country
##        <dbl> <chr>      <chr>       <dttm>              <chr>      <chr>        
##  1   8446662 Harold     Halderson   1900-01-06 00:00:00 Winnipeg   CAN          
##  2   8450099 Herb       Rheaume     1900-01-12 00:00:00 Mason      CAN          
##  3   8449199 Billy      Stuart      1900-02-01 00:00:00 Sackville  CAN          
##  4   8445644 Charlie    Cotch       1900-02-21 00:00:00 Sarnia     CAN          
##  5   8445818 Stan       Crossett    1900-04-18 00:00:00 Tillsonbu… CAN          
##  6   8446613 Fred       Gordon      1900-05-06 00:00:00 Fleming    CAN          
##  7   8447573 Reg        Mackey      1900-05-07 00:00:00 Ottawa     CAN          
##  8   8444855 Clarence   Abel        1900-05-28 00:00:00 Sault Ste… USA          
##  9   8447741 Duke       McCurry     1900-06-13 00:00:00 Toronto    CAN          
## 10   8448237 Leo        Quenneville 1900-06-15 00:00:00 St-Anicet  CAN          
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## #   birth_month <dbl>

Select columns

select(data, first_name, last_name, birth_date, birth_country)
## # A tibble: 8,299 × 4
##    first_name last_name birth_date          birth_country
##    <chr>      <chr>     <dttm>              <chr>        
##  1 Earl       Campbell  1900-07-23 00:00:00 CAN          
##  2 Billy      Burch     1900-11-20 00:00:00 USA          
##  3 Charlie    Cotch     1900-02-21 00:00:00 CAN          
##  4 Duke       McCurry   1900-06-13 00:00:00 CAN          
##  5 Tex        White     1900-06-26 00:00:00 CAN          
##  6 Roy        Worters   1900-10-19 00:00:00 CAN          
##  7 Joe        Miller    1900-10-06 00:00:00 CAN          
##  8 Stan       Crossett  1900-04-18 00:00:00 CAN          
##  9 Fred       Gordon    1900-05-06 00:00:00 CAN          
## 10 Harold     Halderson 1900-01-06 00:00:00 CAN          
## # ℹ 8,289 more rows

Add columns

mutate(data,
       gain = birth_date) %>%
    
    select(first_name:last_name, gain)
## # A tibble: 8,299 × 3
##    first_name last_name gain               
##    <chr>      <chr>     <dttm>             
##  1 Earl       Campbell  1900-07-23 00:00:00
##  2 Billy      Burch     1900-11-20 00:00:00
##  3 Charlie    Cotch     1900-02-21 00:00:00
##  4 Duke       McCurry   1900-06-13 00:00:00
##  5 Tex        White     1900-06-26 00:00:00
##  6 Roy        Worters   1900-10-19 00:00:00
##  7 Joe        Miller    1900-10-06 00:00:00
##  8 Stan       Crossett  1900-04-18 00:00:00
##  9 Fred       Gordon    1900-05-06 00:00:00
## 10 Harold     Halderson 1900-01-06 00:00:00
## # ℹ 8,289 more rows

Summarize by groups

data %>%
    
    #group by birth month and birth year
  group_by(birth_month, birth_year) %>%   
    
    #Amount of entries per grade
  summarise(count = n(), .groups = "drop") %>%  
    
    #arrange in year order
  arrange(birth_year)
## # A tibble: 1,207 × 3
##    birth_month birth_year count
##          <dbl>      <dbl> <int>
##  1           1       1900     2
##  2           2       1900     2
##  3           4       1900     1
##  4           5       1900     3
##  5           6       1900     5
##  6           7       1900     2
##  7           8       1900     1
##  8           9       1900     1
##  9          10       1900     3
## 10          11       1900     2
## # ℹ 1,197 more rows