Import data
# excel file
data <- read_excel("../00_data/NHLDATA (1).xlsx")
data
## # A tibble: 8,299 × 9
## player_id first_name last_name birth_date birth_city birth_country
## <dbl> <chr> <chr> <dttm> <chr> <chr>
## 1 8445319 Earl Campbell 1900-07-23 00:00:00 Buckingham CAN
## 2 8445255 Billy Burch 1900-11-20 00:00:00 Yonkers USA
## 3 8445644 Charlie Cotch 1900-02-21 00:00:00 Sarnia CAN
## 4 8447741 Duke McCurry 1900-06-13 00:00:00 Toronto CAN
## 5 8449392 Tex White 1900-06-26 00:00:00 Hillsborough CAN
## 6 8450153 Roy Worters 1900-10-19 00:00:00 Toronto CAN
## 7 8450043 Joe Miller 1900-10-06 00:00:00 Morrisburg CAN
## 8 8445818 Stan Crossett 1900-04-18 00:00:00 Tillsonburg CAN
## 9 8446613 Fred Gordon 1900-05-06 00:00:00 Fleming CAN
## 10 8446662 Harold Halderson 1900-01-06 00:00:00 Winnipeg CAN
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## # birth_month <dbl>
Apply the following dplyr verbs to your data
Filter rows
filter(data, birth_country == "CAN")
## # A tibble: 5,305 × 9
## player_id first_name last_name birth_date birth_city birth_country
## <dbl> <chr> <chr> <dttm> <chr> <chr>
## 1 8445319 Earl Campbell 1900-07-23 00:00:00 Buckingham CAN
## 2 8445644 Charlie Cotch 1900-02-21 00:00:00 Sarnia CAN
## 3 8447741 Duke McCurry 1900-06-13 00:00:00 Toronto CAN
## 4 8449392 Tex White 1900-06-26 00:00:00 Hillsborough CAN
## 5 8450153 Roy Worters 1900-10-19 00:00:00 Toronto CAN
## 6 8450043 Joe Miller 1900-10-06 00:00:00 Morrisburg CAN
## 7 8445818 Stan Crossett 1900-04-18 00:00:00 Tillsonburg CAN
## 8 8446613 Fred Gordon 1900-05-06 00:00:00 Fleming CAN
## 9 8446662 Harold Halderson 1900-01-06 00:00:00 Winnipeg CAN
## 10 8446857 Henry Hicks 1900-12-10 00:00:00 Sillery CAN
## # ℹ 5,295 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## # birth_month <dbl>
Arrange rows
arrange(data, (birth_date))
## # A tibble: 8,299 × 9
## player_id first_name last_name birth_date birth_city birth_country
## <dbl> <chr> <chr> <dttm> <chr> <chr>
## 1 8446662 Harold Halderson 1900-01-06 00:00:00 Winnipeg CAN
## 2 8450099 Herb Rheaume 1900-01-12 00:00:00 Mason CAN
## 3 8449199 Billy Stuart 1900-02-01 00:00:00 Sackville CAN
## 4 8445644 Charlie Cotch 1900-02-21 00:00:00 Sarnia CAN
## 5 8445818 Stan Crossett 1900-04-18 00:00:00 Tillsonbu… CAN
## 6 8446613 Fred Gordon 1900-05-06 00:00:00 Fleming CAN
## 7 8447573 Reg Mackey 1900-05-07 00:00:00 Ottawa CAN
## 8 8444855 Clarence Abel 1900-05-28 00:00:00 Sault Ste… USA
## 9 8447741 Duke McCurry 1900-06-13 00:00:00 Toronto CAN
## 10 8448237 Leo Quenneville 1900-06-15 00:00:00 St-Anicet CAN
## # ℹ 8,289 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## # birth_month <dbl>
Select columns
select(data, first_name, last_name, birth_date, birth_country)
## # A tibble: 8,299 × 4
## first_name last_name birth_date birth_country
## <chr> <chr> <dttm> <chr>
## 1 Earl Campbell 1900-07-23 00:00:00 CAN
## 2 Billy Burch 1900-11-20 00:00:00 USA
## 3 Charlie Cotch 1900-02-21 00:00:00 CAN
## 4 Duke McCurry 1900-06-13 00:00:00 CAN
## 5 Tex White 1900-06-26 00:00:00 CAN
## 6 Roy Worters 1900-10-19 00:00:00 CAN
## 7 Joe Miller 1900-10-06 00:00:00 CAN
## 8 Stan Crossett 1900-04-18 00:00:00 CAN
## 9 Fred Gordon 1900-05-06 00:00:00 CAN
## 10 Harold Halderson 1900-01-06 00:00:00 CAN
## # ℹ 8,289 more rows
Add columns
mutate(data,
gain = birth_date) %>%
select(first_name:last_name, gain)
## # A tibble: 8,299 × 3
## first_name last_name gain
## <chr> <chr> <dttm>
## 1 Earl Campbell 1900-07-23 00:00:00
## 2 Billy Burch 1900-11-20 00:00:00
## 3 Charlie Cotch 1900-02-21 00:00:00
## 4 Duke McCurry 1900-06-13 00:00:00
## 5 Tex White 1900-06-26 00:00:00
## 6 Roy Worters 1900-10-19 00:00:00
## 7 Joe Miller 1900-10-06 00:00:00
## 8 Stan Crossett 1900-04-18 00:00:00
## 9 Fred Gordon 1900-05-06 00:00:00
## 10 Harold Halderson 1900-01-06 00:00:00
## # ℹ 8,289 more rows
Summarize by groups
data %>%
#group by birth month and birth year
group_by(birth_month, birth_year) %>%
#Amount of entries per grade
summarise(count = n(), .groups = "drop") %>%
#arrange in year order
arrange(birth_year)
## # A tibble: 1,207 × 3
## birth_month birth_year count
## <dbl> <dbl> <int>
## 1 1 1900 2
## 2 2 1900 2
## 3 4 1900 1
## 4 5 1900 3
## 5 6 1900 5
## 6 7 1900 2
## 7 8 1900 1
## 8 9 1900 1
## 9 10 1900 3
## 10 11 1900 2
## # ℹ 1,197 more rows