Import Data

# Load the workbook from the project's data folder into `data`.
data <- read_excel(path = "../00_data/Data3.xlsx")

# Print the tibble to preview its first rows and column types.
data
## # A tibble: 8,474 × 9
##    player_id first_name last_name   birth_date          birth_city birth_country
##        <dbl> <chr>      <chr>       <dttm>              <chr>      <chr>        
##  1   8467867 Bryan      Adams       1977-03-20 00:00:00 Fort St. … CAN          
##  2   8445176 Donald     Audette     1969-09-23 00:00:00 Laval      CAN          
##  3   8460014 Eric       Bertrand    1975-04-16 00:00:00 St-Ephrem  CAN          
##  4   8460510 Jason      Botterill   1976-05-19 00:00:00 Edmonton   CAN          
##  5   8459596 Andrew     Brunette    1973-08-24 00:00:00 Sudbury    CAN          
##  6   8445733 Kelly      Buchberger  1966-12-02 00:00:00 Langenburg CAN          
##  7   8460573 Hnat       Domenichel… 1976-02-17 00:00:00 Edmonton   CAN          
##  8   8459450 Shean      Donovan     1975-01-22 00:00:00 Timmins    CAN          
##  9   8446675 Nelson     Emerson     1967-08-17 00:00:00 Hamilton   CAN          
## 10   8446823 Ray        Ferraro     1964-08-23 00:00:00 Trail      CAN          
## # ℹ 8,464 more rows
## # ℹ 3 more variables: birth_state_province <chr>, birth_year <dbl>,
## #   birth_month <dbl>

Introduction

The data set provides a look at the birth dates of all NHL players.

Questions

Does the month in which a player was born correlate with their success in the NHL?

Variation

# Bar chart: one bar per birth month, bar height = number of rows.
data %>%
    ggplot(aes(x = birth_month)) +
    geom_bar()

Visualizing distributions

# Histogram of birth years using half-year-wide bins.
ggplot(data, aes(x = birth_year)) +
    geom_histogram(binwidth = 0.5)

Typical values

# Histogram of birth years restricted to rows with birth_year > 2000
# (the year 2000 itself and missing years are excluded by the filter).
ggplot(
    data = filter(data, birth_year > 2000),
    mapping = aes(x = birth_year)
) +
    geom_histogram(binwidth = 0.5)

Unusual values

# Same birth-year histogram with narrower (quarter-year) bins, which makes
# sparsely populated years easier to spot.
data %>%
    ggplot(aes(x = birth_year)) +
    geom_histogram(binwidth = 0.25)

Missing Values

data %>%

    # Replace birth years outside 1990-2000 with NA instead of dropping the
    # rows; years inside the range are kept as-is in the new column `y_rev`.
    # The two bounds must be combined as "below the lower bound OR above the
    # upper bound": writing `< 2000 | > 1990` is TRUE for every year and
    # would blank out the whole column.
    # NA_real_ keeps `y_rev` numeric even if every row ends up missing.
    mutate(
        y_rev = ifelse(
            birth_year < 1990 | birth_year > 2000,
            NA_real_,
            birth_year
        )
    )
## # A tibble: 8,474 × 10
##    player_id first_name last_name   birth_date          birth_city birth_country
##        <dbl> <chr>      <chr>       <dttm>              <chr>      <chr>        
##  1   8467867 Bryan      Adams       1977-03-20 00:00:00 Fort St. … CAN          
##  2   8445176 Donald     Audette     1969-09-23 00:00:00 Laval      CAN          
##  3   8460014 Eric       Bertrand    1975-04-16 00:00:00 St-Ephrem  CAN          
##  4   8460510 Jason      Botterill   1976-05-19 00:00:00 Edmonton   CAN          
##  5   8459596 Andrew     Brunette    1973-08-24 00:00:00 Sudbury    CAN          
##  6   8445733 Kelly      Buchberger  1966-12-02 00:00:00 Langenburg CAN          
##  7   8460573 Hnat       Domenichel… 1976-02-17 00:00:00 Edmonton   CAN          
##  8   8459450 Shean      Donovan     1975-01-22 00:00:00 Timmins    CAN          
##  9   8446675 Nelson     Emerson     1967-08-17 00:00:00 Hamilton   CAN          
## 10   8446823 Ray        Ferraro     1964-08-23 00:00:00 Trail      CAN          
## # ℹ 8,464 more rows
## # ℹ 4 more variables: birth_state_province <chr>, birth_year <dbl>,
## #   birth_month <dbl>, y_rev <dbl>

Covariation

data %>%

    # Box plot of birth month for each birth year.
    # birth_year is numeric, so geom_boxplot() needs an explicit `group`
    # aesthetic; without it all rows are pooled into a single box and
    # ggplot2 warns about a continuous x aesthetic.
    ggplot(aes(x = birth_year, y = birth_month, group = birth_year)) +
    geom_boxplot()

A categorical and continuous variable

# Heat map: count the rows for every (birth_year, birth_month) pair, then
# draw one tile per pair shaded by that count (`n`).
ggplot(
    data = count(data, birth_year, birth_month),
    mapping = aes(x = birth_year, y = birth_month)
) +
    geom_tile(aes(fill = n))

Two categorical variables

# Count plot: one point per (birth_year, birth_month) pair, with point size
# showing how many rows share that pair.
data %>%
    ggplot(aes(x = birth_year, y = birth_month)) +
    geom_count()

Two continuous variables

# geom_hex() relies on the hexbin package for its binning.
library(hexbin)

# Hexagonal 2-D binning of birth year against birth month; each hexagon is
# shaded by the number of rows that fall inside it.
ggplot(data = data, mapping = aes(x = birth_year, y = birth_month)) +
    geom_hex()