Import data

# Load the Excel workbook into `data`; every plot and model below reads from it.
data <- read_excel(path = "../00_data/NHLDATA.xlsx")

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows for each birth_month value.
ggplot(data, aes(x = birth_month)) +
  geom_bar()

# Histogram of birth_year using bins that are 0.5 wide.
ggplot(data, aes(x = birth_year)) +
  geom_histogram(binwidth = 0.5)

# Frequency polygons of birth_month, one coloured line per birth_country.
# No binwidth is supplied, so geom_freqpoly() uses its default binning.
ggplot(data) +
  geom_freqpoly(mapping = aes(x = birth_month, colour = birth_country))

Typical values

# Histogram of birth_year with narrow, 0.25-wide bins.
# The argument must be spelled `binwidth`; a misspelled name is ignored by
# ggplot2 (with a warning) and the default binning is used instead.
data %>%
  ggplot(aes(x = birth_year)) +
  geom_histogram(binwidth = 0.25)

Unusual values

# Histogram of birth_month using geom_histogram()'s default binning.
data %>%
  ggplot() +
  geom_histogram(mapping = aes(x = birth_month))

# Histogram of birth_year with the y-axis zoomed to counts from 500 to 800.
# coord_cartesian() only changes the visible window; no rows are removed
# before binning.
data %>%
  ggplot() +
  geom_histogram(mapping = aes(x = birth_year)) +
  coord_cartesian(ylim = c(500, 800))

Missing values

# Scatterplot of birth_country (y) against birth_month (x).
# No mutate() step is needed here: both columns are plotted exactly as stored.
data %>%
  ggplot(aes(x = birth_month, y = birth_country)) +
  geom_point()

Covariation

A categorical and continuous variable

# Frequency polygons of birth_year, one coloured line per birth_country.
ggplot(data, aes(x = birth_year, colour = birth_country)) +
  geom_freqpoly()

Two categorical variables

# Heat map of how many rows fall into each birth_month / birth_country pair.
# count() stores the tally in column `n`, which drives the tile fill.
data %>%
  count(birth_month, birth_country) %>%
  ggplot() +
  geom_tile(mapping = aes(x = birth_month, y = birth_country, fill = n))

Two continuous variables

# 2D binned counts of birth_year (x) against birth_country (y).
data %>%
  ggplot(mapping = aes(x = birth_year, y = birth_country)) +
  geom_bin2d()

# Boxplots of birth_year against birth_country, one box per birth_year value.
# Only rows with a positive birth_year are kept.
data %>%
  filter(birth_year > 0) %>%
  ggplot() +
  geom_boxplot(
    mapping = aes(x = birth_year, y = birth_country, group = birth_year)
  )

Patterns and models

library(modelr)

# Fit a linear model of log(birth_month) on log(player_id).
model <- lm(log(birth_month) ~ log(player_id), data = data)

# Attach the model residuals as column `resid`, then back-transform them from
# the log scale. The exponent must be taken of `resid` itself; using any other
# column would discard the residuals that add_residuals() just computed.
data <- data %>%
  modelr::add_residuals(model) %>%
  mutate(resid = exp(resid))

# Boxplot with birth_month on the x-axis and birth_country on the y-axis.
data %>%
  ggplot() +
  geom_boxplot(mapping = aes(x = birth_month, y = birth_country))