Import data
# Load the NHL workbook from the sibling 00_data directory into `data`.
data <- read_excel(path = "../00_data/NHLDATA.xlsx")
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: row count for each birth_month value.
data %>%
  ggplot(aes(x = birth_month)) +
  geom_bar()

# Histogram of birth_year with bins 0.5 units wide.
data %>%
  ggplot(aes(x = birth_year)) +
  geom_histogram(binwidth = 0.5)

# Frequency polygons of birth_month, one coloured line per birth_country.
data %>%
  ggplot() +
  geom_freqpoly(aes(x = birth_month, colour = birth_country))

Typical values
# Histogram of birth_year with narrow (0.25-wide) bins to expose the most
# common values. The argument must be spelled `binwidth`; a misspelled name is
# ignored with a warning and the default of 30 bins is used instead.
data %>%
  ggplot(aes(x = birth_year)) +
  geom_histogram(binwidth = 0.25)

Unusual values
# Histogram of birth_month with the default binning.
ggplot(data, aes(x = birth_month)) +
  geom_histogram()

# Histogram of birth_year, zoomed to counts between 500 and 800.
# coord_cartesian() only changes the visible window; the bins are still
# computed from every row.
ggplot(data, aes(x = birth_year)) +
  geom_histogram() +
  coord_cartesian(ylim = c(500, 800))

Missing values
# Scatter plot of birth_country against birth_month. geom_point() drops rows
# with missing coordinates and reports how many were removed in a warning.
# NOTE(review): no values are recoded to NA before plotting; if some months or
# countries should be treated as missing, recode them in a mutate() with
# if_else() ahead of the plot -- confirm the intended rule.
data %>%
  ggplot(aes(x = birth_month, y = birth_country)) +
  geom_point()

Covariation
A categorical and continuous variable
# Frequency polygons of birth_year, drawn as one coloured line per
# birth_country so the distributions can be compared on a shared axis.
data %>%
  ggplot(aes(x = birth_year, colour = birth_country)) +
  geom_freqpoly()

Two categorical variables
# Heat map of how often each (birth_month, birth_country) pair occurs.
# count() produces the tally column `n`, which drives the tile fill.
ggplot(count(data, birth_month, birth_country)) +
  geom_tile(aes(x = birth_month, y = birth_country, fill = n))

Two continuous variables
# 2-D binned counts of birth_year against birth_country.
ggplot(data, aes(x = birth_year, y = birth_country)) +
  geom_bin2d()

# Box plots of birth_year vs. birth_country, keeping only rows with a
# positive birth_year and forming one group per distinct birth_year.
# NOTE(review): grouping by the same variable that is mapped to x gives each
# group a single x value -- confirm this grouping is intended.
data %>%
  filter(birth_year > 0) %>%
  ggplot() +
  geom_boxplot(
    aes(x = birth_year, y = birth_country, group = birth_year)
  )

Patterns and models
library(modelr)

# Fit a log-log linear model of birth_month on player_id.
model <- lm(log(birth_month) ~ log(player_id), data = data)

# add_residuals() appends the model residuals as column `resid` (log scale);
# exp() back-transforms them to the original scale. The transform has to be
# applied to `resid` itself, otherwise the residuals are overwritten.
data <- data %>%
  modelr::add_residuals(model) %>%
  mutate(resid = exp(resid))

# Box plot of birth_month against birth_country.
# NOTE(review): this plot does not use the `resid` column computed above --
# confirm whether the residuals were meant to be plotted here.
data %>%
  ggplot(aes(birth_month, birth_country)) +
  geom_boxplot()
