Introduction
Questions
Variation
Visualizing distributions
# Bar chart with one bar per year; bar height is the number of rows.
ggplot(data, aes(x = year)) +
  geom_bar()

# Tabulate the number of rows recorded for each year.
count(data, year)
## # A tibble: 21 × 2
## year n
## <dbl> <int>
## 1 1930 18
## 2 1934 17
## 3 1938 18
## 4 1950 22
## 5 1954 26
## 6 1958 35
## 7 1962 32
## 8 1966 32
## 9 1970 32
## 10 1974 38
## # ℹ 11 more rows
# Histogram of home_score. No binwidth is given, so stat_bin() falls back
# to its default of 30 bins and says so in a message.
ggplot(data, aes(x = home_score)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of year, with one coloured line per outcome value.
# Binning is left at the stat_bin() default (30 bins).
data %>%
  ggplot() +
  geom_freqpoly(mapping = aes(x = year, colour = outcome))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Keep only rows with year after 2000, then draw a histogram of year with
# one bin per calendar year.
ggplot(
  data = filter(data, year > 2000),
  mapping = aes(x = year)
) +
  geom_histogram(binwidth = 1)

Unusual values
# Histogram of home_score over the full data, using the default binning
# (stat_bin() reports that it picked 30 bins).
ggplot(data = data) +
  geom_histogram(mapping = aes(x = home_score))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing values
# Replace every home_score above 3 with NA, then plot the remaining scores
# against home_team. geom_point() drops the NA rows and warns about it.
# The theme() call rotates the x-axis labels by 90 degrees.
data %>%
  mutate(
    home_score = ifelse(test = home_score > 3, yes = NA, no = home_score)
  ) %>%
  ggplot(mapping = aes(x = home_team, y = home_score)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 86 rows containing missing values (`geom_point()`).

Covariation
A categorical and continuous variable
# Boxplot of away_score, with one box per outcome category.
ggplot(data = data, mapping = aes(x = outcome, y = away_score)) +
  geom_boxplot()

Two categorical variables
# Count rows for every (outcome, home_team) pair and show the counts as a
# tile grid, where the fill colour encodes the count column `n`.
ggplot(
  data = count(data, outcome, home_team),
  mapping = aes(x = outcome, y = home_team)
) +
  geom_tile(mapping = aes(fill = n))

Two continuous variables
# Hexagonal-bin plot of away_score (x) against home_score (y).
ggplot(data = data) +
  geom_hex(mapping = aes(x = away_score, y = home_score))

Patterns and models
library(modelr)

# Linear model of log(away_score + 1) on log(home_score + 1).
mod <- lm(log(away_score + 1) ~ log(home_score + 1), data = data)

# Attach the model residuals as column `resid`, then exponentiate them to
# undo the log applied to the response in the model formula.
data4 <- mutate(add_residuals(data, mod), resid = exp(resid))

# Scatter plot of the exponentiated residuals against home_score.
ggplot(data = data4, mapping = aes(x = home_score, y = resid)) +
  geom_point()
