Import data
# Read the Excel workbook into `data`. The messages echoed below report that
# four columns with blank headers were auto-named `...11` through `...14`.
data <- read_excel(path = "../00_data/Data.xlsx")
## New names:
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
# Print the tibble to inspect its dimensions, column names, and column types.
data
## # A tibble: 10,846 × 14
## team `Team City` Population team_name year total home away week
## <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 1
## 2 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 2
## 3 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 3
## 4 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 4
## 5 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 5
## 6 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 6
## 7 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 7
## 8 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 8
## 9 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 9
## 10 Arizona Phoenix 1608139 Cardinals 2000 893926 387475 506451 10
## # ℹ 10,836 more rows
## # ℹ 5 more variables: weekly_attendance <chr>, ...11 <lgl>, ...12 <chr>,
## # ...13 <lgl>, ...14 <dbl>
Introduction
Questions
Variation
Visualizing distributions
# Horizontal bar chart of city population per team.
#
# The printed rows of `data` repeat the same Population on each weekly row of a
# team, and an identity-stat bar stacks every row it receives, so plotting the
# raw rows draws bars equal to Population times the number of rows. Reduce to
# one row per team/Population pair before plotting.
# NOTE(review): a team_name that appears with more than one Population value
# would still have those values stacked in one bar; confirm whether that occurs.
data %>%
  distinct(team_name, Population) %>%
  ggplot(aes(x = team_name, y = Population)) +
  geom_col() +
  # Flip the axes so the team names are readable on the vertical axis.
  coord_flip()

# Vertical bar chart of city population per team.
#
# Two fixes relative to the earlier version of this chunk:
#   * `binwidth` is not a parameter of bar geoms; ggplot2 ignored it and issued
#     an "Ignoring unknown parameters" warning, so it is removed. If narrower
#     bars are wanted, `width` is the bar-geom argument to use.
#   * The printed rows of `data` repeat the same Population on each weekly row,
#     and an identity-stat bar stacks every row, which inflated the bars.
#     Keep one row per team/Population pair before plotting.
# NOTE(review): a team_name that appears with more than one Population value
# would still have those values stacked in one bar; confirm whether that occurs.
data %>%
  distinct(team_name, Population) %>%
  ggplot(aes(x = team_name, y = Population)) +
  geom_col()

# Histogram of `total`, using bins that are 25,000 wide.
data %>%
  ggplot() +
  geom_histogram(aes(x = total), binwidth = 25000)

# Frequency polygons of `total`, one coloured line per `team`.
#
# The bin count is stated explicitly. Leaving it out made stat_bin() fall back
# to 30 bins and print a "Pick better value" message on every render;
# `bins = 30` yields the same figure without the message. Swap in `binwidth`
# if a specific bin size is preferred.
data %>%
  ggplot(aes(x = total, colour = team)) +
  geom_freqpoly(bins = 30)

Typical values
# Histogram of `total` with narrower bins (10,000 wide).
data %>%
  ggplot(aes(x = total)) +
  geom_histogram(binwidth = 10000)

Unusual values
# Histogram of `total`, zoomed on the y-axis so that low-count bins are visible.
#
# The bin count is stated explicitly. Leaving it out made stat_bin() fall back
# to 30 bins and print a "Pick better value" message on every render;
# `bins = 30` keeps the same per-bin counts that the y-axis limit below was
# chosen against.
data %>%
  ggplot(aes(x = total)) +
  geom_histogram(bins = 30) +
  # coord_cartesian() zooms the view without dropping data outside the limits.
  coord_cartesian(ylim = c(0, 1500))

Missing values
# Scatter plot of `home` against `Team City`, after adding a column `x` that
# holds `home` where it lies within [5, 200] and NA elsewhere.
#
# NOTE(review): the plot maps `home`, not the new column `x`, so the NA
# replacement has no effect on the figure. The printed rows also show `home`
# values such as 387475, far above the upper bound of 200, so these bounds
# look carried over from another dataset. Confirm the intended range and
# which column should be plotted.
# Alternative left disabled in the original: filter(x < 5 | x > 200) %>%
data %>%
  mutate(x = ifelse(home >= 5 & home <= 200, home, NA)) %>%
  ggplot(aes(x = home, y = `Team City`)) +
  geom_point()

Covariation
A categorical and continuous variable
# Box plots of `away`, one box per `team_name`.
ggplot(data, mapping = aes(x = team_name, y = away)) +
  geom_boxplot()

Two categorical variables
# Tile plot of how many rows exist for each (`Team City`, `team_name`) pair;
# the fill colour encodes the row count `n` produced by count().
ggplot(
  count(data, `Team City`, team_name),
  mapping = aes(x = `Team City`, y = team_name, fill = n)
) +
  geom_tile()

Two continuous variables
# geom_hex() needs the hexbin package for its hexagonal binning.
library(hexbin)

# Hexagonal 2-D bin plot of `home` against `away`.
ggplot(data, mapping = aes(x = away, y = home)) +
  geom_hex()

Patterns and models
library(modelr)

# Keep only rows where both variables are strictly positive, so that the
# logarithms in the model formula are defined.
data_clean <- filter(data, Population > 0, total > 0)

# Log-log linear model of Population on total.
mod <- lm(log(Population) ~ log(total), data = data_clean)

# Attach the model residuals as `resid`, then exponentiate them to move them
# off the log scale.
data_resid <- add_residuals(data_clean, mod) %>%
  mutate(resid = exp(resid))

# Residuals plotted against Population.
ggplot(data_resid, mapping = aes(Population, resid)) +
  geom_point()

# Box plots of the exponentiated residuals for each `team`, drawn horizontally.
ggplot(data_resid, mapping = aes(x = team, y = resid)) +
  geom_boxplot() +
  coord_flip()
