Import Data
# Import the NFL attendance workbook from the shared data folder.
# `na = "NA"` tells readxl to treat cells holding the text "NA" as missing.
# Without it `weekly_attendance` is imported as character (presumably because
# the sheet stores missing weeks as the literal string "NA" -- TODO confirm),
# which blocks any numeric summary or plot of that column.
attendance <- read_excel("../00_data/nfl_attendance.xlsx", na = "NA")
attendance
## # A tibble: 10,846 × 8
## team team_name year total home away week weekly_attendance
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arizona Cardinals 2000 893926 387475 506451 1 77434
## 2 Arizona Cardinals 2000 893926 387475 506451 2 66009
## 3 Arizona Cardinals 2000 893926 387475 506451 3 NA
## 4 Arizona Cardinals 2000 893926 387475 506451 4 71801
## 5 Arizona Cardinals 2000 893926 387475 506451 5 66985
## 6 Arizona Cardinals 2000 893926 387475 506451 6 44296
## 7 Arizona Cardinals 2000 893926 387475 506451 7 38293
## 8 Arizona Cardinals 2000 893926 387475 506451 8 62981
## 9 Arizona Cardinals 2000 893926 387475 506451 9 35286
## 10 Arizona Cardinals 2000 893926 387475 506451 10 52244
## # ℹ 10,836 more rows
Visualizing distributions
# Bar chart of the number of rows per team name; the axes are flipped so the
# team labels are drawn horizontally along the side.
attendance %>%
  ggplot(aes(x = team_name)) +
  geom_bar() +
  coord_flip()

# Same information as the bar chart, as a table: rows per team name.
count(attendance, team_name)
## # A tibble: 32 × 2
## team_name n
## <chr> <int>
## 1 49ers 340
## 2 Bears 340
## 3 Bengals 340
## 4 Bills 340
## 5 Broncos 340
## 6 Browns 340
## 7 Buccaneers 340
## 8 Cardinals 340
## 9 Chargers 340
## 10 Chiefs 340
## # ℹ 22 more rows
# Distribution of away attendance (default 30 bins).
attendance %>%
  ggplot(aes(x = away)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of home attendance, one coloured line per team name.
attendance %>%
  ggplot(aes(x = home, colour = team_name)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Histogram of away attendance restricted to rows above 475,000.
ggplot(
  data = filter(attendance, away > 475000),
  mapping = aes(x = away)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values
# Full histogram of away attendance, used to look for unusual values.
ggplot(data = attendance, mapping = aes(x = away)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram, zoomed to counts between 0 and 200 so short bars are visible.
# coord_cartesian() only changes the viewport; no rows are dropped.
ggplot(data = attendance, mapping = aes(x = away)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 200))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values
# Recode away attendance outside the 475,000-600,000 range as NA (rows are
# kept, only the value is blanked), then plot away attendance per team name.
# between() is inclusive on both ends, so the boundary values are retained.
attendance %>%
  mutate(away = if_else(between(away, 475000, 600000), away, NA_real_)) %>%
  ggplot(aes(x = team_name, y = away)) +
  geom_point()
## Warning: Removed 153 rows containing missing values (`geom_point()`).

Covariation
A categorical and continuous variable
# Boxplot of away attendance for each team name.
ggplot(data = attendance, mapping = aes(x = team_name, y = away)) +
  geom_boxplot()

Two categorical variables
# The two categorical variables in this data set (team_name and team) do not
# make for an informative plot, but the code shows the technique: count each
# combination and draw it as a tile filled by the count.
ggplot(
  data = count(attendance, team_name, team),
  mapping = aes(x = team_name, y = team, fill = n)
) +
  geom_tile()

Two continuous variables
# geom_hex() needs the hexbin package to compute the hexagonal bins.
library(hexbin)

# Hexagonal heat map of home attendance against away attendance.
ggplot(data = attendance, mapping = aes(x = away, y = home)) +
  geom_hex()

Patterns and models
library(modelr)

# Linear model of log(total) on log(home).
mod <- lm(log(total) ~ log(home), data = attendance)

# Append the model residuals as `resid`, then exponentiate them to undo the
# log transform used in the model.
attendance2 <- attendance %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))

# Back-transformed residuals plotted against total attendance.
ggplot(data = attendance2, mapping = aes(x = total, y = resid)) +
  geom_point()

# Distribution of the back-transformed residuals for each team name.
ggplot(data = attendance2, mapping = aes(x = team_name, y = resid)) +
  geom_boxplot()
