Import Data

# excel file

# Load the NFL attendance workbook.
# The sheet records missing weekly attendance as the literal text "NA";
# without declaring it, readxl keeps those cells as strings and the whole
# `weekly_attendance` column is guessed as character. Listing "NA" (and the
# empty string) in `na` makes those cells true missing values so the column
# can be guessed as numeric.
attendance <- read_excel(
  "../00_data/nfl_attendance.xlsx",
  na = c("", "NA")
)

# Print the tibble to inspect column types and the first rows.
attendance
## # A tibble: 10,846 × 8
##    team    team_name  year  total   home   away  week weekly_attendance
##    <chr>   <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl> <chr>            
##  1 Arizona Cardinals  2000 893926 387475 506451     1 77434            
##  2 Arizona Cardinals  2000 893926 387475 506451     2 66009            
##  3 Arizona Cardinals  2000 893926 387475 506451     3 NA               
##  4 Arizona Cardinals  2000 893926 387475 506451     4 71801            
##  5 Arizona Cardinals  2000 893926 387475 506451     5 66985            
##  6 Arizona Cardinals  2000 893926 387475 506451     6 44296            
##  7 Arizona Cardinals  2000 893926 387475 506451     7 38293            
##  8 Arizona Cardinals  2000 893926 387475 506451     8 62981            
##  9 Arizona Cardinals  2000 893926 387475 506451     9 35286            
## 10 Arizona Cardinals  2000 893926 387475 506451    10 52244            
## # ℹ 10,836 more rows

Visualizing distributions

# Bar chart of the number of rows per team name; the coordinates are flipped
# so the team names run along the vertical axis.
attendance %>%
  ggplot(aes(x = team_name)) +
  geom_bar() +
  coord_flip()

# Tabulate how many rows each team name contributes.
count(attendance, team_name)
## # A tibble: 32 × 2
##    team_name      n
##    <chr>      <int>
##  1 49ers        340
##  2 Bears        340
##  3 Bengals      340
##  4 Bills        340
##  5 Broncos      340
##  6 Browns       340
##  7 Buccaneers   340
##  8 Cardinals    340
##  9 Chargers     340
## 10 Chiefs       340
## # ℹ 22 more rows
# Distribution of away attendance (histogram with the default binning).

attendance %>%
  ggplot(aes(x = away)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of home attendance, one coloured line per team name.
attendance %>%
  ggplot(aes(x = home, colour = team_name)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Keep only the rows whose away attendance is above 475,000, then draw a
# histogram of the remaining values.
ggplot(
  filter(attendance, away > 475000),
  aes(x = away)
) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values

# Histogram of away attendance over the full range of the data.
ggplot(attendance, aes(x = away)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram, zoomed on the y axis (counts 0-200) so sparsely populated
# bins become visible. coord_cartesian() zooms without dropping any data.
ggplot(attendance, aes(x = away)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 200))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

# Rather than dropping rows with filter(), blank out away attendance values
# below 475,000 or above 600,000 by setting them to NA, then plot the
# remaining values against team name.
attendance %>%
  mutate(
    away = replace(away, away < 475000 | away > 600000, NA)
  ) %>%
  ggplot(aes(x = team_name, y = away)) +
  geom_point()
## Warning: Removed 153 rows containing missing values (`geom_point()`).

Covariation

A categorical and continuous variable

# Box plot of away attendance for each team name.
ggplot(attendance, aes(x = team_name, y = away)) +
  geom_boxplot()

Two categorical variables

# These two categorical variables do not make for a very informative plot,
# but the code still demonstrates the count-then-tile approach.
ggplot(
  count(attendance, team_name, team),
  aes(x = team_name, y = team, fill = n)
) +
  geom_tile()

Two continuous variables

# geom_hex() relies on the hexbin package.
library(hexbin)

# Hexagonal-bin plot of home attendance against away attendance.
ggplot(attendance, aes(x = away, y = home)) +
  geom_hex()

Patterns and models

library(modelr)

# Linear model of log(total) on log(home).
mod <- lm(log(total) ~ log(home), data = attendance)

# Append the model residuals as `resid`, then exponentiate them to move
# from the log scale back to the original scale.
attendance2 <- mutate(
  add_residuals(attendance, mod),
  resid = exp(resid)
)

# Back-transformed residuals plotted against total attendance.
ggplot(attendance2, aes(x = total, y = resid)) +
  geom_point()

# Distribution of the back-transformed residuals for each team name.
ggplot(attendance2, aes(x = team_name, y = resid)) +
  geom_boxplot()