Import data

# Load the attendance workbook from a path relative to the working directory.
# Four columns in the sheet have no header; readxl auto-names them
# `...11` to `...14` (reported in the "New names" message).
data <- read_excel(path = "../00_data/Data.xlsx")
## New names:
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
# Show the first rows plus column names and types of the imported tibble.
print(data)
## # A tibble: 10,846 × 14
##    team    `Team City` Population team_name  year  total   home   away  week
##    <chr>   <chr>            <dbl> <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl>
##  1 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     1
##  2 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     2
##  3 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     3
##  4 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     4
##  5 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     5
##  6 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     6
##  7 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     7
##  8 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     8
##  9 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     9
## 10 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451    10
## # ℹ 10,836 more rows
## # ℹ 5 more variables: weekly_attendance <chr>, ...11 <lgl>, ...12 <chr>,
## #   ...13 <lgl>, ...14 <dbl>

Introduction

Questions

Variation

Visualizing distributions

# Population per team, drawn as horizontal bars.
# The table repeats a team's Population on every weekly row, and bars built
# from raw values stack all of those rows, which inflates each bar by the
# number of rows. Collapse to distinct team/population pairs first so each
# bar shows the population itself.
# NOTE(review): a team_name that appears with more than one Population value
# would still have those values stacked -- confirm whether that occurs here.
data %>%
  distinct(team_name, Population) %>%
  ggplot(aes(x = team_name, y = Population)) +
  geom_col() + # geom_col() is the direct form of geom_bar(stat = "identity")
  coord_flip() # flip so the team names read horizontally

# Population per team, drawn as vertical bars.
# `binwidth` is not a bar parameter (ggplot2 ignored it with a warning), so it
# is dropped; use `width =` on the geom if narrower bars are wanted.
# The table repeats a team's Population on every weekly row, and bars built
# from raw values stack all of those rows, so collapse to distinct
# team/population pairs before plotting.
# NOTE(review): a team_name that appears with more than one Population value
# would still have those values stacked -- confirm whether that occurs here.
data %>%
  distinct(team_name, Population) %>%
  ggplot(aes(x = team_name, y = Population)) +
  geom_col()
## Warning in geom_bar(stat = "identity", binwidth = 0.3): Ignoring unknown
## parameters: `binwidth`

# Histogram of `total` using bins that are 25,000 wide.
data %>%
  ggplot(aes(x = total)) +
  geom_histogram(binwidth = 25000)

# Frequency polygons of `total`, one coloured line per `team`.
# The bin width is stated explicitly instead of falling back to the default of
# 30 bins (which triggers the "Pick better value" message); 25,000 matches the
# histogram of the same variable drawn just before this plot.
ggplot(data = data, mapping = aes(x = total, colour = team)) +
  geom_freqpoly(binwidth = 25000)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values

# Finer-grained histogram of `total`: bins are 10,000 wide.
data %>%
  ggplot() +
  geom_histogram(aes(x = total), binwidth = 10000)

Unusual values

# Histogram of `total`, zoomed on the y axis so low-count bins stay visible.
# The bin count is stated explicitly: 30 is the value ggplot2 was already
# falling back to, so the picture is unchanged and the "Pick better value"
# message goes away.
data %>%
  ggplot(aes(x = total)) +
  geom_histogram(bins = 30) +
  # coord_cartesian() zooms without discarding the bars taller than the limit
  coord_cartesian(ylim = c(0, 1500))
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Missing values

# Replace unusual `home` values with NA, then plot the remaining values by city.
# Two problems are fixed here:
#   * the cleaned values used to be stored in a column the plot never read, so
#     the raw `home` values were drawn and the NA replacement had no effect;
#   * the fixed cut-offs 5 and 200 do not fit this column -- the printed rows
#     show `home` in the hundreds of thousands, so such rows would all have
#     been flagged as unusual.
# The bounds are now derived from the data (1.5 * IQR beyond the quartiles).
# NOTE(review): confirm that the 1.5 * IQR rule is the intended definition of
# "unusual" for this analysis.
data %>%
  mutate(
    home_iqr = IQR(home, na.rm = TRUE),
    home_low = quantile(home, 0.25, na.rm = TRUE, names = FALSE) -
      1.5 * home_iqr,
    home_high = quantile(home, 0.75, na.rm = TRUE, names = FALSE) +
      1.5 * home_iqr,
    home_clean = ifelse(home < home_low | home > home_high, NA, home)
  ) %>%
  ggplot(aes(x = home_clean, y = `Team City`)) +
  # na.rm = TRUE drops the rows set to NA without a warning
  geom_point(na.rm = TRUE)

Covariation

A categorical and continuous variable

# Box plot of `away` for each `team_name`.
ggplot(data, aes(x = team_name, y = away)) +
  geom_boxplot()

Two categorical variables

# Count rows for every (`Team City`, team_name) pair and show the counts as a
# tile map, with the count `n` mapped to the fill colour.
data %>%
  count(`Team City`, team_name) %>%
  ggplot() +
  geom_tile(aes(x = `Team City`, y = team_name, fill = n))

Two continuous variables

# geom_hex() relies on the hexbin package for the hexagonal binning.
library(hexbin)

# Hexagonally binned plot of `home` against `away`.
ggplot(data, aes(x = away, y = home)) +
  geom_hex()

Patterns and models

# Fit a log-log linear model of Population on total and attach its residuals.
library(modelr)

# Keep only rows where both variables are strictly positive.
data_clean <- filter(data, Population > 0, total > 0)

mod <- lm(log(Population) ~ log(total), data = data_clean)

# add_residuals() appends a `resid` column; the model is fit on the log scale,
# so exp() is applied to the residuals afterwards.
data_resid <- mutate(add_residuals(data_clean, mod), resid = exp(resid))

# Scatter plot of the exponentiated residuals against Population.
ggplot(data_resid, aes(x = Population, y = resid)) +
  geom_point()

# Box plot of the exponentiated residuals for each `team`, flipped so the team
# labels run down the vertical axis.
ggplot(data_resid, aes(x = team, y = resid)) +
  geom_boxplot() +
  coord_flip()