Import your data

# Read the workbook into a tibble. The import messages below show that
# columns 11-14 have blank headers and are given the placeholder names
# `...11` through `...14`.
data <- readxl::read_excel(path = "../00_data/Data.xlsx")
## New names:
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
## • `` -> `...14`
# Show the imported tibble.
data
## # A tibble: 10,846 × 14
##    team    `Team City` Population team_name  year  total   home   away  week
##    <chr>   <chr>            <dbl> <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl>
##  1 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     1
##  2 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     2
##  3 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     3
##  4 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     4
##  5 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     5
##  6 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     6
##  7 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     7
##  8 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     8
##  9 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     9
## 10 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451    10
## # ℹ 10,836 more rows
## # ℹ 5 more variables: weekly_attendance <chr>, ...11 <lgl>, ...12 <chr>,
## #   ...13 <lgl>, ...14 <dbl>

Chapter 15

Create a factor

# Tally the number of rows for each distinct team_name.
count(data, team_name)
## # A tibble: 32 × 2
##    team_name      n
##    <chr>      <int>
##  1 49ers        340
##  2 Bears        340
##  3 Bengals      340
##  4 Bills        340
##  5 Broncos      340
##  6 Browns       340
##  7 Buccaneers   340
##  8 Cardinals    340
##  9 Chargers     340
## 10 Chiefs       340
## # ℹ 22 more rows
# Preferred display order for a subset of teams. Only 14 of the 32 team names
# in the data are listed here.
x1 <- c(
    "Cardinals", "Packers", "Patriots", "Dolphins", "Jets", "Giants", "Texans",
    "Vikings", "Cowboys", "Eagles", "Bears", "Lions", "Saints", "Panthers"
)

# Reuse x1 instead of retyping the same 14 names so the two cannot drift apart.
team_name_levels <- x1

# Convert team_name to a factor with the preferred teams first.
# fct_relevel() keeps every team as a level; factor(levels = team_name_levels)
# would silently turn each team missing from team_name_levels into NA.
team_name_rev <- data %>%
    mutate(team_name = fct_relevel(team_name, team_name_levels))

team_name_rev
## # A tibble: 10,846 × 14
##    team    `Team City` Population team_name  year  total   home   away  week
##    <chr>   <chr>            <dbl> <fct>     <dbl>  <dbl>  <dbl>  <dbl> <dbl>
##  1 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     1
##  2 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     2
##  3 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     3
##  4 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     4
##  5 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     5
##  6 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     6
##  7 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     7
##  8 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     8
##  9 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451     9
## 10 Arizona Phoenix        1608139 Cardinals  2000 893926 387475 506451    10
## # ℹ 10,836 more rows
## # ℹ 5 more variables: weekly_attendance <chr>, ...11 <lgl>, ...12 <chr>,
## #   ...13 <lgl>, ...14 <dbl>
# Turn the x1 character vector into a factor whose level order follows
# team_name_levels, then show it.
y1 <- x1 %>% factor(levels = team_name_levels)
y1
##  [1] Cardinals Packers   Patriots  Dolphins  Jets      Giants    Texans   
##  [8] Vikings   Cowboys   Eagles    Bears     Lions     Saints    Panthers 
## 14 Levels: Cardinals Packers Patriots Dolphins Jets Giants Texans ... Panthers

Modify factor order

Make two bar charts here: one before ordering and another after.

# Mean of the `home` column for each team_name, ignoring missing values.
# group_by() is kept (rather than `.by`) so the result stays sorted by team_name.

home_attendance_per_team_name <- summarise(
    group_by(data, team_name),
    avg_home_attendance = mean(home, na.rm = TRUE)
)
home_attendance_per_team_name
## # A tibble: 32 × 2
##    team_name  avg_home_attendance
##    <chr>                    <dbl>
##  1 49ers                  550459.
##  2 Bears                  497954.
##  3 Bengals                476364.
##  4 Bills                  539234.
##  5 Broncos                607991 
##  6 Browns                 555725.
##  7 Buccaneers             489756.
##  8 Cardinals              452536.
##  9 Chargers               466237.
## 10 Chiefs                 598235.
## # ℹ 22 more rows
# Bar chart before ordering: teams are drawn in the default order of the
# team_name column. geom_col() is used because the exercise asks for bar
# charts and the bar length is the already-computed average.
ggplot(home_attendance_per_team_name, aes(avg_home_attendance, team_name)) +
    geom_col() +
    labs(x = "Average home attendance", y = "Team")

# Bar chart after ordering: fct_reorder() sorts the teams by their average
# home attendance so the bars are ranked.
ggplot(
    home_attendance_per_team_name,
    aes(avg_home_attendance, fct_reorder(team_name, avg_home_attendance))
) +
    geom_col() +
    labs(x = "Average home attendance", y = "Team")

Modify factor levels

Show examples of three functions:

  • fct_recode
# Rename `Team City` values (new label on the left, existing value on the
# right), then count the rows for each resulting city.
# NOTE(review): 31 cities are recoded here but the count below reports 32
# rows - confirm whether one city was left unchanged on purpose.
data %>%
    mutate(
        `Team City` = `Team City` %>%
            fct_recode(
                phoenix = "Phoenix",
                atlanta = "Atlanta",
                baltimore = "Baltimore",
                buffalo = "Buffalo",
                charlotte = "Charlotte",
                chicago = "Chicago",
                cincinnati = "Cincinnati",
                cleveland = "Cleveland",
                dallas = "Dallas",
                denver = "Denver",
                detroit = "Detroit",
                `green bay` = "Green Bay",
                houston = "Houston",
                indianapolis = "Indianapolis",
                jacksonville = "Jacksonville",
                `kansas city` = "Kansas City",
                miami = "Miami",
                minneapolis = "Minneapolis",
                boston = "Boston",
                `new orleans` = "New Orleans",
                `new york` = "New York",
                oakland = "Oakland",
                philadelphia = "Philadelphia",
                pittsburgh = "Pittsburgh",
                `san diego` = "San Diego",
                `san francisco` = "San Francisco",
                seattle = "Seattle",
                `saint louis` = "St. Louis",
                tampa = "Tampa",
                nashville = "Nashville",
                dc = "Washington DC"
            )
    ) %>%
    count(`Team City`)
## # A tibble: 32 × 2
##    `Team City`     n
##    <fct>       <int>
##  1 atlanta       340
##  2 baltimore     340
##  3 boston        340
##  4 buffalo       340
##  5 charlotte     340
##  6 chicago       340
##  7 cincinnati    340
##  8 cleveland     340
##  9 dallas        340
## 10 denver        340
## # ℹ 22 more rows
  • fct_collapse
# Merge Miami, Atlanta, Denver and Houston into a single "Other" level and
# count the rows per resulting team level. The Arizona and Baltimore entries
# map each of those levels to itself.
data %>%
    mutate(
        team = team %>%
            fct_collapse(
                Arizona = "Arizona",
                Baltimore = "Baltimore",
                Other = c("Miami", "Atlanta", "Denver", "Houston")
            )
    ) %>%
    count(team)
## # A tibble: 29 × 2
##    team           n
##    <fct>      <int>
##  1 Arizona      340
##  2 Other       1326
##  3 Baltimore    340
##  4 Buffalo      340
##  5 Carolina     340
##  6 Chicago      340
##  7 Cincinnati   340
##  8 Cleveland    340
##  9 Dallas       340
## 10 Detroit      340
## # ℹ 19 more rows
  • fct_lump
# Lump the team levels using a proportion threshold of 0.03, then count the
# rows per remaining level.
data %>%
    mutate(team = team %>% fct_lump(prop = 0.03)) %>%
    count(team)
## # A tibble: 29 × 2
##    team           n
##    <fct>      <int>
##  1 Arizona      340
##  2 Atlanta      340
##  3 Baltimore    340
##  4 Buffalo      340
##  5 Carolina     340
##  6 Chicago      340
##  7 Cincinnati   340
##  8 Cleveland    340
##  9 Dallas       340
## 10 Denver       340
## # ℹ 19 more rows
# Same lumping with a stricter threshold of 0.05; the output below shows only
# New York kept as its own level, with everything else under "Other".
data %>%
    mutate(team = team %>% fct_lump(prop = 0.05)) %>%
    count(team)
## # A tibble: 2 × 2
##   team         n
##   <fct>    <int>
## 1 New York   680
## 2 Other    10166

Chapter 16

No need to do anything here.