Module 11: Apply it to your data 10

Import your data

# Team-level tournament results from the TidyTuesday 2024-03-26 release.
team_results <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-03-26/team-results.csv"
)

## Rows: 236 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): TEAM, F4PERCENT, CHAMPPERCENT
## dbl (17): TEAMID, PAKE, PAKERANK, PASE, PASERANK, GAMES, W, L, WINPERCENT, R...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Public bracket pick percentages from the TidyTuesday 2024-03-26 release.
public_picks <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-03-26/public-picks.csv"
)

## Rows: 64 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): TEAM, R64, R32, S16, E8, F4, FINALS
## dbl (2): YEAR, TEAMNO
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chapter 15

Create a factor

# Add a factor version of TEAM to team_results (original column is kept)
team_results <- mutate(team_results, TEAM_f = factor(TEAM))

# Add a factor version of YEAR to public_picks (original column is kept)
public_picks <- mutate(public_picks, YEAR_f = factor(YEAR))

# Preview the first few levels of the new TEAM factor with their row counts
head(count(team_results, TEAM_f))

## # A tibble: 6 × 2
##   TEAM_f                n
##   <fct>             <int>
## 1 Abilene Christian     1
## 2 Akron                 1
## 3 Alabama               1
## 4 Albany                1
## 5 American              1
## 6 Arizona               1

# Tabulate the levels of the new YEAR factor
count(public_picks, YEAR_f)

## # A tibble: 1 × 2
##   YEAR_f     n
##   <fct>  <int>
## 1 2024      64

Modify factor order

Make two bar charts here: one before reordering the factor and another after.

# R64 is read as text (e.g. "87.2%"); strip the symbol and keep the number
# (87.2) so it can be plotted and used for ordering.
public_picks <- mutate(public_picks, R64_num = readr::parse_number(R64))

# Chart 1 — bars follow the default (alphabetical) ordering of TEAM
public_picks |>
  ggplot(aes(TEAM, R64_num)) +
  geom_col() +
  # flip so the team names sit on the vertical axis
  coord_flip() +
  labs(
    x = "Team",
    y = "R64 Probability (%)",
    title = "R64 Probability by Team (Default Order)"
  )

# Chart 2 — same bars, but with TEAM levels sorted by R64 probability
public_picks <- mutate(
  public_picks,
  TEAM_ordered = forcats::fct_reorder(TEAM, R64_num)
)

public_picks |>
  ggplot(aes(TEAM_ordered, R64_num)) +
  geom_col() +
  # flip so the team names sit on the vertical axis
  coord_flip() +
  labs(
    x = "Team (Ordered)",
    y = "R64 Probability (%)",
    title = "R64 Probability by Team (Ordered by Probability)"
  )

Modify factor levels

Show examples of three functions:

## 1. fct_recode
# Rename individual levels: new name on the left, existing level on the right.
public_picks <- mutate(
  public_picks,
  TEAM_recode = forcats::fct_recode(
    factor(TEAM),
    "St. Mary's"  = "Saint Mary's",
    "St. Peter's" = "Saint Peter's"
  )
)

# Show the original and recoded names side by side for the affected teams
public_picks |>
  filter(str_detect(TEAM, "Saint")) |>
  distinct(TEAM, TEAM_recode)

## # A tibble: 2 × 2
##   TEAM          TEAM_recode
##   <chr>         <fct>      
## 1 Saint Mary's  St. Mary's 
## 2 Saint Peter's St. Peter's

## 2. fct_collapse
# Collapse the listed teams into two named groups and send every remaining
# team to "Other". Unlisted levels are gathered with the `other_level`
# argument; writing `Other = "Other teams"` instead asks forcats to rename a
# level literally called "Other teams", which does not exist, so it only
# raises an "Unknown levels" warning and leaves the other teams ungrouped.
public_picks <- public_picks |>
  mutate(
    TEAM_group = forcats::fct_collapse(
      factor(TEAM),
      `Blue bloods` = c("Connecticut", "Duke", "Kansas", "Kentucky", "North Carolina"),
      `Mid-majors` = c("Gonzaga", "Saint Mary's", "Saint Peter's", "San Diego St."),
      other_level = "Other"
    )
  )

# One row per group: the two named groups plus the catch-all level
public_picks |> count(TEAM_group)

## # A tibble: 3 × 2
##   TEAM_group      n
##   <fct>       <int>
## 1 Blue bloods     5
## 2 Mid-majors      4
## 3 Other          55

## 3. fct_lump
# fct_lump() is superseded in forcats; fct_lump_n() is the explicit variant
# for "keep the n most frequent levels, lump the rest into Other".
# NOTE: each team occurs exactly once in public_picks, so every level ties on
# frequency and nothing is actually lumped — the count below still has one
# row per team.
public_picks <- public_picks |>
  mutate(
    TEAM_lumped = forcats::fct_lump_n(factor(TEAM), n = 10)
  )

# Level frequencies, most common first
public_picks |> count(TEAM_lumped, sort = TRUE)

## # A tibble: 64 × 2
##    TEAM_lumped               n
##    <fct>                 <int>
##  1 Akron                     1
##  2 Alabama                   1
##  3 Arizona                   1
##  4 Auburn                    1
##  5 Baylor                    1
##  6 BYU                       1
##  7 Clemson                   1
##  8 Colgate                   1
##  9 College of Charleston     1
## 10 Colorado                  1
## # ℹ 54 more rows

fct_recode
fct_collapse
fct_lump

Module 11: Apply it to your data 10

Matthew Plutzner

Import your data

Chapter 15

Create a factor

Modify factor order

Modify factor levels

Chapter 16