# Download the team-results table from the TidyTuesday 2024-03-26 release.
# The CSV is fetched over the network each time this line runs.
team_results <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-03-26/team-results.csv"
)
## Rows: 236 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TEAM, F4PERCENT, CHAMPPERCENT
## dbl (17): TEAMID, PAKE, PAKERANK, PASE, PASERANK, GAMES, W, L, WINPERCENT, R...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Download the public-picks table from the same TidyTuesday release.
# The CSV is fetched over the network each time this line runs.
public_picks <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-03-26/public-picks.csv"
)
## Rows: 64 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): TEAM, R64, R32, S16, E8, F4, FINALS
## dbl (2): YEAR, TEAMNO
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Add a factor copy of TEAM to team_results; the character column is kept.
team_results <- mutate(team_results, TEAM_f = factor(TEAM))

# Add a factor copy of the numeric YEAR column to public_picks.
public_picks <- mutate(public_picks, YEAR_f = factor(YEAR))
# Tabulate rows per TEAM_f level and print only the first six levels.
head(count(team_results, TEAM_f))
## # A tibble: 6 × 2
## TEAM_f n
## <fct> <int>
## 1 Abilene Christian 1
## 2 Akron 1
## 3 Alabama 1
## 4 Albany 1
## 5 American 1
## 6 Arizona 1
# Tabulate rows per YEAR_f level.
count(public_picks, YEAR_f)
## # A tibble: 1 × 2
## YEAR_f n
## <fct> <int>
## 1 2024 64
Make two bar charts here: one before reordering the factor levels and another after.
# R64 is stored as text such as "87.2%"; parse_number() drops the percent
# sign and yields the numeric value (87.2) in a new R64_num column.
public_picks <- mutate(public_picks, R64_num = readr::parse_number(R64))
# Chart 1: one bar per team, using TEAM as a plain character column so the
# axis falls back to alphabetical order. coord_flip() turns the bars
# horizontal so the team names stay readable.
ggplot(data = public_picks, mapping = aes(x = TEAM, y = R64_num)) +
  geom_col() +
  labs(
    y = "R64 Probability (%)",
    x = "Team",
    title = "R64 Probability by Team (Default Order)"
  ) +
  coord_flip()
# Chart 2: same bars, but the team axis follows the R64 values.
# fct_reorder() turns TEAM into a factor whose levels are sorted by R64_num,
# and ggplot2 lays out a discrete axis in level order.
public_picks <- mutate(
  public_picks,
  TEAM_ordered = forcats::fct_reorder(TEAM, R64_num)
)

ggplot(data = public_picks, mapping = aes(x = TEAM_ordered, y = R64_num)) +
  geom_col() +
  labs(
    y = "R64 Probability (%)",
    x = "Team (Ordered)",
    title = "R64 Probability by Team (Ordered by Probability)"
  ) +
  coord_flip()
Show examples of three functions:
## 1. fct_recode
# Relabel the two "Saint ..." schools with the shorter "St. ..." spelling.
# fct_recode() takes new = old pairs and leaves every other level untouched.
public_picks <- mutate(
  public_picks,
  TEAM_recode = forcats::fct_recode(
    factor(TEAM),
    "St. Mary's" = "Saint Mary's",
    "St. Peter's" = "Saint Peter's"
  )
)

# Show the original and recoded names side by side for the affected teams.
public_picks |>
  filter(str_detect(TEAM, "Saint")) |>
  distinct(TEAM, TEAM_recode)
## # A tibble: 2 × 2
## TEAM TEAM_recode
## <chr> <fct>
## 1 Saint Mary's St. Mary's
## 2 Saint Peter's St. Peter's
## 2. fct_collapse
# Collapse the team names into three groups. The hand-picked schools go into
# the two named groups; `other_level` sweeps every remaining team into
# "Other".
#
# A placeholder entry such as Other = "Other teams" does NOT do this:
# fct_collapse() only matches levels that already exist, so it warns
# "Unknown levels in `f`: Other teams" and leaves the rest uncollapsed.
public_picks <- public_picks |>
  mutate(
    TEAM_group = forcats::fct_collapse(
      factor(TEAM),
      `Blue bloods` = c(
        "Connecticut", "Duke", "Kansas", "Kentucky", "North Carolina"
      ),
      `Mid-majors` = c(
        "Gonzaga", "Saint Mary's", "Saint Peter's", "San Diego St."
      ),
      other_level = "Other"
    )
  )

# Expected (not re-run here): three rows, with Blue bloods = 5,
# Mid-majors = 4, and Other = 55 of the 64 teams.
public_picks |> count(TEAM_group)
## 3. fct_lump
# Every team appears exactly once in public_picks, so lumping by frequency
# (fct_lump(factor(TEAM), n = 10)) has nothing to rank and lumps no levels.
# Weight each team by its R64 pick percentage instead: fct_lump_n() then
# keeps the 10 teams with the largest R64_num and merges the rest into
# "Other". Teams tied at the cutoff may all be kept, so slightly more than
# 10 can survive.
#
# Relies on the R64_num column created earlier from R64; `w` must be
# non-missing and non-negative.
public_picks <- public_picks |>
  mutate(
    TEAM_lumped = forcats::fct_lump_n(factor(TEAM), n = 10, w = R64_num)
  )

# Expected (not re-run here): "Other" listed first with the count of lumped
# teams, followed by the kept teams with n = 1 each.
public_picks |> count(TEAM_lumped, sort = TRUE)
No need to do anything here.