Import your data

# Read the workbook and pass it straight through janitor::clean_names(); the
# printed header below shows the resulting snake_case column names
# (e.g. runner_up, third_place).
mydata <- janitor::clean_names(
  read_excel("../00_data/mydata.xlsx")
)

# Print the tibble to inspect the import.
mydata
## # A tibble: 54 × 9
##     year winner        score runner_up     third_place  fourth_place    location
##    <dbl> <chr>         <dbl> <chr>         <chr>        <chr>           <chr>   
##  1  2024 UConn          75.6 Purdue        *Alabama     *NCState        Phoenix 
##  2  2023 UConn          76.6 SanDiegoSt.   *Miami(FL)   *FloridaAtlant… Houston 
##  3  2022 Kansas         72.7 NorthCarolina *Villanova   *Duke           NewOrle…
##  4  2021 Baylor         86.7 Gonzaga       *Houston     *UCLA           Indiana…
##  5  2019 Virginia       85.8 TexasTech     *Auburn      *MichiganSt.    Minneap…
##  6  2018 Villanova      79.6 Michigan      *Kansas      *LoyolaChicago  SanAnto…
##  7  2017 NorthCarolina  71.6 Gonzaga       *Oregon      *SouthCarolina  Phoenix 
##  8  2016 Villanova      77.7 NorthCarolina *Oklahoma    *Syracuse       Houston 
##  9  2015 Duke           68.6 Wisconsin     *MichiganSt. *Kentucky       Indiana…
## 10  2014 UConn          60.5 Kentucky      *Florida     *Wisconsin      Arlingt…
## # ℹ 44 more rows
## # ℹ 2 more variables: most_outstanding_player <chr>, winning_coach <chr>
# Working table for the exercises below: five of the nine columns and only
# the first 26 rows of the imported data.
data_clean <- slice(
  select(mydata, year, winner, score, runner_up, location),
  seq_len(26)
)

Chapter 15

Create a factor

# Small sample vector of winners; not referenced by the code visible here.
x1 <- c("UConn","Kansas","Baylor","Virginia")

# Explicit level order for `winner`. Each entry must match the spelling in the
# data exactly: factor() turns any value that is not listed in `levels` into
# NA without a warning. The lower-case variant is "Uconn" with no trailing
# space (the previous "Uconn " matched nothing and produced an NA).
winner_levels <- c(
  "UConn", "Kansas", "Baylor", "Virginia", "Villanova", "NorthCarolina",
  "Duke", "†Louisville", "Kentucky", "Uconn", "Florida", "Syracuse",
  "Maryland", "MichiganSt."
)

# Copy of data_clean in which `winner` is a factor with the levels above.
data_rclean <- data_clean %>%
  mutate(winner = factor(winner, levels = winner_levels))

Modify factor order

Make two bar charts here: one before ordering and another after.

# One row per winner holding that winner's mean score (NA scores are dropped
# before averaging).
data_summary <- summarise(
  group_by(data_clean, winner),
  score = mean(score, na.rm = TRUE)
)

# Print the summary table.
data_summary
## # A tibble: 14 × 2
##    winner        score
##    <chr>         <dbl>
##  1 Baylor         86.7
##  2 Duke           71.0
##  3 Florida        79.2
##  4 Kansas         74.2
##  5 Kentucky       73.1
##  6 Maryland       64.5
##  7 MichiganSt.    89.8
##  8 NorthCarolina  79.0
##  9 Syracuse       81.8
## 10 UConn          74.6
## 11 Uconn          53.4
## 12 Villanova      78.7
## 13 Virginia       85.8
## 14 †Louisville    82.8
# Bar chart before ordering: `winner` is a character column in data_summary,
# so the teams appear on the axis in ggplot2's default (alphabetical) order.
# geom_col() draws bars whose length is the value of `score`.
ggplot(data_summary, aes(score, winner)) +
  geom_col()

# Bar chart after ordering: fct_reorder() sorts the teams by mean score.
# The reordering is done in mutate() rather than inside aes() so the axis
# title stays "winner" instead of the literal expression text.
data_summary %>%
  mutate(winner = fct_reorder(winner, score)) %>%
  ggplot(aes(score, winner)) +
  geom_col()

Modify factor levels

Show examples of three functions:

  • fct_recode
# fct_recode(): rename every winner level (new name on the left, existing
# value on the right), then count the rows per renamed level.
count(
  mutate(
    data_clean,
    winner = fct_recode(
      winner,
      "Baylor.B" = "Baylor",
      "Duke.D" = "Duke",
      "Florida.F" = "Florida",
      "Kansas.KA" = "Kansas",
      "Kentucky.KE" = "Kentucky",
      "Maryland.M" = "Maryland",
      "MichiganSt..MS" = "MichiganSt.",
      "NorthCarolina.NC" = "NorthCarolina",
      "Syracuse.S" = "Syracuse",
      "UConn.U" = "UConn",
      "Uconn.u" = "Uconn",
      "Villanova.VN" = "Villanova",
      "Virginia.VG" = "Virginia",
      "†Louisville.L" = "†Louisville"
    )
  ),
  winner
)
## # A tibble: 14 × 2
##    winner               n
##    <fct>            <int>
##  1 †Louisville.L        1
##  2 Baylor.B             1
##  3 Duke.D               3
##  4 Florida.F            2
##  5 Kansas.KA            2
##  6 Kentucky.KE          2
##  7 Maryland.M           1
##  8 MichiganSt..MS       1
##  9 NorthCarolina.NC     3
## 10 Syracuse.S           1
## 11 Uconn.u              1
## 12 UConn.U              5
## 13 Villanova.VN         2
## 14 Virginia.VG          1
  • fct_collapse
# fct_collapse(): keep "UConn" as its own level (renamed UConn.U) and fold
# every other listed winner into a single "Other" level, then count rows.
count(
  mutate(
    data_clean,
    winner = fct_collapse(
      winner,
      UConn.U = "UConn",
      Other = c(
        "†Louisville", "Virginia", "Villanova", "Uconn", "Syracuse",
        "NorthCarolina", "MichiganSt.", "Maryland", "Kentucky", "Kansas",
        "Florida", "Duke", "Baylor"
      )
    )
  ),
  winner
)
## # A tibble: 2 × 2
##   winner      n
##   <fct>   <int>
## 1 Other      21
## 2 UConn.U     5
  • fct_lump
# fct_lump(): keep the most frequent winners and pool the rest into "Other".
# n = 3 is used because fct_lump() ranks ties with the "min" method: with
# n = 10 the seven single-win teams all tie at rank 8, every level is kept
# and nothing is lumped. With n = 3 only UConn (5 wins) and Duke and
# NorthCarolina (3 wins each, tied at rank 2) survive as separate levels.
data_clean %>%
  mutate(winner = fct_lump(winner, n = 3)) %>%
  count(winner, sort = TRUE)
## # A tibble: 4 × 2
##   winner            n
##   <fct>         <int>
## 1 Other            15
## 2 UConn             5
## 3 Duke              3
## 4 NorthCarolina     3