Import your data

# Load the workbook and pass it straight through janitor::clean_names()
# (the printed tibble below shows the resulting snake_case column names),
# then print the table.
mydata <- janitor::clean_names(
    read_excel("../00_data/mydata.xlsx")
)
mydata
## # A tibble: 56 × 9
##     year winner        score runner_up     third_place fourth_place     location
##    <dbl> <chr>         <dbl> <chr>         <chr>       <chr>            <chr>   
##  1  2025 <NA>            0   <NA>          <NA>        <NA>             SanAnto…
##  2  2024 UConn          75.6 Purdue        *Alabama    *NCState         Phoenix 
##  3  2023 UConn          76.6 SanDiegoSt.   *Miami(FL)  *FloridaAtlantic Houston 
##  4  2022 Kansas         72.7 NorthCarolina *Villanova  *Duke            NewOrle…
##  5  2021 Baylor         86.7 Gonzaga       *Houston    *UCLA            Indiana…
##  6  2020 <NA>            0   <NA>          <NA>        <NA>             <NA>    
##  7  2019 Virginia       85.8 TexasTech     *Auburn     *MichiganSt.     Minneap…
##  8  2018 Villanova      79.6 Michigan      *Kansas     *LoyolaChicago   SanAnto…
##  9  2017 NorthCarolina  71.6 Gonzaga       *Oregon     *SouthCarolina   Phoenix 
## 10  2016 Villanova      77.7 NorthCarolina *Oklahoma   *Syracuse        Houston 
## # ℹ 46 more rows
## # ℹ 2 more variables: most_outstanding_player <chr>, winning_coach <chr>
# Keep rows 2 through 26 of the imported table, then narrow the result to
# the winner, year, runner_up and score columns (in that order).
data_clean <- mydata %>%
    slice(2:26) %>%
    select(winner, year, runner_up, score)

Chapter 14

Tools

Detect matches

# Print the complete winner column of the imported table as a plain vector.
mydata[["winner"]]
##  [1] NA              "UConn"         "UConn"         "Kansas"       
##  [5] "Baylor"        NA              "Virginia"      "Villanova"    
##  [9] "NorthCarolina" "Villanova"     "Duke"          "UConn"        
## [13] "†Louisville"   "Kentucky"      "Uconn"         "Duke"         
## [17] "NorthCarolina" "Kansas"        "Florida"       "Florida"      
## [21] "NorthCarolina" "UConn"         "Syracuse"      "Maryland"     
## [25] "Duke"          "MichiganSt."   "UConn"         "Kentucky"     
## [29] "Arizona"       "Kentucky"      "UCLA"          "Arkansas"     
## [33] "NorthCarolina" "Duke"          "Duke"          "UNLV"         
## [37] "Michigan"      "Kansas"        "Indiana"       "Louisville"   
## [41] "Villanova"     "Georgetown"    "NCState"       "NorthCarolina"
## [45] "Indiana"       "Louisville"    "MichiganSt."   "Kentucky"     
## [49] "Marquette"     "Indiana"       "UCLA"          "NCState"      
## [53] "UCLA"          "UCLA"          "UCLA"          "UCLA"
# Flag, element by element, which winners in the trimmed table contain the
# text "UConn"; a missing winner yields NA rather than FALSE.
data_clean[["winner"]] %>%
    str_detect(pattern = "UConn")
##  [1]  TRUE  TRUE FALSE FALSE    NA FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [25] FALSE
# Count the "UConn" matches in the trimmed table. str_detect() returns NA for
# the row with no recorded winner, and sum() would propagate that NA, so
# missing values are dropped explicitly with na.rm = TRUE.
sum(str_detect(data_clean$winner, "UConn"), na.rm = TRUE)
## [1] 4
# Count the championships won by UConn across the full table.
# The winner column spells the school both "UConn" and "Uconn", so the match
# is made case-insensitive to avoid silently dropping a title.
# na.rm = TRUE skips the rows that have no recorded winner.
mydata %>%
    summarise(
        num_UConn = sum(
            str_detect(winner, regex("UConn", ignore_case = TRUE)),
            na.rm = TRUE
        )
    )
## # A tibble: 1 × 1
##   num_UConn
##       <int>
## 1         6

Extract matches

# Pull the text "UConn" out of each winner into col_UConn, then keep only the
# rows where nothing was extracted (col_UConn is NA).
mydata %>%
    select(winner) %>%
    mutate(col_UConn = str_extract(winner, pattern = "UConn")) %>%
    filter(is.na(col_UConn))
## # A tibble: 51 × 2
##    winner        col_UConn
##    <chr>         <chr>    
##  1 <NA>          <NA>     
##  2 Kansas        <NA>     
##  3 Baylor        <NA>     
##  4 <NA>          <NA>     
##  5 Virginia      <NA>     
##  6 Villanova     <NA>     
##  7 NorthCarolina <NA>     
##  8 Villanova     <NA>     
##  9 Duke          <NA>     
## 10 †Louisville   <NA>     
## # ℹ 41 more rows

Replace matches

# Build col_Duke as a copy of winner in which the first "UConn" in each value
# is swapped for "Duke"; other winners (and NAs) pass through unchanged.
mydata %>%
    select(winner) %>%
    mutate(
        col_Duke = str_replace(
            winner,
            pattern = "UConn",
            replacement = "Duke"
        )
    )
## # A tibble: 56 × 2
##    winner        col_Duke     
##    <chr>         <chr>        
##  1 <NA>          <NA>         
##  2 UConn         Duke         
##  3 UConn         Duke         
##  4 Kansas        Kansas       
##  5 Baylor        Baylor       
##  6 <NA>          <NA>         
##  7 Virginia      Virginia     
##  8 Villanova     Villanova    
##  9 NorthCarolina NorthCarolina
## 10 Villanova     Villanova    
## # ℹ 46 more rows