Import your data

# Read the TidyTuesday (2024-01-09) NHL rosters CSV straight from GitHub.
nhl_rosters <- readr::read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-01-09/nhl_rosters.csv"
)
## Rows: 54883 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): team_code, position_type, headshot, first_name, last_name, positi...
## dbl   (7): season, player_id, sweater_number, height_in_inches, weight_in_po...
## date  (1): birth_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Make the data smaller

# Seed the RNG so the random draw below returns the same players on every run.
set.seed(1234)

# Keep only the team, name and birthplace columns, then sample 100 rows.
nhl_rs <- nhl_rosters %>%
    select(team_code, first_name, last_name, birth_city, birth_country) %>%
    sample_n(100)

nhl_rs
## # A tibble: 100 × 5
##    team_code first_name last_name birth_city  birth_country
##    <chr>     <chr>      <chr>     <chr>       <chr>        
##  1 DAL       Cody       Eakin     Winnipeg    CAN          
##  2 DAL       Ales       Hemsky    Pardubice   CZE          
##  3 CHI       Marty      Burke     Toronto     CAN          
##  4 PIT       Harry      York      Ponoka      CAN          
##  5 VAN       Richard    Brodeur   Longueuil   CAN          
##  6 COL       Patrick    Bordeleau Montréal    CAN          
##  7 OTT       Brady      Tkachuk   Scottsdale  USA          
##  8 PHI       Ross       Lonsberry Humboldt    CAN          
##  9 PIT       Peter      Skudra    Riga        LVA          
## 10 TOR       Ted        Kennedy   Humberstone CAN          
## # ℹ 90 more rows

Chapter 14

Tools

Detect matches

Find first names containing “Dan”

# Print every sampled first name to see what there is to match against.
nhl_rs[["first_name"]]
##   [1] "Cody"      "Ales"      "Marty"     "Harry"     "Richard"   "Patrick"  
##   [7] "Brady"     "Ross"      "Peter"     "Ted"       "John"      "Jon"      
##  [13] "Pat"       "Dick"      "Kevin"     "Jari"      "Andrei"    "Pascal"   
##  [19] "Lee"       "Radek"     "Drew"      "Dan"       "Pat"       "Bob"      
##  [25] "Andrew"    "Andy"      "Nick"      "Doug"      "Murray"    "Marcel"   
##  [31] "George"    "Randy"     "Maxim"     "Kirk"      "Tony"      "Michael"  
##  [37] "John"      "Paul"      "Jack"      "David"     "Larry"     "Keith"    
##  [43] "Jaroslav"  "Jason"     "Tony"      "Karel"     "Dale"      "Stephane" 
##  [49] "Gary"      "Blair"     "Valeri"    "Brian"     "Lars"      "Matt"     
##  [55] "Daniel"    "Craig"     "Bryan"     "Frantisek" "Trevor"    "Dan"      
##  [61] "Riley"     "Valeri"    "Jason"     "Ron"       "Darcy"     "Jiri"     
##  [67] "Greg"      "Brandon"   "Greg"      "Shayne"    "Brayden"   "Chuck"    
##  [73] "Craig"     "Craig"     "Brian"     "Jacques"   "John"      "Kent-Erik"
##  [79] "Todd"      "Dan"       "Martin"    "Jack"      "Ted"       "Roberto"  
##  [85] "Kari"      "David"     "Kerby"     "Justin"    "Mike"      "Pat"      
##  [91] "Terry"     "Tyson"     "Oleg"      "Nicholas"  "Byron"     "Jason"    
##  [97] "Jamie"     "Brenden"   "Wayne"     "Sidney"
# Count the first names that contain "Dan". The pattern is unanchored, so a
# longer name such as "Daniel" is counted too.
nhl_rs$first_name %>%
    str_detect("Dan") %>%
    sum()
## [1] 4
# Count first names containing "Dan", returned as a one-row summary tibble.
summarise(nhl_rs, num_Dan = sum(str_detect(first_name, pattern = "Dan")))
## # A tibble: 1 × 1
##   num_Dan
##     <int>
## 1       4

Count Canadians

# Count players whose birth country code contains "CAN".
nhl_rs$birth_country %>%
    str_detect("CAN") %>%
    sum()
## [1] 70

Count Canadian Dans

# Logical mask with one element per sampled player: TRUE when the first name
# contains "Dan" and the birth country code contains "CAN".
canadian_Dans <- str_detect(nhl_rs$first_name, "Dan") &
    str_detect(nhl_rs$birth_country, "CAN")
canadian_Dans
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [49] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE
##  [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [97] FALSE FALSE FALSE FALSE
# Flag "Dan" first names and "CAN" birth countries as separate columns, then
# count the rows where both flags are TRUE.
nhl_rs %>%
    mutate(
        is_dan = str_detect(first_name, "Dan"),
        is_canadian = str_detect(birth_country, "CAN")
    ) %>%
    summarise(canadian_Dans = sum(is_dan & is_canadian))
## # A tibble: 1 × 1
##   canadian_Dans
##           <int>
## 1             4

Extract matches

Extract “Dan” from first names

# Pull the matched text "Dan" into its own column. Names without a match get
# NA there and are dropped, leaving only the matching players.
nhl_rs %>%
    select(first_name, last_name) %>%
    mutate(col_Dan = str_extract(first_name, pattern = "Dan")) %>%
    filter(!is.na(col_Dan))
## # A tibble: 4 × 3
##   first_name last_name col_Dan
##   <chr>      <chr>     <chr>  
## 1 Dan        Bonar     Dan    
## 2 Daniel     Marois    Dan    
## 3 Dan        Maloney   Dan    
## 4 Dan        Boyle     Dan

Replacing matches

Replacing first_name “Cody” with “HanSolo”

# Swap "Cody" for "HanSolo" in first names; names without a match pass through
# unchanged. No NA filter is needed here: unlike str_extract(), str_replace()
# only yields NA where the input is already NA (all 100 sampled rows are kept).
nhl_rs %>%
    mutate(col_HanSolo = str_replace(first_name, "Cody", "HanSolo")) %>%
    select(col_HanSolo, last_name)
## # A tibble: 100 × 2
##    col_HanSolo last_name
##    <chr>       <chr>    
##  1 HanSolo     Eakin    
##  2 Ales        Hemsky   
##  3 Marty       Burke    
##  4 Harry       York     
##  5 Richard     Brodeur  
##  6 Patrick     Bordeleau
##  7 Brady       Tkachuk  
##  8 Ross        Lonsberry
##  9 Peter       Skudra   
## 10 Ted         Kennedy  
## # ℹ 90 more rows

Replacing Canadians’ birth country “CAN” with Han Solo’s birthplace, “Corellia”

# Copy of the sample in which the country code "CAN" is rewritten as
# "Corellia"; every other birth country is left as it was.
nhl_hansolo <- nhl_rs %>%
    mutate(
        birth_country = str_replace(
            birth_country,
            pattern = "CAN",
            replacement = "Corellia"
        )
    )

nhl_hansolo
## # A tibble: 100 × 5
##    team_code first_name last_name birth_city  birth_country
##    <chr>     <chr>      <chr>     <chr>       <chr>        
##  1 DAL       Cody       Eakin     Winnipeg    Corellia     
##  2 DAL       Ales       Hemsky    Pardubice   CZE          
##  3 CHI       Marty      Burke     Toronto     Corellia     
##  4 PIT       Harry      York      Ponoka      Corellia     
##  5 VAN       Richard    Brodeur   Longueuil   Corellia     
##  6 COL       Patrick    Bordeleau Montréal    Corellia     
##  7 OTT       Brady      Tkachuk   Scottsdale  USA          
##  8 PHI       Ross       Lonsberry Humboldt    Corellia     
##  9 PIT       Peter      Skudra    Riga        LVA          
## 10 TOR       Ted        Kennedy   Humberstone Corellia     
## # ℹ 90 more rows

Finding all players from Corellia and changing their first name to Han and last name to Solo

# Rename every player whose birth country is "Corellia" to Han Solo; all other
# players keep their original names. The result is printed, not stored.
mutate(
    nhl_hansolo,
    first_name = if_else(
        condition = birth_country == "Corellia",
        true = "Han",
        false = first_name
    ),
    last_name = if_else(
        condition = birth_country == "Corellia",
        true = "Solo",
        false = last_name
    )
)
## # A tibble: 100 × 5
##    team_code first_name last_name birth_city  birth_country
##    <chr>     <chr>      <chr>     <chr>       <chr>        
##  1 DAL       Han        Solo      Winnipeg    Corellia     
##  2 DAL       Ales       Hemsky    Pardubice   CZE          
##  3 CHI       Han        Solo      Toronto     Corellia     
##  4 PIT       Han        Solo      Ponoka      Corellia     
##  5 VAN       Han        Solo      Longueuil   Corellia     
##  6 COL       Han        Solo      Montréal    Corellia     
##  7 OTT       Brady      Tkachuk   Scottsdale  USA          
##  8 PHI       Han        Solo      Humboldt    Corellia     
##  9 PIT       Peter      Skudra    Riga        LVA          
## 10 TOR       Han        Solo      Humberstone Corellia     
## # ℹ 90 more rows