Import data

# Load the workbook from the Excel file
data <- read_excel("../00_data/NHLDATA.xlsx")

# Keep only the character columns, then the first 50 rows
data_small <- data %>%
  select(where(is.character)) %>%
  head(n = 50)

data_small

## # A tibble: 50 × 5
##    first_name last_name    birth_city     birth_country birth_state_province
##    <chr>      <chr>        <chr>          <chr>         <chr>               
##  1 Bryan      Adams        Fort St. James CAN           British Columbia    
##  2 Donald     Audette      Laval          CAN           Quebec              
##  3 Eric       Bertrand     St-Ephrem      CAN           Quebec              
##  4 Jason      Botterill    Edmonton       CAN           Alberta             
##  5 Andrew     Brunette     Sudbury        CAN           Ontario             
##  6 Kelly      Buchberger   Langenburg     CAN           Saskatchewan        
##  7 Hnat       Domenichelli Edmonton       CAN           Alberta             
##  8 Shean      Donovan      Timmins        CAN           Ontario             
##  9 Nelson     Emerson      Hamilton       CAN           Ontario             
## 10 Ray        Ferraro      Trail          CAN           British Columbia    
## # ℹ 40 more rows

Detect matches

# Keep the rows whose first_name contains a capital "A" (case-sensitive)
filter(data_small, str_detect(first_name, "A"))

## # A tibble: 2 × 5
##   first_name last_name birth_city birth_country birth_state_province
##   <chr>      <chr>     <chr>      <chr>         <chr>               
## 1 Andrew     Brunette  Sudbury    CAN           Ontario             
## 2 Andreas    Karlsson  Ludvika    SWE           NA

# Number of first names whose last character is "n", as a one-row tibble
summarise(data_small, sum(str_detect(first_name, "n$")))

## # A tibble: 1 × 1
##   `sum(str_detect(first_name, "n$"))`
##                                 <int>
## 1                                  12

# One TRUE/FALSE per row: does the first name end with "n"?
data_small$first_name %>%
  str_detect("n$")

##  [1]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
## [13] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [25] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE  TRUE

# Summing the logical vector counts the names that end with "n"
data_small$first_name %>%
  str_detect("n$") %>%
  sum()

## [1] 12

# Averaging the logical vector gives the share of names that end with "n"
data_small$first_name %>%
  str_detect("n$") %>%
  mean()

## [1] 0.24

Extract matches

# Count the rows whose birth_country contains "CAN"
data_small$birth_country %>%
  str_detect("CAN") %>%
  sum()

## [1] 30

# Substrings to look for in last names
name_patterns <- c("son", "berg", "man", "ski")

# Join them with "|" so the regex matches any one of the substrings
name_match <- name_patterns %>%
  str_c(collapse = "|")
name_match

## [1] "son|berg|man|ski"

# Keep only the last names that contain at least one of the patterns
has_pattern <- data_small$last_name %>%
  str_subset(name_match)

# From each of those names, pull out the first piece that matched
has_pattern %>%
  str_extract(name_match)

## [1] "berg" "son"  "son"  "son"

Replacing matches

# Add first_name_rev: first_name with a leading A-Z capital swapped for "-"
mutate(
  data_small,
  first_name_rev = str_replace(first_name, "^[A-Z]", "-")
)

## # A tibble: 50 × 6
##    first_name last_name    birth_city     birth_country birth_state_province
##    <chr>      <chr>        <chr>          <chr>         <chr>               
##  1 Bryan      Adams        Fort St. James CAN           British Columbia    
##  2 Donald     Audette      Laval          CAN           Quebec              
##  3 Eric       Bertrand     St-Ephrem      CAN           Quebec              
##  4 Jason      Botterill    Edmonton       CAN           Alberta             
##  5 Andrew     Brunette     Sudbury        CAN           Ontario             
##  6 Kelly      Buchberger   Langenburg     CAN           Saskatchewan        
##  7 Hnat       Domenichelli Edmonton       CAN           Alberta             
##  8 Shean      Donovan      Timmins        CAN           Ontario             
##  9 Nelson     Emerson      Hamilton       CAN           Ontario             
## 10 Ray        Ferraro      Trail          CAN           British Columbia    
## # ℹ 40 more rows
## # ℹ 1 more variable: first_name_rev <chr>

# Add first_name_rev: first_name with every A-Z capital swapped for "-"
mutate(
  data_small,
  first_name_rev = str_replace_all(first_name, "[A-Z]", "-")
)

## # A tibble: 50 × 6
##    first_name last_name    birth_city     birth_country birth_state_province
##    <chr>      <chr>        <chr>          <chr>         <chr>               
##  1 Bryan      Adams        Fort St. James CAN           British Columbia    
##  2 Donald     Audette      Laval          CAN           Quebec              
##  3 Eric       Bertrand     St-Ephrem      CAN           Quebec              
##  4 Jason      Botterill    Edmonton       CAN           Alberta             
##  5 Andrew     Brunette     Sudbury        CAN           Ontario             
##  6 Kelly      Buchberger   Langenburg     CAN           Saskatchewan        
##  7 Hnat       Domenichelli Edmonton       CAN           Alberta             
##  8 Shean      Donovan      Timmins        CAN           Ontario             
##  9 Nelson     Emerson      Hamilton       CAN           Ontario             
## 10 Ray        Ferraro      Trail          CAN           British Columbia    
## # ℹ 40 more rows
## # ℹ 1 more variable: first_name_rev <chr>

Module 10: Apply it to your data 9

Thomas Kaufield

Import data

Detect matches

Extract matches

Replacing matches