Import data

# Read the Excel workbook (path is relative to the working directory)
data <- read_excel("../00_data/NHLDATA.xlsx")

# Working subset: character columns only, first 50 rows
data_small <- data %>%
  select(where(is.character)) %>%
  head(n = 50)

# Preview the name and birth-country columns
data_small %>%
  select(first_name, last_name, birth_country)

## # A tibble: 50 × 3
##    first_name last_name    birth_country
##    <chr>      <chr>        <chr>        
##  1 Bryan      Adams        CAN          
##  2 Donald     Audette      CAN          
##  3 Eric       Bertrand     CAN          
##  4 Jason      Botterill    CAN          
##  5 Andrew     Brunette     CAN          
##  6 Kelly      Buchberger   CAN          
##  7 Hnat       Domenichelli CAN          
##  8 Shean      Donovan      CAN          
##  9 Nelson     Emerson      CAN          
## 10 Ray        Ferraro      CAN          
## # ℹ 40 more rows

# Print the working subset with all of its columns
data_small

## # A tibble: 50 × 5
##    first_name last_name    birth_city     birth_country birth_state_province
##    <chr>      <chr>        <chr>          <chr>         <chr>               
##  1 Bryan      Adams        Fort St. James CAN           British Columbia    
##  2 Donald     Audette      Laval          CAN           Quebec              
##  3 Eric       Bertrand     St-Ephrem      CAN           Quebec              
##  4 Jason      Botterill    Edmonton       CAN           Alberta             
##  5 Andrew     Brunette     Sudbury        CAN           Ontario             
##  6 Kelly      Buchberger   Langenburg     CAN           Saskatchewan        
##  7 Hnat       Domenichelli Edmonton       CAN           Alberta             
##  8 Shean      Donovan      Timmins        CAN           Ontario             
##  9 Nelson     Emerson      Hamilton       CAN           Ontario             
## 10 Ray        Ferraro      Trail          CAN           British Columbia    
## # ℹ 40 more rows

Detect matches

# Keep the rows whose first name contains an uppercase "A".
# str_detect() is case-sensitive and the pattern is unanchored, so a capital
# "A" anywhere in the name counts. One filter() is sufficient.
data_small %>%
  filter(str_detect(first_name, "A"))

## # A tibble: 2 × 5
##   first_name last_name birth_city birth_country birth_state_province
##   <chr>      <chr>     <chr>      <chr>         <chr>               
## 1 Andrew     Brunette  Sudbury    CAN           Ontario             
## 2 Andreas    Karlsson  Ludvika    SWE           NA

# Tally, inside a one-row summary, the first names whose last letter is "n"
# ("$" anchors the pattern to the end of the string)
data_small %>%
  summarise(
    sum(str_detect(first_name, "n$"))
  )

## # A tibble: 1 × 1
##   `sum(str_detect(first_name, "n$"))`
##                                 <int>
## 1                                  12

# Logical vector, one entry per row: TRUE where the first name ends in "n"
data_small %>%
  pull(first_name) %>%
  str_detect("n$")

##  [1]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE
## [13] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [25] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE  TRUE

# Number of first names ending in "n" (sum() counts the TRUE values)
data_small %>%
  pull(first_name) %>%
  str_detect("n$") %>%
  sum()

## [1] 12

# Share of first names ending in "n" (mean() of a logical is the TRUE rate)
data_small %>%
  pull(first_name) %>%
  str_detect("n$") %>%
  mean()

## [1] 0.24

Extract matches

# Count the rows whose birth_country contains "CAN"
data_small %>%
  pull(birth_country) %>%
  str_detect("CAN") %>%
  sum()

## [1] 30

# Last-name fragments to search for
name_patterns <- c(
  "son",
  "berg",
  "man",
  "ski"
)

# Join the fragments with "|" so the regex matches any one of them
name_match <- str_c(name_patterns, collapse = "|")
name_match

## [1] "son|berg|man|ski"

# Keep only the last names that contain at least one of the fragments
has_pattern <- data_small %>%
  pull(last_name) %>%
  str_subset(name_match)

# For each retained name, pull out the first fragment that matched
str_extract(has_pattern, name_match)

## [1] "berg" "son"  "son"  "son"

Replacing matches

# Replace the leading capital letter of each first name with "-"
# ("^" anchors the match to the start; str_replace() changes one match only)
data_small %>%
  select(first_name) %>%
  mutate(
    first_name_rev = str_replace(first_name, "^[A-Z]", "-")
  )

## # A tibble: 50 × 2
##    first_name first_name_rev
##    <chr>      <chr>         
##  1 Bryan      -ryan         
##  2 Donald     -onald        
##  3 Eric       -ric          
##  4 Jason      -ason         
##  5 Andrew     -ndrew        
##  6 Kelly      -elly         
##  7 Hnat       -nat          
##  8 Shean      -hean         
##  9 Nelson     -elson        
## 10 Ray        -ay           
## # ℹ 40 more rows

# Replace every capital letter in each first name with "-"
# (str_replace_all() substitutes all matches, not just the first)
data_small %>%
  select(first_name) %>%
  mutate(
    first_name_rev = str_replace_all(first_name, "[A-Z]", "-")
  )

## # A tibble: 50 × 2
##    first_name first_name_rev
##    <chr>      <chr>         
##  1 Bryan      -ryan         
##  2 Donald     -onald        
##  3 Eric       -ric          
##  4 Jason      -ason         
##  5 Andrew     -ndrew        
##  6 Kelly      -elly         
##  7 Hnat       -nat          
##  8 Shean      -hean         
##  9 Nelson     -elson        
## 10 Ray        -ay           
## # ℹ 40 more rows

Module 10: Apply it to your data 9

Thomas Kaufield

Import data

Detect matches

Extract matches

Replacing matches