Introduction

String basics

# A double-quoted literal can contain single quotes without escaping.
char_data <- "I'm 'very' hungry."

# Number of characters in the string.
stringr::str_length("I am hungry.")
## [1] 12
# Separate arguments are pasted together; the default separator is "".
stringr::str_c("I", " am")
## [1] "I am"
# `collapse` joins the elements of one vector into a single string.
stringr::str_c(c("I", " am"), collapse = "")
## [1] "I am"
# `sep` is inserted between the arguments.
stringr::str_c("I", " am", sep = " ;")
## [1] "I ; am"
# Namespace-qualified like the calls above, so this line also runs when
# stringr has not been attached with library().
stringr::str_sort(c("John", "Mary", "Aaron"))
## [1] "Aaron" "John"  "Mary"

Matching patterns with regular expressions

# Small example table for the pattern-matching sections below: only the
# character columns of `flights`, limited to the first ten rows.
flights_small <- head(select(flights, where(is.character)), n = 10)

Basic matches

# A plain string matches itself anywhere inside the value.
filter(flights_small, str_detect(dest, "AH"))
## # A tibble: 2 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH
# No origin in the sample contains an "M".
filter(flights_small, str_detect(origin, "M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "." stands for any single character: an "M" followed by something.
filter(flights_small, str_detect(dest, "M."))
## # A tibble: 2 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 AA      N619AA  JFK    MIA  
## 2 B6      N593JB  JFK    MCO
# Any single character followed by an "M".
filter(flights_small, str_detect(dest, ".M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "\\." escapes the dot, so this looks for a literal "M.".
filter(flights_small, str_detect(dest, "M\\."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Anchors

# "^" anchors the match to the start of the string: origins beginning with "E".
filter(flights_small, str_detect(origin, "^E"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL

Character classes and alternatives

# "\\d" matches any digit.
filter(flights_small, str_detect(carrier, "\\d"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 B6      N804JB  JFK    BQN  
## 2 B6      N516JB  EWR    FLL  
## 3 B6      N593JB  JFK    MCO
# "\\s" matches any whitespace character; no carrier code contains one.
filter(flights_small, str_detect(carrier, "\\s"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "[ABD]" matches one character that is A, B or D.
filter(flights_small, str_detect(carrier, "[ABD]"))
## # A tibble: 9 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 B6      N804JB  JFK    BQN  
## 5 DL      N668DN  LGA    ATL  
## 6 UA      N39463  EWR    ORD  
## 7 B6      N516JB  EWR    FLL  
## 8 B6      N593JB  JFK    MCO  
## 9 AA      N3ALAA  LGA    ORD
# "[^ABD]" matches one character that is NOT A, B or D, so a row is kept when
# its carrier has at least one such character (only "AA" has none).
filter(flights_small, str_detect(carrier, "[^ABD]"))
## # A tibble: 8 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 B6      N804JB  JFK    BQN  
## 4 DL      N668DN  LGA    ATL  
## 5 UA      N39463  EWR    ORD  
## 6 B6      N516JB  EWR    FLL  
## 7 EV      N829AS  LGA    IAD  
## 8 B6      N593JB  JFK    MCO

Repetition

# "?" = zero or one occurrence. Zero occurrences is also a match, so every
# row is kept.
filter(flights_small, str_detect(carrier, "A?"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD
# "+" = one or more occurrences: the carrier must contain at least one "A".
filter(flights_small, str_detect(carrier, "A+"))
## # A tibble: 5 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 UA      N39463  EWR    ORD  
## 5 AA      N3ALAA  LGA    ORD
# "*" = zero or more occurrences; like "?", it matches every row.
filter(flights_small, str_detect(carrier, "A*"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD

Grouping and backreferences

# (\\d{2})\\1: the group captures two digits and "\\1" requires the same two
# digits again immediately afterwards (e.g. "5555", "1414").
filter(
  select(flights, where(is.character)),
  str_detect(tailnum, "(\\d{2})\\1")
)
## # A tibble: 1,990 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 EV      N15555  EWR    MKE  
##  2 EV      N11119  LGA    CLE  
##  3 UA      N14242  EWR    TPA  
##  4 EV      N14143  EWR    PIT  
##  5 EV      N15555  EWR    SAV  
##  6 UA      N12125  EWR    LAX  
##  7 EV      N15555  EWR    PWM  
##  8 EV      N15555  EWR    BUF  
##  9 EV      N15555  EWR    RIC  
## 10 EV      N13133  EWR    DTW  
## # ℹ 1,980 more rows

Tools

Detect matches

# Count the tail numbers that end in "8" inside a dplyr summary.
summarize(flights_small, sum(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
##   `sum(str_detect(tailnum, "8$"))`
##                              <int>
## 1                                1
# The same test on the bare column: one logical value per row.
flights_small[["tailnum"]] %>% str_detect("8$")
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Summing the logicals gives the number of matches ...
flights_small[["tailnum"]] %>% str_detect("8$") %>% sum()
## [1] 1
# ... and averaging them gives the proportion of matches.
flights_small[["tailnum"]] %>% str_detect("8$") %>% mean()
## [1] 0.1

Extract matches

# Combine the colour names into a single alternation pattern.
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- colours %>% str_c(collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
# Keep the sentences that contain a colour name, then extract the first
# colour found in each. The pattern has no word boundaries, so a colour
# name inside a longer word counts as a match too.
has_colour <- sentences %>% str_subset(colour_match)
has_colour %>% str_extract(colour_match)
##  [1] "blue"   "blue"   "red"    "red"    "red"    "blue"   "yellow" "red"   
##  [9] "red"    "green"  "red"    "red"    "blue"   "red"    "red"    "red"   
## [17] "red"    "blue"   "red"    "blue"   "red"    "green"  "red"    "red"   
## [25] "red"    "red"    "red"    "red"    "green"  "red"    "green"  "red"   
## [33] "purple" "green"  "red"    "red"    "red"    "red"    "red"    "blue"  
## [41] "red"    "blue"   "red"    "red"    "red"    "red"    "green"  "green" 
## [49] "green"  "red"    "red"    "yellow" "red"    "orange" "red"    "red"   
## [57] "red"

Grouped matches

# Extract strings with a noun, approximated as "article + following word":
#   \\b      word boundary, so the article must be a whole word; without it
#            the trailing "a" of a longer word is taken for an article and
#            the result contains non-phrases such as "a helps"
#   (a|the)  group 1: the article
#   (\\w+)   group 2: the word after it, without trailing punctuation
noun <- "\\b(a|the) (\\w+)"
had_nouns <- str_subset(sentences, noun) %>% head(10)
# str_extract() returns the whole match; the two groups are not split out.
had_nouns %>% str_extract(noun)
##  [1] "the smooth"  "the sheet"   "the depth"   "a chicken"   "the parked" 
##  [6] "the sun"     "the huge"    "the ball"    "the woman"   "the evening"

Replacing matches

# str_replace() changes only the first match: the leading capital letter.
mutate(flights_small, tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619AA     
##  4 B6      N804JB  JFK    BQN   -804JB     
##  5 DL      N668DN  LGA    ATL   -668DN     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516JB     
##  8 EV      N829AS  LGA    IAD   -829AS     
##  9 B6      N593JB  JFK    MCO   -593JB     
## 10 AA      N3ALAA  LGA    ORD   -3ALAA
# str_replace_all() changes every match: each capital letter becomes "-".
mutate(flights_small, tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619--     
##  4 B6      N804JB  JFK    BQN   -804--     
##  5 DL      N668DN  LGA    ATL   -668--     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516--     
##  8 EV      N829AS  LGA    IAD   -829--     
##  9 B6      N593JB  JFK    MCO   -593--     
## 10 AA      N3ALAA  LGA    ORD   -3----

Splitting

# Split the first sentence on spaces into at most three pieces; the last
# piece keeps the unsplit remainder. simplify = TRUE returns a matrix.
str_split(sentences[1], " ", n = 3, simplify = TRUE)
##      [,1]  [,2]    [,3]                              
## [1,] "The" "birch" "canoe slid on the smooth planks."

Other types of patterns

# regex() with ignore_case = TRUE makes the match case-insensitive, so the
# lower-case "^e" still finds origins that start with "E".
filter(
  flights_small,
  str_detect(origin, regex("^e", ignore_case = TRUE))
)
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL