Introduction

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

String Basics

c("one", "two", "three")
## [1] "one"   "two"   "three"
#> [1] "one"   "two"   "three"
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'

double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"

x <- c("\"", "\\")
x
## [1] "\"" "\\"
#> [1] "\"" "\\"
writeLines(x)
## "
## \
#> "
#> \

String Length

str_length(c("a", "R for data science", NA))
## [1]  1 18 NA
#> [1]  1 18 NA

Combining Strings

str_c("x", "y")
## [1] "xy"
#> [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
#> [1] "xyz"
str_c("x", "y", sep = ", ")
## [1] "x, y"
#> [1] "x, y"
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
#> [1] "x, y, z"
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
## [1] "Good morning Hadley."
#> [1] "Good morning Hadley."

Subsetting Strings

x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
#> [1] "App" "Ban" "Pea"
# negative numbers count backwards from end
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
#> [1] "ple" "ana" "ear"
str_sub("a", 1, 5)
## [1] "a"
#> [1] "a"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
## [1] "apple"  "banana" "pear"
#> [1] "apple"  "banana" "pear"

Locales

# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "ı"))
## [1] "I" "I"
#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "İ" "I"
#> [1] "İ" "I"
x <- c("apple", "eggplant", "banana")

str_sort(x, locale = "en")  # English
## [1] "apple"    "banana"   "eggplant"
#> [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") # Hawaiian
## [1] "apple"    "eggplant" "banana"
#> [1] "apple"    "eggplant" "banana"

Matching Patterns with Regular Expressions

x <- c("apple", "banana", "pear")
str_view(x, "an")
## [2] │ b<an><an>a
#> [2] │ b<an><an>a
str_view(x, ".a.")
## [2] │ <ban>ana
## [3] │ p<ear>
#> [2] │ <ban>ana
#> [3] │ p<ear>
# To create the regular expression, we need \\
dot <- "\\."

# But the expression itself only contains one:
writeLines(dot)
## \.
#> \.

# And this tells R to look for an explicit .
str_view(c("abc", "a.c", "bef"), "a\\.c")
## [2] │ <a.c>
#> [2] │ <a.c>
x <- "a\\b"
writeLines(x)
## a\b
#> a\b

str_view(x, "\\\\")
## [1] │ a<\>b
#> [1] │ a<\>b

Anchors

x <- c("apple", "banana", "pear")
str_view(x, "^a")
## [1] │ <a>pple
#> [1] │ <a>pple
str_view(x, "a$")
## [2] │ banan<a>
#> [2] │ banan<a>
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
## [1] │ <apple> pie
## [2] │ <apple>
## [3] │ <apple> cake
#> [1] │ <apple> pie
#> [2] │ <apple>
#> [3] │ <apple> cake
str_view(x, "^apple$")
## [2] │ <apple>
#> [2] │ <apple>

Character classes and alternatives

# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
## [2] │ <a.c>
#> [2] │ <a.c>
str_view(c("abc", "a.c", "a*c", "a c"), ".[*]c")
## [3] │ <a*c>
#> [3] │ <a*c>
str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]")
## [4] │ <a >c
#> [4] │ <a >c
str_view(c("grey", "gray"), "gr(e|a)y")
## [1] │ <grey>
## [2] │ <gray>
#> [1] │ <grey>
#> [2] │ <gray>

Repetition

x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
str_view(x, "CC+")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, 'C[LX]+')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII
str_view(x, "C{2}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, "C{2,}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, "C{2,3}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, 'C{2,3}?')
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, 'C[LX]+?')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII

Grouping and Backreferencing

str_view(fruit, "(..)\\1", match = TRUE)
##  [4] │ b<anan>a
## [20] │ <coco>nut
## [22] │ <cucu>mber
## [41] │ <juju>be
## [56] │ <papa>ya
## [73] │ s<alal> berry
#>  [4] │ b<anan>a
#> [20] │ <coco>nut
#> [22] │ <cucu>mber
#> [41] │ <juju>be
#> [56] │ <papa>ya
#> [73] │ s<alal> berry

Tools

Detect matches

x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1]  TRUE FALSE  TRUE
#> [1]  TRUE FALSE  TRUE
sum(str_detect(words, "^t"))
## [1] 65
#> [1] 65

mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
#> [1] 0.2765306
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
str_count("abababa", "aba")
## [1] 2
#> [1] 2
str_view_all("abababa", "aba")
## Warning: `str_view_all()` was deprecated in stringr 1.5.0.
## ℹ Please use `str_view()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [1] │ <aba>b<aba>
#> Warning: `str_view_all()` was deprecated in stringr 1.5.0.
#> ℹ Please use `str_view()` instead.
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
#> generated.
#> [1] │ <aba>b<aba>

Extract matches

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
#> [1] "red|orange|yellow|green|blue|purple"
has_colour <- str_subset(sentences, colour_match)
matches <- str_extract(has_colour, colour_match)
head(matches)
## [1] "blue" "blue" "red"  "red"  "red"  "blue"
#> [1] "blue" "blue" "red"  "red"  "red"  "blue"
more <- sentences[str_count(sentences, colour_match) > 1]
str_view_all(more, colour_match)
## [1] │ It is hard to erase <blue> or <red> ink.
## [2] │ The <green> light in the brown box flicke<red>.
## [3] │ The sky in the west is tinged with <orange> <red>.
#> [1] │ It is hard to erase <blue> or <red> ink.
#> [2] │ The <green> light in the brown box flicke<red>.
#> [3] │ The sky in the west is tinged with <orange> <red>.

str_extract(more, colour_match)
## [1] "blue"   "green"  "orange"
#> [1] "blue"   "green"  "orange"
str_extract_all(more, colour_match)
## [[1]]
## [1] "blue" "red" 
## 
## [[2]]
## [1] "green" "red"  
## 
## [[3]]
## [1] "orange" "red"
#> [[1]]
#> [1] "blue" "red" 
#> 
#> [[2]]
#> [1] "green" "red"  
#> 
#> [[3]]
#> [1] "orange" "red"
str_extract_all(more, colour_match, simplify = TRUE)
##      [,1]     [,2] 
## [1,] "blue"   "red"
## [2,] "green"  "red"
## [3,] "orange" "red"
#>      [,1]     [,2] 
#> [1,] "blue"   "red"
#> [2,] "green"  "red"
#> [3,] "orange" "red"

x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
##      [,1] [,2] [,3]
## [1,] "a"  ""   ""  
## [2,] "a"  "b"  ""  
## [3,] "a"  "b"  "c"
#>      [,1] [,2] [,3]
#> [1,] "a"  ""   ""  
#> [2,] "a"  "b"  ""  
#> [3,] "a"  "b"  "c"

Grouped matches

noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>% 
  str_extract(noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
#>  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
#>  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
has_noun %>% 
  str_match(noun)
##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"
#>       [,1]         [,2]  [,3]     
#>  [1,] "the smooth" "the" "smooth" 
#>  [2,] "the sheet"  "the" "sheet"  
#>  [3,] "the depth"  "the" "depth"  
#>  [4,] "a chicken"  "a"   "chicken"
#>  [5,] "the parked" "the" "parked" 
#>  [6,] "the sun"    "the" "sun"    
#>  [7,] "the huge"   "the" "huge"   
#>  [8,] "the ball"   "the" "ball"   
#>  [9,] "the woman"  "the" "woman"  
#> [10,] "a helps"    "a"   "helps"
tibble(sentence = sentences) %>% 
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the) ([^ ]+)", 
    remove = FALSE
  )
## # A tibble: 720 × 3
##    sentence                                    article noun   
##    <chr>                                       <chr>   <chr>  
##  1 The birch canoe slid on the smooth planks.  the     smooth 
##  2 Glue the sheet to the dark blue background. the     sheet  
##  3 It's easy to tell the depth of a well.      the     depth  
##  4 These days a chicken leg is a rare dish.    a       chicken
##  5 Rice is often served in round bowls.        <NA>    <NA>   
##  6 The juice of lemons makes fine punch.       <NA>    <NA>   
##  7 The box was thrown beside the parked truck. the     parked 
##  8 The hogs were fed chopped corn and garbage. <NA>    <NA>   
##  9 Four hours of steady work faced us.         <NA>    <NA>   
## 10 A large size in stockings is hard to sell.  <NA>    <NA>   
## # ℹ 710 more rows
#> # A tibble: 720 × 3
#>   sentence                                    article noun   
#>   <chr>                                       <chr>   <chr>  
#> 1 The birch canoe slid on the smooth planks.  the     smooth 
#> 2 Glue the sheet to the dark blue background. the     sheet  
#> 3 It's easy to tell the depth of a well.      the     depth  
#> 4 These days a chicken leg is a rare dish.    a       chicken
#> 5 Rice is often served in round bowls.        <NA>    <NA>   
#> 6 The juice of lemons makes fine punch.       <NA>    <NA>   
#> # ℹ 714 more rows

Replacing matches

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
## [1] "-pple"  "p-ar"   "b-nana"
#> [1] "-pple"  "p-ar"   "b-nana"
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-"  "p--r"   "b-n-n-"
#> [1] "-ppl-"  "p--r"   "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house"    "two cars"     "three people"
#> [1] "one house"    "two cars"     "three people"
sentences %>% 
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% 
  head(5)
## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."
#> [1] "The canoe birch slid on the smooth planks." 
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."     
#> [4] "These a days chicken leg is a rare dish."   
#> [5] "Rice often is served in round bowls."

Splitting

"a|b|c|d" %>% 
  str_split("\\|") %>% 
  .[[1]]
## [1] "a" "b" "c" "d"
#> [1] "a" "b" "c" "d"
sentences %>%
  head(5) %>% 
  str_split(" ", simplify = TRUE)
##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
##      [,9]   
## [1,] ""     
## [2,] ""     
## [3,] "well."
## [4,] "dish."
## [5,] ""
#>      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
#> [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
#> [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
#> [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
#> [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
#> [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
#>      [,9]   
#> [1,] ""     
#> [2,] ""     
#> [3,] "well."
#> [4,] "dish."
#> [5,] ""

Other Types of Pattern

# The regular call:
str_view(fruit, "nana")
## [4] │ ba<nana>
# Is shorthand for
str_view(fruit, regex("nana"))
## [4] │ ba<nana>
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
## [1] │ <banana>
#> [1] │ <banana>
str_view(bananas, regex("banana", ignore_case = TRUE))
## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>
#> [1] │ <banana>
#> [2] │ <Banana>
#> [3] │ <BANANA>
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
## [1] │ <banana>
#> [1] │ <banana>
str_view(bananas, regex("banana", ignore_case = TRUE))
## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>
#> [1] │ <banana>
#> [2] │ <Banana>
#> [3] │ <BANANA>
x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")[[1]]
## [1] "Line"
#> [1] "Line"
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]
## [1] "Line" "Line" "Line"
#> [1] "Line" "Line" "Line"
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{3}) # three more numbers
  ", comments = TRUE)

str_match("514-791-8141", phone)
##      [,1]          [,2]  [,3]  [,4] 
## [1,] "514-791-814" "514" "791" "814"
#>      [,1]          [,2]  [,3]  [,4] 
#> [1,] "514-791-814" "514" "791" "814"
# That means you also need to be aware of the difference
# when doing case insensitive matches:
i <- c("I", "İ", "i", "ı")
i
## [1] "I" "İ" "i" "ı"
#> [1] "I" "İ" "i" "ı"

str_subset(i, coll("i", ignore_case = TRUE))
## [1] "I" "i"
#> [1] "I" "i"
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "İ" "i"
#> [1] "İ" "i"
x <- "This is a sentence."
str_view_all(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.
#> [1] │ <This> <is> <a> <sentence>.
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This"     "is"       "a"        "sentence"
#> [[1]]
#> [1] "This"     "is"       "a"        "sentence"

Other

apropos("replace")
## [1] "%+replace%"       "replace"          "replace_na"       "setReplaceMethod"
## [5] "str_replace"      "str_replace_all"  "str_replace_na"   "theme_replace"
#> [1] "%+replace%"       "replace"          "replace_na"       "setReplaceMethod"
#> [5] "str_replace"      "str_replace_all"  "str_replace_na"   "theme_replace"
head(dir(pattern = "\\.Rmd$"))
## [1] "CodeAlong9.Rmd"
#> [1] "communicate-plots.Rmd" "communicate.Rmd"       "datetimes.Rmd"        
#> [4] "EDA.Rmd"               "explore.Rmd"           "factors.Rmd"