Introduction

String Basics

# Delimited by double quotes, so the inner single quotes need no escaping.
chac_data <- "Im 'very' Hungry"

# Two one-character strings: an escaped double quote and an escaped backslash.
x <- c("\"", "\\")
# Printing shows the escaped representation ...
print(x)
## [1] "\"" "\\"
# ... whereas writeLines() emits the raw characters, one element per line.
writeLines(x)
## "
## \
# Several strings held in one character vector.
c("one", "two", "three")
## [1] "one"   "two"   "three"

String Length

# Number of characters in the string (spaces and the full stop included).
str_length("I am Hungry.")
## [1] 12

Combining Strings

# With an empty separator the pieces are glued together directly.
str_c("I", "am", sep = "")
## [1] "Iam"
# `sep` is placed between the pieces.
str_c("I", "am", sep = ";")
## [1] "I;am"
x <- c("abc", NA)
# A missing piece makes the whole combined string missing.
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
# str_replace_na() turns NA into the literal text "NA" before combining.
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
# Length-one pieces are recycled along the longer vector.
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
birthday <- FALSE
time_of_day <- "morning"
name <- "Hadley"

# An `if` without `else` yields NULL when its condition is FALSE; the output
# below shows that piece is then simply left out.
str_c(
  "Good ",
  time_of_day,
  " ",
  name,
  if (birthday) {
    " and HAPPY BIRTHDAY"
  },
  "."
)
## [1] "Good morning Hadley."
# `collapse` joins the elements of one vector into a single string.
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"

Subsetting Strings

x <- c("apple", "banana", "pear")
# Positive positions count from the start of each string ...
str_sub(x, 1, 3)
## [1] "app" "ban" "pea"
# ... negative positions count back from the end.
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
# A range that runs past the end of the string returns what is available.
str_sub("a", 1, 5)
## [1] "a"
# Assignment form: overwrite the first character of every element.
# Upper-casing is used because x is already lower case, so lower-casing it
# would leave x unchanged and show nothing.
str_sub(x, 1, 1) <- str_to_upper(str_sub(x, 1, 1))
x
## [1] "Apple"  "Banana" "Pear"

Locales

# Case conversion is locale dependent: Turkish has a dotted and a dotless i.
str_to_upper(c("i", "ı"), locale = "en")
## [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "İ" "I"
x <- c("apple", "eggplant", "banana")

# Sort order is locale dependent as well: English versus Hawaiian.
str_sort(x, locale = "en")
## [1] "apple"    "banana"   "eggplant"
str_sort(x, locale = "haw")
## [1] "apple"    "eggplant" "banana"

Matching Patterns With Regular Expressions

# Column-wise overview of the flights table (name, type, first values).
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Small working sample: only the character columns, first ten rows.
flights_small <- head(select(flights, where(is.character)), n = 10)

Basic Matches

# A literal "M" anywhere in origin (no row in the sample has one).
filter(flights_small, str_detect(origin, "M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "." stands for any single character: any character followed by "M".
filter(flights_small, str_detect(origin, ".M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "M" followed by any character.
filter(flights_small, str_detect(origin, "M."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "\\." is an escaped dot: "M" followed by a literal full stop.
filter(flights_small, str_detect(origin, "M\\."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Anchors

# "^" anchors the match at the start of the string: origins beginning with E.
filter(flights_small, str_detect(origin, "^E"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL
# "$" anchors the match at the end of the string: origins ending in A.
filter(flights_small, str_detect(origin, "A$"))
## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N24211  LGA    IAH  
## 2 DL      N668DN  LGA    ATL  
## 3 EV      N829AS  LGA    IAD  
## 4 AA      N3ALAA  LGA    ORD

Character Classes and Alternatives

# "\\d" matches any digit.
filter(flights_small, str_detect(carrier, "\\d"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 B6      N804JB  JFK    BQN  
## 2 B6      N516JB  EWR    FLL  
## 3 B6      N593JB  JFK    MCO
# "\\s" matches any whitespace character.
filter(flights_small, str_detect(carrier, "\\s"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "[ABD]" matches any one of A, B or D.
filter(flights_small, str_detect(carrier, "[ABD]"))
## # A tibble: 9 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 B6      N804JB  JFK    BQN  
## 5 DL      N668DN  LGA    ATL  
## 6 UA      N39463  EWR    ORD  
## 7 B6      N516JB  EWR    FLL  
## 8 B6      N593JB  JFK    MCO  
## 9 AA      N3ALAA  LGA    ORD
# "[^ABD]" matches any one character other than A, B or D, so a carrier is
# kept as soon as it contains at least one such character.
filter(flights_small, str_detect(carrier, "[^ABD]"))
## # A tibble: 8 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 B6      N804JB  JFK    BQN  
## 4 DL      N668DN  LGA    ATL  
## 5 UA      N39463  EWR    ORD  
## 6 B6      N516JB  EWR    FLL  
## 7 EV      N829AS  LGA    IAD  
## 8 B6      N593JB  JFK    MCO

Repetition

# "?" means 0 or 1 occurrence; zero is allowed, so every row matches.
filter(flights_small, str_detect(carrier, "A?"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD
# "+" means 1 or more occurrences; only carriers containing an A remain.
filter(flights_small, str_detect(carrier, "A+"))
## # A tibble: 5 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 UA      N39463  EWR    ORD  
## 5 AA      N3ALAA  LGA    ORD
# "*" means 0 or more occurrences; again every row matches.
filter(flights_small, str_detect(carrier, "A*"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD

Grouping and Backreferences

# (..)\\1 : a captured group followed by a repeat of whatever it captured.
# Here the group is two digits, so the pattern finds a doubled pair of digits.

filter(flights_small, str_detect(tailnum, "(\\d\\d)\\1"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# The ten-row sample has no such tail number, so search the full table.
filter(
  select(flights, where(is.character)),
  str_detect(tailnum, "(\\d\\d)\\1")
)
## # A tibble: 1,990 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 EV      N15555  EWR    MKE  
##  2 EV      N11119  LGA    CLE  
##  3 UA      N14242  EWR    TPA  
##  4 EV      N14143  EWR    PIT  
##  5 EV      N15555  EWR    SAV  
##  6 UA      N12125  EWR    LAX  
##  7 EV      N15555  EWR    PWM  
##  8 EV      N15555  EWR    BUF  
##  9 EV      N15555  EWR    RIC  
## 10 EV      N13133  EWR    DTW  
## # ℹ 1,980 more rows

Tools

Detect matches

# Count the tail numbers that end in 8, inside a data-frame summary ...
summarise(flights_small, sum(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
##   `sum(str_detect(tailnum, "8$"))`
##                              <int>
## 1                                1
# ... and on the bare column: one logical value per tail number.
str_detect(flights_small[["tailnum"]], "8$")
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Summing the logical vector counts the matches,
sum(str_detect(flights_small[["tailnum"]], "8$"))
## [1] 1
# and its mean is the proportion of matches.
mean(str_detect(flights_small[["tailnum"]], "8$"))
## [1] 0.1

Extract matches

# Build one alternation pattern out of the colour names.
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
print(colour_match)
## [1] "red|orange|yellow|green|blue|purple"
# Keep the sentences that mention a colour, then pull out the first colour
# found in each of them.
has_colour <- str_subset(sentences, colour_match)
matches <- str_extract(has_colour, colour_match)
head(matches, n = 6)
## [1] "blue" "blue" "red"  "red"  "red"  "blue"

Grouped matches

# Rough noun finder: "a" or "the", a space, then a run of non-space characters.
# NOTE(review): the pattern is not anchored at a word boundary; the "a helps"
# in the output below is presumably the tail of a word ending in "a" -- confirm
# whether "\\b(a|the) ([^ ]+)" was intended.
noun <- "(a|the) ([^ ]+)"
has_noun <- head(str_subset(sentences, noun), 10)
# str_extract() returns the complete match only.
str_extract(has_noun, noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
# str_match() returns a matrix: the complete match plus one column per group.
str_match(has_noun, noun)
##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"

Replacing matches

x <- c("apple", "pear", "banana")
# str_replace() changes only the first match in each string ...
str_replace(x, "[aeiou]", "-")
## [1] "-pple"  "p-ar"   "b-nana"
# ... str_replace_all() changes every match.
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-"  "p--r"   "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
# A named vector supplies several pattern = replacement pairs at once.
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house"    "two cars"     "three people"
# Backreferences in the replacement: swap the second and third words.
head(
  str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"),
  5
)
## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."
# Replace only a leading capital letter of each tail number.
mutate(flights_small, tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619AA     
##  4 B6      N804JB  JFK    BQN   -804JB     
##  5 DL      N668DN  LGA    ATL   -668DN     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516JB     
##  8 EV      N829AS  LGA    IAD   -829AS     
##  9 B6      N593JB  JFK    MCO   -593JB     
## 10 AA      N3ALAA  LGA    ORD   -3ALAA
# Replace every capital letter of each tail number.
mutate(flights_small, tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619--     
##  4 B6      N804JB  JFK    BQN   -804--     
##  5 DL      N668DN  LGA    ATL   -668--     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516--     
##  8 EV      N829AS  LGA    IAD   -829--     
##  9 B6      N593JB  JFK    MCO   -593--     
## 10 AA      N3ALAA  LGA    ORD   -3----

Splitting

# Split the first five sentences on spaces; the result is a list holding one
# character vector per sentence.
str_split(head(sentences, 5), " ")
## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
# For a single input string, take the first (only) list element. The "|" is
# escaped because it is the regex alternation operator.
str_split("a|b|c|d", "\\|")[[1]]
## [1] "a" "b" "c" "d"
# simplify = TRUE returns a character matrix instead; shorter rows are padded
# with "".
str_split(head(sentences, 5), " ", simplify = TRUE)
##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
##      [,9]   
## [1,] ""     
## [2,] ""     
## [3,] "well."
## [4,] "dish."
## [5,] ""
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
# `n` caps the number of pieces each string is split into.
str_split(fields, ": ", n = 2, simplify = TRUE)
##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"
x <- "This is a sentence.  This is another sentence."
# str_view() is used because str_view_all() is deprecated as of stringr 1.5.0.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.  <This> <is> <another> <sentence>.
# Splitting on a single space leaves an empty string where two spaces meet
# and keeps the punctuation attached to the words ...
str_split(x, " ")[[1]]
## [1] "This"      "is"        "a"         "sentence." ""          "This"     
## [7] "is"        "another"   "sentence."
# ... splitting on word boundaries returns just the words.
str_split(x, boundary("word"))[[1]]
## [1] "This"     "is"       "a"        "sentence" "This"     "is"       "another" 
## [8] "sentence"

Other types of patterns

# regex(ignore_case = TRUE): a lower-case pattern also matches upper case.
filter(
  flights_small,
  str_detect(tailnum, regex("^n", ignore_case = TRUE))
)
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD
filter(
  flights_small,
  str_detect(origin, regex("^e", ignore_case = TRUE))
)
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL
bananas <- c("banana", "Banana", "BANANA")
# A plain pattern is case sensitive: only the all-lower-case element matches.
str_view(bananas, "banana")
## [1] │ <banana>
str_view(bananas, regex("banana", ignore_case = TRUE))
## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>
x <- "Line 1\nLine 2\nLine 3"
# By default "^" matches only at the start of the whole string ...
str_extract_all(x, "^Line")[[1]]
## [1] "Line"
# ... with multiline = TRUE it matches at the start of every line.
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]
## [1] "Line" "Line" "Line"
# Phone-number pattern written with comments = TRUE, which lets the pattern be
# laid out over several lines with "#" comments. The groups follow a 3-3-4
# layout; the last group takes four digits so that the final digit of a number
# such as "514-791-8141" is not dropped from the match.
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # four more numbers
  ", comments = TRUE)
str_match("514-791-8141", phone)
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "514-791-8141" "514" "791" "8141"
# The four "i"-like letters: plain and dotted capitals, dotted and dotless
# lower case.
i <- c("I", "İ", "i", "ı")
print(i)
## [1] "I" "İ" "i" "ı"
# coll() compares using locale-specific collation rules; with English rules
# "i" pairs with "I" ...
str_subset(i, coll("i", ignore_case = TRUE, locale = "en"))
## [1] "I" "i"
# ... with Turkish rules it pairs with the dotted capital instead.
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "İ" "i"
x <- "This is a sentence."
# str_view() is used because str_view_all() is deprecated as of stringr 1.5.0.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.
# Extract the words themselves; the result is a list with one element per
# input string.
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This"     "is"       "a"        "sentence"