# String Basics ----
# c() combines individual strings into a character vector.
c("one", "two", "three")
#> [1] "one" "two" "three"

# Either quote character can delimit a string. Delimiting with single quotes
# lets double quotes appear inside the string without escaping.
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'

# A literal quote can be written either by switching the delimiter or by
# escaping it with a backslash; both spellings give the same string.
double_quote <- '"' # or "\""
single_quote <- "'" # or '\''

# The printed representation of a string shows its escapes ...
x <- c(double_quote, "\\")
print(x)
#> [1] "\"" "\\"

# ... whereas writeLines() shows the raw contents, one element per line.
writeLines(x)
#> "
#> \
# String Length ----
# str_length() reports the number of characters in each element; the NA
# element comes back as NA.
c("a", "R for data science", NA) %>%
  str_length()
#> [1] 1 18 NA
# Combining Strings ----
# str_c() joins its arguments into one string.
str_c("x", "y")
#> [1] "xy"
str_c("x", "y", "z")
#> [1] "xyz"

# `sep` is the text placed between the arguments.
str_c("x", "y", sep = ", ")
#> [1] "x, y"

# `collapse` joins the elements of a single vector into one string.
c("x", "y", "z") %>%
  str_c(collapse = ", ")
#> [1] "x, y, z"

# Build a greeting. The `if` without an `else` yields NULL when `birthday` is
# FALSE, and the output shows that the birthday clause is then left out.
birthday <- FALSE
time_of_day <- "morning"
name <- "Hadley"
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) {
    " and HAPPY BIRTHDAY"
  },
  "."
)
#> [1] "Good morning Hadley."
# Subsetting Strings ----
x <- c("Apple", "Banana", "Pear")

# Extract characters by position; both `start` and `end` are inclusive.
str_sub(x, start = 1, end = 3)
#> [1] "App" "Ban" "Pea"

# negative numbers count backwards from end
str_sub(x, start = -3, end = -1)
#> [1] "ple" "ana" "ear"

# A range that runs past the end of the string returns what is available.
str_sub("a", start = 1, end = 5)
#> [1] "a"

# The assignment form rewrites the selected range: here the first character
# of every element is replaced by its lower-case version.
str_sub(x, start = 1, end = 1) <- str_to_lower(str_sub(x, start = 1, end = 1))
print(x)
#> [1] "apple" "banana" "pear"
# Locales ----
# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
c("i", "ı") %>%
  str_to_upper()
#> [1] "I" "I"
c("i", "ı") %>%
  str_to_upper(locale = "tr")
#> [1] "İ" "I"

# Sort order depends on the locale as well.
x <- c("apple", "eggplant", "banana")
x %>% str_sort(locale = "en") # English
#> [1] "apple" "banana" "eggplant"
x %>% str_sort(locale = "haw") # Hawaiian
#> [1] "apple" "eggplant" "banana"
# Matching Patterns with Regular Expressions ----
x <- c("apple", "banana", "pear")

# A plain pattern matches that exact sequence of characters.
str_view(string = x, pattern = "an")
#> [2] │ b<an><an>a

# `.` matches any single character, so this finds an "a" with one character
# on each side.
str_view(string = x, pattern = ".a.")
#> [2] │ <ban>ana
#> [3] │ p<ear>

# To create the regular expression, we need \\
dot <- "\\."
# But the expression itself only contains one:
writeLines(dot)
#> \.
# And this tells R to look for an explicit .
str_view(string = c("abc", "a.c", "bef"), pattern = "a\\.c")
#> [2] │ <a.c>

# Matching one literal backslash takes four in the source: the string
# literal halves them to two, and the regex reads those two as one.
x <- "a\\b"
writeLines(x)
#> a\b
str_view(string = x, pattern = "\\\\")
#> [1] │ a<\>b
# Anchors ----
x <- c("apple", "banana", "pear")

# `^` anchors the match to the start of the string.
x %>% str_view("^a")
#> [1] │ <a>pple

# `$` anchors the match to the end of the string.
x %>% str_view("a$")
#> [2] │ banan<a>

# Without anchors the pattern may match anywhere inside the string ...
x <- c("apple pie", "apple", "apple cake")
x %>% str_view("apple")
#> [1] │ <apple> pie
#> [2] │ <apple>
#> [3] │ <apple> cake

# ... with both anchors, the whole string has to match.
x %>% str_view("^apple$")
#> [2] │ <apple>
# Character Classes and Alternatives ----
# Look for a literal character that normally has special meaning in a regex
c("abc", "a.c", "a*c", "a c") %>%
  str_view("a[.]c")
#> [2] │ <a.c>
c("abc", "a.c", "a*c", "a c") %>%
  str_view(".[*]c")
#> [3] │ <a*c>
c("abc", "a.c", "a*c", "a c") %>%
  str_view("a[ ]")
#> [4] │ <a >c

# `|` picks between alternatives; the parentheses limit its scope to the
# single letter that differs.
c("grey", "gray") %>%
  str_view("gr(e|a)y")
#> [1] │ <grey>
#> [2] │ <gray>
# Repetition ----
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"

# `?` makes the preceding item optional (zero or one occurrence).
str_view(x, pattern = "CC?")
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII

# `+` requires one or more occurrences.
str_view(x, pattern = "CC+")
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, pattern = "C[LX]+")
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII

# Braces give explicit counts: {n} exactly n, {n,} n or more, {n,m} between
# n and m.
str_view(x, pattern = "C{2}")
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, pattern = "C{2,}")
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, pattern = "C{2,3}")
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII

# A trailing `?` after a quantifier makes it match the shortest string
# possible instead of the longest.
str_view(x, pattern = "C{2,3}?")
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, pattern = "C[LX]+?")
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII
# Grouping and Backreferencing ----
# `(..)` captures any two characters and `\\1` must repeat that same text, so
# this highlights an immediately repeated pair of letters.
fruit %>%
  str_view("(..)\\1", match = TRUE)
#> [4] │ b<anan>a
#> [20] │ <coco>nut
#> [22] │ <cucu>mber
#> [41] │ <juju>be
#> [56] │ <papa>ya
#> [73] │ s<alal> berry
# Tools ----
# Detect Matches ----
# str_detect() returns one logical per element: does the pattern match?
x <- c("apple", "banana", "pear")
str_detect(x, "e")
#> [1] TRUE FALSE TRUE

# Logicals count as 0/1 in arithmetic, so sum() gives the number of matching
# elements and mean() gives the proportion.
sum(str_detect(words, "^t"))
#> [1] 65
mean(str_detect(words, "[aeiou]$"))
#> [1] 0.2765306

# Keep only the matching elements: logical subsetting and str_subset() give
# the same result.
words[str_detect(words, "x$")]
#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
#> [1] "box" "sex" "six" "tax"

# Matches never overlap: "aba" is counted twice in "abababa", not three times.
str_count("abababa", "aba")
#> [1] 2

# str_view() highlights every match. It replaces str_view_all(), which was
# deprecated in stringr 1.5.0 and emits a lifecycle warning when called.
str_view("abababa", "aba")
#> [1] │ <aba>b<aba>
# Grouped Matches ----
# Pattern: "a" or "the", a space, then a run of non-space characters. Each
# parenthesised part is a capture group.
noun <- "(a|the) ([^ ]+)"

# The first ten sentences that contain a match.
has_noun <- head(str_subset(sentences, noun), 10)

# str_extract() returns the complete match for each sentence.
str_extract(has_noun, noun)
#> [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
#> [6] "the sun" "the huge" "the ball" "the woman" "a helps"

# str_match() returns a matrix: the complete match in the first column,
# followed by one column per capture group.
str_match(has_noun, noun)
#> [,1] [,2] [,3]
#> [1,] "the smooth" "the" "smooth"
#> [2,] "the sheet" "the" "sheet"
#> [3,] "the depth" "the" "depth"
#> [4,] "a chicken" "a" "chicken"
#> [5,] "the parked" "the" "parked"
#> [6,] "the sun" "the" "sun"
#> [7,] "the huge" "the" "huge"
#> [8,] "the ball" "the" "ball"
#> [9,] "the woman" "the" "woman"
#> [10,] "a helps" "a" "helps"

# For data in a tibble, tidyr::extract() puts each capture group into its own
# named column; remove = FALSE keeps the source column alongside them.
tidyr::extract(
  tibble(sentence = sentences),
  col = sentence,
  into = c("article", "noun"),
  regex = "(a|the) ([^ ]+)",
  remove = FALSE
)
#> # A tibble: 720 × 3
#> sentence article noun
#> <chr> <chr> <chr>
#> 1 The birch canoe slid on the smooth planks. the smooth
#> 2 Glue the sheet to the dark blue background. the sheet
#> 3 It's easy to tell the depth of a well. the depth
#> 4 These days a chicken leg is a rare dish. a chicken
#> 5 Rice is often served in round bowls. <NA> <NA>
#> 6 The juice of lemons makes fine punch. <NA> <NA>
#> 7 The box was thrown beside the parked truck. the parked
#> 8 The hogs were fed chopped corn and garbage. <NA> <NA>
#> 9 Four hours of steady work faced us. <NA> <NA>
#> 10 A large size in stockings is hard to sell. <NA> <NA>
#> # ℹ 710 more rows
# Replacing Matches ----
x <- c("apple", "pear", "banana")

# str_replace() changes only the first match in each string ...
str_replace(x, pattern = "[aeiou]", replacement = "-")
#> [1] "-pple" "p-ar" "b-nana"

# ... str_replace_all() changes every match.
str_replace_all(x, pattern = "[aeiou]", replacement = "-")
#> [1] "-ppl-" "p--r" "b-n-n-"

# A named vector performs several replacements in one call: each name is a
# pattern and each value is its replacement.
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
#> [1] "one house" "two cars" "three people"

# Backreferences in the replacement insert captured groups; here the second
# and third words of each sentence are swapped.
head(str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"), 5)
#> [1] "The canoe birch slid on the smooth planks."
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."
#> [4] "These a days chicken leg is a rare dish."
#> [5] "Rice often is served in round bowls."
# Splitting ----
# str_split() returns a list with one element per input string, so a single
# input is unwrapped with [[1]]. The `|` is escaped because it is a regex
# metacharacter.
str_split("a|b|c|d", "\\|")[[1]]
#> [1] "a" "b" "c" "d"

# simplify = TRUE returns a character matrix instead of a list; the output
# shows shorter rows padded with empty strings.
str_split(head(sentences, 5), " ", simplify = TRUE)
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
#> [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background."
#> [3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a"
#> [4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare"
#> [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ""
#> [,9]
#> [1,] ""
#> [2,] ""
#> [3,] "well."
#> [4,] "dish."
#> [5,] ""
# Other Types of Pattern ----
# The regular call:
str_view(fruit, "nana")
#> [4] │ ba<nana>
# Is shorthand for
str_view(fruit, regex("nana"))
#> [4] │ ba<nana>

# Calling regex() explicitly lets you pass matching options. A bare pattern
# is case sensitive; ignore_case = TRUE matches every capitalisation.
# (This demonstration previously appeared twice verbatim; the second copy
# added nothing and has been removed.)
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
#> [1] │ <banana>
str_view(bananas, regex("banana", ignore_case = TRUE))
#> [1] │ <banana>
#> [2] │ <Banana>
#> [3] │ <BANANA>
x <- "Line 1\nLine 2\nLine 3"

# By default `^` anchors only at the very start of the whole string ...
x %>%
  str_extract_all("^Line") %>%
  .[[1]]
#> [1] "Line"

# ... with multiline = TRUE it anchors at the start of every line.
x %>%
  str_extract_all(regex("^Line", multiline = TRUE)) %>%
  .[[1]]
#> [1] "Line" "Line" "Line"
# comments = TRUE allows whitespace and #-comments inside the pattern so that
# a complex expression can be documented piece by piece.
#
# The final group captures FOUR digits. It was previously \\d{3}, which cut
# "514-791-8141" short: the match came back as "514-791-814" with a last
# group of "814", silently dropping the final digit.
phone <- regex("
\\(? # optional opening parens
(\\d{3}) # area code
[) -]? # optional closing parens, space, or dash
(\\d{3}) # another three numbers
[ -]? # optional space or dash
(\\d{4}) # four more numbers
", comments = TRUE)

# Column 1 is the complete match; columns 2-4 are the three capture groups.
str_match("514-791-8141", phone)
#> [,1] [,2] [,3] [,4]
#> [1,] "514-791-8141" "514" "791" "8141"
# That means you also need to be aware of the difference
# when doing case insensitive matches:
i <- c("I", "İ", "i", "ı")
print(i)
#> [1] "I" "İ" "i" "ı"

# coll() compares using collation rules. With no locale given, "I" and "i"
# are the case-insensitive matches for "i" ...
i %>% str_subset(coll("i", ignore_case = TRUE))
#> [1] "I" "i"

# ... while under the Turkish locale the matches are "İ" and "i".
i %>% str_subset(coll("i", ignore_case = TRUE, locale = "tr"))
#> [1] "İ" "i"
# boundary("word") matches on word boundaries rather than on a regular
# expression; the output shows the trailing full stop left out of the words.
x <- "This is a sentence."

# str_view() highlights every match. It replaces str_view_all(), which was
# deprecated in stringr 1.5.0.
str_view(x, boundary("word"))
#> [1] │ <This> <is> <a> <sentence>.
str_extract_all(x, boundary("word"))
#> [[1]]
#> [1] "This" "is" "a" "sentence"