library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Strings

String basics

string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'
double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"
x <- c("\"", "\\")
x
## [1] "\"" "\\"
#> [1] "\"" "\\"
writeLines(x)
## "
## \
#> "
#> \
x <- "\u00b5"
x
## [1] "µ"
#> [1] "µ"
c("one", "two", "three")
## [1] "one"   "two"   "three"
#> [1] "one"   "two"   "three"
str_length(c("a", "R for data science", NA))
## [1]  1 18 NA
#> [1]  1 18 NA
str_c("x", "y")
## [1] "xy"
#> [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
#> [1] "xyz"
str_c("x", "y", sep = ", ")
## [1] "x, y"
#> [1] "x, y"
x <- c("abc", NA)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
#> [1] "|-abc-|" "|-NA-|"
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
## [1] "Good morning Hadley."
#> [1] "Good morning Hadley."
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
#> [1] "x, y, z"
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
#> [1] "App" "Ban" "Pea"
# negative numbers count backwards from end
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
#> [1] "ple" "ana" "ear"
str_sub("a", 1, 5)
## [1] "a"
#> [1] "a"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
## [1] "apple"  "banana" "pear"
#> [1] "apple"  "banana" "pear"
# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "ı"))
## [1] "I" "I"
#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "İ" "I"
#> [1] "İ" "I"
x <- c("apple", "eggplant", "banana")

str_sort(x, locale = "en")  # English
## [1] "apple"    "banana"   "eggplant"
#> [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") # Hawaiian
## [1] "apple"    "eggplant" "banana"
#> [1] "apple"    "eggplant" "banana"

Exercises

  1. In code that doesn’t use stringr, you’ll often see paste() and paste0(). What’s the difference between the two functions? What stringr function are they equivalent to? How do the functions differ in their handling of NA? The function paste() separates strings by spaces by default, while paste0() does not separate strings with spaces by default. Since str_c() does not separate strings with spaces by default, it is closer in behavior to paste0(). However, str_c() and the paste functions handle NA differently. The function str_c() propagates NA: if any argument is a missing value, it returns a missing value. This is in line with how the numeric R functions, e.g. sum() and mean(), handle missing values. The paste functions, however, convert NA to the string “NA” and then treat it like any other character vector.

  2. In your own words, describe the difference between the sep and collapse arguments to str_c(). The sep argument is the string inserted between the arguments to str_c(), while collapse is the string inserted between the elements of the resulting character vector when they are combined into a single string (a character vector of length one).

  3. Use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters? The middle character is at position m = ceiling(n / 2), where n = str_length(x), and can be extracted with str_sub(x, m, m). If the string has an even number of characters the choice is arbitrary; ceiling(n / 2) selects the left of the two central characters, and it also works when the string is only of length one.

  4. What does str_wrap() do? When might you want to use it? The function str_wrap() wraps text so that it fits within a certain width. This is useful for wrapping long strings of text to be typeset.

  5. What does str_trim() do? What’s the opposite of str_trim()? The function str_trim() trims the whitespace from the start and end of a string.

str_trim(" abc ")
## [1] "abc"
#> [1] "abc"
str_trim(" abc ", side = "left")
## [1] "abc "
#> [1] "abc "
str_trim(" abc ", side = "right")
## [1] " abc"
#> [1] " abc"

The opposite of str_trim() is str_pad() which adds characters to each side.

str_pad("abc", 5, side = "both")
## [1] " abc "
#> [1] " abc "
str_pad("abc", 4, side = "right")
## [1] "abc "
#> [1] "abc "
str_pad("abc", 4, side = "left")
## [1] " abc"
#> [1] " abc"
  1. Write a function that turns (e.g.) a vector c(“a”, “b”, “c”) into the string a, b, and c. Think carefully about what it should do if given a vector of length 0, 1, or 2. This function needs to handle four cases.

n == 0: an empty string, e.g. "". n == 1: the original vector, e.g. "a". n == 2: return the two elements separated by "and", e.g. "a and b". n > 2: return the first n - 1 elements separated by commas, and the last element separated by a comma and "and", e.g. "a, b, and c".

#' Join the elements of a vector into an English-style list
#'
#' @param x Vector whose elements are joined.
#' @param delim String appended to every element except the last when `x`
#'   has more than two elements.
#' @return `""` for an empty `x`, `x` itself when it has one element,
#'   `"a and b"` for two elements, and `"a, b, and c"` style output otherwise.
str_commasep <- function(x, delim = ",") {
  count <- length(x)

  # Short inputs are handled up front, one guard per case
  if (count == 0) {
    return("")
  }
  if (count == 1) {
    return(x)
  }
  if (count == 2) {
    # two elements are joined by "and" alone, without a delimiter
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }

  # every element but the last gets the delimiter appended
  leading <- str_c(x[-count], delim)
  # the last element is introduced by "and"
  closing <- str_c("and", x[[count]], sep = " ")
  # glue the pieces together with single spaces
  str_c(c(leading, closing), collapse = " ")
}
# character() has length 0, so this exercises the n == 0 branch;
# str_commasep("") would pass a length-one vector and hit n == 1 instead.
str_commasep(character())
## [1] ""
#> [1] ""
str_commasep("a")
## [1] "a"
#> [1] "a"
str_commasep(c("a", "b"))
## [1] "a and b"
#> [1] "a and b"
str_commasep(c("a", "b", "c"))
## [1] "a, b, and c"
#> [1] "a, b, and c"
str_commasep(c("a", "b", "c", "d"))
## [1] "a, b, c, and d"
#> [1] "a, b, c, and d"

Matching patterns with regular expressions

x <- c("apple", "banana", "pear")
str_view(x, "an")
## [2] │ b<an><an>a
#> [2] │ b<an><an>a
str_view(x, ".a.")
## [2] │ <ban>ana
## [3] │ p<ear>
#> [2] │ <ban>ana
#> [3] │ p<ear>
# To create the regular expression, we need \\
dot <- "\\."

# But the expression itself only contains one:
writeLines(dot)
## \.
#> \.

# And this tells R to look for an explicit .
str_view(c("abc", "a.c", "bef"), "a\\.c")
## [2] │ <a.c>
#> [2] │ <a.c>
x <- "a\\b"
writeLines(x)
## a\b
#> a\b

str_view(x, "\\\\")
## [1] │ a<\>b
#> [1] │ a<\>b

Exercises

  1. Explain why each of these strings doesn’t match a \: "\", "\\", "\\\". "\": This will escape the next character in the R string. "\\": This will resolve to \ in the regular expression, which will escape the next character in the regular expression. "\\\": The first two backslashes will resolve to a literal backslash in the regular expression, the third will escape the next character. So in the regular expression, this will escape some escaped character.

  2. How would you match the sequence "'\?

str_view("\"'\\", "\"'\\\\", match = TRUE)
## [1] │ <"'\>
  1. What patterns will the regular expression \..\..\.. match? How would you represent it as a string? It will match any patterns that are a dot followed by any character, repeated three times. As a string it is written "\\..\\..\\..".
str_view(c(".a.b.c", ".a.b", "....."), c("\\..\\..\\.."), match = TRUE)
## [1] │ <.a.b.c>

Anchors

x <- c("apple", "banana", "pear")
str_view(x, "^a")
## [1] │ <a>pple
#> [1] │ <a>pple
str_view(x, "a$")
## [2] │ banan<a>
#> [2] │ banan<a>
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
## [1] │ <apple> pie
## [2] │ <apple>
## [3] │ <apple> cake
#> [1] │ <apple> pie
#> [2] │ <apple>
#> [3] │ <apple> cake
str_view(x, "^apple$")
## [2] │ <apple>
#> [2] │ <apple>

Exercises

  1. How would you match the literal string "$^$"? To check that the pattern works, I’ll include both the string "$^$", and an example where that pattern occurs in the middle of the string which should not be matched.
str_view(c("$^$", "ab$^$sfas"), "^\\$\\^\\$$", match = TRUE)
## [1] │ <$^$>
  1. Given the corpus of common words in stringr::words, create regular expressions that find all words that:

Start with “y”. End with “x”. Are exactly three letters long. (Don’t cheat by using str_length()!) Have seven letters or more. The words that start with “y” are:

str_view(stringr::words, "^y", match = TRUE)
## [975] │ <y>ear
## [976] │ <y>es
## [977] │ <y>esterday
## [978] │ <y>et
## [979] │ <y>ou
## [980] │ <y>oung

The Words that end with X are -

str_view(stringr::words, "x$", match = TRUE)
## [108] │ bo<x>
## [747] │ se<x>
## [772] │ si<x>
## [841] │ ta<x>

Are exactly 3 letters long

str_view(stringr::words, "^...$", match = TRUE)
##   [9] │ <act>
##  [12] │ <add>
##  [22] │ <age>
##  [24] │ <ago>
##  [26] │ <air>
##  [27] │ <all>
##  [38] │ <and>
##  [41] │ <any>
##  [51] │ <arm>
##  [54] │ <art>
##  [56] │ <ask>
##  [68] │ <bad>
##  [69] │ <bag>
##  [73] │ <bar>
##  [82] │ <bed>
##  [89] │ <bet>
##  [91] │ <big>
##  [94] │ <bit>
## [108] │ <box>
## [109] │ <boy>
## ... and 90 more

The words that have seven letters or more

# Seven letters or more: anchor at the start of the word and require at least
# seven characters. The earlier pattern "^...$" repeated the three-letter query.
str_view(stringr::words, "^.......", match = TRUE)
## (recorded output removed: it was a copy of the three-letter matches;
## re-run this chunk to regenerate it)

Character classes and alternatives

# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
## [2] │ <a.c>
#> [2] │ <a.c>
str_view(c("abc", "a.c", "a*c", "a c"), ".[*]c")
## [3] │ <a*c>
#> [3] │ <a*c>
str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]")
## [4] │ <a >c
#> [4] │ <a >c
str_view(c("grey", "gray"), "gr(e|a)y")
## [1] │ <grey>
## [2] │ <gray>
#> [1] │ <grey>
#> [2] │ <gray>

Exercises

  1. Create regular expressions to find all words that:

Start with a vowel. That only contain consonants. (Hint: thinking about matching “not”-vowels.) End with ed, but not with eed. End with ing or ise.

Words starting with vowels

str_subset(stringr::words, "^[aeiou]")
##   [1] "a"           "able"        "about"       "absolute"    "accept"     
##   [6] "account"     "achieve"     "across"      "act"         "active"     
##  [11] "actual"      "add"         "address"     "admit"       "advertise"  
##  [16] "affect"      "afford"      "after"       "afternoon"   "again"      
##  [21] "against"     "age"         "agent"       "ago"         "agree"      
##  [26] "air"         "all"         "allow"       "almost"      "along"      
##  [31] "already"     "alright"     "also"        "although"    "always"     
##  [36] "america"     "amount"      "and"         "another"     "answer"     
##  [41] "any"         "apart"       "apparent"    "appear"      "apply"      
##  [46] "appoint"     "approach"    "appropriate" "area"        "argue"      
##  [51] "arm"         "around"      "arrange"     "art"         "as"         
##  [56] "ask"         "associate"   "assume"      "at"          "attend"     
##  [61] "authority"   "available"   "aware"       "away"        "awful"      
##  [66] "each"        "early"       "east"        "easy"        "eat"        
##  [71] "economy"     "educate"     "effect"      "egg"         "eight"      
##  [76] "either"      "elect"       "electric"    "eleven"      "else"       
##  [81] "employ"      "encourage"   "end"         "engine"      "english"    
##  [86] "enjoy"       "enough"      "enter"       "environment" "equal"      
##  [91] "especial"    "europe"      "even"        "evening"     "ever"       
##  [96] "every"       "evidence"    "exact"       "example"     "except"     
## [101] "excuse"      "exercise"    "exist"       "expect"      "expense"    
## [106] "experience"  "explain"     "express"     "extra"       "eye"        
## [111] "idea"        "identify"    "if"          "imagine"     "important"  
## [116] "improve"     "in"          "include"     "income"      "increase"   
## [121] "indeed"      "individual"  "industry"    "inform"      "inside"     
## [126] "instead"     "insure"      "interest"    "into"        "introduce"  
## [131] "invest"      "involve"     "issue"       "it"          "item"       
## [136] "obvious"     "occasion"    "odd"         "of"          "off"        
## [141] "offer"       "office"      "often"       "okay"        "old"        
## [146] "on"          "once"        "one"         "only"        "open"       
## [151] "operate"     "opportunity" "oppose"      "or"          "order"      
## [156] "organize"    "original"    "other"       "otherwise"   "ought"      
## [161] "out"         "over"        "own"         "under"       "understand" 
## [166] "union"       "unit"        "unite"       "university"  "unless"     
## [171] "until"       "up"          "upon"        "use"         "usual"
#>   [1] "a"           "able"        "about"       "absolute"    "accept"     
#>   [6] "account"     "achieve"     "across"      "act"         "active"     
#>  [11] "actual"      "add"         "address"     "admit"       "advertise"  
#>  [16] "affect"      "afford"      "after"       "afternoon"   "again"      
#>  [21] "against"     "age"         "agent"       "ago"         "agree"      
#>  [26] "air"         "all"         "allow"       "almost"      "along"      
#>  [31] "already"     "alright"     "also"        "although"    "always"     
#>  [36] "america"     "amount"      "and"         "another"     "answer"     
#>  [41] "any"         "apart"       "apparent"    "appear"      "apply"      
#>  [46] "appoint"     "approach"    "appropriate" "area"        "argue"      
#>  [51] "arm"         "around"      "arrange"     "art"         "as"         
#>  [56] "ask"         "associate"   "assume"      "at"          "attend"     
#>  [61] "authority"   "available"   "aware"       "away"        "awful"      
#>  [66] "each"        "early"       "east"        "easy"        "eat"        
#>  [71] "economy"     "educate"     "effect"      "egg"         "eight"      
#>  [76] "either"      "elect"       "electric"    "eleven"      "else"       
#>  [81] "employ"      "encourage"   "end"         "engine"      "english"    
#>  [86] "enjoy"       "enough"      "enter"       "environment" "equal"      
#>  [91] "especial"    "europe"      "even"        "evening"     "ever"       
#>  [96] "every"       "evidence"    "exact"       "example"     "except"     
#> [101] "excuse"      "exercise"    "exist"       "expect"      "expense"    
#> [106] "experience"  "explain"     "express"     "extra"       "eye"        
#> [111] "idea"        "identify"    "if"          "imagine"     "important"  
#> [116] "improve"     "in"          "include"     "income"      "increase"   
#> [121] "indeed"      "individual"  "industry"    "inform"      "inside"     
#> [126] "instead"     "insure"      "interest"    "into"        "introduce"  
#> [131] "invest"      "involve"     "issue"       "it"          "item"       
#> [136] "obvious"     "occasion"    "odd"         "of"          "off"        
#> [141] "offer"       "office"      "often"       "okay"        "old"        
#> [146] "on"          "once"        "one"         "only"        "open"       
#> [151] "operate"     "opportunity" "oppose"      "or"          "order"      
#> [156] "organize"    "original"    "other"       "otherwise"   "ought"      
#> [161] "out"         "over"        "own"         "under"       "understand" 
#> [166] "union"       "unit"        "unite"       "university"  "unless"     
#> [171] "until"       "up"          "upon"        "use"         "usual"

Words that contain only consonants: Use the negate argument of str_subset.

str_subset(stringr::words, "[aeiou]", negate=TRUE)
## [1] "by"  "dry" "fly" "mrs" "try" "why"
#> [1] "by"  "dry" "fly" "mrs" "try" "why"

Alternatively, using str_view() the consonant-only words are:

str_view(stringr::words, "[aeiou]", match=FALSE)
## [123] │ by
## [249] │ dry
## [328] │ fly
## [538] │ mrs
## [895] │ try
## [952] │ why

Words that end with “-ed” but not ending in “-eed”

str_subset(stringr::words, "[^e]ed$")
## [1] "bed"     "hundred" "red"
#> [1] "bed"     "hundred" "red"
str_subset(c("ed", stringr::words), "(^|[^e])ed$")
## [1] "ed"      "bed"     "hundred" "red"
#> [1] "ed"      "bed"     "hundred" "red"

Words ending in ing or ise:

str_subset(stringr::words, "i(ng|se)$")
##  [1] "advertise" "bring"     "during"    "evening"   "exercise"  "king"     
##  [7] "meaning"   "morning"   "otherwise" "practise"  "raise"     "realise"  
## [13] "ring"      "rise"      "sing"      "surprise"  "thing"
#>  [1] "advertise" "bring"     "during"    "evening"   "exercise"  "king"     
#>  [7] "meaning"   "morning"   "otherwise" "practise"  "raise"     "realise"  
#> [13] "ring"      "rise"      "sing"      "surprise"  "thing"

  2. Empirically verify the rule “i before e except after c”.

length(str_subset(stringr::words, "(cei|[^c]ie)"))
## [1] 14
#> [1] 14
length(str_subset(stringr::words, "(cie|[^c]ei)"))
## [1] 3
#> [1] 3
  1. Is “q” always followed by a “u”? In the stringr::words dataset, yes.
str_view(stringr::words, "q[^u]", match = TRUE)
  1. Write a regular expression that matches a word if it’s probably written in British English, not American English. In the general case, this is hard, and could require a dictionary. But, there are a few heuristics to consider that would account for some common cases. British English tends to use the following: “ou” instead of “o”; “ae” and “oe” instead of “a” and “o”; words ending in “ise” instead of “ize”; words ending in “yse”. The regex ou|ise$|ae|oe|yse$ would match these.

  2. Create a regular expression that will match telephone numbers as commonly written in your country.

x <- c("123-456-7890", "(123)456-7890", "(123) 456-7890", "1235-2351")
str_view(x, "\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d")
## [1] │ <123-456-7890>
str_view(x, "[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]")
## [1] │ <123-456-7890>
str_view(x, "\\(\\d\\d\\d\\)\\s*\\d\\d\\d-\\d\\d\\d\\d")
## [2] │ <(123)456-7890>
## [3] │ <(123) 456-7890>
str_view(x, "\\([0-9][0-9][0-9]\\)[ ]*[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]")
## [2] │ <(123)456-7890>
## [3] │ <(123) 456-7890>
str_view(x, "\\d{3}-\\d{3}-\\d{4}")
## [1] │ <123-456-7890>
str_view(x, "\\(\\d{3}\\)\\s*\\d{3}-\\d{4}")
## [2] │ <(123)456-7890>
## [3] │ <(123) 456-7890>

Repetition

x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
str_view(x, "CC+")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, 'C[LX]+')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII
str_view(x, "C{2}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, "C{2,}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, "C{2,3}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, 'C{2,3}?')
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, 'C[LX]+?')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII

Exercises

  1. Describe the equivalents of ?, +, * in {m,n} form. ? is equivalent to {0,1}, + is equivalent to {1,}, and * is equivalent to {0,}.
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
  1. Describe in words what these regular expressions match: (read carefully to see if I’m using a regular expression or a string that defines a regular expression.)

^.*$, "\\{.+\\}", \d{4}-\d{2}-\d{2}, "\\\\{4}"

  1. ^.*$ will match any string. For example: ^.*$: c("dog", "$1.23", "lorem ipsum").

  2. "\\{.+\\}" will match any string with curly braces surrounding at least one character. For example: "\\{.+\\}": c("{a}", "{abc}").

  3. \d{4}-\d{2}-\d{2} will match four digits followed by a hyphen, followed by two digits followed by a hyphen, followed by another two digits. This is a regular expression that can match dates formatted like “YYYY-MM-DD” (“%Y-%m-%d”). For example: \d{4}-\d{2}-\d{2}: 2018-01-11

  4. "\\\\{4}" is \\{4}, which will match four backslashes. For example: "\\\\{4}": "\\\\\\\\".

  5. Create regular expressions to find all words that:

Start with three consonants. Have three or more vowels in a row. Have two or more vowel-consonant pairs in a row.

This regex finds all words starting with three consonants.

str_view(words, "^[^aeiou]{3}", match = TRUE)
## [150] │ <Chr>ist
## [151] │ <Chr>istmas
## [249] │ <dry>
## [328] │ <fly>
## [538] │ <mrs>
## [724] │ <sch>eme
## [725] │ <sch>ool
## [811] │ <str>aight
## [812] │ <str>ategy
## [813] │ <str>eet
## [814] │ <str>ike
## [815] │ <str>ong
## [816] │ <str>ucture
## [836] │ <sys>tem
## [868] │ <thr>ee
## [869] │ <thr>ough
## [870] │ <thr>ow
## [895] │ <try>
## [901] │ <typ>e
## [952] │ <why>

This regex finds three or more vowels in a row:

str_view(words, "[aeiou]{3,}", match = TRUE)
##  [79] │ b<eau>ty
## [565] │ obv<iou>s
## [644] │ prev<iou>s
## [670] │ q<uie>t
## [741] │ ser<iou>s
## [915] │ var<iou>s

This regex finds two or more vowel-consonant pairs in a row.

str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)
##   [4] │ abs<olut>e
##  [23] │ <agen>t
##  [30] │ <alon>g
##  [36] │ <americ>a
##  [39] │ <anot>her
##  [42] │ <apar>t
##  [43] │ app<aren>t
##  [61] │ auth<orit>y
##  [62] │ ava<ilab>le
##  [63] │ <awar>e
##  [64] │ <away>
##  [70] │ b<alan>ce
##  [75] │ b<asis>
##  [81] │ b<ecom>e
##  [83] │ b<efor>e
##  [84] │ b<egin>
##  [85] │ b<ehin>d
##  [87] │ b<enefit>
## [119] │ b<usines>s
## [143] │ ch<arac>ter
## ... and 149 more

Grouping and backreferences

str_view(fruit, "(..)\\1", match = TRUE)
##  [4] │ b<anan>a
## [20] │ <coco>nut
## [22] │ <cucu>mber
## [41] │ <juju>be
## [56] │ <papa>ya
## [73] │ s<alal> berry
#>  [4] │ b<anan>a
#> [20] │ <coco>nut
#> [22] │ <cucu>mber
#> [41] │ <juju>be
#> [56] │ <papa>ya
#> [73] │ s<alal> berry

  1. Describe, in words, what these expressions will match:

(.)\1\1, "(.)(.)\\2\\1", (..)\1, "(.).\\1.\\1", "(.)(.)(.).*\\3\\2\\1"

(.)\1\1: The same character appearing three times in a row, e.g. "aaa". "(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order, e.g. "abba". (..)\1: Any two characters repeated, e.g. "a1a1". "(.).\\1.\\1": A character followed by any character, the original character, any other character, and the original character again, e.g. "abaca", "b8b.b". "(.)(.)(.).*\\3\\2\\1": Three characters followed by zero or more characters of any kind, followed by the same three characters in reverse order, e.g. "abcsgasgddsadgsdgcba", "abccba", or "abc1cba".

  2. Construct regular expressions to match words that:

Start and end with the same character.

Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)

Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)

This regular expression matches words that start and end with the same character.

str_subset(words, "^(.)((.*\\1$)|\\1?$)")
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
## [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
## [16] "expense"    "experience" "eye"        "health"     "high"      
## [21] "knock"      "level"      "local"      "nation"     "non"       
## [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
## [31] "test"       "tonight"    "transport"  "treat"      "trust"     
## [36] "window"     "yesterday"
#>  [1] "a"          "america"    "area"       "dad"        "dead"      
#>  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
#> [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
#> [16] "expense"    "experience" "eye"        "health"     "high"      
#> [21] "knock"      "level"      "local"      "nation"     "non"       
#> [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
#> [31] "test"       "tonight"    "transport"  "treat"      "trust"     
#> [36] "window"     "yesterday"

This regular expression will match any pair of repeated letters, where letters is defined to be the ASCII letters A-Z. First, check that it works with the example in the problem.

str_subset("church", "([A-Za-z][A-Za-z]).*\\1")
## [1] "church"
#> [1] "church"
str_subset(words, "([A-Za-z][A-Za-z]).*\\1")
##  [1] "appropriate" "church"      "condition"   "decide"      "environment"
##  [6] "london"      "paragraph"   "particular"  "photograph"  "prepare"    
## [11] "pressure"    "remember"    "represent"   "require"     "sense"      
## [16] "therefore"   "understand"  "whether"
#>  [1] "appropriate" "church"      "condition"   "decide"      "environment"
#>  [6] "london"      "paragraph"   "particular"  "photograph"  "prepare"    
#> [11] "pressure"    "remember"    "represent"   "require"     "sense"      
#> [16] "therefore"   "understand"  "whether"
str_subset("eleven", "([a-z]).*\\1.*\\1")
## [1] "eleven"
#> [1] "eleven"
str_subset(words, "([a-z]).*\\1.*\\1")
##  [1] "appropriate" "available"   "believe"     "between"     "business"   
##  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
## [11] "evidence"    "exercise"    "expense"     "experience"  "individual" 
## [16] "paragraph"   "receive"     "remember"    "represent"   "telephone"  
## [21] "therefore"   "tomorrow"
#>  [1] "appropriate" "available"   "believe"     "between"     "business"   
#>  [6] "degree"      "difference"  "discuss"     "eleven"      "environment"
#> [11] "evidence"    "exercise"    "expense"     "experience"  "individual" 
#> [16] "paragraph"   "receive"     "remember"    "represent"   "telephone"  
#> [21] "therefore"   "tomorrow"

Tools

x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1]  TRUE FALSE  TRUE
#> [1]  TRUE FALSE  TRUE
# How many common words start with t?
sum(str_detect(words, "^t"))
## [1] 65
#> [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
#> [1] 0.2765306
# Find all words containing at least one vowel, and negate
no_vowels_1 <- !str_detect(words, "[aeiou]")
# Find all words consisting only of consonants (non-vowels)
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
## [1] TRUE
#> [1] TRUE
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
df <- tibble(
  word = words, 
  i = seq_along(word)
)
df %>% 
  filter(str_detect(word, "x$"))
## # A tibble: 4 × 2
##   word      i
##   <chr> <int>
## 1 box     108
## 2 sex     747
## 3 six     772
## 4 tax     841
#> # A tibble: 4 × 2
#>   word      i
#>   <chr> <int>
#> 1 box     108
#> 2 sex     747
#> 3 six     772
#> 4 tax     841
x <- c("apple", "banana", "pear")
str_count(x, "a")
## [1] 1 3 1
#> [1] 1 3 1

# On average, how many vowels per word?
mean(str_count(words, "[aeiou]"))
## [1] 1.991837
#> [1] 1.991837
df %>% 
  mutate(
    vowels = str_count(word, "[aeiou]"),
    consonants = str_count(word, "[^aeiou]")
  )
## # A tibble: 980 × 4
##    word         i vowels consonants
##    <chr>    <int>  <int>      <int>
##  1 a            1      1          0
##  2 able         2      2          2
##  3 about        3      3          2
##  4 absolute     4      4          4
##  5 accept       5      2          4
##  6 account      6      3          4
##  7 achieve      7      4          3
##  8 across       8      2          4
##  9 act          9      1          2
## 10 active      10      3          3
## # ℹ 970 more rows
#> # A tibble: 980 × 4
#>   word         i vowels consonants
#>   <chr>    <int>  <int>      <int>
#> 1 a            1      1          0
#> 2 able         2      2          2
#> 3 about        3      3          2
#> 4 absolute     4      4          4
#> 5 accept       5      2          4
#> 6 account      6      3          4
#> # ℹ 974 more rows
str_count("abababa", "aba")
## [1] 2
#> [1] 2
# str_view() highlights every match since stringr 1.5.0, so the deprecated
# str_view_all() is not needed. Matches do not overlap, hence two highlights.
str_view("abababa", "aba")
## [1] │ <aba>b<aba>
#> [1] │ <aba>b<aba>

Exercises

  1. For each of the following challenges, try solving it by using both a single regular expression, and a combination of multiple str_detect() calls.

Find all words that start or end with x.

# one regex
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
# split regex into parts
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"

Find all words that start with a vowel and end with a consonant.

str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()
## [1] "about"   "accept"  "account" "across"  "act"     "actual"
#> [1] "about"   "accept"  "account" "across"  "act"     "actual"
start_with_vowel <- str_detect(words, "^[aeiou]")
end_with_consonant <- str_detect(words, "[^aeiou]$")
words[start_with_vowel & end_with_consonant] %>% head()
## [1] "about"   "accept"  "account" "across"  "act"     "actual"
#> [1] "about"   "accept"  "account" "across"  "act"     "actual"

Are there any words that contain at least one of each different vowel? There is not a simple regular expression to match words that contain at least one of each vowel.

  1. What word has the highest number of vowels? What word has the highest proportion of vowels?
vowels <- str_count(words, "[aeiou]")
words[which(vowels == max(vowels))]
## [1] "appropriate" "associate"   "available"   "colleague"   "encourage"  
## [6] "experience"  "individual"  "television"
#> [1] "appropriate" "associate"   "available"   "colleague"   "encourage"  
#> [6] "experience"  "individual"  "television"
prop_vowels <- str_count(words, "[aeiou]") / str_length(words)
words[which(prop_vowels == max(prop_vowels))]
## [1] "a"
#> [1] "a"

Extract matches

length(sentences)
## [1] 720
#> [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."
#> [1] "The birch canoe slid on the smooth planks." 
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."     
#> [4] "These days a chicken leg is a rare dish."   
#> [5] "Rice is often served in round bowls."       
#> [6] "The juice of lemons makes fine punch."
# Join the colour names with "|" to get one pattern that matches any of them.
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_flatten(colours, collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
#> [1] "red|orange|yellow|green|blue|purple"
# Keep the sentences that mention a colour, then pull out the first colour
# found in each of them.
has_colour <- sentences[str_detect(sentences, colour_match)]
matches <- str_extract(has_colour, colour_match)
head(matches)
## [1] "blue" "blue" "red"  "red"  "red"  "blue"
#> [1] "blue" "blue" "red"  "red"  "red"  "blue"
# Sentences that contain more than one colour match.
more <- sentences[str_count(sentences, colour_match) > 1]
# str_view() highlights every match; str_view_all() is deprecated as of
# stringr 1.5.0 and only forwards to str_view().
str_view(more, colour_match)
## [1] │ It is hard to erase <blue> or <red> ink.
## [2] │ The <green> light in the brown box flicke<red>.
## [3] │ The sky in the west is tinged with <orange> <red>.
#> [1] │ It is hard to erase <blue> or <red> ink.
#> [2] │ The <green> light in the brown box flicke<red>.
#> [3] │ The sky in the west is tinged with <orange> <red>.

# str_extract() returns only the first match in each string.
str_extract(more, colour_match)
## [1] "blue"   "green"  "orange"
#> [1] "blue"   "green"  "orange"
# str_extract_all() returns every match, as a list with one element per string.
str_extract_all(more, colour_match)
## [[1]]
## [1] "blue" "red" 
## 
## [[2]]
## [1] "green" "red"  
## 
## [[3]]
## [1] "orange" "red"
#> [[1]]
#> [1] "blue" "red" 
#> 
#> [[2]]
#> [1] "green" "red"  
#> 
#> [[3]]
#> [1] "orange" "red"
# simplify = TRUE returns a character matrix (one row per string) instead of
# a list.
str_extract_all(more, colour_match, simplify = TRUE)
##      [,1]     [,2] 
## [1,] "blue"   "red"
## [2,] "green"  "red"
## [3,] "orange" "red"
#>      [,1]     [,2] 
#> [1,] "blue"   "red"
#> [2,] "green"  "red"
#> [3,] "orange" "red"

# Strings with fewer matches are padded with "" so that every row of the
# matrix has the same number of columns.
x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
##      [,1] [,2] [,3]
## [1,] "a"  ""   ""  
## [2,] "a"  "b"  ""  
## [3,] "a"  "b"  "c"
#>      [,1] [,2] [,3]
#> [1,] "a"  ""   ""  
#> [2,] "a"  "b"  ""  
#> [3,] "a"  "b"  "c"

Exercises

  1. In the previous example, you might have noticed that the regular expression matched “flickered”, which is not a colour. Modify the regex to fix the problem.
# Wrap the alternation in word boundaries so that a colour name inside a
# longer word (the "red" in "flickered") no longer matches.
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match2 <- str_c("\\b(", colour_match, ")\\b")
colour_match2
## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
#> [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
# Select with the original (unbounded) pattern so the "flickered" sentence is
# still included, then view it with the word-bounded pattern.
more2 <- sentences[str_count(sentences, colour_match) > 1]
# str_view() replaces str_view_all(), which is deprecated as of stringr 1.5.0.
str_view(more2, colour_match2, match = TRUE)
## [1] │ It is hard to erase <blue> or <red> ink.
## [2] │ The <green> light in the brown box flickered.
## [3] │ The sky in the west is tinged with <orange> <red>.
  1. From the Harvard sentences data, extract:

The first word from each sentence.

# First run of ASCII letters in each sentence (stops at an apostrophe).
str_extract(sentences, "[A-Za-z]+") %>% head()
## [1] "The"   "Glue"  "It"    "These" "Rice"  "The"
#> [1] "The"   "Glue"  "It"    "These" "Rice"  "The"
# Allow apostrophes after the first letter so contractions stay whole.
str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()
## [1] "The"   "Glue"  "It's"  "These" "Rice"  "The"
#> [1] "The"   "Glue"  "It's"  "These" "Rice"  "The"

All words ending in ing.

# Whole words (delimited by word boundaries) that end in "ing".
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
sentences[sentences_with_ing] %>%
  str_extract_all(pattern) %>%
  unlist() %>%
  unique() %>%
  head()
## [1] "spring"  "evening" "morning" "winding" "living"  "king"
#> [1] "spring"  "evening" "morning" "winding" "living"  "king"

All plurals.

# Heuristic for plurals: whole words of at least three letters followed by a
# final "s". This also picks up verbs such as "makes".
sentences %>%
  str_extract_all("\\b[A-Za-z]{3,}s\\b") %>%
  unlist() %>%
  unique() %>%
  head()
## [1] "planks" "days"   "bowls"  "lemons" "makes"  "hogs"
#> [1] "planks" "days"   "bowls"  "lemons" "makes"  "hogs"

Grouped Matches

# Two capture groups: the article ("a" or "the") and the run of non-space
# characters that follows it.
noun <- "(a|the) ([^ ]+)"

# First ten sentences that contain the pattern, then the full match in each.
has_noun <- head(str_subset(sentences, noun), 10)
str_extract(has_noun, noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
#>  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
#>  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
# str_match() returns a matrix: the full match, then one column per group.
str_match(has_noun, noun)
##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"
#>       [,1]         [,2]  [,3]     
#>  [1,] "the smooth" "the" "smooth" 
#>  [2,] "the sheet"  "the" "sheet"  
#>  [3,] "the depth"  "the" "depth"  
#>  [4,] "a chicken"  "a"   "chicken"
#>  [5,] "the parked" "the" "parked" 
#>  [6,] "the sun"    "the" "sun"    
#>  [7,] "the huge"   "the" "huge"   
#>  [8,] "the ball"   "the" "ball"   
#>  [9,] "the woman"  "the" "woman"  
#> [10,] "a helps"    "a"   "helps"
# Same extraction on a data frame: each capture group becomes a new column,
# and remove = FALSE keeps the original `sentence` column.
tibble(sentence = sentences) %>%
  tidyr::extract(
    col = sentence,
    into = c("article", "noun"),
    regex = "(a|the) ([^ ]+)",
    remove = FALSE
  )
## # A tibble: 720 × 3
##    sentence                                    article noun   
##    <chr>                                       <chr>   <chr>  
##  1 The birch canoe slid on the smooth planks.  the     smooth 
##  2 Glue the sheet to the dark blue background. the     sheet  
##  3 It's easy to tell the depth of a well.      the     depth  
##  4 These days a chicken leg is a rare dish.    a       chicken
##  5 Rice is often served in round bowls.        <NA>    <NA>   
##  6 The juice of lemons makes fine punch.       <NA>    <NA>   
##  7 The box was thrown beside the parked truck. the     parked 
##  8 The hogs were fed chopped corn and garbage. <NA>    <NA>   
##  9 Four hours of steady work faced us.         <NA>    <NA>   
## 10 A large size in stockings is hard to sell.  <NA>    <NA>   
## # ℹ 710 more rows
#> # A tibble: 720 × 3
#>   sentence                                    article noun   
#>   <chr>                                       <chr>   <chr>  
#> 1 The birch canoe slid on the smooth planks.  the     smooth 
#> 2 Glue the sheet to the dark blue background. the     sheet  
#> 3 It's easy to tell the depth of a well.      the     depth  
#> 4 These days a chicken leg is a rare dish.    a       chicken
#> 5 Rice is often served in round bowls.        <NA>    <NA>   
#> 6 The juice of lemons makes fine punch.       <NA>    <NA>   
#> # ℹ 714 more rows

Exercises

  1. Find all words that come after a “number” like “one”, “two”, “three” etc. Pull out both the number and the word.
# A number word at a word boundary, one or more spaces, then the next word.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
sentences %>%
  str_subset(numword) %>%
  str_extract(numword)
##  [1] "seven books"   "two met"       "two factors"   "three lists"  
##  [5] "seven is"      "two when"      "ten inches"    "one war"      
##  [9] "one button"    "six minutes"   "ten years"     "two shares"   
## [13] "two distinct"  "five cents"    "two pins"      "five robins"  
## [17] "four kinds"    "three story"   "three inches"  "six comes"    
## [21] "three batches" "two leaves"
#>  [1] "seven books"   "two met"       "two factors"   "three lists"  
#>  [5] "seven is"      "two when"      "ten inches"    "one war"      
#>  [9] "one button"    "six minutes"   "ten years"     "two shares"   
#> [13] "two distinct"  "five cents"    "two pins"      "five robins"  
#> [17] "four kinds"    "three story"   "three inches"  "six comes"    
#> [21] "three batches" "two leaves"
  1. Find all contractions. Separate out the pieces before and after the apostrophe.
# Letters, an apostrophe, then more letters; split each match at the
# apostrophe to separate the two pieces.
contraction <- "([A-Za-z]+)'([A-Za-z]+)"
sentences %>%
  str_subset(contraction) %>%
  str_extract(contraction) %>%
  str_split("'")
## [[1]]
## [1] "It" "s" 
## 
## [[2]]
## [1] "man" "s"  
## 
## [[3]]
## [1] "don" "t"  
## 
## [[4]]
## [1] "store" "s"    
## 
## [[5]]
## [1] "workman" "s"      
## 
## [[6]]
## [1] "Let" "s"  
## 
## [[7]]
## [1] "sun" "s"  
## 
## [[8]]
## [1] "child" "s"    
## 
## [[9]]
## [1] "king" "s"   
## 
## [[10]]
## [1] "It" "s" 
## 
## [[11]]
## [1] "don" "t"  
## 
## [[12]]
## [1] "queen" "s"    
## 
## [[13]]
## [1] "don" "t"  
## 
## [[14]]
## [1] "don" "t"  
## 
## [[15]]
## [1] "don" "t"  
## 
## [[16]]
## [1] "don" "t"  
## 
## [[17]]
## [1] "pirate" "s"     
## 
## [[18]]
## [1] "neighbor" "s"
#> [[1]]
#> [1] "It" "s" 
#> 
#> [[2]]
#> [1] "man" "s"  
#> 
#> [[3]]
#> [1] "don" "t"  
#> 
#> [[4]]
#> [1] "store" "s"    
#> 
#> [[5]]
#> [1] "workmen" "s"      
#> 
#> [[6]]
#> [1] "Let" "s"  
#> 
#> [[7]]
#> [1] "sun" "s"  
#> 
#> [[8]]
#> [1] "child" "s"    
#> 
#> [[9]]
#> [1] "king" "s"   
#> 
#> [[10]]
#> [1] "It" "s" 
#> 
#> [[11]]
#> [1] "don" "t"  
#> 
#> [[12]]
#> [1] "queen" "s"    
#> 
#> [[13]]
#> [1] "don" "t"  
#> 
#> [[14]]
#> [1] "pirate" "s"     
#> 
#> [[15]]
#> [1] "neighbor" "s"

Replacing Matches

x <- c("apple", "pear", "banana")
# str_replace() changes only the first match in each string.
str_replace(x, "[aeiou]", "-")
## [1] "-pple"  "p-ar"   "b-nana"
#> [1] "-pple"  "p-ar"   "b-nana"
# str_replace_all() changes every match.
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-"  "p--r"   "b-n-n-"
#> [1] "-ppl-"  "p--r"   "b-n-n-"
# A named vector supplies several pattern = replacement pairs in one call.
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house"    "two cars"     "three people"
#> [1] "one house"    "two cars"     "three people"
# Capture the first three words and put them back with the second and third
# swapped, using backreferences to the groups.
head(
  str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"),
  5
)
## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."
#> [1] "The canoe birch slid on the smooth planks." 
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."     
#> [4] "These a days chicken leg is a rare dish."   
#> [5] "Rice often is served in round bowls."

Exercises

  1. Replace all forward slashes in a string with backslashes.
# The replacement "\\\\" is two backslash characters once R has parsed the
# literal; the printed result below shows a single backslash (escaped as "\\")
# in place of each "/".
str_replace_all("past/present/future", "/", "\\\\")
## [1] "past\\present\\future"
#> [1] "past\\present\\future"
  1. Implement a simple version of str_to_lower() using str_replace_all().
# Named vector whose names are the upper-case letters (the patterns) and whose
# values are the matching lower-case letters (the replacements). `LETTERS` and
# `letters` are base R constants.
replacements <- setNames(letters, LETTERS)
lower_words <- str_replace_all(words, pattern = replacements)
head(lower_words)
## [1] "a"        "able"     "about"    "absolute" "accept"   "account"
#> [1] "a"        "able"     "about"    "absolute" "accept"   "account"
  1. Switch the first and last letters in words. Which of those strings are still words?
# Capture the first letter, the middle and the last letter, then rebuild the
# word with the two ends exchanged. The pattern is anchored at both ends, so
# it can match at most once per word.
swapped <- str_replace(words, "^([A-Za-z])(.*)([A-Za-z])$", "\\3\\2\\1")
# Swapped strings that are themselves entries of `words`.
intersect(swapped, words)
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "lead"       "read"       "depend"     "god"        "educate"   
## [11] "else"       "encourage"  "engine"     "europe"     "evidence"  
## [16] "example"    "excuse"     "exercise"   "expense"    "experience"
## [21] "eye"        "dog"        "health"     "high"       "knock"     
## [26] "deal"       "level"      "local"      "nation"     "on"        
## [31] "non"        "no"         "rather"     "dear"       "refer"     
## [36] "remember"   "serious"    "stairs"     "test"       "tonight"   
## [41] "transport"  "treat"      "trust"      "window"     "yesterday"
#>  [1] "a"          "america"    "area"       "dad"        "dead"      
#>  [6] "lead"       "read"       "depend"     "god"        "educate"   
#> [11] "else"       "encourage"  "engine"     "europe"     "evidence"  
#> [16] "example"    "excuse"     "exercise"   "expense"    "experience"
#> [21] "eye"        "dog"        "health"     "high"       "knock"     
#> [26] "deal"       "level"      "local"      "nation"     "on"        
#> [31] "non"        "no"         "rather"     "dear"       "refer"     
#> [36] "remember"   "serious"    "stairs"     "test"       "tonight"   
#> [41] "transport"  "treat"      "trust"      "window"     "yesterday"
# Same swap written with the [[:alpha:]] class instead of an explicit A-Za-z
# range; the anchored pattern can match at most once per word.
swapped2 <- str_replace(
  words,
  "^([[:alpha:]])(.*)([[:alpha:]])$",
  "\\3\\2\\1"
)
intersect(swapped2, words)
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "lead"       "read"       "depend"     "god"        "educate"   
## [11] "else"       "encourage"  "engine"     "europe"     "evidence"  
## [16] "example"    "excuse"     "exercise"   "expense"    "experience"
## [21] "eye"        "dog"        "health"     "high"       "knock"     
## [26] "deal"       "level"      "local"      "nation"     "on"        
## [31] "non"        "no"         "rather"     "dear"       "refer"     
## [36] "remember"   "serious"    "stairs"     "test"       "tonight"   
## [41] "transport"  "treat"      "trust"      "window"     "yesterday"
#>  [1] "a"          "america"    "area"       "dad"        "dead"      
#>  [6] "lead"       "read"       "depend"     "god"        "educate"   
#> [11] "else"       "encourage"  "engine"     "europe"     "evidence"  
#> [16] "example"    "excuse"     "exercise"   "expense"    "experience"
#> [21] "eye"        "dog"        "health"     "high"       "knock"     
#> [26] "deal"       "level"      "local"      "nation"     "on"        
#> [31] "non"        "no"         "rather"     "dear"       "refer"     
#> [36] "remember"   "serious"    "stairs"     "test"       "tonight"   
#> [41] "transport"  "treat"      "trust"      "window"     "yesterday"

Splitting

# Split the first five sentences on single spaces; the result is a list with
# one character vector per sentence.
str_split(head(sentences, 5), " ")
## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
#> [[1]]
#> [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
#> [8] "planks."
#> 
#> [[2]]
#> [1] "Glue"        "the"         "sheet"       "to"          "the"        
#> [6] "dark"        "blue"        "background."
#> 
#> [[3]]
#> [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
#> 
#> [[4]]
#> [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
#> [8] "rare"    "dish."  
#> 
#> [[5]]
#> [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
# With a single input string, take the first (only) list element to get a
# plain character vector. "|" is escaped so that it is matched literally.
str_split("a|b|c|d", "\\|")[[1]]
## [1] "a" "b" "c" "d"
#> [1] "a" "b" "c" "d"
# simplify = TRUE returns a character matrix, padding shorter rows with "".
str_split(head(sentences, 5), " ", simplify = TRUE)
##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
##      [,9]   
## [1,] ""     
## [2,] ""     
## [3,] "well."
## [4,] "dish."
## [5,] ""
#>      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
#> [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
#> [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
#> [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
#> [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
#> [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
#>      [,9]   
#> [1,] ""     
#> [2,] ""     
#> [3,] "well."
#> [4,] "dish."
#> [5,] ""
# n = 2 caps the number of pieces per string: a key and a value.
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
str_split(fields, ": ", n = 2, simplify = TRUE)
##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"
#>      [,1]      [,2]    
#> [1,] "Name"    "Hadley"
#> [2,] "Country" "NZ"    
#> [3,] "Age"     "35"
x <- "This is a sentence.  This is another sentence."
# str_view() highlights every match; str_view_all() is deprecated as of
# stringr 1.5.0.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.  <This> <is> <another> <sentence>.
#> [1] │ <This> <is> <a> <sentence>.  <This> <is> <another> <sentence>.

# Splitting on a single space leaves an empty string where two spaces are
# adjacent, and keeps the punctuation attached to the words.
str_split(x, " ")[[1]]
## [1] "This"      "is"        "a"         "sentence." ""          "This"     
## [7] "is"        "another"   "sentence."
#> [1] "This"      "is"        "a"         "sentence." ""          "This"     
#> [7] "is"        "another"   "sentence."
# Splitting on word boundaries yields only the words: no empty strings and no
# trailing punctuation.
str_split(x, boundary("word"))[[1]]
## [1] "This"     "is"       "a"        "sentence" "This"     "is"       "another" 
## [8] "sentence"
#> [1] "This"     "is"       "a"        "sentence" "This"     "is"       "another" 
#> [8] "sentence"

Exercises

  1. Split up a string like “apples, pears, and bananas” into individual components.
# Separator: a comma, one or more spaces, and an optional "and" followed by
# one or more spaces.
x <- "apples, pears, and bananas"
x %>%
  str_split(", +(and +)?") %>%
  .[[1]]
## [1] "apples"  "pears"   "bananas"
#> [1] "apples"  "pears"   "bananas"
  1. Why is it better to split up by boundary("word") than " "? Splitting by boundary("word") is a more sophisticated method to split a string into words. It recognizes non-space punctuation that splits words, and also removes punctuation while retaining internal non-letter characters that are parts of the word, e.g., “can’t”. See the ICU website for a description of the set of rules that are used to determine word boundaries.
  2. What does splitting with an empty string ("") do? Experiment, and then read the documentation.
# An empty pattern splits the string into its individual characters,
# including spaces and punctuation.
str_split("ab. cd|agt", "")[[1]]
##  [1] "a" "b" "." " " "c" "d" "|" "a" "g" "t"
#>  [1] "a" "b" "." " " "c" "d" "|" "a" "g" "t"

Other types of pattern

# The regular call:
# A pattern given as a plain string:
str_view(fruit, "nana")
## [4] │ ba<nana>
# is shorthand for wrapping the pattern in regex() explicitly:
str_view(fruit, regex("nana"))
## [4] │ ba<nana>
bananas <- c("banana", "Banana", "BANANA")
# Matching is case sensitive by default: only the all-lower-case entry matches.
str_view(bananas, "banana")
## [1] │ <banana>
#> [1] │ <banana>
# ignore_case = TRUE makes every capitalisation match.
str_view(bananas, regex("banana", ignore_case = TRUE))
## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>
#> [1] │ <banana>
#> [2] │ <Banana>
#> [3] │ <BANANA>
x <- "Line 1\nLine 2\nLine 3"
# By default ^ matches only at the start of the whole string, so a single
# "Line" is found.
str_extract_all(x, "^Line")[[1]]
## [1] "Line"
#> [1] "Line"
# multiline = TRUE lets ^ match at the start of every line.
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]
## [1] "Line" "Line" "Line"
#> [1] "Line" "Line" "Line"
# Pattern for a 3-3-4 digit phone number. With comments = TRUE the regex may
# contain whitespace and "#" comments, which are ignored when matching.
# The final group takes four digits; with only three, the last digit of the
# number was silently dropped from the match.
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # four more numbers
  ", comments = TRUE)

str_match("514-791-8141", phone)
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "514-791-8141" "514" "791" "8141"
#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "514-791-8141" "514" "791" "8141"
# Time the same detection done with fixed() matching and with the default
# regex matching, 20 runs each. The timings printed below vary by machine.
microbenchmark::microbenchmark(
  fixed = str_detect(sentences, fixed("the")),
  regex = str_detect(sentences, "the"),
  times = 20
)
## Unit: microseconds
##   expr     min       lq     mean   median      uq     max neval
##  fixed 119.126 121.5420 141.2255 126.7715 133.980 388.209    20
##  regex 410.001 415.4175 429.8818 420.0215 425.063 605.542    20
#> Unit: microseconds
#>   expr   min     lq    mean median     uq   max neval
#>  fixed  61.7  66.20  92.915   82.1  88.30 354.2    20
#>  regex 272.5 279.85 293.130  283.3 288.65 476.1    20
# Two ways to write the same visible character: the single code point U+00E1,
# and a plain "a" followed by the code point U+0301.
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
## [1] "á" "á"
#> [1] "á" "á"
# The two strings print alike but are not equal under ==.
a1 == a2
## [1] FALSE
#> [1] FALSE
# fixed() does not treat them as a match either.
str_detect(a1, fixed(a2))
## [1] FALSE
#> [1] FALSE
# coll() uses collation rules and does treat them as a match.
str_detect(a1, coll(a2))
## [1] TRUE
#> [1] TRUE
# That means you also need to be aware of the difference
# when doing case insensitive matches:
i <- c("I", "İ", "i", "ı")
i
## [1] "I" "İ" "i" "ı"
#> [1] "I" "İ" "i" "ı"

# Case-insensitive match with no locale argument: "i" pairs with the plain
# "I".
str_subset(i, coll("i", ignore_case = TRUE))
## [1] "I" "i"
#> [1] "I" "i"
# With locale = "tr" the same pattern pairs "i" with the dotted capital
# instead (see the output below).
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "İ" "i"
#> [1] "İ" "i"
# Ask stringi for locale details (language, country, variant and name); the
# values printed below depend on the machine the code runs on.
stringi::stri_locale_info()
## $Language
## [1] "en"
## 
## $Country
## [1] "US"
## 
## $Variant
## [1] ""
## 
## $Name
## [1] "en_US"
#> $Language
#> [1] "c"
#> 
#> $Country
#> [1] ""
#> 
#> $Variant
#> [1] ""
#> 
#> $Name
#> [1] "c"
x <- "This is a sentence."
# str_view() highlights every match; str_view_all() is deprecated as of
# stringr 1.5.0.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.
#> [1] │ <This> <is> <a> <sentence>.
# boundary("word") also works as a pattern for extraction: each word is
# returned without the trailing punctuation.
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This"     "is"       "a"        "sentence"
#> [[1]]
#> [1] "This"     "is"       "a"        "sentence"

Exercises

  1. How would you find all strings containing \ with regex() vs. with fixed()?
# As a regex, the backslash has to be escaped twice: once for the R string
# literal and once for the regex engine, hence four backslashes in the source.
str_subset(c("a\\b", "ab"), "\\\\")
## [1] "a\\b"
#> [1] "a\\b"
# With fixed() only the R string escape is needed, hence two backslashes.
str_subset(c("a\\b", "ab"), fixed("\\"))
## [1] "a\\b"
#> [1] "a\\b"
  1. What are the five most common words in sentences?
# Split every sentence into words, lower-case them so "The" and "the" are
# counted together, then tabulate and keep the five most frequent.
sentences %>%
  str_extract_all(boundary("word")) %>%
  unlist() %>%
  str_to_lower() %>%
  tibble(word = .) %>%
  count(word, sort = TRUE) %>%
  head(5)
## # A tibble: 5 × 2
##   word      n
##   <chr> <int>
## 1 the     744
## 2 a       213
## 3 of      132
## 4 to      123
## 5 and     118
#> # A tibble: 5 x 2
#>   word      n
#>   <chr> <int>
#> 1 the     751
#> 2 a       202
#> 3 of      132
#> 4 to      123
#> 5 and     118