library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'
double_quote <- "\"" # or '"'
single_quote <- '\'' # or "'"
x <- c("\"", "\\")
x
## [1] "\"" "\\"
#> [1] "\"" "\\"
writeLines(x)
## "
## \
#> "
#> \
x <- "\u00b5"
x
## [1] "µ"
#> [1] "µ"
c("one", "two", "three")
## [1] "one" "two" "three"
#> [1] "one" "two" "three"
str_length(c("a", "R for data science", NA))
## [1] 1 18 NA
#> [1] 1 18 NA
str_c("x", "y")
## [1] "xy"
#> [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
#> [1] "xyz"
str_c("x", "y", sep = ", ")
## [1] "x, y"
#> [1] "x, y"
x <- c("abc", NA)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
#> [1] "|-abc-|" "|-NA-|"
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
"Good ", time_of_day, " ", name,
if (birthday) " and HAPPY BIRTHDAY",
"."
)
## [1] "Good morning Hadley."
#> [1] "Good morning Hadley."
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
#> [1] "x, y, z"
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
#> [1] "App" "Ban" "Pea"
# negative numbers count backwards from end
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
#> [1] "ple" "ana" "ear"
str_sub("a", 1, 5)
## [1] "a"
#> [1] "a"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
## [1] "apple" "banana" "pear"
#> [1] "apple" "banana" "pear"
# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "ı"))
## [1] "I" "I"
#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "İ" "I"
#> [1] "İ" "I"
x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
## [1] "apple" "banana" "eggplant"
#> [1] "apple" "banana" "eggplant"
str_sort(x, locale = "haw") # Hawaiian
## [1] "apple" "eggplant" "banana"
#> [1] "apple" "eggplant" "banana"
In code that doesn’t use stringr, you’ll often see paste() and paste0(). What’s the difference between the two functions? What stringr function are they equivalent to? How do the functions differ in their handling of NA? The function paste() separates strings by spaces by default, while paste0() does not separate strings with spaces by default. Since str_c() does not separate strings with spaces by default, it is closer in behavior to paste0(). However, str_c() and the paste functions handle NA differently. The function str_c() propagates NA: if any argument is a missing value, it returns a missing value. This is in line with how the numeric R functions, e.g. sum() and mean(), handle missing values. The paste functions, however, convert NA to the string “NA” and then treat it like any other character vector.
In your own words, describe the difference between the sep and collapse arguments to str_c(). The sep argument is the string inserted between the arguments to str_c(), while collapse is the string used to separate the elements of the resulting character vector when they are combined into a character vector of length one.
Use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters? The middle character can be extracted with str_sub(x, m, m), where m <- ceiling(str_length(x) / 2). If the string has an even number of characters the choice is arbitrary. We choose ceiling(n / 2), where n is the length of the string, because that also works when the string has only one character.
What does str_wrap() do? When might you want to use it? The function str_wrap() wraps text so that it fits within a certain width. This is useful for wrapping long strings of text to be typeset.
5. What does str_trim() do? What’s the opposite of str_trim()? The function str_trim() trims the whitespace from the start and end of a string.
str_trim(" abc ")
## [1] "abc"
#> [1] "abc"
str_trim(" abc ", side = "left")
## [1] "abc "
#> [1] "abc "
str_trim(" abc ", side = "right")
## [1] " abc"
#> [1] " abc"
The opposite of str_trim() is str_pad() which adds characters to each side.
str_pad("abc", 5, side = "both")
## [1] " abc "
#> [1] " abc "
str_pad("abc", 4, side = "right")
## [1] "abc "
#> [1] "abc "
str_pad("abc", 4, side = "left")
## [1] " abc"
#> [1] " abc"
The function should behave as follows for a vector of length n. n == 0: an empty string, e.g. "". n == 1: the original vector, e.g. "a". n == 2: return the two elements separated by “and”, e.g. “a and b”. n > 2: return the first n - 1 elements separated by commas, and the last element separated by a comma and “and”, e.g. “a, b, and c”.
# Join the elements of `x` into a single English-style list.
#
# x:     vector of items to join.
# delim: string appended to every item except the last when there are more
#        than two items (defaults to a comma).
#
# Returns "" for an empty vector, `x` itself for a single item, "a and b" for
# two items, and "a, b, and c" style output for three or more.
str_commasep <- function(x, delim = ",") {
  n <- length(x)

  # Nothing to join.
  if (n == 0) {
    return("")
  }

  # A single item is returned untouched.
  if (n == 1) {
    return(x)
  }

  # Two items: plain "and", no delimiter.
  if (n == 2) {
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }

  # Three or more: delimiter after every item but the last, and "and" in
  # front of the last one; the pieces are then joined with single spaces.
  leading <- str_c(x[-n], delim)
  closing <- str_c("and", x[[n]], sep = " ")
  str_c(c(leading, closing), collapse = " ")
}
# Exercise the n == 0 branch with a genuinely empty vector: "" is a vector of
# length one and would be handled by the n == 1 branch instead.
str_commasep(character(0))
## [1] ""
#> [1] ""
str_commasep("a")
## [1] "a"
#> [1] "a"
str_commasep(c("a", "b"))
## [1] "a and b"
#> [1] "a and b"
str_commasep(c("a", "b", "c"))
## [1] "a, b, and c"
#> [1] "a, b, and c"
str_commasep(c("a", "b", "c", "d"))
## [1] "a, b, c, and d"
#> [1] "a, b, c, and d"
x <- c("apple", "banana", "pear")
str_view(x, "an")
## [2] │ b<an><an>a
#> [2] │ b<an><an>a
str_view(x, ".a.")
## [2] │ <ban>ana
## [3] │ p<ear>
#> [2] │ <ban>ana
#> [3] │ p<ear>
# To create the regular expression, we need \\
dot <- "\\."
# But the expression itself only contains one:
writeLines(dot)
## \.
#> \.
# And this tells R to look for an explicit .
str_view(c("abc", "a.c", "bef"), "a\\.c")
## [2] │ <a.c>
#> [2] │ <a.c>
x <- "a\\b"
writeLines(x)
## a\b
#> a\b
str_view(x, "\\\\")
## [1] │ a<\>b
#> [1] │ a<\>b
Explain why each of these strings don’t match a \: "\", "\\", "\\\". "\": This will escape the next character in the R string. "\\": This will resolve to \ in the regular expression, which will escape the next character in the regular expression. "\\\": The first two backslashes will resolve to a literal backslash in the regular expression, the third will escape the next character. So in the regular expression, this will escape some escaped character.
How would you match the sequence "'\?
str_view("\"'\\", "\"'\\\\", match = TRUE)
## [1] │ <"'\>
str_view(c(".a.b.c", ".a.b", "....."), c("\\..\\..\\.."), match = TRUE)
## [1] │ <.a.b.c>
x <- c("apple", "banana", "pear")
str_view(x, "^a")
## [1] │ <a>pple
#> [1] │ <a>pple
str_view(x, "a$")
## [2] │ banan<a>
#> [2] │ banan<a>
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
## [1] │ <apple> pie
## [2] │ <apple>
## [3] │ <apple> cake
#> [1] │ <apple> pie
#> [2] │ <apple>
#> [3] │ <apple> cake
str_view(x, "^apple$")
## [2] │ <apple>
#> [2] │ <apple>
str_view(c("$^$", "ab$^$sfas"), "^\\$\\^\\$$", match = TRUE)
## [1] │ <$^$>
Find all words in stringr::words that: Start with “y”. End with “x”. Are exactly three letters long. (Don’t cheat by using str_length()!) Have seven letters or more. The words that start with “y” are:
str_view(stringr::words, "^y", match = TRUE)
## [975] │ <y>ear
## [976] │ <y>es
## [977] │ <y>esterday
## [978] │ <y>et
## [979] │ <y>ou
## [980] │ <y>oung
The words that end with “x” are:
str_view(stringr::words, "x$", match = TRUE)
## [108] │ bo<x>
## [747] │ se<x>
## [772] │ si<x>
## [841] │ ta<x>
The words that are exactly three letters long are:
str_view(stringr::words, "^...$", match = TRUE)
## [9] │ <act>
## [12] │ <add>
## [22] │ <age>
## [24] │ <ago>
## [26] │ <air>
## [27] │ <all>
## [38] │ <and>
## [41] │ <any>
## [51] │ <arm>
## [54] │ <art>
## [56] │ <ask>
## [68] │ <bad>
## [69] │ <bag>
## [73] │ <bar>
## [82] │ <bed>
## [89] │ <bet>
## [91] │ <big>
## [94] │ <bit>
## [108] │ <box>
## [109] │ <boy>
## ... and 90 more
The words that have seven letters or more are:
# Words with seven letters or more: anchor both ends of the word and require
# at least seven characters with the {7,} quantifier. (The previous pattern,
# "^...$", repeated the three-letter exercise.)
str_view(stringr::words, "^.{7,}$", match = TRUE)
## [4] │ <absolute>
## [6] │ <account>
## [7] │ <achieve>
## [13] │ <address>
## [15] │ <advertise>
## [19] │ <afternoon>
## [21] │ <against>
## [31] │ <already>
## [32] │ <alright>
## [34] │ <although>
## [36] │ <america>
## [39] │ <another>
## [43] │ <apparent>
## [46] │ <appoint>
## [47] │ <approach>
## [48] │ <appropriate>
## [53] │ <arrange>
## [57] │ <associate>
## [61] │ <authority>
## [62] │ <available>
## ... (remaining matches not shown)
# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
## [2] │ <a.c>
#> [2] │ <a.c>
str_view(c("abc", "a.c", "a*c", "a c"), ".[*]c")
## [3] │ <a*c>
#> [3] │ <a*c>
str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]")
## [4] │ <a >c
#> [4] │ <a >c
str_view(c("grey", "gray"), "gr(e|a)y")
## [1] │ <grey>
## [2] │ <gray>
#> [1] │ <grey>
#> [2] │ <gray>
Start with a vowel. That only contain consonants. (Hint: thinking about matching “not”-vowels.) End with ed, but not with eed. End with ing or ise.
Words starting with vowels
str_subset(stringr::words, "^[aeiou]")
## [1] "a" "able" "about" "absolute" "accept"
## [6] "account" "achieve" "across" "act" "active"
## [11] "actual" "add" "address" "admit" "advertise"
## [16] "affect" "afford" "after" "afternoon" "again"
## [21] "against" "age" "agent" "ago" "agree"
## [26] "air" "all" "allow" "almost" "along"
## [31] "already" "alright" "also" "although" "always"
## [36] "america" "amount" "and" "another" "answer"
## [41] "any" "apart" "apparent" "appear" "apply"
## [46] "appoint" "approach" "appropriate" "area" "argue"
## [51] "arm" "around" "arrange" "art" "as"
## [56] "ask" "associate" "assume" "at" "attend"
## [61] "authority" "available" "aware" "away" "awful"
## [66] "each" "early" "east" "easy" "eat"
## [71] "economy" "educate" "effect" "egg" "eight"
## [76] "either" "elect" "electric" "eleven" "else"
## [81] "employ" "encourage" "end" "engine" "english"
## [86] "enjoy" "enough" "enter" "environment" "equal"
## [91] "especial" "europe" "even" "evening" "ever"
## [96] "every" "evidence" "exact" "example" "except"
## [101] "excuse" "exercise" "exist" "expect" "expense"
## [106] "experience" "explain" "express" "extra" "eye"
## [111] "idea" "identify" "if" "imagine" "important"
## [116] "improve" "in" "include" "income" "increase"
## [121] "indeed" "individual" "industry" "inform" "inside"
## [126] "instead" "insure" "interest" "into" "introduce"
## [131] "invest" "involve" "issue" "it" "item"
## [136] "obvious" "occasion" "odd" "of" "off"
## [141] "offer" "office" "often" "okay" "old"
## [146] "on" "once" "one" "only" "open"
## [151] "operate" "opportunity" "oppose" "or" "order"
## [156] "organize" "original" "other" "otherwise" "ought"
## [161] "out" "over" "own" "under" "understand"
## [166] "union" "unit" "unite" "university" "unless"
## [171] "until" "up" "upon" "use" "usual"
#> [1] "a" "able" "about" "absolute" "accept"
#> [6] "account" "achieve" "across" "act" "active"
#> [11] "actual" "add" "address" "admit" "advertise"
#> [16] "affect" "afford" "after" "afternoon" "again"
#> [21] "against" "age" "agent" "ago" "agree"
#> [26] "air" "all" "allow" "almost" "along"
#> [31] "already" "alright" "also" "although" "always"
#> [36] "america" "amount" "and" "another" "answer"
#> [41] "any" "apart" "apparent" "appear" "apply"
#> [46] "appoint" "approach" "appropriate" "area" "argue"
#> [51] "arm" "around" "arrange" "art" "as"
#> [56] "ask" "associate" "assume" "at" "attend"
#> [61] "authority" "available" "aware" "away" "awful"
#> [66] "each" "early" "east" "easy" "eat"
#> [71] "economy" "educate" "effect" "egg" "eight"
#> [76] "either" "elect" "electric" "eleven" "else"
#> [81] "employ" "encourage" "end" "engine" "english"
#> [86] "enjoy" "enough" "enter" "environment" "equal"
#> [91] "especial" "europe" "even" "evening" "ever"
#> [96] "every" "evidence" "exact" "example" "except"
#> [101] "excuse" "exercise" "exist" "expect" "expense"
#> [106] "experience" "explain" "express" "extra" "eye"
#> [111] "idea" "identify" "if" "imagine" "important"
#> [116] "improve" "in" "include" "income" "increase"
#> [121] "indeed" "individual" "industry" "inform" "inside"
#> [126] "instead" "insure" "interest" "into" "introduce"
#> [131] "invest" "involve" "issue" "it" "item"
#> [136] "obvious" "occasion" "odd" "of" "off"
#> [141] "offer" "office" "often" "okay" "old"
#> [146] "on" "once" "one" "only" "open"
#> [151] "operate" "opportunity" "oppose" "or" "order"
#> [156] "organize" "original" "other" "otherwise" "ought"
#> [161] "out" "over" "own" "under" "understand"
#> [166] "union" "unit" "unite" "university" "unless"
#> [171] "until" "up" "upon" "use" "usual"
Words that contain only consonants: Use the negate argument of str_subset.
str_subset(stringr::words, "[aeiou]", negate=TRUE)
## [1] "by" "dry" "fly" "mrs" "try" "why"
#> [1] "by" "dry" "fly" "mrs" "try" "why"
Alternatively, using str_view() the consonant-only words are:
str_view(stringr::words, "[aeiou]", match=FALSE)
## [123] │ by
## [249] │ dry
## [328] │ fly
## [538] │ mrs
## [895] │ try
## [952] │ why
Words that end with “-ed” but not ending in “-eed”
str_subset(stringr::words, "[^e]ed$")
## [1] "bed" "hundred" "red"
#> [1] "bed" "hundred" "red"
str_subset(c("ed", stringr::words), "(^|[^e])ed$")
## [1] "ed" "bed" "hundred" "red"
#> [1] "ed" "bed" "hundred" "red"
Words ending in ing or ise:
str_subset(stringr::words, "i(ng|se)$")
## [1] "advertise" "bring" "during" "evening" "exercise" "king"
## [7] "meaning" "morning" "otherwise" "practise" "raise" "realise"
## [13] "ring" "rise" "sing" "surprise" "thing"
#> [1] "advertise" "bring" "during" "evening" "exercise" "king"
#> [7] "meaning" "morning" "otherwise" "practise" "raise" "realise"
#> [13] "ring" "rise" "sing" "surprise" "thing"
2. Empirically verify the rule “i before e except after c”.
length(str_subset(stringr::words, "(cei|[^c]ie)"))
## [1] 14
#> [1] 14
length(str_subset(stringr::words, "(cie|[^c]ei)"))
## [1] 3
#> [1] 3
str_view(stringr::words, "q[^u]", match = TRUE)
Write a regular expression that matches a word if it’s probably written in British English, not American English. In the general case, this is hard, and could require a dictionary. But, there are a few heuristics to consider that would account for some common cases. British English tends to use the following: “ou” instead of “o”; “ae” and “oe” instead of “a” and “o”; words ending in “ise” instead of “ize”; words ending in “yse”. The regex ou|ise$|ae|oe|yse$ would match these.
Create a regular expression that will match telephone numbers as commonly written in your country.
x <- c("123-456-7890", "(123)456-7890", "(123) 456-7890", "1235-2351")
str_view(x, "\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d")
## [1] │ <123-456-7890>
str_view(x, "[0-9][0-9][0-9]-[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]")
## [1] │ <123-456-7890>
str_view(x, "\\(\\d\\d\\d\\)\\s*\\d\\d\\d-\\d\\d\\d\\d")
## [2] │ <(123)456-7890>
## [3] │ <(123) 456-7890>
str_view(x, "\\([0-9][0-9][0-9]\\)[ ]*[0-9][0-9][0-9]-[0-9][0-9][0-9][0-9]")
## [2] │ <(123)456-7890>
## [3] │ <(123) 456-7890>
str_view(x, "\\d{3}-\\d{3}-\\d{4}")
## [1] │ <123-456-7890>
str_view(x, "\\(\\d{3}\\)\\s*\\d{3}-\\d{4}")
## [2] │ <(123)456-7890>
## [3] │ <(123) 456-7890>
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
str_view(x, "CC+")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, 'C[LX]+')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII
str_view(x, "C{2}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, "C{2,}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, "C{2,3}")
## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, 'C{2,3}?')
## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, 'C[LX]+?')
## [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII
#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
^.*$ "\\{.+\\}" \d{4}-\d{2}-\d{2} "\\\\{4}"
^.*$ will match any string. For example: ^.*$: c("dog", "$1.23", "lorem ipsum").
"\\{.+\\}" will match any string with curly braces surrounding at least one character. For example: "\\{.+\\}": c("{a}", "{abc}").
\d{4}-\d{2}-\d{2} will match four digits followed by a hyphen, followed by two digits followed by a hyphen, followed by another two digits. This is a regular expression that can match dates formatted like “YYYY-MM-DD” (“%Y-%m-%d”). For example: \d{4}-\d{2}-\d{2}: 2018-01-11
"\\\\{4}" is the regular expression \\{4}, which will match four backslashes. For example: "\\\\{4}": "\\\\\\\\".
Create regular expressions to find all words that:
Start with three consonants. Have three or more vowels in a row. Have two or more vowel-consonant pairs in a row.
This regex finds all words starting with three consonants.
str_view(words, "^[^aeiou]{3}", match = TRUE)
## [150] │ <Chr>ist
## [151] │ <Chr>istmas
## [249] │ <dry>
## [328] │ <fly>
## [538] │ <mrs>
## [724] │ <sch>eme
## [725] │ <sch>ool
## [811] │ <str>aight
## [812] │ <str>ategy
## [813] │ <str>eet
## [814] │ <str>ike
## [815] │ <str>ong
## [816] │ <str>ucture
## [836] │ <sys>tem
## [868] │ <thr>ee
## [869] │ <thr>ough
## [870] │ <thr>ow
## [895] │ <try>
## [901] │ <typ>e
## [952] │ <why>
This regex finds three or more vowels in a row:
str_view(words, "[aeiou]{3,}", match = TRUE)
## [79] │ b<eau>ty
## [565] │ obv<iou>s
## [644] │ prev<iou>s
## [670] │ q<uie>t
## [741] │ ser<iou>s
## [915] │ var<iou>s
This regex finds two or more vowel-consonant pairs in a row.
str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)
## [4] │ abs<olut>e
## [23] │ <agen>t
## [30] │ <alon>g
## [36] │ <americ>a
## [39] │ <anot>her
## [42] │ <apar>t
## [43] │ app<aren>t
## [61] │ auth<orit>y
## [62] │ ava<ilab>le
## [63] │ <awar>e
## [64] │ <away>
## [70] │ b<alan>ce
## [75] │ b<asis>
## [81] │ b<ecom>e
## [83] │ b<efor>e
## [84] │ b<egin>
## [85] │ b<ehin>d
## [87] │ b<enefit>
## [119] │ b<usines>s
## [143] │ ch<arac>ter
## ... and 149 more
str_view(fruit, "(..)\\1", match = TRUE)
## [4] │ b<anan>a
## [20] │ <coco>nut
## [22] │ <cucu>mber
## [41] │ <juju>be
## [56] │ <papa>ya
## [73] │ s<alal> berry
#> [4] │ b<anan>a
#> [20] │ <coco>nut
#> [22] │ <cucu>mber
#> [41] │ <juju>be
#> [56] │ <papa>ya
#> [73] │ s<alal> berry
1.Describe, in words, what these expressions will match:
(.)\1\1 "(.)(.)\\2\\1" (..)\1 "(.).\\1.\\1" "(.)(.)(.).*\\3\\2\\1"
(.)\1\1: The same character appearing three times in a row. E.g. “aaa”. "(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order. E.g. “abba”. (..)\1: Any two characters repeated. E.g. “a1a1”. "(.).\\1.\\1": A character followed by any character, the original character, any character, and the original character again. E.g. “abaca”, “b8b.b”. "(.)(.)(.).*\\3\\2\\1": Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order. E.g. “abcsgasgddsadgsdgcba” or “abccba” or “abc1cba”. 2. Construct regular expressions to match words that:
Start and end with the same character.
Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.)
Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
This regular expression matches words that start and end with the same character.
str_subset(words, "^(.)((.*\\1$)|\\1?$)")
## [1] "a" "america" "area" "dad" "dead"
## [6] "depend" "educate" "else" "encourage" "engine"
## [11] "europe" "evidence" "example" "excuse" "exercise"
## [16] "expense" "experience" "eye" "health" "high"
## [21] "knock" "level" "local" "nation" "non"
## [26] "rather" "refer" "remember" "serious" "stairs"
## [31] "test" "tonight" "transport" "treat" "trust"
## [36] "window" "yesterday"
#> [1] "a" "america" "area" "dad" "dead"
#> [6] "depend" "educate" "else" "encourage" "engine"
#> [11] "europe" "evidence" "example" "excuse" "exercise"
#> [16] "expense" "experience" "eye" "health" "high"
#> [21] "knock" "level" "local" "nation" "non"
#> [26] "rather" "refer" "remember" "serious" "stairs"
#> [31] "test" "tonight" "transport" "treat" "trust"
#> [36] "window" "yesterday"
This regular expression will match any pair of repeated letters, where letters is defined to be the ASCII letters A-Z. First, check that it works with the example in the problem.
str_subset("church", "([A-Za-z][A-Za-z]).*\\1")
## [1] "church"
#> [1] "church"
str_subset(words, "([A-Za-z][A-Za-z]).*\\1")
## [1] "appropriate" "church" "condition" "decide" "environment"
## [6] "london" "paragraph" "particular" "photograph" "prepare"
## [11] "pressure" "remember" "represent" "require" "sense"
## [16] "therefore" "understand" "whether"
#> [1] "appropriate" "church" "condition" "decide" "environment"
#> [6] "london" "paragraph" "particular" "photograph" "prepare"
#> [11] "pressure" "remember" "represent" "require" "sense"
#> [16] "therefore" "understand" "whether"
str_subset("eleven", "([a-z]).*\\1.*\\1")
## [1] "eleven"
#> [1] "eleven"
str_subset(words, "([a-z]).*\\1.*\\1")
## [1] "appropriate" "available" "believe" "between" "business"
## [6] "degree" "difference" "discuss" "eleven" "environment"
## [11] "evidence" "exercise" "expense" "experience" "individual"
## [16] "paragraph" "receive" "remember" "represent" "telephone"
## [21] "therefore" "tomorrow"
#> [1] "appropriate" "available" "believe" "between" "business"
#> [6] "degree" "difference" "discuss" "eleven" "environment"
#> [11] "evidence" "exercise" "expense" "experience" "individual"
#> [16] "paragraph" "receive" "remember" "represent" "telephone"
#> [21] "therefore" "tomorrow"
x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1] TRUE FALSE TRUE
#> [1] TRUE FALSE TRUE
# How many common words start with t?
sum(str_detect(words, "^t"))
## [1] 65
#> [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
#> [1] 0.2765306
# Find all words containing at least one vowel, and negate
no_vowels_1 <- !str_detect(words, "[aeiou]")
# Find all words consisting only of consonants (non-vowels)
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
## [1] TRUE
#> [1] TRUE
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
df <- tibble(
word = words,
i = seq_along(word)
)
df %>%
filter(str_detect(word, "x$"))
## # A tibble: 4 × 2
## word i
## <chr> <int>
## 1 box 108
## 2 sex 747
## 3 six 772
## 4 tax 841
#> # A tibble: 4 × 2
#> word i
#> <chr> <int>
#> 1 box 108
#> 2 sex 747
#> 3 six 772
#> 4 tax 841
x <- c("apple", "banana", "pear")
str_count(x, "a")
## [1] 1 3 1
#> [1] 1 3 1
# On average, how many vowels per word?
mean(str_count(words, "[aeiou]"))
## [1] 1.991837
#> [1] 1.991837
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
## # A tibble: 980 × 4
## word i vowels consonants
## <chr> <int> <int> <int>
## 1 a 1 1 0
## 2 able 2 2 2
## 3 about 3 3 2
## 4 absolute 4 4 4
## 5 accept 5 2 4
## 6 account 6 3 4
## 7 achieve 7 4 3
## 8 across 8 2 4
## 9 act 9 1 2
## 10 active 10 3 3
## # ℹ 970 more rows
#> # A tibble: 980 × 4
#> word i vowels consonants
#> <chr> <int> <int> <int>
#> 1 a 1 1 0
#> 2 able 2 2 2
#> 3 about 3 3 2
#> 4 absolute 4 4 4
#> 5 accept 5 2 4
#> 6 account 6 3 4
#> # ℹ 974 more rows
str_count("abababa", "aba")
## [1] 2
#> [1] 2
# Show which parts of "abababa" the pattern matches. Matches never overlap,
# which is why str_count() reports 2 rather than 3. str_view() replaces
# str_view_all(), which was deprecated in stringr 1.5.0 and only added a
# lifecycle warning to the same output.
str_view("abababa", "aba")
## [1] │ <aba>b<aba>
#> [1] │ <aba>b<aba>
Find all words that start or end with x.
# one regex
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
# split regex into parts
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
Find all words that start with a vowel and end with a consonant.
str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()
## [1] "about" "accept" "account" "across" "act" "actual"
#> [1] "about" "accept" "account" "across" "act" "actual"
start_with_vowel <- str_detect(words, "^[aeiou]")
end_with_consonant <- str_detect(words, "[^aeiou]$")
words[start_with_vowel & end_with_consonant] %>% head()
## [1] "about" "accept" "account" "across" "act" "actual"
#> [1] "about" "accept" "account" "across" "act" "actual"
Are there any words that contain at least one of each different vowel? There is not a simple regular expression to match words that contain at least one of each vowel.
vowels <- str_count(words, "[aeiou]")
words[which(vowels == max(vowels))]
## [1] "appropriate" "associate" "available" "colleague" "encourage"
## [6] "experience" "individual" "television"
#> [1] "appropriate" "associate" "available" "colleague" "encourage"
#> [6] "experience" "individual" "television"
prop_vowels <- str_count(words, "[aeiou]") / str_length(words)
words[which(prop_vowels == max(prop_vowels))]
## [1] "a"
#> [1] "a"
length(sentences)
## [1] 720
#> [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
#> [6] "The juice of lemons makes fine punch."
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
#> [1] "red|orange|yellow|green|blue|purple"
has_colour <- str_subset(sentences, colour_match)
matches <- str_extract(has_colour, colour_match)
head(matches)
## [1] "blue" "blue" "red" "red" "red" "blue"
#> [1] "blue" "blue" "red" "red" "red" "blue"
more <- sentences[str_count(sentences, colour_match) > 1]
# Highlight every colour name in the sentences that contain more than one.
# str_view() is used because str_view_all() was deprecated in stringr 1.5.0.
# Note that "flickered" is matched because it contains "red".
str_view(more, colour_match)
## [1] │ It is hard to erase <blue> or <red> ink.
## [2] │ The <green> light in the brown box flicke<red>.
## [3] │ The sky in the west is tinged with <orange> <red>.
#> [1] │ It is hard to erase <blue> or <red> ink.
#> [2] │ The <green> light in the brown box flicke<red>.
#> [3] │ The sky in the west is tinged with <orange> <red>.
str_extract(more, colour_match)
## [1] "blue" "green" "orange"
#> [1] "blue" "green" "orange"
str_extract_all(more, colour_match)
## [[1]]
## [1] "blue" "red"
##
## [[2]]
## [1] "green" "red"
##
## [[3]]
## [1] "orange" "red"
#> [[1]]
#> [1] "blue" "red"
#>
#> [[2]]
#> [1] "green" "red"
#>
#> [[3]]
#> [1] "orange" "red"
str_extract_all(more, colour_match, simplify = TRUE)
## [,1] [,2]
## [1,] "blue" "red"
## [2,] "green" "red"
## [3,] "orange" "red"
#> [,1] [,2]
#> [1,] "blue" "red"
#> [2,] "green" "red"
#> [3,] "orange" "red"
x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "a" "" ""
## [2,] "a" "b" ""
## [3,] "a" "b" "c"
#> [,1] [,2] [,3]
#> [1,] "a" "" ""
#> [2,] "a" "b" ""
#> [3,] "a" "b" "c"
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match2 <- str_c("\\b(", str_c(colours, collapse = "|"), ")\\b")
colour_match2
## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
#> [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
more2 <- sentences[str_count(sentences, colour_match) > 1]
# Same sentences, but with the colours wrapped in word boundaries (\b) so that
# "flickered" is no longer reported as containing "red". str_view() is used
# because str_view_all() was deprecated in stringr 1.5.0.
str_view(more2, colour_match2, match = TRUE)
## [1] │ It is hard to erase <blue> or <red> ink.
## [2] │ The <green> light in the brown box flickered.
## [3] │ The sky in the west is tinged with <orange> <red>.
The first word from each sentence.
# First run of ASCII letters in each sentence. This stops at an apostrophe, so
# "It's" is truncated to "It". (The character class previously read
# [A-ZAa-z]; the stray "A" was a typo and did not change the matches.)
str_extract(sentences, "[A-Za-z]+") %>% head()
## [1] "The" "Glue" "It" "These" "Rice" "The"
#> [1] "The" "Glue" "It" "These" "Rice" "The"
str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()
## [1] "The" "Glue" "It's" "These" "Rice" "The"
#> [1] "The" "Glue" "It's" "These" "Rice" "The"
All words ending in ing.
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>%
head()
## [1] "spring" "evening" "morning" "winding" "living" "king"
#> [1] "spring" "evening" "morning" "winding" "living" "king"
All plurals.
unique(unlist(str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b"))) %>%
head()
## [1] "planks" "days" "bowls" "lemons" "makes" "hogs"
#> [1] "planks" "days" "bowls" "lemons" "makes" "hogs"
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
#> [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
#> [6] "the sun" "the huge" "the ball" "the woman" "a helps"
has_noun %>%
str_match(noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"
#> [,1] [,2] [,3]
#> [1,] "the smooth" "the" "smooth"
#> [2,] "the sheet" "the" "sheet"
#> [3,] "the depth" "the" "depth"
#> [4,] "a chicken" "a" "chicken"
#> [5,] "the parked" "the" "parked"
#> [6,] "the sun" "the" "sun"
#> [7,] "the huge" "the" "huge"
#> [8,] "the ball" "the" "ball"
#> [9,] "the woman" "the" "woman"
#> [10,] "a helps" "a" "helps"
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article", "noun"), "(a|the) ([^ ]+)",
remove = FALSE
)
## # A tibble: 720 × 3
## sentence article noun
## <chr> <chr> <chr>
## 1 The birch canoe slid on the smooth planks. the smooth
## 2 Glue the sheet to the dark blue background. the sheet
## 3 It's easy to tell the depth of a well. the depth
## 4 These days a chicken leg is a rare dish. a chicken
## 5 Rice is often served in round bowls. <NA> <NA>
## 6 The juice of lemons makes fine punch. <NA> <NA>
## 7 The box was thrown beside the parked truck. the parked
## 8 The hogs were fed chopped corn and garbage. <NA> <NA>
## 9 Four hours of steady work faced us. <NA> <NA>
## 10 A large size in stockings is hard to sell. <NA> <NA>
## # ℹ 710 more rows
#> # A tibble: 720 × 3
#> sentence article noun
#> <chr> <chr> <chr>
#> 1 The birch canoe slid on the smooth planks. the smooth
#> 2 Glue the sheet to the dark blue background. the sheet
#> 3 It's easy to tell the depth of a well. the depth
#> 4 These days a chicken leg is a rare dish. a chicken
#> 5 Rice is often served in round bowls. <NA> <NA>
#> 6 The juice of lemons makes fine punch. <NA> <NA>
#> # ℹ 714 more rows
# A number word (one to ten) at a word boundary, one or more spaces, then the
# word that follows it.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
# Keep only the sentences containing such a pair, then pull out the first
# pair found in each of them.
str_extract(str_subset(sentences, numword), numword)
## [1] "seven books" "two met" "two factors" "three lists"
## [5] "seven is" "two when" "ten inches" "one war"
## [9] "one button" "six minutes" "ten years" "two shares"
## [13] "two distinct" "five cents" "two pins" "five robins"
## [17] "four kinds" "three story" "three inches" "six comes"
## [21] "three batches" "two leaves"
#> [1] "seven books" "two met" "two factors" "three lists"
#> [5] "seven is" "two when" "ten inches" "one war"
#> [9] "one button" "six minutes" "ten years" "two shares"
#> [13] "two distinct" "five cents" "two pins" "five robins"
#> [17] "four kinds" "three story" "three inches" "six comes"
#> [21] "three batches" "two leaves"
# Letters, an apostrophe, then more letters.
contraction <- "([A-Za-z]+)'([A-Za-z]+)"
# For every sentence that has one, take its first contraction and break it
# into the pieces on either side of the apostrophe.
str_split(
  str_extract(str_subset(sentences, contraction), contraction),
  pattern = "'"
)
## [[1]]
## [1] "It" "s"
##
## [[2]]
## [1] "man" "s"
##
## [[3]]
## [1] "don" "t"
##
## [[4]]
## [1] "store" "s"
##
## [[5]]
## [1] "workman" "s"
##
## [[6]]
## [1] "Let" "s"
##
## [[7]]
## [1] "sun" "s"
##
## [[8]]
## [1] "child" "s"
##
## [[9]]
## [1] "king" "s"
##
## [[10]]
## [1] "It" "s"
##
## [[11]]
## [1] "don" "t"
##
## [[12]]
## [1] "queen" "s"
##
## [[13]]
## [1] "don" "t"
##
## [[14]]
## [1] "don" "t"
##
## [[15]]
## [1] "don" "t"
##
## [[16]]
## [1] "don" "t"
##
## [[17]]
## [1] "pirate" "s"
##
## [[18]]
## [1] "neighbor" "s"
#> [[1]]
#> [1] "It" "s"
#>
#> [[2]]
#> [1] "man" "s"
#>
#> [[3]]
#> [1] "don" "t"
#>
#> [[4]]
#> [1] "store" "s"
#>
#> [[5]]
#> [1] "workmen" "s"
#>
#> [[6]]
#> [1] "Let" "s"
#>
#> [[7]]
#> [1] "sun" "s"
#>
#> [[8]]
#> [1] "child" "s"
#>
#> [[9]]
#> [1] "king" "s"
#>
#> [[10]]
#> [1] "It" "s"
#>
#> [[11]]
#> [1] "don" "t"
#>
#> [[12]]
#> [1] "queen" "s"
#>
#> [[13]]
#> [1] "don" "t"
#>
#> [[14]]
#> [1] "pirate" "s"
#>
#> [[15]]
#> [1] "neighbor" "s"
x <- c("apple", "pear", "banana")
# Only the first vowel of each element is replaced.
x %>% str_replace("[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"
#> [1] "-pple" "p-ar" "b-nana"
# Every vowel is replaced.
x %>% str_replace_all("[aeiou]", "-")
## [1] "-ppl-" "p--r" "b-n-n-"
#> [1] "-ppl-" "p--r" "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
# A named vector supplies several pattern = replacement pairs in one call.
x %>% str_replace_all(c(`1` = "one", `2` = "two", `3` = "three"))
## [1] "one house" "two cars" "three people"
#> [1] "one house" "two cars" "three people"
# Swap the second and third space-separated words of each sentence via
# backreferences, and show the first five results.
head(
  str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"),
  n = 5
)
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."
#> [1] "The canoe birch slid on the smooth planks."
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."
#> [4] "These a days chicken leg is a rare dish."
#> [5] "Rice often is served in round bowls."
# Turn every forward slash into a backslash. The replacement needs four
# backslashes in R source: two for the string escape, doubled again because
# a backslash is also the escape character inside the replacement.
"past/present/future" %>% str_replace_all("/", "\\\\")
## [1] "past\\present\\future"
#> [1] "past\\present\\future"
# Lookup vector for lower-casing: the names are the upper-case letters A-Z
# and the values are the matching lower-case letters.
replacements <- setNames(letters, LETTERS)
# Apply every upper-case -> lower-case mapping to the word list, then peek at
# the first few results.
lower_words <- words %>% str_replace_all(replacements)
lower_words %>% head()
## [1] "a" "able" "about" "absolute" "accept" "account"
#> [1] "a" "able" "about" "absolute" "accept" "account"
# Exchange the first and last letter of each word. The pattern needs at least
# two letters, so shorter words are left unchanged.
swapped <- words %>%
  str_replace_all("^([A-Za-z])(.*)([A-Za-z])$", "\\3\\2\\1")
# Swapped strings that are themselves entries of `words`.
swapped %>% intersect(words)
## [1] "a" "america" "area" "dad" "dead"
## [6] "lead" "read" "depend" "god" "educate"
## [11] "else" "encourage" "engine" "europe" "evidence"
## [16] "example" "excuse" "exercise" "expense" "experience"
## [21] "eye" "dog" "health" "high" "knock"
## [26] "deal" "level" "local" "nation" "on"
## [31] "non" "no" "rather" "dear" "refer"
## [36] "remember" "serious" "stairs" "test" "tonight"
## [41] "transport" "treat" "trust" "window" "yesterday"
#> [1] "a" "america" "area" "dad" "dead"
#> [6] "lead" "read" "depend" "god" "educate"
#> [11] "else" "encourage" "engine" "europe" "evidence"
#> [16] "example" "excuse" "exercise" "expense" "experience"
#> [21] "eye" "dog" "health" "high" "knock"
#> [26] "deal" "level" "local" "nation" "on"
#> [31] "non" "no" "rather" "dear" "refer"
#> [36] "remember" "serious" "stairs" "test" "tonight"
#> [41] "transport" "treat" "trust" "window" "yesterday"
# Same first/last letter swap, written with the POSIX [[:alpha:]] class
# instead of an explicit A-Z/a-z range.
swapped2 <- str_replace_all(
  words,
  pattern = "^([[:alpha:]])(.*)([[:alpha:]])$",
  replacement = "\\3\\2\\1"
)
# Swapped strings that are themselves entries of `words`.
intersect(x = swapped2, y = words)
## [1] "a" "america" "area" "dad" "dead"
## [6] "lead" "read" "depend" "god" "educate"
## [11] "else" "encourage" "engine" "europe" "evidence"
## [16] "example" "excuse" "exercise" "expense" "experience"
## [21] "eye" "dog" "health" "high" "knock"
## [26] "deal" "level" "local" "nation" "on"
## [31] "non" "no" "rather" "dear" "refer"
## [36] "remember" "serious" "stairs" "test" "tonight"
## [41] "transport" "treat" "trust" "window" "yesterday"
#> [1] "a" "america" "area" "dad" "dead"
#> [6] "lead" "read" "depend" "god" "educate"
#> [11] "else" "encourage" "engine" "europe" "evidence"
#> [16] "example" "excuse" "exercise" "expense" "experience"
#> [21] "eye" "dog" "health" "high" "knock"
#> [26] "deal" "level" "local" "nation" "on"
#> [31] "non" "no" "rather" "dear" "refer"
#> [36] "remember" "serious" "stairs" "test" "tonight"
#> [41] "transport" "treat" "trust" "window" "yesterday"
# Break each of the first five sentences into pieces at every single space;
# the result is a list with one character vector per sentence.
str_split(head(sentences, n = 5), pattern = " ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
#> [[1]]
#> [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
#> [8] "planks."
#>
#> [[2]]
#> [1] "Glue" "the" "sheet" "to" "the"
#> [6] "dark" "blue" "background."
#>
#> [[3]]
#> [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
#>
#> [[4]]
#> [1] "These" "days" "a" "chicken" "leg" "is" "a"
#> [8] "rare" "dish."
#>
#> [[5]]
#> [1] "Rice" "is" "often" "served" "in" "round" "bowls."
"a|b|c|d" %>%
str_split("\\|") %>%
.[[1]]
## [1] "a" "b" "c" "d"
#> [1] "a" "b" "c" "d"
sentences %>%
head(5) %>%
str_split(" ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background."
## [3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a"
## [4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare"
## [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ""
## [,9]
## [1,] ""
## [2,] ""
## [3,] "well."
## [4,] "dish."
## [5,] ""
#> [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
#> [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
#> [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background."
#> [3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a"
#> [4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare"
#> [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ""
#> [,9]
#> [1,] ""
#> [2,] ""
#> [3,] "well."
#> [4,] "dish."
#> [5,] ""
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
# Split each "key: value" entry into at most two pieces and return them as a
# two-column matrix.
str_split(fields, pattern = ": ", n = 2, simplify = TRUE)
## [,1] [,2]
## [1,] "Name" "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age" "35"
#> [,1] [,2]
#> [1,] "Name" "Hadley"
#> [2,] "Country" "NZ"
#> [3,] "Age" "35"
# Two spaces follow the first full stop on purpose: they are what produces
# the empty string in the plain-space split shown below.
x <- "This is a sentence.  This is another sentence."
# str_view() highlights every match. str_view_all() has been deprecated since
# stringr 1.5.0 and only forwards to str_view() with a warning.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.  <This> <is> <another> <sentence>.
#> [1] │ <This> <is> <a> <sentence>.  <This> <is> <another> <sentence>.
# Splitting on a single space keeps punctuation attached to the words and
# yields an empty string between the two consecutive spaces.
str_split(x, " ")[[1]]
## [1] "This" "is" "a" "sentence." "" "This"
## [7] "is" "another" "sentence."
#> [1] "This" "is" "a" "sentence." "" "This"
#> [7] "is" "another" "sentence."
# Splitting on word boundaries returns only the words themselves.
str_split(x, boundary("word"))[[1]]
## [1] "This" "is" "a" "sentence" "This" "is" "another"
## [8] "sentence"
#> [1] "This" "is" "a" "sentence" "This" "is" "another"
#> [8] "sentence"
x <- "apples, pears, and bananas"
# Separator: a comma, one or more spaces, and an optional "and" followed by
# spaces.
x %>%
  str_split(", +(and +)?") %>%
  .[[1]]
## [1] "apples" "pears" "bananas"
#> [1] "apples" "pears" "bananas"
# An empty pattern splits the string into its individual characters.
"ab. cd|agt" %>%
  str_split("") %>%
  .[[1]]
## [1] "a" "b" "." " " "c" "d" "|" "a" "g" "t"
#> [1] "a" "b" "." " " "c" "d" "|" "a" "g" "t"
# Passing a bare string as the pattern ...
fruit %>% str_view("nana")
## [4] │ ba<nana>
# ... is shorthand for wrapping it in regex() explicitly:
fruit %>% str_view(regex("nana"))
## [4] │ ba<nana>
bananas <- c("banana", "Banana", "BANANA")
# Matching is case sensitive by default, so only the lower-case element hits.
str_view(string = bananas, pattern = "banana")
## [1] │ <banana>
#> [1] │ <banana>
# ignore_case = TRUE makes all three spellings match.
str_view(string = bananas, pattern = regex("banana", ignore_case = TRUE))
## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>
#> [1] │ <banana>
#> [2] │ <Banana>
#> [3] │ <BANANA>
x <- "Line 1\nLine 2\nLine 3"
# By default ^ anchors only at the start of the whole string.
x %>%
  str_extract_all("^Line") %>%
  .[[1]]
## [1] "Line"
#> [1] "Line"
# With multiline = TRUE, ^ also matches at the start of every line.
x %>%
  str_extract_all(regex("^Line", multiline = TRUE)) %>%
  .[[1]]
## [1] "Line" "Line" "Line"
#> [1] "Line" "Line" "Line"
# Commented (comments = TRUE) pattern for a phone number written as three
# digits, three digits, four digits, with optional separators.
# In this mode all unescaped whitespace in the pattern is ignored, including
# inside [...], so a literal space has to be written as "\\ ".
phone <- regex("
  \\(?      # optional opening parens
  (\\d{3})  # area code
  [)\\ -]?  # optional closing parens, space, or dash
  (\\d{3})  # another three numbers
  [\\ -]?   # optional space or dash
  (\\d{4})  # four more numbers
  ", comments = TRUE)
# Column 1 is the complete match; columns 2-4 are the three digit groups.
str_match("514-791-8141", phone)
## [,1] [,2] [,3] [,4]
## [1,] "514-791-8141" "514" "791" "8141"
#> [,1] [,2] [,3] [,4]
#> [1,] "514-791-8141" "514" "791" "8141"
# Time the same detection done two ways: `fixed` matches "the" as a literal
# string, `regex` passes the bare string, which is treated as a regular
# expression. Each expression is evaluated 20 times.
microbenchmark::microbenchmark(
fixed = str_detect(sentences, fixed("the")),
regex = str_detect(sentences, "the"),
times = 20
)
## Unit: microseconds
## expr min lq mean median uq max neval
## fixed 119.126 121.5420 141.2255 126.7715 133.980 388.209 20
## regex 410.001 415.4175 429.8818 420.0215 425.063 605.542 20
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> fixed 61.7 66.20 92.915 82.1 88.30 354.2 20
#> regex 272.5 279.85 293.130 283.3 288.65 476.1 20
# Two ways to write the same accented letter: a1 is the single code point
# U+00E1, a2 is a plain "a" followed by the code point U+0301.
a1 <- "\u00e1"
a2 <- "a\u0301"
# Print both; the recorded output below shows them rendering alike.
c(a1, a2)
## [1] "á" "á"
#> [1] "á" "á"
# `==` sees two different character sequences.
a1 == a2
## [1] FALSE
#> [1] FALSE
# fixed() compares the literal bytes, so it does not match either.
str_detect(string = a1, pattern = fixed(a2))
## [1] FALSE
#> [1] FALSE
# coll() compares using collation rules and treats the two as the same.
str_detect(string = a1, pattern = coll(a2))
## [1] TRUE
#> [1] TRUE
# Case-insensitive matching depends on the locale as well. The vector below
# holds dotted and dotless forms of "i" in both cases.
i <- c("I", "İ", "i", "ı")
i
## [1] "I" "İ" "i" "ı"
#> [1] "I" "İ" "i" "ı"
# Without an explicit locale, "i" pairs with the dotless capital "I".
i %>% str_subset(coll("i", ignore_case = TRUE))
## [1] "I" "i"
#> [1] "I" "i"
# Under the Turkish locale, "i" pairs with the dotted capital instead.
i %>% str_subset(coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "İ" "i"
#> [1] "İ" "i"
# Show the locale details reported by stringi for this session.
stringi::stri_locale_info()
## $Language
## [1] "en"
##
## $Country
## [1] "US"
##
## $Variant
## [1] ""
##
## $Name
## [1] "en_US"
#> $Language
#> [1] "c"
#>
#> $Country
#> [1] ""
#>
#> $Variant
#> [1] ""
#>
#> $Name
#> [1] "c"
x <- "This is a sentence."
# str_view() highlights every match. str_view_all() has been deprecated since
# stringr 1.5.0 and only forwards to str_view() with a warning.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.
#> [1] │ <This> <is> <a> <sentence>.
# Extract the words themselves; the trailing full stop is not part of any
# word match.
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This" "is" "a" "sentence"
#> [[1]]
#> [1] "This" "is" "a" "sentence"
# As a regex, one literal backslash takes four backslashes in R source: the
# string escape halves them, and the regex engine halves them again.
c("a\\b", "ab") %>% str_subset("\\\\")
## [1] "a\\b"
#> [1] "a\\b"
# With fixed() there is no regex layer, so only the string escape is needed.
c("a\\b", "ab") %>% str_subset(fixed("\\"))
## [1] "a\\b"
#> [1] "a\\b"
# Five most frequent words across all sentences, ignoring case: pull out
# every word, lower-case it, tally, and keep the top of the sorted counts.
tibble(
  word = str_to_lower(unlist(str_extract_all(sentences, boundary("word"))))
) %>%
  count(word, sort = TRUE) %>%
  head(5)
## # A tibble: 5 × 2
## word n
## <chr> <int>
## 1 the 744
## 2 a 213
## 3 of 132
## 4 to 123
## 5 and 118
#> # A tibble: 5 x 2
#> word n
#> <chr> <int>
#> 1 the 751
#> 2 a 202
#> 3 of 132
#> 4 to 123
#> 5 and 118