library(tidyverse)
## -- Attaching packages -------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#Sections: Introduction, Prerequisites, String Basics, String Length, Combining Strings, Subsetting Strings, Locales; Exercises
#14.2 String basics
# Either quote style delimits a string; single quotes are handy when the text
# itself contains double quotes.
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'

# A literal quote character: wrap it in the other quote style, or escape it.
double_quote <- '"' # or "\""
single_quote <- "'" # or '\''

# Printing a character vector shows the escaped representation ...
x <- c(double_quote, "\\")
x
## [1] "\"" "\\"
#> [1] "\"" "\\"
# ... while writeLines() shows the raw contents of each element.
writeLines(text = x)
## "
## \
#> "
#> \
# Characters can also be written as \u escapes.
x <- "\u00b5"
x
## [1] "µ"
#> [1] "µ"
# Several strings are combined into one character vector with c().
c("one", "two", "three")
## [1] "one" "two" "three"
#> [1] "one" "two" "three"
#14.2.1 String length
# str_length() reports the number of characters in each string; NA stays NA.
c("a", "R for data science", NA) %>%
  str_length()
## [1] 1 18 NA
#> [1] 1 18 NA

# 14.2.2 Combining strings
# str_c() joins its arguments element-wise; the default separator is "".
str_c("x", "y", sep = "")
## [1] "xy"
#> [1] "xy"
str_c("x", "y", "z", sep = "")
## [1] "xyz"
#> [1] "xyz"
str_c("x", "y", sep = ", ")
## [1] "x, y"
#> [1] "x, y"

# A missing value makes the whole combined element missing ...
x <- c("abc", NA)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
#> [1] "|-abc-|" NA
# ... unless it is first turned into the literal string "NA".
x %>%
  str_replace_na() %>%
  str_c("|-", ., "-|")
## [1] "|-abc-|" "|-NA-|"
#> [1] "|-abc-|" "|-NA-|"
# Length-one arguments are recycled against the longer vector.
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
#> [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"

name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
# An `if` without `else` yields NULL when its condition is FALSE, and the
# output below shows that str_c() then leaves the birthday clause out.
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) {
    " and HAPPY BIRTHDAY"
  },
  "."
)
## [1] "Good morning Hadley."
#> [1] "Good morning Hadley."
# `collapse` reduces a character vector to one string.
c("x", "y", "z") %>%
  str_c(collapse = ", ")
## [1] "x, y, z"
#> [1] "x, y, z"
#14.2.3 Subsetting strings
x <- c("Apple", "Banana", "Pear")
# str_sub() extracts the characters between `start` and `end` (inclusive).
str_sub(x, start = 1, end = 3)
## [1] "App" "Ban" "Pea"
#> [1] "App" "Ban" "Pea"
# negative numbers count backwards from end
str_sub(x, start = -3, end = -1)
## [1] "ple" "ana" "ear"
#> [1] "ple" "ana" "ear"
# Asking for more characters than exist returns as much as is available.
str_sub("a", start = 1, end = 5)
## [1] "a"
#> [1] "a"
# The assignment form overwrites the selected range: lower-case first letters.
str_sub(x, start = 1, end = 1) <- x %>%
  str_sub(start = 1, end = 1) %>%
  str_to_lower()
x
## [1] "apple" "banana" "pear"
#> [1] "apple" "banana" "pear"
#14.2.4 Locales
# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
c("i", "ı") %>%
  str_to_upper()
## [1] "I" "I"
#> [1] "I" "I"
c("i", "ı") %>%
  str_to_upper(locale = "tr")
## [1] "I" "I"
#> [1] "İ" "I"
# Sorting depends on the locale as well.
x <- c("apple", "eggplant", "banana")
x %>% str_sort(locale = "en") # English
## [1] "apple" "banana" "eggplant"
#> [1] "apple" "banana" "eggplant"
x %>% str_sort(locale = "haw") # Hawaiian
## [1] "apple" "eggplant" "banana"
#> [1] "apple" "eggplant" "banana"
#14.2.5 Exercises--3
# Extract the middle character of each string; for an even number of
# characters the left one of the two middle characters is taken.
x <- c("a", "abc", "abcd", "abcde", "abcdef")
L <- x %>% str_length()
# Half the length, rounded up, is the position of the middle character.
m <- (L + 1) %/% 2
str_sub(x, start = m, end = m)
## [1] "a" "b" "b" "c" "c"
#> [1] "a" "b" "b" "c" "c"
#2-Sections: Matching patterns with regular expressions, Basic Matches; Exercises: None
#14.3 Matching patterns with regular expressions
#install.packages("htmlwidgets")
x <- c("apple", "banana", "pear")
# The simplest pattern is literal text.
x %>% str_view("an")
# "." stands for any character, so this shows an "a" with a neighbour on each
# side.
x %>% str_view(".a.")
# To create the regular expression, we need \\
dot <- "\\."
# But the expression itself only contains one:
dot %>% writeLines()
## \.
#> \.
# And this tells R to look for an explicit .
c("abc", "a.c", "bef") %>% str_view("a\\.c")
x <- "a\\b"
x %>% writeLines()
## a\b
#> a\b
# Matching one literal backslash takes four backslashes in the R source.
x %>% str_view("\\\\")
#Sections: Anchors; Exercises
x <- c("apple", "banana", "pear")
# "^" anchors the pattern to the start of the string, "$" to the end.
x %>% str_view("^a")
x %>% str_view("a$")
x <- c("apple pie", "apple", "apple cake")
x %>% str_view("apple")
# With both anchors the pattern has to cover the complete string.
x %>% str_view("^apple$")
#Sections: Repetition; Exercises
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
# ?: zero or one, +: one or more.
x %>% str_view("CC?")
x %>% str_view("CC+")
x %>% str_view("C[LX]+")
# {n}: exactly n, {n,}: n or more, {n,m}: between n and m.
x %>% str_view("C{2}")
x %>% str_view("C{2,}")
x %>% str_view("C{2,3}")
# A trailing "?" makes the quantifier lazy: it takes the shortest match.
x %>% str_view("C{2,3}?")
x %>% str_view("C[LX]+?")
#14.3.4.1 Exercises
# Words that start with three consonants. An exact count such as {3} has only
# one possible length, so a lazy "?" modifier adds nothing and is left out.
str_view(words, "^[^aeiou]{3}", match = TRUE)
# Words with three or more vowels in a row. The quantifier is kept greedy so
# the whole run of vowels is highlighted, not just its first three letters.
str_view(words, "[aeiou]{3,}", match = TRUE)
# Words with two or more vowel-consonant pairs in a row.
str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)
#Sections: Grouping and Back references; Exercises: 1
# "(..)" captures two characters and "\\1" requires the same two characters
# again, so this shows fruit names containing a repeated pair of letters.
fruit %>%
  str_view("(..)\\1", match = TRUE)
#14.3.5.1 Exercises
#(.)\1\1: The same character appearing three times in a row. E.g. "aaa"
#"(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order. E.g. "abba".
#(..)\1: Any two characters repeated. E.g. "a1a1".
#"(.).\\1.\\1": A character followed by any character, the original character, any character, and the original character again. E.g. "abaca", "b8b.b".
#"(.)(.)(.).*\\3\\2\\1" Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order. E.g. "abcsgasgddsadgsdgcba" or "abccba" or "abc1cba".
#Sections: Tools, Detect Matches; Exercises
x <- c("apple", "banana", "pear")
# str_detect() returns one TRUE/FALSE per input string.
x %>% str_detect("e")
## [1] TRUE FALSE TRUE
#> [1] TRUE FALSE TRUE
# How many common words start with t?
words %>%
  str_detect("^t") %>%
  sum()
## [1] 65
#> [1] 65
# What proportion of common words end with a vowel?
words %>%
  str_detect("[aeiou]$") %>%
  mean()
## [1] 0.2765306
#> [1] 0.277
# Find all words containing at least one vowel, and negate
no_vowels_1 <- str_detect(words, "[aeiou]", negate = TRUE)
# Find all words consisting only of consonants (non-vowels)
no_vowels_2 <- words %>% str_detect("^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
## [1] TRUE
#> [1] TRUE
# Selecting the matching elements: index with the positions of the matches ...
words[which(str_detect(words, "x$"))]
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
# ... or let str_subset() do both steps at once.
words %>% str_subset("x$")
## [1] "box" "sex" "six" "tax"
#> [1] "box" "sex" "six" "tax"
# Put the words in a tibble together with their position in the vector.
df <- tibble(word = words) %>%
  mutate(i = seq_along(word))
# Keep only the rows whose word ends in "x".
filter(df, str_detect(word, "x$"))
#> # A tibble: 4 x 2
#> word i
#> <chr> <int>
#> 1 box 108
#> 2 sex 747
#> 3 six 772
#> 4 tax 841
x <- c("apple", "banana", "pear")
# str_count() reports how many matches each string contains.
x %>% str_count("a")
## [1] 1 3 1
#> [1] 1 3 1
# On average, how many vowels per word?
words %>%
  str_count("[aeiou]") %>%
  mean()
## [1] 1.991837
#> [1] 1.99
# Add per-word vowel and consonant counts as new columns.
mutate(
  df,
  vowels = str_count(word, "[aeiou]"),
  consonants = str_count(word, "[^aeiou]")
)
#> # A tibble: 980 x 4
#> word i vowels consonants
#> <chr> <int> <int> <int>
#> 1 a 1 1 0
#> 2 able 2 2 2
#> 3 about 3 3 2
#> 4 absolute 4 4 4
#> 5 accept 5 2 4
#> 6 account 6 3 4
#> # … with 974 more rows
# Matches do not overlap: "abababa" yields two matches of "aba", not three.
"abababa" %>% str_count("aba")
## [1] 2
#> [1] 2
"abababa" %>% str_view_all("aba")
#14.4.1.1 Exercises
# Find all words that start or end with x.
# Approach 1: a single regular expression with alternation.
str_view(words, "^x|x$", match = TRUE)
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
# Approach 2: combine two simpler str_detect() calls.
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
## [1] "box" "sex" "six" "tax"
#Find all words that start with a vowel and end with a consonant.
words[str_detect(words, "^[aeiou].*[^aeiou]$")]
## [1] "about" "accept" "account" "across" "act"
## [6] "actual" "add" "address" "admit" "affect"
## [11] "afford" "after" "afternoon" "again" "against"
## [16] "agent" "air" "all" "allow" "almost"
## [21] "along" "already" "alright" "although" "always"
## [26] "amount" "and" "another" "answer" "any"
## [31] "apart" "apparent" "appear" "apply" "appoint"
## [36] "approach" "arm" "around" "art" "as"
## [41] "ask" "at" "attend" "authority" "away"
## [46] "awful" "each" "early" "east" "easy"
## [51] "eat" "economy" "effect" "egg" "eight"
## [56] "either" "elect" "electric" "eleven" "employ"
## [61] "end" "english" "enjoy" "enough" "enter"
## [66] "environment" "equal" "especial" "even" "evening"
## [71] "ever" "every" "exact" "except" "exist"
## [76] "expect" "explain" "express" "identify" "if"
## [81] "important" "in" "indeed" "individual" "industry"
## [86] "inform" "instead" "interest" "invest" "it"
## [91] "item" "obvious" "occasion" "odd" "of"
## [96] "off" "offer" "often" "okay" "old"
## [101] "on" "only" "open" "opportunity" "or"
## [106] "order" "original" "other" "ought" "out"
## [111] "over" "own" "under" "understand" "union"
## [116] "unit" "university" "unless" "until" "up"
## [121] "upon" "usual"
# Words that contain every one of the five vowels: test each vowel on its own
# and combine the five logical vectors with `&`.
words[
  c("a", "e", "i", "o", "u") %>%
    map(~ str_detect(words, .x)) %>%
    reduce(`&`)
]
## character(0)
# Word(s) with the highest proportion of vowels.
prop_vowels <- str_count(words, "[aeiou]") / str_length(words)
words[which(prop_vowels == max(prop_vowels))]
## [1] "a"
#Sections: Extract matches; Exercises: 2
sentences %>% length()
## [1] 720
sentences %>% head()
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
# Build one alternation pattern out of the colour names.
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- colours %>% str_c(collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
# Sentences that mention a colour, and the first colour found in each.
has_colour <- sentences %>% str_subset(colour_match)
matches <- has_colour %>% str_extract(colour_match)
matches %>% head()
## [1] "blue" "blue" "red" "red" "red" "blue"
# Sentences with more than one colour match.
more <- sentences[which(str_count(sentences, colour_match) > 1)]
more %>% str_view_all(colour_match)
# str_extract() keeps only the first match of each string ...
more %>% str_extract(colour_match)
## [1] "blue" "green" "orange"
# ... str_extract_all() returns every match, as a list.
more %>% str_extract_all(colour_match)
## [[1]]
## [1] "blue" "red"
##
## [[2]]
## [1] "green" "red"
##
## [[3]]
## [1] "orange" "red"
# simplify = TRUE returns a character matrix instead of a list.
more %>% str_extract_all(colour_match, simplify = TRUE)
## [,1] [,2]
## [1,] "blue" "red"
## [2,] "green" "red"
## [3,] "orange" "red"
# Rows with fewer matches are padded with "" (see the output of this call).
x <- c("a", "a b", "a b c")
x %>% str_extract_all("[a-z]", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "a" "" ""
## [2,] "a" "b" ""
## [3,] "a" "b" "c"
#14.4.2.1 Exercises
# First word of each sentence, letters only.
str_extract(sentences, "[A-Za-z]+") %>% head()
## [1] "The" "Glue" "It" "These" "Rice" "The"
# Same, but allow apostrophes after the first letter so "It's" stays whole.
str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()
## [1] "The" "Glue" "It's" "These" "Rice" "The"
# All distinct words ending in "ing"; \\b marks the word boundaries.
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>%
  head()
## [1] "spring" "evening" "morning" "winding" "living" "king"
# Rough heuristic for plurals: words of more than three letters ending in "s".
# It also picks up other words that end in "s", e.g. "makes" in the output.
unique(unlist(str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b"))) %>%
  head()
## [1] "planks" "days" "bowls" "lemons" "makes" "hogs"
#Sections: Grouped Matches; Exercises: 1, 2
# Heuristic for nouns: the word that follows "a" or "the". The two pairs of
# parentheses capture the article and the following word separately.
noun <- "(a|the) ([^ ]+)"
has_noun <- head(str_subset(sentences, noun), 10)
# str_extract() gives the complete match ...
str_extract(has_noun, noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
# ... str_match() adds one column per capture group.
str_match(has_noun, noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"
# Same idea for data in a tibble: tidyr::extract() writes each capture group
# into its own named column; remove = FALSE keeps the `sentence` column.
tidyr::extract(
  tibble(sentence = sentences),
  col = sentence,
  into = c("article", "noun"),
  regex = "(a|the) ([^ ]+)",
  remove = FALSE
)
#14.4.3.1 Exercises
# A number word ("one" ... "ten") followed by at least one space and the next
# word; the leading \\b stops matches that start in the middle of a word.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
sentences %>%
  str_subset(numword) %>%
  str_extract(numword)
## [1] "seven books" "two met" "two factors" "three lists"
## [5] "seven is" "two when" "ten inches" "one war"
## [9] "one button" "six minutes" "ten years" "two shares"
## [13] "two distinct" "five cents" "two pins" "five robins"
## [17] "four kinds" "three story" "three inches" "six comes"
## [21] "three batches" "two leaves"
#Sections: Replacing Matches; Exercises: 1, 2
x <- c("apple", "pear", "banana")
# str_replace() changes the first match only, str_replace_all() every match.
x %>% str_replace(pattern = "[aeiou]", replacement = "-")
## [1] "-pple" "p-ar" "b-nana"
x %>% str_replace_all(pattern = "[aeiou]", replacement = "-")
## [1] "-ppl-" "p--r" "b-n-n-"
# A named vector performs several replacements in one call.
x <- c("1 house", "2 cars", "3 people")
x %>% str_replace_all(c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house" "two cars" "three people"
# Backreferences in the replacement: swap the second and third words.
head(
  str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"),
  5
)
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."
#14.4.4.1 Exercises
# Replace every forward slash with a backslash. The printed result shows each
# backslash escaped, i.e. as "\\".
"past/present/future" %>%
  str_replace_all("/", "\\\\")
## [1] "past\\present\\future"
# A simple str_to_lower(): a named vector mapping each upper-case letter
# (name) to its lower-case counterpart (value), used as the pattern.
replacements <- setNames(letters, LETTERS)
lower_words <- words %>% str_replace_all(pattern = replacements)
lower_words %>% head()
## [1] "a" "able" "about" "absolute" "accept" "account"
#Sections: Splitting; Exercises: 1,2, 3
# Split the first five sentences into words; the result is a list because
# each sentence may produce a different number of pieces.
str_split(head(sentences, 5), " ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
# For a single string, take the first list element to get a plain vector.
str_split("a|b|c|d", "\\|")[[1]]
## [1] "a" "b" "c" "d"
# simplify = TRUE returns a character matrix instead of a list.
str_split(head(sentences, 5), " ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background."
## [3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a"
## [4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare"
## [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ""
## [,9]
## [1,] ""
## [2,] ""
## [3,] "well."
## [4,] "dish."
## [5,] ""
# `n` caps the number of pieces each string is split into.
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)
## [,1] [,2]
## [1,] "Name" "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age" "35"
# Two spaces separate the sentences on purpose: splitting on a single space
# then yields the empty string shown in the output below, which is what the
# comparison with boundary("word") is meant to illustrate.
x <- "This is a sentence.  This is another sentence."
str_view_all(x, boundary("word"))
str_split(x, " ")[[1]]
## [1] "This" "is" "a" "sentence." "" "This"
## [7] "is" "another" "sentence."
# Splitting on word boundaries drops the punctuation and the empty piece.
str_split(x, boundary("word"))[[1]]
## [1] "This" "is" "a" "sentence" "This" "is" "another"
## [8] "sentence"
#14.4.5.1 Exercises
# Split a list such as "apples, pears, and bananas" into its items: the
# separator is a comma, spaces and an optional "and".
x <- "apples, pears, and bananas"
x %>%
  str_split(", +(and +)?") %>%
  .[[1]]
## [1] "apples" "pears" "bananas"
# Splitting on " " keeps the punctuation attached to the words ...
sentence <- "The quick (“brown”) fox can’t jump 32.3 feet, right?"
sentence %>% str_split(" ")
## [[1]]
## [1] "The" "quick" "(“brown”)" "fox" "can’t" "jump"
## [7] "32.3" "feet," "right?"
# ... whereas boundary("word") removes it.
sentence %>% str_split(boundary("word"))
## [[1]]
## [1] "The" "quick" "brown" "fox" "can’t" "jump" "32.3" "feet" "right"
#Sections: Find Matches; Exercises: 1
# The regular call:
fruit %>% str_view("nana")
# Is shorthand for
fruit %>% str_view(regex("nana"))
bananas <- c("banana", "Banana", "BANANA")
bananas %>% str_view("banana")
# ignore_case = TRUE lets the pattern match upper- and lower-case forms.
bananas %>% str_view(regex("banana", ignore_case = TRUE))
x <- "Line 1\nLine 2\nLine 3"
# Without modifiers "^" matches only at the start of the whole string ...
x %>%
  str_extract_all("^Line") %>%
  .[[1]]
## [1] "Line"
# ... with multiline = TRUE it matches at the start of every line.
x %>%
  str_extract_all(regex("^Line", multiline = TRUE)) %>%
  .[[1]]
## [1] "Line" "Line" "Line"
# Phone-number pattern written with comments = TRUE, which lets the pattern
# carry whitespace and "#" comments. The final group takes four digits so the
# complete number is matched ("514-791-8141") rather than being cut off after
# the third digit of the last block.
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # final four numbers
  ", comments = TRUE)
# Column 1 is the complete match; columns 2-4 are the three capture groups.
str_match("514-791-8141", phone)
## [,1] [,2] [,3] [,4]
## [1,] "514-791-8141" "514" "791" "8141"
#install.packages("microbenchmark")
# Time two ways of detecting "the" in `sentences`: a literal comparison via
# fixed() and the default regular-expression matching. Each named expression
# is evaluated 20 times (times = 20).
microbenchmark::microbenchmark(
fixed = str_detect(sentences, fixed("the")),
regex = str_detect(sentences, "the"),
times = 20
)
# Two ways of writing the same accented letter: one precomposed code point
# versus "a" followed by a combining accent.
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
## [1] "á" "a´"
a1 == a2
## [1] FALSE
# fixed() does not treat the two forms as equal, coll() does (see outputs).
a1 %>% str_detect(fixed(a2))
## [1] FALSE
a1 %>% str_detect(coll(a2))
## [1] TRUE
# Case-insensitive matching with coll() can be given a locale.
i <- c("I", "İ", "i", "ı")
i
## [1] "I" "I" "i" "i"
i %>% str_subset(coll("i", ignore_case = TRUE))
## [1] "I" "I" "i" "i"
i %>% str_subset(coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "i" "i"
# Show the locale settings stringi reports for this session.
stringi::stri_locale_info()
## $Language
## [1] "en"
##
## $Country
## [1] "US"
##
## $Variant
## [1] ""
##
## $Name
## [1] "en_US"
# boundary("word") matches word by word, so the extracted pieces carry no
# punctuation (see the output below).
x <- "This is a sentence."
x %>% str_view_all(boundary("word"))
x %>% str_extract_all(boundary("word"))
## [[1]]
## [1] "This" "is" "a" "sentence"
#14.5.1 Exercises
# Find strings containing a backslash. As a regular expression the backslash
# has to be escaped twice (four backslashes in the R source) ...
c("a\\b", "ab") %>% str_subset("\\\\")
## [1] "a\\b"
# ... with fixed() only the R string escape is needed.
c("a\\b", "ab") %>% str_subset(fixed("\\"))
## [1] "a\\b"