library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.0.6     ✓ dplyr   1.0.4
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)

14.2.5 Exercises

  1. In code that doesn’t use stringr, you’ll often see paste() and paste0(). What’s the difference between the two functions? What stringr function are they equivalent to? How do the functions differ in their handling of NA?

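paste() separates the strings with a space (" ") by default, whereas paste0() uses no separator (""). The equivalent stringr function is str_c(), whose default separator is also the empty string, so it behaves like paste0(); stri_paste(), stri_join(), and stri_flatten() are related functions from the stringi package. In paste() and paste0(), missing values (NA) are treated as the string "NA", while str_c() propagates them: if any argument is NA, the result is NA.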

  1. In your own words, describe the difference between the sep and collapse arguments to str_c().

The sep argument joins elements from multiple character vectors, while the collapse argument joins elements in a character vector.

  1. Use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters?
# Extract the middle character of a string.
# ceiling(n / 2) is the exact middle for odd lengths (e.g. 2 for n = 3).
# For even lengths there are two middle characters; this picks the left one.
x <- "Apples"
mid <- ceiling(str_length(x) / 2)
str_sub(x, mid, mid)
## [1] "p"
  1. What does str_wrap() do? When might you want to use it?

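str_wrap() wraps text into lines of at most a given width, inserting line breaks between words. This could be useful for reformatting long text so that it fits a fixed-width display.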

  1. What does str_trim() do? What’s the opposite of str_trim()?

str_trim() trims whitespace from the beginning and/or the end of a string. The opposite, str_pad(), adds whitespace.

  1. Write a function that turns (e.g.) a vector c(“a”, “b”, “c”) into the string a, b, and c. Think carefully about what it should do if given a vector of length 0, 1, or 2.

14.3.1.1 Exercises

  1. Explain why each of these strings doesn’t match a \: "\", "\\", "\\\".

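"\" does not work because the backslash escapes the closing quote, so the string is never terminated. "\\" is a string holding a single backslash, which the regex engine reads as the start of an escape with nothing after it. "\\\" fails because the first two backslashes produce one literal backslash and the third escapes the closing quote, again leaving the string unterminated. Even though the backslash is escaped in the regexp, the regexp itself has to be written as a string, and the string requires its own layer of escaping, so four backslashes ("\\\\") are needed.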

  1. How would you match the sequence "'\?
# The target sequence is: double quote, apostrophe, backslash.
# In the string, \" is a literal double quote and \\ is a literal backslash.
x <- "\"'\\"
# Only the backslash is special in the regex, so it is escaped once for the
# regex and once more for the R string, which gives four backslashes.
str_view(x, "\"'\\\\")
  1. What patterns will the regular expression \..\..\.. match? How would you represent it as a string?
# The regex is a literal dot followed by any character, three times over.
# Written as an R string, every backslash has to be doubled.
x <- "\\..\\..\\.."
# Show the regex itself, i.e. the string with its escapes resolved.
writeLines(text = x, sep = "\n")
## \..\..\..

14.3.2 Exercises

  1. How would you match the literal string "$^$"?
# A test string made of the literal characters "$^$" (quotes included).
x <- '"$^$"'

# $ and ^ are regex anchors, so each one is escaped with a backslash (doubled
# inside the R string) to be matched literally.
str_view(x, '"\\$\\^\\$"')
  1. Given the corpus of common words in stringr::words, create regular expressions that find all words that:

Start with “y”. End with “x” Are exactly three letters long. (Don’t cheat by using str_length()!) Have seven letters or more.

Since this list is long, you might want to use the match argument to str_view() to show only the matching or non-matching words.

words <- stringr::words
# Start with "y".
str_view(words, "^y", match = TRUE)
# End with "x".
str_view(words, "x$", match = TRUE)
# Exactly three letters long.
str_view(words, "^...$", match = TRUE)
# Seven letters or more: {7,} means "at least seven", whereas seven dots
# between ^ and $ would only match words of exactly seven letters.
str_view(words, "^.{7,}$", match = TRUE)

14.3.3.1 Exercises

  1. Create regular expressions to find all words that:

Start with a vowel. That only contain consonants. (Hint: thinking about matching “not”-vowels.) End with ed, but not with eed. End with ing or ise.

# Start with a vowel.
str_view(words, "^[aeiou]", match = TRUE)
# Contain only consonants: the + is required, otherwise the pattern only
# matches one-letter words. Words such as "by" or "why" match.
str_view(words, "^[^aeiou]+$", match = TRUE)
# End with "ed" but not "eed"; (^|[^e]) also accepts "ed" at the very start.
str_view(words, "(^|[^e])ed$", match = TRUE)
# End with "ing" or "ise".
str_view(words, "(ing|ise)$", match = TRUE)
  1. Empirically verify the rule “i before e except after c”.
# Words that follow the rule: "cei", or "ie" not preceded by "c".
str_view(words, "(cei|[^c]ie)", match = TRUE)
# Words that break the rule: "cie", or "ei" not preceded by "c".
# Both lists are needed to judge the rule empirically.
str_view(words, "(cie|[^c]ei)", match = TRUE)
  1. Is “q” always followed by a “u”?
# Look for counter-examples: a "q" followed by something other than "u", or a
# "q" at the very end of a word (which "q[^u]" alone would miss).
str_view(words, "q([^u]|$)", match = TRUE)
  1. Write a regular expression that matches a word if it’s probably written in British English, not American English.
# Show only the words that contain the British spelling "colour".
str_view(string = words, pattern = "colour", match = TRUE)
  1. Create a regular expression that will match telephone numbers as commonly written in your country.

14.3.4.1 Exercises

  1. Describe the equivalents of ?, +, * in {m,n} form.

? is {0,1}, + is {1,}, and * is {0,}.

  1. Describe in words what these regular expressions match: (read carefully to see if I’m using a regular expression or a string that defines a regular expression.)

^.*$ # This matches any string

"\\{.+\\}" # This string defines the regex \{.+\}, which matches a pair of curly braces with at least one character between them.

\d{4}-\d{2}-\d{2} # This regex matches 4 digits, a dash, 2 digits, a dash, then 2 digits.

"\\\\{4}" # This string defines the regex \\{4}, which matches four backslashes in a row.

  1. Create regular expressions to find all words that: Start with three consonants. Have three or more vowels in a row. *Have two or more vowel-consonant pairs in a row.
# Start with three consonants.
str_view(string = words, pattern = "^[^aeiou]{3}", match = TRUE)
# Have three vowels in a row somewhere in the word.
str_view(string = words, pattern = "[aeiou]{3}", match = TRUE)
# Have two or more vowel-consonant pairs in a row.
str_view(string = words, pattern = "([aeiou][^aeiou]){2,}", match = TRUE)

14.3.5 Exercises

Describe, in words, what these expressions will match:

(.)\1\1 # The same character three times in a row, e.g. "aaa".

"(.)(.)\\2\\1" # Two characters followed by the same two characters in reverse order, e.g. "abba".

(..)\1 # Any two characters repeated immediately, e.g. "a1a1".

"(.).\\1.\\1" # A character, then any character, then the first character again, then any character, then the first character once more, e.g. "abaca".

"(.)(.)(.).*\\3\\2\\1" # Three characters, then zero or more characters of any kind, then the same three characters in reverse order, e.g. "abc-cba".

  1. Construct regular expressions to match words that: Start and end with the same character. Contain a repeated pair of letters (e.g. “church” contains “ch” repeated twice.) *Contain one letter repeated in at least three places (e.g. “eleven” contains three “e”s.)
# Start and end with the same character.
same_first_and_last <- "^(.).*\\1$"
str_view(words, same_first_and_last, match = TRUE)

# Contain a pair of letters that shows up again later in the word.
repeated_pair <- "(..).*\\1"
str_view(words, repeated_pair, match = TRUE)

# Contain one letter in at least three places.
letter_three_times <- "(.).*\\1.*\\1"
str_view(words, letter_three_times, match = TRUE)

14.4.1 Exercises

  1. For each of the following challenges, try solving it by using both a single regular expression, and a combination of multiple str_detect() calls.

Find all words that start or end with x. Find all words that start with a vowel and end with a consonant. *Are there any words that contain at least one of each different vowel?

# Single regular expression: "x" at the start or at the end of the word.
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
# The same result from two str_detect() calls combined with |.
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
## [1] "box" "sex" "six" "tax"
# Single regular expression: a vowel first and a non-vowel last.
str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()

# The same condition as two str_detect() calls combined with &.
start_with_vowel <- str_detect(words, "^[aeiou]")
end_with_consonant <- str_detect(words, "[^aeiou]$")
words[start_with_vowel & end_with_consonant] %>% head()
# A word qualifies only when each of the five vowels is detected in it.
contains_all_vowels <-
  str_detect(words, "a") &
  str_detect(words, "e") &
  str_detect(words, "i") &
  str_detect(words, "o") &
  str_detect(words, "u")
words[contains_all_vowels]
## character(0)
  1. What word has the highest number of vowels? What word has the highest proportion of vowels? (Hint: what is the denominator?)
# Highest number of vowels: count the vowels per word and take the first
# maximum (which.max() returns the first index on ties).
vowel_count <- str_count(words, "[aeiou]")
hnv <- which.max(vowel_count)
words[hnv]
## [1] "appropriate"
# Highest proportion of vowels: the denominator is the word length.
hpv <- which.max(vowel_count / str_length(words))
words[hpv]
## [1] "a"

14.4.2.1 Exercises

  1. In the previous example, you might have noticed that the regular expression matched “flickered”, which is not a colour. Modify the regex to fix the problem.

  2. From the Harvard sentences data, extract:

The first word from each sentence. All words ending in ing. *All plurals.

# First word of each sentence: the first run of letters. The upper-case range
# must run to Z; "A-X" would skip words starting with Y or Z.
str_extract(sentences, "[A-Za-z]+") %>% head()
## [1] "The"   "Glue"  "It"    "These" "Rice"  "The"
# Whole words ending in "ing". The class needs the range A-Z; "[AZa-z]" only
# allows the two capitals A and Z, so capitalised words would be missed.
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
sentences[sentences_with_ing] %>%
  str_extract_all(pattern) %>%
  unlist() %>%
  unique() %>%
  head()
## [1] "spring"  "evening" "morning" "winding" "living"  "king"
# Rough plural finder: whole words of at least three letters followed by "s".
sentences %>%
  str_extract_all("\\b[A-Za-z]{3,}s\\b") %>%
  unlist() %>%
  unique() %>%
  head()
## [1] "planks" "days"   "bowls"  "lemons" "makes"  "hogs"

14.4.3.1 Exercises

  1. Find all words that come after a “number” like “one”, “two”, “three” etc. Pull out both the number and the word.
# A number word followed by the next word.
# \\b makes the number start at a word boundary, so it can no longer match
# inside a longer word (e.g. the "ten" in "often"); \\w+ keeps trailing
# punctuation out of the captured word.
# NOTE: the output recorded below this chunk was produced with the earlier
# pattern (no \\b, \\S+ for the word); re-run the chunk to refresh it.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
sentences[str_detect(sentences, numword)] %>%
  str_extract(numword)
##  [1] "ten served"    "one over"      "seven books"   "two met"      
##  [5] "two factors"   "one and"       "three lists"   "seven is"     
##  [9] "two when"      "one floor."    "ten inches."   "one with"     
## [13] "one war"       "one button"    "six minutes."  "ten years"    
## [17] "one in"        "ten chased"    "one like"      "two shares"   
## [21] "two distinct"  "one costs"     "ten two"       "five robins." 
## [25] "four kinds"    "one rang"      "ten him."      "three story"  
## [29] "ten by"        "one wall."     "three inches"  "ten your"     
## [33] "six comes"     "one before"    "three batches" "two leaves."
  1. Find all contractions. Separate out the pieces before and after the apostrophe.
# Letters, an apostrophe, then more letters; the two groups capture the
# pieces before and after the apostrophe.
contraction <- "([A-Za-z]+)'([A-Za-z]+)"

with_contraction <- str_subset(sentences, contraction)

# Separate the pieces: columns 2 and 3 of str_match() hold the two groups
# (column 1 is the full match).
contraction_parts <- str_match(with_contraction, contraction)[, 2:3]
head(contraction_parts)

# The full contractions, first match per sentence.
str_extract(with_contraction, contraction)
##  [1] "It's"       "man's"      "don't"      "store's"    "workmen's" 
##  [6] "Let's"      "sun's"      "child's"    "king's"     "It's"      
## [11] "don't"      "queen's"    "don't"      "pirate's"   "neighbor's"

14.4.4.1 Exercises

  1. Replace all forward slashes in a string with backslashes.
# Replace every forward slash with a backslash.
# The replacement "\\\\" is one literal backslash: it is escaped once for the
# replacement syntax and once more for the R string.
x <- "past/present/future"
str_replace_all(x, "/", "\\\\")
## [1] "past\\present\\future"
  1. Implement a simple version of str_to_lower() using replace_all().
# Words that contain at least one upper-case letter.
simple <- str_subset(words, "[A-Z]")
simple
## [1] "Christ"    "Christmas"
# Simple str_to_lower(): a named vector tells str_replace_all() to replace
# each name (upper-case letter) with its value (the lower-case letter).
str_replace_all(simple, setNames(letters, LETTERS))
## [1] "christ"    "christmas"
  1. Switch the first and last letters in words. Which of those strings are still words?
# Swap the first and last letters: group 1 is the first letter, group 2 the
# middle, group 3 the last letter; the replacement writes them back as 3-2-1.
# One-letter words do not match and are left unchanged.
switch <- str_replace_all(words, "^([A-Za-z])(.*)([A-Za-z])$", "\\3\\2\\1")
head(switch)
## [1] "a"        "ebla"     "tboua"    "ebsoluta" "tccepa"   "tccouna"
# The swapped strings that are still words are those also found in `words`.
intersect(switch, words)