library(tidyverse)
#### Section 14.2.5 - Introduction, Prerequisites, String Basics, String Length, Combining Strings, Subsetting Strings, Locales
### Exercise 1: In code that doesn’t use stringr, you’ll often see paste() and paste0(). What’s the difference between the two functions? What stringr function are they equivalent to? How do the functions differ in their handling of NA?
# Side-by-side calls contrasting the default separator and the NA handling of paste(), paste0() and str_c().
paste("foo", "bar") # paste() joins its arguments with a single space by default
paste0("foo", "bar") # paste0() joins its arguments with no separator
str_c("foo", "bar") # str_c() also uses no separator by default, so it behaves like paste0()
str_c("foo", NA) # str_c() propagates NA: if any argument is a missing value, the result is a missing value
paste("foo", NA) # paste() converts NA to the string "NA" and then joins it like any other text
paste0("foo", NA) # paste0() treats NA the same way as paste(), only without the separating space
### Exercise 3: Use str_length() and str_sub() to extract the middle character from a string. What will you do if the string has an even number of characters?
# See the code below, which extracts the middle character. If the string has an even number of characters, it selects the first of the two middle characters; the position ⌈n/2⌉ also works when the string has a length of one.
# Sample inputs: a single character plus strings of odd and even length.
x <- c("a", "abc", "abcd", "abcde", "abcdef")
# Number of characters in each string.
L <- x %>% str_length()
# Half the length, rounded up, is the middle position; for an even length this
# is the first of the two central characters.
m <- ceiling(L / 2)
# Using the same start and end position returns exactly one character.
x %>% str_sub(start = m, end = m)
### Exercise 4: What does str_wrap() do? When might you want to use it?
# str_wrap() wraps text so that it fits within a certain width; may use it to wrap long strings of text to be typeset or printed.
#### Section 14.3.1.1 - Matching patterns with regular expressions, Basic Matches
### Exercise 1: Explain why each of these strings don’t match a \: "\", "\\", "\\\".
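# Backslash is the escape character in both R strings and regular expressions. "\" escapes the closing quote, so the string is never terminated. "\\" is the one-character string \, which the regex engine reads as an escape with nothing after it. "\\\" is an escaped backslash followed by a backslash that again escapes the closing quote. To match a literal backslash the regular expression must be \\, which has to be written as the four-backslash string "\\\\".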
### Exercise 2: How would you match the sequence "'\? # must use the four back-slashes within ""
# The subject string holds the three characters " ' \ and the pattern repeats
# them, with the backslash doubled again so the regex sees an escaped backslash.
str_view(string = "\"'\\", pattern = "\"'\\\\", match = TRUE)
### Exercise 3: What patterns will the regular expression \..\..\.. match? How would you represent it as a string?
# matches any patterns that are a dot followed by any character, repeated three times - see below
# The regex \..\..\.. written as an R string; it needs three "literal dot +
# any character" pairs, which only the first test string provides.
str_view(
  string = c(".a.b.c", ".a.b", "....."),
  pattern = "\\..\\..\\..",
  match = TRUE
)
#### Section 14.3.3.1 - Character classes and alternatives
### Exercise 1: Create regular expressions to find all words that:
# Start with a vowel.
str_subset(stringr::words, "^[aeiou]")
# That only contain consonants. (Hint: thinking about matching “not”-vowels.)
# negate = TRUE keeps the words in which no vowel is found anywhere.
str_subset(stringr::words, "[aeiou]", negate = TRUE)
# End with ed, but not with eed.
# (^|[^e]) lets "ed" be preceded either by the start of the word or by any
# character other than "e"; a bare [^e] would require a preceding character
# and so could never match the two-letter word "ed" itself.
str_subset(stringr::words, "(^|[^e])ed$")
# End with ing or ise.
# Both endings share the leading "i", so only the tail is alternated.
str_subset(stringr::words, "i(ng|se)$")
### Exercise 4: Write a regular expression that matches a word if it’s probably written in British English, not American English.
# Not exact, but demonstrative. The alternatives are: ends in "our" preceded by
# a non-vowel (with at least one character before that), ends in "yse",
# contains "ae" anywhere, or ends in "ise".
words %>% str_subset(".+[^aeiou]our$|yse$|ae|ise$")
#### Section 14.3.4.1 - Repetition
### Exercise 2: Describe in words what these regular expressions match: (read carefully to see if I’m using a regular expression or a string that defines a regular expression.)
#^.*$: matches zero or more characters from the start to the end of the string, i.e. any single-line string, including the empty string
#"\\{.+\\}": the quotes only delimit the string; it defines the regular expression \{.+\}, which matches an open curly brace, one or more characters (including spaces), and a close curly brace - i.e. any non-empty text wrapped in curly braces
#\d{4}-\d{2}-\d{2}: matches 4 digits, hyphen, 2 digits, hyphen, 2 digits, will match dates in format yyyy-mm-dd
#"\\\\{4}": matches four back-slashes
### Exercise 3: Create regular expressions to find all words that:
# Start with three consonants: three non-vowel characters immediately after
# the start-of-string anchor ([^aeiou] accepts any character that is not a
# lowercase vowel).
words %>% str_view("^[^aeiou]{3}", match = TRUE)
# Have three or more vowels in a row.
words %>% str_view("[aeiou]{3,}", match = TRUE)
# Have two or more vowel-consonant pairs in a row: the group is one vowel
# followed by one non-vowel, repeated at least twice.
words %>% str_view("([aeiou][^aeiou]){2,}", match = TRUE)
#### Section 14.3.5.1 - Grouping and Back references
### Exercise 1: Describe, in words, what these expressions will match:
# (.)\1\1: The same character appearing three times in a row, example "aaa"
# "(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order, example "abba"
# (..)\1: Any two characters repeated, example "a1a1"
# "(.).\\1.\\1": A character followed by any character, the original character, any other character, the original character again, example "abaca", "b8b.b"
# "(.)(.)(.).*\\3\\2\\1": Three characters followed by zero or more characters of any kind, followed by the same three characters in reverse order, example "abcsgasgddsadgsdgcba" or "abccba" or "abc1cba"
#### Section 14.4.1.1 - Tools, Detect Matches
### Exercise 1: For each of the following challenges, try solving it by using both a single regular expression, and a combination of multiple str_detect() calls.
# Find all words that start or end with x
# Single regular expression: the alternation covers both anchors at once.
str_subset(words, "^x|x$")
# Combination: one logical vector per condition, merged element-wise with |.
strtx <- words %>% str_detect("^x")
endx <- words %>% str_detect("x$")
words[strtx | endx]
#### Section 14.4.2.1 - Extract matches
### Exercise 2: From the Harvard sentences data, extract:
# The first word from each sentence.
# First attempt: the leading run of ASCII letters. This stops at an
# apostrophe, so a contraction such as "It's" is cut short.
str_extract(sentences, "[A-Za-z]+") %>% head()
# Refinement: the match must begin with a letter, after which letters and
# apostrophes are both accepted, so the apostrophe in "It's" is kept.
str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()
# All words ending in ing.
# A whole word (delimited by word boundaries) made of letters and ending in "ing".
pattern <- "\\b[A-Za-z]+ing\\b"
# Flag the sentences that contain at least one such word.
sentences_with_ing <- str_detect(sentences, pattern)
# Collect every match from the flagged sentences, flatten the per-sentence
# list, drop duplicates, and show the first few.
sentences[sentences_with_ing] %>%
  str_extract_all(pattern) %>%
  unlist() %>%
  unique() %>%
  head()
# All plurals.
# Approximation: a trailing "s" is used to denote a plural, and at least three
# letters are required before it.
sentences %>%
  str_extract_all("\\b[A-Za-z]{3,}s\\b") %>%
  unlist() %>%
  unique() %>%
  head()
#### Section 14.4.3.1 - Grouped Matches
### Exercise 1: Find all words that come after a “number” like “one”, “two”, “three” etc. Pull out both the number and the word.
# A number word at a word boundary, one or more spaces, then the next word.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
# Keep only the sentences that contain such a pair, then pull out the matched
# "number word" text from each of them.
sentences %>%
  str_subset(numword) %>%
  str_extract(numword)
### Exercise 2: Find all contractions. Separate out the pieces before and after the apostrophe.
# Letters, an apostrophe, then more letters.
contraction <- "([A-Za-z]+)'([A-Za-z]+)"
# Keep the sentences containing a contraction, extract the first contraction
# from each, and split it at the apostrophe into its two pieces.
sentences %>%
  str_subset(contraction) %>%
  str_extract(contraction) %>%
  str_split("'")
#### Section 14.4.4.1 - Replacing Matches
### Exercise 1: Replace all forward slashes in a string with backslashes.
# The replacement is written with four backslashes: the R string escaping
# reduces them to two, and the replacement syntax reduces those to one
# literal backslash.
"past/present/future" %>%
  str_replace_all(pattern = "/", replacement = "\\\\")
### Exercise 2: Implement a simple version of str_to_lower() using replace_all().
# Named character vector mapping every uppercase ASCII letter (the names) to
# its lowercase counterpart (the values), i.e. "A" = "a" through "Z" = "z".
replacements <- setNames(letters, LETTERS)
# A named vector used as the pattern replaces each name with its value, so
# every uppercase letter is swapped for its lowercase form.
lower_words <- words %>%
  str_replace_all(pattern = replacements)
# Show the first few converted words.
lower_words %>% head()
#### Section 14.4.5.1 - Splitting; Exercises
### Exercise 1: Split up a string like "apples, pears, and bananas" into individual components.
# The string to break into its individual items.
x <- "apples, pears, and bananas"
# Split on a comma followed by spaces, optionally swallowing a following
# "and "; the input is a single string, so take the first list element.
str_split(string = x, pattern = ", +(and +)?")[[1]]
### Exercise 2: Why is it better to split up by boundary("word") than " "?
# Splitting on boundary("word") breaks the string at word boundaries, so punctuation and runs of whitespace are recognised and dropped while non-letter characters inside words are kept; splitting on " " leaves punctuation attached to the words and returns empty elements wherever there are extra spaces.
### Exercise 3: What does splitting with an empty string ("") do? Experiment, and then read the documentation.
# An empty pattern splits the string into its separate characters.
"Learning R is tougher than I..." %>%
  str_split(pattern = "")
#### Section 14.5.1 - Other types of pattern; Exercises
### Exercise 1: How would you find all strings containing \ with regex() vs. with fixed()?
# regex(): the backslash must be escaped for the regex engine and again for
# the R string, giving four backslashes in the source.
str_subset(c("a\\b", "ab"), regex("\\\\"))
#> [1] "a\\b"
# fixed(): the pattern is compared literally, so only the R string escaping
# is needed, giving two backslashes in the source.
str_subset(c("a\\b", "ab"), fixed("\\"))
#> [1] "a\\b"