library(tidyverse)
## -- Attaching packages -------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#Sections: Introduction, Prerequisites, String Basics, String Length, Combining Strings, Subsetting Strings, Locales; Exercises
#14.2 String basics
# A string literal can be delimited by double quotes or by single quotes.
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'

# A quote character needs no escape when the other delimiter is used.
double_quote <- '"' # or "\""
single_quote <- "'" # or '\''

# Printing a character vector shows the escaped representation ...
x <- c('"', "\\")
x
## [1] "\"" "\\"
# ... while writeLines() shows the raw contents of each string.
writeLines(x)
## "
## \

# Non-ASCII characters can be written with a \u escape.
x <- "\u00b5"
x
## [1] "µ"

# Several strings are stored together in a character vector.
c("one", "two", "three")
## [1] "one"   "two"   "three"
#14.2.1 String length
# str_length() gives the number of characters per string; NA stays NA.
str_length(c("a", "R for data science", NA_character_))
## [1]  1 18 NA

# 14.2.2 Combining strings
# str_c() joins its arguments into one string.
str_c("x", "y")
## [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
# `sep` is the text placed between the pieces.
str_c("x", "y", sep = ", ")
## [1] "x, y"

# A missing value makes the whole result missing ...
x <- c("abc", NA_character_)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
# ... unless str_replace_na() first turns it into the string "NA".
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"

# Length-one arguments are recycled along the longer vector.
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"

# An `if` without `else` yields NULL when the condition is FALSE, and the
# result below shows that such an argument simply contributes nothing.
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
  "Good ",
  time_of_day,
  " ",
  name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
## [1] "Good morning Hadley."

# `collapse` squashes a character vector into a single string.
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
#14.2.3 Subsetting strings

x <- c("Apple", "Banana", "Pear")
# Characters 1 through 3 of every element (both ends inclusive).
str_sub(x, start = 1, end = 3)
## [1] "App" "Ban" "Pea"
# Negative positions count backwards from the end of the string.
str_sub(x, start = -3, end = -1)
## [1] "ple" "ana" "ear"
# A range that runs past the end of the string returns what is available.
str_sub("a", start = 1, end = 5)
## [1] "a"
# The assignment form overwrites the selected range: lower-case the first letter.
str_sub(x, start = 1, end = 1) <- str_to_lower(str_sub(x, start = 1, end = 1))
x
## [1] "apple"  "banana" "pear"
#14.2.4 Locales
# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them.
# The dotless i (U+0131) is written as a \u escape, like the other non-ASCII
# literals in this file, so the input does not depend on the encoding the
# source file is read with.
str_to_upper(c("i", "\u0131"))
## [1] "I" "I"
# With the Turkish locale, "i" capitalises to the dotted capital U+0130.
# (A console that cannot display U+0130 may print it differently.)
str_to_upper(c("i", "\u0131"), locale = "tr")
## [1] "İ" "I"

# Sort order is locale dependent as well.
x <- c("apple", "eggplant", "banana")

str_sort(x, locale = "en")  # English
## [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") # Hawaiian
## [1] "apple"    "eggplant" "banana"
#14.2.5 Exercises--3
# Extract the middle character of each string. For a string with an even
# number of characters the left one of the two central characters is taken.
x <- c("a", "abc", "abcd", "abcde", "abcdef")
L <- str_length(x)
m <- (L + 1) %/% 2 # same values as ceiling(L / 2)
str_sub(x, start = m, end = m)
## [1] "a" "b" "b" "c" "c"
#2-Sections: Matching patterns with regular expressions, Basic Matches; Exercises: None
#14.3 Matching patterns with regular expressions
# install.packages("htmlwidgets") # presumably needed once so str_view() can render
x <- c("apple", "banana", "pear")
# The simplest pattern is an exact sequence of characters.
str_view(x, pattern = "an")
# "." stands for any single character.
str_view(x, pattern = ".a.")

# A literal dot is the regex \. and, because the backslash is also the
# escape character of R strings, it is typed as "\\." in source code.
dot <- "\\."

# The string itself holds just one backslash:
writeLines(dot)
## \.

# Look for an explicit "." between "a" and "c".
str_view(c("abc", "a.c", "bef"), pattern = "a\\.c")

# A string that contains one literal backslash ...
x <- "a\\b"
writeLines(x)
## a\b

# ... is matched by the regex \\, which is typed as "\\\\" in source code.
str_view(x, pattern = "\\\\")
#Sections: Anchors; Exercises 

x <- c("apple", "banana", "pear")
# "^" anchors the match to the start of the string, "$" to its end.
str_view(x, pattern = "^a")
str_view(x, pattern = "a$")

x <- c("apple pie", "apple", "apple cake")
# Unanchored: "apple" is found anywhere inside the string.
str_view(x, pattern = "apple")
# Anchored at both ends: only the string that is exactly "apple" matches.
str_view(x, pattern = "^apple$")
#Sections: Repetition; Exercises
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
# "?" means zero or one, "+" means one or more.
str_view(x, pattern = "CC?")
str_view(x, pattern = "CC+")
str_view(x, pattern = "C[LX]+")
# "{n}" means exactly n, "{n,}" n or more, "{n,m}" between n and m.
str_view(x, pattern = "C{2}")
str_view(x, pattern = "C{2,}")
str_view(x, pattern = "C{2,3}")
# A "?" after a quantifier makes it lazy: the shortest possible match is taken.
str_view(x, pattern = "C{2,3}?")
str_view(x, pattern = "C[LX]+?")
#14.3.4.1 Exercises
# Words that start with three consonants. "{3}" is an exact count, so a lazy
# "?" after it would have no effect and is left out.
str_view(words, "^[^aeiou]{3}", match = TRUE)
# Words with three or more vowels in a row. The quantifier is greedy so that
# the whole run of vowels is highlighted, not only its first three letters.
str_view(words, "[aeiou]{3,}", match = TRUE)
# Words with two or more vowel-consonant pairs in a row.
str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)
#Sections: Grouping and Back references; Exercises: 1
# "(..)" captures two characters and "\\1" requires the same two characters to
# follow immediately, so this shows the fruit names with a repeated pair.
str_view(string = fruit, pattern = "(..)\\1", match = TRUE)
#14.3.5.1 Exercises
#(.)\1\1: The same character appearing three times in a row. E.g. "aaa"
#"(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order. E.g. "abba".
#(..)\1: Any two characters repeated. E.g. "a1a1".
#"(.).\\1.\\1": A character, followed by any character, then the original character, then any character, then the original character again. E.g. "abaca", "b8b.b".
#"(.)(.)(.).*\\3\\2\\1" Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order. E.g. "abcsgasgddsadgsdgcba" or "abccba" or "abc1cba".
#Sections: Tools, Detect Matches; Exercises
x <- c("apple", "banana", "pear")
# str_detect() returns one logical value per input string.
x %>% str_detect("e")
## [1]  TRUE FALSE  TRUE

# How many common words start with t? (TRUE counts as 1 when summed.)
words %>% str_detect("^t") %>% sum()
## [1] 65
# What proportion of common words end with a vowel?
words %>% str_detect("[aeiou]$") %>% mean()
## [1] 0.2765306

# Two ways to flag the words without any vowel:
# (1) detect a vowel anywhere and negate the result;
no_vowels_1 <- str_detect(words, "[aeiou]", negate = TRUE)
# (2) require that the word consists only of consonants (non-vowels).
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
## [1] TRUE

# Logical subsetting and str_subset() pick the same matching elements.
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
str_subset(words, pattern = "x$")
## [1] "box" "sex" "six" "tax"
# One row per common word, together with its position in `words`.
df <- tibble(word = words, i = seq_along(words))
# Keep only the rows whose word ends in "x".
filter(df, str_detect(word, "x$"))
#> # A tibble: 4 x 2
#>   word      i
#>   <chr> <int>
#> 1 box     108
#> 2 sex     747
#> 3 six     772
#> 4 tax     841
x <- c("apple", "banana", "pear")
# str_count() reports how many matches each string contains.
str_count(x, pattern = "a")
## [1] 1 3 1

# On average, how many vowels per word?
words %>% str_count("[aeiou]") %>% mean()
## [1] 1.991837

# Add per-word vowel and consonant counts to the word table.
mutate(
  df,
  vowels = str_count(word, "[aeiou]"),
  consonants = str_count(word, "[^aeiou]")
)
#> # A tibble: 980 x 4
#>   word         i vowels consonants
#>   <chr>    <int>  <int>      <int>
#> 1 a            1      1          0
#> 2 able         2      2          2
#> 3 about        3      3          2
#> 4 absolute     4      4          4
#> 5 accept       5      2          4
#> 6 account      6      3          4
#> # … with 974 more rows

# Matches do not overlap: "aba" is counted twice in "abababa", not three times.
str_count("abababa", pattern = "aba")
## [1] 2
str_view_all("abababa", pattern = "aba")
#14.4.1.1 Exercises
# Find all words that start or end with x.
str_view(words, "^x|x$", match = TRUE)
# With a single regular expression:
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
# Or by combining two simpler tests:
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
## [1] "box" "sex" "six" "tax"
#Find all words that start with a vowel and end with a consonant.


# str_subset() keeps the matching elements: first letter a vowel, last letter not.
str_subset(words, "^[aieou].*[^aeiou]$")
##   [1] "about"       "accept"      "account"     "across"      "act"        
##   [6] "actual"      "add"         "address"     "admit"       "affect"     
##  [11] "afford"      "after"       "afternoon"   "again"       "against"    
##  [16] "agent"       "air"         "all"         "allow"       "almost"     
##  [21] "along"       "already"     "alright"     "although"    "always"     
##  [26] "amount"      "and"         "another"     "answer"      "any"        
##  [31] "apart"       "apparent"    "appear"      "apply"       "appoint"    
##  [36] "approach"    "arm"         "around"      "art"         "as"         
##  [41] "ask"         "at"          "attend"      "authority"   "away"       
##  [46] "awful"       "each"        "early"       "east"        "easy"       
##  [51] "eat"         "economy"     "effect"      "egg"         "eight"      
##  [56] "either"      "elect"       "electric"    "eleven"      "employ"     
##  [61] "end"         "english"     "enjoy"       "enough"      "enter"      
##  [66] "environment" "equal"       "especial"    "even"        "evening"    
##  [71] "ever"        "every"       "exact"       "except"      "exist"      
##  [76] "expect"      "explain"     "express"     "identify"    "if"         
##  [81] "important"   "in"          "indeed"      "individual"  "industry"   
##  [86] "inform"      "instead"     "interest"    "invest"      "it"         
##  [91] "item"        "obvious"     "occasion"    "odd"         "of"         
##  [96] "off"         "offer"       "often"       "okay"        "old"        
## [101] "on"          "only"        "open"        "opportunity" "or"         
## [106] "order"       "original"    "other"       "ought"       "out"        
## [111] "over"        "own"         "under"       "understand"  "union"      
## [116] "unit"        "university"  "unless"      "until"       "up"         
## [121] "upon"        "usual"
# Words that contain all five vowels: test each vowel separately and combine
# the five logical vectors with `&`.
words[Reduce(
  `&`,
  lapply(c("a", "e", "i", "o", "u"), function(v) str_detect(words, v))
)]
## character(0)
# Word(s) with the highest proportion of vowels.
prop_vowels <- str_count(words, "[aeiou]") / str_length(words)
words[prop_vowels == max(prop_vowels)]
## [1] "a"
#Sections: Extract matches; Exercises: 2
# Size of the `sentences` vector, followed by its first six elements.
length(sentences)
## [1] 720
head(sentences, n = 6)
## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."
# Build a single alternation pattern that matches any of the colour names.
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- colours %>% str_c(collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"

# Sentences that mention a colour, and the first colour found in each.
has_colour <- sentences %>% str_subset(colour_match)
matches <- has_colour %>% str_extract(colour_match)
head(matches)
## [1] "blue" "blue" "red"  "red"  "red"  "blue"

# Sentences that mention two or more colours.
more <- sentences[str_count(sentences, colour_match) >= 2]
str_view_all(more, pattern = colour_match)
# str_extract() returns only the first match of each string ...
str_extract(more, pattern = colour_match)
## [1] "blue"   "green"  "orange"
# ... str_extract_all() returns every match, as a list ...
str_extract_all(more, pattern = colour_match)
## [[1]]
## [1] "blue" "red"
##
## [[2]]
## [1] "green" "red"
##
## [[3]]
## [1] "orange" "red"
# ... or, with simplify = TRUE, as a character matrix.
str_extract_all(more, pattern = colour_match, simplify = TRUE)
##      [,1]     [,2]
## [1,] "blue"   "red"
## [2,] "green"  "red"
## [3,] "orange" "red"
# Rows with fewer matches are padded with "".
x <- c("a", "a b", "a b c")
str_extract_all(x, pattern = "[a-z]", simplify = TRUE)
##      [,1] [,2] [,3]
## [1,] "a"  ""   ""  
## [2,] "a"  "b"  ""  
## [3,] "a"  "b"  "c"
#14.4.2.1 Exercises

# First word of each sentence: the leading run of ASCII letters.
str_extract(sentences, "[A-Za-z]+") %>% head()
## [1] "The"   "Glue"  "It"    "These" "Rice"  "The"
# Same, but apostrophes are allowed after the first letter so "It's" stays whole.
str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()
## [1] "The"   "Glue"  "It's"  "These" "Rice"  "The"
# All distinct words that end in "ing".
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>%
  head()
## [1] "spring"  "evening" "morning" "winding" "living"  "king"
# Words of four or more letters that end in "s" (presumably a rough test for
# plurals; note that it also catches verbs such as "makes").
unique(unlist(str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b"))) %>%
  head()
## [1] "planks" "days"   "bowls"  "lemons" "makes"  "hogs"
#Sections: Grouped Matches; Exercises: 1, 2
# Crude noun finder: "a" or "the", a space, then a run of non-space characters.
# The parentheses create two capture groups (the article and the word).
noun <- "(a|the) ([^ ]+)"

# First ten sentences that contain such a pair.
has_noun <- head(str_subset(sentences, noun), 10)
# str_extract() gives the complete match only ...
str_extract(has_noun, noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
# ... while str_match() adds one column per capture group.
str_match(has_noun, noun)
##       [,1]         [,2]  [,3]
##  [1,] "the smooth" "the" "smooth"
##  [2,] "the sheet"  "the" "sheet"
##  [3,] "the depth"  "the" "depth"
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked"
##  [6,] "the sun"    "the" "sun"
##  [7,] "the huge"   "the" "huge"
##  [8,] "the ball"   "the" "ball"
##  [9,] "the woman"  "the" "woman"
## [10,] "a helps"    "a"   "helps"
# For a tibble, tidyr::extract() puts each group into a column of its own;
# remove = FALSE keeps the original `sentence` column.
tidyr::extract(
  tibble(sentence = sentences),
  col = sentence,
  into = c("article", "noun"),
  regex = noun,
  remove = FALSE
)
#14.4.3.1 Exercises
# A number word ("one" to "ten") followed by one or more spaces and a word.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
# Extract the matched text from the sentences that contain it.
str_extract(str_subset(sentences, numword), numword)
##  [1] "seven books"   "two met"       "two factors"   "three lists"  
##  [5] "seven is"      "two when"      "ten inches"    "one war"      
##  [9] "one button"    "six minutes"   "ten years"     "two shares"   
## [13] "two distinct"  "five cents"    "two pins"      "five robins"  
## [17] "four kinds"    "three story"   "three inches"  "six comes"    
## [21] "three batches" "two leaves"
#Sections: Replacing Matches; Exercises: 1, 2
x <- c("apple", "pear", "banana")
# str_replace() changes only the first match in each string ...
str_replace(x, pattern = "[aeiou]", replacement = "-")
## [1] "-pple"  "p-ar"   "b-nana"
# ... str_replace_all() changes every match.
str_replace_all(x, pattern = "[aeiou]", replacement = "-")
## [1] "-ppl-"  "p--r"   "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
# A named vector performs several pattern = replacement substitutions at once.
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house"    "two cars"     "three people"
# Back-references in the replacement: swap the second and third words.
head(
  str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"),
  5
)
## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."
#14.4.4.1 Exercises
# Replace every forward slash with a backslash (printed escaped as "\\").
str_replace_all("past/present/future", pattern = "/", replacement = "\\\\")
## [1] "past\\present\\future"
# A simple str_to_lower(): map each upper-case ASCII letter to its lower-case
# counterpart. setNames(letters, LETTERS) is c("A" = "a", ..., "Z" = "z").
replacements <- setNames(letters, LETTERS)
lower_words <- str_replace_all(words, pattern = replacements)
head(lower_words)
## [1] "a"        "able"     "about"    "absolute" "accept"   "account"
#Sections: Splitting; Exercises: 1,2, 3
# Split the first five sentences at spaces; the result is a list because
# each sentence can yield a different number of pieces.
str_split(head(sentences, 5), pattern = " ")
## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"
## [6] "dark"        "blue"        "background."
##
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
##
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"
## [8] "rare"    "dish."
##
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
# A length-one input gives a length-one list, so take its first element.
# ("|" is a regex metacharacter and therefore escaped.)
str_split("a|b|c|d", pattern = "\\|")[[1]]
## [1] "a" "b" "c" "d"
# simplify = TRUE returns a character matrix instead of a list.
str_split(head(sentences, 5), pattern = " ", simplify = TRUE)
##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
##      [,9]   
## [1,] ""     
## [2,] ""     
## [3,] "well."
## [4,] "dish."
## [5,] ""
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
# n = 2 limits the result to two pieces per string.
str_split(fields, pattern = ": ", n = 2, simplify = TRUE)
##      [,1]      [,2]
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age"     "35"
x <- "This is a sentence.  This is another sentence."
str_view_all(x, pattern = boundary("word"))
# Splitting at single spaces leaves an empty piece where two spaces meet and
# keeps the full stops attached to the words.
x %>% str_split(" ") %>% .[[1]]
## [1] "This"      "is"        "a"         "sentence." ""          "This"
## [7] "is"        "another"   "sentence."
# Splitting at word boundaries instead:
x %>% str_split(boundary("word")) %>% .[[1]]
## [1] "This"     "is"       "a"        "sentence" "This"     "is"       "another" 
## [8] "sentence"
#14.4.5.1 Exercises
# Split a comma-separated list; an "and " that follows a comma belongs to the
# separator.
x <- "apples, pears, and bananas"
x %>% str_split(", +(and +)?") %>% .[[1]]
## [1] "apples"  "pears"   "bananas"
sentence <- "The quick (“brown”) fox can’t jump 32.3 feet, right?"
# Splitting at " " keeps brackets, quotes and punctuation attached ...
str_split(sentence, pattern = " ")
## [[1]]
## [1] "The"       "quick"     "(“brown”)" "fox"       "can’t"     "jump"
## [7] "32.3"      "feet,"     "right?"
# ... compare with the pieces produced by boundary("word"):
str_split(sentence, pattern = boundary("word"))
## [[1]]
## [1] "The"   "quick" "brown" "fox"   "can’t" "jump"  "32.3"  "feet"  "right"
#Sections: Find Matches; Exercises: 1
# A plain string pattern ...
str_view(fruit, pattern = "nana")
# ... is shorthand for wrapping it in regex().
str_view(fruit, pattern = regex("nana"))

bananas <- c("banana", "Banana", "BANANA")
# First a case-sensitive match, then the same pattern with ignore_case = TRUE.
str_view(bananas, pattern = "banana")
str_view(bananas, pattern = regex("banana", ignore_case = TRUE))

x <- "Line 1\nLine 2\nLine 3"
# Without a modifier "^" matches only at the start of the whole string ...
x %>% str_extract_all("^Line") %>% .[[1]]
## [1] "Line"
# ... with multiline = TRUE it matches at the start of every line.
x %>% str_extract_all(regex("^Line", multiline = TRUE)) %>% .[[1]]
## [1] "Line" "Line" "Line"
# Pattern for a ten-digit phone number written as 3-3-4 digits, with an
# optional "(" before the area code and optional separators between groups.
# comments = TRUE lets the pattern carry whitespace and "#" comments.
# The last group must be four digits; with three, the final digit of the
# number would be dropped from the match.
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # four more numbers
  ", comments = TRUE)

# Column 1 is the complete match, columns 2-4 are the three digit groups.
str_match("514-791-8141", phone)
##      [,1]           [,2]  [,3]  [,4]
## [1,] "514-791-8141" "514" "791" "8141"
# install.packages("microbenchmark") # run once if the package is missing
# Time a fixed() match against the equivalent regex match, 20 runs each.
microbenchmark::microbenchmark(
  fixed = str_detect(sentences, pattern = fixed("the")),
  regex = str_detect(sentences, pattern = "the"),
  times = 20
)
# Two encodings of the same letter: one precomposed code point (U+00E1), or
# "a" followed by the combining acute accent (U+0301).
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
## [1] "á" "a´"
# The two strings are not equal, and fixed() does not match them either ...
a1 == a2
## [1] FALSE
str_detect(a1, pattern = fixed(a2))
## [1] FALSE
# ... whereas coll() treats them as the same.
str_detect(a1, pattern = coll(a2))
## [1] TRUE
# coll() follows locale-specific rules for case-insensitive matching.
# The dotted capital I (U+0130) and the dotless small i (U+0131) are written
# as \u escapes so that the vector holds the intended characters whatever
# encoding the source file is read with.
# (A console that cannot display these characters may print them differently.)
i <- c("I", "\u0130", "i", "\u0131")
i
## [1] "I" "İ" "i" "ı"
# Default locale: only the two ASCII letters match "i".
str_subset(i, coll("i", ignore_case = TRUE))
## [1] "I" "i"
# Turkish locale: the upper-case partner of "i" is the dotted capital.
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "İ" "i"
# Details of the default locale (locale = NULL selects the default).
stringi::stri_locale_info(locale = NULL)
## $Language
## [1] "en"
## 
## $Country
## [1] "US"
## 
## $Variant
## [1] ""
## 
## $Name
## [1] "en_US"
x <- "This is a sentence."
# boundary("word") can be used as a pattern like any other matcher:
# first highlight the matches, then extract them.
str_view_all(x, pattern = boundary("word"))
x %>% str_extract_all(boundary("word"))
## [[1]]
## [1] "This"     "is"       "a"        "sentence"
#14.5.1 Exercises
# Two ways to find the strings that contain a backslash:
# as a regular expression, where the backslash has to be escaped once more ...
str_subset(c("a\\b", "ab"), pattern = "\\\\")
## [1] "a\\b"
# ... or as a fixed string, which only needs the escape required by R itself.
str_subset(c("a\\b", "ab"), pattern = fixed("\\"))
## [1] "a\\b"