library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.0.6 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
The paste() separator in nothing is (“”) and paste the default separator is space ("“). The equivalent stringr function are str_c(), stri_paste(), stri_join(), and stri_flatten().In paste, missing values, NA, are treated as strings, “NA”.
The sep argument joins elements from multiple character vectors, while the collapse argument joins elements in a character vector.
# Extract the middle character of a string. For an even-length string there
# are two middle characters; this picks the left one.
x <- "Apples"
# ceiling(n / 2) is the middle position for odd n and the left-middle position
# for even n. Using floor(n / 2) for odd n would land one character too far
# left (position 2 of a 5-character string instead of position 3).
mid <- ceiling(str_length(x) / 2)
str_sub(x, mid, mid)
## [1] "p"
This could be useful for reformatting text.
str_trim() trims whitespace form the beginning or the end of a string. The opposite, str_pad(), adds whitespace.
“" is a special character.”" doesn’t match" because ecen though it escaped the "in the regexp, we should wrap it in a string first, and a string requires more escaping outside the special behavior.
# A string holding a double quote, a backtick and a backslash. The pattern
# escapes each character for the regex engine, and every regex backslash is
# doubled again for the R string literal.
x <- "\"\`\\"
str_view(string = x, pattern = "\\\"\\`\\\\")

# The regex \..\..\.. written as an R string; writeLines() shows the raw
# characters the string actually contains.
x <- "\\..\\..\\.."
writeLines(text = x)
## \..\..\..
# Match the literal string "$^$" (including the surrounding double quotes).
x <- '"$^$"'
# `$` and `^` are regex anchors, so each one needs a regex backslash, which is
# doubled inside the R string literal. Left unescaped, the pattern asks for
# end-of-string, start-of-string, end-of-string and never matches the literal.
str_view(x, '"\\$\\^\\$"')
Start with “y”. End with “x” Are exactly three letters long. (Don’t cheat by using str_length()!) Have seven letters or more.
Since this list is long, you might want to use the match argument to str_view() to show only the matching or non-matching words.
words <- stringr::words
# Start with "y".
str_view(words, "^y", match = TRUE)
# End with "x".
str_view(words, "x$", match = TRUE)
# Exactly three letters long.
str_view(words, "^...$", match = TRUE)
# Seven letters or more: `{7,}` is open-ended, whereas seven dots between
# `^` and `$` would match words of exactly seven letters only.
str_view(words, "^.{7,}$", match = TRUE)
Start with a vowel. That only contain consonants. (Hint: thinking about matching “not”-vowels.) End with ed, but not with eed. End with ing or ise.
# Start with a vowel.
str_view(words, "^[aeiou]", match = TRUE)
# Contain only consonants: one or more non-vowels spanning the whole word.
# Without the `+` the pattern would only match one-letter words. Words such
# as "by" qualify because "y" is not in the vowel class.
str_view(words, "^[^aeiou]+$", match = TRUE)
# End with "ed" but not with "eed"; `(^|[^e])` also admits the bare word "ed".
str_view(words, "(^|[^e])ed$", match = TRUE)
# End with "ing" or "ise".
str_view(words, "(ing|ise)$", match = TRUE)
# Words that follow "i before e except after c".
str_view(words, "(cei|[^c]ie)", match = TRUE)
# "q" followed by anything other than "u".
str_view(words, "q[^u]", match = TRUE)
# Words containing the spelling "colour".
str_view(words, "colour", match = TRUE)
? <-{0,1} <-{1,} <-{0.}
^.*$ # This matches any string
“\{.+\}” # This string of a regex that matches one or more periods
-- # This regex matches 4 digits, a dash, 2 digts, a dash then 2 digits.
“\\{4}” # This regex is in a string and matches 4’s.
# Start with three consonants.
str_view(words, "^[^aeiou]{3}", match = TRUE)
# Have three or more vowels in a row.
str_view(words, "[aeiou]{3}", match = TRUE)
# Have two or more vowel-consonant pairs in a row.
str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)
Describe, in words, what these expressions will match:
(.)\1\1 # Words with two of the same letter once.
“(.)(.)\2\1” # Words with any two letters repeated twice consecutively once in the word.
(..)\1 # Words with two letter repeated once.
“(.).\1.\1” # Words with any letter, a specific letter repeated once than another letter repeated once
"(.)(.)(.).*\3\2\1" # Words with one letter, then two of the same letter, 3 of the same letter than 0 or more of a letter.
# Start and end with the same character. The optional group lets one-letter
# words such as "a" match as well.
str_view(words, "^(.)(.*\\1)?$", match = TRUE)
# Contain a repeated pair of letters.
str_view(words, "(..).*\\1", match = TRUE)
# Contain one letter repeated in at least three places.
str_view(words, "(.).*\\1.*\\1", match = TRUE)
Find all words that start or end with x. Find all words that start with a vowel and end with a consonant. *Are there any words that contain at least one of each different vowel?
# Words that start or end with "x": a single regex using alternation.
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
# The same result from two separate str_detect() calls combined with `|`.
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
## [1] "box" "sex" "six" "tax"
# Words that start with a vowel and end with a consonant: single regex.
# The final class is negated so the last letter is a non-vowel.
str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()
# The same result from two separate str_detect() calls combined with `&`.
start_with_vowel <- str_detect(words, "^[aeiou]")
end_with_consonant <- str_detect(words, "[^aeiou]$")
words[start_with_vowel & end_with_consonant] %>% head()
# Words containing every one of the five vowels: build one logical mask per
# vowel, then AND the masks together element-wise.
vowel_masks <- lapply(
  c("a", "e", "i", "o", "u"),
  function(v) str_detect(words, v)
)
words[Reduce(`&`, vowel_masks)]
## character(0)
# Word with the highest number of vowels. which.max() returns the index of the
# first maximum only, so ties beyond the first are not reported.
vowel_counts <- str_count(words, "[aeiou]")
hnv <- which.max(vowel_counts)
words[hnv]
## [1] "appropriate"
In the previous example, you might have noticed that the regular expression matched “flickered”, which is not a colour. Modify the regex to fix the problem.
From the Harvard sentences data, extract:
The first word from each sentence. All words ending in ing. *All plurals.
# First word of each sentence: the leading run of letters. The upper-case
# range must reach "Z"; stopping at "X" would skip a leading "Y" or "Z".
str_extract(sentences, "[a-zA-Z]+") %>% head()
## [1] "The" "Glue" "It" "These" "Rice" "The"
# All words ending in "ing". The class needs the range `A-Z`; written as `AZ`
# it would only admit the two capitals "A" and "Z".
pattern <- "\\b[A-Za-z]+ing\\b"
# Restrict to sentences that contain a match before extracting every match.
sentences_with_ing <- str_detect(sentences, pattern)
unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>%
  head()
## [1] "spring" "evening" "morning" "winding" "living" "king"
# Heuristic for plurals: whole words of at least four letters ending in "s".
# str_extract_all() returns one character vector per sentence, so flatten
# before de-duplicating.
plural_candidates <- str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b")
head(unique(unlist(plural_candidates)))
## [1] "planks" "days" "bowls" "lemons" "makes" "hogs"
# A number word followed by the next word. The leading \\b stops the number
# from matching inside a longer word (e.g. "ten" inside "often"), and \\w+
# captures the following word without any trailing punctuation.
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
sentences[str_detect(sentences, numword)] %>%
  str_extract(numword)
## (Output not reproduced here; re-run to regenerate. The listing produced by
## the pattern without the word boundary contained false positives such as
## "ten served" and trailing punctuation such as "one floor.".)
# Contractions and possessives: letters, an apostrophe, then more letters.
contraction <- "([A-Za-z]+)'([A-Za-z]+)"
# Keep only the sentences that contain a match, then pull out the first match
# from each of them.
has_contraction <- str_detect(sentences, contraction)
str_extract(sentences[has_contraction], contraction)
## [1] "It's" "man's" "don't" "store's" "workmen's"
## [6] "Let's" "sun's" "child's" "king's" "It's"
## [11] "don't" "queen's" "don't" "pirate's" "neighbor's"
# Split a comma-separated list into its items. The separator is a comma, the
# spaces after it, and an optional "and " so that neither the leading
# whitespace nor the conjunction ends up in the pieces.
x <- "apple, pears, and bananas"
str_split(x, ", +(and +)?")[[1]]
## [1] "apple" "pears" "bananas"
# Words that contain at least one upper-case letter.
has_capital <- str_detect(words, "[A-Z]")
simple <- words[has_capital]
simple
## [1] "Christ" "Christmas"
# Swap the first and last letters of every word. The pattern captures the
# first letter, the middle, and the last letter; the replacement emits them as
# last, middle, first. Words that do not match are returned unchanged.
swap_ends <- "^([A-Za-z])(.*)([A-Za-z])$"
switch <- str_replace_all(
  string = words,
  pattern = swap_ends,
  replacement = "\\3\\2\\1"
)
head(switch)
## [1] "a" "ebla" "tboua" "ebsoluta" "tccepa" "tccouna"