library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# You can also embed plots, for example:
## [1] "one" "two" "three"
str_length(c("a", "R for data science", NA))
## [1] 1 18 NA
#> [1] 1 18 NA
str_c("x", "y")
## [1] "xy"
#> [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
#> [1] "xyz"
str_c("x", "y", sep = ", ")
## [1] "x, y"
#> [1] "x, y"
x <- c("abc", NA)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
#> [1] "|-abc-|" "|-NA-|"
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
"Good ", time_of_day, " ", name,
if (birthday) " and HAPPY BIRTHDAY",
"."
)
## [1] "Good morning Hadley."
#> [1] "Good morning Hadley."
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
#> [1] "x, y, z"
# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "ı"))
## [1] "I" "I"
#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "İ" "I"
#> [1] "İ" "I"
x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
## [1] "apple" "banana" "eggplant"
#> [1] "apple" "banana" "eggplant"
str_sort(x, locale = "haw") # Hawaiian
## [1] "apple" "eggplant" "banana"
#> [1] "apple" "eggplant" "banana"
paste("dog","cat","bird")
## [1] "dog cat bird"
paste0("dog","cat","bird")
## [1] "dogcatbird"
## The paste() function separates the strings with spaces by default, as seen above. In contrast, paste0() joins the strings with no separator, producing a single unbroken string.
str_c("dog","cat","bird")
## [1] "dogcatbird"
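## str_c() behaves like paste0(): by default it inserts nothing between the strings. The sep argument is the string inserted between the arguments being combined, whereas the collapse argument is the string used to join the elements of a character vector into a single string.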
x <- c("a", "abc", "abcd", "abcde", "abcdef")
M <- str_length(x)
m <- ceiling(M / 2)
str_sub(x, m, m)
## [1] "a" "b" "b" "c" "c"
## This extracts the middle character of each element. I used Google to work out the most appropriate position, which is ceiling(str_length(x) / 2); this guarantees that a middle character is always found. For strings with an even number of characters it selects the first of the two middle characters.
## str_wrap() wraps text so that each line fits within a given width. This is useful when formatting long strings of text.
str_trim(" horse ")
## [1] "horse"
str_trim(" horse ", side = "right")
## [1] " horse"
str_trim(" horse ", side = "left")
## [1] "horse "
## str_trim() removes whitespace from the start and/or end of a string, depending on the side argument.
str_pad("horse", 10, side = "both")
## [1] " horse "
str_pad("horse", 9, side = "right")
## [1] "horse "
str_pad("horse", 9, side = "left")
## [1] " horse"
## str_pad() does the opposite: it adds padding characters (whitespace by default) to a string until it reaches the requested width.
# Turn a character vector into an English-style list such as "a, b, and c".
#
# x:     character vector of items to join.
# delim: string appended to every item except the last when there are three
#        or more items (default ",").
#
# Returns "" for an empty vector, `x` itself for a single item, "a and b"
# (no delimiter) for two items, and otherwise a single string in which every
# item but the last is followed by `delim` and the last is preceded by "and".
str_commasep <- function(x, delim = ",") {
  n <- length(x)

  if (n == 0) {
    return("")
  }
  if (n == 1) {
    return(x)
  }
  if (n == 2) {
    # Two items are joined by "and" alone, without the delimiter.
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }

  # Three or more items: delimiter after each leading item, "and" before the
  # final one, then everything glued together with single spaces.
  leading <- str_c(x[-n], delim)
  final <- str_c("and", x[[n]], sep = " ")
  str_c(c(leading, final), collapse = " ")
}
str_commasep("")
## [1] ""
str_commasep("a")
## [1] "a"
str_commasep(c("a", "b"))
## [1] "a and b"
str_commasep(c("a", "b", "c"))
## [1] "a, b, and c"
str_commasep(c("a", "b", "c", "d"))
## [1] "a, b, c, and d"
## Had assistance from google and a friend to complete this one, it was really tricky/complex.
x <- c("apple", "banana", "pear")
str_view(x, "an")
str_view(x, ".a.")
# To create the regular expression, we need \\
dot <- "\\."
# But the expression itself only contains one:
writeLines(dot)
## \.
#> \.
# And this tells R to look for an explicit .
str_view(c("abc", "a.c", "bef"), "a\\.c")
#Explain why each of these strings don’t match a \: "\", "\\", "\\\".
## "\": This will escape the next character in the string.
## "\\": This will resolve to \ in the regular expression, which will escape the next character in the regular expression.
## "\\\": The first two backslashes will resolve to a literal backslash in the regular expression, the third will escape the next character. So in the regular expression, this will escape some escaped character.
str_view("\"'\\", "\"'\\\\", match = TRUE)
str_view(c(".a.b.c", ".a.b", ".a.b.c.d."), "\\..\\..\\..", match = TRUE)
## This will match any patterns that are a dot followed by any character, three times.
x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
str_view(x, "^apple$")
str_view(c("$^$", "ab$^$sfas"), "^\\$\\^\\$$", match = TRUE)
## Had assistance from a friend in completing this.
str_view(stringr::words, "^y", match = TRUE)
str_view(stringr::words, "x$",match=TRUE)
str_view(stringr::words,"^...$",match=TRUE)
str_view(stringr::words,".......",match=TRUE)
# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")
str_view(c("grey", "gray"), "gr(e|a)y")
str_subset(stringr::words,"^[aeiou]")
## [1] "a" "able" "about" "absolute" "accept"
## [6] "account" "achieve" "across" "act" "active"
## [11] "actual" "add" "address" "admit" "advertise"
## [16] "affect" "afford" "after" "afternoon" "again"
## [21] "against" "age" "agent" "ago" "agree"
## [26] "air" "all" "allow" "almost" "along"
## [31] "already" "alright" "also" "although" "always"
## [36] "america" "amount" "and" "another" "answer"
## [41] "any" "apart" "apparent" "appear" "apply"
## [46] "appoint" "approach" "appropriate" "area" "argue"
## [51] "arm" "around" "arrange" "art" "as"
## [56] "ask" "associate" "assume" "at" "attend"
## [61] "authority" "available" "aware" "away" "awful"
## [66] "each" "early" "east" "easy" "eat"
## [71] "economy" "educate" "effect" "egg" "eight"
## [76] "either" "elect" "electric" "eleven" "else"
## [81] "employ" "encourage" "end" "engine" "english"
## [86] "enjoy" "enough" "enter" "environment" "equal"
## [91] "especial" "europe" "even" "evening" "ever"
## [96] "every" "evidence" "exact" "example" "except"
## [101] "excuse" "exercise" "exist" "expect" "expense"
## [106] "experience" "explain" "express" "extra" "eye"
## [111] "idea" "identify" "if" "imagine" "important"
## [116] "improve" "in" "include" "income" "increase"
## [121] "indeed" "individual" "industry" "inform" "inside"
## [126] "instead" "insure" "interest" "into" "introduce"
## [131] "invest" "involve" "issue" "it" "item"
## [136] "obvious" "occasion" "odd" "of" "off"
## [141] "offer" "office" "often" "okay" "old"
## [146] "on" "once" "one" "only" "open"
## [151] "operate" "opportunity" "oppose" "or" "order"
## [156] "organize" "original" "other" "otherwise" "ought"
## [161] "out" "over" "own" "under" "understand"
## [166] "union" "unit" "unite" "university" "unless"
## [171] "until" "up" "upon" "use" "usual"
str_subset(stringr::words, "[aeiou]", negate=TRUE)
## [1] "by" "dry" "fly" "mrs" "try" "why"
str_subset(stringr::words, "[^e]ed$")
## [1] "bed" "hundred" "red"
str_subset(stringr::words, "i(ng|se)$")
## [1] "advertise" "bring" "during" "evening" "exercise" "king"
## [7] "meaning" "morning" "otherwise" "practise" "raise" "realise"
## [13] "ring" "rise" "sing" "surprise" "thing"
length(str_subset(stringr::words, "(cei|[^c]ie)"))
## [1] 14
str_view(stringr::words, "q[^u]", match = TRUE)
##ou|ise$|ae|oe|yse$
x <- c("240-567-5594")
str_view(x, "\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d")
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
## ? is equivalent to {0,1}: it matches the preceding element at most once (zero or one time)
## + is equivalent to {1,}: it matches the preceding element one or more times
## * is equivalent to {0,}: it matches the preceding element zero or more times
# ^.*$ will match any string that contains no newline, for example each element of c("house", "dog").
# "\\{.+\\}" will match any string with curly braces surrounding at least one character. Such as "\\{.+\\}": c("{d}", "{dcf}").
# \d{4}-\d{2}-\d{2} will match four digits followed by a hyphen, followed by two digits, followed by a hyphen, followed by another two digits. This expression can be used to match dates in the format “YYYY-MM-DD”.
# "\\\\{4}" is \\{4}, which will match four backslashes.
str_view(words, "^[^aeiou]{3}", match = TRUE)
str_view(words, "[aeiou]{3,}", match = TRUE)
str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)
## A friend helped me solve the last two word searches.
str_view(fruit, "(..)\\1", match = TRUE)
#(.)\1\1: The same character appearing three times in a row.
# "(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order.
#(..)\1: Any two characters repeated.
#"(.).\\1.\\1": A character followed by any character, the original character, any other character, the original character again.
#"(.)(.)(.).*\\3\\2\\1" Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order.
#Google assisted with finding the meaning of the last two expressions
str_subset(words, "^(.)((.*\\1$)|\\1?$)")
## [1] "a" "america" "area" "dad" "dead"
## [6] "depend" "educate" "else" "encourage" "engine"
## [11] "europe" "evidence" "example" "excuse" "exercise"
## [16] "expense" "experience" "eye" "health" "high"
## [21] "knock" "level" "local" "nation" "non"
## [26] "rather" "refer" "remember" "serious" "stairs"
## [31] "test" "tonight" "transport" "treat" "trust"
## [36] "window" "yesterday"
str_subset(words, "([A-Za-z][A-Za-z]).*\\1")
## [1] "appropriate" "church" "condition" "decide" "environment"
## [6] "london" "paragraph" "particular" "photograph" "prepare"
## [11] "pressure" "remember" "represent" "require" "sense"
## [16] "therefore" "understand" "whether"
str_subset("eleven", "([a-z]).*\\1.*\\1")
## [1] "eleven"
x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1] TRUE FALSE TRUE
# How many common words start with t?
sum(str_detect(words, "^t"))
## [1] 65
#> [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
#> [1] 0.2765306
# one regex
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
# combination
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]
## [1] "box" "sex" "six" "tax"
str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()
## [1] "about" "accept" "account" "across" "act" "actual"
# Count the vowels in every word, then keep the words that reach the maximum.
vowels <- str_count(words, "[aeiou]")
words[vowels == max(vowels)]
## [1] "appropriate" "associate" "available" "colleague" "encourage"
## [6] "experience" "individual" "television"
length(sentences)
## [1] 720
#> [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
#> [1] "The birch canoe slid on the smooth planks."
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."
#> [4] "These days a chicken leg is a rare dish."
#> [5] "Rice is often served in round bowls."
#> [6] "The juice of lemons makes fine punch."
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match2 <- str_c("\\b(", str_c(colours, collapse = "|"), ")\\b")
colour_match2
## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()
## [1] "The" "Glue" "It's" "These" "Rice" "The"
# Collect the distinct words ending in "ing" from the sentences that contain
# at least one such word, and show the first few.
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
sentences[sentences_with_ing] %>%
  str_extract_all(pattern) %>%
  unlist() %>%
  unique() %>%
  head()
## [1] "spring" "evening" "morning" "winding" "living" "king"
unique(unlist(str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b"))) %>%
head()
## [1] "planks" "days" "bowls" "lemons" "makes" "hogs"
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>%
str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
has_noun %>%
str_match(noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article", "noun"), "(a|the) ([^ ]+)",
remove = FALSE
)
## # A tibble: 720 x 3
## sentence article noun
## <chr> <chr> <chr>
## 1 The birch canoe slid on the smooth planks. the smooth
## 2 Glue the sheet to the dark blue background. the sheet
## 3 It's easy to tell the depth of a well. the depth
## 4 These days a chicken leg is a rare dish. a chicken
## 5 Rice is often served in round bowls. <NA> <NA>
## 6 The juice of lemons makes fine punch. <NA> <NA>
## 7 The box was thrown beside the parked truck. the parked
## 8 The hogs were fed chopped corn and garbage. <NA> <NA>
## 9 Four hours of steady work faced us. <NA> <NA>
## 10 Large size in stockings is hard to sell. <NA> <NA>
## # … with 710 more rows
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article", "noun"), "(a|the) ([^ ]+)",
remove = FALSE
)
## # A tibble: 720 x 3
## sentence article noun
## <chr> <chr> <chr>
## 1 The birch canoe slid on the smooth planks. the smooth
## 2 Glue the sheet to the dark blue background. the sheet
## 3 It's easy to tell the depth of a well. the depth
## 4 These days a chicken leg is a rare dish. a chicken
## 5 Rice is often served in round bowls. <NA> <NA>
## 6 The juice of lemons makes fine punch. <NA> <NA>
## 7 The box was thrown beside the parked truck. the parked
## 8 The hogs were fed chopped corn and garbage. <NA> <NA>
## 9 Four hours of steady work faced us. <NA> <NA>
## 10 Large size in stockings is hard to sell. <NA> <NA>
## # … with 710 more rows
# A number word (one to ten) followed by one or more spaces and the next word.
numberword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
sentences %>%
  str_subset(numberword) %>%
  str_extract(numberword)
## [1] "seven books" "two met" "two factors" "three lists"
## [5] "seven is" "two when" "ten inches" "one war"
## [9] "one button" "six minutes" "ten years" "two shares"
## [13] "two distinct" "five cents" "two pins" "five robins"
## [17] "four kinds" "three story" "three inches" "six comes"
## [21] "three batches" "two leaves"
# Letters on both sides of an apostrophe; the first match in each sentence is
# split at the apostrophe into its two pieces.
contraction <- "([A-Za-z]+)'([A-Za-z]+)"
sentences %>%
  str_subset(contraction) %>%
  str_extract(contraction) %>%
  str_split("'")
## [[1]]
## [1] "It" "s"
##
## [[2]]
## [1] "man" "s"
##
## [[3]]
## [1] "don" "t"
##
## [[4]]
## [1] "store" "s"
##
## [[5]]
## [1] "workmen" "s"
##
## [[6]]
## [1] "Let" "s"
##
## [[7]]
## [1] "sun" "s"
##
## [[8]]
## [1] "child" "s"
##
## [[9]]
## [1] "king" "s"
##
## [[10]]
## [1] "It" "s"
##
## [[11]]
## [1] "don" "t"
##
## [[12]]
## [1] "queen" "s"
##
## [[13]]
## [1] "don" "t"
##
## [[14]]
## [1] "pirate" "s"
##
## [[15]]
## [1] "neighbor" "s"
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-" "p--r" "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house" "two cars" "three people"
sentences %>%
str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
head(5)
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."
str_replace_all("orange/red", "/", "\\\\")
## [1] "orange\\red"
# Lookup table for a hand-rolled lower-casing: names are the upper-case letters
# A-Z and values are the matching lower-case letters, built from R's built-in
# LETTERS and letters constants.
replacements <- setNames(letters, LETTERS)
lower_words <- str_replace_all(words, pattern = replacements)
head(lower_words)
## [1] "a" "able" "about" "absolute" "accept" "account"
swapped <- str_replace_all(words, "^([A-Za-z])(.*)([A-Za-z])$", "\\3\\2\\1")
sentences %>%
head(10) %>%
str_split(" ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
##
## [[6]]
## [1] "The" "juice" "of" "lemons" "makes" "fine" "punch."
##
## [[7]]
## [1] "The" "box" "was" "thrown" "beside" "the" "parked" "truck."
##
## [[8]]
## [1] "The" "hogs" "were" "fed" "chopped" "corn" "and"
## [8] "garbage."
##
## [[9]]
## [1] "Four" "hours" "of" "steady" "work" "faced" "us."
##
## [[10]]
## [1] "Large" "size" "in" "stockings" "is" "hard"
## [7] "to" "sell."
x <- c("apples, pears, and bananas")
str_split(x, ", +(and +)?")[[1]]
## [1] "apples" "pears" "bananas"
# Splitting by boundary("word") removes punctuation but keeps non-letter characters that are part of a word, such as an apostrophe.
# Splitting a string with an empty pattern ("") divides (splits) the string into individual characters.