Strings

Including Plots

You can also embed plots, for example:

## [1] "one"   "two"   "three"

str_length(c("a", "R for data science", NA))

## [1]  1 18 NA

#> [1]  1 18 NA

str_c("x", "y")

## [1] "xy"

#> [1] "xy"
str_c("x", "y", "z")

## [1] "xyz"

#> [1] "xyz"

str_c("x", "y", sep = ", ")

## [1] "x, y"

#> [1] "x, y"

x <- c("abc", NA)
str_c("|-", x, "-|")

## [1] "|-abc-|" NA

#> [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")

## [1] "|-abc-|" "|-NA-|"

#> [1] "|-abc-|" "|-NA-|"

name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)

## [1] "Good morning Hadley."

#> [1] "Good morning Hadley."

str_c(c("x", "y", "z"), collapse = ", ")

## [1] "x, y, z"

#> [1] "x, y, z"

# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "ı"))

## [1] "I" "I"

#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")

## [1] "İ" "I"

#> [1] "İ" "I"

x <- c("apple", "eggplant", "banana")

str_sort(x, locale = "en")  # English

## [1] "apple"    "banana"   "eggplant"

#> [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") # Hawaiian

## [1] "apple"    "eggplant" "banana"

#> [1] "apple"    "eggplant" "banana"

paste("dog","cat","bird")

## [1] "dog cat bird"

paste0("dog","cat","bird")

## [1] "dogcatbird"

## The paste function separates the strings with spaces, as seen above. In contrast, the paste0 function does not insert any separator and joins the strings directly.

str_c("dog","cat","bird")

## [1] "dogcatbird"

## The sep argument is the string inserted between the arguments being combined, whereas the collapse argument is the string used to join the elements of a vector into a single string.

x <- c("a", "abc", "abcd", "abcde", "abcdef")
M <- str_length(x)
m <- ceiling(M / 2)
str_sub(x, m, m)

## [1] "a" "b" "b" "c" "c"

## This extracts the middle character of each element. I used Google to figure out the most appropriate position: ceiling(str_length(x) / 2). This guarantees that a middle character is always found; for strings with an even number of characters it picks the left of the two middle characters.

## str_wrap() wraps text so that each line fits within a given width. This is useful for formatting long strings of text.

str_trim(" horse ")

## [1] "horse"

str_trim(" horse ", side = "right")

## [1] " horse"

str_trim(" horse ", side = "left")

## [1] "horse "

## str_trim will remove whitespace from the start and/or end of the string, depending on the side argument.

str_pad("horse", 10, side = "both")

## [1] "  horse   "

str_pad("horse", 9, side = "right")

## [1] "horse    "

str_pad("horse", 9, side = "left")

## [1] "    horse"

## str_pad will add characters (whitespace by default) to the string until it reaches the given width.

#' Join the elements of a vector into an English-style list
#'
#' @param x A vector of strings to combine.
#' @param delim String appended to every element except the last one when
#'   `x` has three or more elements. It is not used for shorter inputs.
#' @return `""` when `x` is empty, `x` itself when it has one element,
#'   `"a and b"` for two elements, and `"a, b, and c"` style output (with
#'   the default `delim`) for three or more.
str_commasep <- function(x, delim = ",") {
  count <- length(x)

  # Nothing to join.
  if (count == 0) {
    return("")
  }

  # A single element is handed back untouched.
  if (count == 1) {
    return(x)
  }

  # Two elements are joined by "and" alone, without the delimiter.
  if (count == 2) {
    return(str_c(x[[1]], "and", x[[2]], sep = " "))
  }

  # Three or more: every element but the last gets the delimiter appended,
  # the last one gets "and" in front, and the pieces are joined with spaces.
  leading <- str_c(x[-count], delim)
  closing <- str_c("and", x[[count]], sep = " ")
  str_c(c(leading, closing), collapse = " ")
}
str_commasep("")

## [1] ""

str_commasep("a")

## [1] "a"

str_commasep(c("a", "b"))

## [1] "a and b"

str_commasep(c("a", "b", "c"))

## [1] "a, b, and c"

str_commasep(c("a", "b", "c", "d"))

## [1] "a, b, c, and d"

## Had assistance from google and a friend to complete this one, it was really tricky/complex.

x <- c("apple", "banana", "pear")
str_view(x, "an")

str_view(x, ".a.")

# To create the regular expression, we need \\
dot <- "\\."

# But the expression itself only contains one:
writeLines(dot)

## \.

#> \.

# And this tells R to look for an explicit .
str_view(c("abc", "a.c", "bef"), "a\\.c")

#Explain why each of these strings don’t match a \: "\", "\\", "\\\".

## "\": This will escape the next character in the string.
## "\\": This will resolve to \ in the regular expression, which will escape the next character in the regular expression.
## "\\\": The first two backslashes will resolve to a literal backslash in the regular expression, the third will escape the next character. So in the regular expression, this will escape some escaped character.

str_view("\"'\\", "\"'\\\\", match = TRUE)

str_view(c(".a.b.c", ".a.b", ".a.b.c.d."), c("\\..\\..\\.."), match = TRUE)

## This will match any patterns that are a dot followed by any character, three times.

x <- c("apple", "banana", "pear")
str_view(x, "^a")

str_view(x, "a$")

x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")

str_view(x, "^apple$")

str_view(c("$^$", "ab$^$sfas"), "^\\$\\^\\$$", match = TRUE)

## Had assistance from a friend in completing this.

str_view(stringr::words, "^y", match = TRUE)

str_view(stringr::words, "x$",match=TRUE)

str_view(stringr::words,"^...$",match=TRUE)

str_view(stringr::words,".......",match=TRUE)

# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")

str_view(c("grey", "gray"), "gr(e|a)y")

str_subset(stringr::words,"^[aeiou]")

##   [1] "a"           "able"        "about"       "absolute"    "accept"     
##   [6] "account"     "achieve"     "across"      "act"         "active"     
##  [11] "actual"      "add"         "address"     "admit"       "advertise"  
##  [16] "affect"      "afford"      "after"       "afternoon"   "again"      
##  [21] "against"     "age"         "agent"       "ago"         "agree"      
##  [26] "air"         "all"         "allow"       "almost"      "along"      
##  [31] "already"     "alright"     "also"        "although"    "always"     
##  [36] "america"     "amount"      "and"         "another"     "answer"     
##  [41] "any"         "apart"       "apparent"    "appear"      "apply"      
##  [46] "appoint"     "approach"    "appropriate" "area"        "argue"      
##  [51] "arm"         "around"      "arrange"     "art"         "as"         
##  [56] "ask"         "associate"   "assume"      "at"          "attend"     
##  [61] "authority"   "available"   "aware"       "away"        "awful"      
##  [66] "each"        "early"       "east"        "easy"        "eat"        
##  [71] "economy"     "educate"     "effect"      "egg"         "eight"      
##  [76] "either"      "elect"       "electric"    "eleven"      "else"       
##  [81] "employ"      "encourage"   "end"         "engine"      "english"    
##  [86] "enjoy"       "enough"      "enter"       "environment" "equal"      
##  [91] "especial"    "europe"      "even"        "evening"     "ever"       
##  [96] "every"       "evidence"    "exact"       "example"     "except"     
## [101] "excuse"      "exercise"    "exist"       "expect"      "expense"    
## [106] "experience"  "explain"     "express"     "extra"       "eye"        
## [111] "idea"        "identify"    "if"          "imagine"     "important"  
## [116] "improve"     "in"          "include"     "income"      "increase"   
## [121] "indeed"      "individual"  "industry"    "inform"      "inside"     
## [126] "instead"     "insure"      "interest"    "into"        "introduce"  
## [131] "invest"      "involve"     "issue"       "it"          "item"       
## [136] "obvious"     "occasion"    "odd"         "of"          "off"        
## [141] "offer"       "office"      "often"       "okay"        "old"        
## [146] "on"          "once"        "one"         "only"        "open"       
## [151] "operate"     "opportunity" "oppose"      "or"          "order"      
## [156] "organize"    "original"    "other"       "otherwise"   "ought"      
## [161] "out"         "over"        "own"         "under"       "understand" 
## [166] "union"       "unit"        "unite"       "university"  "unless"     
## [171] "until"       "up"          "upon"        "use"         "usual"

str_subset(stringr::words, "[aeiou]", negate=TRUE)

## [1] "by"  "dry" "fly" "mrs" "try" "why"

str_subset(stringr::words, "[^e]ed$")

## [1] "bed"     "hundred" "red"

str_subset(stringr::words, "i(ng|se)$")

##  [1] "advertise" "bring"     "during"    "evening"   "exercise"  "king"     
##  [7] "meaning"   "morning"   "otherwise" "practise"  "raise"     "realise"  
## [13] "ring"      "rise"      "sing"      "surprise"  "thing"

length(str_subset(stringr::words, "(cei|[^c]ie)"))

## [1] 14

str_view(stringr::words, "q[^u]", match = TRUE)

##ou|ise$|ae|oe|yse$

x <- c("240-567-5594")
str_view(x, "\\d\\d\\d-\\d\\d\\d-\\d\\d\\d\\d")

x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")

## ? is equivalent to {0,1}: it matches the preceding element zero or one time
## + is equivalent to {1,}: it matches the preceding element one or more times
## * is equivalent to {0,}: it matches the preceding element zero or more times

# ^.*$ will match any string that does not contain a newline, for example every element of c("house", "dog").

# "\\{.+\\}" will match any string with curly braces surrounding at least one character. Such as "\\{.+\\}": c("{d}", "{dcf}").

# \d{4}-\d{2}-\d{2} will match four digits followed by a hyphen, followed by two digits followed by a hyphen, followed by another two digits. This expression can be used to match dates in the format "YYYY-MM-DD".

# "\\\\{4}" is \\{4}, which will match four backslashes.

str_view(words, "^[^aeiou]{3}", match = TRUE)

str_view(words, "[aeiou]{3,}", match = TRUE)

str_view(words, "([aeiou][^aeiou]){2,}", match = TRUE)

## A friend helped me solve the last two word searches.

str_view(fruit, "(..)\\1", match = TRUE)

#(.)\1\1: The same character appearing three times in a row. 

# "(.)(.)\\2\\1": A pair of characters followed by the same pair of characters in reversed order.

#(..)\1: Any two characters repeated.

#"(.).\\1.\\1": A character followed by any character, the original character, any other character, the original character again. 

#"(.)(.)(.).*\\3\\2\\1" Three characters followed by zero or more characters of any kind followed by the same three characters but in reverse order.

#Google assisted with finding the meaning of the last two expressions

str_subset(words, "^(.)((.*\\1$)|\\1?$)")

##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "depend"     "educate"    "else"       "encourage"  "engine"    
## [11] "europe"     "evidence"   "example"    "excuse"     "exercise"  
## [16] "expense"    "experience" "eye"        "health"     "high"      
## [21] "knock"      "level"      "local"      "nation"     "non"       
## [26] "rather"     "refer"      "remember"   "serious"    "stairs"    
## [31] "test"       "tonight"    "transport"  "treat"      "trust"     
## [36] "window"     "yesterday"

str_subset(words, "([A-Za-z][A-Za-z]).*\\1")

##  [1] "appropriate" "church"      "condition"   "decide"      "environment"
##  [6] "london"      "paragraph"   "particular"  "photograph"  "prepare"    
## [11] "pressure"    "remember"    "represent"   "require"     "sense"      
## [16] "therefore"   "understand"  "whether"

str_subset("eleven", "([a-z]).*\\1.*\\1")

## [1] "eleven"

x <- c("apple", "banana", "pear")
str_detect(x, "e")

## [1]  TRUE FALSE  TRUE

# How many common words start with t?
sum(str_detect(words, "^t"))

## [1] 65

#> [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))

## [1] 0.2765306

#> [1] 0.2765306

# one regex
words[str_detect(words, "^x|x$")]

## [1] "box" "sex" "six" "tax"

# combination
start_with_x <- str_detect(words, "^x")
end_with_x <- str_detect(words, "x$")
words[start_with_x | end_with_x]

## [1] "box" "sex" "six" "tax"

str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()

## [1] "about"   "accept"  "account" "across"  "act"     "actual"

vowels <- str_count(words, "[aeiou]")
words[which(vowels == max(vowels))]

## [1] "appropriate" "associate"   "available"   "colleague"   "encourage"  
## [6] "experience"  "individual"  "television"

length(sentences)

## [1] 720

#> [1] 720
head(sentences)

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."

#> [1] "The birch canoe slid on the smooth planks." 
#> [2] "Glue the sheet to the dark blue background."
#> [3] "It's easy to tell the depth of a well."     
#> [4] "These days a chicken leg is a rare dish."   
#> [5] "Rice is often served in round bowls."       
#> [6] "The juice of lemons makes fine punch."

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")

colour_match2 <- str_c("\\b(", str_c(colours, collapse = "|"), ")\\b")
colour_match2

## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"

str_extract(sentences, "[A-Za-z][A-Za-z']*") %>% head()

## [1] "The"   "Glue"  "It's"  "These" "Rice"  "The"

pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>%
  head()

## [1] "spring"  "evening" "morning" "winding" "living"  "king"

unique(unlist(str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b"))) %>%
  head()

## [1] "planks" "days"   "bowls"  "lemons" "makes"  "hogs"

noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>% 
  str_extract(noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

has_noun %>% 
  str_match(noun)

##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"

tibble(sentence = sentences) %>% 
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the) ([^ ]+)", 
    remove = FALSE
  )

## # A tibble: 720 x 3
##    sentence                                    article noun   
##    <chr>                                       <chr>   <chr>  
##  1 The birch canoe slid on the smooth planks.  the     smooth 
##  2 Glue the sheet to the dark blue background. the     sheet  
##  3 It's easy to tell the depth of a well.      the     depth  
##  4 These days a chicken leg is a rare dish.    a       chicken
##  5 Rice is often served in round bowls.        <NA>    <NA>   
##  6 The juice of lemons makes fine punch.       <NA>    <NA>   
##  7 The box was thrown beside the parked truck. the     parked 
##  8 The hogs were fed chopped corn and garbage. <NA>    <NA>   
##  9 Four hours of steady work faced us.         <NA>    <NA>   
## 10 Large size in stockings is hard to sell.    <NA>    <NA>   
## # … with 710 more rows

tibble(sentence = sentences) %>% 
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the) ([^ ]+)", 
    remove = FALSE
  )

## # A tibble: 720 x 3
##    sentence                                    article noun   
##    <chr>                                       <chr>   <chr>  
##  1 The birch canoe slid on the smooth planks.  the     smooth 
##  2 Glue the sheet to the dark blue background. the     sheet  
##  3 It's easy to tell the depth of a well.      the     depth  
##  4 These days a chicken leg is a rare dish.    a       chicken
##  5 Rice is often served in round bowls.        <NA>    <NA>   
##  6 The juice of lemons makes fine punch.       <NA>    <NA>   
##  7 The box was thrown beside the parked truck. the     parked 
##  8 The hogs were fed chopped corn and garbage. <NA>    <NA>   
##  9 Four hours of steady work faced us.         <NA>    <NA>   
## 10 Large size in stockings is hard to sell.    <NA>    <NA>   
## # … with 710 more rows

numberword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) +(\\w+)"
sentences[str_detect(sentences, numberword)] %>%
  str_extract(numberword)

##  [1] "seven books"   "two met"       "two factors"   "three lists"  
##  [5] "seven is"      "two when"      "ten inches"    "one war"      
##  [9] "one button"    "six minutes"   "ten years"     "two shares"   
## [13] "two distinct"  "five cents"    "two pins"      "five robins"  
## [17] "four kinds"    "three story"   "three inches"  "six comes"    
## [21] "three batches" "two leaves"

contraction <- "([A-Za-z]+)'([A-Za-z]+)"
sentences[str_detect(sentences, contraction)] %>%
  str_extract(contraction) %>%
  str_split("'")

## [[1]]
## [1] "It" "s" 
## 
## [[2]]
## [1] "man" "s"  
## 
## [[3]]
## [1] "don" "t"  
## 
## [[4]]
## [1] "store" "s"    
## 
## [[5]]
## [1] "workmen" "s"      
## 
## [[6]]
## [1] "Let" "s"  
## 
## [[7]]
## [1] "sun" "s"  
## 
## [[8]]
## [1] "child" "s"    
## 
## [[9]]
## [1] "king" "s"   
## 
## [[10]]
## [1] "It" "s" 
## 
## [[11]]
## [1] "don" "t"  
## 
## [[12]]
## [1] "queen" "s"    
## 
## [[13]]
## [1] "don" "t"  
## 
## [[14]]
## [1] "pirate" "s"     
## 
## [[15]]
## [1] "neighbor" "s"

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")

## [1] "-pple"  "p-ar"   "b-nana"

str_replace_all(x, "[aeiou]", "-")

## [1] "-ppl-"  "p--r"   "b-n-n-"

x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))

## [1] "one house"    "two cars"     "three people"

sentences %>% 
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% 
  head(5)

## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."

str_replace_all("orange/red", "/", "\\\\")

## [1] "orange\\red"

# Named vector mapping each upper-case ASCII letter ("A".."Z", the names) to
# its lower-case counterpart ("a".."z", the values). It is built from the base
# R constants LETTERS and letters rather than typed out by hand, so a pair
# cannot be mistyped or left out.
replacements <- setNames(letters, LETTERS)
lower_words <- str_replace_all(words, pattern = replacements)
head(lower_words)

## [1] "a"        "able"     "about"    "absolute" "accept"   "account"

swapped <- str_replace_all(words, "^([A-Za-z])(.*)([A-Za-z])$", "\\3\\2\\1")

sentences %>%
  head(10) %>% 
  str_split(" ")

## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
## 
## [[6]]
## [1] "The"    "juice"  "of"     "lemons" "makes"  "fine"   "punch."
## 
## [[7]]
## [1] "The"    "box"    "was"    "thrown" "beside" "the"    "parked" "truck."
## 
## [[8]]
## [1] "The"      "hogs"     "were"     "fed"      "chopped"  "corn"     "and"     
## [8] "garbage."
## 
## [[9]]
## [1] "Four"   "hours"  "of"     "steady" "work"   "faced"  "us."   
## 
## [[10]]
## [1] "Large"     "size"      "in"        "stockings" "is"        "hard"     
## [7] "to"        "sell."

x <- c("apples, pears, and bananas")
str_split(x, ", +(and +)?")[[1]]

## [1] "apples"  "pears"   "bananas"

# Splitting by boundary("word") will remove punctuation and keep non-letter characters that are part of a word, such as an apostrophe.

# Splitting a string with an empty pattern ("") will divide (split) the string into individual characters.