Load stringr

Text data are strings, and the R packages stringr and stringi are libraries that help you work with string data as part of data analysis.

Note: Text string color is red and is not customizable; I tried to use another color while in dark mode.

library(stringi)
library(stringr)
library(tidyverse)

# stringr has built-in data
fruit = stringr::fruit
head(fruit)
## [1] "apple"       "apricot"     "avocado"     "banana"      "bell pepper"
## [6] "bilberry"
head(sentences)
## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."

Text strings

Messy text

txt = "YOU CAN HAVE ALL CAPS DATA edit edit edit"
text2 = "// YoU nEvEr kNöw //"
text3 = "2-009.89.0."
text4 = "<p> ...Text strings  </p> "
text5 = "|| * * * <br> \n didyouseeThis?! {stringr} z = 6 *9"
text6 = "http://www.link.com/strings"
text7 = "bill total $400,35 [07/08/01]"
folder = c("results-001-2020.csv", "results-002-2020.csv", "results-003-2020.csv", "results-004-2020.csv", "results-005-2020.csv")

Stringr Functions

We will use the fruit data and the text strings defined above.

detect matches

Detection methods return a logical (Boolean) vector or index locations.

  • str_detect(string, pattern)
  • str_detect(fruit, pattern = "a")
  • str_detect(txt, pattern = "<h1>")
  • str_starts(fruit, pattern = "ap" )
str_starts(text2, pattern = "//")
## [1] TRUE

More detection functions:

  • str_which() finds the indexes of strings that contain match
  • str_locate() locates the positions of pattern match
  • str_locate_all(fruit, pattern = "an")
  • str_count() counts the number of matches in a string
  • str_count(fruit, pattern = "an")
str_which(fruit, pattern = "ma")
## [1] 48 49 66 74 77

Subset strings

  • str_sub(string, start= , end= ) extract substrings from character vector
  • str_subset(string, pattern= ) returns only the strings that contain a pattern match
  • str_extract() / str_extract_all() return the 1st match in each string as a vector / return all matches as a list
  • str_match() / str_match_all() return the 1st pattern match as a matrix / return all matches as a list of matrices
str_sub(text2, start = 4, end = 17)
## [1] "YoU nEvEr kNöw"
str_subset(fruit, pattern = "bl")
## [1] "blackberry"   "blackcurrant" "blood orange" "blueberry"
str_extract_all(txt, "edit")
## [[1]]
## [1] "edit" "edit" "edit"

manage string lengths

  • str_length(string) returns length (width) of entire strings
  • str_pad(string, width= , side= c('left','right','both')) add string padding
  • str_trunc(string, width= , side= ) truncate a string
  • str_squish(string) trim whitespace from each end and collapse repeated internal whitespace into single spaces
str_length(txt)
## [1] 41
str_pad(fruit[1], width=15, side='left')  # pad on left side of 1st element in fruits
## [1] "          apple"
sentences[1]
## [1] "The birch canoe slid on the smooth planks."
str_trunc(sentences[1], width = 17, side='right')
## [1] "The birch cano..."
str_squish(sentences[1])
## [1] "The birch canoe slid on the smooth planks."

Mutate strings

  • str_replace(string, pattern= ,replacement= ) / str_replace_all()
  • str_to_lower() converts strings to lowercase
  • str_to_upper() converts strings to uppercase
  • str_to_title() convert strings to title case
str_replace(text2, pattern = "nEvEr", replacement = "always")
## [1] "// YoU always kNöw //"
str_to_lower(txt, locale = 'en')
## [1] "you can have all caps data edit edit edit"
str_to_upper(txt)
## [1] "YOU CAN HAVE ALL CAPS DATA EDIT EDIT EDIT"
str_to_title(txt)
## [1] "You Can Have All Caps Data Edit Edit Edit"

Join & Split strings

  • str_c(string1, string2, sep= ) join multiple strings into single string
  • str_flatten(string, collapse= ) combines into a single string
  • str_dup(string, times= n) repeat strings by n times
  • str_split_fixed(string, pattern= , n= ) /str_split() / str_split_()
  • str_glue() create string from strings and {expressions}
  • str_glue_data() use a dataframe/ list to create a string from strings and {expressions}
str_c(letters[1:5], LETTERS[1:5], sep = "^")
## [1] "a^A" "b^B" "c^C" "d^D" "e^E"
str_c(txt, text4, sep = " :: ")
## [1] "YOU CAN HAVE ALL CAPS DATA edit edit edit :: <p> ...Text strings  </p> "
str_flatten(fruit[1:5], collapse = " % ")
## [1] "apple % apricot % avocado % banana % bell pepper"
str_dup(text3, times = 3)
## [1] "2-009.89.0.2-009.89.0.2-009.89.0."
str_split_fixed(txt, pattern = "YOU", n= 3)
##      [,1] [,2]                                     [,3]
## [1,] ""   " CAN HAVE ALL CAPS DATA edit edit edit" ""
str_glue("Pi is {pi}")
## Pi is 3.14159265358979
str_glue(text2, text7)
## // YoU nEvEr kNöw //bill total $400,35 [07/08/01]
# starwars data is from dplyr
str_glue_data(head(starwars), "starwars has {head(name)}")
## starwars has Luke Skywalker
## starwars has C-3PO
## starwars has R2-D2
## starwars has Darth Vader
## starwars has Leia Organa
## starwars has Owen Lars

Order Strings

str_order(letters[1:10], decreasing = F, numeric = T)
##  [1]  1  2  3  4  5  6  7  8  9 10
str_sort(letters[10:1])
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"

Helpers

str_view_all(fruit, pattern = "berry") returns a page with the matches highlighted.

# override existing encoding of letter
str_conv(fruit[1:4], encoding = "ISO-8859-1")
## [1] "apple"   "apricot" "avocado" "banana"
str_wrap(txt, width = 5, indent = 2, exdent = 3)
## [1] "  YOU\n   CAN\n   HAVE\n   ALL\n   CAPS\n   DATA\n   edit\n   edit\n   edit"
str_wrap(letters, width = 5, indent = 7)
##  [1] "       a" "       b" "       c" "       d" "       e" "       f"
##  [7] "       g" "       h" "       i" "       j" "       k" "       l"
## [13] "       m" "       n" "       o" "       p" "       q" "       r"
## [19] "       s" "       t" "       u" "       v" "       w" "       x"
## [25] "       y" "       z"

Regular Expressions (RegEx)

This section is need-to-know:

Match character

To match a specific character, use "\\{char}" example "\\!"

  • {char} = . ! ? ( ) { } \n \t
  • \d (any digit), written "\\d" inside an R string
  • \w (any word character), written "\\w" inside an R string
  • to match one literal backslash you need four backslashes: "\\\\"

Note that the ‘.’ expression matches any character except a newline.

RegEx Helpers

  • "[:digit:]" matches digits
  • "[:alpha:]" matches letters
  • "[:lower:]" matches lowercase letters
  • "[:upper:]" matches uppercase letters
  • "[:alnum:]" matches letters and numbers
  • "[:punct:]" matches punctuation
  • "[:graph:]" matches letters, numbers, punctuation
  • "[:space:]" matches space characters
  • "[:blank:]" matches space and tab (not new line)
expressions = c("abc ABC 123\t!?\\(){}\n")

str_extract_all(text7, "[:digit:]")
## [[1]]
##  [1] "4" "0" "0" "3" "5" "0" "7" "0" "8" "0" "1"
str_extract_all(text2, "[:alnum:]")
## [[1]]
##  [1] "Y" "o" "U" "n" "E" "v" "E" "r" "k" "N" "ö" "w"
str_extract_all(text2, "[:lower:]")
## [[1]]
## [1] "o" "n" "v" "r" "k" "ö" "w"
str_extract_all(text6, "[:alpha:]")
## [[1]]
##  [1] "h" "t" "t" "p" "w" "w" "w" "l" "i" "n" "k" "c" "o" "m" "s" "t" "r" "i" "n"
## [20] "g" "s"

Alternates

Custom searches of strings

# match A or B
# str_match(fruit, "apple|pear")
str_count(text3, "0|9")
## [1] 5
# match one of
str_match_all(text3, "[09]")
## [[1]]
##      [,1]
## [1,] "0" 
## [2,] "0" 
## [3,] "9" 
## [4,] "9" 
## [5,] "0"
# match any single character except those listed inside the brackets
str_extract_all(text6, "[^http://]")
## [[1]]
##  [1] "w" "w" "w" "." "l" "i" "n" "k" "." "c" "o" "m" "s" "r" "i" "n" "g" "s"
# match any single character from the set listed inside the brackets (a character class, not a literal string)
str_extract_all(text4, "[Text - strings]")
## [[1]]
##  [1] " " "T" "e" "x" "t" " " "s" "t" "r" "i" "n" "g" "s" " " " " " "

Quantifiers

# match zero or one (the preceding character is optional)
str_match_all(txt, "edit?")
## [[1]]
##      [,1]  
## [1,] "edit"
## [2,] "edit"
## [3,] "edit"
str_match(folder, "2020?")
##      [,1]  
## [1,] "2020"
## [2,] "2020"
## [3,] "2020"
## [4,] "2020"
## [5,] "2020"
# match ZERO or more
str_match(text3, "8*")
##      [,1]
## [1,] ""
# match exactly n times
str_match_all(text6, "w{2}")
## [[1]]
##      [,1]
## [1,] "ww"
# match n or more times
str_match_all(txt, "(?:edit ?){2,}")
## [[1]]
##      [,1]            
## [1,] "edit edit edit"
# match between n and m times
str_match_all(text3, "0{1,3}")
## [[1]]
##      [,1]
## [1,] "00"
## [2,] "0" 

Anchors !

This is the specialized part

# match at start of string
str_count(folder, "^results")
## [1] 1 1 1 1 1
# match at end of string
str_detect(folder, "\\.csv$")
## [1] TRUE TRUE TRUE TRUE TRUE

The folder vector holds a list of CSV file names; to match each of them exactly, anchor the pattern at both the start and the end. In the pattern, \\d{n} matches exactly n digits: there are 3 digits after "results-" and 4 digits after the next hyphen. This is useful when iterating over files with a purrr function.

folder
## [1] "results-001-2020.csv" "results-002-2020.csv" "results-003-2020.csv"
## [4] "results-004-2020.csv" "results-005-2020.csv"
str_extract_all(folder, "^results-\\d{3}-\\d{4}\\.csv$")
## [[1]]
## [1] "results-001-2020.csv"
## 
## [[2]]
## [1] "results-002-2020.csv"
## 
## [[3]]
## [1] "results-003-2020.csv"
## 
## [[4]]
## [1] "results-004-2020.csv"
## 
## [[5]]
## [1] "results-005-2020.csv"

Look arounds

s ="bacad"

# match followed by a char
str_match(s, "a(?=c)")
##      [,1]
## [1,] "a"
# match not followed by a char
str_extract_all(s, "a(?!c)")
## [[1]]
## [1] "a"
# match not preceded by a char
str_extract_all(s, "(?<!b)a")
## [[1]]
## [1] "a"