Using Stringr for strings

Load stringr
Text strings
Stringr Functions
Regular Expressions (RegEx)

Load stringr

Text is stored as strings, and the R packages stringr and stringi are libraries that help you work with string data as part of data analysis.

Note: The color of text strings is red and is not customizable; I tried to set another color while using dark mode.

library(stringi)
library(stringr)
library(tidyverse)

# stringr has built-in data
fruit = stringr::fruit
head(fruit)

## [1] "apple"       "apricot"     "avocado"     "banana"      "bell pepper"
## [6] "bilberry"

head(sentences)

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."

Text strings

Messy text

txt = "YOU CAN HAVE ALL CAPS DATA edit edit edit"
text2 = "// YoU nEvEr kNöw //"
text3 = "2-009.89.0."
text4 = "<p> ...Text strings  </p> "
text5 = "|| * * * <br> \n didyouseeThis?! {stringr} z = 6 *9"
text6 = "http://www.link.com/strings"
text7 = "bill total $400,35 [07/08/01]"
folder = c("results-001-2020.csv", "results-002-2020.csv", "results-003-2020.csv", "results-004-2020.csv", "results-005-2020.csv")

Stringr Functions

Will use fruits data and text strings.

detect matches

Detection functions return a logical (Boolean) vector or the index locations of the matches.

str_detect(string, pattern)
str_detect(fruit, pattern = "a")
str_detect(txt, pattern = "<h1>")
str_starts(fruit, pattern = "ap" )

str_starts(text2, pattern = "//")

## [1] TRUE

More detection functions:

str_which() finds the indexes of strings that contain match
str_locate() locates the positions of pattern match
str_locate_all(fruit, pattern = "an")
str_count() counts the number of matches in a string
str_count(fruit, pattern = "an")

str_which(fruit, pattern = "ma")

## [1] 48 49 66 74 77

Subset strings

str_sub(string, start= , end= ) extract substrings from character vector
str_subset(string, pattern= ) returns only string in pattern match
str_extract() / str_extract_all() return the 1st match in each string as a character vector / all matches as a list
str_match() / str_match_all() return the 1st match / all matches as a character matrix

str_sub(text2, start = 4, end = 17)

## [1] "YoU nEvEr kNöw"

str_subset(fruit, pattern = "bl")

## [1] "blackberry"   "blackcurrant" "blood orange" "blueberry"

str_extract_all(txt, "edit")

## [[1]]
## [1] "edit" "edit" "edit"

manage string lengths

str_length(string) returns length (width) of entire strings
str_pad(string, width= , side= c('left','right','both')) add string padding
str_trunc(string, width= , side= ) truncate a string
str_squish(string) trim whitespace from each end and collapse repeated internal whitespace into single spaces

str_length(txt)

## [1] 41

str_pad(fruit[1], width=15, side='left')  # pad on left side of 1st element in fruits

## [1] "          apple"

sentences[1]

## [1] "The birch canoe slid on the smooth planks."

str_trunc(sentences[1], width = 17, side='right')

## [1] "The birch cano..."

str_squish(sentences[1])

## [1] "The birch canoe slid on the smooth planks."

Mutate strings

str_replace(string, pattern= ,replacement= ) / str_replace_all()
str_to_lower() converts strings to lowercase
str_to_upper() converts strings to uppercase
str_to_title() convert strings to title case

str_replace(text2, pattern = "nEvEr", replacement = "always")

## [1] "// YoU always kNöw //"

str_to_lower(txt, locale = 'en')

## [1] "you can have all caps data edit edit edit"

str_to_upper(txt)

## [1] "YOU CAN HAVE ALL CAPS DATA EDIT EDIT EDIT"

str_to_title(txt)

## [1] "You Can Have All Caps Data Edit Edit Edit"

Join & Split strings

str_c(string1, string2, sep= ) join multiple strings into single string
str_flatten(string, collapse= ) combines into a single string
str_dup(string, times= n) repeat strings by n times
str_split_fixed(string, pattern= , n= ) /str_split() / str_split_()
str_glue() create string from strings and {expressions}
str_glue_data() use a dataframe/ list to create a string from strings and {expressions}

str_c(letters[1:5], LETTERS[1:5], sep = "^")

## [1] "a^A" "b^B" "c^C" "d^D" "e^E"

str_c(txt, text4, sep = " :: ")

## [1] "YOU CAN HAVE ALL CAPS DATA edit edit edit :: <p> ...Text strings  </p> "

str_flatten(fruit[1:5], collapse = " % ")

## [1] "apple % apricot % avocado % banana % bell pepper"

str_dup(text3, times = 3)

## [1] "2-009.89.0.2-009.89.0.2-009.89.0."

str_split_fixed(txt, pattern = "YOU", n= 3)

##      [,1] [,2]                                     [,3]
## [1,] ""   " CAN HAVE ALL CAPS DATA edit edit edit" ""

str_glue("Pi is {pi}")

## Pi is 3.14159265358979

str_glue(text2, text7)

## // YoU nEvEr kNöw //bill total $400,35 [07/08/01]

# starwars data is from dplyr
str_glue_data(head(starwars), "starwars has {head(name)}")

## starwars has Luke Skywalker
## starwars has C-3PO
## starwars has R2-D2
## starwars has Darth Vader
## starwars has Leia Organa
## starwars has Owen Lars

Order Strings

str_order(letters[1:10], decreasing = F, numeric = T)

##  [1]  1  2  3  4  5  6  7  8  9 10

str_sort(letters[10:1])

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"

Helpers

str_view_all(fruit, pattern = "berry") returns a page with highlighted matches.

# override existing encoding of letter
str_conv(fruit[1:4], encoding = "ISO-8859-1")

## [1] "apple"   "apricot" "avocado" "banana"

str_wrap(txt, width = 5, indent = 2, exdent = 3)

## [1] "  YOU\n   CAN\n   HAVE\n   ALL\n   CAPS\n   DATA\n   edit\n   edit\n   edit"

str_wrap(letters, width = 5, indent = 7)

##  [1] "       a" "       b" "       c" "       d" "       e" "       f"
##  [7] "       g" "       h" "       i" "       j" "       k" "       l"
## [13] "       m" "       n" "       o" "       p" "       q" "       r"
## [19] "       s" "       t" "       u" "       v" "       w" "       x"
## [25] "       y" "       z"

Regular Expressions (RegEx)

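This section covers the need-to-know basics:

regular expressions are written as strings inside double quotes " "
special characters: run ?"'" in the RStudio console for the full list
- \\ = \
- \" = "
- \n = new line
use writeLines("\\.") to see the string as the regex engine receives it (after R has processed the escapes)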

Match character

To match a special character literally, escape it with a double backslash: "\\{char}", for example "\\!"

{char} = . ! ? ( ) { } \n \t
"\\d" matches any digit
"\\w" matches any word character
to match one literal backslash you need four backslashes: "\\\\"

Note that the "." expression matches any character except a newline.

RegEx Helpers

"[:digit:]" matches digits
"[:alpha:]" matches letters
"[:lower:]" matches lowercase letters
"[:upper:]" matches uppercase letters
"[:alnum:]" matches letters and numbers
"[:punct:]" matches punctuation
"[:graph:]" matches letters, numbers, punctuation
"[:space:]" matches space characters
"[:blank:]" matches space and tab (not new line)

expressions = c("abc ABC 123\t!?\\(){}\n")

str_extract_all(text7, "[:digit:]")

## [[1]]
##  [1] "4" "0" "0" "3" "5" "0" "7" "0" "8" "0" "1"

str_extract_all(text2, "[:alnum:]")

## [[1]]
##  [1] "Y" "o" "U" "n" "E" "v" "E" "r" "k" "N" "ö" "w"

str_extract_all(text2, "[:lower:]")

## [[1]]
## [1] "o" "n" "v" "r" "k" "ö" "w"

str_extract_all(text6, "[:alpha:]")

## [[1]]
##  [1] "h" "t" "t" "p" "w" "w" "w" "l" "i" "n" "k" "c" "o" "m" "s" "t" "r" "i" "n"
## [20] "g" "s"

Alternates

Custom searches of strings

# match A or B
# str_match(fruit, "apple|pear")
str_count(text3, "0|9")

## [1] 5

# match one of
str_match_all(text3, "[09]")

## [[1]]
##      [,1]
## [1,] "0" 
## [2,] "0" 
## [3,] "9" 
## [4,] "9" 
## [5,] "0"

# match anything but
str_extract_all(text6, "[^http://]")

## [[1]]
##  [1] "w" "w" "w" "." "l" "i" "n" "k" "." "c" "o" "m" "s" "r" "i" "n" "g" "s"

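# match any one character from a set or range (e.g. "[a-z]"); the class below lists
# single characters, and " - " is a range from space to space, not a string range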
str_extract_all(text4, "[Text - strings]")

## [[1]]
##  [1] " " "T" "e" "x" "t" " " "s" "t" "r" "i" "n" "g" "s" " " " " " "

Quantifiers

# match 0 or 1 (the preceding character is optional)
str_match_all(txt, "edit?")

## [[1]]
##      [,1]  
## [1,] "edit"
## [2,] "edit"
## [3,] "edit"

str_match(folder, "2020?")

##      [,1]  
## [1,] "2020"
## [2,] "2020"
## [3,] "2020"
## [4,] "2020"
## [5,] "2020"

# match ZERO or more
str_match(text3, "8*")

##      [,1]
## [1,] ""

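# match exactly n times: the quantifier is written with braces, e.g. "w{2}"
# note: "w[2]" below means "w" followed by the character class [2], so nothing matches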
str_match_all(text6, "w[2]")

## [[1]]
##      [,1]

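# match n or more times: the quantifier is written with braces, e.g. "t{2,}"
# note: "edit[2,]" below means "edit" followed by "2" or ",", so nothing matches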
str_match_all(txt, "edit[2,]")

## [[1]]
##      [,1]

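# match between n and m times: the quantifier is written with braces, e.g. "0{1,3}"
# note: "0[1,3]" below means "0" followed by "1", "," or "3", so nothing matches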
str_match_all(text3, "0[1,3]")

## [[1]]
##      [,1]

Anchors !

This is the specialized part

# match at start of string
str_count(folder, "^results")

## [1] 1 1 1 1 1

# match at end of string
str_detect(folder, ".csv$")

## [1] TRUE TRUE TRUE TRUE TRUE

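The folder vector holds a list of csv file names; to match each name exactly, combine the anchors with quantified character classes. In the pattern, \\d{n} matches exactly n digits (d stands for digit): there are 3 digits after "results-" and 4 digits after the next hyphen. Note that an unescaped . matches any character, so use \\. when you need a literal dot. This is useful when selecting files for a purrr function.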

folder

## [1] "results-001-2020.csv" "results-002-2020.csv" "results-003-2020.csv"
## [4] "results-004-2020.csv" "results-005-2020.csv"

str_extract_all(folder, "^results-\\d{3}-\\d{4}.csv$")

## [[1]]
## [1] "results-001-2020.csv"
## 
## [[2]]
## [1] "results-002-2020.csv"
## 
## [[3]]
## [1] "results-003-2020.csv"
## 
## [[4]]
## [1] "results-004-2020.csv"
## 
## [[5]]
## [1] "results-005-2020.csv"

Look arounds

s ="bacad"

# match followed by a char
str_match(s, "a(?=c)")

##      [,1]
## [1,] "a"

# match not followed by a char
str_extract_all(s, "a(?!c)")

## [[1]]
## [1] "a"

# match not preceded by a char
str_extract_all(s, "(?<!b)a")

## [[1]]
## [1] "a"