Text data are strings, and the R packages stringr and stringi are libraries that help deal with string data as part of data analysis.
Note: the text string color is red and is not customizable; I tried to use another color while in dark mode.
library(stringi)
library(stringr)
library(tidyverse)
# stringr ships with example character vectors such as `fruit` and `sentences`
fruit <- stringr::fruit
head(fruit)
## [1] "apple" "apricot" "avocado" "banana" "bell pepper"
## [6] "bilberry"
head(stringr::sentences)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
Messy text
# Deliberately messy example strings reused by the examples that follow
txt <- "YOU CAN HAVE ALL CAPS DATA edit edit edit"
text2 <- "// YoU nEvEr kNöw //"
text3 <- "2-009.89.0."
text4 <- "<p> ...Text strings </p> "
text5 <- "|| * * * <br> \n didyouseeThis?! {stringr} z = 6 *9"
text6 <- "http://www.link.com/strings"
text7 <- "bill total $400,35 [07/08/01]"
# File names shaped like results-<3 digits>-<4 digits>.csv
folder <- c(
  "results-001-2020.csv",
  "results-002-2020.csv",
  "results-003-2020.csv",
  "results-004-2020.csv",
  "results-005-2020.csv"
)
Will use fruits data and text strings.
Detection methods return Boolean array or index location
str_detect(string, pattern)
str_detect(fruit, pattern = "a")
str_detect(txt, pattern = "<h1>")
str_starts(fruit, pattern = "ap")
str_starts(text2, pattern = "//")
## [1] TRUE
More detection functions:
str_which() finds the indexes of strings that contain a pattern match
str_locate() locates the positions of a pattern match
str_locate_all(fruit, pattern = "an")
str_count() counts the number of matches in a string
str_count(fruit, pattern = "an")
str_which(fruit, pattern = "ma")
## [1] 48 49 66 74 77
str_sub(string, start= , end= ) extracts substrings from a character vector
str_subset(string, pattern= ) returns only the strings that contain a pattern match
str_extract() / str_extract_all() returns the 1st match in each string as a vector / returns all matches
str_match() / str_match_all() returns the 1st pattern match as a matrix / returns all matches
str_sub(text2, start = 4, end = 17)
## [1] "YoU nEvEr kNöw"
# keep only the fruits whose name contains "bl"
str_subset(fruit, "bl")
## [1] "blackberry" "blackcurrant" "blood orange" "blueberry"
# pull every occurrence of "edit" out of txt (the result is a list)
str_extract_all(txt, pattern = "edit")
## [[1]]
## [1] "edit" "edit" "edit"
str_length(string) returns the length (width) of entire strings
str_pad(string, width= , side= c('left','right','both')) adds string padding
str_trunc(string, width= , side= ) truncates a string
str_squish(string) trims whitespace from each end and collapses repeated internal whitespace
str_length(txt)
## [1] 41
# left-pad the 1st element of fruit to a total width of 15 characters
str_pad(fruit[1], width = 15, side = "left")
## [1] "          apple"
sentences[1]
## [1] "The birch canoe slid on the smooth planks."
# shorten to 17 characters in total, the trailing "..." included
str_trunc(sentences[1], width = 17, side = "right")
## [1] "The birch cano..."
# the sentence has no surplus whitespace, so it comes back unchanged
str_squish(sentences[1])
## [1] "The birch canoe slid on the smooth planks."
str_replace(string, pattern= , replacement= ) / str_replace_all() replaces the 1st match / all matches
str_to_lower() converts strings to lowercase
str_to_upper() converts strings to uppercase
str_to_title() converts strings to title case
str_replace(text2, pattern = "nEvEr", replacement = "always")
## [1] "// YoU always kNöw //"
# lowercase, using English casing rules
str_to_lower(string = txt, locale = "en")
## [1] "you can have all caps data edit edit edit"
# uppercase
str_to_upper(string = txt)
## [1] "YOU CAN HAVE ALL CAPS DATA EDIT EDIT EDIT"
# title case: first letter of every word capitalised
str_to_title(string = txt)
## [1] "You Can Have All Caps Data Edit Edit Edit"
str_c(string1, string2, sep= ) joins multiple strings into a single string
str_flatten(string, collapse= ) combines a character vector into a single string
str_dup(string, times= n) repeats strings n times
str_split_fixed(string, pattern= , n= ) / str_split() / str_split_() split a string into pieces
str_glue() creates a string from strings and {expressions}
str_glue_data() uses a data frame / list to create a string from strings and {expressions}
str_c(letters[1:5], LETTERS[1:5], sep = "^")
## [1] "a^A" "b^B" "c^C" "d^D" "e^E"
# join two strings with a custom separator
str_c(txt, text4, sep = " :: ")
## [1] "YOU CAN HAVE ALL CAPS DATA edit edit edit :: <p> ...Text strings </p> "
# collapse a character vector into one string
str_flatten(fruit[1:5], collapse = " % ")
## [1] "apple % apricot % avocado % banana % bell pepper"
# repeat a string three times
str_dup(text3, times = 3)
## [1] "2-009.89.0.2-009.89.0.2-009.89.0."
# split on "YOU" into a matrix with exactly 3 columns (unused columns are "")
str_split_fixed(txt, pattern = "YOU", n = 3)
## [,1] [,2] [,3]
## [1,] "" " CAN HAVE ALL CAPS DATA edit edit edit" ""
# {expressions} inside the template are evaluated and interpolated
str_glue("Pi is {pi}")
## Pi is 3.14159265358979
str_glue(text2, text7)
## // YoU nEvEr kNöw //bill total $400,35 [07/08/01]
# starwars data is from dplyr
str_glue_data(head(starwars), "starwars has {head(name)}")
## starwars has Luke Skywalker
## starwars has C-3PO
## starwars has R2-D2
## starwars has Darth Vader
## starwars has Leia Organa
## starwars has Owen Lars
# spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned
str_order(letters[1:10], decreasing = FALSE, numeric = TRUE)
## [1] 1 2 3 4 5 6 7 8 9 10
str_sort(letters[10:1])
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j"
str_view_all(fruit, pattern = "berry") returns a page
with the matches highlighted.
# re-interpret the strings as if they were encoded in ISO-8859-1
str_conv(
  fruit[1:4],
  encoding = "ISO-8859-1"
)
## [1] "apple" "apricot" "avocado" "banana"
# wrap at width 5: first line indented by 2 spaces, later lines by 3
str_wrap(
  txt,
  width = 5,
  indent = 2,
  exdent = 3
)
## [1] "  YOU\n   CAN\n   HAVE\n   ALL\n   CAPS\n   DATA\n   edit\n   edit\n   edit"
# every letter is its own string, so each one receives the 7-space indent
str_wrap(
  letters,
  width = 5,
  indent = 7
)
##  [1] "       a" "       b" "       c" "       d" "       e" "       f"
##  [7] "       g" "       h" "       i" "       j" "       k" "       l"
## [13] "       m" "       n" "       o" "       p" "       q" "       r"
## [19] "       s" "       t" "       u" "       v" "       w" "       x"
## [25] "       y" "       z"
This section is need-to-know:
Run ?"'" in the RStudio console for the full list of escape sequences.
\\ = \
\" = "
\n = new line
Use writeLines("\\.") to see how an escaped string is viewed.
To match a specific character, use "\\{char}", for example "\\!".
{ }
\n
\t
\d (any digit)
\w (any word character)
\\\\
Note that the ‘.’ expression matches every character except a new line.
"[:digit:]" matches digits
"[:alpha:]" matches letters
"[:lower:]" matches lowercase letters
"[:upper:]" matches uppercase letters
"[:alnum:]" matches letters and numbers
"[:punct:]" matches punctuation
"[:graph:]" matches letters, numbers, punctuation
"[:space:]" matches space characters
"[:blank:]" matches space and tab (not new line)
expressions = c("abc ABC 123\t!?\\(){}\n")
# every digit in text7, one character per match
str_extract_all(text7, pattern = "[:digit:]")
## [[1]]
## [1] "4" "0" "0" "3" "5" "0" "7" "0" "8" "0" "1"
# letters and numbers only: slashes and spaces are dropped
str_extract_all(text2, pattern = "[:alnum:]")
## [[1]]
## [1] "Y" "o" "U" "n" "E" "v" "E" "r" "k" "N" "ö" "w"
# lowercase letters only
str_extract_all(text2, pattern = "[:lower:]")
## [[1]]
## [1] "o" "n" "v" "r" "k" "ö" "w"
# letters only: the punctuation of the URL is dropped
str_extract_all(text6, pattern = "[:alpha:]")
## [[1]]
## [1] "h" "t" "t" "p" "w" "w" "w" "l" "i" "n" "k" "c" "o" "m" "s" "t" "r" "i" "n"
## [20] "g" "s"
Custom searches of strings
# match A or B (alternation with |)
# str_match(fruit, "apple|pear")
# count every character of text3 that is either "0" or "9"
str_count(text3, "0|9")
## [1] 5
# match one of the characters listed inside [ ]
str_match_all(text3, "[09]")
## [[1]]
## [,1]
## [1,] "0"
## [2,] "0"
## [3,] "9"
## [4,] "9"
## [5,] "0"
# match any single character except those listed after ^
# (a character class: h, t, p, ":" and "/" are each excluded individually,
# it does not mean "anything but the literal text http://")
str_extract_all(text6, "[^http://]")
## [[1]]
## [1] "w" "w" "w" "." "l" "i" "n" "k" "." "c" "o" "m" "s" "r" "i" "n" "g" "s"
# match any single character from a class; this is NOT a range of strings:
# each listed letter matches on its own, and " - " inside [ ] is read as a
# range from space to space, which is why the spaces are matched as well
str_extract_all(text4, "[Text - strings]")
## [[1]]
## [1] " " "T" "e" "x" "t" " " "s" "t" "r" "i" "n" "g" "s" " " " " " "
# match zero or one: ? makes only the preceding character optional
# ("edit?" matches "edi" or "edit")
str_match_all(txt, "edit?")
## [[1]]
## [,1]
## [1,] "edit"
## [2,] "edit"
## [3,] "edit"
# same quantifier on the file names: the last "0" of "2020" is optional
str_match(folder, "2020?")
## [,1]
## [1,] "2020"
## [2,] "2020"
## [3,] "2020"
## [4,] "2020"
## [5,] "2020"
# match ZERO or more: "8*" is also satisfied by an empty match, and
# str_match() returns the first match, which is the empty string found at
# the very start of text3
str_match(text3, "8*")
## [,1]
## [1,] ""
# match exactly n times: a quantifier is written in braces, {n}
# (square brackets define a character class, so "w[2]" would look for "w2")
str_match_all(text6, "w{2}")
## [[1]]
##      [,1]
## [1,] "ww"
# match n or more times: the non-capturing group (?: ) makes the quantifier
# apply to the whole word "edit" (plus an optional space), not only to "t"
str_match_all(txt, "(?:edit ?){2,}")
## [[1]]
##      [,1]
## [1,] "edit edit edit"
# match between n and m times: runs of one to three "0"
str_match_all(text3, "0{1,3}")
## [[1]]
##      [,1]
## [1,] "00"
## [2,] "0"
This is the specialized part
# match at start of string: ^ anchors the pattern to the beginning
str_count(folder, "^results")
## [1] 1 1 1 1 1
# match at end of string: $ anchors the pattern to the end.
# The dot is escaped ("\\.") so it matches a literal "."; an unescaped "."
# matches any character and would also accept names such as "results_csv"
str_detect(folder, "\\.csv$")
## [1] TRUE TRUE TRUE TRUE TRUE
The folder vector holds a list of csv file names; to extract each of them
properly you need a precise pattern. In the pattern,
\\d{n} means n digits (d is for digit): there are 3 digits after results
and 4 digits after that. This is useful when running a
purrr function.
folder
## [1] "results-001-2020.csv" "results-002-2020.csv" "results-003-2020.csv"
## [4] "results-004-2020.csv" "results-005-2020.csv"
# Whole-string match: "results-", 3 digits, "-", 4 digits, ".csv".
# The dot before csv is escaped ("\\.") so that only a literal "." is
# accepted; unescaped it would match any single character.
str_extract_all(folder, "^results-\\d{3}-\\d{4}\\.csv$")
## [[1]]
## [1] "results-001-2020.csv"
##
## [[2]]
## [1] "results-002-2020.csv"
##
## [[3]]
## [1] "results-003-2020.csv"
##
## [[4]]
## [1] "results-004-2020.csv"
##
## [[5]]
## [1] "results-005-2020.csv"
# Look-arounds check the neighbouring characters without including them in
# the match, so every result below is just "a".
s <- "bacad"
# match followed by a char: the "a" that is directly before "c"
str_match(s, "a(?=c)")
## [,1]
## [1,] "a"
# match not followed by a char: the "a" that is NOT directly before "c"
str_extract_all(s, "a(?!c)")
## [[1]]
## [1] "a"
# match preceded by a char: the "a" that comes directly after "b"
str_extract_all(s, "(?<=b)a")
## [[1]]
## [1] "a"
# match not preceded by a char: the "a" that does NOT come directly after "b"
str_extract_all(s, "(?<!b)a")
## [[1]]
## [1] "a"