CodeAlong9

Introduction

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

String Basics

c("one", "two", "three")

## [1] "one"   "two"   "three"

#> [1] "one"   "two"   "three"

# Either quote character can delimit a string; wrapping in single quotes lets
# double quotes appear inside without escaping.
string1 <- "This is a string"
string2 <- 'If I want to include a "quote" inside a string, I use single quotes'

# A lone quote character, written by delimiting with the other quote type.
double_quote <- '"' # same value as "\""
single_quote <- "'" # same value as '\''

# Two one-character strings: a double quote and a backslash. Printing shows
# them in escaped form; the strings themselves hold a single character each.
x <- c('"', "\\")
x

## [1] "\"" "\\"

#> [1] "\"" "\\"
writeLines(x)

## "
## \

#> "
#> \

String Length

str_length(c("a", "R for data science", NA))

## [1]  1 18 NA

#> [1]  1 18 NA

Combining Strings

str_c("x", "y")

## [1] "xy"

#> [1] "xy"
str_c("x", "y", "z")

## [1] "xyz"

#> [1] "xyz"

str_c("x", "y", sep = ", ")

## [1] "x, y"

#> [1] "x, y"

str_c(c("x", "y", "z"), collapse = ", ")

## [1] "x, y, z"

#> [1] "x, y, z"

name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

# An `if` without `else` evaluates to NULL when its condition is FALSE, and
# str_c() drops NULL arguments, so the suffix only appears on birthdays.
birthday_suffix <- if (birthday) " and HAPPY BIRTHDAY"
str_c("Good ", time_of_day, " ", name, birthday_suffix, ".")

## [1] "Good morning Hadley."

#> [1] "Good morning Hadley."

Subsetting Strings

x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)

## [1] "App" "Ban" "Pea"

#> [1] "App" "Ban" "Pea"
# negative numbers count backwards from end
str_sub(x, -3, -1)

## [1] "ple" "ana" "ear"

#> [1] "ple" "ana" "ear"

str_sub("a", 1, 5)

## [1] "a"

#> [1] "a"

# Assignment form of str_sub(): overwrite the first character of every
# element with its lower-case version.
first_chars <- str_sub(x, 1, 1)
str_sub(x, 1, 1) <- str_to_lower(first_chars)
x

## [1] "apple"  "banana" "pear"

#> [1] "apple"  "banana" "pear"

Locales

# Turkish has two i's: with and without a dot, and it
# has a different rule for capitalising them:
str_to_upper(c("i", "ı"))

## [1] "I" "I"

#> [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")

## [1] "İ" "I"

#> [1] "İ" "I"

x <- c("apple", "eggplant", "banana")

str_sort(x, locale = "en")  # English

## [1] "apple"    "banana"   "eggplant"

#> [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") # Hawaiian

## [1] "apple"    "eggplant" "banana"

#> [1] "apple"    "eggplant" "banana"

Matching Patterns with Regular Expressions

x <- c("apple", "banana", "pear")
str_view(x, "an")

## [2] │ b<an><an>a

#> [2] │ b<an><an>a

str_view(x, ".a.")

## [2] │ <ban>ana
## [3] │ p<ear>

#> [2] │ <ban>ana
#> [3] │ p<ear>

# In R source a backslash must itself be escaped, so the regular expression
# \. (a literal dot) has to be typed as the string "\\.".
dot <- "\\."

# The string itself holds just one backslash followed by a dot:
writeLines(dot)

## \.

#> \.

# And this tells R to look for an explicit .
str_view(c("abc", "a.c", "bef"), "a\\.c")

## [2] │ <a.c>

#> [2] │ <a.c>

x <- "a\\b"
writeLines(x)

## a\b

#> a\b

str_view(x, "\\\\")

## [1] │ a<\>b

#> [1] │ a<\>b

Anchors

x <- c("apple", "banana", "pear")
str_view(x, "^a")

## [1] │ <a>pple

#> [1] │ <a>pple
str_view(x, "a$")

## [2] │ banan<a>

#> [2] │ banan<a>

x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")

## [1] │ <apple> pie
## [2] │ <apple>
## [3] │ <apple> cake

#> [1] │ <apple> pie
#> [2] │ <apple>
#> [3] │ <apple> cake
str_view(x, "^apple$")

## [2] │ <apple>

#> [2] │ <apple>

Character classes and alternatives

# Look for a literal character that normally has special meaning in a regex
str_view(c("abc", "a.c", "a*c", "a c"), "a[.]c")

## [2] │ <a.c>

#> [2] │ <a.c>
str_view(c("abc", "a.c", "a*c", "a c"), ".[*]c")

## [3] │ <a*c>

#> [3] │ <a*c>
str_view(c("abc", "a.c", "a*c", "a c"), "a[ ]")

## [4] │ <a >c

#> [4] │ <a >c

str_view(c("grey", "gray"), "gr(e|a)y")

## [1] │ <grey>
## [2] │ <gray>

#> [1] │ <grey>
#> [2] │ <gray>

Repetition

x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")

## [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII

#> [1] │ 1888 is the longest year in Roman numerals: MD<CC><C>LXXXVIII
str_view(x, "CC+")

## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII

#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, 'C[LX]+')

## [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII

#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CLXXX>VIII

str_view(x, "C{2}")

## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII

#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, "C{2,}")

## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII

#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII
str_view(x, "C{2,3}")

## [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII

#> [1] │ 1888 is the longest year in Roman numerals: MD<CCC>LXXXVIII

str_view(x, 'C{2,3}?')

## [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII

#> [1] │ 1888 is the longest year in Roman numerals: MD<CC>CLXXXVIII
str_view(x, 'C[LX]+?')

## [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII

#> [1] │ 1888 is the longest year in Roman numerals: MDCC<CL>XXXVIII

Grouping and Backreferencing

# (..) captures any two characters and \\1 requires that same pair to follow
# immediately, so this highlights a repeated pair such as "anan" in "banana".
# match = TRUE limits the display to the elements that contain a match.
str_view(fruit, "(..)\\1", match = TRUE)

##  [4] │ b<anan>a
## [20] │ <coco>nut
## [22] │ <cucu>mber
## [41] │ <juju>be
## [56] │ <papa>ya
## [73] │ s<alal> berry

#>  [4] │ b<anan>a
#> [20] │ <coco>nut
#> [22] │ <cucu>mber
#> [41] │ <juju>be
#> [56] │ <papa>ya
#> [73] │ s<alal> berry

Tools

Detect matches

x <- c("apple", "banana", "pear")
str_detect(x, "e")

## [1]  TRUE FALSE  TRUE

#> [1]  TRUE FALSE  TRUE

# Count the words that start with "t": summing a logical vector counts its
# TRUE values.
sum(str_detect(words, "^t"))

## [1] 65

#> [1] 65

# Proportion of words that end in a vowel: the mean of a logical vector is
# the fraction of TRUE values.
mean(str_detect(words, "[aeiou]$"))

## [1] 0.2765306

#> [1] 0.2765306

words[str_detect(words, "x$")]

## [1] "box" "sex" "six" "tax"

#> [1] "box" "sex" "six" "tax"
str_subset(words, "x$")

## [1] "box" "sex" "six" "tax"

#> [1] "box" "sex" "six" "tax"

# Matches never overlap: "aba" could start at positions 1, 3 and 5 of
# "abababa", but only two non-overlapping occurrences are counted.
str_count("abababa", "aba")

## [1] 2

#> [1] 2
# str_view() highlights every match in each string. It replaces
# str_view_all(), which has been deprecated since stringr 1.5.0.
str_view("abababa", "aba")

## [1] │ <aba>b<aba>

#> [1] │ <aba>b<aba>

Extract matches

# Join the colour names with "|" so the result is a single regular expression
# that matches any one of them (regex alternation).
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match

## [1] "red|orange|yellow|green|blue|purple"

#> [1] "red|orange|yellow|green|blue|purple"

# Keep only the sentences that mention a colour, then pull out the first
# colour named in each one (str_extract() returns the first match only).
mentions_colour <- str_detect(sentences, colour_match)
has_colour <- sentences[mentions_colour]
matches <- str_extract(has_colour, colour_match)
head(matches)

## [1] "blue" "blue" "red"  "red"  "red"  "blue"

#> [1] "blue" "blue" "red"  "red"  "red"  "blue"

# Sentences that mention more than one colour.
more <- sentences[str_count(sentences, colour_match) > 1]
# str_view() marks every match in each string; it replaces str_view_all(),
# which has been deprecated since stringr 1.5.0.
str_view(more, colour_match)

## [1] │ It is hard to erase <blue> or <red> ink.
## [2] │ The <green> light in the brown box flicke<red>.
## [3] │ The sky in the west is tinged with <orange> <red>.

#> [1] │ It is hard to erase <blue> or <red> ink.
#> [2] │ The <green> light in the brown box flicke<red>.
#> [3] │ The sky in the west is tinged with <orange> <red>.

str_extract(more, colour_match)

## [1] "blue"   "green"  "orange"

#> [1] "blue"   "green"  "orange"

str_extract_all(more, colour_match)

## [[1]]
## [1] "blue" "red" 
## 
## [[2]]
## [1] "green" "red"  
## 
## [[3]]
## [1] "orange" "red"

#> [[1]]
#> [1] "blue" "red" 
#> 
#> [[2]]
#> [1] "green" "red"  
#> 
#> [[3]]
#> [1] "orange" "red"

str_extract_all(more, colour_match, simplify = TRUE)

##      [,1]     [,2] 
## [1,] "blue"   "red"
## [2,] "green"  "red"
## [3,] "orange" "red"

#>      [,1]     [,2] 
#> [1,] "blue"   "red"
#> [2,] "green"  "red"
#> [3,] "orange" "red"

x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)

##      [,1] [,2] [,3]
## [1,] "a"  ""   ""  
## [2,] "a"  "b"  ""  
## [3,] "a"  "b"  "c"

#>      [,1] [,2] [,3]
#> [1,] "a"  ""   ""  
#> [2,] "a"  "b"  ""  
#> [3,] "a"  "b"  "c"

Grouped matches

# Rough heuristic for a noun: an article ("a" or "the"), one space, then a
# run of non-space characters. Group 1 is the article, group 2 the word.
noun <- "(a|the) ([^ ]+)"

# First ten sentences that contain the pattern.
has_noun <- head(str_subset(sentences, noun), 10)

# Whole match for each of those sentences.
str_extract(has_noun, noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

#>  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
#>  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

# str_match() returns a character matrix: column 1 is the full match and each
# later column holds one capture group.
str_match(has_noun, noun)

##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"

#>       [,1]         [,2]  [,3]     
#>  [1,] "the smooth" "the" "smooth" 
#>  [2,] "the sheet"  "the" "sheet"  
#>  [3,] "the depth"  "the" "depth"  
#>  [4,] "a chicken"  "a"   "chicken"
#>  [5,] "the parked" "the" "parked" 
#>  [6,] "the sun"    "the" "sun"    
#>  [7,] "the huge"   "the" "huge"   
#>  [8,] "the ball"   "the" "ball"   
#>  [9,] "the woman"  "the" "woman"  
#> [10,] "a helps"    "a"   "helps"

# Data-frame version of the same extraction: each capture group becomes a new
# column, and remove = FALSE keeps the original sentence column alongside.
sentence_tbl <- tibble(sentence = sentences)
tidyr::extract(
  sentence_tbl,
  col = sentence,
  into = c("article", "noun"),
  regex = "(a|the) ([^ ]+)",
  remove = FALSE
)

## # A tibble: 720 × 3
##    sentence                                    article noun   
##    <chr>                                       <chr>   <chr>  
##  1 The birch canoe slid on the smooth planks.  the     smooth 
##  2 Glue the sheet to the dark blue background. the     sheet  
##  3 It's easy to tell the depth of a well.      the     depth  
##  4 These days a chicken leg is a rare dish.    a       chicken
##  5 Rice is often served in round bowls.        <NA>    <NA>   
##  6 The juice of lemons makes fine punch.       <NA>    <NA>   
##  7 The box was thrown beside the parked truck. the     parked 
##  8 The hogs were fed chopped corn and garbage. <NA>    <NA>   
##  9 Four hours of steady work faced us.         <NA>    <NA>   
## 10 A large size in stockings is hard to sell.  <NA>    <NA>   
## # ℹ 710 more rows

#> # A tibble: 720 × 3
#>   sentence                                    article noun   
#>   <chr>                                       <chr>   <chr>  
#> 1 The birch canoe slid on the smooth planks.  the     smooth 
#> 2 Glue the sheet to the dark blue background. the     sheet  
#> 3 It's easy to tell the depth of a well.      the     depth  
#> 4 These days a chicken leg is a rare dish.    a       chicken
#> 5 Rice is often served in round bowls.        <NA>    <NA>   
#> 6 The juice of lemons makes fine punch.       <NA>    <NA>   
#> # ℹ 714 more rows

Replacing matches

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")

## [1] "-pple"  "p-ar"   "b-nana"

#> [1] "-pple"  "p-ar"   "b-nana"
str_replace_all(x, "[aeiou]", "-")

## [1] "-ppl-"  "p--r"   "b-n-n-"

#> [1] "-ppl-"  "p--r"   "b-n-n-"

x <- c("1 house", "2 cars", "3 people")
# A named vector performs several replacements at once: each name is a
# pattern and each value is its replacement.
digit_words <- c("1" = "one", "2" = "two", "3" = "three")
str_replace_all(x, digit_words)

## [1] "one house"    "two cars"     "three people"

#> [1] "one house"    "two cars"     "three people"

# The three groups capture the first three space-delimited words; the
# replacement writes them back in the order 1, 3, 2, swapping words two and
# three of every sentence.
swapped <- str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2")
head(swapped, 5)

## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."

#> [1] "The canoe birch slid on the smooth planks." 
#> [2] "Glue sheet the to the dark blue background."
#> [3] "It's to easy tell the depth of a well."     
#> [4] "These a days chicken leg is a rare dish."   
#> [5] "Rice often is served in round bowls."

Splitting

"a|b|c|d" %>% 
  str_split("\\|") %>% 
  .[[1]]

## [1] "a" "b" "c" "d"

#> [1] "a" "b" "c" "d"

# Split each of the first five sentences on single spaces. simplify = TRUE
# returns a character matrix, padding the shorter rows with "".
first_five <- head(sentences, 5)
str_split(first_five, " ", simplify = TRUE)

##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
##      [,9]   
## [1,] ""     
## [2,] ""     
## [3,] "well."
## [4,] "dish."
## [5,] ""

#>      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
#> [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
#> [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
#> [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
#> [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
#> [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
#>      [,9]   
#> [1,] ""     
#> [2,] ""     
#> [3,] "well."
#> [4,] "dish."
#> [5,] ""

Other Types of Pattern

# The regular call:
str_view(fruit, "nana")

## [4] │ ba<nana>

# Is shorthand for
str_view(fruit, regex("nana"))

## [4] │ ba<nana>

bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")

## [1] │ <banana>

#> [1] │ <banana>
str_view(bananas, regex("banana", ignore_case = TRUE))

## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>

#> [1] │ <banana>
#> [2] │ <Banana>
#> [3] │ <BANANA>

# NOTE(review): this example and its output repeat the one immediately above
# verbatim; consider removing one copy when the document is next re-knitted.
# A plain string pattern is case-sensitive, so only "banana" matches here.
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")

## [1] │ <banana>

#> [1] │ <banana>
str_view(bananas, regex("banana", ignore_case = TRUE))

## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>

#> [1] │ <banana>
#> [2] │ <Banana>
#> [3] │ <BANANA>

x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")[[1]]

## [1] "Line"

#> [1] "Line"
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]

## [1] "Line" "Line" "Line"

#> [1] "Line" "Line" "Line"

# Pattern for a ten-digit phone number such as "514-791-8141". With
# comments = TRUE, whitespace and #-comments inside the pattern are ignored,
# so each piece can be annotated in place. The last group takes four digits;
# with only three, the match is truncated to "514-791-814".
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # final four numbers
  ", comments = TRUE)

str_match("514-791-8141", phone)

##      [,1]           [,2]  [,3]  [,4]  
## [1,] "514-791-8141" "514" "791" "8141"

#>      [,1]           [,2]  [,3]  [,4]  
#> [1,] "514-791-8141" "514" "791" "8141"

# Case-insensitive matching depends on the locale: Turkish has dotted and
# dotless forms of "i", so which letters count as the same letter differs
# between the default locale and "tr".
i <- c("I", "İ", "i", "ı")
i

## [1] "I" "İ" "i" "ı"

#> [1] "I" "İ" "i" "ı"

str_subset(i, coll("i", ignore_case = TRUE))

## [1] "I" "i"

#> [1] "I" "i"
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))

## [1] "İ" "i"

#> [1] "İ" "i"

x <- "This is a sentence."
# str_view() replaces str_view_all(), which has been deprecated since
# stringr 1.5.0. match = NA displays every element regardless of whether it
# matches, so the output is not filtered.
str_view(x, boundary("word"), match = NA)

## [1] │ <This> <is> <a> <sentence>.

#> [1] │ <This> <is> <a> <sentence>.
str_extract_all(x, boundary("word"))

## [[1]]
## [1] "This"     "is"       "a"        "sentence"

#> [[1]]
#> [1] "This"     "is"       "a"        "sentence"

Other

apropos("replace")

## [1] "%+replace%"       "replace"          "replace_na"       "setReplaceMethod"
## [5] "str_replace"      "str_replace_all"  "str_replace_na"   "theme_replace"

#> [1] "%+replace%"       "replace"          "replace_na"       "setReplaceMethod"
#> [5] "str_replace"      "str_replace_all"  "str_replace_na"   "theme_replace"

head(dir(pattern = "\\.Rmd$"))

## [1] "CodeAlong9.Rmd"

#> [1] "communicate-plots.Rmd" "communicate.Rmd"       "datetimes.Rmd"        
#> [4] "EDA.Rmd"               "explore.Rmd"           "factors.Rmd"

CodeAlong9

Josh Crosswhite

2025-04-02

Introduction

String Basics

String Length

Combining Strings

Subsetting Strings

Locales

Matching Patterns with Regular Expressions

Anchors

Character classes and alternatives

Repetition

Grouping and Backreferencing

Tools

Detect matches

Extract matches

Grouped matches

Replacing matches

Splitting

Other Types of Pattern

Other