Week 10: Code Along 9

Introduction

String Basics

chac_data <- "Im 'very' Hungry"

x <- c("\"", "\\")
x

## [1] "\"" "\\"

writeLines(x)

## "
## \

c("one", "two", "three")

## [1] "one"   "two"   "three"

String Length

str_length(c("I am Hungry."))

## [1] 12

Combining Strings

str_c("I", "am")

## [1] "Iam"

str_c("I", "am", sep = ";")

## [1] "I;am"

x <- c("abc", NA)
str_c("|-", x, "-|")

## [1] "|-abc-|" NA

str_c("|-", str_replace_na(x), "-|")

## [1] "|-abc-|" "|-NA-|"

str_c("prefix-", c("a", "b", "c"), "-suffix")

## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"

name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)

## [1] "Good morning Hadley."

str_c(c("x", "y", "z"), collapse = ", ")

## [1] "x, y, z"

Subsetting Strings

x <- c("apple", "banana", "pear")
str_sub(x, 1, 3)

## [1] "app" "ban" "pea"

str_sub(x, -3, -1)

## [1] "ple" "ana" "ear"

str_sub("a", 1, 5)

## [1] "a"

str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x

## [1] "apple"  "banana" "pear"

Locales

str_to_upper(c("i", "ı"))

## [1] "I" "I"

str_to_upper(c("i", "ı"), locale = "tr")

## [1] "İ" "I"

x <- c("apple", "eggplant", "banana")

str_sort(x, locale = "en")

## [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw")

## [1] "apple"    "eggplant" "banana"

Matching Patterns With Regular Expressions

flights %>% glimpse()

## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

# Keep only the character columns of `flights` and take the first 10 rows as a
# small working sample for the pattern-matching examples.
flights_small <- flights %>%
  select(where(is.character)) %>%
  head(n = 10)

Basic Matches

flights_small %>% filter(str_detect(origin, "M"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

flights_small %>% filter(str_detect(origin, ".M"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

flights_small %>% filter(str_detect(origin, "M."))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

flights_small %>% filter(str_detect(origin, "M\\."))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Anchors

flights_small %>% filter(str_detect(origin, "^E"))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL

flights_small %>% filter(str_detect(origin, "A$"))

## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N24211  LGA    IAH  
## 2 DL      N668DN  LGA    ATL  
## 3 EV      N829AS  LGA    IAD  
## 4 AA      N3ALAA  LGA    ORD

Character Classes and Alternatives

flights_small %>% filter(str_detect(carrier, "\\d"))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 B6      N804JB  JFK    BQN  
## 2 B6      N516JB  EWR    FLL  
## 3 B6      N593JB  JFK    MCO

flights_small %>% filter(str_detect(carrier, "\\s"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

flights_small %>% filter(str_detect(carrier, "[ABD]"))

## # A tibble: 9 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 B6      N804JB  JFK    BQN  
## 5 DL      N668DN  LGA    ATL  
## 6 UA      N39463  EWR    ORD  
## 7 B6      N516JB  EWR    FLL  
## 8 B6      N593JB  JFK    MCO  
## 9 AA      N3ALAA  LGA    ORD

flights_small %>% filter(str_detect(carrier, "[^ABD]"))

## # A tibble: 8 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 B6      N804JB  JFK    BQN  
## 4 DL      N668DN  LGA    ATL  
## 5 UA      N39463  EWR    ORD  
## 6 B6      N516JB  EWR    FLL  
## 7 EV      N829AS  LGA    IAD  
## 8 B6      N593JB  JFK    MCO

Repetition

# ? 0 or 1
flights_small %>% filter(str_detect(carrier, "A?"))

## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD

# + 1 or more
flights_small %>% filter(str_detect(carrier, "A+"))

## # A tibble: 5 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 UA      N39463  EWR    ORD  
## 5 AA      N3ALAA  LGA    ORD

# * 0 or more
flights_small %>% filter(str_detect(carrier, "A*"))

## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD

Grouping and backreferences

# (..)\\1

flights_small %>% filter(str_detect(tailnum, "(\\d\\d)\\1"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

# Same back-reference pattern as above, applied to the character columns of the
# full `flights` table instead of the 10-row sample.
flights %>%
  select(where(is.character)) %>%
  filter(str_detect(tailnum, "(\\d\\d)\\1"))

## # A tibble: 1,990 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 EV      N15555  EWR    MKE  
##  2 EV      N11119  LGA    CLE  
##  3 UA      N14242  EWR    TPA  
##  4 EV      N14143  EWR    PIT  
##  5 EV      N15555  EWR    SAV  
##  6 UA      N12125  EWR    LAX  
##  7 EV      N15555  EWR    PWM  
##  8 EV      N15555  EWR    BUF  
##  9 EV      N15555  EWR    RIC  
## 10 EV      N13133  EWR    DTW  
## # ℹ 1,980 more rows

Tools

Detect matches

flights_small %>% 
    summarise(sum(str_detect(tailnum, "8$")))

## # A tibble: 1 × 1
##   `sum(str_detect(tailnum, "8$"))`
##                              <int>
## 1                                1

str_detect(flights_small$tailnum, "8$")

##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

sum(str_detect(flights_small$tailnum, "8$"))

## [1] 1

mean(str_detect(flights_small$tailnum, "8$"))

## [1] 0.1

Extract matches

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match

## [1] "red|orange|yellow|green|blue|purple"

# extract strings with color
has_colour <- str_subset(sentences, colour_match)
matches <- str_extract(has_colour, colour_match)
head(matches)

## [1] "blue" "blue" "red"  "red"  "red"  "blue"

Grouped matches

# Extract strings with a noun
noun <- "(a|the) ([^ ]+)"
has_noun <- str_subset(sentences, noun) %>% head(10)
has_noun %>% str_extract(noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

has_noun %>% 
    str_match(noun)

##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"

Replacing matches

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")

## [1] "-pple"  "p-ar"   "b-nana"

str_replace_all(x, "[aeiou]", "-")

## [1] "-ppl-"  "p--r"   "b-n-n-"

x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))

## [1] "one house"    "two cars"     "three people"

sentences %>% 
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% 
  head(5)

## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."

# Replace only the leading capital letter of each tail number with "-".
flights_small %>%
  mutate(tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619AA     
##  4 B6      N804JB  JFK    BQN   -804JB     
##  5 DL      N668DN  LGA    ATL   -668DN     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516JB     
##  8 EV      N829AS  LGA    IAD   -829AS     
##  9 B6      N593JB  JFK    MCO   -593JB     
## 10 AA      N3ALAA  LGA    ORD   -3ALAA

# Replace every capital letter in each tail number with "-".
flights_small %>%
  mutate(tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619--     
##  4 B6      N804JB  JFK    BQN   -804--     
##  5 DL      N668DN  LGA    ATL   -668--     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516--     
##  8 EV      N829AS  LGA    IAD   -829--     
##  9 B6      N593JB  JFK    MCO   -593--     
## 10 AA      N3ALAA  LGA    ORD   -3----

Splitting

sentences %>%
  head(5) %>% 
  str_split(" ")

## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."

"a|b|c|d" %>% 
  str_split("\\|") %>% 
  .[[1]]

## [1] "a" "b" "c" "d"

sentences %>%
  head(5) %>% 
  str_split(" ", simplify = TRUE)

##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]     [,8]         
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth" "planks."    
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"   "background."
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"     "a"          
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"      "rare"       
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls." ""           
##      [,9]   
## [1,] ""     
## [2,] ""     
## [3,] "well."
## [4,] "dish."
## [5,] ""

fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)

##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"

x <- "This is a sentence.  This is another sentence."
# str_view_all() was deprecated in stringr 1.5.0 in favour of str_view(), so
# call str_view() directly to avoid the lifecycle warning.
str_view(x, boundary("word"))

## [1] │ <This> <is> <a> <sentence>.  <This> <is> <another> <sentence>.

str_split(x, " ")[[1]]

## [1] "This"      "is"        "a"         "sentence." ""          "This"     
## [7] "is"        "another"   "sentence."

str_split(x, boundary("word"))[[1]]

## [1] "This"     "is"       "a"        "sentence" "This"     "is"       "another" 
## [8] "sentence"

Other types of patterns

flights_small %>% filter(str_detect(tailnum, regex("^n", ignore_case = TRUE)))

## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD

flights_small %>% filter(str_detect(origin, regex("^e", ignore_case = TRUE)))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL

bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")

## [1] │ <banana>

str_view(bananas, regex("banana", ignore_case = TRUE))

## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>

x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")[[1]]

## [1] "Line"

str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]

## [1] "Line" "Line" "Line"

# Phone-number pattern written with comments = TRUE so that each component can
# be annotated on its own line inside the pattern string.
# The final group captures four digits: with three, the last digit of
# "514-791-8141" was silently dropped from the match.
# NOTE(review): with comments = TRUE the literal space inside `[) -]` / `[ -]`
# may be ignored unless escaped — confirm before relying on space separators.
phone <- regex("
  \\(?     # optional opening parens
  (\\d{3}) # area code
  [) -]?   # optional closing parens, space, or dash
  (\\d{3}) # another three numbers
  [ -]?    # optional space or dash
  (\\d{4}) # final four numbers
  ", comments = TRUE)
str_match("514-791-8141", phone)

##      [,1]           [,2]  [,3]  [,4]  
## [1,] "514-791-8141" "514" "791" "8141"

i <- c("I", "İ", "i", "ı")
i

## [1] "I" "İ" "i" "ı"

str_subset(i, coll("i", ignore_case = TRUE))

## [1] "I" "i"

str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))

## [1] "İ" "i"

x <- "This is a sentence."
# Use str_view() rather than the deprecated str_view_all().
str_view(x, boundary("word"))

## [1] │ <This> <is> <a> <sentence>.

str_extract_all(x, boundary("word"))

## [[1]]
## [1] "This"     "is"       "a"        "sentence"

Week 10: Code Along 9

R for Data Science: Chapter 14

Taylor Nelson

2026-06-15

Introduction

String Basics

String Length

Combining Strings

Subsetting Strings

Locales

Matching Patterns With Regular Expressions

Basic Matches

Anchors

Character Classes and Alternatives

Repetition

Grouping and backreferences

Tools

Detect matches

Extract matches

Grouped matches

Replacing matches

Splitting

Other types of patterns