Introduction

String basics

# A string that itself contains single quotes is delimited with double quotes.
chac_data <- "i'm 'very' hungry."

# Number of characters in the string.
stringr::str_length("i am hungry.")
## [1] 12
# `collapse` joins the elements of a single vector into one string.
stringr::str_c(c("I", "am"), collapse = "")
## [1] "Iam"
# `sep` is inserted between the separate arguments.
stringr::str_c("I", "am", sep = ";")
## [1] "I;am"
# Namespaced like the calls above so it does not rely on stringr being attached.
stringr::str_sort(c("John", "Mary", "Aaron"))
## [1] "Aaron" "John"  "Mary"

Matching patterns with regular expressions

# Column-by-column overview of the flights table.
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Small demo table: only the character columns, first ten rows.
flights_small <- flights %>%
  select(where(is.character)) %>%
  head(n = 10)

Basic matches

# None of the origins in flights_small contains an "M", so every filter in
# this section returns zero rows.

# "M": a literal M anywhere in the string.
flights_small %>%
  filter(str_detect(origin, "M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# ".M": any single character followed by M.
flights_small %>%
  filter(str_detect(origin, ".M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "M.": M followed by any single character.
flights_small %>%
  filter(str_detect(origin, "M."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "M\\.": M followed by a literal dot (the dot is escaped).
flights_small %>%
  filter(str_detect(origin, "M\\."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Anchors

# `$` anchors the match to the end of the string: origins ending in "E".
flights_small %>%
  filter(str_detect(origin, "E$"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Character classes and alternatives

# "\\d": carriers whose code contains a digit.
flights_small %>%
  filter(str_detect(carrier, "\\d"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 B6      N804JB  JFK    BQN  
## 2 B6      N516JB  EWR    FLL  
## 3 B6      N593JB  JFK    MCO
# "\\s": carriers whose code contains whitespace.
flights_small %>%
  filter(str_detect(carrier, "\\s"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "[ABD]": carriers containing at least one of A, B or D.
flights_small %>%
  filter(str_detect(carrier, "[ABD]"))
## # A tibble: 9 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 B6      N804JB  JFK    BQN  
## 5 DL      N668DN  LGA    ATL  
## 6 UA      N39463  EWR    ORD  
## 7 B6      N516JB  EWR    FLL  
## 8 B6      N593JB  JFK    MCO  
## 9 AA      N3ALAA  LGA    ORD
# "[^ABD]": carriers containing at least one character other than A, B or D
# (a code made up only of those letters, such as "AA", is dropped).
flights_small %>%
  filter(str_detect(carrier, "[^ABD]"))
## # A tibble: 8 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 B6      N804JB  JFK    BQN  
## 4 DL      N668DN  LGA    ATL  
## 5 UA      N39463  EWR    ORD  
## 6 B6      N516JB  EWR    FLL  
## 7 EV      N829AS  LGA    IAD  
## 8 B6      N593JB  JFK    MCO

Repetition

# ? : 0 or 1. An optional "A" can match the empty string, so every row is kept.
flights_small %>% filter(str_detect(carrier, "A?"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD
# + : 1 or more. Only carriers that contain at least one "A" are kept.
flights_small %>% filter(str_detect(carrier, "A+"))
## # A tibble: 5 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 UA      N39463  EWR    ORD  
## 5 AA      N3ALAA  LGA    ORD
# * : 0 or more. Like `?`, this can match the empty string, so every row is kept.
flights_small %>% filter(str_detect(carrier, "A*"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 UA      N14228  EWR    IAH  
##  2 UA      N24211  LGA    IAH  
##  3 AA      N619AA  JFK    MIA  
##  4 B6      N804JB  JFK    BQN  
##  5 DL      N668DN  LGA    ATL  
##  6 UA      N39463  EWR    ORD  
##  7 B6      N516JB  EWR    FLL  
##  8 EV      N829AS  LGA    IAD  
##  9 B6      N593JB  JFK    MCO  
## 10 AA      N3ALAA  LGA    ORD

Grouping and Backreferences

# (\\d{2})\\1 : a two-digit group immediately followed by the same two digits
# again (\\1 refers back to whatever the first group matched), e.g. "1212".
# flights_small already holds only character columns, so no select() is needed.
flights_small %>%
  filter(str_detect(tailnum, "(\\d{2})\\1"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Tools

Detect matches

# Count the tail numbers ending in "8" (each TRUE counts as 1 when summed).
summarise(flights_small, sum(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
##   `sum(str_detect(tailnum, "8$"))`
##                              <int>
## 1                                1
# One logical per row: does the tail number end in "8"?
str_detect(flights_small[["tailnum"]], "8$")
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Summing the logicals gives the number of matches ...
sum(str_detect(flights_small[["tailnum"]], "8$"))
## [1] 1
# ... and averaging them gives the proportion of matches.
mean(str_detect(flights_small[["tailnum"]], "8$"))
## [1] 0.1

Extract matches

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
# Join the colour names with "|" so the regex matches any one of them.
colour_match <- str_c(colours, collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
# Extract the sentences that contain a colour name. The pattern has no word
# boundaries, so a name inside a longer word also counts (e.g. "red" in
# "reared").
has_colour <- str_subset(sentences, colour_match)
# Print the stored result instead of running the same search a second time.
has_colour
##  [1] "Glue the sheet to the dark blue background."       
##  [2] "Two blue fish swam in the tank."                   
##  [3] "The colt reared and threw the tall rider."         
##  [4] "The wide road shimmered in the hot sun."           
##  [5] "See the cat glaring at the scared mouse."          
##  [6] "A wisp of cloud hung in the blue air."             
##  [7] "Leaves turn brown and yellow in the fall."         
##  [8] "He ordered peach pie with ice cream."              
##  [9] "Pure bred poodles have curls."                     
## [10] "The spot on the blotter was made by green ink."    
## [11] "Mud was spattered on the front of his white shirt."
## [12] "The sofa cushion is red and of light weight."      
## [13] "The sky that morning was clear and bright blue."   
## [14] "Torn scraps littered the stone floor."             
## [15] "The doctor cured him with these pills."            
## [16] "The new girl was fired today at noon."             
## [17] "The third act was dull and tired the players."     
## [18] "A blue crane is a tall wading bird."               
## [19] "Live wires should be kept covered."                
## [20] "It is hard to erase blue or red ink."              
## [21] "The wreck occurred by the bank on Main Street."    
## [22] "The lamp shone with a steady green flame."         
## [23] "The box is held by a bright red snapper."          
## [24] "The prince ordered his head chopped off."          
## [25] "The houses are built of red clay bricks."          
## [26] "The red tape bound the smuggled food."             
## [27] "Nine men were hired to dig the ruins."             
## [28] "The flint sputtered and lit a pine torch."         
## [29] "Hedge apples may stain your hands green."          
## [30] "The old pan was covered with hard fudge."          
## [31] "The plant grew large and green in the window."     
## [32] "The store walls were lined with colored frocks."   
## [33] "The purple tie was ten years old."                 
## [34] "Bathe and relax in the cool green grass."          
## [35] "The clan gathered on each dull night."             
## [36] "The lake sparkled in the red hot sun."             
## [37] "Mark the spot with a sign painted red."            
## [38] "Smoke poured out of every crack."                  
## [39] "Serve the hot rum to the tired heroes."            
## [40] "The couch cover and hall drapes were blue."        
## [41] "He offered proof in the form of a large chart."    
## [42] "A man in a blue sweater sat at the desk."          
## [43] "A sip of tea revives his tired friend."            
## [44] "The door was barred, locked, and bolted as well."  
## [45] "A thick coat of black paint covered all."          
## [46] "The small red neon lamp went out."                 
## [47] "Paint the sockets in the wall dull green."         
## [48] "Wake and rise, and step into the green outdoors."  
## [49] "The green light in the brown box flickered."       
## [50] "He put his last cartridge into the gun and fired." 
## [51] "The ram scared the school children off."           
## [52] "Tear a thin sheet from the yellow pad."            
## [53] "Dimes showered down from all sides."               
## [54] "The sky in the west is tinged with orange red."    
## [55] "The red paper brightened the dim stage."           
## [56] "The hail pattered on the burnt brown grass."       
## [57] "The big red apple fell to the ground."

Grouped matches

# Rough heuristic for a noun: "a" or "the", one space, then a run of non-spaces.
noun <- "(a|the) ([^ ]+)"
# First ten sentences that contain the pattern.
had_nouns <- head(str_subset(sentences, noun), 10)
# Pull the matched text out of each of those sentences.
str_extract(had_nouns, noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

Replacing matches

# Replace only the leading capital letter of each tail number with "-".
flights_small %>%
  mutate(tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619AA     
##  4 B6      N804JB  JFK    BQN   -804JB     
##  5 DL      N668DN  LGA    ATL   -668DN     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516JB     
##  8 EV      N829AS  LGA    IAD   -829AS     
##  9 B6      N593JB  JFK    MCO   -593JB     
## 10 AA      N3ALAA  LGA    ORD   -3ALAA
# Replace every capital letter in the tail number with "-".
flights_small %>%
  mutate(tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619--     
##  4 B6      N804JB  JFK    BQN   -804--     
##  5 DL      N668DN  LGA    ATL   -668--     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516--     
##  8 EV      N829AS  LGA    IAD   -829--     
##  9 B6      N593JB  JFK    MCO   -593--     
## 10 AA      N3ALAA  LGA    ORD   -3----

Splitting

# Split the first sentence on spaces into at most three pieces; with
# simplify = TRUE the result is a character matrix.
str_split(sentences[1], " ", n = 3, simplify = TRUE)
##      [,1]  [,2]    [,3]                              
## [1,] "The" "birch" "canoe slid on the smooth planks."

Other types of pattern

# regex(..., ignore_case = TRUE) lets "^e" match origins that start with "E".
flights_small %>%
  filter(str_detect(origin, regex("^e", ignore_case = TRUE)))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL