Introduction

String Basics

String Lengths and Combining Strings

 # Single quotes and apostrophes can sit inside a double-quoted string unescaped.
chac_data <- "I'm 'very' hungry."

stringr::str_length(string = "I am hungry.")  # spaces and punctuation count too
## [1] 12
 # `sep` is placed between the separate arguments being joined.
stringr::str_c("I", " am", sep = " ;")
## [1] "I ; am"
 # `collapse` joins the elements of a single vector into one string.
stringr::str_c(c("I", " am."), collapse = " truly")
## [1] "I truly am."

Subsetting Strings

x <- c("Apple", "Banana", "Pear")

 # Positive positions are counted from the start of each string.
str_sub(x, start = 1, end = 3)
## [1] "App" "Ban" "Pea"
 # Negative positions are counted backwards from the end of each string.
str_sub(x, start = -3, end = -1)
## [1] "ple" "ana" "ear"

Locale

str_sort(c("John", "Mary", "Aaron"), locale = "en")  # "en" is the default locale
## [1] "Aaron" "John"  "Mary"
 # Case conversion is locale-sensitive as well; "en" is again the default.
str_to_lower(c("JOHN", "AARON"), locale = "en")
## [1] "john"  "aaron"
str_to_upper("sara loves data analytics!", locale = "en")
## [1] "SARA LOVES DATA ANALYTICS!"

Matching Patterns

 # Preview the data set: row and column counts, then each column's type and
 # first few values.
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
 # Shrink the data to something readable: character columns only, first 10 rows.
flights_small <- flights %>%
  select(where(is.character)) %>%
  head(n = 10)

Basic Matches

 # Keep the rows whose destination contains the literal text "AH".
flights_small %>%
  filter(str_detect(dest, "AH"))
## # A tibble: 2 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH
 # A period/dot matches any single character, so ".A" finds an "A" that has
 # at least one character in front of it.
flights_small %>%
  filter(str_detect(dest, ".A"))
## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 EV      N829AS  LGA    IAD
 # With the period/dot placed after the letter, "A." finds an "A" that is
 # followed by at least one more character.
flights_small %>%
  filter(str_detect(dest, "A."))
## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 DL      N668DN  LGA    ATL  
## 4 EV      N829AS  LGA    IAD
 # To match a literal period/dot instead of "any character", escape it with two backslashes: "\\."

Anchors

flights_small %>%
  # "^" anchors the match to the start of the string: origins beginning with "E".
  filter(str_detect(origin, "^E"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL
flights_small %>%
  # "$" anchors the match to the end of the string: origins ending in "A".
  filter(str_detect(origin, "A$"))
## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N24211  LGA    IAH  
## 2 DL      N668DN  LGA    ATL  
## 3 EV      N829AS  LGA    IAD  
## 4 AA      N3ALAA  LGA    ORD

Character Classes and Alternatives

 # \d matches any digit. Inside an R string the backslash must itself be
 # escaped, so the regular expression \d is written "\\d".
flights_small %>%
  filter(str_detect(carrier, "\\d"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 B6      N804JB  JFK    BQN  
## 2 B6      N516JB  EWR    FLL  
## 3 B6      N593JB  JFK    MCO
 # \s matches any white space (space, tab, newline); written "\\s" in R.
flights_small %>%
  filter(str_detect(carrier, "\\s"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
flights_small[1, 1] <- "UA "  # add a trailing space to the first carrier
flights_space <- "UA "  # NOTE(review): holds only this string, not a tibble

flights_small %>%
  # The first row now matches because its carrier ends with a space.
  filter(str_detect(carrier, "\\s"))
## # A tibble: 1 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH
 # [abc] matches a, b, or c. Here: carriers containing an "A", "B", or "D".
flights_small %>%
  filter(str_detect(carrier, "[ABD]"))
## # A tibble: 9 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "AA"    N619AA  JFK    MIA  
## 4 "B6"    N804JB  JFK    BQN  
## 5 "DL"    N668DN  LGA    ATL  
## 6 "UA"    N39463  EWR    ORD  
## 7 "B6"    N516JB  EWR    FLL  
## 8 "B6"    N593JB  JFK    MCO  
## 9 "AA"    N3ALAA  LGA    ORD
 # [^abc] matches any character except a, b, or c. A row is kept when the
 # carrier has at least one character outside the set, so "UA" stays while
 # "AA" is dropped.
flights_small %>%
  filter(str_detect(carrier, "[^ABD]"))
## # A tibble: 8 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "B6"    N804JB  JFK    BQN  
## 4 "DL"    N668DN  LGA    ATL  
## 5 "UA"    N39463  EWR    ORD  
## 6 "B6"    N516JB  EWR    FLL  
## 7 "EV"    N829AS  LGA    IAD  
## 8 "B6"    N593JB  JFK    MCO

Repetition

 # ? = 0 or 1 occurrences. Zero occurrences also counts as a match, so "A?"
 # keeps every row.
flights_small %>%
  filter(str_detect(carrier, "A?"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 "UA "   N14228  EWR    IAH  
##  2 "UA"    N24211  LGA    IAH  
##  3 "AA"    N619AA  JFK    MIA  
##  4 "B6"    N804JB  JFK    BQN  
##  5 "DL"    N668DN  LGA    ATL  
##  6 "UA"    N39463  EWR    ORD  
##  7 "B6"    N516JB  EWR    FLL  
##  8 "EV"    N829AS  LGA    IAD  
##  9 "B6"    N593JB  JFK    MCO  
## 10 "AA"    N3ALAA  LGA    ORD
 # + = 1 or more occurrences: the carrier must contain at least one "A".
flights_small %>%
  filter(str_detect(carrier, "A+"))
## # A tibble: 5 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "AA"    N619AA  JFK    MIA  
## 4 "UA"    N39463  EWR    ORD  
## 5 "AA"    N3ALAA  LGA    ORD
 # * = 0 or more occurrences. Like "A?", "A*" can match nothing at all, so
 # every row is kept.
flights_small %>%
  filter(str_detect(carrier, "A*"))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 "UA "   N14228  EWR    IAH  
##  2 "UA"    N24211  LGA    IAH  
##  3 "AA"    N619AA  JFK    MIA  
##  4 "B6"    N804JB  JFK    BQN  
##  5 "DL"    N668DN  LGA    ATL  
##  6 "UA"    N39463  EWR    ORD  
##  7 "B6"    N516JB  EWR    FLL  
##  8 "EV"    N829AS  LGA    IAD  
##  9 "B6"    N593JB  JFK    MCO  
## 10 "AA"    N3ALAA  LGA    ORD

Grouping and Backreferences

 # Parentheses capture a group and \\1 refers back to what it matched, so
 # "(\\d)\\1" finds a digit immediately followed by the same digit.
flights_small %>%
  filter(str_detect(tailnum, "(\\d)\\1"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "DL"    N668DN  LGA    ATL
flights %>%
  select(where(is.character)) %>%
  # "(\\d\\d)\\1" finds a two-digit pair repeated at once, e.g. "1414" or "5555".
  filter(str_detect(tailnum, "(\\d\\d)\\1"))
## # A tibble: 1,990 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 EV      N15555  EWR    MKE  
##  2 EV      N11119  LGA    CLE  
##  3 UA      N14242  EWR    TPA  
##  4 EV      N14143  EWR    PIT  
##  5 EV      N15555  EWR    SAV  
##  6 UA      N12125  EWR    LAX  
##  7 EV      N15555  EWR    PWM  
##  8 EV      N15555  EWR    BUF  
##  9 EV      N15555  EWR    RIC  
## 10 EV      N13133  EWR    DTW  
## # ℹ 1,980 more rows
 # To specify the number of matches precisely: {n}: exactly n  {n,}: n or more  {0,m}: at most m  {n,m}: between n and m

Tools

Detect Matches

 # How many tail numbers end with the digit 8? str_detect() returns TRUE/FALSE
 # per row and sum() counts the TRUEs.
summarise(flights_small, sum(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
##   `sum(str_detect(tailnum, "8$"))`
##                              <int>
## 1                                1
str_detect(flights_small$tailnum, pattern = "8$")  # one TRUE/FALSE per tail number
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 # TRUE counts as 1, so the sum is the number of matches.
sum(str_detect(flights_small$tailnum, pattern = "8$"))
## [1] 1
str_detect(flights_small$tailnum, pattern = "8$")
##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
 # The mean of the logical vector is the proportion of matches.
mean(str_detect(flights_small$tailnum, pattern = "8$"))
## [1] 0.1
flights_small %>%
  # str_detect() inside filter() keeps only the matching rows.
  filter(str_detect(origin, "^E"))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N39463  EWR    ORD  
## 3 "B6"    N516JB  EWR    FLL

Extract Matches

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
 # Join the colour names with "|" to build one "match any of these" pattern.
colour_match <- colours %>% str_c(collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
 # Keep the sentences that contain a colour name. Watch for "red": the pattern
 # has no word boundaries, so it also matches inside longer words.
has_colour <- sentences %>% str_subset(colour_match)

 # str_extract() returns only the first match found in each sentence.
has_colour %>% str_extract(colour_match)
##  [1] "blue"   "blue"   "red"    "red"    "red"    "blue"   "yellow" "red"   
##  [9] "red"    "green"  "red"    "red"    "blue"   "red"    "red"    "red"   
## [17] "red"    "blue"   "red"    "blue"   "red"    "green"  "red"    "red"   
## [25] "red"    "red"    "red"    "red"    "green"  "red"    "green"  "red"   
## [33] "purple" "green"  "red"    "red"    "red"    "red"    "red"    "blue"  
## [41] "red"    "blue"   "red"    "red"    "red"    "red"    "green"  "green" 
## [49] "green"  "red"    "red"    "yellow" "red"    "orange" "red"    "red"   
## [57] "red"
flights_small %>%
  # "\\d+" extracts the first run of consecutive digits in each tail number.
  mutate(tailnum_numOnly = str_extract(tailnum, "\\d+"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_numOnly
##    <chr>   <chr>   <chr>  <chr> <chr>          
##  1 "UA "   N14228  EWR    IAH   14228          
##  2 "UA"    N24211  LGA    IAH   24211          
##  3 "AA"    N619AA  JFK    MIA   619            
##  4 "B6"    N804JB  JFK    BQN   804            
##  5 "DL"    N668DN  LGA    ATL   668            
##  6 "UA"    N39463  EWR    ORD   39463          
##  7 "B6"    N516JB  EWR    FLL   516            
##  8 "EV"    N829AS  LGA    IAD   829            
##  9 "B6"    N593JB  JFK    MCO   593            
## 10 "AA"    N3ALAA  LGA    ORD   3

Grouped Matches

 # Rough heuristic for a noun: "a" or "the", a space, then a run of non-space
 # characters. It is only an approximation of a real noun.
noun <- "(a|the) ([^ ]+)"
has_nouns <- head(str_subset(sentences, noun), 10)
str_extract(has_nouns, noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

Replacing Matches

flights_small %>%
  # str_replace() changes only the first match: the leading capital letter.
  mutate(tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 "UA "   N14228  EWR    IAH   -14228     
##  2 "UA"    N24211  LGA    IAH   -24211     
##  3 "AA"    N619AA  JFK    MIA   -619AA     
##  4 "B6"    N804JB  JFK    BQN   -804JB     
##  5 "DL"    N668DN  LGA    ATL   -668DN     
##  6 "UA"    N39463  EWR    ORD   -39463     
##  7 "B6"    N516JB  EWR    FLL   -516JB     
##  8 "EV"    N829AS  LGA    IAD   -829AS     
##  9 "B6"    N593JB  JFK    MCO   -593JB     
## 10 "AA"    N3ALAA  LGA    ORD   -3ALAA
flights_small %>%
  # str_replace_all() changes every match: each capital letter becomes "-".
  mutate(tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 "UA "   N14228  EWR    IAH   -14228     
##  2 "UA"    N24211  LGA    IAH   -24211     
##  3 "AA"    N619AA  JFK    MIA   -619--     
##  4 "B6"    N804JB  JFK    BQN   -804--     
##  5 "DL"    N668DN  LGA    ATL   -668--     
##  6 "UA"    N39463  EWR    ORD   -39463     
##  7 "B6"    N516JB  EWR    FLL   -516--     
##  8 "EV"    N829AS  LGA    IAD   -829--     
##  9 "B6"    N593JB  JFK    MCO   -593--     
## 10 "AA"    N3ALAA  LGA    ORD   -3----
flights_small %>%
  # Replacing every run of digits with "" leaves only the letters.
  mutate(tailnum_chaOnly = tailnum %>% str_replace_all("\\d+", ""))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_chaOnly
##    <chr>   <chr>   <chr>  <chr> <chr>          
##  1 "UA "   N14228  EWR    IAH   N              
##  2 "UA"    N24211  LGA    IAH   N              
##  3 "AA"    N619AA  JFK    MIA   NAA            
##  4 "B6"    N804JB  JFK    BQN   NJB            
##  5 "DL"    N668DN  LGA    ATL   NDN            
##  6 "UA"    N39463  EWR    ORD   N              
##  7 "B6"    N516JB  EWR    FLL   NJB            
##  8 "EV"    N829AS  LGA    IAD   NAS            
##  9 "B6"    N593JB  JFK    MCO   NJB            
## 10 "AA"    N3ALAA  LGA    ORD   NALAA

Splitting

# Split on spaces into at most 3 pieces; simplify = TRUE returns a matrix.
str_split(sentences[1], pattern = " ", n = 3, simplify = TRUE)
##      [,1]  [,2]    [,3]                              
## [1,] "The" "birch" "canoe slid on the smooth planks."
flights_small %>%
  # Split each tail number at its first digit into at most two pieces; the
  # matched digit itself is dropped. `simplify = TRUE` must be given as a named
  # argument (not a `simplify()` call) and yields a character matrix, which is
  # stored as a two-column matrix column.
  mutate(new_col = str_split(tailnum, "\\d", n = 2, simplify = TRUE))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  new_col[,1] [,2] 
##    <chr>   <chr>   <chr>  <chr> <chr>       <chr>
##  1 "UA "   N14228  EWR    IAH   N           4228 
##  2 "UA"    N24211  LGA    IAH   N           4211 
##  3 "AA"    N619AA  JFK    MIA   N           19AA 
##  4 "B6"    N804JB  JFK    BQN   N           04JB 
##  5 "DL"    N668DN  LGA    ATL   N           68DN 
##  6 "UA"    N39463  EWR    ORD   N           9463 
##  7 "B6"    N516JB  EWR    FLL   N           16JB 
##  8 "EV"    N829AS  LGA    IAD   N           29AS 
##  9 "B6"    N593JB  JFK    MCO   N           93JB 
## 10 "AA"    N3ALAA  LGA    ORD   N           ALAA

Find Matches

flights_small %>%
  # str_remove_all() deletes every match, leaving only the letters.
  mutate(tailnum_chaOnly = tailnum %>% str_remove_all("\\d+"))
## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_chaOnly
##    <chr>   <chr>   <chr>  <chr> <chr>          
##  1 "UA "   N14228  EWR    IAH   N              
##  2 "UA"    N24211  LGA    IAH   N              
##  3 "AA"    N619AA  JFK    MIA   NAA            
##  4 "B6"    N804JB  JFK    BQN   NJB            
##  5 "DL"    N668DN  LGA    ATL   NDN            
##  6 "UA"    N39463  EWR    ORD   N              
##  7 "B6"    N516JB  EWR    FLL   NJB            
##  8 "EV"    N829AS  LGA    IAD   NAS            
##  9 "B6"    N593JB  JFK    MCO   NJB            
## 10 "AA"    N3ALAA  LGA    ORD   NALAA

Other Types of Patterns

 # regex(ignore_case = TRUE) makes the match case-insensitive, so "^n" also
 # finds tail numbers that start with an upper-case "N".
flights_small %>%
  filter(str_detect(tailnum, regex("^n", ignore_case = TRUE)))
## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 "UA "   N14228  EWR    IAH  
##  2 "UA"    N24211  LGA    IAH  
##  3 "AA"    N619AA  JFK    MIA  
##  4 "B6"    N804JB  JFK    BQN  
##  5 "DL"    N668DN  LGA    ATL  
##  6 "UA"    N39463  EWR    ORD  
##  7 "B6"    N516JB  EWR    FLL  
##  8 "EV"    N829AS  LGA    IAD  
##  9 "B6"    N593JB  JFK    MCO  
## 10 "AA"    N3ALAA  LGA    ORD
flights_small %>%
  # Case-insensitive anchor: "^e" matches origins starting with "E" or "e".
  filter(str_detect(origin, regex("^e", ignore_case = TRUE)))
## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N39463  EWR    ORD  
## 3 "B6"    N516JB  EWR    FLL