Week 6: Module 10 - Code Along 9

Introduction

String Basics

String Lengths and Combining Strings

 # To put quotes inside a string, use single quotes inside double quotes (or the reverse).
chac_data <- "I'm 'very' hungry."

stringr::str_length("I am hungry.")

## [1] 12

stringr::str_c("I", " am", sep = " ;")

## [1] "I ; am"

# collapse joins the elements of a single character vector into one string.
stringr::str_c(c("I", " am."), collapse = " truly")

## [1] "I truly am."

Subsetting Strings

x <- c("Apple", "Banana", "Pear")

 # Positive numbers count from front to back.
str_sub(x, 1, 3)

## [1] "App" "Ban" "Pea"

 # Negative numbers count backwards from end.
str_sub(x, -3, -1)

## [1] "ple" "ana" "ear"

Locale

str_sort(c("John", "Mary", "Aaron"))

## [1] "Aaron" "John"  "Mary"

str_to_lower(c("JOHN", "AARON"))

## [1] "john"  "aaron"

str_to_upper(c("sara loves data analytics!"))

## [1] "SARA LOVES DATA ANALYTICS!"

Matching Patterns

 # To see the data set - number of rows and columns.
flights %>% glimpse()

## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

 # To reduce data set to a more manageable size.
flights_small <- flights %>% select(where(is.character)) %>% head(n = 10)

Basic Matches

 # Finds matches to the chosen value.
flights_small %>% filter(str_detect(dest, "AH"))

## # A tibble: 2 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH

 # A period/dot matches any single character, so ".A" finds values where an A is preceded by any character.
flights_small %>% filter(str_detect(dest, ".A"))

## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 EV      N829AS  LGA    IAD

 # A period/dot matches any single character, so "A." finds values where an A is followed by any character.
flights_small %>% filter(str_detect(dest, "A."))

## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 DL      N668DN  LGA    ATL  
## 4 EV      N829AS  LGA    IAD

 # To have R recognize a period/dot as a period/dot, put double backslashes in front of it. (\\.)

Anchors

flights_small %>% filter(str_detect(origin, "^E"))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL

flights_small %>% filter(str_detect(origin, "A$"))

## # A tibble: 4 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N24211  LGA    IAH  
## 2 DL      N668DN  LGA    ATL  
## 3 EV      N829AS  LGA    IAD  
## 4 AA      N3ALAA  LGA    ORD

Character Classes and Alternatives

 # \d matches any digit. Inside an R string the backslash itself must be escaped, so the pattern is written "\\d".
flights_small %>% filter(str_detect(carrier, "\\d"))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 B6      N804JB  JFK    BQN  
## 2 B6      N516JB  EWR    FLL  
## 3 B6      N593JB  JFK    MCO

 # \s matches any white space.
flights_small %>% filter(str_detect(carrier, "\\s"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

# Give the first row's carrier a trailing space so the whitespace pattern
# ("\\s") has a row to match. The column is addressed by name rather than by
# position so the edit does not depend on column order.
flights_small[1, "carrier"] <- "UA "

flights_small %>% filter(str_detect(carrier, "\\s"))

## # A tibble: 1 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH

 # [abc]: matches a, b, or c.
flights_small %>% filter(str_detect(carrier, "[ABD]"))

## # A tibble: 9 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "AA"    N619AA  JFK    MIA  
## 4 "B6"    N804JB  JFK    BQN  
## 5 "DL"    N668DN  LGA    ATL  
## 6 "UA"    N39463  EWR    ORD  
## 7 "B6"    N516JB  EWR    FLL  
## 8 "B6"    N593JB  JFK    MCO  
## 9 "AA"    N3ALAA  LGA    ORD

 # [^abc]: matches anything except a, b, or c.
flights_small %>% filter(str_detect(carrier, "[^ABD]"))

## # A tibble: 8 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "B6"    N804JB  JFK    BQN  
## 4 "DL"    N668DN  LGA    ATL  
## 5 "UA"    N39463  EWR    ORD  
## 6 "B6"    N516JB  EWR    FLL  
## 7 "EV"    N829AS  LGA    IAD  
## 8 "B6"    N593JB  JFK    MCO

Repetition

 # ? = 0 or 1
flights_small %>% filter(str_detect(carrier, "A?"))

## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 "UA "   N14228  EWR    IAH  
##  2 "UA"    N24211  LGA    IAH  
##  3 "AA"    N619AA  JFK    MIA  
##  4 "B6"    N804JB  JFK    BQN  
##  5 "DL"    N668DN  LGA    ATL  
##  6 "UA"    N39463  EWR    ORD  
##  7 "B6"    N516JB  EWR    FLL  
##  8 "EV"    N829AS  LGA    IAD  
##  9 "B6"    N593JB  JFK    MCO  
## 10 "AA"    N3ALAA  LGA    ORD

 # + = 1 or more
flights_small %>% filter(str_detect(carrier, "A+"))

## # A tibble: 5 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "AA"    N619AA  JFK    MIA  
## 4 "UA"    N39463  EWR    ORD  
## 5 "AA"    N3ALAA  LGA    ORD

 # * = 0 or more
flights_small %>% filter(str_detect(carrier, "A*"))

## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 "UA "   N14228  EWR    IAH  
##  2 "UA"    N24211  LGA    IAH  
##  3 "AA"    N619AA  JFK    MIA  
##  4 "B6"    N804JB  JFK    BQN  
##  5 "DL"    N668DN  LGA    ATL  
##  6 "UA"    N39463  EWR    ORD  
##  7 "B6"    N516JB  EWR    FLL  
##  8 "EV"    N829AS  LGA    IAD  
##  9 "B6"    N593JB  JFK    MCO  
## 10 "AA"    N3ALAA  LGA    ORD

Grouping and Backreferences

 # (...)\\1: a capture group followed by the backreference \\1 matches the same text twice in a row.
flights_small %>% filter(str_detect(tailnum, "(\\d)\\1"))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N24211  LGA    IAH  
## 3 "DL"    N668DN  LGA    ATL

flights %>% select(where(is.character)) %>% filter(str_detect(tailnum, "(\\d\\d)\\1"))

## # A tibble: 1,990 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 EV      N15555  EWR    MKE  
##  2 EV      N11119  LGA    CLE  
##  3 UA      N14242  EWR    TPA  
##  4 EV      N14143  EWR    PIT  
##  5 EV      N15555  EWR    SAV  
##  6 UA      N12125  EWR    LAX  
##  7 EV      N15555  EWR    PWM  
##  8 EV      N15555  EWR    BUF  
##  9 EV      N15555  EWR    RIC  
## 10 EV      N13133  EWR    DTW  
## # ℹ 1,980 more rows

 # To specify the number of matches precisely: {n}: exactly n  {n,}: n or more  {0,m}: at most m  {n,m}: between n and m
 # (stringr's ICU regex engine does not accept the {,m} shorthand; write {0,m} instead.)

Tools

Detect Matches

 # How many tail numbers end with number 8?
flights_small %>% 
    summarise(sum(str_detect(tailnum, "8$")))

## # A tibble: 1 × 1
##   `sum(str_detect(tailnum, "8$"))`
##                              <int>
## 1                                1

str_detect(flights_small$tailnum, "8$")

##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

sum(str_detect(flights_small$tailnum, "8$"))

## [1] 1

str_detect(flights_small$tailnum, "8$")

##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

mean(str_detect(flights_small$tailnum, "8$"))

## [1] 0.1

flights_small %>% filter(str_detect(origin, "^E"))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N39463  EWR    ORD  
## 3 "B6"    N516JB  EWR    FLL

Extract Matches

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match

## [1] "red|orange|yellow|green|blue|purple"

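 # Extract the first colour word from each matching sentence. The pattern has no word boundaries, so "red" also matches inside longer words that merely contain the letters r-e-d.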
has_colour <- str_subset(sentences, colour_match)

str_extract(has_colour, colour_match)

##  [1] "blue"   "blue"   "red"    "red"    "red"    "blue"   "yellow" "red"   
##  [9] "red"    "green"  "red"    "red"    "blue"   "red"    "red"    "red"   
## [17] "red"    "blue"   "red"    "blue"   "red"    "green"  "red"    "red"   
## [25] "red"    "red"    "red"    "red"    "green"  "red"    "green"  "red"   
## [33] "purple" "green"  "red"    "red"    "red"    "red"    "red"    "blue"  
## [41] "red"    "blue"   "red"    "red"    "red"    "red"    "green"  "green" 
## [49] "green"  "red"    "red"    "yellow" "red"    "orange" "red"    "red"   
## [57] "red"

flights_small %>% mutate(tailnum_numOnly = str_extract(tailnum, "\\d+"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_numOnly
##    <chr>   <chr>   <chr>  <chr> <chr>          
##  1 "UA "   N14228  EWR    IAH   14228          
##  2 "UA"    N24211  LGA    IAH   24211          
##  3 "AA"    N619AA  JFK    MIA   619            
##  4 "B6"    N804JB  JFK    BQN   804            
##  5 "DL"    N668DN  LGA    ATL   668            
##  6 "UA"    N39463  EWR    ORD   39463          
##  7 "B6"    N516JB  EWR    FLL   516            
##  8 "EV"    N829AS  LGA    IAD   829            
##  9 "B6"    N593JB  JFK    MCO   593            
## 10 "AA"    N3ALAA  LGA    ORD   3

Grouped Matches

 # Rough "noun" heuristic: the text "a" or "the", one space, then the next run of non-space characters.
 # The pattern has no word boundary before (a|the), so it can also match the tail of a longer word
 # (presumably the source of "a helps" in the output below).
noun <- "(a|the) ([^ ]+)"
 # Keep the first 10 sentences that contain a match, then pull out the first matched text from each.
has_nouns <- str_subset(sentences, noun) %>% head(10)
has_nouns %>% str_extract(noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

Replacing Matches

flights_small %>% mutate(tailnum_rev = tailnum %>% str_replace("^[A-Z]", "-"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 "UA "   N14228  EWR    IAH   -14228     
##  2 "UA"    N24211  LGA    IAH   -24211     
##  3 "AA"    N619AA  JFK    MIA   -619AA     
##  4 "B6"    N804JB  JFK    BQN   -804JB     
##  5 "DL"    N668DN  LGA    ATL   -668DN     
##  6 "UA"    N39463  EWR    ORD   -39463     
##  7 "B6"    N516JB  EWR    FLL   -516JB     
##  8 "EV"    N829AS  LGA    IAD   -829AS     
##  9 "B6"    N593JB  JFK    MCO   -593JB     
## 10 "AA"    N3ALAA  LGA    ORD   -3ALAA

flights_small %>% mutate(tailnum_rev = tailnum %>% str_replace_all("[A-Z]", "-"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 "UA "   N14228  EWR    IAH   -14228     
##  2 "UA"    N24211  LGA    IAH   -24211     
##  3 "AA"    N619AA  JFK    MIA   -619--     
##  4 "B6"    N804JB  JFK    BQN   -804--     
##  5 "DL"    N668DN  LGA    ATL   -668--     
##  6 "UA"    N39463  EWR    ORD   -39463     
##  7 "B6"    N516JB  EWR    FLL   -516--     
##  8 "EV"    N829AS  LGA    IAD   -829--     
##  9 "B6"    N593JB  JFK    MCO   -593--     
## 10 "AA"    N3ALAA  LGA    ORD   -3----

flights_small %>% mutate(tailnum_chaOnly = str_replace_all(tailnum, "\\d+", ""))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_chaOnly
##    <chr>   <chr>   <chr>  <chr> <chr>          
##  1 "UA "   N14228  EWR    IAH   N              
##  2 "UA"    N24211  LGA    IAH   N              
##  3 "AA"    N619AA  JFK    MIA   NAA            
##  4 "B6"    N804JB  JFK    BQN   NJB            
##  5 "DL"    N668DN  LGA    ATL   NDN            
##  6 "UA"    N39463  EWR    ORD   N              
##  7 "B6"    N516JB  EWR    FLL   NJB            
##  8 "EV"    N829AS  LGA    IAD   NAS            
##  9 "B6"    N593JB  JFK    MCO   NJB            
## 10 "AA"    N3ALAA  LGA    ORD   NALAA

Splitting

sentences[1] %>% str_split(" ", n = 3, simplify = TRUE)

##      [,1]  [,2]    [,3]                              
## [1,] "The" "birch" "canoe slid on the smooth planks."

# Split each tail number at its first digit into at most two pieces.
# simplify = TRUE makes str_split() return a character matrix, which mutate()
# stores as a two-column matrix column (printed as new_col[,1] and [,2]).
flights_small %>%
  mutate(new_col = str_split(tailnum, "\\d", n = 2, simplify = TRUE))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  new_col[,1] [,2] 
##    <chr>   <chr>   <chr>  <chr> <chr>       <chr>
##  1 "UA "   N14228  EWR    IAH   N           4228 
##  2 "UA"    N24211  LGA    IAH   N           4211 
##  3 "AA"    N619AA  JFK    MIA   N           19AA 
##  4 "B6"    N804JB  JFK    BQN   N           04JB 
##  5 "DL"    N668DN  LGA    ATL   N           68DN 
##  6 "UA"    N39463  EWR    ORD   N           9463 
##  7 "B6"    N516JB  EWR    FLL   N           16JB 
##  8 "EV"    N829AS  LGA    IAD   N           29AS 
##  9 "B6"    N593JB  JFK    MCO   N           93JB 
## 10 "AA"    N3ALAA  LGA    ORD   N           ALAA

Find Matches

flights_small %>% mutate(tailnum_chaOnly = str_remove_all(tailnum, "\\d+"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_chaOnly
##    <chr>   <chr>   <chr>  <chr> <chr>          
##  1 "UA "   N14228  EWR    IAH   N              
##  2 "UA"    N24211  LGA    IAH   N              
##  3 "AA"    N619AA  JFK    MIA   NAA            
##  4 "B6"    N804JB  JFK    BQN   NJB            
##  5 "DL"    N668DN  LGA    ATL   NDN            
##  6 "UA"    N39463  EWR    ORD   N              
##  7 "B6"    N516JB  EWR    FLL   NJB            
##  8 "EV"    N829AS  LGA    IAD   NAS            
##  9 "B6"    N593JB  JFK    MCO   NJB            
## 10 "AA"    N3ALAA  LGA    ORD   NALAA

Other Types of Patterns

 # Ignore case: regex(pattern, ignore_case = TRUE) matches letters regardless of upper/lower case.
flights_small %>% filter(str_detect(tailnum, regex("^n", ignore_case = TRUE)))

## # A tibble: 10 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 "UA "   N14228  EWR    IAH  
##  2 "UA"    N24211  LGA    IAH  
##  3 "AA"    N619AA  JFK    MIA  
##  4 "B6"    N804JB  JFK    BQN  
##  5 "DL"    N668DN  LGA    ATL  
##  6 "UA"    N39463  EWR    ORD  
##  7 "B6"    N516JB  EWR    FLL  
##  8 "EV"    N829AS  LGA    IAD  
##  9 "B6"    N593JB  JFK    MCO  
## 10 "AA"    N3ALAA  LGA    ORD

flights_small %>% filter(str_detect(origin, regex("^e", ignore_case = TRUE)))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 "UA "   N14228  EWR    IAH  
## 2 "UA"    N39463  EWR    ORD  
## 3 "B6"    N516JB  EWR    FLL

Week 6: Module 10 - Code Along 9

R for Data Science: Chapter 14

Sara Donahue

2024-06-17

Introduction

String Basics

String Lengths and Combining Strings

Subsetting Strings

Locale

Matching Patterns

Basic Matches

Anchors

Character Classes and Alternatives

Repetition

Grouping and Backreferences

Tools

Detect Matches

Extract Matches

Grouped Matches

Replacing Matches

Splitting

Find Matches

Other Types of Patterns