Module 10: Code Along 9

knitr::opts_chunk$set(echo = TRUE)

# Load package]
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readxl) # for importing excel files
library(janitor) # cleaning data

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(nycflights13)

Introduction

String basics

chac_data <- "I'm 'very' hungry."

stringr::str_length("I am hungry.")

## [1] 12

stringr::str_c(c("I", " am"), collapse = " ")

## [1] "I  am"

stringr::str_c("I", " am", sep = ";")

## [1] "I; am"

str_sort(c("John", "Mary", "Aaron"))

## [1] "Aaron" "John"  "Mary"

Matching patterns with regular expressions

flights %>% glimpse()

## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

flights_small <- flights %>% select(where(is.character)) %>% head(n = 10)

Basic matches

flights_small %>% filter(str_detect(dest, "M"))

## # A tibble: 2 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 AA      N619AA  JFK    MIA  
## 2 B6      N593JB  JFK    MCO

flights_small %>% filter(str_detect(dest, ".M"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

flights_small %>% filter(str_detect(dest, "M."))

## # A tibble: 2 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 AA      N619AA  JFK    MIA  
## 2 B6      N593JB  JFK    MCO

flights_small %>% filter(str_detect(dest, "M\\."))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Anchors

flights_small %>% filter(str_detect(origin, "E$"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

Character classes and alternatives

flights_small %>% filter(str_detect(carrier, "\\d"))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 B6      N804JB  JFK    BQN  
## 2 B6      N516JB  EWR    FLL  
## 3 B6      N593JB  JFK    MCO

flights_small %>% filter(str_detect(carrier, "\\s"))

## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>

flights_small %>% filter(str_detect(carrier, "[ABD]"))

## # A tibble: 9 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 AA      N619AA  JFK    MIA  
## 4 B6      N804JB  JFK    BQN  
## 5 DL      N668DN  LGA    ATL  
## 6 UA      N39463  EWR    ORD  
## 7 B6      N516JB  EWR    FLL  
## 8 B6      N593JB  JFK    MCO  
## 9 AA      N3ALAA  LGA    ORD

flights_small %>% filter(str_detect(carrier, "[^ABD]"))

## # A tibble: 8 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N24211  LGA    IAH  
## 3 B6      N804JB  JFK    BQN  
## 4 DL      N668DN  LGA    ATL  
## 5 UA      N39463  EWR    ORD  
## 6 B6      N516JB  EWR    FLL  
## 7 EV      N829AS  LGA    IAD  
## 8 B6      N593JB  JFK    MCO

Repitition

# ? 0 or 1
flights %>% filter(str_detect(carrier, "A?"))

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# + 1 or more
flights %>% filter(str_detect(carrier, "A+"))

## # A tibble: 92,450 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      558            600        -2      753            745
##  6  2013     1     1      558            600        -2      924            917
##  7  2013     1     1      558            600        -2      923            937
##  8  2013     1     1      559            600        -1      941            910
##  9  2013     1     1      559            600        -1      854            902
## 10  2013     1     1      606            610        -4      858            910
## # ℹ 92,440 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# * 0 or more
flights %>% filter(str_detect(carrier, "A*"))

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Grouping and backreferences

# (..)\\1

flights %>% select(where(is.character)) %>% filter(str_detect(tailnum, "(\\d{2})\\1"))

## # A tibble: 1,990 × 4
##    carrier tailnum origin dest 
##    <chr>   <chr>   <chr>  <chr>
##  1 EV      N15555  EWR    MKE  
##  2 EV      N11119  LGA    CLE  
##  3 UA      N14242  EWR    TPA  
##  4 EV      N14143  EWR    PIT  
##  5 EV      N15555  EWR    SAV  
##  6 UA      N12125  EWR    LAX  
##  7 EV      N15555  EWR    PWM  
##  8 EV      N15555  EWR    BUF  
##  9 EV      N15555  EWR    RIC  
## 10 EV      N13133  EWR    DTW  
## # ℹ 1,980 more rows

Tools

Detect matches

flights_small %>%
    summarise(sum(str_detect(tailnum, "8$")))

## # A tibble: 1 × 1
##   `sum(str_detect(tailnum, "8$"))`
##                              <int>
## 1                                1

str_detect(flights_small$tailnum, "8$")

##  [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

sum(str_detect(flights_small$tailnum, "8$"))

## [1] 1

mean(str_detect(flights_small$tailnum, "8$"))

## [1] 0.1

Extract matches

colours <- c("red", "orange", "yellow", "green", "blue", "pueple")
colour_match <- str_c(colours, collapse = "|")
colour_match

## [1] "red|orange|yellow|green|blue|pueple"

# Extract strings with a color
has_colour <- str_subset(sentences, colour_match)
str_extract(has_colour, colour_match)

##  [1] "blue"   "blue"   "red"    "red"    "red"    "blue"   "yellow" "red"   
##  [9] "red"    "green"  "red"    "red"    "blue"   "red"    "red"    "red"   
## [17] "red"    "blue"   "red"    "blue"   "red"    "green"  "red"    "red"   
## [25] "red"    "red"    "red"    "red"    "green"  "red"    "green"  "red"   
## [33] "green"  "red"    "red"    "red"    "red"    "red"    "blue"   "red"   
## [41] "blue"   "red"    "red"    "red"    "red"    "green"  "green"  "green" 
## [49] "red"    "red"    "yellow" "red"    "orange" "red"    "red"    "red"

Grouped matches

# Extract strings with a noun
noun <- "(a|the) ([^ ]+)"
had_nouns <- str_subset(sentences, noun) %>% head(10)
had_nouns %>% str_extract(noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

Replacing matches

flights_small %>% mutate(tailnum_rev = tailnum %>% str_replace("^[A-Z]", "-"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619AA     
##  4 B6      N804JB  JFK    BQN   -804JB     
##  5 DL      N668DN  LGA    ATL   -668DN     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516JB     
##  8 EV      N829AS  LGA    IAD   -829AS     
##  9 B6      N593JB  JFK    MCO   -593JB     
## 10 AA      N3ALAA  LGA    ORD   -3ALAA

flights_small %>% mutate(tailnum_rev = tailnum %>% str_replace_all("^[A-Z]", "-"))

## # A tibble: 10 × 5
##    carrier tailnum origin dest  tailnum_rev
##    <chr>   <chr>   <chr>  <chr> <chr>      
##  1 UA      N14228  EWR    IAH   -14228     
##  2 UA      N24211  LGA    IAH   -24211     
##  3 AA      N619AA  JFK    MIA   -619AA     
##  4 B6      N804JB  JFK    BQN   -804JB     
##  5 DL      N668DN  LGA    ATL   -668DN     
##  6 UA      N39463  EWR    ORD   -39463     
##  7 B6      N516JB  EWR    FLL   -516JB     
##  8 EV      N829AS  LGA    IAD   -829AS     
##  9 B6      N593JB  JFK    MCO   -593JB     
## 10 AA      N3ALAA  LGA    ORD   -3ALAA

Splitting

sentences[1] %>% str_split(" ", n = 3, simplify = TRUE)

##      [,1]  [,2]    [,3]                              
## [1,] "The" "birch" "canoe slid on the smooth planks."

Other types of patterns

flights_small %>% filter(str_detect(origin, regex("^e", ignore_case = TRUE)))

## # A tibble: 3 × 4
##   carrier tailnum origin dest 
##   <chr>   <chr>   <chr>  <chr>
## 1 UA      N14228  EWR    IAH  
## 2 UA      N39463  EWR    ORD  
## 3 B6      N516JB  EWR    FLL

Module 10: Code Along 9

R for Data Science: Chapter 14

Joe Stanley

2023-10-30

Introduction

String basics

Matching patterns with regular expressions

Basic matches

Anchors

Character classes and alternatives

Repitition

Grouping and backreferences

Tools

Detect matches

Extract matches

Grouped matches

Replacing matches

Splitting

Other types of patterns