Matching Patterns With Regular Expressions
# Compact overview of `flights`: one line per column with its type and the
# first few values.
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Small demo table: character columns only, first ten rows.
flights_small <- head(select(flights, where(is.character)), n = 10)
Basic Matches
# A bare letter matches that letter anywhere in the string.
filter(flights_small, str_detect(origin, "M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# `.` stands for any single character: here, some character followed by "M".
filter(flights_small, str_detect(origin, ".M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "M" followed by any single character.
filter(flights_small, str_detect(origin, "M."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# `\\.` escapes the dot, so this requires "M" followed by a literal ".".
filter(flights_small, str_detect(origin, "M\\."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
Anchors
# `^` anchors the match to the start of the string: origins beginning with "E".
filter(flights_small, str_detect(origin, "^E"))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N39463 EWR ORD
## 3 B6 N516JB EWR FLL
# `$` anchors the match to the end of the string: origins ending in "A".
filter(flights_small, str_detect(origin, "A$"))
## # A tibble: 4 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N24211 LGA IAH
## 2 DL N668DN LGA ATL
## 3 EV N829AS LGA IAD
## 4 AA N3ALAA LGA ORD
Character Classes and Alternatives
# `\\d` matches any digit: carriers whose code contains a number.
filter(flights_small, str_detect(carrier, "\\d"))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 B6 N804JB JFK BQN
## 2 B6 N516JB EWR FLL
## 3 B6 N593JB JFK MCO
# `\\s` matches whitespace; none of the carrier codes contains any.
filter(flights_small, str_detect(carrier, "\\s"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# `[ABD]` matches any one of the listed characters.
filter(flights_small, str_detect(carrier, "[ABD]"))
## # A tibble: 9 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 B6 N593JB JFK MCO
## 9 AA N3ALAA LGA ORD
# `[^ABD]` matches any character other than A, B or D, so a carrier is kept
# when it contains at least one such character (which is why "AA" is dropped).
filter(flights_small, str_detect(carrier, "[^ABD]"))
## # A tibble: 8 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 B6 N804JB JFK BQN
## 4 DL N668DN LGA ATL
## 5 UA N39463 EWR ORD
## 6 B6 N516JB EWR FLL
## 7 EV N829AS LGA IAD
## 8 B6 N593JB JFK MCO
Repetition
# `?` means zero or one occurrence. Zero occurrences still counts as a match,
# so every row is kept.
filter(flights_small, str_detect(carrier, "A?"))
## # A tibble: 10 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD
# `+` means one or more occurrences: the carrier must contain at least one "A".
filter(flights_small, str_detect(carrier, "A+"))
## # A tibble: 5 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 UA N39463 EWR ORD
## 5 AA N3ALAA LGA ORD
# `*` means zero or more occurrences; like `?`, it can match nothing at all,
# so every row is kept.
filter(flights_small, str_detect(carrier, "A*"))
## # A tibble: 10 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD
Grouping and Backreferences
# `(\\d\\d)\\1`: capture two digits, then require the same two digits again
# immediately afterwards.
filter(flights_small, str_detect(tailnum, "(\\d\\d)\\1"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# Nothing in the ten-row sample matches, so repeat the search on all flights.
flights %>%
  select(where(is.character)) %>%
  filter(str_detect(tailnum, "(\\d\\d)\\1"))
## # A tibble: 1,990 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 EV N15555 EWR MKE
## 2 EV N11119 LGA CLE
## 3 UA N14242 EWR TPA
## 4 EV N14143 EWR PIT
## 5 EV N15555 EWR SAV
## 6 UA N12125 EWR LAX
## 7 EV N15555 EWR PWM
## 8 EV N15555 EWR BUF
## 9 EV N15555 EWR RIC
## 10 EV N13133 EWR DTW
## # ℹ 1,980 more rows
Tools
Detect matches
# Count the tail numbers that end in "8". Naming the summary column avoids the
# back-ticked, expression-derived column name that summarise() would otherwise
# generate.
flights_small %>%
  summarise(n_ending_in_8 = sum(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
## n_ending_in_8
## <int>
## 1 1
# One TRUE/FALSE per tail number: does it end in "8"?
flights_small$tailnum %>% str_detect("8$")
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# Summing the logical vector counts the matches ...
flights_small$tailnum %>% str_detect("8$") %>% sum()
## [1] 1
# ... and averaging it gives the proportion of matches.
flights_small$tailnum %>% str_detect("8$") %>% mean()
## [1] 0.1
Grouped matches
# Crude "noun" finder: "a" or "the", one space, then the next run of non-space
# characters. Group 1 is the article, group 2 the word that follows it.
# NOTE(review): the pattern has no word boundary, so "a" can also match the
# tail of a longer word -- the "a helps" result looks like such a case.
noun <- "(a|the) ([^ ]+)"
# First ten sentences that contain a match.
has_noun <- head(str_subset(sentences, noun), 10)
# The full matched text for each of those sentences.
str_extract(has_noun, noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
# str_match() returns a character matrix: the whole match in column 1,
# followed by one column per capture group.
str_match(has_noun, noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"
Replacing matches
x <- c("apple", "pear", "banana")
# str_replace() substitutes only the first match in each string ...
x %>% str_replace("[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"
# ... whereas str_replace_all() substitutes every match.
x %>% str_replace_all("[aeiou]", "-")
## [1] "-ppl-" "p--r" "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
# A named vector applies several pattern = replacement pairs in one call.
x %>% str_replace_all(c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house" "two cars" "three people"
# Swap the second and third words of each sentence: \\1, \\2 and \\3 refer to
# the three captured words. Only the first five results are shown.
head(
  str_replace(sentences, "([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2"),
  5
)
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."
# Replace the leading capital letter of each tail number with "-".
mutate(flights_small, tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))
## # A tibble: 10 × 5
## carrier tailnum origin dest tailnum_rev
## <chr> <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH -14228
## 2 UA N24211 LGA IAH -24211
## 3 AA N619AA JFK MIA -619AA
## 4 B6 N804JB JFK BQN -804JB
## 5 DL N668DN LGA ATL -668DN
## 6 UA N39463 EWR ORD -39463
## 7 B6 N516JB EWR FLL -516JB
## 8 EV N829AS LGA IAD -829AS
## 9 B6 N593JB JFK MCO -593JB
## 10 AA N3ALAA LGA ORD -3ALAA
# Replace every capital letter in each tail number with "-".
mutate(flights_small, tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))
## # A tibble: 10 × 5
## carrier tailnum origin dest tailnum_rev
## <chr> <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH -14228
## 2 UA N24211 LGA IAH -24211
## 3 AA N619AA JFK MIA -619--
## 4 B6 N804JB JFK BQN -804--
## 5 DL N668DN LGA ATL -668--
## 6 UA N39463 EWR ORD -39463
## 7 B6 N516JB EWR FLL -516--
## 8 EV N829AS LGA IAD -829--
## 9 B6 N593JB JFK MCO -593--
## 10 AA N3ALAA LGA ORD -3----
Splitting
# Split the first five sentences on single spaces. The result is a list with
# one character vector per sentence.
str_split(head(sentences, 5), " ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
# "|" is the regex alternation operator, so it is escaped to split on the
# literal character. `[[1]]` pulls the single vector out of the returned list.
str_split("a|b|c|d", "\\|")[[1]]
## [1] "a" "b" "c" "d"
# simplify = TRUE returns a character matrix instead of a list; shorter rows
# are padded with "".
str_split(head(sentences, 5), " ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth" "planks."
## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue" "background."
## [3,] "It's" "easy" "to" "tell" "the" "depth" "of" "a"
## [4,] "These" "days" "a" "chicken" "leg" "is" "a" "rare"
## [5,] "Rice" "is" "often" "served" "in" "round" "bowls." ""
## [,9]
## [1,] ""
## [2,] ""
## [3,] "well."
## [4,] "dish."
## [5,] ""
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
# Split each "key: value" string on ": ", asking for at most two pieces, and
# return the result as a two-column character matrix.
str_split(fields, ": ", n = 2, simplify = TRUE)
## [,1] [,2]
## [1,] "Name" "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age" "35"
# Two sentences separated by TWO spaces. The double space is what produces the
# empty string when this value is later split on " ".
x <- "This is a sentence.  This is another sentence."
# Highlight every word. str_view_all() was deprecated in stringr 1.5.0;
# str_view() is its replacement and does not raise the lifecycle warning.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.  <This> <is> <another> <sentence>.
# Splitting on " " leaves punctuation attached to the words and yields ""
# wherever two spaces are adjacent.
x %>% str_split(" ") %>% .[[1]]
## [1] "This" "is" "a" "sentence." "" "This"
## [7] "is" "another" "sentence."
# Splitting on word boundaries returns just the words.
x %>% str_split(boundary("word")) %>% .[[1]]
## [1] "This" "is" "a" "sentence" "This" "is" "another"
## [8] "sentence"
Other types of patterns
# ignore_case = TRUE makes the match case-insensitive, so "^n" also matches
# tail numbers that start with an upper-case "N".
filter(flights_small, str_detect(tailnum, regex("^n", ignore_case = TRUE)))
## # A tibble: 10 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD
# Case-insensitive version of the "^E" anchor example: origins starting with
# "e" or "E".
filter(flights_small, str_detect(origin, regex("^e", ignore_case = TRUE)))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N39463 EWR ORD
## 3 B6 N516JB EWR FLL
bananas <- c("banana", "Banana", "BANANA")
# Matching is case-sensitive by default: only the all-lowercase element matches.
bananas %>% str_view("banana")
## [1] │ <banana>
# With ignore_case = TRUE all three spellings match.
bananas %>% str_view(regex("banana", ignore_case = TRUE))
## [1] │ <banana>
## [2] │ <Banana>
## [3] │ <BANANA>
x <- "Line 1\nLine 2\nLine 3"
# By default `^` anchors to the start of the whole string: a single match.
x %>% str_extract_all("^Line") %>% .[[1]]
## [1] "Line"
# multiline = TRUE lets `^` match at the start of every line.
x %>% str_extract_all(regex("^Line", multiline = TRUE)) %>% .[[1]]
## [1] "Line" "Line" "Line"
# Verbose pattern for a 3-3-4 digit phone number with three capture groups
# (area code, next three digits, last four digits).
# comments = TRUE lets the pattern carry layout whitespace and `#` comments;
# literal spaces are therefore escaped so they are not discarded.
# The final group is four digits: with three, the last digit of the number was
# silently dropped from the match.
phone <- regex("
  \\(?      # optional opening parens
  (\\d{3})  # area code
  [)\\ -]?  # optional closing parens, space, or dash
  (\\d{3})  # another three numbers
  [\\ -]?   # optional space or dash
  (\\d{4})  # four more numbers
  ", comments = TRUE)
str_match("514-791-8141", phone)
## [,1] [,2] [,3] [,4]
## [1,] "514-791-8141" "514" "791" "8141"
# Four "i"-like letters: I and i, plus the dotted capital İ and the dotless ı.
i <- c("I", "İ", "i", "ı")
i
## [1] "I" "İ" "i" "ı"
# coll() compares strings using collation rules. Ignoring case with the
# default locale pairs "i" with "I" ...
i %>% str_subset(coll("i", ignore_case = TRUE))
## [1] "I" "i"
# ... while with locale = "tr" (Turkish) "i" pairs with the dotted "İ".
i %>% str_subset(coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "İ" "i"
x <- "This is a sentence."
# Highlight each word. str_view() replaces str_view_all(), which was
# deprecated in stringr 1.5.0.
str_view(x, boundary("word"))
## [1] │ <This> <is> <a> <sentence>.
# Extract the words themselves; the trailing full stop is not part of a word.
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This" "is" "a" "sentence"