# Load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
# A double-quoted literal can contain single quotes without escaping.
char_data <- "I'm 'very'hungry"
# Number of characters in a string.
str_length(string = "I am hungry")
## [1] 11
# Join separate strings, inserting `sep` between the pieces.
str_c("I", " am", sep = ";")
## [1] "I; am"
# Collapse a character vector into a single string.
str_c(
  c("I", "am"),
  collapse = "/"
)
## [1] "I/am"
# Sort a character vector alphabetically.
str_sort(x = c("john", "mary", "aaron"))
## [1] "aaron" "john" "mary"
x <- c("Apple", "Banana", "Pear")
# Positive positions count from the start of each string ...
str_sub(x, start = 1, end = 3)
## [1] "App" "Ban" "Pea"
# ... negative positions count back from the end.
str_sub(x, start = -3, end = -1)
## [1] "ple" "ana" "ear"
# Small example table: only the character columns of `flights`,
# limited to the first ten rows.
flights_small <- flights %>%
  select(where(is.character)) %>%
  head(n = 10)
flights_small
## # A tibble: 10 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD
# A literal letter matches anywhere in the string; none of the ten origins
# contains "M", so no rows are returned.
flights_small %>%
  filter(str_detect(origin, pattern = "M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "." stands for any single character, so ".M" needs a character before "M".
flights_small %>%
  filter(str_detect(origin, pattern = ".M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "^" anchors the match to the start: origins beginning with "E".
flights_small %>%
  filter(str_detect(origin, pattern = "^E"))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N39463 EWR ORD
## 3 B6 N516JB EWR FLL
# "\\d" matches any digit: carrier codes that contain a number.
flights_small %>%
  filter(str_detect(carrier, pattern = "\\d"))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 B6 N804JB JFK BQN
## 2 B6 N516JB EWR FLL
## 3 B6 N593JB JFK MCO
# "\\s" matches whitespace; no carrier code contains any.
flights_small %>%
  filter(str_detect(carrier, pattern = "\\s"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "[ABD]" matches any one of the listed characters.
flights_small %>%
  filter(str_detect(carrier, pattern = "[ABD]"))
## # A tibble: 9 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 B6 N593JB JFK MCO
## 9 AA N3ALAA LGA ORD
# "[^ABD]" matches any character that is NOT listed, so a carrier made up
# only of A, B and D (here "AA") is dropped.
flights_small %>%
  filter(str_detect(carrier, pattern = "[^ABD]"))
## # A tibble: 8 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 B6 N804JB JFK BQN
## 4 DL N668DN LGA ATL
## 5 UA N39463 EWR ORD
## 6 B6 N516JB EWR FLL
## 7 EV N829AS LGA IAD
## 8 B6 N593JB JFK MCO
# "?" = 0 or 1 occurrence. Zero occurrences also count as a match, so every
# carrier matches and all ten rows are kept.
flights_small %>%
  filter(str_detect(carrier, pattern = "A?"))
## # A tibble: 10 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD
# "+" = 1 or more occurrences: only carriers that contain at least one "A".
flights_small %>%
  filter(str_detect(carrier, pattern = "A+"))
## # A tibble: 5 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 UA N39463 EWR ORD
## 5 AA N3ALAA LGA ORD
# "*" = 0 or more occurrences. As with "?", zero occurrences match, so all
# ten rows are kept.
flights_small %>%
  filter(str_detect(carrier, pattern = "A*"))
## # A tibble: 10 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD
# "(\\d)\\1": capture one digit, then require the same digit again via the
# backreference \\1 -- i.e. tail numbers containing a doubled digit.
flights_small %>%
  filter(str_detect(tailnum, pattern = "(\\d)\\1"))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 DL N668DN LGA ATL
# str_detect() returns a logical vector, so sum() counts the tail numbers
# ending in "8". The summary is given an explicit name; left unnamed, the
# column would be labelled with the whole expression text and need backticks
# to be referenced afterwards.
flights_small %>%
  summarise(n_ending_in_8 = sum(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
## n_ending_in_8
## <int>
## 1 1
# mean() of the same logical vector gives the proportion of matches.
flights_small %>%
  summarise(prop_ending_in_8 = mean(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
## prop_ending_in_8
## <dbl>
## 1 0.1
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
# Build a single alternation from the colour names and wrap it in word
# boundaries (\\b). Without the boundaries, "red" also matches inside words
# such as "reared", "shimmered" or "scared", so sentences with no colour in
# them were being returned.
colour_match <- str_c("\\b(", str_c(colours, collapse = "|"), ")\\b")
colour_match
## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
# Keep the sentences that contain a colour word; computed once and reused
# for the extraction below.
has_colour <- str_subset(sentences, colour_match)
has_colour
## [1] "Glue the sheet to the dark blue background."
## [2] "Two blue fish swam in the tank."
## [3] "A wisp of cloud hung in the blue air."
## [4] "Leaves turn brown and yellow in the fall."
## [5] "The spot on the blotter was made by green ink."
## [6] "The sofa cushion is red and of light weight."
## [7] "The sky that morning was clear and bright blue."
## [8] "A blue crane is a tall wading bird."
## [9] "It is hard to erase blue or red ink."
## [10] "The lamp shone with a steady green flame."
## [11] "The box is held by a bright red snapper."
## [12] "The houses are built of red clay bricks."
## [13] "The red tape bound the smuggled food."
## [14] "Hedge apples may stain your hands green."
## [15] "The plant grew large and green in the window."
## [16] "The purple tie was ten years old."
## [17] "Bathe and relax in the cool green grass."
## [18] "The lake sparkled in the red hot sun."
## [19] "Mark the spot with a sign painted red."
## [20] "The couch cover and hall drapes were blue."
## [21] "A man in a blue sweater sat at the desk."
## [22] "The small red neon lamp went out."
## [23] "Paint the sockets in the wall dull green."
## [24] "Wake and rise, and step into the green outdoors."
## [25] "The green light in the brown box flickered."
## [26] "Tear a thin sheet from the yellow pad."
## [27] "The sky in the west is tinged with orange red."
## [28] "The red paper brightened the dim stage."
## [29] "The big red apple fell to the ground."
# str_extract() returns the first colour word found in each sentence.
str_extract(has_colour, colour_match)
## [1] "blue" "blue" "blue" "yellow" "green" "red" "blue" "blue"
## [9] "blue" "green" "red" "red" "red" "green" "green" "purple"
## [17] "green" "red" "red" "blue" "blue" "red" "green" "green"
## [25] "green" "yellow" "orange" "red" "red"
# Rough noun finder: an article ("a" or "the") followed by the next run of
# non-space characters. This is only a heuristic -- the word after the
# article may be an adjective (e.g. "the smooth").
# The leading \\b requires the article to start at a word boundary; without
# it the final "a" of a longer word could stand in for the article, which
# produced the non-phrase "a helps".
noun <- "\\b(a|the) ([^ ]+)"
has_noun <- str_subset(sentences, noun) %>%
  head(n = 10)
has_noun %>%
  str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "the man's"
# str_replace() changes only the first match in each string; with the "^"
# anchor that is the leading capital letter of the tail number.
flights_small %>%
  mutate(tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))
## # A tibble: 10 × 5
## carrier tailnum origin dest tailnum_rev
## <chr> <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH -14228
## 2 UA N24211 LGA IAH -24211
## 3 AA N619AA JFK MIA -619AA
## 4 B6 N804JB JFK BQN -804JB
## 5 DL N668DN LGA ATL -668DN
## 6 UA N39463 EWR ORD -39463
## 7 B6 N516JB EWR FLL -516JB
## 8 EV N829AS LGA IAD -829AS
## 9 B6 N593JB JFK MCO -593JB
## 10 AA N3ALAA LGA ORD -3ALAA
# str_replace_all() changes every match: each capital letter becomes "-".
flights_small %>%
  mutate(tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))
## # A tibble: 10 × 5
## carrier tailnum origin dest tailnum_rev
## <chr> <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH -14228
## 2 UA N24211 LGA IAH -24211
## 3 AA N619AA JFK MIA -619--
## 4 B6 N804JB JFK BQN -804--
## 5 DL N668DN LGA ATL -668--
## 6 UA N39463 EWR ORD -39463
## 7 B6 N516JB EWR FLL -516--
## 8 EV N829AS LGA IAD -829--
## 9 B6 N593JB JFK MCO -593--
## 10 AA N3ALAA LGA ORD -3----
# Split the first sentence on spaces into at most two pieces (n = 2): the
# first word and the remainder.
str_split(sentences[1], " ", n = 2)
## [[1]]
## [1] "The"
## [2] "birch canoe slid on the smooth planks."
# regex(..., ignore_case = TRUE) makes the lowercase "^n" also match tail
# numbers that start with an uppercase "N".
flights_small %>%
  filter(
    str_detect(tailnum, regex("^n", ignore_case = TRUE))
  )
## # A tibble: 10 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 EV N829AS LGA IAD
## 9 B6 N593JB JFK MCO
## 10 AA N3ALAA LGA ORD