knitr::opts_chunk$set(echo = TRUE)
# Load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl) # for importing excel files
library(janitor) # cleaning data
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(nycflights13)
# A double-quoted string can hold single quotes without any escaping.
chac_data <- "I'm 'very' hungry."
# str_length() counts every character, spaces and punctuation included.
stringr::str_length("I am hungry.")
## [1] 12
# collapse joins the elements of ONE vector. The separator already supplies
# the space, so the pieces carry no padding of their own (a leading space in
# "am" would produce a double space).
stringr::str_c(c("I", "am"), collapse = " ")
## [1] "I am"
# sep is placed between SEPARATE arguments; the space inside " am" is kept.
stringr::str_c("I", " am", sep = ";")
## [1] "I; am"
# str_sort() returns the vector in sorted order.
str_sort(c("John", "Mary", "Aaron"))
## [1] "Aaron" "John" "Mary"
# Column-by-column overview of flights: name, type and the first few values.
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
# Small working sample: only the character columns, first ten rows.
flights_small <- flights %>%
  select(where(is.character)) %>%
  head(n = 10)
# Plain literal: destinations that contain a capital "M".
filter(flights_small, str_detect(dest, "M"))
## # A tibble: 2 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 AA N619AA JFK MIA
## 2 B6 N593JB JFK MCO
# "." stands for any single character, so ".M" needs something before the M.
filter(flights_small, str_detect(dest, ".M"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "M." needs some character after the M.
filter(flights_small, str_detect(dest, "M."))
## # A tibble: 2 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 AA N619AA JFK MIA
## 2 B6 N593JB JFK MCO
# "\\." escapes the dot: an M followed by a literal full stop.
filter(flights_small, str_detect(dest, "M\\."))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "$" anchors the match to the end of the string: origins ending in "E".
filter(flights_small, str_detect(origin, "E$"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "\\d" matches any digit: carrier codes that contain a number.
filter(flights_small, str_detect(carrier, "\\d"))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 B6 N804JB JFK BQN
## 2 B6 N516JB EWR FLL
## 3 B6 N593JB JFK MCO
# "\\s" matches whitespace: carrier codes that contain a space.
filter(flights_small, str_detect(carrier, "\\s"))
## # A tibble: 0 × 4
## # ℹ 4 variables: carrier <chr>, tailnum <chr>, origin <chr>, dest <chr>
# "[ABD]" matches one character from the set: codes with an A, B or D.
filter(flights_small, str_detect(carrier, "[ABD]"))
## # A tibble: 9 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 AA N619AA JFK MIA
## 4 B6 N804JB JFK BQN
## 5 DL N668DN LGA ATL
## 6 UA N39463 EWR ORD
## 7 B6 N516JB EWR FLL
## 8 B6 N593JB JFK MCO
## 9 AA N3ALAA LGA ORD
# "[^ABD]" negates the set: codes with at least one character that is NOT
# A, B or D. "UA" passes because of the U; "AA" has no such character.
filter(flights_small, str_detect(carrier, "[^ABD]"))
## # A tibble: 8 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N24211 LGA IAH
## 3 B6 N804JB JFK BQN
## 4 DL N668DN LGA ATL
## 5 UA N39463 EWR ORD
## 6 B6 N516JB EWR FLL
## 7 EV N829AS LGA IAD
## 8 B6 N593JB JFK MCO
# ? : zero or one of the preceding "A". Zero is allowed, so the pattern can
# match nothing at all and every row passes the filter.
filter(flights, str_detect(carrier, "A?"))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# + : one or more of the preceding "A", so the carrier code must contain
# at least one "A".
filter(flights, str_detect(carrier, "A+"))
## # A tibble: 92,450 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 558 600 -2 753 745
## 6 2013 1 1 558 600 -2 924 917
## 7 2013 1 1 558 600 -2 923 937
## 8 2013 1 1 559 600 -1 941 910
## 9 2013 1 1 559 600 -1 854 902
## 10 2013 1 1 606 610 -4 858 910
## # ℹ 92,440 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# * : zero or more of the preceding "A". As with ?, zero occurrences count
# as a match, so no row is filtered out.
filter(flights, str_detect(carrier, "A*"))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# (..)\\1 : a group followed by a backreference to it. Here the group is two
# digits, so the pattern finds tail numbers where a digit pair is immediately
# repeated (e.g. "5555", "4242").
flights %>%
  select(where(is.character)) %>%
  filter(str_detect(tailnum, "(\\d{2})\\1"))
## # A tibble: 1,990 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 EV N15555 EWR MKE
## 2 EV N11119 LGA CLE
## 3 UA N14242 EWR TPA
## 4 EV N14143 EWR PIT
## 5 EV N15555 EWR SAV
## 6 UA N12125 EWR LAX
## 7 EV N15555 EWR PWM
## 8 EV N15555 EWR BUF
## 9 EV N15555 EWR RIC
## 10 EV N13133 EWR DTW
## # ℹ 1,980 more rows
# How many tail numbers in the sample end in "8"? Inside summarise() the
# logical vector from str_detect() is summed (TRUE counts as 1).
summarise(flights_small, sum(str_detect(tailnum, "8$")))
## # A tibble: 1 × 1
## `sum(str_detect(tailnum, "8$"))`
## <int>
## 1 1
# The same check on the bare column: one TRUE/FALSE per tail number.
flights_small$tailnum %>% str_detect("8$")
## [1] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# sum() gives the number of matches ...
flights_small$tailnum %>% str_detect("8$") %>% sum()
## [1] 1
# ... and mean() the proportion of matches.
flights_small$tailnum %>% str_detect("8$") %>% mean()
## [1] 0.1
# Colour names to search for. str_c(collapse = "|") glues them into a single
# alternation pattern ("red" OR "orange" OR ...).
colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
# Extract the first colour named in each matching sentence.
# str_subset() keeps the sentences that match; str_extract() then returns the
# first match in each. The pattern has no word boundaries, so a colour name
# embedded in a longer word is matched too.
has_colour <- str_subset(sentences, colour_match)
str_extract(has_colour, colour_match)
# NOTE(review): the result printed below was captured while the vector still
# read "pueple" instead of "purple"; re-knit to refresh it, because any
# sentence mentioning purple now qualifies as well.
## [1] "blue" "blue" "red" "red" "red" "blue" "yellow" "red"
## [9] "red" "green" "red" "red" "blue" "red" "red" "red"
## [17] "red" "blue" "red" "blue" "red" "green" "red" "red"
## [25] "red" "red" "red" "red" "green" "red" "green" "red"
## [33] "green" "red" "red" "red" "red" "red" "blue" "red"
## [41] "blue" "red" "red" "red" "red" "green" "green" "green"
## [49] "red" "red" "yellow" "red" "orange" "red" "red" "red"
# Extract strings with a noun
# Heuristic: an article ("a" or "the") followed by the next run of non-space
# characters. The leading \\b forces the article to start at a word boundary;
# without it the "a" that ends another word matches too ("tea helps" gave
# "a helps"). [^ ]+ stops only at a space, so sentence-final punctuation is
# part of the extracted text.
noun <- "\\b(a|the) ([^ ]+)"
had_nouns <- str_subset(sentences, noun) %>% head(10)
had_nouns %>% str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "the evening."
# str_replace() rewrites the first match only: a capital letter at the very
# start of the tail number becomes a dash.
mutate(flights_small, tailnum_rev = str_replace(tailnum, "^[A-Z]", "-"))
## # A tibble: 10 × 5
## carrier tailnum origin dest tailnum_rev
## <chr> <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH -14228
## 2 UA N24211 LGA IAH -24211
## 3 AA N619AA JFK MIA -619AA
## 4 B6 N804JB JFK BQN -804JB
## 5 DL N668DN LGA ATL -668DN
## 6 UA N39463 EWR ORD -39463
## 7 B6 N516JB EWR FLL -516JB
## 8 EV N829AS LGA IAD -829AS
## 9 B6 N593JB JFK MCO -593JB
## 10 AA N3ALAA LGA ORD -3ALAA
# str_replace_all() rewrites every match, not just the first one. The pattern
# is deliberately unanchored: with a leading "^" only the first character can
# ever match, which makes the call indistinguishable from str_replace().
flights_small %>%
  mutate(tailnum_rev = str_replace_all(tailnum, "[A-Z]", "-"))
## # A tibble: 10 × 5
## carrier tailnum origin dest tailnum_rev
## <chr> <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH -14228
## 2 UA N24211 LGA IAH -24211
## 3 AA N619AA JFK MIA -619--
## 4 B6 N804JB JFK BQN -804--
## 5 DL N668DN LGA ATL -668--
## 6 UA N39463 EWR ORD -39463
## 7 B6 N516JB EWR FLL -516--
## 8 EV N829AS LGA IAD -829--
## 9 B6 N593JB JFK MCO -593--
## 10 AA N3ALAA LGA ORD -3----
# Split the first sentence on spaces into at most three pieces; the last
# piece keeps the unsplit remainder. simplify = TRUE returns a character
# matrix rather than a list.
str_split(sentences[1], " ", n = 3, simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "The" "birch" "canoe slid on the smooth planks."
# regex(ignore_case = TRUE) makes the match case-insensitive, so "^e" finds
# origins that start with an upper-case "E".
filter(flights_small, str_detect(origin, regex("^e", ignore_case = TRUE)))
## # A tibble: 3 × 4
## carrier tailnum origin dest
## <chr> <chr> <chr> <chr>
## 1 UA N14228 EWR IAH
## 2 UA N39463 EWR ORD
## 3 B6 N516JB EWR FLL