library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
library(devtools)
## Loading required package: usethis
# Report the library directory that the devtools package is installed in.
find.package(package = "devtools")
## [1] "C:/Users/James/OneDrive/Documents/R/R-4.3.2/library/devtools"
# Check whether a usable Rtools installation can be located (TRUE/FALSE).
devtools::find_rtools()
## [1] TRUE
# Flag flights whose air_time is at least 6 * 60 (six hours, assuming air_time
# is recorded in minutes -- TODO confirm) and open the result in the
# interactive data viewer.
flights %>%
  mutate(long_flight = air_time >= 6 * 60) %>%
  View()
# Same flag as above, but tabulated: how many flights are long, not long,
# or have a missing air_time (NA).
mutate(flights, long_flight = air_time >= 6 * 60) %>%
  count(long_flight)
## # A tibble: 3 × 2
## long_flight n
## <lgl> <int>
## 1 FALSE 322630
## 2 TRUE 4716
## 3 NA 9430
# count() can create the grouping column on the fly, so no separate mutate()
# step is required.
count(flights, long_flight = air_time >= 6 * 60)
## # A tibble: 3 × 2
## long_flight n
## <lgl> <int>
## 1 FALSE 322630
## 2 TRUE 4716
## 3 NA 9430
# Per-day summary: number of flights and mean air time (missing values
# ignored), keyed by a proper Date built from year/month/day.
flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarise(
    flights_n = n(),
    air_time_mean = mean(air_time, na.rm = TRUE)
  ) %>%
  ungroup()
## # A tibble: 365 × 3
## date flights_n air_time_mean
## <date> <int> <dbl>
## 1 2013-01-01 842 170.
## 2 2013-01-02 943 162.
## 3 2013-01-03 914 157.
## 4 2013-01-04 915 151.
## 5 2013-01-05 720 161.
## 6 2013-01-06 832 160.
## 7 2013-01-07 933 145.
## 8 2013-01-08 899 149.
## 9 2013-01-09 902 153.
## 10 2013-01-10 932 147.
## # ℹ 355 more rows
# Draw a random sample of exactly 15 rows.
slice_sample(flights, n = 15)
## # A tibble: 15 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 10 559 600 -1 817 829
## 2 2013 4 29 654 659 -5 931 1000
## 3 2013 4 19 2250 2057 113 234 2359
## 4 2013 7 13 1850 1815 35 2117 2044
## 5 2013 3 19 1700 1700 0 2045 2110
## 6 2013 1 17 2130 2000 90 2340 2137
## 7 2013 11 11 835 843 -8 1029 1051
## 8 2013 12 31 817 817 0 934 946
## 9 2013 1 1 1925 1900 25 2259 2238
## 10 2013 4 12 630 630 0 803 820
## 11 2013 7 21 1457 1500 -3 1807 1724
## 12 2013 6 4 633 640 -7 751 806
## 13 2013 12 14 1351 1345 6 1712 1705
## 14 2013 7 23 1927 1915 12 2238 2230
## 15 2013 2 6 1314 1310 4 1407 1419
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Draw a random sample containing 15% of the rows.
slice_sample(flights, prop = 0.15)
## # A tibble: 50,516 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 5 24 1237 1115 82 1520 1410
## 2 2013 8 26 1733 1735 -2 1959 2030
## 3 2013 8 30 1642 1645 -3 1755 1820
## 4 2013 1 3 829 834 -5 1046 1039
## 5 2013 7 13 1305 1314 -9 1622 1620
## 6 2013 6 18 1716 1620 56 1937 1853
## 7 2013 9 12 2339 2200 99 221 48
## 8 2013 3 12 2026 2030 -4 2313 2306
## 9 2013 4 3 1824 1829 -5 2027 2038
## 10 2013 5 21 603 610 -7 716 745
## # ℹ 50,506 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Keep only the date components and combine them into a single Date column.
select(flights, year, month, day) %>%
  mutate(date = make_date(year = year, month = month, day = day))
## # A tibble: 336,776 × 4
## year month day date
## <int> <int> <int> <date>
## 1 2013 1 1 2013-01-01
## 2 2013 1 1 2013-01-01
## 3 2013 1 1 2013-01-01
## 4 2013 1 1 2013-01-01
## 5 2013 1 1 2013-01-01
## 6 2013 1 1 2013-01-01
## 7 2013 1 1 2013-01-01
## 8 2013 1 1 2013-01-01
## 9 2013 1 1 2013-01-01
## 10 2013 1 1 2013-01-01
## # ℹ 336,766 more rows
# Strings that each embed one number among other characters.
numbers_1 <- tibble(
  number = c("#1", "Number8", "How are you 3")
)
# parse_number() extracts the numeric part of each string.
mutate(numbers_1, number = parse_number(number))
## # A tibble: 3 × 1
## number
## <dbl>
## 1 1
## 2 8
## 3 3
# Columns whose names begin with "dep_".
select(flights, starts_with("dep_"))
## # A tibble: 336,776 × 2
## dep_time dep_delay
## <int> <dbl>
## 1 517 2
## 2 533 4
## 3 542 2
## 4 544 -1
## 5 554 -6
## 6 554 -4
## 7 555 -5
## 8 557 -3
## 9 557 -3
## 10 558 -2
## # ℹ 336,766 more rows
# Columns whose names end in "hour".
select(flights, ends_with("hour"))
## # A tibble: 336,776 × 2
## hour time_hour
## <dbl> <dttm>
## 1 5 2013-01-01 05:00:00
## 2 5 2013-01-01 05:00:00
## 3 5 2013-01-01 05:00:00
## 4 5 2013-01-01 05:00:00
## 5 6 2013-01-01 06:00:00
## 6 5 2013-01-01 05:00:00
## 7 6 2013-01-01 06:00:00
## 8 6 2013-01-01 06:00:00
## 9 6 2013-01-01 06:00:00
## 10 6 2013-01-01 06:00:00
## # ℹ 336,766 more rows
# Columns whose names contain "hour" anywhere.
select(flights, contains("hour"))
## # A tibble: 336,776 × 2
## hour time_hour
## <dbl> <dttm>
## 1 5 2013-01-01 05:00:00
## 2 5 2013-01-01 05:00:00
## 3 5 2013-01-01 05:00:00
## 4 5 2013-01-01 05:00:00
## 5 6 2013-01-01 06:00:00
## 6 5 2013-01-01 05:00:00
## 7 6 2013-01-01 06:00:00
## 8 6 2013-01-01 06:00:00
## 9 6 2013-01-01 06:00:00
## 10 6 2013-01-01 06:00:00
## # ℹ 336,766 more rows
# Relabel Newark (EWR) departures by whether the departure delay exceeds 20.
# No default branch is supplied, so every row matching neither condition
# (other origins, or a missing dep_delay) becomes NA.
flights %>%
  mutate(
    origin = case_when(
      origin == "EWR" & dep_delay > 20 ~
        "Newark International Airport - DELAYED",
      origin == "EWR" & dep_delay <= 20 ~
        "Newark International Airport - ON TIME DEPARTURE"
    )
  ) %>%
  count(origin)
## # A tibble: 3 × 2
## origin n
## <chr> <int>
## 1 Newark International Airport - DELAYED 25304
## 2 Newark International Airport - ON TIME DEPARTURE 92292
## 3 <NA> 219180
# Replace airport codes with full names. The patterns are anchored with ^ and
# $ so only whole-value matches are rewritten; codes without a pattern are
# left unchanged.
flights %>%
  mutate(
    origin = str_replace_all(
      origin,
      c(
        "^EWR$" = "Newark International",
        "^JFK$" = "John F. Kennedy International"
      )
    )
  ) %>%
  count(origin)
## # A tibble: 3 × 2
## origin n
## <chr> <int>
## 1 John F. Kennedy International 111279
## 2 LGA 104662
## 3 Newark International 120835
# Keep only the flights of carriers with at least 10,000 flights in total.
# Per-operation grouping via `.by` returns an ungrouped result, so no
# explicit ungroup() is needed.
flights_top_carriers <- filter(flights, n() >= 10000, .by = carrier)
# Airlines whose name starts with "Am".
beginning_with_am <- airlines %>%
  filter(str_detect(name, "^Am"))
# Drop every flight operated by one of those airlines.
anti_join(flights, beginning_with_am, by = join_by(carrier))
## # A tibble: 304,047 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 600 -6 812 837
## 5 2013 1 1 554 558 -4 740 728
## 6 2013 1 1 555 600 -5 913 854
## 7 2013 1 1 557 600 -3 709 723
## 8 2013 1 1 557 600 -3 838 846
## 9 2013 1 1 558 600 -2 849 851
## 10 2013 1 1 558 600 -2 853 856
## # ℹ 304,037 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Attach the airline lookup columns to every flight, matching on the carrier
# code.
airline_names <- left_join(flights, airlines, by = join_by(carrier))
# Bar chart of the number of flights per airline, with bars in ggplot2's
# default order for `name`.
airline_names %>%
  count(name) %>%
  ggplot(mapping = aes(x = name, y = n)) +
  geom_col()

# Same chart, but with `name` turned into a factor ordered by flight count so
# the bars are sorted by height.
airline_names %>%
  count(name) %>%
  mutate(name = fct_reorder(.f = name, .x = n)) %>%
  ggplot(mapping = aes(x = name, y = n)) +
  geom_col()

# Sorted bar chart again, with the axes flipped so the airline names run along
# the vertical axis.
airline_names %>%
  count(name) %>%
  mutate(name = fct_reorder(.f = name, .x = n)) %>%
  ggplot(mapping = aes(x = name, y = n)) +
  geom_col() +
  coord_flip()

# Build every combination of the three categorical variables
# (2 x 2 x 4 = 16 rows).
crossing(
  customer_channel = c("Bus", "Car"),
  customer_status = c("New", "Repeat"),
  spend_range = c("$0-$10", "$10-$20", "$20-$50", "$50+")
)
## # A tibble: 16 × 3
## customer_channel customer_status spend_range
## <chr> <chr> <chr>
## 1 Bus New $0-$10
## 2 Bus New $10-$20
## 3 Bus New $20-$50
## 4 Bus New $50+
## 5 Bus Repeat $0-$10
## 6 Bus Repeat $10-$20
## 7 Bus Repeat $20-$50
## 8 Bus Repeat $50+
## 9 Car New $0-$10
## 10 Car New $10-$20
## 11 Car New $20-$50
## 12 Car New $50+
## 13 Car Repeat $0-$10
## 14 Car Repeat $10-$20
## 15 Car Repeat $20-$50
## 16 Car Repeat $50+
#' Summarise selected columns with min, max, median and mean
#'
#' NOTE: this definition masks `base::summary()` for the rest of the session;
#' the name is kept so that existing calls continue to work.
#'
#' @param data A data frame or tibble. Grouping set with `group_by()` is
#'   honoured by `summarise()`, giving one row per group.
#' @param col_names Columns to summarise, given as a tidy-select expression,
#'   e.g. `c(air_time, arr_delay)`.
#' @param na.rm Forwarded to every statistic; `TRUE` (the default) drops
#'   missing values before computing.
#' @return A tibble with one `<column>_<statistic>` column for each selected
#'   column and statistic (plus any grouping columns).
summary <- function(data, col_names, na.rm = TRUE) {
  data %>%
    summarise(across(
      {{ col_names }},
      # Passing `na.rm` through the `...` of across() is deprecated as of
      # dplyr 1.1.0, so each statistic is wrapped in an anonymous function
      # that supplies it explicitly.
      list(
        min = function(x) min(x, na.rm = na.rm),
        max = function(x) max(x, na.rm = na.rm),
        median = function(x) median(x, na.rm = na.rm),
        mean = function(x) mean(x, na.rm = na.rm)
      ),
      # `{.col}` / `{.fn}` are the documented glue placeholders.
      .names = "{.col}_{.fn}"
    ))
}
# Overall statistics for air time and arrival delay across all flights.
summary(airline_names, c(air_time, arr_delay))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 1 × 8
## air_time_min air_time_max air_time_median air_time_mean arr_delay_min
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 20 695 129 151. -86
## # ℹ 3 more variables: arr_delay_max <dbl>, arr_delay_median <dbl>,
## # arr_delay_mean <dbl>
# The same statistics, computed separately for each carrier.
summary(group_by(airline_names, carrier), c(air_time, arr_delay))
## # A tibble: 16 × 9
## carrier air_time_min air_time_max air_time_median air_time_mean arr_delay_min
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 9E 21 272 83 86.8 -68
## 2 AA 29 426 169 189. -75
## 3 AS 277 392 324 326. -74
## 4 B6 29 413 142 151. -71
## 5 DL 26 490 145 174. -71
## 6 EV 20 286 87 90.1 -62
## 7 F9 195 278 229 230. -47
## 8 FL 53 161 109 101. -44
## 9 HA 580 691 622. 623. -70
## 10 MQ 33 236 83 91.2 -53
## 11 OO 50 177 68 83.5 -26
## 12 UA 23 695 197 212. -75
## 13 US 21 359 76 88.6 -70
## 14 VX 264 406 337 337. -86
## 15 WN 31 362 122 148. -58
## 16 YV 32 122 56.5 65.7 -46
## # ℹ 3 more variables: arr_delay_max <dbl>, arr_delay_median <dbl>,
## # arr_delay_mean <dbl>