library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(nycflights13)
library(devtools)
## Loading required package: usethis
# Environment check: print the library path devtools was loaded from.
find.package("devtools")
## [1] "C:/Users/James/OneDrive/Documents/R/R-4.3.2/library/devtools"
# Environment check: report whether an Rtools installation can be found
# (the recorded run printed TRUE).
find_rtools()
## [1] TRUE
# Open the flights table in the data viewer with an extra logical column,
# long_flight, that is TRUE when air_time >= 6 * 60 (six hours if air_time is
# recorded in minutes). View() needs a GUI session, so it is skipped when the
# script is run non-interactively (Rscript, knitr, CI).
if (interactive()) {
  flights %>%
    mutate(long_flight = air_time >= 6 * 60) %>%
    View()
}
# Two-step version: first derive the long_flight flag, then tally it.
# Rows with a missing air_time end up in their own NA bucket.
flights %>%
  mutate(long_flight = air_time >= 6 * 60) %>%
  count(long_flight)
## # A tibble: 3 × 2
##   long_flight      n
##   <lgl>        <int>
## 1 FALSE       322630
## 2 TRUE          4716
## 3 NA            9430
# One-step version: count() can create the grouping column on the fly, so no
# separate mutate() is required.
count(flights, long_flight = air_time >= 6 * 60)
## # A tibble: 3 × 2
##   long_flight      n
##   <lgl>        <int>
## 1 FALSE       322630
## 2 TRUE          4716
## 3 NA            9430
# Per-day summary: number of flights and mean air time (ignoring NAs) for each
# calendar date built from the year / month / day columns.
flights %>%
  mutate(date = make_date(year, month, day)) %>%
  group_by(date) %>%
  summarise(
    flights_n = n(),
    air_time_mean = mean(air_time, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 365 × 3
##    date       flights_n air_time_mean
##    <date>         <int>         <dbl>
##  1 2013-01-01       842          170.
##  2 2013-01-02       943          162.
##  3 2013-01-03       914          157.
##  4 2013-01-04       915          151.
##  5 2013-01-05       720          161.
##  6 2013-01-06       832          160.
##  7 2013-01-07       933          145.
##  8 2013-01-08       899          149.
##  9 2013-01-09       902          153.
## 10 2013-01-10       932          147.
## # ℹ 355 more rows
# Draw a fixed number (15) of rows at random.
slice_sample(flights, n = 15)
## # A tibble: 15 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    10    10      559            600        -1      817            829
##  2  2013     4    29      654            659        -5      931           1000
##  3  2013     4    19     2250           2057       113      234           2359
##  4  2013     7    13     1850           1815        35     2117           2044
##  5  2013     3    19     1700           1700         0     2045           2110
##  6  2013     1    17     2130           2000        90     2340           2137
##  7  2013    11    11      835            843        -8     1029           1051
##  8  2013    12    31      817            817         0      934            946
##  9  2013     1     1     1925           1900        25     2259           2238
## 10  2013     4    12      630            630         0      803            820
## 11  2013     7    21     1457           1500        -3     1807           1724
## 12  2013     6     4      633            640        -7      751            806
## 13  2013    12    14     1351           1345         6     1712           1705
## 14  2013     7    23     1927           1915        12     2238           2230
## 15  2013     2     6     1314           1310         4     1407           1419
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Draw a random 15% of the rows.
slice_sample(flights, prop = 0.15)
## # A tibble: 50,516 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     5    24     1237           1115        82     1520           1410
##  2  2013     8    26     1733           1735        -2     1959           2030
##  3  2013     8    30     1642           1645        -3     1755           1820
##  4  2013     1     3      829            834        -5     1046           1039
##  5  2013     7    13     1305           1314        -9     1622           1620
##  6  2013     6    18     1716           1620        56     1937           1853
##  7  2013     9    12     2339           2200        99      221             48
##  8  2013     3    12     2026           2030        -4     2313           2306
##  9  2013     4     3     1824           1829        -5     2027           2038
## 10  2013     5    21      603            610        -7      716            745
## # ℹ 50,506 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Combine the separate year / month / day columns into a single Date column.
# .keep = "used" retains only the columns referenced in the expression
# (year, month, day) alongside the new date column.
mutate(
  flights,
  date = make_date(year, month, day),
  .keep = "used"
)
## # A tibble: 336,776 × 4
##     year month   day date      
##    <int> <int> <int> <date>    
##  1  2013     1     1 2013-01-01
##  2  2013     1     1 2013-01-01
##  3  2013     1     1 2013-01-01
##  4  2013     1     1 2013-01-01
##  5  2013     1     1 2013-01-01
##  6  2013     1     1 2013-01-01
##  7  2013     1     1 2013-01-01
##  8  2013     1     1 2013-01-01
##  9  2013     1     1 2013-01-01
## 10  2013     1     1 2013-01-01
## # ℹ 336,766 more rows
# Free-text values that each contain exactly one number.
numbers_1 <- tibble(
  number = c("#1", "Number8", "How are you 3")
)
# parse_number() drops the surrounding non-numeric text and returns the number
# as a double.
mutate(numbers_1, number = parse_number(number))
## # A tibble: 3 × 1
##   number
##    <dbl>
## 1      1
## 2      8
## 3      3
# Keep only the columns whose names begin with "dep_".
select(flights, starts_with("dep_"))
## # A tibble: 336,776 × 2
##    dep_time dep_delay
##       <int>     <dbl>
##  1      517         2
##  2      533         4
##  3      542         2
##  4      544        -1
##  5      554        -6
##  6      554        -4
##  7      555        -5
##  8      557        -3
##  9      557        -3
## 10      558        -2
## # ℹ 336,766 more rows
# Keep only the columns whose names end in "hour".
select(flights, ends_with("hour"))
## # A tibble: 336,776 × 2
##     hour time_hour          
##    <dbl> <dttm>             
##  1     5 2013-01-01 05:00:00
##  2     5 2013-01-01 05:00:00
##  3     5 2013-01-01 05:00:00
##  4     5 2013-01-01 05:00:00
##  5     6 2013-01-01 06:00:00
##  6     5 2013-01-01 05:00:00
##  7     6 2013-01-01 06:00:00
##  8     6 2013-01-01 06:00:00
##  9     6 2013-01-01 06:00:00
## 10     6 2013-01-01 06:00:00
## # ℹ 336,766 more rows
# Keep only the columns whose names contain "hour" anywhere.
select(flights, contains("hour"))
## # A tibble: 336,776 × 2
##     hour time_hour          
##    <dbl> <dttm>             
##  1     5 2013-01-01 05:00:00
##  2     5 2013-01-01 05:00:00
##  3     5 2013-01-01 05:00:00
##  4     5 2013-01-01 05:00:00
##  5     6 2013-01-01 06:00:00
##  6     5 2013-01-01 05:00:00
##  7     6 2013-01-01 06:00:00
##  8     6 2013-01-01 06:00:00
##  9     6 2013-01-01 06:00:00
## 10     6 2013-01-01 06:00:00
## # ℹ 336,766 more rows
# Relabel Newark (EWR) departures by whether they left more than 20 minutes
# late, then tally the labels.
# NOTE: there is no default branch, so every row that matches neither condition
# (other origins, or a missing dep_delay) gets NA in `origin`.
flights %>%
  mutate(
    origin = case_when(
      origin == "EWR" & dep_delay > 20 ~ "Newark International Airport - DELAYED",
      origin == "EWR" & dep_delay <= 20 ~ "Newark International Airport - ON TIME DEPARTURE"
    )
  ) %>%
  count(origin)
## # A tibble: 3 × 2
##   origin                                                n
##   <chr>                                             <int>
## 1 Newark International Airport - DELAYED            25304
## 2 Newark International Airport - ON TIME DEPARTURE  92292
## 3 <NA>                                             219180
# Replace airport codes with full names using a named vector of
# pattern = replacement pairs. The ^...$ anchors force whole-string matches;
# codes without an entry (e.g. LGA) are left unchanged.
flights %>%
  mutate(
    origin = str_replace_all(
      origin,
      c(
        "^EWR$" = "Newark International",
        "^JFK$" = "John F. Kennedy International"
      )
    )
  ) %>%
  count(origin)
## # A tibble: 3 × 2
##   origin                             n
##   <chr>                          <int>
## 1 John F. Kennedy International 111279
## 2 LGA                           104662
## 3 Newark International          120835
# Keep only flights operated by carriers with at least 10,000 flights in the
# data. Per-operation grouping (.by) evaluates n() within each carrier and
# returns an ungrouped tibble with the original row order.
flights_top_carriers <- filter(
  flights,
  n() >= 10000,
  .by = carrier
)
# Airlines whose name starts with "Am".
beginning_with_am <- airlines %>%
  filter(str_detect(name, "^Am"))

# Filtering join: drop every flight whose carrier appears in that lookup table.
anti_join(flights, beginning_with_am, by = "carrier")
## # A tibble: 304,047 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            600        -6      812            837
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      555            600        -5      913            854
##  7  2013     1     1      557            600        -3      709            723
##  8  2013     1     1      557            600        -3      838            846
##  9  2013     1     1      558            600        -2      849            851
## 10  2013     1     1      558            600        -2      853            856
## # ℹ 304,037 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Attach the full airline name to every flight via the carrier code.
airline_names <- left_join(flights, airlines, by = "carrier")
# Bar chart of flights per airline; bars appear in the default (alphabetical)
# order of the airline names.
airline_names %>%
  count(name) %>%
  ggplot(aes(x = name, y = n)) +
  geom_col()

# Same chart, but with the airline factor reordered by flight count so the
# bars are sorted by height.
airline_names %>%
  count(name) %>%
  mutate(name = fct_reorder(name, n)) %>%
  ggplot(aes(x = name, y = n)) +
  geom_col()

# Sorted chart with the axes flipped, so the airline names are laid out along
# the vertical axis.
airline_names %>%
  count(name) %>%
  mutate(name = fct_reorder(name, n)) %>%
  ggplot(aes(x = name, y = n)) +
  geom_col() +
  coord_flip()

# Build every combination of the supplied values (2 x 2 x 4 = 16 rows).
crossing(
  customer_channel = c("Bus", "Car"),
  customer_status = c("New", "Repeat"),
  spend_range = c("$0-$10", "$10-$20", "$20-$50", "$50+")
)
## # A tibble: 16 × 3
##    customer_channel customer_status spend_range
##    <chr>            <chr>           <chr>      
##  1 Bus              New             $0-$10     
##  2 Bus              New             $10-$20    
##  3 Bus              New             $20-$50    
##  4 Bus              New             $50+       
##  5 Bus              Repeat          $0-$10     
##  6 Bus              Repeat          $10-$20    
##  7 Bus              Repeat          $20-$50    
##  8 Bus              Repeat          $50+       
##  9 Car              New             $0-$10     
## 10 Car              New             $10-$20    
## 11 Car              New             $20-$50    
## 12 Car              New             $50+       
## 13 Car              Repeat          $0-$10     
## 14 Car              Repeat          $10-$20    
## 15 Car              Repeat          $20-$50    
## 16 Car              Repeat          $50+
#' Summarise selected columns with min, max, median and mean
#'
#' NOTE: this definition masks `base::summary()` for the rest of the session;
#' call `base::summary()` explicitly if the base generic is needed.
#'
#' @param data A data frame or tibble. Groups set with `group_by()` are
#'   respected, giving one output row per group.
#' @param col_names Columns to summarise, given as a tidy-select expression
#'   (e.g. `c(air_time, arr_delay)`).
#' @param na.rm Passed to each summary function; `TRUE` (the default) drops
#'   missing values before computing.
#' @return A tibble with one row per group of `data` (a single row when
#'   ungrouped) and one column per column/statistic pair, named
#'   `<column>_<statistic>`.
summary <- function(data, col_names, na.rm = TRUE) {
  data %>%
    summarise(across(
      {{ col_names }},
      # Anonymous functions carry na.rm explicitly: passing extra arguments
      # through across(...) is deprecated as of dplyr 1.1.0.
      list(
        min = \(x) min(x, na.rm = na.rm),
        max = \(x) max(x, na.rm = na.rm),
        median = \(x) median(x, na.rm = na.rm),
        mean = \(x) mean(x, na.rm = na.rm)
      ),
      .names = "{.col}_{.fn}"
    ))
}
# Overall summary statistics for air time and arrival delay.
summary(airline_names, c(air_time, arr_delay))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(...)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 1 × 8
##   air_time_min air_time_max air_time_median air_time_mean arr_delay_min
##          <dbl>        <dbl>           <dbl>         <dbl>         <dbl>
## 1           20          695             129          151.           -86
## # ℹ 3 more variables: arr_delay_max <dbl>, arr_delay_median <dbl>,
## #   arr_delay_mean <dbl>
# The same summary statistics, computed separately for each carrier.
airline_names %>%
  group_by(carrier) %>%
  summary(c(air_time, arr_delay))
## # A tibble: 16 × 9
##    carrier air_time_min air_time_max air_time_median air_time_mean arr_delay_min
##    <chr>          <dbl>        <dbl>           <dbl>         <dbl>         <dbl>
##  1 9E                21          272            83            86.8           -68
##  2 AA                29          426           169           189.            -75
##  3 AS               277          392           324           326.            -74
##  4 B6                29          413           142           151.            -71
##  5 DL                26          490           145           174.            -71
##  6 EV                20          286            87            90.1           -62
##  7 F9               195          278           229           230.            -47
##  8 FL                53          161           109           101.            -44
##  9 HA               580          691           622.          623.            -70
## 10 MQ                33          236            83            91.2           -53
## 11 OO                50          177            68            83.5           -26
## 12 UA                23          695           197           212.            -75
## 13 US                21          359            76            88.6           -70
## 14 VX               264          406           337           337.            -86
## 15 WN                31          362           122           148.            -58
## 16 YV                32          122            56.5          65.7           -46
## # ℹ 3 more variables: arr_delay_max <dbl>, arr_delay_median <dbl>,
## #   arr_delay_mean <dbl>