#install.packages("nycflights13")
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Explore the structure of the flights dataset

Number of observations

# Number of rows (observations) in flights: the first element of its dimensions.
dim(flights)[1]
## [1] 336776

Number of variables

# Number of columns (variables): a data frame's length is its column count.
length(flights)
## [1] 19

What is contained in the variables?

# Compact overview of every column: its name, type and first few values.
utils::str(object = flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

Step 3: Find out how many flights depart from JFK

Use the wrangling verbs filter(), count() and summarise() on the flights data to answer the following: What is the average number of flights, per day of the month, that United Airlines (UA) flies out of JFK between 8:00 am and 9:00 am?

How I’d actually do it

# Average number of UA departures from JFK in the 8:00-9:00 am window per day
# of the month.
# dep_time is an integer in HHMM form, so 800 and 900 bound the window (both
# ends inclusive); filter() drops rows whose dep_time is missing.
# The divisor is the number of distinct day-of-month values in the full
# dataset rather than a hard-coded 31, so the magic number is gone and a day
# with no matching flights still counts in the denominator.
flights %>%
  filter(origin == "JFK", carrier == "UA", dep_time >= 800, dep_time <= 900) %>%
  summarise(mean = n() / n_distinct(flights$day))
## # A tibble: 1 x 1
##    mean
##   <dbl>
## 1    19

How I’d do it using the suggested verbs

# Same question answered with filter(), count() and summarise().
# dep_time is an integer in HHMM form, so 800 and 900 bound the window (both
# ends inclusive); filter() drops rows whose dep_time is missing.
# count(day) yields one row per day of the month that has matching flights,
# with the tally in column n.
# The total is divided by the number of distinct day-of-month values in the
# full dataset rather than a hard-coded 31, so days that count() omits because
# they have no matching flights are still part of the denominator.
flights %>%
  filter(origin == "JFK", carrier == "UA", dep_time >= 800, dep_time <= 900) %>%
  count(day) %>%
  summarise(mean = sum(n) / n_distinct(flights$day))
## # A tibble: 1 x 1
##    mean
##   <dbl>
## 1    19

Step 4: Identify the plane responsible for delays

Use the wrangling verbs, group_by(), summarise() and arrange() to find the plane that has the highest total arrival delay (minutes).

How I’d actually do it

# Total arrival delay (minutes) per tail number, ignoring missing delays,
# ordered from largest to smallest; only the first (worst) row is kept.
group_by(flights, tailnum) %>%
  summarise(total_delays = sum(arr_delay, na.rm = TRUE)) %>%
  arrange(-total_delays) %>%
  head(1)
## # A tibble: 1 x 2
##   tailnum total_delays
##   <chr>          <dbl>
## 1 N15910          7317

How I’d do it using the suggested verbs

# Ranking of all tail numbers by total arrival delay (minutes), largest first.
# Missing arr_delay values are skipped when summing.
group_by(flights, tailnum) %>%
  summarise(total_delays = sum(arr_delay, na.rm = TRUE)) %>%
  arrange(-total_delays)
## # A tibble: 4,044 x 2
##    tailnum total_delays
##    <chr>          <dbl>
##  1 N15910          7317
##  2 N15980          7134
##  3 N16919          6904
##  4 N228JB          6778
##  5 N14998          6087
##  6 N192JB          5810
##  7 N292JB          5804
##  8 N12921          5788
##  9 N13958          5620
## 10 N10575          5566
## # ... with 4,034 more rows