Assignment #5

Question 1

Does the dataset contain 336,776 flight records and 19 fields?

library(nycflights13)

# Number of rows (flight records) followed by number of columns (fields).
c(nrow(flights), ncol(flights))

## [1] 336776     19

Question 2

How could you know which flight has the largest departure delay?

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(nycflights13)

# Order every flight from the largest departure delay to the smallest.
sortf <- flights |>
  arrange(desc(dep_delay))

# Put the identifying columns (carrier, flight, tailnum) in front of the rest.
sortf |>
  select(carrier, flight, tailnum, everything())

## # A tibble: 336,776 × 19
##    carrier flight tailnum  year month   day dep_time sched_dep_time dep_delay
##    <chr>    <int> <chr>   <int> <int> <int>    <int>          <int>     <dbl>
##  1 HA          51 N384HA   2013     1     9      641            900      1301
##  2 MQ        3535 N504MQ   2013     6    15     1432           1935      1137
##  3 MQ        3695 N517MQ   2013     1    10     1121           1635      1126
##  4 AA         177 N338AA   2013     9    20     1139           1845      1014
##  5 MQ        3075 N665MQ   2013     7    22      845           1600      1005
##  6 DL        2391 N959DL   2013     4    10     1100           1900       960
##  7 DL        2119 N927DA   2013     3    17     2321            810       911
##  8 DL        2007 N3762Y   2013     6    27      959           1900       899
##  9 DL        2047 N6716C   2013     7    22     2257            759       898
## 10 AA         172 N5DMAA   2013    12     5      756           1700       896
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

# Largest departure delay in the data; dep_delay contains NAs, so they are
# dropped before taking the maximum.
maxdep <- max(flights$dep_delay, na.rm = TRUE)

# Row position(s) holding that delay. which() ignores the NA comparisons and
# keeps every row that ties for the maximum.
maxdep_id <- which(flights$dep_delay == maxdep)

# Pick the identifying columns by name instead of by position (10:12), so the
# result does not depend on the column order of `flights`.
flights[maxdep_id, c("carrier", "flight", "tailnum")]

## # A tibble: 1 × 3
##   carrier flight tailnum
##   <chr>    <int> <chr>  
## 1 HA          51 N384HA

Question 3

Your friend David tries to get the average departure delay per date. He writes the following code but gets NAs as output:

# mean() is called without na.rm = TRUE, so a single missing dep_delay within
# a date makes that date's mean NA.
flights %>%

group_by(year, month, day) %>%

summarise(mean = mean(dep_delay))

Which of the following do you think will help solve his problem and provide the correct values?

Option 1:

# Average departure delay for each calendar date. na.rm = TRUE drops the
# missing delays so they cannot turn a date's mean into NA.
flights |>
  group_by(year, month, day) |>
  summarise(mean_delay = mean(dep_delay, na.rm = TRUE))

## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by year, month, and day.
## ℹ Output is grouped by year and month.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(year, month, day))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.

## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day mean_delay
##    <int> <int> <int>      <dbl>
##  1  2013     1     1      11.5 
##  2  2013     1     2      13.9 
##  3  2013     1     3      11.0 
##  4  2013     1     4       8.95
##  5  2013     1     5       5.73
##  6  2013     1     6       7.15
##  7  2013     1     7       5.42
##  8  2013     1     8       2.55
##  9  2013     1     9       2.28
## 10  2013     1    10       2.84
## # ℹ 355 more rows

Option 2:

# Keep only the flights that have a recorded departure delay.
not_cancelled <- flights |>
  filter(!is.na(dep_delay))

# The missing delays are already gone, so a plain mean() per date is enough.
not_cancelled |>
  group_by(year, month, day) |>
  summarise(mean_delay = mean(dep_delay))

## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by year, month, and day.
## ℹ Output is grouped by year and month.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(year, month, day))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.

## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day mean_delay
##    <int> <int> <int>      <dbl>
##  1  2013     1     1      11.5 
##  2  2013     1     2      13.9 
##  3  2013     1     3      11.0 
##  4  2013     1     4       8.95
##  5  2013     1     5       5.73
##  6  2013     1     6       7.15
##  7  2013     1     7       5.42
##  8  2013     1     8       2.55
##  9  2013     1     9       2.28
## 10  2013     1    10       2.84
## # ℹ 355 more rows

Option 3:

# Average arrival delay per aircraft (tailnum) rather than per date.
delays <- not_cancelled |>
  group_by(tailnum) |>
  summarise(delay = mean(arr_delay))

This code runs, but it does not answer the question being asked: it groups by tailnum (aircraft) rather than by date, and it averages arr_delay rather than dep_delay, so it gives the wrong output for this question.

Option 4:

# Keep only flights with both a departure delay and an arrival delay recorded.
# Rows missing only arr_delay are dropped as well, so the per-date means can
# differ from those computed over every non-missing dep_delay.
not_cancelled <- flights |>
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Average departure delay for each calendar date over the remaining flights.
not_cancelled |>
  group_by(year, month, day) |>
  summarise(mean_delay = mean(dep_delay))

## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by year, month, and day.
## ℹ Output is grouped by year and month.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(year, month, day))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.

## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day mean_delay
##    <int> <int> <int>      <dbl>
##  1  2013     1     1      11.4 
##  2  2013     1     2      13.7 
##  3  2013     1     3      10.9 
##  4  2013     1     4       8.97
##  5  2013     1     5       5.73
##  6  2013     1     6       7.15
##  7  2013     1     7       5.42
##  8  2013     1     8       2.56
##  9  2013     1     9       2.30
## 10  2013     1    10       2.84
## # ℹ 355 more rows

Question 4

Write the code to get the tailnum that has the lowest average arrival delay.

# Mean arrival delay per aircraft, using only flights with a recorded
# arrival delay.
avg_delay <- flights |>
  filter(!is.na(arr_delay)) |>
  group_by(tailnum) |>
  summarise(avg_arr_delay = mean(arr_delay))

# Sort ascending so the lowest average comes first, then keep that single row.
avg_delay |>
  arrange(avg_arr_delay) |>
  head(1)

## # A tibble: 1 × 2
##   tailnum avg_arr_delay
##   <chr>           <dbl>
## 1 N560AS            -53

Question 5

# Smallest and largest dep_time value recorded on each calendar date.
not_cancelled |>
  group_by(year, month, day) |>
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )

## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by year, month, and day.
## ℹ Output is grouped by year and month.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(year, month, day))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.

## # A tibble: 365 × 5
## # Groups:   year, month [12]
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     1     1   517  2356
##  2  2013     1     2    42  2354
##  3  2013     1     3    32  2349
##  4  2013     1     4    25  2358
##  5  2013     1     5    14  2357
##  6  2013     1     6    16  2355
##  7  2013     1     7    49  2359
##  8  2013     1     8   454  2351
##  9  2013     1     9     2  2252
## 10  2013     1    10     3  2320
## # ℹ 355 more rows

Question 6

Write the code to obtain the proportion of flights with departure delays of more than an hour for each month.

# Share of flights per month whose departure delay exceeds 60.
# dep_delay > 60 is logical, so its mean is the proportion of TRUE values.
delay_prop <- flights |>
  filter(!is.na(dep_delay)) |>
  group_by(month) |>
  summarise(prop_over_1hr = mean(dep_delay > 60))

delay_prop

## # A tibble: 12 × 2
##    month prop_over_1hr
##    <int>         <dbl>
##  1     1        0.0688
##  2     2        0.0698
##  3     3        0.0837
##  4     4        0.0916
##  5     5        0.0818
##  6     6        0.128 
##  7     7        0.134 
##  8     8        0.0796
##  9     9        0.0490
## 10    10        0.0469
## 11    11        0.0402
## 12    12        0.0942

Months 6 & 7 (June & July) have the highest proportions out of any month.

Question 7

Given that the n_distinct() function can count the number of distinct (unique) values, could you find out which destinations have the most carriers?

# Count the distinct carriers flying to each destination, largest count first.
dest_carriers <- flights |>
  group_by(dest) |>
  summarise(num_carriers = n_distinct(carrier)) |>
  arrange(desc(num_carriers))

dest_carriers

## # A tibble: 105 × 2
##    dest  num_carriers
##    <chr>        <int>
##  1 ATL              7
##  2 BOS              7
##  3 CLT              7
##  4 ORD              7
##  5 TPA              7
##  6 AUS              6
##  7 DCA              6
##  8 DTW              6
##  9 IAD              6
## 10 MSP              6
## # ℹ 95 more rows

ORD, CLT, BOS, ATL, & TPA are all tied for the most carriers with 7.

Question 9

Does the following code achieve a similar goal to the above question, except that it is missing the plotting step?

# Per-destination flight count, mean distance and mean arrival delay (NAs
# ignored), keeping destinations with more than 20 flights and excluding "HNL".
delays <- flights |>
  group_by(dest) |>
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) |>
  filter(count > 20, dest != "HNL")