Question 1

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(nycflights13)
library(ggplot2)

# Load the `flights` table shipped with nycflights13. A single call is
# sufficient; the repeated load that used to follow was redundant.
data("flights", package = "nycflights13")

# Dimensions as c(rows, columns).
dim(flights)
## [1] 336776     19
# Row count on its own.
nrow(flights)
## [1] 336776
# Column count on its own.
ncol(flights)
## [1] 19
# Per-column summary (quartiles for numeric columns, NA counts where present).
summary(flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 

Question 2

# Largest departure delay in the table, skipping missing values.
max_dep <- max(flights[["dep_delay"]], na.rm = TRUE)
print(max_dep)
## [1] 1301
# Positions of every row whose delay equals that maximum; which() discards
# the NA comparisons produced by rows with a missing dep_delay.
max_id <- with(flights, which(dep_delay == max_dep))

# Show the identifying columns for the matching flight(s).
flights[
  max_id,
  c(
    "year", "month", "day", "dep_delay", "carrier",
    "flight", "tailnum", "origin", "dest"
  )
]
## # A tibble: 1 × 9
##    year month   day dep_delay carrier flight tailnum origin dest 
##   <int> <int> <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
## 1  2013     1     9      1301 HA          51 N384HA  JFK    HNL

Alternatively, the same result using dplyr:

# Same answer with dplyr: keep only the identifying columns, sort so the
# longest departure delay comes first, then take the top row.
flights %>%
  select(
    year, month, day, dep_delay,
    carrier, flight, tailnum, origin, dest
  ) %>%
  arrange(desc(dep_delay)) %>%
  head(n = 1)
## # A tibble: 1 × 9
##    year month   day dep_delay carrier flight tailnum origin dest 
##   <int> <int> <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>
## 1  2013     1     9      1301 HA          51 N384HA  JFK    HNL

Question 3

# Mean departure delay for each calendar day, ignoring missing delays.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per (year, month), and it suppresses the regrouping
# message that summarise() otherwise prints.
flights %>%
  group_by(year, month, day) %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 365 × 4
##     year month   day mean_dep_delay
##    <int> <int> <int>          <dbl>
##  1  2013     1     1          11.5 
##  2  2013     1     2          13.9 
##  3  2013     1     3          11.0 
##  4  2013     1     4           8.95
##  5  2013     1     5           5.73
##  6  2013     1     6           7.15
##  7  2013     1     7           5.42
##  8  2013     1     8           2.55
##  9  2013     1     9           2.28
## 10  2013     1    10           2.84
## # ℹ 355 more rows

Question 4

# Tail number with the lowest mean arrival delay.
# Rows without a tail number are removed up front, so no NA group is ever
# formed; missing arrival delays are ignored inside each plane's mean.
flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(avg_arr_delay) %>%
  head(n = 1)
## # A tibble: 1 × 2
##   tailnum avg_arr_delay
##   <chr>           <dbl>
## 1 N560AS            -53

Question 5

# Earliest and latest recorded departure time for each calendar day,
# listed with the latest `last` values first.
# Rows with no recorded dep_time are removed before aggregating, so min()
# and max() never see NA. `.groups = "drop"` yields an ungrouped result and
# silences summarise()'s regrouping message.
flights %>%
  filter(!is.na(dep_time)) %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "drop"
  ) %>%
  arrange(desc(last))
## # A tibble: 365 × 5
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     2     7    27  2400
##  2  2013     2    11     1  2400
##  3  2013     3    15    11  2400
##  4  2013     3    22    37  2400
##  5  2013     3    25    13  2400
##  6  2013     4     2     9  2400
##  7  2013     4     4    14  2400
##  8  2013     4    20     7  2400
##  9  2013     5    21   110  2400
## 10  2013     6    17     2  2400
## # ℹ 355 more rows

Question 6

# Share of flights per month whose departure delay exceeds 60.
# The comparison is materialised as a logical column first; its mean is the
# proportion of TRUE values, with missing delays left out of the calculation.
flights %>%
  mutate(over_60 = dep_delay > 60) %>%
  group_by(month) %>%
  summarise(proportion_over_60 = mean(over_60, na.rm = TRUE)) %>%
  arrange(desc(proportion_over_60))
## # A tibble: 12 × 2
##    month proportion_over_60
##    <int>              <dbl>
##  1     7             0.134 
##  2     6             0.128 
##  3    12             0.0942
##  4     4             0.0916
##  5     3             0.0837
##  6     5             0.0818
##  7     8             0.0796
##  8     2             0.0698
##  9     1             0.0688
## 10     9             0.0490
## 11    10             0.0469
## 12    11             0.0402

Question 7

# Ten destinations served by the most distinct carriers.
# Reduce to unique (dest, carrier) pairs, then count the pairs per
# destination; that count is the number of distinct carriers.
flights %>%
  distinct(dest, carrier) %>%
  count(dest, name = "num_carriers") %>%
  arrange(desc(num_carriers)) %>%
  head(n = 10)
## # A tibble: 10 × 2
##    dest  num_carriers
##    <chr>        <int>
##  1 ATL              7
##  2 BOS              7
##  3 CLT              7
##  4 ORD              7
##  5 TPA              7
##  6 AUS              6
##  7 DCA              6
##  8 DTW              6
##  9 IAD              6
## 10 MSP              6

Question 8

# Per-destination summary: number of flights, mean distance and mean arrival
# delay (missing values ignored in both means).
# Destinations with 20 or fewer flights, and HNL, are then excluded.
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(dest != "HNL") %>%
  filter(count > 20)

# Preview the first six destinations.
head(delays, n = 6)
## # A tibble: 6 × 4
##   dest  count  dist delay
##   <chr> <int> <dbl> <dbl>
## 1 ABQ     254 1826   4.38
## 2 ACK     265  199   4.85
## 3 ALB     439  143  14.4 
## 4 ATL   17215  757. 11.3 
## 5 AUS    2439 1514.  6.02
## 6 AVL     275  584.  8.00
# Mean arrival delay against mean distance, one point per destination,
# with point size showing the number of flights and a smoothed trend line.
# The smoother's method and formula are spelled out (the same ones
# geom_smooth() previously auto-selected) so the fit does not depend on
# ggplot2's size-based default and no informational message is printed.
ggplot(delays, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 0.3) +
  geom_smooth(method = "loess", formula = y ~ x, se = FALSE)

Question 9

# Per-destination flight count, mean distance and mean arrival delay,
# restricted to destinations other than HNL with more than 20 flights.
# NOTE(review): this computes exactly the same table as `delays` in
# Question 8 -- confirm whether Question 9 was meant to ask for something
# different.
delays2 <- flights %>%
  filter(dest != "HNL") %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20)

# Preview the first six destinations.
head(delays2, n = 6)
## # A tibble: 6 × 4
##   dest  count  dist delay
##   <chr> <int> <dbl> <dbl>
## 1 ABQ     254 1826   4.38
## 2 ACK     265  199   4.85
## 3 ALB     439  143  14.4 
## 4 ATL   17215  757. 11.3 
## 5 AUS    2439 1514.  6.02
## 6 AVL     275  584.  8.00