#Question 1

library(nycflights13)
# Report the size of the flights table as (number of rows, number of columns).
c(nrow(flights), ncol(flights))
## [1] 336776     19

#Question 2

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Order every flight from the largest departure delay downwards.
sortf <- flights %>%
  arrange(desc(dep_delay))

# Show the sorted table with the identifying columns (carrier, flight number,
# tail number) moved to the front; all remaining columns keep their order.
sortf %>%
  select(carrier, flight, tailnum, everything())
## # A tibble: 336,776 × 19
##    carrier flight tailnum  year month   day dep_time sched_dep_time dep_delay
##    <chr>    <int> <chr>   <int> <int> <int>    <int>          <int>     <dbl>
##  1 HA          51 N384HA   2013     1     9      641            900      1301
##  2 MQ        3535 N504MQ   2013     6    15     1432           1935      1137
##  3 MQ        3695 N517MQ   2013     1    10     1121           1635      1126
##  4 AA         177 N338AA   2013     9    20     1139           1845      1014
##  5 MQ        3075 N665MQ   2013     7    22      845           1600      1005
##  6 DL        2391 N959DL   2013     4    10     1100           1900       960
##  7 DL        2119 N927DA   2013     3    17     2321            810       911
##  8 DL        2007 N3762Y   2013     6    27      959           1900       899
##  9 DL        2047 N6716C   2013     7    22     2257            759       898
## 10 AA         172 N5DMAA   2013    12     5      756           1700       896
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
# Cross-check of the sorted result: locate the flight(s) with the largest
# departure delay directly, ignoring rows where dep_delay is missing.
maxdep <- max(flights$dep_delay, na.rm = TRUE)

# which() drops the NA comparisons and keeps every row tied for the maximum.
maxdep_id <- which(flights$dep_delay == maxdep)

# Select the identifying columns by name rather than by position (10:12), so
# the result stays correct even if the column order of the table changes.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
## # A tibble: 1 × 3
##   carrier flight tailnum
##   <chr>    <int> <chr>  
## 1 HA          51 N384HA

#Question 3

# Keep only the flights that have a recorded departure delay.
not_cancelled <- filter(flights, !is.na(dep_delay))

# Mean departure delay for each (year, month, day) combination.
group_by(not_cancelled, year, month, day) %>%
  summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows
# Same daily mean as above, computed on the full table: missing departure
# delays are skipped inside mean() instead of being filtered out beforehand.
group_by(flights, year, month, day) %>%
  summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows

#Question 4

library(dplyr)
library(nycflights13)

# Drop flights without a recorded arrival delay (treated here as cancelled).
not_cancelled <- filter(flights, !is.na(arr_delay))

# Mean arrival delay per aircraft, keyed by tail number.
avg_delay <- summarise(
  group_by(not_cancelled, tailnum),
  mean_arr_delay = mean(arr_delay)
)

# Sort ascending by mean arrival delay and keep the first row, i.e. the
# tail number with the smallest mean.
lowest_delay <- slice(arrange(avg_delay, mean_arr_delay), 1)

lowest_delay
## # A tibble: 1 × 2
##   tailnum mean_arr_delay
##   <chr>            <dbl>
## 1 N560AS             -53

#Question 5

# For each (year, month, day), find the smallest and largest dep_time among
# the retained flights, then list the days with the largest `last` first.
group_by(not_cancelled, year, month, day) %>%
  summarise(first = min(dep_time), last = max(dep_time)) %>%
  arrange(desc(last))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups:   year, month [12]
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     2     7    27  2400
##  2  2013     2    11     1  2400
##  3  2013     3    15    11  2400
##  4  2013     3    22    37  2400
##  5  2013     3    25    13  2400
##  6  2013     4     2     9  2400
##  7  2013     4     4    14  2400
##  8  2013     4    20     7  2400
##  9  2013     5    21   110  2400
## 10  2013     6    17     2  2400
## # ℹ 355 more rows

#Question 6

library(dplyr)
library(nycflights13)

# Share of flights per month whose departure delay exceeds 60.
# The mean of a logical vector is the proportion of TRUE values; rows with a
# missing dep_delay are excluded from that proportion via na.rm.
group_by(flights, month) %>%
  summarise(prop_over_hour = mean(dep_delay > 60, na.rm = TRUE)) %>%
  arrange(desc(prop_over_hour))
## # A tibble: 12 × 2
##    month prop_over_hour
##    <int>          <dbl>
##  1     7         0.134 
##  2     6         0.128 
##  3    12         0.0942
##  4     4         0.0916
##  5     3         0.0837
##  6     5         0.0818
##  7     8         0.0796
##  8     2         0.0698
##  9     1         0.0688
## 10     9         0.0490
## 11    10         0.0469
## 12    11         0.0402

#Question 7

library(dplyr)
library(nycflights13)

# Count how many distinct carriers serve each destination, and list the
# destinations with the most carriers first.
group_by(flights, dest) %>%
  summarise(num_carriers = n_distinct(carrier)) %>%
  arrange(desc(num_carriers))
## # A tibble: 105 × 2
##    dest  num_carriers
##    <chr>        <int>
##  1 ATL              7
##  2 BOS              7
##  3 CLT              7
##  4 ORD              7
##  5 TPA              7
##  6 AUS              6
##  7 DCA              6
##  8 DTW              6
##  9 IAD              6
## 10 MSP              6
## # ℹ 95 more rows
library(ggplot2)

#Question 8

# Group the flights by destination.
by_dest <- flights %>%
  group_by(dest)

# Per destination: number of flights, mean distance and mean arrival delay
# (missing values skipped). Keep destinations with more than 20 flights and
# leave out "HNL".
delay <- by_dest %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

# Scatter plot of mean delay against mean distance, with point size showing
# the number of flights, plus a smoothed trend line without its error band.
ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(mapping = aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#Question 9

library(dplyr)
library(nycflights13)

# Destination-level summary built in a single pipeline: flight count, mean
# distance and mean arrival delay (missing values skipped), restricted to
# destinations with more than 20 flights and excluding "HNL".
delays <- group_by(flights, dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

# Display the summarised table.
print(delays)
## # A tibble: 96 × 4
##    dest  count  dist delay
##    <chr> <int> <dbl> <dbl>
##  1 ABQ     254 1826   4.38
##  2 ACK     265  199   4.85
##  3 ALB     439  143  14.4 
##  4 ATL   17215  757. 11.3 
##  5 AUS    2439 1514.  6.02
##  6 AVL     275  584.  8.00
##  7 BDL     443  116   7.05
##  8 BGR     375  378   8.03
##  9 BHM     297  866. 16.9 
## 10 BNA    6333  758. 11.8 
## # ℹ 86 more rows