# Question 1

# Q1.A

# Bind the flights table explicitly from the nycflights13 namespace rather
# than self-assigning `flights <- flights`, which hides where the data comes
# from and fails with "object 'flights' not found" when the package is not
# attached at this point.
flights <- nycflights13::flights

# Preview the first rows of the table
head(flights)
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Column-by-column structure of the table (type and leading values)
str(object = flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Q1.C

# i: list every column name in the table
names(flights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
# storage type of the arr_time column
class(flights[["arr_time"]])
## [1] "integer"
# ii: table dimensions -- both together, then rows and columns separately
dim(flights)
## [1] 336776     19
nrow(flights)
## [1] 336776
ncol(flights)
## [1] 19
# iii: the final 15 rows of the table
tail(flights, n = 15)
## # A tibble: 15 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     9    30     2231           2245       -14     2335           2356
##  2  2013     9    30     2233           2113        80      112             30
##  3  2013     9    30     2235           2001       154       59           2249
##  4  2013     9    30     2237           2245        -8     2345           2353
##  5  2013     9    30     2240           2245        -5     2334           2351
##  6  2013     9    30     2240           2250       -10     2347              7
##  7  2013     9    30     2241           2246        -5     2345              1
##  8  2013     9    30     2307           2255        12     2359           2358
##  9  2013     9    30     2349           2359       -10      325            350
## 10  2013     9    30       NA           1842        NA       NA           2019
## 11  2013     9    30       NA           1455        NA       NA           1634
## 12  2013     9    30       NA           2200        NA       NA           2312
## 13  2013     9    30       NA           1210        NA       NA           1330
## 14  2013     9    30       NA           1159        NA       NA           1344
## 15  2013     9    30       NA            840        NA       NA           1020
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Q1.D

# i: keep only flights scheduled to leave before noon (HHMM value below 1200)
morning_depart <- filter(flights, sched_dep_time < 1200)

# ii: narrow view with scheduled/actual departure, carrier and tail number
flights_synopsis <- select(flights, sched_dep_time, dep_time, carrier, tailnum)

# iii: add a numeric 0/1 indicator that is 1 when the scheduled departure is
# at or after 1200 and 0 otherwise
flights_dummy <- mutate(
  flights,
  afternoon_flight = as.numeric(sched_dep_time >= 1200)
)

# iv: overall mean and median arrival delay, skipping missing values
summarise(
  flights,
  mean_arrival_delay = mean(arr_delay, na.rm = TRUE),
  median_arrival_delay = median(arr_delay, na.rm = TRUE)
)
## # A tibble: 1 × 2
##   mean_arrival_delay median_arrival_delay
##                <dbl>                <dbl>
## 1               6.90                   -5
# Q1.E

# i: tail number and origin airport for every flight in month 5 (May)
mayflights <- flights %>%
  filter(month == 5L) %>%
  select(all_of(c("tailnum", "origin")))

# ii: flights with a departure delay above 60 minutes, with sched_dep_time
# moved to the first column and rows ordered from latest to earliest
# scheduled departure
flights_delayed <- flights %>%
  filter(dep_delay > 60) %>%
  relocate(sched_dep_time) %>%
  arrange(desc(sched_dep_time))

# Bottom of the sorted table, i.e. the earliest scheduled departures
tail(flights_delayed, n = 6)
## # A tibble: 6 × 19
##   sched_dep_time  year month   day dep_time dep_delay arr_time sched_arr_time
##            <int> <int> <int> <int>    <int>     <dbl>    <int>          <int>
## 1            525  2013     1    16      636        71     1000            818
## 2            517  2013    10     4      651        94      917            757
## 3            517  2013    10    17      658       101      939            757
## 4            515  2013     1    19      623        68      902            814
## 5            500  2013     4    24      808       188     1008            640
## 6            500  2013     9    13      601        61      732            648
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# iii: how many different flight numbers appear among the delayed flights
flights_delayed %>%
  distinct(flight) %>%
  nrow()
## [1] 2700
# iv: list the airports whose name contains the literal text "Atlanta"
filter(airports, grepl("Atlanta", name, fixed = TRUE))
## # A tibble: 2 × 8
##   faa   name                                   lat   lon   alt    tz dst   tzone
##   <chr> <chr>                                <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 ATL   Hartsfield Jackson Atlanta Intl       33.6 -84.4  1026    -5 A     Amer…
## 2 FFC   Atlanta Regional Airport - Falcon F…  33.4 -84.6   808    -5 A     Amer…
# Mean arrival delay of flights whose destination is ATL, skipping missing
# delays
summarise(
  filter(flights, dest == "ATL"),
  mean_arr_delay = mean(arr_delay, na.rm = TRUE)
)
## # A tibble: 1 × 1
##   mean_arr_delay
##            <dbl>
## 1           11.3

# Question 2

# Q2.A

# i: mean arrival delay for each origin airport, skipping missing delays
summarise(
  group_by(flights, origin),
  meanad = mean(arr_delay, na.rm = TRUE)
)
## # A tibble: 3 × 2
##   origin meanad
##   <chr>   <dbl>
## 1 EWR      9.11
## 2 JFK      5.55
## 3 LGA      5.78
# ii: mean departure delay per destination, largest mean delay first
flights %>%
  group_by(dest) %>%
  summarise(mean_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(-mean_dep_delay)
## # A tibble: 105 × 2
##    dest  mean_dep_delay
##    <chr>          <dbl>
##  1 CAE             35.6
##  2 TUL             34.9
##  3 OKC             30.6
##  4 BHM             29.7
##  5 TYS             28.5
##  6 JAC             26.5
##  7 DSM             26.2
##  8 RIC             23.6
##  9 ALB             23.6
## 10 MSN             23.6
## # ℹ 95 more rows
# iii: number of flights per tail number, most-flown first; the first row of
# the result is the group of flights whose tail number is missing (NA)
flights %>%
  count(tailnum, name = "flight_count", sort = TRUE)
## # A tibble: 4,044 × 2
##    tailnum flight_count
##    <chr>          <int>
##  1 <NA>            2512
##  2 N725MQ           575
##  3 N722MQ           513
##  4 N723MQ           507
##  5 N711MQ           486
##  6 N713MQ           483
##  7 N258JB           427
##  8 N298JB           407
##  9 N353JB           404
## 10 N351JB           402
## # ℹ 4,034 more rows
# iv: share of flights departing late (dep_delay > 0) at each origin airport,
# highest share first. Flights with a missing dep_delay are not counted as
# delayed but are still part of total_flights, so they lower the share.
flights %>%
  group_by(origin) %>%
  summarise(
    total_flights = n(),
    delayed_flights = sum(dep_delay > 0, na.rm = TRUE),
    delay_probability = delayed_flights / total_flights
  ) %>%
  arrange(desc(delay_probability))
## # A tibble: 3 × 4
##   origin total_flights delayed_flights delay_probability
##   <chr>          <int>           <int>             <dbl>
## 1 EWR           120835           52711             0.436
## 2 JFK           111279           42031             0.378
## 3 LGA           104662           33690             0.322
# Q2.B

# i: scatter plot of departure delay against scheduled departure time, drawn
# with mostly transparent blue points
ggplot(flights, aes(x = sched_dep_time, y = dep_delay)) +
  geom_point(colour = "blue", alpha = 0.1) +
  theme_minimal() +
  labs(
    x = "Scheduled Departure Time (HHMM)",
    y = "Departure Delay (Minutes)",
    title = "Scheduled Departure Time vs Departure Delay"
  )
## Warning: Removed 8255 rows containing missing values or values outside the scale range
## (`geom_point()`).

# improved version: jitter points horizontally only (width 5, no vertical
# noise) and restrict the y axis to 0-300 minutes with a tick every 30.
# Setting `limits` on the scale removes rows outside that range before
# drawing, which is why the removed-rows warning reports many more rows here.
ggplot(flights, aes(x = sched_dep_time, y = dep_delay)) +
  geom_jitter(width = 5, height = 0, colour = "blue", alpha = 0.1) +
  scale_y_continuous(
    limits = c(0, 300),
    breaks = seq(0, 300, by = 30)
  ) +
  theme_minimal() +
  labs(
    x = "Scheduled Departure Time (HHMM)",
    y = "Departure Delay (Minutes)",
    title = "Scheduled Departure Time vs Departure Delay"
  )
## Warning: Removed 192440 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ii: box plot of departure delay for each origin airport, with faint outlier
# points. coord_cartesian() only zooms the visible y range to -50..200; the
# box statistics are still computed from all non-missing delays.
ggplot(flights, aes(x = origin, y = dep_delay)) +
  geom_boxplot(outlier.alpha = 0.1) +
  coord_cartesian(ylim = c(-50, 200)) +
  theme_minimal() +
  labs(
    x = "Origin Airport",
    y = "Departure Delay (Minutes)",
    title = "Departure Delay Distribution by Origin Airport"
  )
## Warning: Removed 8255 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# iii: flight counts per origin airport for carriers UA, AA and US, drawn as
# side-by-side bars coloured by carrier
ggplot(
  filter(flights, carrier %in% c("UA", "AA", "US")),
  aes(x = origin, fill = carrier)
) +
  geom_bar(position = "dodge") +
  theme_minimal() +
  labs(
    fill = "Carrier",
    x = "Origin Airport",
    y = "Number of Flights",
    title = "Number of Flights by Carrier and Origin Airport"
  )

# Q2.C

# i: mean departure delay for every carrier in every month, skipping missing
# delays. `.groups = "drop"` returns an ungrouped tibble, so later operations
# on avg_delay_by_airline are not silently computed per carrier, and the
# "grouped output" message from summarise() is no longer emitted.
avg_delay_by_airline <- flights %>%
  group_by(carrier, month) %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  )

# One line per carrier, with a tick for each month 1-12 on the x axis
ggplot(
  avg_delay_by_airline,
  aes(x = month, y = mean_dep_delay, color = carrier)
) +
  geom_line() +
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = "Average Departure Delay by Airline and Month",
    x = "Month",
    y = "Avg Departure Delay (Minutes)"
  ) +
  theme_minimal()

# Number of flights operated by carrier OO in each month
flights %>%
  filter(carrier == "OO") %>%
  count(month, name = "num_flights")
## # A tibble: 5 × 2
##   month num_flights
##   <int>       <int>
## 1     1           1
## 2     6           2
## 3     8           4
## 4     9          20
## 5    11           5
# ii: share of departures that left no more than 5 minutes late, by origin
# airport and month. Flights with a missing dep_delay are excluded from the
# share (na.rm = TRUE). `.groups = "drop"` yields an ungrouped result directly,
# without the summarise() grouping message.
ontime_by_airport_month <- flights %>%
  mutate(ontime = dep_delay <= 5) %>%
  group_by(origin, month) %>%
  summarise(pct_ontime = mean(ontime, na.rm = TRUE), .groups = "drop")

# One line per origin airport. `linewidth` is used for the line thickness
# because the `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ggplot(ontime_by_airport_month, aes(x = month, y = pct_ontime, color = origin)) +
  geom_line(linewidth = 1.1) +
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = "Percent of On-Time Departures (≤5 min late)",
    x = "Month",
    y = "Percent On Time"
  ) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  theme_minimal()