Question 1
# Q1.A
# Work on a local copy of the flights table that ships with nycflights13.
flights <- nycflights13::flights
# Preview the first six rows (six is also the default for head()).
head(flights, n = 6)
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Compact per-column overview: type, length and leading values.
utils::str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Q1.C
# i -- list the column names, then check how arr_time is stored
names(flights)
## [1] "year" "month" "day" "dep_time"
## [5] "sched_dep_time" "dep_delay" "arr_time" "sched_arr_time"
## [9] "arr_delay" "carrier" "flight" "tailnum"
## [13] "origin" "dest" "air_time" "distance"
## [17] "hour" "minute" "time_hour"
class(flights[["arr_time"]])
## [1] "integer"
# ii -- overall dimensions, followed by the row and column counts on their own
dim(flights)
## [1] 336776 19
NROW(flights)
## [1] 336776
NCOL(flights)
## [1] 19
# iii -- the last 15 rows of the table
tail(flights, n = 15)
## # A tibble: 15 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 9 30 2231 2245 -14 2335 2356
## 2 2013 9 30 2233 2113 80 112 30
## 3 2013 9 30 2235 2001 154 59 2249
## 4 2013 9 30 2237 2245 -8 2345 2353
## 5 2013 9 30 2240 2245 -5 2334 2351
## 6 2013 9 30 2240 2250 -10 2347 7
## 7 2013 9 30 2241 2246 -5 2345 1
## 8 2013 9 30 2307 2255 12 2359 2358
## 9 2013 9 30 2349 2359 -10 325 350
## 10 2013 9 30 NA 1842 NA NA 2019
## 11 2013 9 30 NA 1455 NA NA 1634
## 12 2013 9 30 NA 2200 NA NA 2312
## 13 2013 9 30 NA 1210 NA NA 1330
## 14 2013 9 30 NA 1159 NA NA 1344
## 15 2013 9 30 NA 840 NA NA 1020
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Q1.D
# i -- flights scheduled to leave before noon (sched_dep_time is HHMM)
morning_depart <- filter(flights, sched_dep_time < 1200)
# ii -- keep only the departure-time, carrier and tail-number columns
flights_synopsis <- select(
  flights,
  sched_dep_time, dep_time, carrier, tailnum
)
# iii -- numeric 1/0 indicator: scheduled departure at noon or later
flights_dummy <- mutate(
  flights,
  afternoon_flight = as.numeric(sched_dep_time >= 1200)
)
# iv -- mean and median arrival delay, skipping missing delays
summarise(
  flights,
  mean_arrival_delay = mean(arr_delay, na.rm = TRUE),
  median_arrival_delay = median(arr_delay, na.rm = TRUE)
)
## # A tibble: 1 × 2
## mean_arrival_delay median_arrival_delay
## <dbl> <dbl>
## 1 6.90 -5
# Q1.E
# i -- tail number and origin airport for every flight in May
mayflights <- select(
  filter(flights, month == 5),
  tailnum, origin
)
# ii -- flights that left more than 60 minutes late, ordered from the latest
# scheduled departure to the earliest, with sched_dep_time as the first column
flights_delayed <- flights %>%
  filter(dep_delay > 60) %>%
  arrange(desc(sched_dep_time)) %>%
  select(sched_dep_time, everything())
# Because of the descending sort, the tail shows the earliest scheduled times.
tail(flights_delayed, n = 6)
## # A tibble: 6 × 19
## sched_dep_time year month day dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 525 2013 1 16 636 71 1000 818
## 2 517 2013 10 4 651 94 917 757
## 3 517 2013 10 17 658 101 939 757
## 4 515 2013 1 19 623 68 902 814
## 5 500 2013 4 24 808 188 1008 640
## 6 500 2013 9 13 601 61 732 648
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# iii -- how many different flight numbers appear among the >60-minute delays
length(unique(flights_delayed$flight))
## [1] 2700
# iv -- look up airports whose name contains "Atlanta" to find the FAA code
filter(airports, grepl("Atlanta", name, fixed = TRUE))
## # A tibble: 2 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 ATL Hartsfield Jackson Atlanta Intl 33.6 -84.4 1026 -5 A Amer…
## 2 FFC Atlanta Regional Airport - Falcon F… 33.4 -84.6 808 -5 A Amer…
# Mean arrival delay of flights whose destination is ATL (missing delays
# are ignored).
summarise(
  filter(flights, dest == "ATL"),
  mean_arr_delay = mean(arr_delay, na.rm = TRUE)
)
## # A tibble: 1 × 1
## mean_arr_delay
## <dbl>
## 1 11.3
Question 2
# Q2.A
# i -- mean arrival delay for each origin airport (missing delays ignored)
summarise(
  group_by(flights, origin),
  meanad = mean(arr_delay, na.rm = TRUE)
)
## # A tibble: 3 × 2
## origin meanad
## <chr> <dbl>
## 1 EWR 9.11
## 2 JFK 5.55
## 3 LGA 5.78
# ii -- mean departure delay for each destination, largest mean first
arrange(
  summarise(
    group_by(flights, dest),
    mean_dep_delay = mean(dep_delay, na.rm = TRUE)
  ),
  desc(mean_dep_delay)
)
## # A tibble: 105 × 2
## dest mean_dep_delay
## <chr> <dbl>
## 1 CAE 35.6
## 2 TUL 34.9
## 3 OKC 30.6
## 4 BHM 29.7
## 5 TYS 28.5
## 6 JAC 26.5
## 7 DSM 26.2
## 8 RIC 23.6
## 9 ALB 23.6
## 10 MSN 23.6
## # ℹ 95 more rows
# iii -- number of flights per tail number, most frequent first.
# Rows with a missing tail number form their own <NA> group in the result.
flights %>%
  count(tailnum, name = "flight_count", sort = TRUE)
## # A tibble: 4,044 × 2
## tailnum flight_count
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N711MQ 486
## 6 N713MQ 483
## 7 N258JB 427
## 8 N298JB 407
## 9 N353JB 404
## 10 N351JB 402
## # ℹ 4,034 more rows
# iv -- share of flights per origin that departed late (dep_delay > 0),
# highest share first.
# total_flights counts every row of the group, while delayed_flights skips
# rows whose dep_delay is missing.
# NOTE(review): confirm that flights with a missing dep_delay are meant to
# stay in the denominator.
flights %>%
  group_by(origin) %>%
  summarise(
    total_flights = n(),
    delayed_flights = sum(dep_delay > 0, na.rm = TRUE),
    delay_probability = delayed_flights / total_flights
  ) %>%
  arrange(desc(delay_probability))
## # A tibble: 3 × 4
## origin total_flights delayed_flights delay_probability
## <chr> <int> <int> <dbl>
## 1 EWR 120835 52711 0.436
## 2 JFK 111279 42031 0.378
## 3 LGA 104662 33690 0.322
# Q2.B
# i -- raw scatter of departure delay against scheduled departure time;
# the low alpha keeps heavily overplotted regions readable
ggplot(flights, aes(x = sched_dep_time, y = dep_delay)) +
  geom_point(color = "blue", alpha = 0.1) +
  ggtitle("Scheduled Departure Time vs Departure Delay") +
  xlab("Scheduled Departure Time (HHMM)") +
  ylab("Departure Delay (Minutes)") +
  theme_minimal()
## Warning: Removed 8255 rows containing missing values or values outside the scale range
## (`geom_point()`).

# improved version
# Jitter horizontally only (height = 0) so the plotted delay values stay
# exact, and show the 0-300 minute range with a tick every 30 minutes.
# The zoom is done with coord_cartesian() instead of scale limits: limits on
# the scale turn every out-of-range delay into NA and drop those rows from
# the layer, whereas coord_cartesian() only changes the visible window.
flights %>%
  ggplot(aes(x = sched_dep_time, y = dep_delay)) +
  geom_jitter(alpha = 0.1, color = "blue", height = 0, width = 5) +
  scale_y_continuous(breaks = seq(0, 300, by = 30)) +
  coord_cartesian(ylim = c(0, 300)) +
  labs(
    title = "Scheduled Departure Time vs Departure Delay",
    x = "Scheduled Departure Time (HHMM)",
    y = "Departure Delay (Minutes)"
  ) +
  theme_minimal()
## Warning: Removed 8255 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ii -- departure delay distribution for each origin airport.
# coord_cartesian() zooms the view to -50..200 minutes; outliers are drawn
# faintly (alpha 0.1).
ggplot(flights, aes(x = origin, y = dep_delay)) +
  geom_boxplot(outlier.alpha = 0.1) +
  coord_cartesian(ylim = c(-50, 200)) +
  ggtitle("Departure Delay Distribution by Origin Airport") +
  xlab("Origin Airport") +
  ylab("Departure Delay (Minutes)") +
  theme_minimal()
## Warning: Removed 8255 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# iii -- flight counts per origin airport for carriers UA, AA and US,
# with one bar per carrier placed side by side
ggplot(
  filter(flights, carrier %in% c("UA", "AA", "US")),
  aes(x = origin, fill = carrier)
) +
  geom_bar(position = "dodge") +
  ggtitle("Number of Flights by Carrier and Origin Airport") +
  xlab("Origin Airport") +
  ylab("Number of Flights") +
  labs(fill = "Carrier") +
  theme_minimal()

# Q2.C
# i -- mean departure delay for every carrier/month combination.
# .groups = "drop" returns an ungrouped tibble, so later verbs are not
# silently applied per carrier and summarise() emits no regrouping message.
avg_delay_by_airline <- flights %>%
  group_by(carrier, month) %>%
  summarise(
    mean_dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  )
# One line per carrier across the months, with a tick for each month 1-12.
ggplot(
  avg_delay_by_airline,
  aes(x = month, y = mean_dep_delay, color = carrier)
) +
  geom_line() +
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = "Average Departure Delay by Airline and Month",
    x = "Month",
    y = "Avg Departure Delay (Minutes)"
  ) +
  theme_minimal()

# OO flights per month -- number of flights carrier OO operated in each
# month in which it appears
flights %>%
  filter(carrier == "OO") %>%
  count(month, name = "num_flights")
## # A tibble: 5 × 2
## month num_flights
## <int> <int>
## 1 1 1
## 2 6 2
## 3 8 4
## 4 9 20
## 5 11 5
# ii -- share of departures per origin and month that left at most
# 5 minutes late. Flights with a missing dep_delay are excluded from the
# share (na.rm = TRUE).
# .groups = "drop" returns an ungrouped tibble directly, replacing the
# separate ungroup() step and the regrouping message.
ontime_by_airport_month <- flights %>%
  mutate(ontime = dep_delay <= 5) %>%
  group_by(origin, month) %>%
  summarise(pct_ontime = mean(ontime, na.rm = TRUE), .groups = "drop")
# One line per origin airport. Line thickness is set through `linewidth`;
# using `size` for lines is deprecated since ggplot2 3.4.0.
ggplot(
  ontime_by_airport_month,
  aes(x = month, y = pct_ontime, color = origin)
) +
  geom_line(linewidth = 1.1) +
  scale_x_continuous(breaks = 1:12) +
  labs(
    title = "Percent of On-Time Departures (≤5 min late)",
    x = "Month",
    y = "Percent On Time"
  ) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  theme_minimal()
