# Install nycflights13 only when it is not already available, so re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13", repos = "https://cran.rstudio.com/")
}
##
## The downloaded binary packages are in
## /var/folders/60/znjt_0y54nx55cl82pvt6lfw0000gn/T//Rtmpv5CGuy/downloaded_packages
library(nycflights13)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Question 1 ----
# Size of the flights table, reported as c(rows, columns).
c(nrow(flights), ncol(flights))
## [1] 336776 19
# Question 2 ----
# Order the flights from the largest departure delay downwards and keep the
# sorted table in `sortf`.
sortf <- flights %>%
  arrange(desc(dep_delay))

# Print the sorted table with the identifying columns moved to the front;
# everything() keeps all remaining columns after them.
sortf %>%
  select(carrier, flight, tailnum, everything())
## # A tibble: 336,776 × 19
## carrier flight tailnum year month day dep_time sched_dep_time dep_delay
## <chr> <int> <chr> <int> <int> <int> <int> <int> <dbl>
## 1 HA 51 N384HA 2013 1 9 641 900 1301
## 2 MQ 3535 N504MQ 2013 6 15 1432 1935 1137
## 3 MQ 3695 N517MQ 2013 1 10 1121 1635 1126
## 4 AA 177 N338AA 2013 9 20 1139 1845 1014
## 5 MQ 3075 N665MQ 2013 7 22 845 1600 1005
## 6 DL 2391 N959DL 2013 4 10 1100 1900 960
## 7 DL 2119 N927DA 2013 3 17 2321 810 911
## 8 DL 2007 N3762Y 2013 6 27 959 1900 899
## 9 DL 2047 N6716C 2013 7 22 2257 759 898
## 10 AA 172 N5DMAA 2013 12 5 756 1700 896
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Largest departure delay in the data, ignoring rows where it is missing.
maxdep <- max(flights$dep_delay, na.rm = TRUE)
# Row position(s) whose departure delay equals that maximum.
maxdep_id <- which(flights$dep_delay == maxdep)
# Pick the identifying columns by name rather than by position (10:12), so
# the lookup keeps working if the column order of `flights` ever changes.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
## # A tibble: 1 × 3
## carrier flight tailnum
## <chr> <int> <chr>
## 1 HA 51 N384HA
# Question 3 ----
# Mean departure delay for each (year, month, day), skipping missing delays.
# `.groups = "drop"` states the output grouping explicitly: the result is an
# ungrouped tibble, and summarise() no longer has to report that it
# regrouped the output by year and month.
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay, na.rm = TRUE), .groups = "drop")
## # A tibble: 365 × 4
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
# Keep only the flights that have a recorded departure delay.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay))

# Mean departure delay for each (year, month, day). No na.rm is needed here
# because the missing delays were filtered out above. `.groups = "drop"`
# returns an ungrouped tibble instead of relying on summarise()'s implicit
# regrouping by year and month.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay), .groups = "drop")
## # A tibble: 365 × 4
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
# Question 4 ----
# Average arrival delay per tail number (missing delays ignored), sorted
# ascending so the smallest average comes first; only that first row is kept.
flights %>%
  group_by(tailnum) %>%
  summarise(avg_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(avg_delay) %>%
  slice_head(n = 1)
## # A tibble: 1 × 2
## tailnum avg_delay
## <chr> <dbl>
## 1 N560AS -53
# Question 5 ----
# Keep only the flights that have a recorded dep_time.
not_cancelled <- flights %>%
  filter(!is.na(dep_time))

# Smallest and largest dep_time for each (year, month, day), ordered so the
# days with the largest `last` value come first. `.groups = "drop"` makes
# the stored result an ungrouped tibble, so later operations on
# `daily_extremes` are not silently applied per year/month group.
daily_extremes <- not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "drop"
  ) %>%
  arrange(desc(last))
head(daily_extremes)
## # A tibble: 6 × 5
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 2 7 27 2400
## 2 2013 2 11 1 2400
## 3 2013 3 15 11 2400
## 4 2013 3 22 37 2400
## 5 2013 3 25 13 2400
## 6 2013 4 2 9 2400
# Question 6 ----
# For each month, among flights with a recorded departure delay: count the
# flights, count those with dep_delay above 60, and take their ratio.
# Months are listed from the highest proportion to the lowest.
monthly_delays <- flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(month) %>%
  summarise(
    total_flights = n(),
    delayed_over_hour = sum(dep_delay > 60),
    proportion = delayed_over_hour / total_flights
  ) %>%
  arrange(desc(proportion))

# Display the full monthly table.
monthly_delays
## # A tibble: 12 × 4
## month total_flights delayed_over_hour proportion
## <int> <int> <int> <dbl>
## 1 7 28485 3820 0.134
## 2 6 27234 3494 0.128
## 3 12 27110 2553 0.0942
## 4 4 27662 2535 0.0916
## 5 3 27973 2340 0.0837
## 6 5 28233 2309 0.0818
## 7 8 28841 2295 0.0796
## 8 2 23690 1654 0.0698
## 9 1 26483 1821 0.0688
## 10 9 27122 1330 0.0490
## 11 10 28653 1344 0.0469
## 12 11 27035 1086 0.0402
# Question 7 ----
# Number of distinct carriers per destination, with the destinations that
# have the most carriers listed first.
dest_carriers <- flights %>%
  group_by(dest) %>%
  summarise(n_carriers = n_distinct(carrier)) %>%
  arrange(desc(n_carriers))

# Show the first ten destinations of that ranking.
dest_carriers %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
## dest n_carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6