Question 1
library(dplyr)
pacman::p_load(nycflights13)
View(flights) # this View(flights) function opens lets you directly view the whole dataset
glimpse(flights)
Rows: 336,776
Columns: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013…
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, 600, …
$ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0, 0, …
$ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924, 923,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917, 937,…
$ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4, -8, …
$ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6", "B6"…
$ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194, 112…
$ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516JB", …
$ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "LGA", …
$ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "ORD", …
$ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 361, 2…
$ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005, 247…
$ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6…
$ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0, 10, …
$ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-0…
Question 2
sortf <- arrange(flights,desc(dep_delay))
select(sortf, carrier, flight, tailnum, everything())
Question 3
not_cancelled <- flights %>%
filter(!is.na(dep_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups`
argument.
flights %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = TRUE))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups`
argument.
Question 4
library(dplyr)
avg_arr_delay <- flights %>%
filter(!is.na(arr_delay)) %>%
group_by(tailnum) %>%
summarise(avg_delay = mean(arr_delay, na.rm = TRUE))
lowest_avg_tailnum <- avg_arr_delay %>%
filter(avg_delay == min(avg_delay))
lowest_avg_tailnum
Question 5
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
)
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups`
argument.
Question 6
monthly_delay_proportion <- flights %>%
group_by(month) %>%
summarise(
total_flights = n(),
delayed_flights = sum(dep_delay > 60, na.rm = TRUE),
proportion_delayed = delayed_flights / total_flights
) %>%
arrange(desc(proportion_delayed))
print(monthly_delay_proportion)
Question 7
dest_carrier_counts <- flights %>%
group_by(dest) %>%
summarise(num_carriers = n_distinct(carrier)) %>%
arrange(desc(num_carriers))
print(dest_carrier_counts)