#Question 1
library(nycflights13)
dim(flights)
## [1] 336776 19
#Question 2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sortf <- arrange(flights, desc(dep_delay))
select(sortf, carrier, flight, tailnum, everything())
## # A tibble: 336,776 × 19
## carrier flight tailnum year month day dep_time sched_dep_time dep_delay
## <chr> <int> <chr> <int> <int> <int> <int> <int> <dbl>
## 1 HA 51 N384HA 2013 1 9 641 900 1301
## 2 MQ 3535 N504MQ 2013 6 15 1432 1935 1137
## 3 MQ 3695 N517MQ 2013 1 10 1121 1635 1126
## 4 AA 177 N338AA 2013 9 20 1139 1845 1014
## 5 MQ 3075 N665MQ 2013 7 22 845 1600 1005
## 6 DL 2391 N959DL 2013 4 10 1100 1900 960
## 7 DL 2119 N927DA 2013 3 17 2321 810 911
## 8 DL 2007 N3762Y 2013 6 27 959 1900 899
## 9 DL 2047 N6716C 2013 7 22 2257 759 898
## 10 AA 172 N5DMAA 2013 12 5 756 1700 896
## # ℹ 336,766 more rows
## # ℹ 10 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
maxdep <- max(flights$dep_delay, na.rm=TRUE)
maxdep_id <- which(flights$dep_delay == maxdep)
flights[maxdep_id, 10:12]
## # A tibble: 1 × 3
## carrier flight tailnum
## <chr> <int> <chr>
## 1 HA 51 N384HA
#Question 3
not_cancelled <- flights %>%
filter(!is.na(dep_delay))
not_cancelled %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
flights %>%
group_by(year, month, day) %>%
summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day mean
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ℹ 355 more rows
#Question 4
library(dplyr)
library(nycflights13)
not_cancelled <- flights %>%
filter(!is.na(arr_delay)) # remove cancelled flights
avg_delay <- not_cancelled %>%
group_by(tailnum) %>%
summarise(mean_arr_delay = mean(arr_delay))
lowest_delay <- avg_delay %>%
arrange(mean_arr_delay) %>%
slice(1)
lowest_delay
## # A tibble: 1 × 2
## tailnum mean_arr_delay
## <chr> <dbl>
## 1 N560AS -53
#Question 5
not_cancelled %>%
group_by(year, month, day) %>%
summarise(
first = min(dep_time),
last = max(dep_time)
) %>%
arrange(desc(last))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 2 7 27 2400
## 2 2013 2 11 1 2400
## 3 2013 3 15 11 2400
## 4 2013 3 22 37 2400
## 5 2013 3 25 13 2400
## 6 2013 4 2 9 2400
## 7 2013 4 4 14 2400
## 8 2013 4 20 7 2400
## 9 2013 5 21 110 2400
## 10 2013 6 17 2 2400
## # ℹ 355 more rows
#Question 6
library(dplyr)
library(nycflights13)
flights %>%
group_by(month) %>%
summarise(
prop_over_hour = mean(dep_delay > 60, na.rm = TRUE)
) %>%
arrange(desc(prop_over_hour))
## # A tibble: 12 × 2
## month prop_over_hour
## <int> <dbl>
## 1 7 0.134
## 2 6 0.128
## 3 12 0.0942
## 4 4 0.0916
## 5 3 0.0837
## 6 5 0.0818
## 7 8 0.0796
## 8 2 0.0698
## 9 1 0.0688
## 10 9 0.0490
## 11 10 0.0469
## 12 11 0.0402
#Question 7
library(dplyr)
library(nycflights13)
flights %>%
group_by(dest) %>%
summarise(num_carriers = n_distinct(carrier)) %>%
arrange(desc(num_carriers))
## # A tibble: 105 × 2
## dest num_carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ℹ 95 more rows
library(ggplot2)
#Question 8
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE))
delay <- filter(delay, count > 20, dest != "HNL")
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
#Question 9
library(dplyr)
library(nycflights13)
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
# View the summarized dataset
print(delays)
## # A tibble: 96 × 4
## dest count dist delay
## <chr> <int> <dbl> <dbl>
## 1 ABQ 254 1826 4.38
## 2 ACK 265 199 4.85
## 3 ALB 439 143 14.4
## 4 ATL 17215 757. 11.3
## 5 AUS 2439 1514. 6.02
## 6 AVL 275 584. 8.00
## 7 BDL 443 116 7.05
## 8 BGR 375 378 8.03
## 9 BHM 297 866. 16.9
## 10 BNA 6333 758. 11.8
## # ℹ 86 more rows