# install.packages("pacman")
pacman::p_load(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Question #1
#view(flights)
#glimpse(flights)
#summary(flights)
# Question #2: Flight with the largest departure delay
#select(flights, starts_with("dep"))
#summarise(flights, delay=mean(dep_delay,na.rm=TRUE))
# Question #3
#flights %>% group_by(year, month, day) %>%
#summarise(mean = mean(dep_delay))
#not_cancelled <- flights %>%
#filter(!is.na(dep_delay), !is.na(arr_delay))
#delays <- not_cancelled %>%
# group_by(tailnum) %>%
#summarise(
#delay = mean(arr_delay) )
#not_cancelled <- flights %>%
# filter(!is.na(dep_delay))
#not_cancelled %>%
#group_by(year, month, day) %>%
#summarise(mean = mean(dep_delay))
# Question #4
# Mean arrival delay for every tail number; flights whose arr_delay is
# missing are left out of the average.
delays <- flights %>%
  group_by(tailnum) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE))

# Print the single tail number with the smallest mean arrival delay.
# NOTE(review): ascending order surfaces the lowest (most negative) average;
# if the question asks for the worst record, sort with desc() -- confirm.
delays %>%
  arrange(avg_arr_delay) %>%
  slice(1L)
## # A tibble: 1 × 2
## tailnum avg_arr_delay
## <chr> <dbl>
## 1 N560AS -53
# Question #5
#not_cancelled %>%
#group_by(year, month, day) %>%
#summarise( first = min(dep_time),
#last = max(dep_time))
# Question #6
# Share of flights in each month whose departure delay exceeded 60 minutes,
# listed from the highest share to the lowest.
flights %>%
  group_by(month) %>%
  # The mean of a logical vector is the proportion of TRUE values; flights
  # with a missing dep_delay are excluded from that proportion.
  summarise(prop = mean(dep_delay > 60, na.rm = TRUE)) %>%
  arrange(desc(prop))
## # A tibble: 12 × 2
## month prop
## <int> <dbl>
## 1 7 0.134
## 2 6 0.128
## 3 12 0.0942
## 4 4 0.0916
## 5 3 0.0837
## 6 5 0.0818
## 7 8 0.0796
## 8 2 0.0698
## 9 1 0.0688
## 10 9 0.0490
## 11 10 0.0469
## 12 11 0.0402
# Question #7
# Number of different carriers flying to each destination, with the
# destinations served by the most carriers listed first.
flights %>%
  # Reduce to one row per (dest, carrier) pair ...
  distinct(dest, carrier) %>%
  # ... so that counting rows per destination counts its carriers.
  count(dest, name = "carrier_count") %>%
  arrange(desc(carrier_count))
## # A tibble: 105 × 2
## dest carrier_count
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
## # ℹ 95 more rows
# Question #9
# Per-destination summary: number of flights, mean distance and mean arrival
# delay (missing values are ignored in both means).
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  # Keep only destinations with more than 20 flights ...
  filter(count > 20) %>%
  # ... and leave out the "HNL" destination.
  filter(dest != "HNL")