Q1. Explore data
# Attach nycflights13 (which supplies the `flights` data used below) and the
# tidyverse packages.
pacman::p_load(nycflights13, tidyverse)
# Alternative ways to inspect the data, left disabled:
# View(flights)
# glimpse(flights)
# Print per-column summary statistics for `flights`.
summary(flights)
Q2. Finding the flight with the longest departure delay
# Check 1: sort the flights from the longest departure delay to the shortest
# and print them with the identifying columns moved to the front.
sortf <- arrange(flights, desc(dep_delay))
select(sortf, carrier, flight, tailnum, everything())

# Check 2: confirm the top row independently by locating the maximum
# departure delay directly. Missing delays are ignored when taking the
# maximum, and which() skips them when matching.
maxdep <- max(flights$dep_delay, na.rm = TRUE)
maxdep_id <- which(flights$dep_delay == maxdep)

# Select the identifying columns by name rather than by position (10:12) so
# the result does not depend on the column order of `flights`.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
## # A tibble: 1 × 3
## carrier flight tailnum
## <chr> <int> <chr>
## 1 HA 51 N384HA
Q3. Finding the average departure delay per day
# Approach 1: keep every flight and let mean() skip the missing departure
# delays.
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay, na.rm = TRUE))

# Approach 2: first drop the flights that have no recorded departure delay,
# then average what is left. `not_cancelled` is assigned exactly once, so
# every later use of it sees this definition.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay))

not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay))
Q5. Checking the first and last departure time recorded each day
# For each calendar day, report the smallest and the largest `dep_time`
# among the flights kept in `not_cancelled`.
group_by(not_cancelled, year, month, day) %>%
  summarise(first = min(dep_time), last = max(dep_time))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by year, month, and day.
## ℹ Output is grouped by year and month.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(year, month, day))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day first last
## <int> <int> <int> <int> <int>
## 1 2013 1 1 517 2356
## 2 2013 1 2 42 2354
## 3 2013 1 3 32 2349
## 4 2013 1 4 25 2358
## 5 2013 1 5 14 2357
## 6 2013 1 6 16 2355
## 7 2013 1 7 49 2359
## 8 2013 1 8 454 2351
## 9 2013 1 9 2 2252
## 10 2013 1 10 3 2320
## # ℹ 355 more rows
Q6. Proportion of flights delayed by more than one hour, by month
# Share of flights per month whose departure delay exceeds 60 minutes,
# as a fraction and as a percentage rounded to two decimals, listed from the
# worst month to the best. Flights without a departure delay are excluded.
flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(month) %>%
  summarise(
    # The mean of a logical vector is the fraction of TRUE values.
    prop_over_1h = mean(dep_delay > 60),
    prop_pct = round(prop_over_1h * 100, 2)
  ) %>%
  arrange(desc(prop_over_1h))
## # A tibble: 12 × 3
## month prop_over_1h prop_pct
## <int> <dbl> <dbl>
## 1 7 0.134 13.4
## 2 6 0.128 12.8
## 3 12 0.0942 9.42
## 4 4 0.0916 9.16
## 5 3 0.0837 8.37
## 6 5 0.0818 8.18
## 7 8 0.0796 7.96
## 8 2 0.0698 6.98
## 9 1 0.0688 6.88
## 10 9 0.0490 4.9
## 11 10 0.0469 4.69
## 12 11 0.0402 4.02
Q7. Using `n_distinct()` to count the carriers serving each destination
# Count how many different carriers fly to each destination and show the ten
# destinations served by the most carriers.
group_by(flights, dest) %>%
  summarise(
    n_carriers = n_distinct(carrier, na.rm = TRUE)
  ) %>%
  arrange(desc(n_carriers)) %>%
  slice_head(n = 10)
## # A tibble: 10 × 2
## dest n_carriers
## <chr> <int>
## 1 ATL 7
## 2 BOS 7
## 3 CLT 7
## 4 ORD 7
## 5 TPA 7
## 6 AUS 6
## 7 DCA 6
## 8 DTW 6
## 9 IAD 6
## 10 MSP 6
Q8. Data manipulation: step by step with intermediate objects
# install.packages("ggplot2")

# Step 1: group the flights by destination.
by_dest <- group_by(flights, dest)

# Step 2: for each destination, count the flights and average the distance
# and the arrival delay, ignoring missing values.
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)

# Step 3: keep destinations with more than 20 flights and leave out "HNL".
delay <- filter(delay, dest != "HNL", count > 20)

# Step 4: plot average arrival delay against average distance, with point
# size showing the number of flights and a smoothed trend line on top.
ggplot(delay, aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Q9. Data manipulation: the same summary written with the pipe
# Per-destination flight count, mean distance and mean arrival delay,
# restricted to destinations with more than 20 flights and excluding "HNL".
delays <- group_by(flights, dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(dest != "HNL", count > 20)