Q1. Explore data

# Attach the two packages used by every section below.
# NOTE(review): per the pacman docs, p_load() also installs any listed package
# that is missing; `flights` is presumably the table shipped with
# nycflights13 -- confirm.
pacman::p_load(nycflights13, tidyverse)
# Interactive ways to inspect the data, left disabled:
# View(flights)
# glimpse(flights)
# Print per-column summary statistics for flights.
summary(flights)

Q2. Finding the flight with the longest departure delay

# Approach 1: sort by departure delay, largest first, and move the identifying
# columns to the front so the most-delayed flight is the first printed row.
sortf <- arrange(flights, desc(dep_delay))
select(sortf, carrier, flight, tailnum, everything())

# Approach 2 (cross-check): look up the row(s) holding the maximum delay.
# na.rm = TRUE keeps missing delays from turning the maximum into NA.
maxdep <- max(flights$dep_delay, na.rm = TRUE)
# which() ignores NA comparisons, so only matching row indices are returned.
maxdep_id <- which(flights$dep_delay == maxdep)
# Select the columns by name instead of by position (10:12) so the result
# cannot silently change if the column order of the data ever differs.
flights[maxdep_id, c("carrier", "flight", "tailnum")]
## # A tibble: 1 × 3
##   carrier flight tailnum
##   <chr>    <int> <chr>  
## 1 HA          51 N384HA

Q3. Finding the average departure delay per day

# Average departure delay per day, computed two ways.

# Way 1: keep every flight and let mean() skip the missing delays.
# .groups = "drop" returns an ungrouped result instead of relying on
# summarise()'s implicit "drop the last grouping level" behaviour.
flights %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay, na.rm = TRUE), .groups = "drop")

# Way 2: remove flights without a recorded departure delay first, so mean()
# needs no na.rm. This is the only definition of not_cancelled; an earlier
# assignment that was overwritten before any use has been removed.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay))

not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(mean = mean(dep_delay), .groups = "drop")

Q5. Checking the first and last departure time of each day

# Smallest and largest dep_time recorded on each day.
# NOTE(review): values such as 517 and 2356 look like HHMM clock times --
# confirm against the nycflights13 documentation.
# .groups = "drop" makes the result explicitly ungrouped, which also removes
# the "summarise() has regrouped the output" message.
not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time),
    .groups = "drop"
  )
## # A tibble: 365 × 5
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     1     1   517  2356
##  2  2013     1     2    42  2354
##  3  2013     1     3    32  2349
##  4  2013     1     4    25  2358
##  5  2013     1     5    14  2357
##  6  2013     1     6    16  2355
##  7  2013     1     7    49  2359
##  8  2013     1     8   454  2351
##  9  2013     1     9     2  2252
## 10  2013     1    10     3  2320
## # ℹ 355 more rows

Q6. Proportion of flights delayed by more than one hour, by month

# Share of flights per month whose departure delay exceeds 60, as a fraction
# and as a percentage rounded to two decimals, largest share first.
# Flights with a missing dep_delay are excluded before the share is taken.
flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(month) %>%
  summarise(
    prop_over_1h = mean(dep_delay > 60),
    prop_pct = round(100 * prop_over_1h, 2)
  ) %>%
  arrange(desc(prop_over_1h))
## # A tibble: 12 × 3
##    month prop_over_1h prop_pct
##    <int>        <dbl>    <dbl>
##  1     7       0.134     13.4 
##  2     6       0.128     12.8 
##  3    12       0.0942     9.42
##  4     4       0.0916     9.16
##  5     3       0.0837     8.37
##  6     5       0.0818     8.18
##  7     8       0.0796     7.96
##  8     2       0.0698     6.98
##  9     1       0.0688     6.88
## 10     9       0.0490     4.9 
## 11    10       0.0469     4.69
## 12    11       0.0402     4.02

Q7. Counting distinct carriers per destination with n_distinct()

# Count the different carriers seen for each destination, then show the ten
# destinations with the most carriers. Tied destinations keep their order
# from the grouped summary.
flights %>%
  group_by(dest) %>%
  summarise(n_carriers = n_distinct(carrier, na.rm = TRUE)) %>%
  arrange(desc(n_carriers)) %>%
  head(10)
## # A tibble: 10 × 2
##    dest  n_carriers
##    <chr>      <int>
##  1 ATL            7
##  2 BOS            7
##  3 CLT            7
##  4 ORD            7
##  5 TPA            7
##  6 AUS            6
##  7 DCA            6
##  8 DTW            6
##  9 IAD            6
## 10 MSP            6

Q8. Data manipulation: delay vs. distance by destination (step by step)

# install.packages("ggplot2")

# Step 1: group the flights by destination.
by_dest <- group_by(flights, dest)

# Step 2: per destination, count the flights and average the distance and
# the arrival delay, ignoring missing values.
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)

# Step 3: keep destinations with more than 20 flights and leave out "HNL".
delay <- filter(delay, count > 20, dest != "HNL")

# Step 4: plot mean delay against mean distance; point size shows the number
# of flights, and a smoother without its confidence band is drawn on top.
ggplot(delay, aes(dist, delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Q9. Data manipulation: the same per-destination summary written with the pipe

# Per-destination flight count, mean distance and mean arrival delay built in
# a single pipeline, then restricted to destinations with more than 20
# flights, leaving out "HNL".
delays <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20) %>%
  filter(dest != "HNL")