library('nycflights13')

Exercise 5.3.1

  1. How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).

###Answer

# Attach the flights data and the tidyverse verbs used throughout the answers.
# Each library() call must be its own statement.
library(nycflights13)
library(tidyverse)

# Print the flights tibble that the exercises below operate on.
nycflights13::flights

# Flights that departed on January 1st.
flights %>% filter(month == 1 & day == 1)

# Flights that departed in November or December.
flights %>% filter(month %in% c(11, 12))

# Flights ordered by actual departure time.
flights %>% arrange(dep_time)

# Sort on the is.na() indicator first: desc() puts TRUE (missing dep_delay)
# ahead of FALSE, so the rows with a missing value come first. The column is
# referenced directly rather than through flights$, so the expression is
# evaluated on the data being arranged.
arrange(flights, desc(is.na(dep_delay)), dep_delay)

# Small example tibble with one missing value in each column.
df <- tibble(x = c(5, 2, NA), y = c(2, NA, 2))
rowSums(df)

# Two equivalent ways of moving the row with the missing x to the top.
arrange(df, desc(is.na(x)))

arrange(df, -(is.na(x)))

  2. Sort flights to find the most delayed flights. Find the flights that left earliest.

###Answer

# Ascending dep_delay: flights that left earliest relative to schedule first.
flights %>% arrange(dep_delay)

# Descending dep_delay: the most delayed departures first.
flights %>% arrange(desc(dep_delay))

  1. Sort flights to find the fastest (highest speed) flights.

# Speed is distance covered per unit of air time, so rank on that ratio
# (largest first). Sorting on air_time alone only finds the shortest flights.
arrange(flights, desc(distance / air_time))

  1. Which flights travelled the farthest? Which travelled the shortest?

###Answer

# "Farthest" and "shortest" are about distance travelled, so rank on distance
# rather than on air_time.

# Shortest trips first.
flights %>%
  arrange(distance) %>%
  select(carrier, flight, distance)

# Farthest trips first.
flights %>%
  arrange(desc(distance)) %>%
  select(carrier, flight, distance)

Exercise 5.4.1

  1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.

###Answer

# Target columns: dep_time, dep_delay, arr_time, arr_delay.
vars <- c("dep_time", "dep_delay", "arr_time", "arr_delay")

# Bare column names.
select(flights, dep_time, dep_delay, arr_time, arr_delay)

# Suffix helpers; the sched_* and air_time columns also end in "time", so
# they are dropped explicitly.
select(flights, ends_with("time"), ends_with("delay"),
       -starts_with("sched"), -air_time)

# Character vector of names; any_of() skips names that are absent.
select(flights, any_of(vars))

# Character vector of names; all_of() errors if a name is absent.
select(flights, all_of(vars))

# Column names given as strings.
select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")

# Regex match on "dep"/"arr"; "carrier" contains "arr", so it is removed
# together with the scheduled times.
select(flights, matches("dep"), matches("arr"), -matches("sched"), -carrier)

# Same idea with literal substring matching.
select(flights, contains("dep"), contains("arr"), -contains("sched"), -carrier)

# One regex anchored at the start, so sched_* and carrier are not matched.
select(flights, matches("^(dep|arr)_"))

# Columns ending in "time" or "delay", minus scheduled and air times.
select(flights, matches("time$|delay$"), -contains("sched"), -contains("air"))

# One fully anchored regex that matches exactly the four target columns.
select(flights, matches("^(dep|arr)_(time|delay)$"))

  1. What happens if you include the name of a variable multiple times in a select() call?

###Answer

# Name the same column twice in a single select() call to see how the
# duplicate is handled.
flights %>% select(dep_time, dep_time)

  3. What does the any_of() function do? Why might it be helpful in conjunction with this vector?

###Answer

vars <- c("year", "month", "day", "dep_delay", "arr_delay")

# any_of() selects the columns whose names appear in a character vector and
# ignores names that are not present in the data, which makes it convenient
# for reusing a stored vector of column names.
select(flights, any_of(vars))

  1. Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default?

###Answer

# The select helpers ignore case by default, so "TIME" still matches the
# lower-case *time* columns.
select(flights, contains("TIME"))

# Pass ignore.case = FALSE to make the match case-sensitive.
select(flights, contains("TIME", ignore.case = FALSE))

Exercise 5.5.2

  1. Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight

###Answer

# dep_time and sched_dep_time are clock times written as HHMM (e.g. 1530), so
# dividing by 60 is meaningless. Split them into hours (%/% 100) and minutes
# (%% 100) and recombine as minutes since midnight. The final %% 1440 maps
# midnight, stored as 2400, to 0 instead of 1440.
transmute(
  flights,
  deptime = (dep_time %/% 100 * 60 + dep_time %% 100) %% 1440,
  schedeptime = (sched_dep_time %/% 100 * 60 + sched_dep_time %% 100) %% 1440
)

  1. Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

###Answer

# Convert the four HHMM clock columns to minutes since midnight, then compare
# the elapsed time between departure and arrival (wrapped to one day with
# %% (60*24)) against the recorded air_time.
flights %>%
  mutate(
    across(
      c(dep_time, sched_dep_time, arr_time, sched_arr_time),
      function(hhmm) (hhmm %/% 100) * 60 + (hhmm %% 100)
    )
  ) %>%
  transmute((arr_time - dep_time) %% (60*24) - air_time)

  1. Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?

Answer

# Convert a clock time written as HHMM (e.g. 1530) into minutes since
# midnight: hours are x %/% 100, minutes are x %% 100.
hours2mins <- function(x) {
  x %/% 100 * 60 + x %% 100
}

# Recompute the departure delay from the two clock times so that it can be
# compared with the recorded dep_delay column.
select(flights, contains("dep")) %>%
  mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time))

These two numbers don't always match because we aren't accounting for flights where the actual departure falls on the day after the scheduled departure.

# Keep only the rows where the recomputed delay disagrees with dep_delay,
# then redo the calculation for them. Subtracting 2400 from the scheduled
# HHMM value before converting shifts the scheduled time back by 24 hours,
# which adds a full day (1440 minutes) to the difference.
select(flights, contains("dep")) %>%
  mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time)) %>%
  filter(dep_delay != dep_time_two) %>%
  mutate(
    dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time - 2400)
  )

4)Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

###Answer

# Rank departures from most to least delayed; min_rank() gives tied values the
# same (smallest) rank, so ties at the cut-off are all kept.
filter(flights, min_rank(desc(dep_delay)) <= 10)

# The same selection via top_n().
top_n(flights, 10, dep_delay)

5)What does 1:3 + 1:10 return? Why?

6)What trigonometric functions does R provide?

###Answer

# Overall mean departure delay.
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

# Mean departure delay per calendar day.
by_date <- group_by(flights, year, month, day)
summarise(by_date, delay = mean(dep_delay, na.rm = TRUE))

# Per destination: number of flights, mean distance and mean arrival delay.
by_dest <- group_by(flights, dest)
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)

# Drop destinations with 20 or fewer flights, and HNL.
delay <- filter(delay, count > 20, dest != "HNL")

ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)

## geom_smooth() using method = 'loess' and formula 'y ~ x'

# The same computation written with the pipe, %>%:
delay <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")

#missing values

# Flights with both a departure and an arrival delay recorded.
not_cancelled <- filter(flights, !is.na(dep_delay), !is.na(arr_delay))

# Mean arrival delay per plane, shown as a frequency polygon.
delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay)
)

ggplot(delays, aes(x = delay)) +
  geom_freqpoly(binwidth = 10)

# Redefine not_cancelled using only the arrival delay.
not_cancelled <- filter(flights, !is.na(arr_delay))

# Mean arrival delay per plane together with its number of flights.
delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay, na.rm = TRUE),
  n = n()
)

ggplot(delays, aes(x = n, y = delay)) +
  geom_point(alpha = 1/10)

Exercise 5.6.7

  1. Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios

###Answer

# Per flight number: the share of flights matching each delay scenario.
# The result must be named delay_char because the filters that follow read
# that name. Double columns are rounded to two decimals so the shares can be
# compared with values such as 0.5 and 0.99.
delay_char <- flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    fifteen_early = mean(arr_delay == -15, na.rm = TRUE),
    fifteen_late = mean(arr_delay == 15, na.rm = TRUE),
    ten_always = mean(arr_delay == 10, na.rm = TRUE),
    thirty_early = mean(arr_delay == -30, na.rm = TRUE),
    thirty_late = mean(arr_delay == 30, na.rm = TRUE),
    percentage_on_time = mean(arr_delay == 0, na.rm = TRUE),
    twohours = mean(arr_delay > 120, na.rm = TRUE)
  ) %>%
  mutate(across(where(is.double), function(x) round(x, 2)))

A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.

# Flight numbers that are 15 minutes early half the time and 15 minutes late
# the other half.
filter(delay_char, fifteen_early == 0.5 & fifteen_late == 0.5)

A flight is always 10 minutes late

# Flight numbers whose every recorded arrival delay is exactly 10 minutes.
filter(delay_char, ten_always == 1)

A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.

# Flight numbers that are 30 minutes early half the time and 30 minutes late
# the other half.
filter(delay_char, thirty_early == 0.5, thirty_late == 0.5)

99% of the time a flight is on time. 1% of the time it’s 2 hours late.

# Flight numbers that are on time 99% of the time and more than two hours
# late 1% of the time.
filter(delay_char, percentage_on_time == 0.99, twohours == 0.01)

Which is more important: arrival delay or departure delay?

  1. Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()).

not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Reference results produced with count().
not_cancelled %>% count(dest)

not_cancelled %>% count(tailnum, wt = distance)

# The same results without using count().
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = n())

not_cancelled %>%
  group_by(tailnum) %>%
  tally(wt = distance)

# or
not_cancelled %>%
  group_by(tailnum) %>%
  summarize(n = sum(distance))

  1. Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay) ) is slightly suboptimal. Why? Which is the most important column?

###Answer

Because if a flight didn’t leave then it was cancelled. If the condition is.na(dep_delay) is met, then the flight was cancelled.

4)Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?

###Answer

# Group by the full calendar date. Grouping by `day` alone would pool the same
# day-of-month across all twelve months instead of giving one point per day.
# `cancelled` is the proportion of that day's flights with no departure delay
# recorded; it is plotted against the mean departure delay (red) and the mean
# arrival delay (blue).
flights %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled = mean(is.na(dep_delay)),
    mean_dep = mean(dep_delay, na.rm = TRUE),
    mean_arr = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ggplot(aes(y = cancelled)) +
  geom_point(aes(x = mean_dep), colour = "red") +
  geom_point(aes(x = mean_arr), colour = "blue") +
  labs(x = "Avg delay per day", y = "Cancelled flights p day")

5)Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n()))

# How many distinct carriers, destinations and origins are in the data.
summarise(
  flights,
  n_car = n_distinct(carrier),
  n_air = n_distinct(dest),
  n_or = n_distinct(origin)
)

# For every carrier, compare its mean departure delay at each origin with its
# overall mean departure delay, and plot the deviation per origin.
flights %>%
  group_by(carrier) %>%
  mutate(avg_carrier = mean(dep_delay, na.rm = TRUE)) %>%
  group_by(carrier, origin) %>%
  mutate(
    origin_mean = mean(dep_delay, na.rm = TRUE),
    deviations = origin_mean - avg_carrier
  ) %>%
  summarise(
    deviations = mean(deviations),
    mean = mean(avg_carrier)
  ) %>%
  ggplot(aes(origin, deviations)) +
  geom_col() +
  facet_wrap(~ carrier)

Teasing out the effect is not straightforward, but we can make some informed guesses. For example, whenever there are substantial deviations, they seem to be higher at EWR than at the other airports. On the other hand, there are some airlines that look particularly bad, like 9E and MQ. The same pattern is not found for the vast majority of other airlines, which would suggest it's an airport issue rather than an airline issue.

# Mean departure and arrival delay for every carrier/destination pair.
summarise(
  group_by(flights, carrier, dest),
  mean_departure = mean(dep_delay, na.rm = TRUE),
  mean_arrival = mean(arr_delay, na.rm = TRUE)
)

For each plane, count the number of flights before the first delay of greater than 1 hour.

# Within each plane, order flights by time_hour and keep only the rows that
# come before the first arrival delay above 60 minutes (cumany() turns TRUE
# from that flight onwards), then count the rows kept per plane.
flights %>%
  group_by(tailnum) %>%
  arrange(time_hour) %>%
  filter(!cumany(arr_delay > 60)) %>%
  tally(sort = TRUE)

or

# Same count using a running total: the cumulative number of arrival delays
# above 60 minutes stays below 1 until the first such delay occurs.
flights %>%
  group_by(tailnum) %>%
  arrange(time_hour) %>%
  filter(cumsum(arr_delay > 60) < 1) %>%
  tally(sort = TRUE)

6)What does the sort argument to count() do. When might you use it?

###Answer

# sort = TRUE orders the counts from largest to smallest, so the most frequent
# flight numbers are listed first.
count(flights, flight, sort = TRUE)

Exercise 5.7.1

1)Refer back to the lists of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping.

2)Which plane (tailnum) has the worst on-time record?

###Answer

# Per plane: the share of flights that arrived no more than 30 minutes late
# (prop_time), the mean arrival delay, and the number of flights. The worst
# on-time record is the smallest share, so sort ascending; sorting with
# desc() would list the most punctual planes first.
flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(
    prop_time = sum(arr_delay <= 30) / n(),
    mean_arr = mean(arr_delay, na.rm = TRUE),
    fl = n()
  ) %>%
  arrange(prop_time)

All these flights are always late.

3)What time of day should you fly if you want to avoid delays as much as possible?

# Share of departures that left late (dep_delay > 0) for each scheduled hour,
# drawn as one bar per hour.
flights %>%
  group_by(hour) %>%
  filter(!is.na(dep_delay)) %>%
  summarise(delay = mean(dep_delay > 0, na.rm = TRUE)) %>%
  ggplot(aes(x = hour, y = delay, fill = delay)) +
  geom_col()

4)For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination.

# Total minutes of (positive) departure delay for each destination.
flights %>%
  group_by(dest) %>%
  filter(!is.na(dep_delay)) %>%
  summarise(tot_mins = sum(dep_delay[dep_delay > 0]))

# For each delayed flight, its share of the total delay of its destination.
# mutate() on the grouped data keeps one row per flight, so every flight's
# delay can be divided by the destination total.
flights %>%
  filter(!is.na(dep_delay), dep_delay > 0) %>%
  group_by(dest) %>%
  mutate(
    tot_mins = sum(dep_delay),
    prop_delay = dep_delay / tot_mins
  ) %>%
  ungroup() %>%
  select(carrier, flight, tailnum, dest, dep_delay, tot_mins, prop_delay) %>%
  arrange(desc(prop_delay))

5)Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag(), explore how the delay of a flight is related to the delay of the immediately preceding flight.

# Build a scheduled-departure timestamp, order the flights of each origin by
# it, and pair every flight's departure delay with the delay of the flight
# scheduled immediately before it from the same origin.
flights %>%
  mutate(
    new_sched_dep_time = lubridate::make_datetime(year, month, day, hour, minute)
  ) %>%
  group_by(origin) %>%
  arrange(new_sched_dep_time) %>%
  mutate(prev_flight_dep_delay = lag(dep_delay)) %>%
  ggplot(aes(x = prev_flight_dep_delay, y = dep_delay)) +
  geom_point()

###Answer

6)Look at each destination. Can you find flights that are suspiciously fast? (i.e. flights that represent a potential data entry error). Compute the air time of a flight relative to the shortest flight to that destination. Which flights were most delayed in the air?

(1)

# (1) The five shortest air times for each destination: candidates for
# suspiciously fast flights.
flights %>%
  group_by(dest) %>%
  arrange(air_time) %>%
  slice(1:5) %>%
  select(tailnum, sched_dep_time, sched_arr_time, air_time) %>%
  arrange(air_time)

# (2) For each destination, the flight with the longest air time, together
# with how much longer it took than the shortest flight to that destination.
flights %>%
  group_by(dest) %>%
  mutate(shortest = air_time - min(air_time, na.rm = TRUE)) %>%
  top_n(1, air_time) %>%
  arrange(-air_time) %>%
  select(tailnum, sched_dep_time, sched_arr_time, shortest)

7)Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.

# Keep destinations served by at least two carriers (>= 2; the strict > 2
# would drop destinations with exactly two), then rank carriers by how many
# of those destinations they fly to.
flights %>%
  group_by(dest) %>%
  filter(n_distinct(carrier) >= 2) %>%
  group_by(carrier) %>%
  summarise(n = n_distinct(dest)) %>%
  arrange(-n)