library('nycflights13')
###Answer
# Attach the data package and the tidyverse; one statement per line so the
# script parses.
library(nycflights13)
library(tidyverse)

# Print the flights tibble to inspect it.
nycflights13::flights

# Flights on January 1st, then flights in November or December.
filter(flights, month == 1, day == 1)
filter(flights, month == 11 | month == 12)

# Sort by dep_time (ascending).
arrange(flights, dep_time)

# Put rows with a missing dep_delay first, then order by dep_delay; tail()
# shows the last rows of that ordering.
tail(arrange(flights, desc(is.na(dep_delay)), dep_delay))

# Toy tibble with a missing value in each column.
df <- tibble(x = c(5, 2, NA), y = c(2, NA, 2))
rowSums(df)

# Two equivalent ways to sort the rows with a missing x to the top.
arrange(df, desc(is.na(x)))
arrange(df, -(is.na(x)))
# 2) Sort flights to find the most delayed flights. Find the flights that
# left earliest.
### Answer
# Ascending dep_delay: the flights that left earliest relative to schedule.
arrange(flights, dep_delay)
# Descending dep_delay: the most delayed flights.
arrange(flights, desc(dep_delay))
# Ascending air_time: the shortest time in the air first.
arrange(flights, air_time)

### Answer
# Smallest air_time first, keeping only the identifying columns.
flights %>%
  arrange(air_time) %>%
  select(carrier, flight, air_time)
# Largest air_time first.
flights %>%
  arrange(-air_time) %>%
  select(carrier, flight, air_time)
### Answer
# Different selection helpers aimed at the departure/arrival time and delay
# columns. They do not all return exactly the same set of columns.
vars <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
select(flights, dep_time, dep_delay, arr_time, arr_delay)
select(flights, ends_with("time"), ends_with("delay"))
# Character vectors of names go through any_of() / all_of(); a named argument
# such as `.dots = vars` would rename the selected columns instead.
select(flights, any_of(vars))
select(flights, all_of(vars))
select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")
# "arr" also matches `carrier`, hence the explicit -carrier.
select(flights, matches("dep"), matches("arr"), -matches("sched"), -carrier)
select(flights, contains("dep"), contains("arr"), -contains("sched"), -carrier)
select(flights, matches("dep|arr"))
# Names ending in "time" or "delay", minus the scheduled and air-time columns.
select(flights, matches("time$|delay$"), -contains("sched"), -contains("air"))
select(flights, matches("^dep|arr_delay|time$"))
### Answer
# The same column is named twice in a single select() call.
select(flights, dep_time, dep_time)

# 3) What does the any_of() function do? Why might it be helpful in
# conjunction with this vector?
### Answer
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
# any_of() selects the columns named in a character vector and skips names
# that are not present in the data.
select(flights, any_of(vars))

### Answer
# contains() ignores case by default, so "TIME" still matches *_time columns.
select(flights, contains("TIME"))
### Answer
# Convert a clock time stored as an HHMM number (e.g. 1530) into minutes
# since midnight (930): whole hours times 60 plus the remaining minutes.
hours2mins <- function(x) {
  x %/% 100 * 60 + x %% 100
}

# dep_time and sched_dep_time are HHMM numbers, so dividing them by 60 does
# not give a usable duration; convert them with hours2mins() instead.
transmute(
  flights,
  deptime = hours2mins(dep_time),
  schedeptime = hours2mins(sched_dep_time)
)

### Answer
# Difference between the arrival/departure gap (wrapped at 24 h for flights
# that land after midnight) and the recorded air_time.
flights %>%
  mutate(
    dep_time = hours2mins(dep_time),
    sched_dep_time = hours2mins(sched_dep_time),
    arr_time = hours2mins(arr_time),
    sched_arr_time = hours2mins(sched_arr_time)
  ) %>%
  transmute((arr_time - dep_time) %% (60 * 24) - air_time)

# Recompute the departure delay from the two clock times.
select(flights, contains("dep")) %>%
  mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time))

# For rows where the recomputed delay disagrees with dep_delay, shift the
# scheduled time back by 2400 (one day in HHMM terms) and recompute.
select(flights, contains("dep")) %>%
  mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time)) %>%
  filter(dep_delay != dep_time_two) %>%
  mutate(
    dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time - 2400)
  )
# 4) Find the 10 most delayed flights using a ranking function. How do you
# want to handle ties? Carefully read the documentation for min_rank().
### Answer
# Rank by descending dep_delay and keep ranks 1 to 10.
flights %>%
  filter(min_rank(-(dep_delay)) %in% 1:10)
# Same idea via top_n().
flights %>%
  top_n(10, dep_delay)

# 5) What does 1:3 + 1:10 return? Why?
# 6) What trigonometric functions does R provide?
### Answer
# Mean departure delay over all flights.
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))

# Mean departure delay per calendar day.
by_date <- group_by(flights, year, month, day)
summarise(by_date, delay = mean(dep_delay, na.rm = TRUE))

# Per destination: number of flights, mean distance and mean arrival delay.
by_dest <- group_by(flights, dest)
delay <- summarise(
  by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
# Keep destinations with more than 20 flights and drop HNL.
delay <- filter(delay, count > 20, dest != "HNL")

ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1 / 3) +
  geom_smooth(se = FALSE)
## geom_smooth() using method = 'loess' and formula 'y ~ x'

# The same preparation written with the pipe, %>%:
delay <- flights %>%
  group_by(dest) %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dest != "HNL")
# Missing values
# Keep only flights that have both a departure and an arrival delay recorded.
not_cancelled <- filter(flights, !is.na(dep_delay), !is.na(arr_delay))

# Mean arrival delay per tail number, shown as a frequency polygon.
delays <- summarise(group_by(not_cancelled, tailnum), delay = mean(arr_delay))
ggplot(delays, aes(x = delay)) +
  geom_freqpoly(binwidth = 10)

# Same idea, filtering on arr_delay only and also keeping the flight count,
# so that mean delay can be plotted against the number of flights.
not_cancelled <- filter(flights, !is.na(arr_delay))
delays <- summarise(
  group_by(not_cancelled, tailnum),
  delay = mean(arr_delay, na.rm = TRUE),
  n = n()
)
ggplot(delays, aes(x = n, y = delay)) +
  geom_point(alpha = 1 / 10)
### Answer
# Per flight number: the share of arrivals that match each delay pattern,
# with every double column rounded to two decimals.
# Base is.double() is used as the predicate so the step does not rely on a
# predicate re-exported by another package.
delay_char <- flights %>%
  group_by(flight) %>%
  summarise(
    n = n(),
    fifteen_early = mean(arr_delay == -15, na.rm = TRUE),
    fifteen_late = mean(arr_delay == 15, na.rm = TRUE),
    ten_always = mean(arr_delay == 10, na.rm = TRUE),
    thirty_early = mean(arr_delay == -30, na.rm = TRUE),
    thirty_late = mean(arr_delay == 30, na.rm = TRUE),
    percentage_on_time = mean(arr_delay == 0, na.rm = TRUE),
    twohours = mean(arr_delay > 120, na.rm = TRUE)
  ) %>%
  map_if(is.double, round, 2) %>%
  as_tibble()

# The shares are doubles, so compare them with near() rather than `==`.

# A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of
# the time.
delay_char %>%
  filter(near(fifteen_early, 0.5), near(fifteen_late, 0.5))

# A flight is always 10 minutes late.
delay_char %>%
  filter(near(ten_always, 1))

# A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of
# the time.
delay_char %>%
  filter(near(thirty_early, 0.5) & near(thirty_late, 0.5))

# 99% of the time a flight is on time. 1% of the time it's 2 hours late.
delay_char %>%
  filter(near(percentage_on_time, 0.99) & near(twohours, 0.01))
# Which is more important: arrival delay or departure delay?
# Keep flights that have both delays recorded.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))

# Number of flights per destination ...
not_cancelled %>% count(dest)
# and total distance per tail number.
not_cancelled %>% count(tailnum, wt = distance)

# (without using count()).
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = n())

not_cancelled %>%
  group_by(tailnum) %>%
  tally(wt = distance)
# or
not_cancelled %>%
  group_by(tailnum) %>%
  summarize(n = sum(distance))

# Because if a flight didn't leave then it was cancelled. If the condition
# is.na(dep_delay) is met, then the flight was cancelled.
# 4) Look at the number of cancelled flights per day. Is there a pattern? Is
# the proportion of cancelled flights related to the average delay?
### Answer
# Group by the full date: `day` alone is the day of the month and would pool
# the same day number across all twelve months.
# Red points use the mean departure delay on the x axis, blue points the mean
# arrival delay; y is the share of flights with a missing dep_delay.
flights %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled = mean(is.na(dep_delay)),
    mean_dep = mean(dep_delay, na.rm = TRUE),
    mean_arr = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ggplot(aes(y = cancelled)) +
  geom_point(aes(x = mean_dep), colour = "red") +
  geom_point(aes(x = mean_arr), colour = "blue") +
  labs(x = "Avg delay per day", y = "Cancelled flights p day")
# 5) Which carrier has the worst delays? Challenge: can you disentangle the
# effects of bad airports vs. bad carriers? Why/why not? (Hint: think about
# flights %>% group_by(carrier, dest) %>% summarise(n()))

# Number of distinct carriers, destinations and origins.
flights %>%
  summarise(
    n_car = n_distinct(carrier),
    n_air = n_distinct(dest),
    n_or = n_distinct(origin)
  )

# For each carrier, how far the mean departure delay at each origin deviates
# from that carrier's overall mean; one facet per carrier.
flights %>%
  group_by(carrier) %>%
  mutate(avg_carrier = mean(dep_delay, na.rm = TRUE)) %>%
  group_by(carrier, origin) %>%
  mutate(
    origin_mean = mean(dep_delay, na.rm = TRUE),
    deviations = origin_mean - avg_carrier
  ) %>%
  summarise(deviations = mean(deviations), mean = mean(avg_carrier)) %>%
  ggplot(aes(origin, deviations)) +
  geom_col() +
  facet_wrap(~ carrier)

# Teasing out the effect is not straightforward, but we can make some
# informed guesses. For example, whenever there are substantial deviations,
# they seem to be higher at EWR than at the other airports. On the other
# hand, some airlines look particularly bad, like 9E and MQ. The same pattern
# is not found for the vast majority of other airlines, which would suggest
# it's an airport issue rather than an airline issue.

# Mean departure and arrival delay per carrier/destination pair.
flights %>%
  group_by(carrier, dest) %>%
  summarise(
    mean_departure = mean(dep_delay, na.rm = TRUE),
    mean_arrival = mean(arr_delay, na.rm = TRUE)
  )
# For each plane, count the number of flights before the first delay of
# greater than 1 hour.
# Order by time_hour, then keep each plane's rows until cumany() first sees
# an arrival delay above 60 minutes; tally() counts what is left per plane.
flights %>%
  group_by(tailnum) %>%
  arrange(time_hour) %>%
  mutate(cumulative = !cumany(arr_delay > 60)) %>%
  filter(cumulative) %>%
  tally(sort = TRUE)

# or, using a running count of long delays instead of cumany():
flights %>%
  group_by(tailnum) %>%
  arrange(time_hour) %>%
  mutate(cum = arr_delay > 60, cum_any = cumsum(cum)) %>%
  filter(cum_any < 1) %>%
  tally(sort = TRUE)

# 6) What does the sort argument to count() do? When might you use it?
### Answer
# sort = TRUE puts the largest counts first.
flights %>% count(flight, sort = TRUE)
# 1) Refer back to the lists of useful mutate and filtering functions.
# Describe how each operation changes when you combine it with grouping.
# 2) Which plane (tailnum) has the worst on-time record?
### Answer
# prop_time is the share of a plane's arrivals that are at most 30 minutes
# late. Sort ascending so the planes with the lowest share (the worst record)
# come first; a descending sort would list the best planes instead.
flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(
    prop_time = sum(arr_delay <= 30) / n(),
    mean_arr = mean(arr_delay, na.rm = TRUE),
    fl = n()
  ) %>%
  arrange(prop_time)
# All these flights are always late.
# 3) What time of day should you fly if you want to avoid delays as much as
# possible?
# Share of flights with a positive departure delay for each scheduled hour.
flights %>%
  group_by(hour) %>%
  filter(!is.na(dep_delay)) %>%
  summarise(delay = mean(dep_delay > 0, na.rm = TRUE)) %>%
  ggplot(aes(hour, delay, fill = delay)) +
  geom_col()
# 4) For each destination, compute the total minutes of delay. For each
# flight, compute the proportion of the total delay for its destination.
# Total positive departure-delay minutes per destination.
flights %>%
  group_by(dest) %>%
  filter(!is.na(dep_delay)) %>%
  summarise(tot_mins = sum(dep_delay[dep_delay > 0]))

# Share of delayed departures (m) and number of flights (n) per plane and
# destination, most often delayed first.
# NOTE(review): this is not each flight's proportion of its destination's
# total delay, as the question asks -- confirm the intended answer.
flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(tailnum, dest) %>%
  summarise(m = mean(dep_delay > 0), n = n()) %>%
  arrange(desc(m))
# 5) Delays are typically temporally correlated: even once the problem that
# caused the initial delay has been resolved, later flights are delayed to
# allow earlier flights to leave. Using lag(), explore how the delay of a
# flight is related to the delay of the immediately preceding flight.
# Build a scheduled-departure datetime, order by it, and within each origin
# pair every flight's dep_delay with the previous row's dep_delay.
flights %>%
  mutate(
    new_sched_dep_time = lubridate::make_datetime(year, month, day, hour, minute)
  ) %>%
  group_by(origin) %>%
  arrange(new_sched_dep_time) %>%
  mutate(prev_flight_dep_delay = lag(dep_delay)) %>%
  ggplot(aes(x = prev_flight_dep_delay, y = dep_delay)) +
  geom_point()
### Answer
# 6) Look at each destination. Can you find flights that are suspiciously
# fast? (i.e. flights that represent a potential data entry error). Compute
# the air time of a flight relative to the shortest flight to that
# destination. Which flights were most delayed in the air?

# (1) The five smallest air_time rows per destination, overall fastest first.
flights %>%
  group_by(dest) %>%
  arrange(air_time) %>%
  slice(1:5) %>%
  select(tailnum, sched_dep_time, sched_arr_time, air_time) %>%
  arrange(air_time)

# (2) `shortest` is the extra air time over the destination's minimum; keep
# the row(s) with the largest air_time per destination.
flights %>%
  group_by(dest) %>%
  mutate(shortest = air_time - min(air_time, na.rm = TRUE)) %>%
  top_n(1, air_time) %>%
  arrange(-air_time) %>%
  select(tailnum, sched_dep_time, sched_arr_time, shortest)
# 7) Find all destinations that are flown by at least two carriers. Use that
# information to rank the carriers.
# "At least two" is >= 2; `> 2` would drop destinations served by exactly
# two carriers. Carriers are then ranked by how many such destinations they
# serve.
flights %>%
  group_by(dest) %>%
  filter(n_distinct(carrier) >= 2) %>%
  group_by(carrier) %>%
  summarise(n = n_distinct(dest)) %>%
  arrange(-n)