Use this homework to practice what you’ve learned in session 1:
Creating a tidy dataset
Applying key dplyr functions
There are 7 questions and a blank chunk (grey box) for you to type your code into. You can type your answer to any question under the question. When you are done, knit your notebook to HTML by selecting the drop-down arrow next to the gear symbol in the top left and choosing Knit to HTML.
For a refresher on interacting with RMarkdown documents, check out this video.
For this homework we will use flights data from the
nycflights13 package. The dataset contains all 336,776
flights that departed from New York City in 2013. The key columns we
will use for this exercise are:
- dep_delay: The number of minutes past the scheduled
  departing time (i.e. how long the flight was late departing)
- arr_delay: The number of minutes past the scheduled
  arrival time (i.e. how long the flight was late arriving)
- carrier: The 2-letter code of an airline; you
  can use this site to look up codes
- dest: Airport code of the flight’s destination

We will use glimpse to see what data is available.
# Preview every column of `flights` together with its type and leading values.
flights %>%
  glimpse()
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, ~
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, ~
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1~
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,~
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,~
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1~
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "~
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4~
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394~
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",~
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",~
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1~
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, ~
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6~
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0~
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0~
Q1. Filter the dataset to just those flights going to Atlanta,
dest is equal to ATL.
# Q1: keep only the flights whose destination airport code is "ATL".
filter(flights, dest == "ATL")
Q2. How many observations, rows, are in the dataset?
# Q2: glimpse() prints the row and column counts of the ATL-only subset
# as the first two lines of its output.
glimpse(filter(flights, dest == "ATL"))
## Rows: 17,215
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time <int> 554, 600, 606, 615, 658, 754, 807, 814, 830, 855, 857, ~
## $ sched_dep_time <int> 600, 600, 610, 615, 700, 759, 810, 810, 835, 859, 900, ~
## $ dep_delay <dbl> -6, 0, -4, 0, -2, -5, -3, 4, -5, -4, -3, -15, -4, -2, -~
## $ arr_time <int> 812, 837, 837, 833, 944, 1039, 1043, 1047, 1052, 1143, ~
## $ sched_arr_time <int> 837, 825, 845, 842, 939, 1041, 1043, 1030, 1105, 1145, ~
## $ arr_delay <dbl> -25, 12, -8, -9, 5, -2, 0, 17, -13, -2, -9, 6, 0, 2, -1~
## $ carrier <chr> "DL", "MQ", "DL", "DL", "DL", "DL", "DL", "FL", "MQ", "~
## $ flight <int> 461, 4650, 1743, 575, 1547, 2047, 269, 346, 4610, 1747,~
## $ tailnum <chr> "N668DN", "N542MQ", "N3739P", "N326NB", "N6703D", "N935~
## $ origin <chr> "LGA", "LGA", "JFK", "EWR", "LGA", "LGA", "JFK", "LGA",~
## $ dest <chr> "ATL", "ATL", "ATL", "ATL", "ATL", "ATL", "ATL", "ATL",~
## $ air_time <dbl> 116, 134, 128, 120, 126, 126, 126, 132, 123, 129, 125, ~
## $ distance <dbl> 762, 762, 760, 746, 762, 762, 760, 762, 762, 762, 746, ~
## $ hour <dbl> 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10, 11, 12,~
## $ minute <dbl> 0, 0, 10, 15, 0, 59, 10, 10, 35, 59, 0, 55, 0, 23, 59, ~
## $ time_hour <dttm> 2013-01-01 06:00:00, 2013-01-01 06:00:00, 2013-01-01 0~
# 17,215 observations/instances and 19 features
Q3. How many unique carrier variables are in the full
dataset?
# Q3: three views of the carriers present in the full dataset.

# 1. Number of flights recorded for each carrier (one row per carrier).
count(flights, carrier, name = "carrier_count")

# 2. The distinct carrier codes themselves.
flights %>%
  distinct(carrier)

# 3. The number of distinct carriers as a single value.
x <- select(flights, carrier)
n_distinct(x)
## [1] 16
Q4. What is the average arrival delay, arr_delay, across
all flights?
# Q4: mean arrival delay over all flights; na.rm = TRUE skips the flights
# that have no recorded arr_delay.
summarise(flights, avg_delay = mean(arr_delay, na.rm = TRUE))
Q5. What is the average arrival delay, arr_delay, of
flights that departed on time?
# Q5: mean arrival delay of the flights that departed exactly on schedule.
# dep_delay is numeric (<dbl>), so it is compared with the number 0 rather
# than the string "0", which only matched through implicit type coercion.
# Rows without a recorded arr_delay are dropped before averaging.
flights %>%
  filter(dep_delay == 0, !is.na(arr_delay)) %>%
  summarise(avg_delay = mean(arr_delay))
Q6. Add a new column in the dataset that represents a total flight
delay, dep_delay minus arr_delay.
# Q6: add total_flight_delays, defined as dep_delay minus arr_delay, as a
# new column and store the result in flights_updated.
flights_updated <- mutate(
  flights,
  total_flight_delays = dep_delay - arr_delay
)
Q7. Create a dataset for those flights that left late. Exclude flights that arrived on time. What is the average total flight delay across these carriers?
# Q7: keep the flights that departed late (dep_delay > 0) and also arrived
# late (arr_delay > 0).
flights_updatedv1 <- filter(flights_updated, dep_delay > 0, arr_delay > 0)

# Mean of total_flight_delays over that subset, ignoring missing values.
summarise(
  flights_updatedv1,
  avg_total_flight_delays = mean(total_flight_delays, na.rm = TRUE)
)