Use this homework to practice what you’ve learned in session 1:

Creating a tidy dataset
Applying key dplyr functions

There are 7 questions, each followed by a blank chunk (grey box) for you to type your code into. Type your answer to each question in the chunk directly below it. When you are done, knit your notebook to HTML by selecting the drop-down arrow next to the gear symbol in the top left and choosing Knit to HTML.

For a refresher on interacting with RMarkdown documents, check out this video.

For this homework we will use the flights data from the nycflights13 package. The dataset contains all 336,776 flights that departed from New York City in 2013. The key columns we will use for this exercise are:

We will use glimpse to see what data is available.

# Print a column-by-column overview (name, type, first values) of flights.
flights %>%
  glimpse()
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, ~
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, ~
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1~
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,~
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,~
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1~
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "~
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4~
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394~
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",~
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",~
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1~
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, ~
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6~
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0~
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0~

Q1. Filter the dataset to just those flights going to Atlanta, dest is equal to ATL.

# Q1: keep only the rows whose destination code is "ATL".
filter(flights, dest == "ATL")

Q2. How many observations, rows, are in the dataset?

# Q2: glimpse the ATL-only subset; the "Rows:" line of the output gives the
# number of observations.
glimpse(filter(flights, dest == "ATL"))
## Rows: 17,215
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time       <int> 554, 600, 606, 615, 658, 754, 807, 814, 830, 855, 857, ~
## $ sched_dep_time <int> 600, 600, 610, 615, 700, 759, 810, 810, 835, 859, 900, ~
## $ dep_delay      <dbl> -6, 0, -4, 0, -2, -5, -3, 4, -5, -4, -3, -15, -4, -2, -~
## $ arr_time       <int> 812, 837, 837, 833, 944, 1039, 1043, 1047, 1052, 1143, ~
## $ sched_arr_time <int> 837, 825, 845, 842, 939, 1041, 1043, 1030, 1105, 1145, ~
## $ arr_delay      <dbl> -25, 12, -8, -9, 5, -2, 0, 17, -13, -2, -9, 6, 0, 2, -1~
## $ carrier        <chr> "DL", "MQ", "DL", "DL", "DL", "DL", "DL", "FL", "MQ", "~
## $ flight         <int> 461, 4650, 1743, 575, 1547, 2047, 269, 346, 4610, 1747,~
## $ tailnum        <chr> "N668DN", "N542MQ", "N3739P", "N326NB", "N6703D", "N935~
## $ origin         <chr> "LGA", "LGA", "JFK", "EWR", "LGA", "LGA", "JFK", "LGA",~
## $ dest           <chr> "ATL", "ATL", "ATL", "ATL", "ATL", "ATL", "ATL", "ATL",~
## $ air_time       <dbl> 116, 134, 128, 120, 126, 126, 126, 132, 123, 129, 125, ~
## $ distance       <dbl> 762, 762, 760, 746, 762, 762, 760, 762, 762, 762, 746, ~
## $ hour           <dbl> 6, 6, 6, 6, 7, 7, 8, 8, 8, 8, 9, 9, 10, 10, 10, 11, 12,~
## $ minute         <dbl> 0, 0, 10, 15, 0, 59, 10, 10, 35, 59, 0, 55, 0, 23, 59, ~
## $ time_hour      <dttm> 2013-01-01 06:00:00, 2013-01-01 06:00:00, 2013-01-01 0~
# 17,215 observations/instances and 19 features 

Q3. How many unique carrier variables are in the full dataset?

# Number of flights per carrier (one row per carrier).
flights %>%
  count(carrier, name = "carrier_count")

# The distinct carrier codes themselves.
flights %>%
  distinct(carrier)

# Q3: count the distinct rows of the one-column carrier table.
x <- select(flights, carrier)

n_distinct(x)
## [1] 16

Q4. What is the average arrival delay, arr_delay, across all flights?

# Q4: mean arrival delay over all flights; na.rm = TRUE skips the flights
# whose arr_delay is missing.
flights %>%
  summarise(avg_delay = mean(arr_delay, na.rm = TRUE))

Q5. What is the average arrival delay, arr_delay, of flights that departed on time?

# Q5: mean arrival delay of the flights that departed exactly on schedule.
# dep_delay is a numeric column, so it is compared with the number 0; comparing
# with the string "0" only works through implicit numeric-to-character
# coercion. Rows with a missing dep_delay are dropped by filter(), and
# na.rm = TRUE skips flights whose arr_delay is missing.
flights %>%
  filter(dep_delay == 0) %>%
  summarise(avg_delay = mean(arr_delay, na.rm = TRUE))

Q6. Add a new column in the dataset that represents a total flight delay, dep_delay minus arr_delay.

# Q6: add total_flight_delays, defined by the exercise as dep_delay minus
# arr_delay, and store the extended table as flights_updated.
flights_updated <- mutate(
  flights,
  total_flight_delays = dep_delay - arr_delay
)

Q7. Create a dataset for those flights that left late. Exclude flights that arrived on time. What is the average total flight delay across these carriers?

# Q7: keep flights with a positive departure delay and a positive arrival
# delay. Comma-separated filter() conditions are combined with AND.
flights_updatedv1 <- filter(flights_updated, dep_delay > 0, arr_delay > 0)

# Mean of total_flight_delays over that subset, ignoring missing values.
flights_updatedv1 %>%
  summarise(avg_total_flight_delays = mean(total_flight_delays, na.rm = TRUE))