Sections 5.1 & 5.2: Introduction, Prerequisites, nycflights13, dplyr Basics, Filter Rows with filter(), Comparisons, Logical Operators, Missing Values.

1.1 Find all flights that had an arrival delay of two or more hours

#the first argument is the name of teh data frame, which is flights. I'm trying to find all rows that has arr_delay equals to or over 120.
filter(flights, arr_delay >= 120)
## # A tibble: 10,200 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # … with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

1.2 Find all flights that flew to Houston

#Filter out all flights of which destination equals to IAH or HOU
filter(flights, dest == 'IAH' | dest == 'HOU')
## # A tibble: 9,313 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      623            627        -4      933            932
##  4  2013     1     1      728            732        -4     1041           1038
##  5  2013     1     1      739            739         0     1104           1038
##  6  2013     1     1      908            908         0     1228           1219
##  7  2013     1     1     1028           1026         2     1350           1339
##  8  2013     1     1     1044           1045        -1     1352           1351
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1205           1200         5     1503           1505
## # … with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

1.3 Find all flights that were operated by United, American, or Delta

#Filter out all flights of which carrier equals to AA, UA or DL
filter(flights, carrier %in% c('AA','UA','DL'))
## # A tibble: 139,504 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      554            600        -6      812            837
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            600        -1      941            910
## 10  2013     1     1      559            600        -1      854            902
## # … with 139,494 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

1.4 Find all flights that departed in summer

#Filter out all flights of which month equals to 7 or 8 or 9
filter(flights, month %in% c(7,8,9))
## # A tibble: 86,326 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

1.5 Find all flights that arrived more than two hours late, but didn’t leave late.

#Filter out all flights of which the arrival delay is over 120, but the depart delay is less than or equal to 0
filter(flights, arr_delay > 120 & dep_delay <= 0)
## # A tibble: 29 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    27     1419           1420        -1     1754           1550
##  2  2013    10     7     1350           1350         0     1736           1526
##  3  2013    10     7     1357           1359        -2     1858           1654
##  4  2013    10    16      657            700        -3     1258           1056
##  5  2013    11     1      658            700        -2     1329           1015
##  6  2013     3    18     1844           1847        -3       39           2219
##  7  2013     4    17     1635           1640        -5     2049           1845
##  8  2013     4    18      558            600        -2     1149            850
##  9  2013     4    18      655            700        -5     1213            950
## 10  2013     5    22     1827           1830        -3     2217           2010
## # … with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

1.6 Find all flights that were delayed by at least an hour, but made up over 30 minutes in flight.

#Filter out all flights of which depart delay is over or equals to 60, but the arrival delay is less than 30, because it made up more than 30 minutes in flight.
filter(flights, dep_delay >= 60 & arr_delay < 30)
## # A tibble: 206 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     3     1850           1745        65     2148           2120
##  2  2013     1     3     1950           1845        65     2228           2227
##  3  2013     1     3     2015           1915        60     2135           2111
##  4  2013     1     6     1019            900        79     1558           1530
##  5  2013     1     7     1543           1430        73     1758           1735
##  6  2013     1    11     1020            920        60     1311           1245
##  7  2013     1    12     1706           1600        66     1949           1927
##  8  2013     1    12     1953           1845        68     2154           2137
##  9  2013     1    19     1456           1355        61     1636           1615
## 10  2013     1    21     1531           1430        61     1843           1815
## # … with 196 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3 How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

#Filter out all flights of which dep_time is NA, and use the count function to count the total number.
filter(flights, is.na(dep_time))
## # A tibble: 8,255 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # … with 8,245 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
count(filter(flights, is.na(dep_time)))
## # A tibble: 1 × 1
##       n
##   <int>
## 1  8255
  • There are 8255 flights that have a missing dep_time. Other missing values are dep_delay, arr_time, arr_delay, air_time, and some tailnum. These rows might represent the cancelled flight that never took off. Therefore there were no departure and arrival time documented.

Sections 5.3: Arrange Rows with Rows()

1 How could you use arrange() to sort all missing values to the start? (Hint: use is.na())?

#is.na(dep_time) is True or False, NA=True, which is also 1, therefore sorting in descending order will put 1 at first. This is to first sort by desc(is.na(dep_time)), then sort the rest of the rows by dep_time in ascending order. 
arrange(flights, desc(is.na(dep_time)), dep_time)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1       NA           1630        NA       NA           1815
##  2  2013     1     1       NA           1935        NA       NA           2240
##  3  2013     1     1       NA           1500        NA       NA           1825
##  4  2013     1     1       NA            600        NA       NA            901
##  5  2013     1     2       NA           1540        NA       NA           1747
##  6  2013     1     2       NA           1620        NA       NA           1746
##  7  2013     1     2       NA           1355        NA       NA           1459
##  8  2013     1     2       NA           1420        NA       NA           1644
##  9  2013     1     2       NA           1321        NA       NA           1536
## 10  2013     1     2       NA           1545        NA       NA           1910
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

2.Sort flights to find the most delayed flights. Find the flights that left earliest.

#the most delayed flights have the most arr_delay. Therefore I sort the arr_delay in descending order.
arrange(flights, desc(arr_delay))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     7    22     2257            759       898      121           1026
##  9  2013    12     5      756           1700       896     1058           2020
## 10  2013     5     3     1133           2055       878     1250           2215
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#the flights that left earliest have the least dep_delay, therefore I sort the dep_delay in ascending order.
arrange(flights, dep_delay)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013    10    23     1907           1932       -25     2143           2143
##  8  2013     3    30     2030           2055       -25     2213           2250
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Sections 5.4: Select columns with select()

3 What does the any_of() function do? Why might it be helpful in conjunction with this vector?

#This is to select the following columns: year, month, day, dep_delay, arr_delay
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, any_of(vars))
## # A tibble: 336,776 × 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1         2        11
##  2  2013     1     1         4        20
##  3  2013     1     1         2        33
##  4  2013     1     1        -1       -18
##  5  2013     1     1        -6       -25
##  6  2013     1     1        -4        12
##  7  2013     1     1        -5        19
##  8  2013     1     1        -3       -14
##  9  2013     1     1        -3        -8
## 10  2013     1     1        -2         8
## # … with 336,766 more rows
  • any_of() is a selection helper, it means ‘match any of these characters exactly once.’ It selects variables within a character vector, and it doesn’t check for missing variables. It might be helpful in conjunction with this vector because it is easier to programmatically generate character vectors with variable names than to generate unquoted variable names.

Sections 5.5: Select columns with select()

2 Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

#My expectation is that air_time=arr_time-dep_time, first I will convert the hours and minutes to minutes for calculation purpose. 
flights_air_time_cal <- mutate(flights,
    dep_time_min = dep_time %/% 100 * 60 + dep_time %% 100,
    arr_time_min = arr_time %/% 100 * 60 + arr_time %% 100,
    air_time_compare = air_time - (arr_time - dep_time)
)

select(flights_air_time_cal, dep_time_min, arr_time_min, air_time, air_time_compare)
## # A tibble: 336,776 × 4
##    dep_time_min arr_time_min air_time air_time_compare
##           <dbl>        <dbl>    <dbl>            <dbl>
##  1          317          510      227              -86
##  2          333          530      227              -90
##  3          342          563      160             -221
##  4          344          604      183             -277
##  5          354          492      116             -142
##  6          354          460      150              -36
##  7          355          553      158             -200
##  8          357          429       53              -99
##  9          357          518      140             -141
## 10          358          473      138              -57
## # … with 336,766 more rows
  • I see that the air time doesn’t equal to arrival time - depature time, I think the reason of this difference is that the flight might not take off and land in the same time zone. To fix this, we might need more information to convert the original time data into the same time zone.

4 Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

#By using the min_rank() function, ties are assigned the minimum ranking possible.
flights_delayed <- (mutate(flights, dep_delay_min_rank = min_rank(desc(dep_delay))))

flights_delayed <- filter(flights_delayed, dep_delay_min_rank <= 10)

arrange(flights_delayed, dep_delay_min_rank)
## # A tibble: 10 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # … with 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, dep_delay_min_rank <int>

5 What does 1:3 + 1:10 return? Why?

#By using the min_rank() function, ties are assigned the minimum ranking possible.
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
##  [1]  2  4  6  5  7  9  8 10 12 11
  • I see a warning message saying that the longer object length is not a multiple of a shorter object length. This is because when we add two vectors of different length, R will recycle the shorter one to create a vector that has the same length as the longer one. When the length of the longer one is not a multiple of the shorter one, it is normally a bug.

Sections 5.6: Grouped summaries with summarize(), Combining multiple operations with the Pipe, Missing Values, Counts, Useful Summary Functions, Grouping by Multiple Variables, Ungrouping

5 Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n()))

flights %>%
  group_by(carrier) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(arr_delay))
## # A tibble: 16 × 2
##    carrier arr_delay
##    <chr>       <dbl>
##  1 F9         21.9  
##  2 FL         20.1  
##  3 EV         15.8  
##  4 YV         15.6  
##  5 OO         11.9  
##  6 MQ         10.8  
##  7 WN          9.65 
##  8 B6          9.46 
##  9 9E          7.38 
## 10 UA          3.56 
## 11 US          2.13 
## 12 VX          1.76 
## 13 DL          1.64 
## 14 AA          0.364
## 15 HA         -6.92 
## 16 AS         -9.93
  • to disentangle the effects of bad airports vs. bad carriers, I can compare the average delay of each carrier to the average delay of the other carriers’ flights that have the same origin and destination.

6 What does the sort argument to count() do. When might you use it?

flights %>%
  count(dest, sort = TRUE)
## # A tibble: 105 × 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705
## # … with 95 more rows
  • It sorts the results in order of count numbers. You can use it when you want to sort by count numbers.

Sections 5.7: Grouped mutates (and filters)

2 Which plane (tailnum) has the worst on-time record?

# This is to calculate the quartile of the nubmer of flights by tailnum. To avoid a sample size that is too small, we only consider the flights that flew at least 20 flights.
quantile(count(flights, tailnum)$n)
##   0%  25%  50%  75% 100% 
##    1   23   54  110 2512
# This is to show the flight that has the highest proportion of delayed or cancelled.
flights %>%
  filter(!is.na(tailnum), is.na(arr_time) | !is.na(arr_delay)) %>%
  mutate(on_time = !is.na(arr_time) & (arr_delay <= 0)) %>%
  group_by(tailnum) %>%
  summarise(on_time = mean(on_time), n = n()) %>%
  filter(n >= 20) %>%
  filter(min_rank(on_time) == 1)
## # A tibble: 1 × 3
##   tailnum on_time     n
##   <chr>     <dbl> <int>
## 1 N988AT    0.189    37
# This is to show the flight that has the largest mean time delayed.
flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(arr_delay = mean(arr_delay), n = n()) %>%
  filter(n >= 20) %>%
  filter(min_rank(desc(arr_delay)) == 1)
## # A tibble: 1 × 3
##   tailnum arr_delay     n
##   <chr>       <dbl> <int>
## 1 N203FR       59.1    41
  • Depend on which measure is used to define on-time record, the answer could be N9888AT or N203FR.

4 For each destination, compute the total minutes of delay. For each flight, compute the proportion of the total delay for its destination.

flights %>%
  filter(arr_delay > 0) %>%
  group_by(dest, origin, carrier, flight) %>%
  summarise(arr_delay = sum(arr_delay)) %>%
  group_by(dest) %>%
  mutate(
    arr_delay_prop = arr_delay / sum(arr_delay)
  ) %>%
  arrange(dest, desc(arr_delay_prop)) %>%
  select(carrier, flight, origin, dest, arr_delay_prop)
## `summarise()` has grouped output by 'dest', 'origin', 'carrier'. You can
## override using the `.groups` argument.
## # A tibble: 8,834 × 5
## # Groups:   dest [103]
##    carrier flight origin dest  arr_delay_prop
##    <chr>    <int> <chr>  <chr>          <dbl>
##  1 B6        1505 JFK    ABQ           0.567 
##  2 B6          65 JFK    ABQ           0.433 
##  3 B6        1191 JFK    ACK           0.475 
##  4 B6        1491 JFK    ACK           0.414 
##  5 B6        1291 JFK    ACK           0.0898
##  6 B6        1195 JFK    ACK           0.0208
##  7 EV        4309 EWR    ALB           0.174 
##  8 EV        4271 EWR    ALB           0.137 
##  9 EV        4117 EWR    ALB           0.0951
## 10 EV        4088 EWR    ALB           0.0865
## # … with 8,824 more rows

7 Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.

flights %>%
   # find all airports with > 1 carrier
   group_by(dest) %>%
   mutate(n_carriers = n_distinct(carrier)) %>%
   filter(n_carriers > 1) %>%
   # rank carriers by number of destinations
   group_by(carrier) %>%
   summarize(n_dest = n_distinct(dest)) %>%
   arrange(desc(n_dest))
## # A tibble: 16 × 2
##    carrier n_dest
##    <chr>    <int>
##  1 EV          51
##  2 9E          48
##  3 UA          42
##  4 DL          39
##  5 B6          35
##  6 AA          19
##  7 MQ          19
##  8 WN          10
##  9 OO           5
## 10 US           5
## 11 VX           4
## 12 YV           3
## 13 FL           2
## 14 AS           1
## 15 F9           1
## 16 HA           1