exercise

Exercise 5,6 & 7

library(tidyverse)

## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(nycflights13)

5.2.4 Exercises

Had an arrival delay of two or more hours

filter(flights,arr_delay >= 120)

## # A tibble: 10,200 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Flew to Houston (IAH or HOU)

filter(flights, dest %in% c("IAH", "HOU"))

## # A tibble: 9,313 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      623            627        -4      933            932
##  4  2013     1     1      728            732        -4     1041           1038
##  5  2013     1     1      739            739         0     1104           1038
##  6  2013     1     1      908            908         0     1228           1219
##  7  2013     1     1     1028           1026         2     1350           1339
##  8  2013     1     1     1044           1045        -1     1352           1351
##  9  2013     1     1     1114            900       134     1447           1222
## 10  2013     1     1     1205           1200         5     1503           1505
## # ... with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Were operated by United, American, or Delta

filter(flights, carrier %in% c ("UA","DL","AA"))

## # A tibble: 139,504 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      554            600        -6      812            837
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            600        -1      941            910
## 10  2013     1     1      559            600        -1      854            902
## # ... with 139,494 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Departed in summer (July, August, and September)

filter(flights, month %in% 7:9)

## # A tibble: 86,326 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7     1        1           2029       212      236           2359
##  2  2013     7     1        2           2359         3      344            344
##  3  2013     7     1       29           2245       104      151              1
##  4  2013     7     1       43           2130       193      322             14
##  5  2013     7     1       44           2150       174      300            100
##  6  2013     7     1       46           2051       235      304           2358
##  7  2013     7     1       48           2001       287      308           2305
##  8  2013     7     1       58           2155       183      335             43
##  9  2013     7     1      100           2146       194      327             30
## 10  2013     7     1      100           2245       135      337            135
## # ... with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Arrived more than two hours late, but didn’t leave late

filter(flights, arr_delay>120,dep_delay<=0)

## # A tibble: 29 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    27     1419           1420        -1     1754           1550
##  2  2013    10     7     1350           1350         0     1736           1526
##  3  2013    10     7     1357           1359        -2     1858           1654
##  4  2013    10    16      657            700        -3     1258           1056
##  5  2013    11     1      658            700        -2     1329           1015
##  6  2013     3    18     1844           1847        -3       39           2219
##  7  2013     4    17     1635           1640        -5     2049           1845
##  8  2013     4    18      558            600        -2     1149            850
##  9  2013     4    18      655            700        -5     1213            950
## 10  2013     5    22     1827           1830        -3     2217           2010
## # ... with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Were delayed by at least an hour, but made up over 30 minutes in flight

filter(flights,dep_delay<=60,(dep_delay-arr_delay>30))

## # A tibble: 16,131 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      701            700         1     1123           1154
##  2  2013     1     1      820            820         0     1249           1329
##  3  2013     1     1      840            845        -5     1311           1350
##  4  2013     1     1      857            851         6     1157           1222
##  5  2013     1     1      909            810        59     1331           1315
##  6  2013     1     1     1025            951        34     1258           1302
##  7  2013     1     1     1153           1200        -7     1450           1529
##  8  2013     1     1     1245           1249        -4     1722           1800
##  9  2013     1     1     1625           1550        35     2054           2050
## 10  2013     1     1     1627           1630        -3     1940           2020
## # ... with 16,121 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Departed between midnight and 6am (inclusive)

filter(flights, dep_time >= 2400 | dep_time <= 600)

## # A tibble: 9,373 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?

filter(flights, between(dep_time, 601, 800))

## # A tibble: 44,094 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      601            600         1      844            850
##  2  2013     1     1      602            610        -8      812            820
##  3  2013     1     1      602            605        -3      821            805
##  4  2013     1     1      606            610        -4      858            910
##  5  2013     1     1      606            610        -4      837            845
##  6  2013     1     1      607            607         0      858            915
##  7  2013     1     1      608            600         8      807            735
##  8  2013     1     1      611            600        11      945            931
##  9  2013     1     1      613            610         3      925            921
## 10  2013     1     1      615            615         0     1039           1100
## # ... with 44,084 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

filter(flights,between(month,3,4))

## # A tibble: 57,164 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     3     1        4           2159       125      318             56
##  2  2013     3     1       50           2358        52      526            438
##  3  2013     3     1      117           2245       152      223           2354
##  4  2013     3     1      454            500        -6      633            648
##  5  2013     3     1      505            515       -10      746            810
##  6  2013     3     1      521            530        -9      813            827
##  7  2013     3     1      537            540        -3      856            850
##  8  2013     3     1      541            545        -4     1014           1023
##  9  2013     3     1      549            600       -11      639            703
## 10  2013     3     1      550            600       -10      747            801
## # ... with 57,154 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

This is a shortcut for x >= left & x <= right

How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

sum(is.na(flights$dep_time))

## [1] 8255

map_dbl(flights, ~ sum(is.na(.x)))

##           year          month            day       dep_time sched_dep_time 
##              0              0              0           8255              0 
##      dep_delay       arr_time sched_arr_time      arr_delay        carrier 
##           8255           8713              0           9430              0 
##         flight        tailnum         origin           dest       air_time 
##              0           2512              0              0           9430 
##       distance           hour         minute      time_hour 
##              0              0              0              0

8255 flights have a missing dep_time

dep_delay, arr_time, air_time, tailnum, air_time & arr_delay

Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)

5.3.1 Exercises

How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).

df <- tibble(x = c(5, 2, NA),
             y = c(2, NA, 2))
rowSums(df)

## [1]  7 NA NA

arrange(df, desc(is.na(x)))

## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1    NA     2
## 2     5     2
## 3     2    NA

arrange(df, -(is.na(x)))

## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1    NA     2
## 2     5     2
## 3     2    NA

We’re basically saying, those which are TRUE to being NA, sort them in descending order.

Sort flights to find the most delayed flights. Find the flights that left earliest.

arrange(flights, dep_delay)

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013    10    23     1907           1932       -25     2143           2143
##  8  2013     3    30     2030           2055       -25     2213           2250
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

arrange(flights, desc(dep_delay) )

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Sort flights to find the fastest flights.

arrange(flights, air_time )

## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1    16     1355           1315        40     1442           1411
##  2  2013     4    13      537            527        10      622            628
##  3  2013    12     6      922            851        31     1021            954
##  4  2013     2     3     2153           2129        24     2247           2224
##  5  2013     2     5     1303           1315       -12     1342           1411
##  6  2013     2    12     2123           2130        -7     2211           2225
##  7  2013     3     2     1450           1500       -10     1547           1608
##  8  2013     3     8     2026           1935        51     2131           2056
##  9  2013     3    18     1456           1329        87     1533           1426
## 10  2013     3    19     2226           2145        41     2305           2246
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Which flights travelled the longest? Which travelled the shortest?

flights %>%
  arrange(air_time) %>%
  select(carrier, flight, air_time)

## # A tibble: 336,776 x 3
##    carrier flight air_time
##    <chr>    <int>    <dbl>
##  1 EV        4368       20
##  2 EV        4631       20
##  3 EV        4276       21
##  4 EV        4619       21
##  5 EV        4368       21
##  6 EV        4619       21
##  7 US        2132       21
##  8 9E        3650       21
##  9 EV        4118       21
## 10 EV        4276       21
## # ... with 336,766 more rows

flights %>%
  arrange(-air_time) %>%
  select(carrier, flight, air_time)

## # A tibble: 336,776 x 3
##    carrier flight air_time
##    <chr>    <int>    <dbl>
##  1 UA          15      695
##  2 HA          51      691
##  3 HA          51      686
##  4 HA          51      686
##  5 HA          51      683
##  6 HA          51      679
##  7 UA          15      676
##  8 HA          51      676
##  9 HA          51      675
## 10 UA          15      671
## # ... with 336,766 more rows

5.4.1 Exercises

Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.

vars <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
select(flights, dep_time, dep_delay, arr_time, arr_delay)

## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows

select(flights, ends_with("time"), ends_with("delay"))

## # A tibble: 336,776 x 7
##    dep_time sched_dep_time arr_time sched_arr_time air_time dep_delay arr_delay
##       <int>          <int>    <int>          <int>    <dbl>     <dbl>     <dbl>
##  1      517            515      830            819      227         2        11
##  2      533            529      850            830      227         4        20
##  3      542            540      923            850      160         2        33
##  4      544            545     1004           1022      183        -1       -18
##  5      554            600      812            837      116        -6       -25
##  6      554            558      740            728      150        -4        12
##  7      555            600      913            854      158        -5        19
##  8      557            600      709            723       53        -3       -14
##  9      557            600      838            846      140        -3        -8
## 10      558            600      753            745      138        -2         8
## # ... with 336,766 more rows

select(flights, one_of(vars))

## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows

select(flights, .dots = vars)

## # A tibble: 336,776 x 4
##    .dots1 .dots2 .dots3 .dots4
##     <int>  <dbl>  <int>  <dbl>
##  1    517      2    830     11
##  2    533      4    850     20
##  3    542      2    923     33
##  4    544     -1   1004    -18
##  5    554     -6    812    -25
##  6    554     -4    740     12
##  7    555     -5    913     19
##  8    557     -3    709    -14
##  9    557     -3    838     -8
## 10    558     -2    753      8
## # ... with 336,766 more rows

select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")

## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows

select(flights, matches("dep"), matches("arr"), -matches("sched"), -carrier)

## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows

select(flights, contains("dep"), contains("arr"), -contains("sched"), -carrier)

## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows

select(flights, matches("^dep|^arr"))

## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows

select(flights, matches("time$|delay$"), -contains("sched"), -contains("air"))

## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows

select(flights, matches("^dep|arr_delay|time$"))

## # A tibble: 336,776 x 7
##    dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay air_time
##       <int>          <int>     <dbl>    <int>          <int>     <dbl>    <dbl>
##  1      517            515         2      830            819        11      227
##  2      533            529         4      850            830        20      227
##  3      542            540         2      923            850        33      160
##  4      544            545        -1     1004           1022       -18      183
##  5      554            600        -6      812            837       -25      116
##  6      554            558        -4      740            728        12      150
##  7      555            600        -5      913            854        19      158
##  8      557            600        -3      709            723       -14       53
##  9      557            600        -3      838            846        -8      140
## 10      558            600        -2      753            745         8      138
## # ... with 336,766 more rows

What happens if you include the name of a variable multiple times in a select() call?

select(flights, dep_time, dep_time)

## # A tibble: 336,776 x 1
##    dep_time
##       <int>
##  1      517
##  2      533
##  3      542
##  4      544
##  5      554
##  6      554
##  7      555
##  8      557
##  9      557
## 10      558
## # ... with 336,766 more rows

nothing, it is returned at once

What does the one_of() function do? Why might it be helpful in conjunction with this vector? vars <- c(“year”, “month”, “day”, “dep_delay”, “arr_delay”)

vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))

## # A tibble: 336,776 x 5
##     year month   day dep_delay arr_delay
##    <int> <int> <int>     <dbl>     <dbl>
##  1  2013     1     1         2        11
##  2  2013     1     1         4        20
##  3  2013     1     1         2        33
##  4  2013     1     1        -1       -18
##  5  2013     1     1        -6       -25
##  6  2013     1     1        -4        12
##  7  2013     1     1        -5        19
##  8  2013     1     1        -3       -14
##  9  2013     1     1        -3        -8
## 10  2013     1     1        -2         8
## # ... with 336,766 more rows

Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default? select(flights, contains(“TIME”))

select(flights, contains("TIME"))

## # A tibble: 336,776 x 6
##    dep_time sched_dep_time arr_time sched_arr_time air_time time_hour          
##       <int>          <int>    <int>          <int>    <dbl> <dttm>             
##  1      517            515      830            819      227 2013-01-01 05:00:00
##  2      533            529      850            830      227 2013-01-01 05:00:00
##  3      542            540      923            850      160 2013-01-01 05:00:00
##  4      544            545     1004           1022      183 2013-01-01 05:00:00
##  5      554            600      812            837      116 2013-01-01 06:00:00
##  6      554            558      740            728      150 2013-01-01 05:00:00
##  7      555            600      913            854      158 2013-01-01 06:00:00
##  8      557            600      709            723       53 2013-01-01 06:00:00
##  9      557            600      838            846      140 2013-01-01 06:00:00
## 10      558            600      753            745      138 2013-01-01 06:00:00
## # ... with 336,766 more rows

5.5.2 Exercises

Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.

transmute(flights,deptime = dep_time/60, schedeptime=sched_dep_time/60)

## # A tibble: 336,776 x 2
##    deptime schedeptime
##      <dbl>       <dbl>
##  1    8.62        8.58
##  2    8.88        8.82
##  3    9.03        9   
##  4    9.07        9.08
##  5    9.23       10   
##  6    9.23        9.3 
##  7    9.25       10   
##  8    9.28       10   
##  9    9.28       10   
## 10    9.3        10   
## # ... with 336,766 more rows

Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?

flights %>% 
  mutate(dep_time = (dep_time %/% 100) * 60 + (dep_time %% 100),
         sched_dep_time = (sched_dep_time %/% 100) * 60 + (sched_dep_time %% 100),
         arr_time = (arr_time %/% 100) * 60 + (arr_time %% 100),
         sched_arr_time = (sched_arr_time %/% 100) * 60 + (sched_arr_time %% 100)) %>%
  transmute((arr_time - dep_time) %% (60*24) - air_time)

## # A tibble: 336,776 x 1
##    `(arr_time - dep_time)%%(60 * 24) - air_time`
##                                            <dbl>
##  1                                           -34
##  2                                           -30
##  3                                            61
##  4                                            77
##  5                                            22
##  6                                           -44
##  7                                            40
##  8                                            19
##  9                                            21
## 10                                           -23
## # ... with 336,766 more rows

Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?

hours2mins <- function(x) {
  x %/% 100 * 60 + x %% 100
}
select(flights, contains("dep")) %>%
  mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time))

## # A tibble: 336,776 x 4
##    dep_time sched_dep_time dep_delay dep_time_two
##       <int>          <int>     <dbl>        <dbl>
##  1      517            515         2            2
##  2      533            529         4            4
##  3      542            540         2            2
##  4      544            545        -1           -1
##  5      554            600        -6           -6
##  6      554            558        -4           -4
##  7      555            600        -5           -5
##  8      557            600        -3           -3
##  9      557            600        -3           -3
## 10      558            600        -2           -2
## # ... with 336,766 more rows

# these two numbers don’t match because we aren’t accounting for flights
# where the departure time is the next day from the scheduled departure time.
select(flights, contains("dep")) %>%
  mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time)) %>%
  filter(dep_delay != dep_time_two) %>%
  mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time - 2400))

## # A tibble: 1,207 x 4
##    dep_time sched_dep_time dep_delay dep_time_two
##       <int>          <int>     <dbl>        <dbl>
##  1      848           1835       853          853
##  2       42           2359        43           43
##  3      126           2250       156          156
##  4       32           2359        33           33
##  5       50           2145       185          185
##  6      235           2359       156          156
##  7       25           2359        26           26
##  8      106           2245       141          141
##  9       14           2359        15           15
## 10       37           2230       127          127
## # ... with 1,197 more rows

Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().*

flights %>%
  filter(min_rank(-(dep_delay)) %in% 1:10)

## # A tibble: 10 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     1    10     1121           1635      1126     1239           1810
##  3  2013    12     5      756           1700       896     1058           2020
##  4  2013     3    17     2321            810       911      135           1020
##  5  2013     4    10     1100           1900       960     1342           2211
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     6    27      959           1900       899     1236           2226
##  8  2013     7    22      845           1600      1005     1044           1815
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013     9    20     1139           1845      1014     1457           2210
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

flights %>%
  top_n(10, dep_delay)

## # A tibble: 10 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     1    10     1121           1635      1126     1239           1810
##  3  2013    12     5      756           1700       896     1058           2020
##  4  2013     3    17     2321            810       911      135           1020
##  5  2013     4    10     1100           1900       960     1342           2211
##  6  2013     6    15     1432           1935      1137     1607           2120
##  7  2013     6    27      959           1900       899     1236           2226
##  8  2013     7    22      845           1600      1005     1044           1815
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013     9    20     1139           1845      1014     1457           2210
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

What does 1:3 + 1:10 return? Why?

What trigonometric functions does R provide?

practice

summarise(flights, delay=mean(dep_delay,na.rm = TRUE))

## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1  12.6

by_date<- group_by(flights,year,month,day)
summarise(by_date,delay=mean(dep_delay, na.rm=TRUE))

## # A tibble: 365 x 4
## # Groups:   year, month [12]
##     year month   day delay
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ... with 355 more rows

by_dest<- group_by(flights, dest)
delay<- summarise(by_dest,count=n(),dist=mean(distance,na.rm=TRUE),delay=mean(arr_delay,na.rm = TRUE))

delay<- filter(delay,count>20,dest!="HNL")

ggplot(data = delay,mapping = aes(x=dist,y=delay))+
  geom_point(aes(size=count),alpha=1/3)+
  geom_smooth(se=FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#the pipe, %>%:
delay <- flights %>%
  group_by(dest)%>%
  summarise(
    count=n(),
    dist = mean(distance,na.rm=TRUE),
    delay= mean(arr_delay, na.rm = TRUE)
  )%>%
    filter(count > 20, dest != "HNL")

#missing values
not_cancelled <- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay)
  )

ggplot(data = delays, mapping = aes(x = delay)) + 
  geom_freqpoly(binwidth = 10)

not_cancelled<- flights %>%
  filter(!is.na(arr_delay))

delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )


ggplot(data = delays, mapping = aes(x = n, y = delay)) + 
  geom_point(alpha = 1/10)

5.6.7 Exercises

Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios:

delay_char <-
  flights %>%
  group_by(flight) %>%
  summarise(n = n(),
            fifteen_early = mean(arr_delay == -15, na.rm = TRUE),
            fifteen_late = mean(arr_delay == 15, na.rm = TRUE),
            ten_always = mean(arr_delay == 10, na.rm = TRUE),
            thirty_early = mean(arr_delay == -30, na.rm = TRUE),
            thirty_late = mean(arr_delay == 30, na.rm = TRUE),
            percentage_on_time = mean(arr_delay == 0, na.rm = TRUE),
            twohours = mean(arr_delay > 120, na.rm = TRUE)) %>%
  map_if(is_double, round, 2) %>%
  as_tibble()

A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.

delay_char %>%
  filter(fifteen_early == 0.5, fifteen_late == 0.5)

## # A tibble: 0 x 9
## # ... with 9 variables: flight <int>, n <int>, fifteen_early <dbl>,
## #   fifteen_late <dbl>, ten_always <dbl>, thirty_early <dbl>,
## #   thirty_late <dbl>, percentage_on_time <dbl>, twohours <dbl>

A flight is always 10 minutes late.

delay_char %>%
  filter(ten_always == 1)

## # A tibble: 5 x 9
##   flight     n fifteen_early fifteen_late ten_always thirty_early thirty_late
##    <int> <int>         <dbl>        <dbl>      <dbl>        <dbl>       <dbl>
## 1   2254     1             0            0          1            0           0
## 2   3656     1             0            0          1            0           0
## 3   3785     2             0            0          1            0           0
## 4   3880     1             0            0          1            0           0
## 5   5854     1             0            0          1            0           0
## # ... with 2 more variables: percentage_on_time <dbl>, twohours <dbl>

A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.

delay_char %>%
  filter(thirty_early == 0.5 & thirty_late == 0.5)

## # A tibble: 0 x 9
## # ... with 9 variables: flight <int>, n <int>, fifteen_early <dbl>,
## #   fifteen_late <dbl>, ten_always <dbl>, thirty_early <dbl>,
## #   thirty_late <dbl>, percentage_on_time <dbl>, twohours <dbl>

99% of the time a flight is on time. 1% of the time it’s 2 hours late.

delay_char %>%
  filter(percentage_on_time == 0.99 & twohours == 0.01)

## # A tibble: 0 x 9
## # ... with 9 variables: flight <int>, n <int>, fifteen_early <dbl>,
## #   fifteen_late <dbl>, ten_always <dbl>, thirty_early <dbl>,
## #   thirty_late <dbl>, percentage_on_time <dbl>, twohours <dbl>

Which is more important: arrival delay or departure delay? depends

Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()).

not_cancelled <-
  flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
  count(dest)

## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows

# and
not_cancelled %>%
  count(tailnum, wt = distance)

## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # ... with 4,027 more rows

# (without using count()).
#######################
not_cancelled %>%
  group_by(dest) %>%
  summarise(n = n())

## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows

# and
not_cancelled %>%
  group_by(tailnum) %>%
  tally(wt = distance)

## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # ... with 4,027 more rows

# or
not_cancelled %>%
  group_by(tailnum) %>%
  summarize(n = sum(distance))

## # A tibble: 4,037 x 2
##    tailnum      n
##    <chr>    <dbl>
##  1 D942DN    3418
##  2 N0EGMQ  239143
##  3 N10156  109664
##  4 N102UW   25722
##  5 N103US   24619
##  6 N104UW   24616
##  7 N10575  139903
##  8 N105UW   23618
##  9 N107US   21677
## 10 N108UW   32070
## # ... with 4,027 more rows

Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay) ) is slightly suboptimal. Why? Which is the most important column?

Because if a flight didn’t leave then it was cancelled. If the condition is.na(dep_delay) is met, then the flight was cancelled.

Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?

Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?

flights %>%
  group_by(day) %>%
  summarise(cancelled = mean(is.na(dep_delay)),
            mean_dep = mean(dep_delay, na.rm = T),
            mean_arr = mean(arr_delay, na.rm = T)) %>%
  ggplot(aes(y = cancelled)) +
  geom_point(aes(x = mean_dep), colour = "red") +
  geom_point(aes(x = mean_arr), colour = "blue") +
  labs(x = "Avg delay per day", y = "Cancelled flights p day")

Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n()))

flights %>%
  summarise(n_car = n_distinct(carrier),
            n_air = n_distinct(dest),
            n_or = n_distinct(origin))

## # A tibble: 1 x 3
##   n_car n_air  n_or
##   <int> <int> <int>
## 1    16   105     3

flights %>%
  group_by(carrier) %>%
  mutate(avg_carrier = mean(dep_delay, na.rm = T)) %>%
  group_by(carrier, origin) %>%
  mutate(origin_mean = mean(dep_delay, na.rm = T),
         deviations = origin_mean - avg_carrier) %>%
  summarise(deviations = mean(deviations), mean = mean(avg_carrier)) %>%
  ggplot(aes(origin, deviations)) + geom_col() + facet_wrap(~ carrier)

Tearing out the effect is not straight forward but we can make some informed guesses. For example, whenever there are substantial deviations, they seem to be higher in EWR airport rather than in other airports. On the other hand, there are some airlines that look particular bad like 9E and MQ. And the same pattern is not found on the vast majority of other airlines, which would suggest it’s an airport issues rather than an airline issue.

flights %>%
  group_by(carrier, dest) %>%
  summarise(mean_departure = mean(dep_delay, na.rm = T),
            mean_arrival = mean(arr_delay, na.rm = T))

## # A tibble: 314 x 4
## # Groups:   carrier [16]
##    carrier dest  mean_departure mean_arrival
##    <chr>   <chr>          <dbl>        <dbl>
##  1 9E      ATL            0.965        0.857
##  2 9E      AUS           19           -3.5  
##  3 9E      AVL           -2.6        -12.1  
##  4 9E      BGR           34          NaN    
##  5 9E      BNA           19.1          9.29 
##  6 9E      BOS           14.8          5.66 
##  7 9E      BTV           -4.5         -2.5  
##  8 9E      BUF           15.5          6.71 
##  9 9E      BWI           17.5          8.73 
## 10 9E      CAE           -3.67         6    
## # ... with 304 more rows

For each plane, count the number of flights before the first delay of greater than 1 hour.

flights %>%
    mutate(dep_date = time_hour) %>%
    group_by(tailnum) %>%
    arrange(dep_date) %>%
    mutate(cumulative = !cumany(arr_delay > 60)) %>%
    filter(cumulative == T) %>%
    tally(sort = TRUE)

## # A tibble: 3,744 x 2
##    tailnum     n
##    <chr>   <int>
##  1 N705TW     97
##  2 N765US     97
##  3 N12125     94
##  4 N320AA     94
##  5 N13110     91
##  6 N3763D     82
##  7 N58101     82
##  8 N17122     81
##  9 N961UW     80
## 10 N950UW     79
## # ... with 3,734 more rows

flights %>%
  group_by(tailnum) %>%
  arrange(time_hour) %>%
  mutate(cum = arr_delay > 60,
         cum_any = cumsum(cum)) %>%
  filter(cum_any < 1) %>%
  tally(sort = TRUE)

## # A tibble: 3,744 x 2
##    tailnum     n
##    <chr>   <int>
##  1 N705TW     97
##  2 N765US     97
##  3 N12125     94
##  4 N320AA     94
##  5 N13110     91
##  6 N3763D     82
##  7 N58101     82
##  8 N17122     81
##  9 N961UW     80
## 10 N950UW     79
## # ... with 3,734 more rows

What does the sort argument to count() do. When might you use it?

flights %>%
  count(flight, sort = T)

## # A tibble: 3,844 x 2
##    flight     n
##     <int> <int>
##  1     15   968
##  2     27   898
##  3    181   882
##  4    301   871
##  5    161   786
##  6    695   782
##  7   1109   716
##  8    745   711
##  9    359   709
## 10      1   701
## # ... with 3,834 more rows

5.7.1 Exercises

Refer back to the table of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping.

Which one?

Which plane (tailnum) has the worst on-time record?

flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(prop_time = sum(arr_delay <= 30)/n(),
            mean_arr = mean(arr_delay, na.rm = TRUE),
            fl = n()) %>%
  arrange(desc(prop_time))

## # A tibble: 4,037 x 4
##    tailnum prop_time mean_arr    fl
##    <chr>       <dbl>    <dbl> <int>
##  1 N103US          1    -6.93    46
##  2 N1200K          1    -9.38    21
##  3 N121DE          1    15        2
##  4 N137DL          1    -5        1
##  5 N143DA          1    24        1
##  6 N14628          1    -6        1
##  7 N14629          1   -16.2      4
##  8 N1607B          1   -16        3
##  9 N1608           1   -11.3      3
## 10 N1610D          1   -14.5      2
## # ... with 4,027 more rows

All these flights are always late.

What time of day should you fly if you want to avoid delays as much as possible?

flights %>%
  group_by(hour) %>%
  filter(!is.na(dep_delay)) %>%
  summarise( delay = mean( dep_delay > 0 , na.rm = T)) %>%
  ggplot(aes(hour, delay, fill = delay)) + geom_col()

# or
flights %>%
  group_by(hour) %>%
  summarize(m = mean(dep_delay, na.rm = TRUE),
            sd = sd(dep_delay, na.rm = TRUE),
            low_ci = m - 2*sd,
            high_ci = m + 2*sd,
            n = n()) %>%
  ggplot(aes(hour, m, ymin = low_ci, ymax = high_ci)) +
  geom_pointrange()

## Warning: Removed 1 rows containing missing values (geom_pointrange).

Worst time to flight is in the early evening. Although that happens because more flights go out on that specific time also.

For each destination, compute the total minutes of delay. For each, flight, compute the proportion of the total delay for its destination.

flights %>%
  group_by(dest) %>%
  filter(!is.na(dep_delay)) %>%
  summarise(tot_mins = sum(dep_delay[dep_delay > 0]))

## # A tibble: 104 x 2
##    dest  tot_mins
##    <chr>    <dbl>
##  1 ABQ       4076
##  2 ACK       2603
##  3 ALB      10934
##  4 ANC        105
##  5 ATL     254414
##  6 AUS      36623
##  7 AVL       3092
##  8 BDL       8471
##  9 BGR       8170
## 10 BHM       8817
## # ... with 94 more rows

flights %>%
  filter(!is.na(dep_delay)) %>%
  group_by(tailnum, dest) %>%
  summarise(m = mean(dep_delay > 0), n = n()) %>%
  arrange(desc(m))

## # A tibble: 44,218 x 4
## # Groups:   tailnum [4,037]
##    tailnum dest      m     n
##    <chr>   <chr> <dbl> <int>
##  1 D942DN  MCO       1     2
##  2 N10156  BDL       1     1
##  3 N10156  CLE       1     1
##  4 N10156  DCA       1     2
##  5 N10156  GSO       1     1
##  6 N10156  GSP       1     1
##  7 N10156  IAD       1     1
##  8 N10156  IND       1     2
##  9 N10156  MHT       1     1
## 10 N10156  MSN       1     1
## # ... with 44,208 more rows

Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag() explore how the delay of a flight is related to the delay of the immediately preceding flight.

flights %>%
  mutate(new_sched_dep_time = lubridate::make_datetime(year, month, day, hour, minute)) %>%
  arrange(new_sched_dep_time) %>%
  mutate(prev_time = lag(dep_delay)) %>%
  # filter(between(dep_delay, 0, 300), between(prev_time, 0, 300)) %>% # play with this one
  select(origin, new_sched_dep_time, dep_delay, prev_time) %>%
  ggplot(aes(dep_delay, prev_time)) + geom_point(alpha = 1/10) +
  geom_smooth()

## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

## Warning: Removed 14167 rows containing non-finite values (stat_smooth).

## Warning: Removed 14167 rows containing missing values (geom_point).

# or
flights %>%
  select(year, month, day, hour, dest, dep_delay) %>%
  group_by(dest) %>%
  mutate(lag_delay = lag(dep_delay)) %>%
  arrange(dest) %>%
  filter(!is.na(lag_delay)) %>%
  summarize(cor = cor(dep_delay, lag_delay, use = "complete.obs"),
            n = n()) %>%
  arrange(desc(cor)) %>%
  filter(row_number(desc(cor)) %in% 1:10)

## # A tibble: 10 x 3
##    dest    cor     n
##    <chr> <dbl> <int>
##  1 SBN   0.687     9
##  2 ORD   0.403 16641
##  3 HDN   0.365    13
##  4 ATL   0.351 16897
##  5 SFO   0.344 13229
##  6 BZN   0.325    34
##  7 BOS   0.323 15048
##  8 FLL   0.312 11933
##  9 BNA   0.302  6104
## 10 MDW   0.296  4043

Although there is a lot of noise, you can see a sort of straight line going on there. There is also a correlation between the lagged values in many of the destionatinons. So correlation between flights is mostly in specific airports.

Look at each destination. Can you find flights that are suspiciously fast? (i.e. flights that represent a potential data entry error). Compute the air time a flight relative to the shortest flight to that destination. Which flights were most delayed in the air?

# (1)
flights %>%
  group_by(dest) %>%
  arrange(air_time) %>%
  slice(1:5) %>%
  select(tailnum, sched_dep_time, sched_arr_time, air_time) %>%
  arrange(air_time)

## Adding missing grouping variables: `dest`

## # A tibble: 517 x 5
## # Groups:   dest [105]
##    dest  tailnum sched_dep_time sched_arr_time air_time
##    <chr> <chr>            <int>          <int>    <dbl>
##  1 BDL   N16911            1315           1411       20
##  2 BDL   N12167             527            628       20
##  3 BDL   N27200             851            954       21
##  4 BDL   N13955            1315           1411       21
##  5 BDL   N12160            1329           1426       21
##  6 BOS   N947UW            1500           1608       21
##  7 PHL   N13913            2129           2224       21
##  8 PHL   N12921            2130           2225       21
##  9 PHL   N8501F            1935           2056       21
## 10 PHL   N22909            2129           2224       22
## # ... with 507 more rows

# (2)
flights %>%
  group_by(dest) %>%
  mutate(shortest = air_time - min(air_time, na.rm = T)) %>%
  top_n(1, air_time) %>%
  arrange(-air_time) %>%
  select(tailnum, sched_dep_time, sched_arr_time, shortest)

## Warning in min(air_time, na.rm = T): no non-missing arguments to min; returning
## Inf

## Adding missing grouping variables: `dest`

## # A tibble: 112 x 5
## # Groups:   dest [104]
##    dest  tailnum sched_dep_time sched_arr_time shortest
##    <chr> <chr>            <int>          <int>    <dbl>
##  1 HNL   N77066            1335           1836      133
##  2 SFO   N703TW            1730           2110      195
##  3 LAX   N178DN            1815           2146      165
##  4 ANC   N572UA            1615           1953       46
##  5 SAN   N794JB            1620           1934      134
##  6 SNA   N16709            1819           2137      131
##  7 BUR   N624JB            1730           2046      110
##  8 LAS   N852UA            1729           2013      143
##  9 SJC   N632JB            1830           2205       91
## 10 SEA   N17245            1727           2040      119
## # ... with 102 more rows

Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.

flights %>%
  group_by(dest) %>%
  filter(n_distinct(carrier) > 2) %>%
  group_by(carrier) %>%
  summarise(n = n_distinct(dest)) %>%
  arrange(-n)

## # A tibble: 15 x 2
##    carrier     n
##    <chr>   <int>
##  1 DL         37
##  2 EV         36
##  3 UA         36
##  4 9E         35
##  5 B6         30
##  6 AA         17
##  7 MQ         17
##  8 WN          9
##  9 OO          5
## 10 US          5
## 11 VX          3
## 12 YV          3
## 13 FL          2
## 14 AS          1
## 15 F9          1

#Exercise 7

library(tidyverse)

7.3.4 Exercises Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.

head(diamonds)

## # A tibble: 6 x 10
##   carat cut       color clarity depth table price     x     y     z
##   <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
## 2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
## 3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
## 4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
## 5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
## 6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48

ggplot(data=diamonds, mapping = aes(x = x),binwidth=0.1)+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=diamonds, mapping = aes(x = y),binwidth=0.1)+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=diamonds, mapping = aes(x = z),binwidth=0.1)+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)

ggplot(data=diamonds,mapping = aes(x=price), binwidth=0.01)+
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?

diamonds%>%
  filter(carat == 0.99)%>%
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1    23

diamonds%>%
  filter(carat==1)%>%
  count()

## # A tibble: 1 x 1
##       n
##   <int>
## 1  1558

ggplot(data=diamonds,mapping = aes(x=carat))+
  geom_histogram(binwidth = 0.01)+
  xlim(c(0.97,1.03))

## Warning: Removed 48621 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?

ggplot(data=diamonds,mapping = aes(x=carat))+
  geom_histogram()+
  xlim(c(0.97,1.035))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 48621 rows containing non-finite values (stat_bin).

## Warning: Removed 2 rows containing missing values (geom_bar).

ggplot(data=diamonds,mapping = aes(x=carat))+
  geom_histogram()+
 coord_cartesian(xlim=c(0.97,1.035))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data=diamonds,mapping = aes(x=carat))+
  geom_histogram(binwidth = 0.01)+
 coord_cartesian(xlim=c(0.97,1.035))

7.4.1 Exercises

What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference?

geom_histogram() removed rows with NA values; Apparently geom_bar() doesn’t remove NA, but rather treat it as another factor or category.

What does na.rm = TRUE do in mean() and sum()?

summarise(diamonds, delay = mean(carat, na.rm = TRUE))

## # A tibble: 1 x 1
##   delay
##   <dbl>
## 1 0.798

To ignore NA’s when calculating mean and sum.

exercise_5

linda

12/12/2019

5.7.1 Exercises