Exercise 5,6 & 7
library(tidyverse)
## -- Attaching packages ------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ---------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
5.2.4 Exercises
Had an arrival delay of two or more hours
filter(flights,arr_delay >= 120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # ... with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Flew to Houston (IAH or HOU)
filter(flights, dest %in% c("IAH", "HOU"))
## # A tibble: 9,313 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # ... with 9,303 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Were operated by United, American, or Delta
filter(flights, carrier %in% c ("UA","DL","AA"))
## # A tibble: 139,504 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 554 600 -6 812 837
## 5 2013 1 1 554 558 -4 740 728
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 600 -1 941 910
## 10 2013 1 1 559 600 -1 854 902
## # ... with 139,494 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Departed in summer (July, August, and September)
filter(flights, month %in% 7:9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 1 1 2029 212 236 2359
## 2 2013 7 1 2 2359 3 344 344
## 3 2013 7 1 29 2245 104 151 1
## 4 2013 7 1 43 2130 193 322 14
## 5 2013 7 1 44 2150 174 300 100
## 6 2013 7 1 46 2051 235 304 2358
## 7 2013 7 1 48 2001 287 308 2305
## 8 2013 7 1 58 2155 183 335 43
## 9 2013 7 1 100 2146 194 327 30
## 10 2013 7 1 100 2245 135 337 135
## # ... with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Arrived more than two hours late, but didn’t leave late
filter(flights, arr_delay>120,dep_delay<=0)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 27 1419 1420 -1 1754 1550
## 2 2013 10 7 1350 1350 0 1736 1526
## 3 2013 10 7 1357 1359 -2 1858 1654
## 4 2013 10 16 657 700 -3 1258 1056
## 5 2013 11 1 658 700 -2 1329 1015
## 6 2013 3 18 1844 1847 -3 39 2219
## 7 2013 4 17 1635 1640 -5 2049 1845
## 8 2013 4 18 558 600 -2 1149 850
## 9 2013 4 18 655 700 -5 1213 950
## 10 2013 5 22 1827 1830 -3 2217 2010
## # ... with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Were delayed by at least an hour, but made up over 30 minutes in flight
filter(flights,dep_delay<=60,(dep_delay-arr_delay>30))
## # A tibble: 16,131 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 701 700 1 1123 1154
## 2 2013 1 1 820 820 0 1249 1329
## 3 2013 1 1 840 845 -5 1311 1350
## 4 2013 1 1 857 851 6 1157 1222
## 5 2013 1 1 909 810 59 1331 1315
## 6 2013 1 1 1025 951 34 1258 1302
## 7 2013 1 1 1153 1200 -7 1450 1529
## 8 2013 1 1 1245 1249 -4 1722 1800
## 9 2013 1 1 1625 1550 35 2054 2050
## 10 2013 1 1 1627 1630 -3 1940 2020
## # ... with 16,121 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Departed between midnight and 6am (inclusive)
filter(flights, dep_time >= 2400 | dep_time <= 600)
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?
filter(flights, between(dep_time, 601, 800))
## # A tibble: 44,094 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 601 600 1 844 850
## 2 2013 1 1 602 610 -8 812 820
## 3 2013 1 1 602 605 -3 821 805
## 4 2013 1 1 606 610 -4 858 910
## 5 2013 1 1 606 610 -4 837 845
## 6 2013 1 1 607 607 0 858 915
## 7 2013 1 1 608 600 8 807 735
## 8 2013 1 1 611 600 11 945 931
## 9 2013 1 1 613 610 3 925 921
## 10 2013 1 1 615 615 0 1039 1100
## # ... with 44,084 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights,between(month,3,4))
## # A tibble: 57,164 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 3 1 4 2159 125 318 56
## 2 2013 3 1 50 2358 52 526 438
## 3 2013 3 1 117 2245 152 223 2354
## 4 2013 3 1 454 500 -6 633 648
## 5 2013 3 1 505 515 -10 746 810
## 6 2013 3 1 521 530 -9 813 827
## 7 2013 3 1 537 540 -3 856 850
## 8 2013 3 1 541 545 -4 1014 1023
## 9 2013 3 1 549 600 -11 639 703
## 10 2013 3 1 550 600 -10 747 801
## # ... with 57,154 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
This is a shortcut for x >= left & x <= right
How many flights have a missing dep_time? What other variables are missing? What might these rows represent?
sum(is.na(flights$dep_time))
## [1] 8255
map_dbl(flights, ~ sum(is.na(.x)))
## year month day dep_time sched_dep_time
## 0 0 0 8255 0
## dep_delay arr_time sched_arr_time arr_delay carrier
## 8255 8713 0 9430 0
## flight tailnum origin dest air_time
## 0 2512 0 0 9430
## distance hour minute time_hour
## 0 0 0 0
8255 flights have a missing dep_time
dep_delay, arr_time, air_time, tailnum, air_time & arr_delay
Why is NA ^ 0 not missing? Why is NA | TRUE not missing? Why is FALSE & NA not missing? Can you figure out the general rule? (NA * 0 is a tricky counterexample!)
5.3.1 Exercises
How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).
df <- tibble(x = c(5, 2, NA),
y = c(2, NA, 2))
rowSums(df)
## [1] 7 NA NA
arrange(df, desc(is.na(x)))
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 NA 2
## 2 5 2
## 3 2 NA
arrange(df, -(is.na(x)))
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 NA 2
## 2 5 2
## 3 2 NA
We’re basically saying, those which are TRUE to being NA, sort them in descending order.
Sort flights to find the most delayed flights. Find the flights that left earliest.
arrange(flights, dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 12 7 2040 2123 -43 40 2352
## 2 2013 2 3 2022 2055 -33 2240 2338
## 3 2013 11 10 1408 1440 -32 1549 1559
## 4 2013 1 11 1900 1930 -30 2233 2243
## 5 2013 1 29 1703 1730 -27 1947 1957
## 6 2013 8 9 729 755 -26 1002 955
## 7 2013 10 23 1907 1932 -25 2143 2143
## 8 2013 3 30 2030 2055 -25 2213 2250
## 9 2013 3 2 1431 1455 -24 1601 1631
## 10 2013 5 5 934 958 -24 1225 1309
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, desc(dep_delay) )
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Sort flights to find the fastest flights.
arrange(flights, air_time )
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 16 1355 1315 40 1442 1411
## 2 2013 4 13 537 527 10 622 628
## 3 2013 12 6 922 851 31 1021 954
## 4 2013 2 3 2153 2129 24 2247 2224
## 5 2013 2 5 1303 1315 -12 1342 1411
## 6 2013 2 12 2123 2130 -7 2211 2225
## 7 2013 3 2 1450 1500 -10 1547 1608
## 8 2013 3 8 2026 1935 51 2131 2056
## 9 2013 3 18 1456 1329 87 1533 1426
## 10 2013 3 19 2226 2145 41 2305 2246
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Which flights travelled the longest? Which travelled the shortest?
flights %>%
arrange(air_time) %>%
select(carrier, flight, air_time)
## # A tibble: 336,776 x 3
## carrier flight air_time
## <chr> <int> <dbl>
## 1 EV 4368 20
## 2 EV 4631 20
## 3 EV 4276 21
## 4 EV 4619 21
## 5 EV 4368 21
## 6 EV 4619 21
## 7 US 2132 21
## 8 9E 3650 21
## 9 EV 4118 21
## 10 EV 4276 21
## # ... with 336,766 more rows
flights %>%
arrange(-air_time) %>%
select(carrier, flight, air_time)
## # A tibble: 336,776 x 3
## carrier flight air_time
## <chr> <int> <dbl>
## 1 UA 15 695
## 2 HA 51 691
## 3 HA 51 686
## 4 HA 51 686
## 5 HA 51 683
## 6 HA 51 679
## 7 UA 15 676
## 8 HA 51 676
## 9 HA 51 675
## 10 UA 15 671
## # ... with 336,766 more rows
5.4.1 Exercises
Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.
vars <- c("dep_time", "dep_delay", "arr_time", "arr_delay")
select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, ends_with("time"), ends_with("delay"))
## # A tibble: 336,776 x 7
## dep_time sched_dep_time arr_time sched_arr_time air_time dep_delay arr_delay
## <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 517 515 830 819 227 2 11
## 2 533 529 850 830 227 4 20
## 3 542 540 923 850 160 2 33
## 4 544 545 1004 1022 183 -1 -18
## 5 554 600 812 837 116 -6 -25
## 6 554 558 740 728 150 -4 12
## 7 555 600 913 854 158 -5 19
## 8 557 600 709 723 53 -3 -14
## 9 557 600 838 846 140 -3 -8
## 10 558 600 753 745 138 -2 8
## # ... with 336,766 more rows
select(flights, one_of(vars))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, .dots = vars)
## # A tibble: 336,776 x 4
## .dots1 .dots2 .dots3 .dots4
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, matches("dep"), matches("arr"), -matches("sched"), -carrier)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, contains("dep"), contains("arr"), -contains("sched"), -carrier)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, matches("^dep|^arr"))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, matches("time$|delay$"), -contains("sched"), -contains("air"))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
select(flights, matches("^dep|arr_delay|time$"))
## # A tibble: 336,776 x 7
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay air_time
## <int> <int> <dbl> <int> <int> <dbl> <dbl>
## 1 517 515 2 830 819 11 227
## 2 533 529 4 850 830 20 227
## 3 542 540 2 923 850 33 160
## 4 544 545 -1 1004 1022 -18 183
## 5 554 600 -6 812 837 -25 116
## 6 554 558 -4 740 728 12 150
## 7 555 600 -5 913 854 19 158
## 8 557 600 -3 709 723 -14 53
## 9 557 600 -3 838 846 -8 140
## 10 558 600 -2 753 745 8 138
## # ... with 336,766 more rows
What happens if you include the name of a variable multiple times in a select() call?
select(flights, dep_time, dep_time)
## # A tibble: 336,776 x 1
## dep_time
## <int>
## 1 517
## 2 533
## 3 542
## 4 544
## 5 554
## 6 554
## 7 555
## 8 557
## 9 557
## 10 558
## # ... with 336,766 more rows
nothing, it is returned at once
What does the one_of() function do? Why might it be helpful in conjunction with this vector? vars <- c(“year”, “month”, “day”, “dep_delay”, “arr_delay”)
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))
## # A tibble: 336,776 x 5
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2 11
## 2 2013 1 1 4 20
## 3 2013 1 1 2 33
## 4 2013 1 1 -1 -18
## 5 2013 1 1 -6 -25
## 6 2013 1 1 -4 12
## 7 2013 1 1 -5 19
## 8 2013 1 1 -3 -14
## 9 2013 1 1 -3 -8
## 10 2013 1 1 -2 8
## # ... with 336,766 more rows
Does the result of running the following code surprise you? How do the select helpers deal with case by default? How can you change that default? select(flights, contains(“TIME”))
select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
## dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
## <int> <int> <int> <int> <dbl> <dttm>
## 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 533 529 850 830 227 2013-01-01 05:00:00
## 3 542 540 923 850 160 2013-01-01 05:00:00
## 4 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 554 600 812 837 116 2013-01-01 06:00:00
## 6 554 558 740 728 150 2013-01-01 05:00:00
## 7 555 600 913 854 158 2013-01-01 06:00:00
## 8 557 600 709 723 53 2013-01-01 06:00:00
## 9 557 600 838 846 140 2013-01-01 06:00:00
## 10 558 600 753 745 138 2013-01-01 06:00:00
## # ... with 336,766 more rows
5.5.2 Exercises
Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.
transmute(flights,deptime = dep_time/60, schedeptime=sched_dep_time/60)
## # A tibble: 336,776 x 2
## deptime schedeptime
## <dbl> <dbl>
## 1 8.62 8.58
## 2 8.88 8.82
## 3 9.03 9
## 4 9.07 9.08
## 5 9.23 10
## 6 9.23 9.3
## 7 9.25 10
## 8 9.28 10
## 9 9.28 10
## 10 9.3 10
## # ... with 336,766 more rows
Compare air_time with arr_time - dep_time. What do you expect to see? What do you see? What do you need to do to fix it?
flights %>%
mutate(dep_time = (dep_time %/% 100) * 60 + (dep_time %% 100),
sched_dep_time = (sched_dep_time %/% 100) * 60 + (sched_dep_time %% 100),
arr_time = (arr_time %/% 100) * 60 + (arr_time %% 100),
sched_arr_time = (sched_arr_time %/% 100) * 60 + (sched_arr_time %% 100)) %>%
transmute((arr_time - dep_time) %% (60*24) - air_time)
## # A tibble: 336,776 x 1
## `(arr_time - dep_time)%%(60 * 24) - air_time`
## <dbl>
## 1 -34
## 2 -30
## 3 61
## 4 77
## 5 22
## 6 -44
## 7 40
## 8 19
## 9 21
## 10 -23
## # ... with 336,766 more rows
Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?
hours2mins <- function(x) {
x %/% 100 * 60 + x %% 100
}
select(flights, contains("dep")) %>%
mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time))
## # A tibble: 336,776 x 4
## dep_time sched_dep_time dep_delay dep_time_two
## <int> <int> <dbl> <dbl>
## 1 517 515 2 2
## 2 533 529 4 4
## 3 542 540 2 2
## 4 544 545 -1 -1
## 5 554 600 -6 -6
## 6 554 558 -4 -4
## 7 555 600 -5 -5
## 8 557 600 -3 -3
## 9 557 600 -3 -3
## 10 558 600 -2 -2
## # ... with 336,766 more rows
# these two numbers don’t match because we aren’t accounting for flights
# where the departure time is the next day from the scheduled departure time.
select(flights, contains("dep")) %>%
mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time)) %>%
filter(dep_delay != dep_time_two) %>%
mutate(dep_time_two = hours2mins(dep_time) - hours2mins(sched_dep_time - 2400))
## # A tibble: 1,207 x 4
## dep_time sched_dep_time dep_delay dep_time_two
## <int> <int> <dbl> <dbl>
## 1 848 1835 853 853
## 2 42 2359 43 43
## 3 126 2250 156 156
## 4 32 2359 33 33
## 5 50 2145 185 185
## 6 235 2359 156 156
## 7 25 2359 26 26
## 8 106 2245 141 141
## 9 14 2359 15 15
## 10 37 2230 127 127
## # ... with 1,197 more rows
Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().*
flights %>%
filter(min_rank(-(dep_delay)) %in% 1:10)
## # A tibble: 10 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 1 10 1121 1635 1126 1239 1810
## 3 2013 12 5 756 1700 896 1058 2020
## 4 2013 3 17 2321 810 911 135 1020
## 5 2013 4 10 1100 1900 960 1342 2211
## 6 2013 6 15 1432 1935 1137 1607 2120
## 7 2013 6 27 959 1900 899 1236 2226
## 8 2013 7 22 845 1600 1005 1044 1815
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 9 20 1139 1845 1014 1457 2210
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
top_n(10, dep_delay)
## # A tibble: 10 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 1 10 1121 1635 1126 1239 1810
## 3 2013 12 5 756 1700 896 1058 2020
## 4 2013 3 17 2321 810 911 135 1020
## 5 2013 4 10 1100 1900 960 1342 2211
## 6 2013 6 15 1432 1935 1137 1607 2120
## 7 2013 6 27 959 1900 899 1236 2226
## 8 2013 7 22 845 1600 1005 1044 1815
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 9 20 1139 1845 1014 1457 2210
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
What does 1:3 + 1:10 return? Why?
What trigonometric functions does R provide?
practice
summarise(flights, delay=mean(dep_delay,na.rm = TRUE))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 12.6
by_date<- group_by(flights,year,month,day)
summarise(by_date,delay=mean(dep_delay, na.rm=TRUE))
## # A tibble: 365 x 4
## # Groups: year, month [12]
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.5
## 2 2013 1 2 13.9
## 3 2013 1 3 11.0
## 4 2013 1 4 8.95
## 5 2013 1 5 5.73
## 6 2013 1 6 7.15
## 7 2013 1 7 5.42
## 8 2013 1 8 2.55
## 9 2013 1 9 2.28
## 10 2013 1 10 2.84
## # ... with 355 more rows
by_dest<- group_by(flights, dest)
delay<- summarise(by_dest,count=n(),dist=mean(distance,na.rm=TRUE),delay=mean(arr_delay,na.rm = TRUE))
delay<- filter(delay,count>20,dest!="HNL")
ggplot(data = delay,mapping = aes(x=dist,y=delay))+
geom_point(aes(size=count),alpha=1/3)+
geom_smooth(se=FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#the pipe, %>%:
delay <- flights %>%
group_by(dest)%>%
summarise(
count=n(),
dist = mean(distance,na.rm=TRUE),
delay= mean(arr_delay, na.rm = TRUE)
)%>%
filter(count > 20, dest != "HNL")
#missing values
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay)
)
ggplot(data = delays, mapping = aes(x = delay)) +
geom_freqpoly(binwidth = 10)
not_cancelled<- flights %>%
filter(!is.na(arr_delay))
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
ggplot(data = delays, mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)
5.6.7 Exercises
Brainstorm at least 5 different ways to assess the typical delay characteristics of a group of flights. Consider the following scenarios:
delay_char <-
flights %>%
group_by(flight) %>%
summarise(n = n(),
fifteen_early = mean(arr_delay == -15, na.rm = TRUE),
fifteen_late = mean(arr_delay == 15, na.rm = TRUE),
ten_always = mean(arr_delay == 10, na.rm = TRUE),
thirty_early = mean(arr_delay == -30, na.rm = TRUE),
thirty_late = mean(arr_delay == 30, na.rm = TRUE),
percentage_on_time = mean(arr_delay == 0, na.rm = TRUE),
twohours = mean(arr_delay > 120, na.rm = TRUE)) %>%
map_if(is_double, round, 2) %>%
as_tibble()
A flight is 15 minutes early 50% of the time, and 15 minutes late 50% of the time.
delay_char %>%
filter(fifteen_early == 0.5, fifteen_late == 0.5)
## # A tibble: 0 x 9
## # ... with 9 variables: flight <int>, n <int>, fifteen_early <dbl>,
## # fifteen_late <dbl>, ten_always <dbl>, thirty_early <dbl>,
## # thirty_late <dbl>, percentage_on_time <dbl>, twohours <dbl>
A flight is always 10 minutes late.
delay_char %>%
filter(ten_always == 1)
## # A tibble: 5 x 9
## flight n fifteen_early fifteen_late ten_always thirty_early thirty_late
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2254 1 0 0 1 0 0
## 2 3656 1 0 0 1 0 0
## 3 3785 2 0 0 1 0 0
## 4 3880 1 0 0 1 0 0
## 5 5854 1 0 0 1 0 0
## # ... with 2 more variables: percentage_on_time <dbl>, twohours <dbl>
A flight is 30 minutes early 50% of the time, and 30 minutes late 50% of the time.
delay_char %>%
filter(thirty_early == 0.5 & thirty_late == 0.5)
## # A tibble: 0 x 9
## # ... with 9 variables: flight <int>, n <int>, fifteen_early <dbl>,
## # fifteen_late <dbl>, ten_always <dbl>, thirty_early <dbl>,
## # thirty_late <dbl>, percentage_on_time <dbl>, twohours <dbl>
99% of the time a flight is on time. 1% of the time it’s 2 hours late.
delay_char %>%
filter(percentage_on_time == 0.99 & twohours == 0.01)
## # A tibble: 0 x 9
## # ... with 9 variables: flight <int>, n <int>, fifteen_early <dbl>,
## # fifteen_late <dbl>, ten_always <dbl>, thirty_early <dbl>,
## # thirty_late <dbl>, percentage_on_time <dbl>, twohours <dbl>
Which is more important: arrival delay or departure delay? depends
Come up with another approach that will give you the same output as not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count()).
not_cancelled <-
flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
not_cancelled %>%
count(dest)
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
# and
not_cancelled %>%
count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows
# (without using count()).
#######################
not_cancelled %>%
group_by(dest) %>%
summarise(n = n())
## # A tibble: 104 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ... with 94 more rows
# and
not_cancelled %>%
group_by(tailnum) %>%
tally(wt = distance)
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows
# or
not_cancelled %>%
group_by(tailnum) %>%
summarize(n = sum(distance))
## # A tibble: 4,037 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 239143
## 3 N10156 109664
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 24616
## 7 N10575 139903
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # ... with 4,027 more rows
Our definition of cancelled flights (is.na(dep_delay) | is.na(arr_delay) ) is slightly suboptimal. Why? Which is the most important column?
Because if a flight didn’t leave then it was cancelled. If the condition is.na(dep_delay) is met, then the flight was cancelled.
Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?
Look at the number of cancelled flights per day. Is there a pattern? Is the proportion of cancelled flights related to the average delay?
flights %>%
group_by(day) %>%
summarise(cancelled = mean(is.na(dep_delay)),
mean_dep = mean(dep_delay, na.rm = T),
mean_arr = mean(arr_delay, na.rm = T)) %>%
ggplot(aes(y = cancelled)) +
geom_point(aes(x = mean_dep), colour = "red") +
geom_point(aes(x = mean_arr), colour = "blue") +
labs(x = "Avg delay per day", y = "Cancelled flights p day")
Which carrier has the worst delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights %>% group_by(carrier, dest) %>% summarise(n()))
flights %>%
summarise(n_car = n_distinct(carrier),
n_air = n_distinct(dest),
n_or = n_distinct(origin))
## # A tibble: 1 x 3
## n_car n_air n_or
## <int> <int> <int>
## 1 16 105 3
flights %>%
group_by(carrier) %>%
mutate(avg_carrier = mean(dep_delay, na.rm = T)) %>%
group_by(carrier, origin) %>%
mutate(origin_mean = mean(dep_delay, na.rm = T),
deviations = origin_mean - avg_carrier) %>%
summarise(deviations = mean(deviations), mean = mean(avg_carrier)) %>%
ggplot(aes(origin, deviations)) + geom_col() + facet_wrap(~ carrier)
Tearing out the effect is not straight forward but we can make some informed guesses. For example, whenever there are substantial deviations, they seem to be higher in EWR airport rather than in other airports. On the other hand, there are some airlines that look particular bad like 9E and MQ. And the same pattern is not found on the vast majority of other airlines, which would suggest it’s an airport issues rather than an airline issue.
flights %>%
group_by(carrier, dest) %>%
summarise(mean_departure = mean(dep_delay, na.rm = T),
mean_arrival = mean(arr_delay, na.rm = T))
## # A tibble: 314 x 4
## # Groups: carrier [16]
## carrier dest mean_departure mean_arrival
## <chr> <chr> <dbl> <dbl>
## 1 9E ATL 0.965 0.857
## 2 9E AUS 19 -3.5
## 3 9E AVL -2.6 -12.1
## 4 9E BGR 34 NaN
## 5 9E BNA 19.1 9.29
## 6 9E BOS 14.8 5.66
## 7 9E BTV -4.5 -2.5
## 8 9E BUF 15.5 6.71
## 9 9E BWI 17.5 8.73
## 10 9E CAE -3.67 6
## # ... with 304 more rows
For each plane, count the number of flights before the first delay of greater than 1 hour.
flights %>%
mutate(dep_date = time_hour) %>%
group_by(tailnum) %>%
arrange(dep_date) %>%
mutate(cumulative = !cumany(arr_delay > 60)) %>%
filter(cumulative == T) %>%
tally(sort = TRUE)
## # A tibble: 3,744 x 2
## tailnum n
## <chr> <int>
## 1 N705TW 97
## 2 N765US 97
## 3 N12125 94
## 4 N320AA 94
## 5 N13110 91
## 6 N3763D 82
## 7 N58101 82
## 8 N17122 81
## 9 N961UW 80
## 10 N950UW 79
## # ... with 3,734 more rows
or
flights %>%
group_by(tailnum) %>%
arrange(time_hour) %>%
mutate(cum = arr_delay > 60,
cum_any = cumsum(cum)) %>%
filter(cum_any < 1) %>%
tally(sort = TRUE)
## # A tibble: 3,744 x 2
## tailnum n
## <chr> <int>
## 1 N705TW 97
## 2 N765US 97
## 3 N12125 94
## 4 N320AA 94
## 5 N13110 91
## 6 N3763D 82
## 7 N58101 82
## 8 N17122 81
## 9 N961UW 80
## 10 N950UW 79
## # ... with 3,734 more rows
What does the sort argument to count() do. When might you use it?
flights %>%
count(flight, sort = T)
## # A tibble: 3,844 x 2
## flight n
## <int> <int>
## 1 15 968
## 2 27 898
## 3 181 882
## 4 301 871
## 5 161 786
## 6 695 782
## 7 1109 716
## 8 745 711
## 9 359 709
## 10 1 701
## # ... with 3,834 more rows
Refer back to the table of useful mutate and filtering functions. Describe how each operation changes when you combine it with grouping.
Which one?
Which plane (tailnum) has the worst on-time record?
flights %>%
filter(!is.na(arr_delay)) %>%
group_by(tailnum) %>%
summarise(prop_time = sum(arr_delay <= 30)/n(),
mean_arr = mean(arr_delay, na.rm = TRUE),
fl = n()) %>%
arrange(desc(prop_time))
## # A tibble: 4,037 x 4
## tailnum prop_time mean_arr fl
## <chr> <dbl> <dbl> <int>
## 1 N103US 1 -6.93 46
## 2 N1200K 1 -9.38 21
## 3 N121DE 1 15 2
## 4 N137DL 1 -5 1
## 5 N143DA 1 24 1
## 6 N14628 1 -6 1
## 7 N14629 1 -16.2 4
## 8 N1607B 1 -16 3
## 9 N1608 1 -11.3 3
## 10 N1610D 1 -14.5 2
## # ... with 4,027 more rows
All these flights are always late.
What time of day should you fly if you want to avoid delays as much as possible?
flights %>%
group_by(hour) %>%
filter(!is.na(dep_delay)) %>%
summarise( delay = mean( dep_delay > 0 , na.rm = T)) %>%
ggplot(aes(hour, delay, fill = delay)) + geom_col()
# or
flights %>%
group_by(hour) %>%
summarize(m = mean(dep_delay, na.rm = TRUE),
sd = sd(dep_delay, na.rm = TRUE),
low_ci = m - 2*sd,
high_ci = m + 2*sd,
n = n()) %>%
ggplot(aes(hour, m, ymin = low_ci, ymax = high_ci)) +
geom_pointrange()
## Warning: Removed 1 rows containing missing values (geom_pointrange).
Worst time to flight is in the early evening. Although that happens because more flights go out on that specific time also.
For each destination, compute the total minutes of delay. For each, flight, compute the proportion of the total delay for its destination.
flights %>%
group_by(dest) %>%
filter(!is.na(dep_delay)) %>%
summarise(tot_mins = sum(dep_delay[dep_delay > 0]))
## # A tibble: 104 x 2
## dest tot_mins
## <chr> <dbl>
## 1 ABQ 4076
## 2 ACK 2603
## 3 ALB 10934
## 4 ANC 105
## 5 ATL 254414
## 6 AUS 36623
## 7 AVL 3092
## 8 BDL 8471
## 9 BGR 8170
## 10 BHM 8817
## # ... with 94 more rows
flights %>%
filter(!is.na(dep_delay)) %>%
group_by(tailnum, dest) %>%
summarise(m = mean(dep_delay > 0), n = n()) %>%
arrange(desc(m))
## # A tibble: 44,218 x 4
## # Groups: tailnum [4,037]
## tailnum dest m n
## <chr> <chr> <dbl> <int>
## 1 D942DN MCO 1 2
## 2 N10156 BDL 1 1
## 3 N10156 CLE 1 1
## 4 N10156 DCA 1 2
## 5 N10156 GSO 1 1
## 6 N10156 GSP 1 1
## 7 N10156 IAD 1 1
## 8 N10156 IND 1 2
## 9 N10156 MHT 1 1
## 10 N10156 MSN 1 1
## # ... with 44,208 more rows
Delays are typically temporally correlated: even once the problem that caused the initial delay has been resolved, later flights are delayed to allow earlier flights to leave. Using lag() explore how the delay of a flight is related to the delay of the immediately preceding flight.
flights %>%
mutate(new_sched_dep_time = lubridate::make_datetime(year, month, day, hour, minute)) %>%
arrange(new_sched_dep_time) %>%
mutate(prev_time = lag(dep_delay)) %>%
# filter(between(dep_delay, 0, 300), between(prev_time, 0, 300)) %>% # play with this one
select(origin, new_sched_dep_time, dep_delay, prev_time) %>%
ggplot(aes(dep_delay, prev_time)) + geom_point(alpha = 1/10) +
geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 14167 rows containing non-finite values (stat_smooth).
## Warning: Removed 14167 rows containing missing values (geom_point).
# or
flights %>%
select(year, month, day, hour, dest, dep_delay) %>%
group_by(dest) %>%
mutate(lag_delay = lag(dep_delay)) %>%
arrange(dest) %>%
filter(!is.na(lag_delay)) %>%
summarize(cor = cor(dep_delay, lag_delay, use = "complete.obs"),
n = n()) %>%
arrange(desc(cor)) %>%
filter(row_number(desc(cor)) %in% 1:10)
## # A tibble: 10 x 3
## dest cor n
## <chr> <dbl> <int>
## 1 SBN 0.687 9
## 2 ORD 0.403 16641
## 3 HDN 0.365 13
## 4 ATL 0.351 16897
## 5 SFO 0.344 13229
## 6 BZN 0.325 34
## 7 BOS 0.323 15048
## 8 FLL 0.312 11933
## 9 BNA 0.302 6104
## 10 MDW 0.296 4043
Although there is a lot of noise, you can see a sort of straight line going on there. There is also a correlation between the lagged values in many of the destionatinons. So correlation between flights is mostly in specific airports.
Look at each destination. Can you find flights that are suspiciously fast? (i.e. flights that represent a potential data entry error). Compute the air time a flight relative to the shortest flight to that destination. Which flights were most delayed in the air?
# (1)
flights %>%
group_by(dest) %>%
arrange(air_time) %>%
slice(1:5) %>%
select(tailnum, sched_dep_time, sched_arr_time, air_time) %>%
arrange(air_time)
## Adding missing grouping variables: `dest`
## # A tibble: 517 x 5
## # Groups: dest [105]
## dest tailnum sched_dep_time sched_arr_time air_time
## <chr> <chr> <int> <int> <dbl>
## 1 BDL N16911 1315 1411 20
## 2 BDL N12167 527 628 20
## 3 BDL N27200 851 954 21
## 4 BDL N13955 1315 1411 21
## 5 BDL N12160 1329 1426 21
## 6 BOS N947UW 1500 1608 21
## 7 PHL N13913 2129 2224 21
## 8 PHL N12921 2130 2225 21
## 9 PHL N8501F 1935 2056 21
## 10 PHL N22909 2129 2224 22
## # ... with 507 more rows
# (2)
flights %>%
group_by(dest) %>%
mutate(shortest = air_time - min(air_time, na.rm = T)) %>%
top_n(1, air_time) %>%
arrange(-air_time) %>%
select(tailnum, sched_dep_time, sched_arr_time, shortest)
## Warning in min(air_time, na.rm = T): no non-missing arguments to min; returning
## Inf
## Adding missing grouping variables: `dest`
## # A tibble: 112 x 5
## # Groups: dest [104]
## dest tailnum sched_dep_time sched_arr_time shortest
## <chr> <chr> <int> <int> <dbl>
## 1 HNL N77066 1335 1836 133
## 2 SFO N703TW 1730 2110 195
## 3 LAX N178DN 1815 2146 165
## 4 ANC N572UA 1615 1953 46
## 5 SAN N794JB 1620 1934 134
## 6 SNA N16709 1819 2137 131
## 7 BUR N624JB 1730 2046 110
## 8 LAS N852UA 1729 2013 143
## 9 SJC N632JB 1830 2205 91
## 10 SEA N17245 1727 2040 119
## # ... with 102 more rows
Find all destinations that are flown by at least two carriers. Use that information to rank the carriers.
flights %>%
group_by(dest) %>%
filter(n_distinct(carrier) > 2) %>%
group_by(carrier) %>%
summarise(n = n_distinct(dest)) %>%
arrange(-n)
## # A tibble: 15 x 2
## carrier n
## <chr> <int>
## 1 DL 37
## 2 EV 36
## 3 UA 36
## 4 9E 35
## 5 B6 30
## 6 AA 17
## 7 MQ 17
## 8 WN 9
## 9 OO 5
## 10 US 5
## 11 VX 3
## 12 YV 3
## 13 FL 2
## 14 AS 1
## 15 F9 1
#Exercise 7
library(tidyverse)
7.3.4 Exercises Explore the distribution of each of the x, y, and z variables in diamonds. What do you learn? Think about a diamond and how you might decide which dimension is the length, width, and depth.
head(diamonds)
## # A tibble: 6 x 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
ggplot(data=diamonds, mapping = aes(x = x),binwidth=0.1)+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=diamonds, mapping = aes(x = y),binwidth=0.1)+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=diamonds, mapping = aes(x = z),binwidth=0.1)+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Explore the distribution of price. Do you discover anything unusual or surprising? (Hint: Carefully think about the binwidth and make sure you try a wide range of values.)
ggplot(data=diamonds,mapping = aes(x=price), binwidth=0.01)+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
How many diamonds are 0.99 carat? How many are 1 carat? What do you think is the cause of the difference?
diamonds%>%
filter(carat == 0.99)%>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 23
diamonds%>%
filter(carat==1)%>%
count()
## # A tibble: 1 x 1
## n
## <int>
## 1 1558
ggplot(data=diamonds,mapping = aes(x=carat))+
geom_histogram(binwidth = 0.01)+
xlim(c(0.97,1.03))
## Warning: Removed 48621 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Compare and contrast coord_cartesian() vs xlim() or ylim() when zooming in on a histogram. What happens if you leave binwidth unset? What happens if you try and zoom so only half a bar shows?
ggplot(data=diamonds,mapping = aes(x=carat))+
geom_histogram()+
xlim(c(0.97,1.035))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 48621 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
ggplot(data=diamonds,mapping = aes(x=carat))+
geom_histogram()+
coord_cartesian(xlim=c(0.97,1.035))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data=diamonds,mapping = aes(x=carat))+
geom_histogram(binwidth = 0.01)+
coord_cartesian(xlim=c(0.97,1.035))
7.4.1 Exercises
What happens to missing values in a histogram? What happens to missing values in a bar chart? Why is there a difference?
geom_histogram() removed rows with NA values; Apparently geom_bar() doesn’t remove NA, but rather treat it as another factor or category.
What does na.rm = TRUE do in mean() and sum()?
summarise(diamonds, delay = mean(carat, na.rm = TRUE))
## # A tibble: 1 x 1
## delay
## <dbl>
## 1 0.798
To ignore NA’s when calculating mean and sum.