library(nycflights13)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Note: you will probablly have to install nycflights13 using install.packages and the load it with the library command. nycflights13 is a relational database containing the following tables (data frames). This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley
| data frame | description |
|---|---|
| airlines | Airline names |
| airports | Airport metadata |
| flights | Flights data |
| planes | Planes meta data |
| weather | Hourly data |
This is data about all airline flights into and out of New York City in 2021. This project will parallel Chapter 5 in Wickham and Hadley. You should start reading this chapter now, and try to complete reading it by the end of this week.
flights
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
nycflights13 if necessary.Write up your solutions to the exercises in 5.2.4 in this document, including the code chunks you use to determine the answer.
flights %>%
filter(arr_delay >= 120)
## # A tibble: 10,200 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # … with 10,190 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dest == "IAH")
## # A tibble: 7,198 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # … with 7,188 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dest == "HOU")
## # A tibble: 2,115 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1208 1158 10 1540 1502
## 2 2013 1 1 1306 1300 6 1622 1610
## 3 2013 1 1 1708 1700 8 2037 2005
## 4 2013 1 1 2030 2035 -5 2354 2342
## 5 2013 1 2 734 700 34 1045 1025
## 6 2013 1 2 1156 1158 -2 1517 1502
## 7 2013 1 2 1319 1305 14 1633 1615
## 8 2013 1 2 1810 1655 75 2146 2000
## 9 2013 1 2 2031 2035 -4 2353 2342
## 10 2013 1 3 704 700 4 1036 1025
## # … with 2,105 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, carrier == "UA")
## # A tibble: 58,665 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 554 558 -4 740 728
## 4 2013 1 1 558 600 -2 924 917
## 5 2013 1 1 558 600 -2 923 937
## 6 2013 1 1 559 600 -1 854 902
## 7 2013 1 1 607 607 0 858 915
## 8 2013 1 1 611 600 11 945 931
## 9 2013 1 1 623 627 -4 933 932
## 10 2013 1 1 628 630 -2 1016 947
## # … with 58,655 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, carrier == "AA")
## # A tibble: 32,729 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 542 540 2 923 850
## 2 2013 1 1 558 600 -2 753 745
## 3 2013 1 1 559 600 -1 941 910
## 4 2013 1 1 606 610 -4 858 910
## 5 2013 1 1 623 610 13 920 915
## 6 2013 1 1 628 630 -2 1137 1140
## 7 2013 1 1 629 630 -1 824 810
## 8 2013 1 1 635 635 0 1028 940
## 9 2013 1 1 656 700 -4 854 850
## 10 2013 1 1 656 659 -3 949 959
## # … with 32,719 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, carrier == "DL")
## # A tibble: 48,110 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 554 600 -6 812 837
## 2 2013 1 1 602 610 -8 812 820
## 3 2013 1 1 606 610 -4 837 845
## 4 2013 1 1 615 615 0 833 842
## 5 2013 1 1 653 700 -7 936 1009
## 6 2013 1 1 655 655 0 1021 1030
## 7 2013 1 1 655 700 -5 1037 1045
## 8 2013 1 1 655 700 -5 1002 1020
## 9 2013 1 1 657 700 -3 959 1013
## 10 2013 1 1 658 700 -2 944 939
## # … with 48,100 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month == 7 | month == 8 | month == 9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 1 1 2029 212 236 2359
## 2 2013 7 1 2 2359 3 344 344
## 3 2013 7 1 29 2245 104 151 1
## 4 2013 7 1 43 2130 193 322 14
## 5 2013 7 1 44 2150 174 300 100
## 6 2013 7 1 46 2051 235 304 2358
## 7 2013 7 1 48 2001 287 308 2305
## 8 2013 7 1 58 2155 183 335 43
## 9 2013 7 1 100 2146 194 327 30
## 10 2013 7 1 100 2245 135 337 135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, arr_delay > 120, dep_delay <= 0)
## # A tibble: 29 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 27 1419 1420 -1 1754 1550
## 2 2013 10 7 1350 1350 0 1736 1526
## 3 2013 10 7 1357 1359 -2 1858 1654
## 4 2013 10 16 657 700 -3 1258 1056
## 5 2013 11 1 658 700 -2 1329 1015
## 6 2013 3 18 1844 1847 -3 39 2219
## 7 2013 4 17 1635 1640 -5 2049 1845
## 8 2013 4 18 558 600 -2 1149 850
## 9 2013 4 18 655 700 -5 1213 950
## 10 2013 5 22 1827 1830 -3 2217 2010
## # … with 19 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dep_delay >= 60, dep_delay - arr_delay > 30)
## # A tibble: 1,844 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 2205 1720 285 46 2040
## 2 2013 1 1 2326 2130 116 131 18
## 3 2013 1 3 1503 1221 162 1803 1555
## 4 2013 1 3 1839 1700 99 2056 1950
## 5 2013 1 3 1850 1745 65 2148 2120
## 6 2013 1 3 1941 1759 102 2246 2139
## 7 2013 1 3 1950 1845 65 2228 2227
## 8 2013 1 3 2015 1915 60 2135 2111
## 9 2013 1 3 2257 2000 177 45 2224
## 10 2013 1 4 1917 1700 137 2135 1950
## # … with 1,834 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, between(dep_time, 0000, 0600))
## # A tibble: 9,344 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 9,334 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, between(month, 7, 9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 1 1 2029 212 236 2359
## 2 2013 7 1 2 2359 3 344 344
## 3 2013 7 1 29 2245 104 151 1
## 4 2013 7 1 43 2130 193 322 14
## 5 2013 7 1 44 2150 174 300 100
## 6 2013 7 1 46 2051 235 304 2358
## 7 2013 7 1 48 2001 287 308 2305
## 8 2013 7 1 58 2155 183 335 43
## 9 2013 7 1 100 2146 194 327 30
## 10 2013 7 1 100 2245 135 337 135
## # … with 86,316 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, !between(dep_time, 0601, 2359))
## # A tibble: 9,373 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 9,363 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
dep_time.dep_delay, 8713 have a missing arr_time, 9430 have a missing arr_delay, and 9430 have a missing air_time.summary(flights$dep_time) # NA's means missing data
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 907 1401 1349 1744 2400 8255
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
NA ^ 0
## [1] 1
NA | TRUE
## [1] TRUE
Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.
arrange(flights, desc(is.na(dep_time)), dep_time)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 NA 1630 NA NA 1815
## 2 2013 1 1 NA 1935 NA NA 2240
## 3 2013 1 1 NA 1500 NA NA 1825
## 4 2013 1 1 NA 600 NA NA 901
## 5 2013 1 2 NA 1540 NA NA 1747
## 6 2013 1 2 NA 1620 NA NA 1746
## 7 2013 1 2 NA 1355 NA NA 1459
## 8 2013 1 2 NA 1420 NA NA 1644
## 9 2013 1 2 NA 1321 NA NA 1536
## 10 2013 1 2 NA 1545 NA NA 1910
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, desc(dep_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 12 7 2040 2123 -43 40 2352
## 2 2013 2 3 2022 2055 -33 2240 2338
## 3 2013 11 10 1408 1440 -32 1549 1559
## 4 2013 1 11 1900 1930 -30 2233 2243
## 5 2013 1 29 1703 1730 -27 1947 1957
## 6 2013 8 9 729 755 -26 1002 955
## 7 2013 10 23 1907 1932 -25 2143 2143
## 8 2013 3 30 2030 2055 -25 2213 2250
## 9 2013 3 2 1431 1455 -24 1601 1631
## 10 2013 5 5 934 958 -24 1225 1309
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, desc(distance / air_time))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 5 25 1709 1700 9 1923 1937
## 2 2013 7 2 1558 1513 45 1745 1719
## 3 2013 5 13 2040 2025 15 2225 2226
## 4 2013 3 23 1914 1910 4 2045 2043
## 5 2013 1 12 1559 1600 -1 1849 1917
## 6 2013 11 17 650 655 -5 1059 1150
## 7 2013 2 21 2355 2358 -3 412 438
## 8 2013 11 17 759 800 -1 1212 1255
## 9 2013 11 16 2003 1925 38 17 36
## 10 2013 11 16 2349 2359 -10 402 440
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, desc(distance))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 857 900 -3 1516 1530
## 2 2013 1 2 909 900 9 1525 1530
## 3 2013 1 3 914 900 14 1504 1530
## 4 2013 1 4 900 900 0 1516 1530
## 5 2013 1 5 858 900 -2 1519 1530
## 6 2013 1 6 1019 900 79 1558 1530
## 7 2013 1 7 1042 900 102 1620 1530
## 8 2013 1 8 901 900 1 1504 1530
## 9 2013 1 9 641 900 1301 1242 1530
## 10 2013 1 10 859 900 -1 1449 1530
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, distance)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 27 NA 106 NA NA 245
## 2 2013 1 3 2127 2129 -2 2222 2224
## 3 2013 1 4 1240 1200 40 1333 1306
## 4 2013 1 4 1829 1615 134 1937 1721
## 5 2013 1 4 2128 2129 -1 2218 2224
## 6 2013 1 5 1155 1200 -5 1241 1306
## 7 2013 1 6 2125 2129 -4 2224 2224
## 8 2013 1 7 2124 2129 -5 2212 2224
## 9 2013 1 8 2127 2130 -3 2304 2225
## 10 2013 1 9 2126 2129 -3 2217 2224
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Write up your solutions to the exercises in 5.3.1 in this document, including the code chunks you use to determine the answer.
select(flights, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
select(flights, "dep_time", "dep_delay", "arr_time", "arr_delay")
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
select(flights, c("dep_time", "dep_delay", "arr_time", "arr_delay"))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
select(flights, c(dep_time, dep_delay, arr_time, arr_delay))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
select(flights, starts_with("dep_"), starts_with("arr_"))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
select(flights, dep_time, dep_delay, arr_time, arr_delay, dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
vars <- c("year", "month", "day", "dep_delay", "arr_delay")
select(flights, one_of(vars))
## # A tibble: 336,776 x 5
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2 11
## 2 2013 1 1 4 20
## 3 2013 1 1 2 33
## 4 2013 1 1 -1 -18
## 5 2013 1 1 -6 -25
## 6 2013 1 1 -4 12
## 7 2013 1 1 -5 19
## 8 2013 1 1 -3 -14
## 9 2013 1 1 -3 -8
## 10 2013 1 1 -2 8
## # … with 336,766 more rows
select(flights, contains("TIME"))
## # A tibble: 336,776 x 6
## dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
## <int> <int> <int> <int> <dbl> <dttm>
## 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 533 529 850 830 227 2013-01-01 05:00:00
## 3 542 540 923 850 160 2013-01-01 05:00:00
## 4 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 554 600 812 837 116 2013-01-01 06:00:00
## 6 554 558 740 728 150 2013-01-01 05:00:00
## 7 555 600 913 854 158 2013-01-01 06:00:00
## 8 557 600 709 723 53 2013-01-01 06:00:00
## 9 557 600 838 846 140 2013-01-01 06:00:00
## 10 558 600 753 745 138 2013-01-01 06:00:00
## # … with 336,766 more rows
Write up your solutions to the exercises in 5.5.2 in this document, including the code chunks you use to determine the answer.
#select(flights, dep_time, sched_dep_time)
minutes <-
mutate(flights,
dep_time_minutes = dep_time %/% 100 * 60 + dep_time %% 100, # hours extraction + minutes extraction
sched_dep_time_minutes = sched_dep_time %/% 100 * 60 + sched_dep_time %% 100)
select(minutes, dep_time, sched_dep_time, dep_time_minutes, sched_dep_time_minutes)
## # A tibble: 336,776 x 4
## dep_time sched_dep_time dep_time_minutes sched_dep_time_minutes
## <int> <int> <dbl> <dbl>
## 1 517 515 317 315
## 2 533 529 333 329
## 3 542 540 342 340
## 4 544 545 344 345
## 5 554 600 354 360
## 6 554 558 354 358
## 7 555 600 355 360
## 8 557 600 357 360
## 9 557 600 357 360
## 10 558 600 358 360
## # … with 336,766 more rows
#select(flights, air_time, arr_time, dep_time)
compare <- mutate(flights, air_time,
comp_time = arr_time - dep_time)
select(compare, air_time, comp_time)
## # A tibble: 336,776 x 2
## air_time comp_time
## <dbl> <int>
## 1 227 313
## 2 227 317
## 3 160 381
## 4 183 460
## 5 116 258
## 6 150 186
## 7 158 358
## 8 53 152
## 9 140 281
## 10 138 195
## # … with 336,766 more rows
compare2 <- mutate(flights, air_time,
compare_time = (arr_time %/% 100 * 60 + arr_time %% 100) - (dep_time %/% 100 * 60 + dep_time %% 100))
select(compare2, air_time, compare_time)
## # A tibble: 336,776 x 2
## air_time compare_time
## <dbl> <dbl>
## 1 227 193
## 2 227 197
## 3 160 221
## 4 183 260
## 5 116 138
## 6 150 106
## 7 158 198
## 8 53 72
## 9 140 161
## 10 138 115
## # … with 336,766 more rows
#arrange(flights, desc(dep_delay))
delay_rank <- mutate(flights,
rank = min_rank(desc(dep_delay)))
delay_rank2 <- filter(delay_rank,
rank <= 10)
delay_rank3 <- arrange(delay_rank2, rank)
select(delay_rank3, rank, dep_delay, carrier, flight)
## # A tibble: 10 x 4
## rank dep_delay carrier flight
## <int> <dbl> <chr> <int>
## 1 1 1301 HA 51
## 2 2 1137 MQ 3535
## 3 3 1126 MQ 3695
## 4 4 1014 AA 177
## 5 5 1005 MQ 3075
## 6 6 960 DL 2391
## 7 7 911 DL 2119
## 8 8 899 DL 2007
## 9 9 898 DL 2047
## 10 10 896 AA 172
1:3 + 1:10
## Warning in 1:3 + 1:10: longer object length is not a multiple of shorter object
## length
## [1] 2 4 6 5 7 9 8 10 12 11
?Trig
Write up your solutions to the exercises in 5.6.7 in this document, including the code chunks you use to determine the answer.
flights %>%
group_by(flight) %>%
summarise(early = mean(arr_delay <= -15, na.rm = TRUE),
late = mean(arr_delay >= 15, na.rm = TRUE)) %>%
filter(early == .5,
late == .5)
## # A tibble: 21 x 3
## flight early late
## <int> <dbl> <dbl>
## 1 107 0.5 0.5
## 2 2072 0.5 0.5
## 3 2366 0.5 0.5
## 4 2500 0.5 0.5
## 5 2552 0.5 0.5
## 6 3495 0.5 0.5
## 7 3505 0.5 0.5
## 8 3518 0.5 0.5
## 9 3544 0.5 0.5
## 10 3651 0.5 0.5
## # … with 11 more rows
flights %>%
group_by(flight) %>%
summarise(late_10 = mean(arr_delay >= 10, na.rm = TRUE)) %>%
filter(late_10 == 1) # always means 100%
## # A tibble: 98 x 2
## flight late_10
## <int> <dbl>
## 1 94 1
## 2 730 1
## 3 974 1
## 4 1084 1
## 5 1226 1
## 6 1510 1
## 7 1514 1
## 8 1859 1
## 9 1868 1
## 10 1998 1
## # … with 88 more rows
flights %>%
group_by(flight) %>%
summarise(early_30 = mean(arr_delay <= -30, na.rm = TRUE),
late_30 = mean(arr_delay >= 30, na.rm = TRUE)) %>%
filter(early_30 == .5,
late_30 == .5)
## # A tibble: 3 x 3
## flight early_30 late_30
## <int> <dbl> <dbl>
## 1 3651 0.5 0.5
## 2 3916 0.5 0.5
## 3 3951 0.5 0.5
flights %>%
group_by(flight) %>%
summarise(ontime = mean(arr_delay <= 0, na.rm = TRUE),
late_120 = mean(arr_delay >= 120, na.rm = TRUE)) %>%
filter(ontime == .99,
late_120 == .01)
## # A tibble: 0 x 3
## # … with 3 variables: flight <int>, ontime <dbl>, late_120 <dbl>
not_cancelled %>% count(dest) and not_cancelled %>% count(tailnum, wt = distance) (without using count())
not_cancelled <- flights %>%
count(dest)
not_cancelled
## # A tibble: 105 x 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 265
## 3 ALB 439
## 4 ANC 8
## 5 ATL 17215
## 6 AUS 2439
## 7 AVL 275
## 8 BDL 443
## 9 BGR 375
## 10 BHM 297
## # … with 95 more rows
not_cancelled2 <- flights %>%
count(tailnum, wt = distance)
not_cancelled2
## # A tibble: 4,044 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 250866
## 3 N10156 115966
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 25157
## 7 N10575 150194
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # … with 4,034 more rows
not_cancelled_dest <- flights %>%
group_by(dest) %>%
summarise(n())
not_cancelled_dest
## # A tibble: 105 x 2
## dest `n()`
## <chr> <int>
## 1 ABQ 254
## 2 ACK 265
## 3 ALB 439
## 4 ANC 8
## 5 ATL 17215
## 6 AUS 2439
## 7 AVL 275
## 8 BDL 443
## 9 BGR 375
## 10 BHM 297
## # … with 95 more rows
not_cancelled_tailnum <- flights %>%
group_by(tailnum) %>%
summarise(n = sum(distance))
not_cancelled_tailnum
## # A tibble: 4,044 x 2
## tailnum n
## <chr> <dbl>
## 1 D942DN 3418
## 2 N0EGMQ 250866
## 3 N10156 115966
## 4 N102UW 25722
## 5 N103US 24619
## 6 N104UW 25157
## 7 N10575 150194
## 8 N105UW 23618
## 9 N107US 21677
## 10 N108UW 32070
## # … with 4,034 more rows
flights %>%
filter(is.na(dep_delay) | is.na(arr_delay)) %>%
select(dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr_delay)
## # A tibble: 9,430 x 6
## dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay
## <int> <int> <dbl> <int> <int> <dbl>
## 1 1525 1530 -5 1934 1805 NA
## 2 1528 1459 29 2002 1647 NA
## 3 1740 1745 -5 2158 2020 NA
## 4 1807 1738 29 2251 2103 NA
## 5 1939 1840 59 29 2151 NA
## 6 1952 1930 22 2358 2207 NA
## 7 2016 1930 46 NA 2220 NA
## 8 NA 1630 NA NA 1815 NA
## 9 NA 1935 NA NA 2240 NA
## 10 NA 1500 NA NA 1825 NA
## # … with 9,420 more rows
bad_dep_airports <- flights %>%
group_by(origin) %>%
summarise(worst_dep_airport = mean(dep_delay, na.rm = TRUE)) %>%
arrange(desc(worst_dep_airport))
bad_dep_airports
## # A tibble: 3 x 2
## origin worst_dep_airport
## <chr> <dbl>
## 1 EWR 15.1
## 2 JFK 12.1
## 3 LGA 10.3
bad_arr_airports <- flights %>%
group_by(dest) %>%
summarise(worst_arr_airport = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(worst_arr_airport))
bad_arr_airports
## # A tibble: 105 x 2
## dest worst_arr_airport
## <chr> <dbl>
## 1 CAE 41.8
## 2 TUL 33.7
## 3 OKC 30.6
## 4 JAC 28.1
## 5 TYS 24.1
## 6 MSN 20.2
## 7 RIC 20.1
## 8 CAK 19.7
## 9 DSM 19.0
## 10 GRR 18.2
## # … with 95 more rows
bad_carrier <- flights %>%
group_by(carrier) %>%
summarise(worst_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(worst_delay))
bad_carrier
## # A tibble: 16 x 2
## carrier worst_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
flights %>%
count(carrier)
## # A tibble: 16 x 2
## carrier n
## <chr> <int>
## 1 9E 18460
## 2 AA 32729
## 3 AS 714
## 4 B6 54635
## 5 DL 48110
## 6 EV 54173
## 7 F9 685
## 8 FL 3260
## 9 HA 342
## 10 MQ 26397
## 11 OO 32
## 12 UA 58665
## 13 US 20536
## 14 VX 5162
## 15 WN 12275
## 16 YV 601